Setup ChopChop runs

Example of how to setup ChopChop runs

Use env already built on HPCC for you

conda activate activate /bigdata/gen220/shared/condaenv/chopchop

OR Setup an environment for this yourself

# setup a conda env for this tool
conda create -p chopchop_env -c anaconda -c bioconda -c conda-forge biopython pandas numpy scipy argparse mysql-python scikit-learn=0.18.1 bedtools bowtie ucsc-twobittofa
conda activate ./chopchop_env

checkout the chopchop code

git clone https://bitbucket.org/valenlab/chopchop.git

Now build up the config file and index folders - this will replace config file with your current folder info

# build config
mkdir -p bowtie_index twoBit_index gene_table isoforms isoforms_MT
# download genome (your project will vary here)
CWD=$(realpath .) # get the current path
cat > chopchop/config.json <<-_EOT_
{
  "PATH": {
    "PRIMER3": "./primer3_core",
    "BOWTIE": "bowtie/bowtie",
    "TWOBITTOFA": "./twoBitToFa",
    "TWOBIT_INDEX_DIR": ${CWD}/twoBit_index,
    "BOWTIE_INDEX_DIR": ${CWD}/bowtie_index,
    "ISOFORMS_INDEX_DIR": ${CWD}/isoforms,
    "ISOFORMS_MT_DIR": ${CWD}/isoforms_MT,
    "GENE_TABLE_INDEX_DIR": ${CWD}/gene_table,
  },
  "THREADS": 1
}
_EOT_

Download some genome data

curl -O https://marchantia.info/download/MpTak_v6.1r2/MpTak_v6.1r2.gff.gz
curl -O https://marchantia.info/download/MpTak_v6.1r2/MpTak_v6.1r2.genome.fasta.gz
gunzip MpTak_v6.1r2.gff.gz MpTak_v6.1r2.genome.fasta.gz

Build indexes

bowtie-build MpTak_v6.1r2.genome.fasta bowtie_index/MpTak
faToTwoBit MpTak_v6.1r2.genome.fasta twoBit_index/MpTak.2bit

Setup gene table and make transcriptome file

# fix problem with GFF3 file
perl -i -p -e 's/MapolyID/mapolyID/' MpTak_v6.1r2.gff
gff3ToGenePred -geneNameAttr=Name MpTak_v6.1r2.gff MpTak.genePred
# make the gene table
echo -e "name\tchrom\tstrand\ttxStart\ttxEnd\tcdsStart\tcdsEnd\texonCount\texonStarts\texonEnds\tscore\tname2\tcdsStartStat\tcdsEndStat\texonFrames" > gene_table/MpTak.gene_table
cat MpTak.genePred >> gene_table/MpTak.gene_table
# make bedfile 
genePredToBed MpTak.genePred MpTak.genes.bed
bedtools getfasta -fi MpTak_v6.1r2.genome.fasta -bed MpTak.genes.bed -nameOnly -s -split -fo MpTak.transcriptome.fa
bowtie-build MpTak.transcriptome.fa isoforms/MpTak

It isn't clear this is needed at all Run Vienna RNA to fold and make MT folder

#SBATCH -p short -c 64 --mem 24gb --out rnafold.log
CPU=64
module load viennarna
module load emboos
# split data into pieces for parallelization
mkdir -p split
cd split
seqretsplit ../MpTak.transcriptome.fa .
cd ..

mkdir -p RNA_fold
cd RNA_fold
parallel -j $CPU RNAplfold \< {} ::: $(ls ../split/*.fasta)

for filename in *.ps; do  
    perl ../chopchop/mountain.pl < $filename > "../isoforms_MT/$(basename "$filename" _dp.ps).mt"  
done  
cd ..

Setup a job for running these - predict sites only in exon 1

#!/usr/bin/bash -l
#SBATCH -N 1 -n 1 -c 4 --mem 8gb --out chopchop_run.log
module load miniconda3
OUT=MpTak_chopchop_run
conda activate /bigdata/gen220/shared/condaenv/chopchop
cd chopchop
mkdir -p $OUT
./chopchop_query.py -G MpTak -o $OUT --genePred_file ../gene_table/MpTak.gene_table --exon 1 --scoringMethod DOENCH_2016

hyphaltip/adrian_chopchop

Setup ChopChop runs