/adrian_chopchop

Example of how to setup Chop Chop runs

Primary LanguageShellMIT LicenseMIT

Setup ChopChop runs

Example of how to setup ChopChop runs

Use env already built on HPCC for you

conda activate activate /bigdata/gen220/shared/condaenv/chopchop

OR Setup an environment for this yourself

# setup a conda env for this tool
conda create -p chopchop_env -c anaconda -c bioconda -c conda-forge biopython pandas numpy scipy argparse mysql-python scikit-learn=0.18.1 bedtools bowtie ucsc-twobittofa
conda activate ./chopchop_env

checkout the chopchop code

git clone https://bitbucket.org/valenlab/chopchop.git

Now build up the config file and index folders - this will replace config file with your current folder info

# build config
mkdir -p bowtie_index twoBit_index gene_table isoforms isoforms_MT
# download genome (your project will vary here)
CWD=$(realpath .) # get the current path
cat > chopchop/config.json <<-_EOT_
{
  "PATH": {
    "PRIMER3": "./primer3_core",
    "BOWTIE": "bowtie/bowtie",
    "TWOBITTOFA": "./twoBitToFa",
    "TWOBIT_INDEX_DIR": ${CWD}/twoBit_index,
    "BOWTIE_INDEX_DIR": ${CWD}/bowtie_index,
    "ISOFORMS_INDEX_DIR": ${CWD}/isoforms,
    "ISOFORMS_MT_DIR": ${CWD}/isoforms_MT,
    "GENE_TABLE_INDEX_DIR": ${CWD}/gene_table,
  },
  "THREADS": 1
}
_EOT_ 

Download some genome data

curl -O https://marchantia.info/download/MpTak_v6.1r2/MpTak_v6.1r2.gff.gz
curl -O https://marchantia.info/download/MpTak_v6.1r2/MpTak_v6.1r2.genome.fasta.gz
gunzip MpTak_v6.1r2.gff.gz MpTak_v6.1r2.genome.fasta.gz

Build indexes

bowtie-build MpTak_v6.1r2.genome.fasta bowtie_index/MpTak
faToTwoBit MpTak_v6.1r2.genome.fasta twoBit_index/MpTak.2bit

Setup gene table and make transcriptome file

# fix problem with GFF3 file
perl -i -p -e 's/MapolyID/mapolyID/' MpTak_v6.1r2.gff
gff3ToGenePred -geneNameAttr=Name MpTak_v6.1r2.gff MpTak.genePred
# make the gene table
echo -e "name\tchrom\tstrand\ttxStart\ttxEnd\tcdsStart\tcdsEnd\texonCount\texonStarts\texonEnds\tscore\tname2\tcdsStartStat\tcdsEndStat\texonFrames" > gene_table/MpTak.gene_table
cat MpTak.genePred >> gene_table/MpTak.gene_table
# make bedfile 
genePredToBed MpTak.genePred MpTak.genes.bed
bedtools getfasta -fi MpTak_v6.1r2.genome.fasta -bed MpTak.genes.bed -nameOnly -s -split -fo MpTak.transcriptome.fa
bowtie-build MpTak.transcriptome.fa isoforms/MpTak

It isn't clear this is needed at all Run Vienna RNA to fold and make MT folder

#SBATCH -p short -c 64 --mem 24gb --out rnafold.log
CPU=64
module load viennarna
module load emboos
# split data into pieces for parallelization
mkdir -p split
cd split
seqretsplit ../MpTak.transcriptome.fa .
cd ..

mkdir -p RNA_fold
cd RNA_fold
parallel -j $CPU RNAplfold \< {} ::: $(ls ../split/*.fasta)

for filename in *.ps; do  
    perl ../chopchop/mountain.pl < $filename > "../isoforms_MT/$(basename "$filename" _dp.ps).mt"  
done  
cd ..

Setup a job for running these - predict sites only in exon 1

#!/usr/bin/bash -l
#SBATCH -N 1 -n 1 -c 4 --mem 8gb --out chopchop_run.log
module load miniconda3
OUT=MpTak_chopchop_run
conda activate /bigdata/gen220/shared/condaenv/chopchop
cd chopchop
mkdir -p $OUT
./chopchop_query.py -G MpTak -o $OUT --genePred_file ../gene_table/MpTak.gene_table --exon 1 --scoringMethod DOENCH_2016