A collection of scripting software used in the stickleback project

Marine Assembly: From Pacbio, Nanopore, 10x Genomics, to HiC Lastz workflow: axt to chain, to netted chains

GATK SNPs/INDELs calling work flow

Reference

https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels

Step 0: Environment setup, (see dockerfile)

FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive
# Install dependencies
RUN apt-get update && apt-get install -y \
coreutils \
samtools \
bwa \
openjdk-17-jdk \
openjdk-17-jre \
fastqc \
python3 \
python3-pip \
&& pip3 install --upgrade pip \
&& pip3 install cutadapt \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
TODO: conda environment might be the better option to set up env and third party software

Step 1: Trim Fastqs and align to reference

trim_galore --output_dir "$OUTPUT_DIR" --basename "$PREFIX" --trim-n --max_n 0 --cores $(( THREADS > 4 ? 4 : THREADS )) --paired "$READ_ONE" "$READ_TWO"
# Define paths for Trim Galore output
TRIMMED_FWD="${OUTPUT_DIR}/${PREFIX}_val_1.fq.gz"
TRIMMED_REV="${OUTPUT_DIR}/${PREFIX}_val_2.fq.gz"
# Adapter trimming and quality filtering with BBDuk
echo "["$(date)"]\tRemoving adapters with BBDuk..."
./bbduk.sh \
in1="$TRIMMED_FWD" in2="$TRIMMED_REV" \
out1="${OUTPUT_DIR}/${PREFIX}_R1.fastq.gz" out2="${OUTPUT_DIR}/${PREFIX}_R2.fastq.gz" \
minlen=25 qtrim=rl trimq=10 ktrim=r k=25 mink=11 hdist=1 \
ref=$ADAPTERS
# MAPPING with bwa and sorting with samtools
echo -e "["$(date)"]\tAligning and sorting..."
BAM="${OUTPUT_DIR}/${PREFIX}.bam"
SORT_THREADS=$(( $THREADS / 2 )) # Adjust based on your system's capabilities
bwa mem -t "$THREADS" "$REFERENCE" "${OUTPUT_DIR}/${PREFIX}_R1.fastq.gz" "${OUTPUT_DIR}/${PREFIX}_R2.fastq.gz" | \
samtools sort -@ "$SORT_THREADS" -T "${OUTPUT_DIR}/${PREFIX}.tmp" -o "$BAM"
# Index the final BAM file
samtools index "$BAM"

Step 2: Process BAMs and call raw genotype variance

gatk --java-options "-Xmx$MEM" MarkDuplicates -I "$BAM" -O /dev/stdout -M /dev/null -VALIDATION_STRINGENCY SILENT -CREATE_INDEX true | \
samtools view -b -@ "$THREADS" -f 1 - | \
gatk --java-options "-Xmx$MEM" AddOrReplaceReadGroups -I /dev/stdin -O /dev/stdout -LB "WGS" -PL "$PL" -PU "ILUMINA" -SM "$PREFIX" | \
gatk --java-options "-Xmx$MEM" FixMateInformation -I /dev/stdin -O "$PROCESSED_BAM" -ADD_MATE_CIGAR true -IGNORE_MISSING_MATES true -TMP_DIR "${PREFIX}.fixmate.tmp"
# Index the final output BAM
samtools index "$PROCESSED_BAM"
# Variant calling
gatk --java-options "-Xmx$MEM" HaplotypeCaller -I "$PROCESSED_BAM" -O "${PREFIX}.g.vcf.gz" -R "$REFERENCE" -ERC GVCF --native-pair-hmm-threads "$THREADS"
# Assuming $gVCF should be ${PREFIX}.g.vcf.gz based on previous command
gatk --java-options "-Xmx$(echo $MEM | sed 's/G/g/')" GenotypeGVCFs -R "$REFERENCE" -V "${PREFIX}.g.vcf.gz" -O "${PREFIX}.vcf.gz"

Step 3: Combine raw genotypes into one population vcf

SAMPLES="${DIR}/${PREFIX}.list"
: > "$SAMPLES"
for i in "$DIR"/*.g.vcf.gz; do
echo "$i" >> "$SAMPLES"
done
MERGED="${DIR}/${PREFIX}.merged.g.vcf.gz"
VCF="${DIR}/${PREFIX}.cohort.genotyped.vcf.gz"
# Combine GVCFs
gatk --java-options "-Xmx32g" CombineGVCFs -R "$REFERENCE" --variant "$SAMPLES" -O "$MERGED"
# Genotype GVCFs
gatk --java-options "-Xmx32g" GenotypeGVCFs -R "$REFERENCE" -V "$MERGED" -O "$VCF"

Step 4: Initial filtering of raw snp and indel calls

RAW_SNP="${PREFIX}.raw.SNPs.vcf.gz"
FILTERED_SNP="${PREFIX}.filtered.SNPs.vcf.gz"
# Using named pipe for intermediate steps
mkfifo raw.snps.tmp default.snps.tmp frequency.snps.tmp
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V "$COHORT_VCF" -select-type SNP -O raw.snps.tmp &
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V raw.snps.tmp --filter-expression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filter-name "default_snp_filter" -O default.snps.tmp &
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V default.snps.tmp --filter-expression "AC < 4" --filter-name "frequency_filter" -O frequency.snps.tmp &
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V frequency.snps.tmp --exclude-filtered -O "$FILTERED_SNP"

Step 5: Recalibrate initial alignments and recall variance using genotype vcf

# Step 1: Base quality score recalibration
gatk BaseRecalibrator -R "$REFERENCE" \
--known-sites "$SNP_VCF" \
--known-sites "$INDEL_VCF" \
-I "$BAM" \
-O "$RECALIBRATED_TABLE"
# Step 2: Apply BQSR
gatk ApplyBQSR -R "$REFERENCE" \
-I "$BAM" \
--bqsr-recal-file "$RECALIBRATED_TABLE" \
-O "$BAM_RECALIBRATED"
# Step 3: Index recalibrated BAM
samtools index "$BAM_RECALIBRATED"
# Step 4: Variant calling with HaplotypeCaller
gatk HaplotypeCaller \
--input "$BAM_RECALIBRATED" \
--output "$GENOTYPE_VCF" \
--reference "$REFERENCE" \
-ERC GVCF

Step 6: Combine population genotypes into one vcf

SAMPLES="${DIR}/${PREFIX}.list"
: > "$SAMPLES"
for i in "$DIR"/*.g.vcf.gz; do
echo "$i" >> "$SAMPLES"
done
MERGED="${DIR}/${PREFIX}.merged.g.vcf.gz"
VCF="${DIR}/${PREFIX}.cohort.genotyped.vcf.gz"
# Combine GVCFs
gatk --java-options "-Xmx32g" CombineGVCFs -R "$REFERENCE" --variant "$SAMPLES" -O "$MERGED"
# Genotype GVCFs
gatk --java-options "-Xmx32g" GenotypeGVCFs -R "$REFERENCE" -V "$MERGED" -O "$VCF"

Step 7: The variant recalibration step fits a Gaussian mixture model to the contextual annotations given to each variant

# Recalibrate variant quality scores for SNPs
gatk --java-options "-Xmx30g" VariantRecalibrator \
-R "$REFERENCE" \
-V "$RAW_SNP" \
-resource:trainingSnpDb,known=false,training=true,truth=true,prior=8.0 "$SNPS_DB" \
-an DP -an QD -an FS -an SOR -an MQ -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff \
-mode SNP \
-O "$BQSR_SNP" \
--tranches-file "$TRACE_FILE" \
--rscript-file "${BQSR_SNP}.output.plots.R"
# Apply the recalibration to the SNP set
gatk --java-options "-Xmx30g" ApplyVQSR \
-R "$REFERENCE" \
-V "$RAW_SNP" \
-ts-filter-level 99.9 \
-recal-file "$BQSR_SNP" \
--tranches-file "$TRACE_FILE" \
-O "${PREFIX}.vqsr.SNPs.vcf.gz"
# Filter the final set of SNPs
gatk --java-options "-Xmx8g" VariantFiltration \
-V "${PREFIX}.vqsr.SNPs.vcf.gz" \
-O "${PREFIX}.vqsr.ase.genotype.SNPs.vcf.gz" \
--cluster-window-size 35 \
--cluster-size 3 \
--filter-name "FS" --filter-expression "FS > 30.0" \
--filter-name "QD" --filter-expression "QD < 2.0"