/CSeQTLworkflow

This readme repository contains sample codes for executing steps within our manuscript

Primary LanguageHTML

CSeQTLworkflow

Introduction and Outline

This repository contains template codes for runnings workflow steps including

for our manuscript Cell Type-Specific Expression Quantitative Trait Loci.

Reference Data

GTEx Brain and Blood

Since GTEx samples are available on the NHGRI AnVIL cloud, they were processed using a combination of Docker and WDL with details provided in this repository.

CommonMind Consortium

Click to expand!

Code to obtain CMC inputs

# Install package
install.packages("synapser",
	repos = c("https://sage-bionetworks.github.io/ran","http://cran.fhcrc.org"))

library(synapser)

# Login to Synapse
username = "." # your username
password = "." # your password
synLogin(username,password)

down_dir = "." # user-specified directory to download files

# Function used to download files
entity = "syn4600989" # an example SYN ID unique to a file
synGet(entity = entity,downloadLocation = down_dir)

# List of SYN_IDs, use synGet() to download
"syn4600989" 	# CMC Human sampleIDkey metadata
"syn4600985" 	# QC'd genotype bed file data
"syn4600987" 	# QC'd bim file
"syn4600989" 	# QC fam file
"syn4935690" 	# README file
"syn5600756" 	# list of outlier samples
"syn18080588"	# SampleID key
"syn3354385"	# clinical data
"syn18358379"	# RNAseq meta/QC data

# List of BAM SYN_IDs
tableId = "syn11638850" 
results = synTableQuery(smart_sprintf("select * from %s 
	where species='Human' and dataType='geneExpression'", tableId))
# results$filepath
aa = as.data.frame(results)
aa = aa[grep("bam",aa$name),]
aa = aa[grep("accepted",aa$name),] # excluding unmapped bam files
dim(aa)
saveRDS(aa,"aligned_bam_files.rds")

# Download BAM files one by one
BAM_dir = "./BAM"; dir.create(BAM_dir)
for(samp in sort(unique(aa$specimenID))){
	samp_dir = file.path(BAM_dir,samp)
	dir.create(samp_dir)
	samp_syn_ids = aa$id[which(aa$specimenID == samp)]
	for(samp_syn_id in samp_syn_ids){
		synGet(entity = samp_syn_id,down_dir = samp_dir)
	}
	cat("\n")
}

# SNP6 annotation CSV file
tmp_link = "http://www.affymetrix.com/Auth/analysis"
tmp_link = file.path(tmp_link,"downloads/na35/genotyping")
tmp_link = file.path(tmp_link,"GenomeWideSNP_6.na35.annot.csv.zip")
system(sprintf("wget %s",tmp_link))

Blueprint

Click to expand!
  • Download metadata and BAMs with Pyega3

  • cred_file.json contains user login information

  • To obtain metadata,

     # for example
     dataset=EGAD00001002663 # genotypes
     # dataset=EGAD00001002671 # purified cell type 1
     # dataset=EGAD00001002674 # purified cell type 2
     # dataset=EGAD00001002675 # purified cell type 3
    
     # Download metadata code
     pyega3 -cf cred_file.json files $dataset
  • To obtain BAM files one by one,

     # num threads/cores
     nt=1
    
     # Example file id per BAM
     id=EGAF00001330176
    
     # Download code
     pyega3 -cf cred_file.json -c $nt fetch $id

BAM workflow

Click to expand!
  • BamToFastq

    Click to expand!

    For paired-end reads

     java -Xmx4g -jar picard.jar SamToFastq \
     	INPUT=input.bam FASTQ=output_1.fastq.gz \
     	SECOND_END_FASTQ=output_2.fastq.gz \
     	UNPAIRED_FASTQ=output_unpair.fastq.gz \
     	INCLUDE_NON_PF_READS=true VALIDATION_STRINGENCY=SILENT

    For single-end reads

     java -Xmx4g -jar picard.jar SamToFastq \
     	INPUT=input.bam FASTQ=output.fastq \
     	INCLUDE_NON_PF_READS=true \
     	VALIDATION_STRINGENCY=SILENT
  • Build STAR index

    Click to expand!
     fasta_fn=Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta
     gtf_fn=gencode.v26.GRCh38.ERCC.genes.gtf
     
     STAR --runMode genomeGenerate \
     	--genomeDir ./star_index \
     	--genomeFastaFiles $fasta_fn \
     	--sjdbGTFfile $gtf_fn \
     	--sjdbOverhang 99 --runThreadN 1
  • Run STAR for read alignment

    Click to expand!

    For paired-end reads

     STAR --runMode alignReads \
     	--runThreadN 1 --genomeDir ./star_index --twopassMode Basic \
     	--outFilterMultimapNmax 20 --alignSJoverhangMin 8 \
     	--alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 \
     	--outFilterMismatchNoverLmax 0.1 --alignIntronMin 20 \
     	--alignIntronMax 1000000 --alignMatesGapMax 1000000 \
     	--outFilterType BySJout --outFilterScoreMinOverLread 0.33 \
     	--outFilterMatchNminOverLread 0.33 --limitSjdbInsertNsj 1200000 \
     	--readFilesIn output_1.fastq.gz output_2.fastq.gz \
     	--readFilesCommand zcat --outFileNamePrefix output_hg38 \
     	--outSAMstrandField intronMotif --outFilterIntronMotifs None \
     	--alignSoftClipAtReferenceEnds Yes --quantMode TranscriptomeSAM \
     	GeneCounts --outSAMtype BAM Unsorted --outSAMunmapped Within \
     	--genomeLoad NoSharedMemory --chimSegmentMin 15 \
     	--chimJunctionOverhangMin 15 --chimOutType WithinBAM SoftClip \
     	--chimMainSegmentMultNmax 1 --outSAMattributes NH HI AS nM NM ch \
     	--outSAMattrRGline ID:rg1 SM:sm1

    For single-end reads

     STAR --runMode alignReads \
     	--runThreadN 1 --genomeDir ./star_index --twopassMode Basic \
     	--outFilterMultimapNmax 20 --alignSJoverhangMin 8 \
     	--alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 \
     	--outFilterMismatchNoverLmax 0.1 --alignIntronMin 20 \
     	--alignIntronMax 1000000 --alignMatesGapMax 1000000 \
     	--outFilterType BySJout --outFilterScoreMinOverLread 0.33 \
     	--outFilterMatchNminOverLread 0.33 --limitSjdbInsertNsj 1200000 \
     	--readFilesIn output.fastq.gz --readFilesCommand zcat \
     	--outFileNamePrefix output_hg38 --outSAMstrandField intronMotif \
     	--outFilterIntronMotifs None --alignSoftClipAtReferenceEnds Yes \
     	--quantMode TranscriptomeSAM GeneCounts --outSAMtype BAM Unsorted \
     	--outSAMunmapped Within --genomeLoad NoSharedMemory \
     	--chimSegmentMin 15 --chimJunctionOverhangMin 15 \
     	--chimOutType WithinBAM SoftClip --chimMainSegmentMultNmax 1 \
     	--outSAMattributes NH HI AS nM NM ch \
     	--outSAMattrRGline ID:rg1 SM:sm1
  • MarkDuplicates

    Click to expand!
     # Sort reads by coordinate
     samtools sort -@ 0 -o output_hg38.sortedByCoordinate.bam \
     	output_hg38Aligned.out.bam
     
     # Make bam index
     samtools index -b -@ 1 output_hg38.sortedByCoordinate.bam
     
     # MarkDuplicates
     java -Xmx4g -jar picard.jar MarkDuplicates \
     	INPUT=output_hg38.sortedByCoordinate.bam \
     	OUTPUT=output_hg38.sortedByCoordinate.md.bam \
     	M=out.marked_dup_metrics.txt ASSUME_SORT_ORDER=coordinate

    Process post-markduplicate sorted bam

     # Count num reads in bam
     samtools view -c -@ 0 output_hg38.sortedByCoordinate.md.bam
     
     # Re-index bam
     samtools index -b -@ 1 output_hg38.sortedByCoordinate.md.bam
  • Get TReC and ASReC

    Download asSeq source package

    Click to expand!
     url=https://github.com/Sun-lab/asSeq/raw
     url=$url/master/asSeq_0.99.501.tar.gz
     
     wget $url

    Install R package and run asSeq to get unique reads and filter

    Click to expand!
     install.packages(pkgs = "asSeq_0.99.501.tar.gz",
     	type = "source",repos = NULL)
     
     PE = TRUE 
     	# set TRUE for paired-end samples
     	# set FALSE for single-end
     
     flag1 = Rsamtools::scanBamFlag(isUnmappedQuery = FALSE,
     	isSecondaryAlignment = FALSE,isDuplicate = FALSE,
     	isNotPassingQualityControls = FALSE,
     	isSupplementaryAlignment = FALSE,isProperPair = PE)
     
     param1 = Rsamtools::ScanBamParam(flag = flag1,
     	what = "seq",mapqFilter = 255)
     
     bam_file = "output_hg38.sortedByCoordinate.md.bam"
     bam_filt_fn = "output.filtered.asSeq.bam"
     Rsamtools::filterBam(file = bam_file,
     	destination = bam_filt_fn,
     	param = param1)

    Create exon image file

    Click to expand!
     gtf_fn = "gencode.v26.GRCh38.ERCC.genes.gtf.gz"
     exdb = GenomicFeatures::makeTxDbFromGFF(file = gtf_fn,
     	format = "gtf")
     exons_list_per_gene = GenomicFeatures::exonsBy(exdb,
     	by = "gene")
     
     gtf_rds_fn = "exon_by_genes_gencode_v26.rds"
     saveRDS(exons_list_per_gene,gtf_rds_fn)

    Get total read count (TReC)

    Click to expand!
     genes = readRDS(gtf_rds_fn)
     bamfile = Rsamtools::BamFileList(bam_filt_fn,
     	yieldSize = 1000000)
     se = GenomicAlignments::summarizeOverlaps(features = genes,
     	reads = bamfile,mode = "Union",singleEnd = !PE,
     	ignore.strand = TRUE,fragments = PE)
     ct = as.data.frame(SummarizedExperiment::assay(se))

    Filter reads by Qname

    Click to expand!
     samtools sort -n -o output.filtered.asSeq.sortQ.bam \
     	output.filtered.asSeq.bam

    Extract allele-specific reads, outputs hap1.bam, hap2.bam, hapN.bam

    Click to expand!
     het_snp_fn = "<tab delimited filename of heterozygous SNPs for sample>"
     	# Columns: chr, position, hap1 allele, hap2 allele
     	# no header
     
     bam_filt_sortQ_fn = "output.filtered.asSeq.sortQ"
     asSeq::extractAsReads(input = sprintf("%s.bam",bam_filt_sortQ_fn),
     	snpList = het_snp_fn,min.avgQ = 20,min.snpQ = 20)

    Count allele-specific read counts (ASReC)

    Click to expand!
     se1 = GenomicAlignments::summarizeOverlaps(features = genes,
     	reads = sprintf("%s_hap1.bam",bam_filt_sortQ_fn),mode = "Union",
     	singleEnd = !PE,ignore.strand = TRUE,fragments = PE)
     se2 = GenomicAlignments::summarizeOverlaps(features = genes,
     	reads = sprintf("%s_hap2.bam",bam_filt_sortQ_fn),mode = "Union",
     	singleEnd = !PE,ignore.strand = TRUE,fragments = PE)
     seN = GenomicAlignments::summarizeOverlaps(features = genes,
     	reads = sprintf("%s_hapN.bam",bam_filt_sortQ_fn),mode = "Union",
     	singleEnd = !PE,ignore.strand = TRUE,fragments = PE)

    Save read counts

    Click to expand!
     ct1 = as.data.frame(SummarizedExperiment::assay(se1))
     ct2 = as.data.frame(SummarizedExperiment::assay(se2))
     ctN = as.data.frame(SummarizedExperiment::assay(seN))
     cts = cbind(ct,ct1,ct2,ctN) # trec, hap1, hap2, hapN
     dim(cts); cts[1:2,]
     out_fn = "output.trecase.txt"
     write.table(cts,file = out_fn,quote = FALSE,
     	sep = "\t", eol = "\n")

Deconvolution

Click to expand!
  • Signature expression derived from single cell RNAseq

  • Comments:

    • Transcripts per million (TPM), gene count normalized by exonic gene length and then all genes are normalized by total normalized gene count, then multipled by 1 million
    • Cell sizes: For brain, summation of gene count normalized by exonic gene length. For blood, obtained from EPIC and ICeDT papers.
  • Deconvolution Inputs

    • Bulk RNA-seq TPM (bulk_tpm),
    • scRNA-seq TPM (sig_tpm),
    • cell sizes
  • Template code

    Input variables and objects. Make sure the genes are ordered consistently between sig_tpm and bulk_tpm

     work_dir = "." # working directory
     setwd(work_dir)
     
     sig_tpm 	# TPM signature expression matrix
     bulk_tpm 	# TPM bulk expression matrix
     

    CIBERSORT: [Paper, Software]

     sig_fn = file.path(work_dir,"signature.txt")
     mix_fn = file.path(work_dir,"mixture.txt")
     write.table(cbind(rowname=rownames(sig_tpm),sig_tpm),
     	file = sig_fn,sep = "\t",quote = FALSE,row.names = FALSE)
     write.table(cbind(rowname=rownames(bulk_tpm),bulk_tpm),
     	file = mix_fn,sep = "\t",quote = FALSE,row.names = FALSE)
     
     source("CIBERSORT.R") # obtained from CIBERSORT website
     results = CIBERSORT(sig_matrix = sig_fn,mixture_file = mix_fn,
     	perm = 0,QN = FALSE,absolute = FALSE,abs_method = 'sig.score',
     	filename = "DECON")
     unlink(x = c(sig_fn,mix_fn))
     ciber_fn = sprintf("CIBERSORT-Results_%s.txt","DECON")
     unlink(ciber_fn)
     QQ = ncol(sig_tpm)
     
     # Extract proportion of transcripts per cell type per sample
     pp_bar_ciber = results[,seq(QQ)]
     
     # Calculate proportion of cell types per sample
     pp_hat_ciber = t(apply(pp_bar_ciber,1,function(xx){
     	yy = xx / cell_sizes; yy / sum(yy)
     }))
     

    ICeDT: [Paper, Software]

     fit = ICeDT::ICeDT(Y = bulk_tpm,Z = sig_tpm,
     	tumorPurity = rep(0,ncol(bulk_tpm)),refVar = NULL,
     	rhoInit = NULL,maxIter_prop = 4e3,
     	maxIter_PP = 4e3,rhoConverge = 1e-2)
     
     # Extract proportion of transcripts per cell type per sample
     pp_bar_icedt = t(fit$rho)[,-1]
     
     # Calculate proportion of cell type per sample
     pp_hat_icedt = t(apply(pp_bar_icedt,1,function(xx){
     	yy = xx / cell_sizes; yy / sum(yy)
     }))
     

eQTL mapping

Click to expand!
  • BULK mode: If running bulk analyses, set RHO to a matrix with N rows and 1 column and set XX_trecPC to residual TReC PCs calculated without accounting for cell types.

  • Cell type-specific (CTS) mode: If running cell type-specific analyses, set RHO to estimated cell type proportions and set XX_trecPC to proportion-adjusted residual TReC PCs.

  • Example codes

    Click to expand!

    Inputs

     devtools::install_github("pllittle/smarter")
     devtools::install_github("pllittle/CSeQTL")
     
     # Sample-specific variables
     RHO 			# cell type proportions matrix
     XX_base 	# baseline covariates, continuous variables centered
     XX_genoPC 	# genotype PCs, centered
     XX_trecPC 	# residual TReC PCs, centered
     XX = cbind(Int = 1,XX_base,XX_genoPC,XX_trecPC)
     
     # Gene and sample variables
     TREC 	# TReC vector
     SNP	# phased genotype vector
     hap2	# 2nd haplotype counts
     ASREC 	# total haplotype counts = hap1 + hap2
     PHASE 	# Indicator vector of whether or not to use haplotype counts
     
     # Tuning arguments
     trim 		# TRUE for trimmed analysis, FALSE for untrimmed
     thres_TRIM 	# if trim = TRUE, the Cooks' distance cutoff to trim sample TReCs
     ncores 	# number of threads/cores to parallelize the loop, improve runtime
     
     # ASREC-related cutoffs to satisfy to use allele-specific read counts
     numAS 		# cutoff to determine if a sample has sufficient read counts
     numASn 	# cutoff of samples having ASREC >= numAS
     numAS_het 	# minimum number for sum(ASREC >= numAS & SNP %in% c(1,2))
     cistrans 	# p-value cutoff on cis/trans testing to determine which p-value 
     		# (TReC-only or TReC+ASReC) to report

    eQTL mapping for one gene

     gs_out = CSeQTL_GS(XX = XX,TREC = TREC,SNP = SNP,hap2 = hap2,
     	ASREC = ASREC,PHASE = PHASE,RHO = RHO,trim = trim,
     	thres_TRIM = 20,numAS = 5,numASn = 5,numAS_het = 5,
     	cistrans = 0.01,ncores = ncores,show = TRUE)
     

    Hypothesis testing output. Matrices where rows are genomic loci and columns are cell types for CTS mode or a single column for BULK mode.

     # TReC-only likelihood ratio test statistics with 1 DF
     gs_out$LRT_trec
     
     # TReC+ASReC likelihood ratio test statistics with 1 DF
     gs_out$LRT_trecase
     
     # Cis/Trans likelihood ratio test statistics with 1 DF
     gs_out$LRT_cistrans
     

Enrichment

Click to expand!
  • Functional Enrichment with Torus with example code provided below based on this markdown with examples and documentation.

     torus -d eqtls.gz \
     	-smap snpMap.gz \
     	-gmap geneMap.gz \
     	-annot annot.gz \
     	-est > output_enrich
  • GWAS Enrichment with Jackknife-based inference

     library(CSeQTL)
     
     work_dir = "." # specify a work directory
     
     # Download GWAS catalog
     gwas = CSeQTL:::get_GWAS_catalog(work_dir = work_dir)

    Inputs

    • DATA: R data.frame containing headers 'Chr', 'POS', and columns leading with 'EE_' such as 'EE_Astrocyte', 'EE_Excitatory' corresponding to eQTL binary indicators 0 or 1. Rows correspond to gene/SNP pairings
    • which_gwas: Either 'gwas_catalog' or some specific grouped phenotype
    • nBLOCKS: Specify the block size for jackknife estimation and inference

    Code

     # Run GWAS enrichment
     enrich_final = c()
     
     ## Across all phenotypes
     enrich_catalog = CSeQTL:::run_gwasEnrich_analysis(DATA = DATA,
     	work_dir = work_dir,which_gwas = "gwas_catalog",
     	nBLOCKS = 200,verbose = TRUE)
     enrich_final = rbind(enrich_final,enrich_catalog)
     
     ## Per grouped phenotype
     all_phenos = unique(gwas$myPHENO)
     all_phenos = all_phenos[all_phenos != "gwas_catalog"]
     for(pheno in all_phenos){
     	enrich_pheno = CSeQTL:::run_gwasEnrich_analysis(DATA = DATA,
     		work_dir = work_dir,which_gwas = pheno,
     		nBLOCKS = 200,verbose = TRUE)
     	enrich_final = rbind(enrich_final,enrich_pheno)
     }