CSeQTLworkflow

Introduction and Outline

This repository contains template codes for runnings workflow steps including

for our manuscript Cell Type-Specific Expression Quantitative Trait Loci.

Reference Data

GTEx Brain and Blood

Since GTEx samples are available on the NHGRI AnVIL cloud, they were processed using a combination of Docker and WDL with details provided in this repository.

CommonMind Consortium

Click to expand!

Code to obtain CMC inputs

# Install package
install.packages("synapser",
	repos = c("https://sage-bionetworks.github.io/ran","http://cran.fhcrc.org"))

library(synapser)

# Login to Synapse
username = "." # your username
password = "." # your password
synLogin(username,password)

down_dir = "." # user-specified directory to download files

# Function used to download files
entity = "syn4600989" # an example SYN ID unique to a file
synGet(entity = entity,downloadLocation = down_dir)

# List of SYN_IDs, use synGet() to download
"syn4600989" 	# CMC Human sampleIDkey metadata
"syn4600985" 	# QC'd genotype bed file data
"syn4600987" 	# QC'd bim file
"syn4600989" 	# QC fam file
"syn4935690" 	# README file
"syn5600756" 	# list of outlier samples
"syn18080588"	# SampleID key
"syn3354385"	# clinical data
"syn18358379"	# RNAseq meta/QC data

# List of BAM SYN_IDs
tableId = "syn11638850" 
results = synTableQuery(smart_sprintf("select * from %s 
	where species='Human' and dataType='geneExpression'", tableId))
# results$filepath
aa = as.data.frame(results)
aa = aa[grep("bam",aa$name),]
aa = aa[grep("accepted",aa$name),] # excluding unmapped bam files
dim(aa)
saveRDS(aa,"aligned_bam_files.rds")

# Download BAM files one by one
BAM_dir = "./BAM"; dir.create(BAM_dir)
for(samp in sort(unique(aa$specimenID))){
	samp_dir = file.path(BAM_dir,samp)
	dir.create(samp_dir)
	samp_syn_ids = aa$id[which(aa$specimenID == samp)]
	for(samp_syn_id in samp_syn_ids){
		synGet(entity = samp_syn_id,down_dir = samp_dir)
	}
	cat("\n")
}

# SNP6 annotation CSV file
tmp_link = "http://www.affymetrix.com/Auth/analysis"
tmp_link = file.path(tmp_link,"downloads/na35/genotyping")
tmp_link = file.path(tmp_link,"GenomeWideSNP_6.na35.annot.csv.zip")
system(sprintf("wget %s",tmp_link))

Blueprint

Click to expand!

Download metadata and BAMs with Pyega3
cred_file.json contains user login information

To obtain metadata,

 # for example
 dataset=EGAD00001002663 # genotypes
 # dataset=EGAD00001002671 # purified cell type 1
 # dataset=EGAD00001002674 # purified cell type 2
 # dataset=EGAD00001002675 # purified cell type 3

 # Download metadata code
 pyega3 -cf cred_file.json files $dataset

To obtain BAM files one by one,

 # num threads/cores
 nt=1

 # Example file id per BAM
 id=EGAF00001330176

 # Download code
 pyega3 -cf cred_file.json -c $nt fetch $id

BAM workflow

Click to expand!

BamToFastq

Click to expand!

For paired-end reads

 java -Xmx4g -jar picard.jar SamToFastq \
 	INPUT=input.bam FASTQ=output_1.fastq.gz \
 	SECOND_END_FASTQ=output_2.fastq.gz \
 	UNPAIRED_FASTQ=output_unpair.fastq.gz \
 	INCLUDE_NON_PF_READS=true VALIDATION_STRINGENCY=SILENT

For single-end reads

 java -Xmx4g -jar picard.jar SamToFastq \
 	INPUT=input.bam FASTQ=output.fastq \
 	INCLUDE_NON_PF_READS=true \
 	VALIDATION_STRINGENCY=SILENT

Build STAR index

Click to expand!

 fasta_fn=Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta
 gtf_fn=gencode.v26.GRCh38.ERCC.genes.gtf
 
 STAR --runMode genomeGenerate \
 	--genomeDir ./star_index \
 	--genomeFastaFiles $fasta_fn \
 	--sjdbGTFfile $gtf_fn \
 	--sjdbOverhang 99 --runThreadN 1

Run STAR for read alignment

Click to expand!

For paired-end reads

 STAR --runMode alignReads \
 	--runThreadN 1 --genomeDir ./star_index --twopassMode Basic \
 	--outFilterMultimapNmax 20 --alignSJoverhangMin 8 \
 	--alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 \
 	--outFilterMismatchNoverLmax 0.1 --alignIntronMin 20 \
 	--alignIntronMax 1000000 --alignMatesGapMax 1000000 \
 	--outFilterType BySJout --outFilterScoreMinOverLread 0.33 \
 	--outFilterMatchNminOverLread 0.33 --limitSjdbInsertNsj 1200000 \
 	--readFilesIn output_1.fastq.gz output_2.fastq.gz \
 	--readFilesCommand zcat --outFileNamePrefix output_hg38 \
 	--outSAMstrandField intronMotif --outFilterIntronMotifs None \
 	--alignSoftClipAtReferenceEnds Yes --quantMode TranscriptomeSAM \
 	GeneCounts --outSAMtype BAM Unsorted --outSAMunmapped Within \
 	--genomeLoad NoSharedMemory --chimSegmentMin 15 \
 	--chimJunctionOverhangMin 15 --chimOutType WithinBAM SoftClip \
 	--chimMainSegmentMultNmax 1 --outSAMattributes NH HI AS nM NM ch \
 	--outSAMattrRGline ID:rg1 SM:sm1

For single-end reads

 STAR --runMode alignReads \
 	--runThreadN 1 --genomeDir ./star_index --twopassMode Basic \
 	--outFilterMultimapNmax 20 --alignSJoverhangMin 8 \
 	--alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 \
 	--outFilterMismatchNoverLmax 0.1 --alignIntronMin 20 \
 	--alignIntronMax 1000000 --alignMatesGapMax 1000000 \
 	--outFilterType BySJout --outFilterScoreMinOverLread 0.33 \
 	--outFilterMatchNminOverLread 0.33 --limitSjdbInsertNsj 1200000 \
 	--readFilesIn output.fastq.gz --readFilesCommand zcat \
 	--outFileNamePrefix output_hg38 --outSAMstrandField intronMotif \
 	--outFilterIntronMotifs None --alignSoftClipAtReferenceEnds Yes \
 	--quantMode TranscriptomeSAM GeneCounts --outSAMtype BAM Unsorted \
 	--outSAMunmapped Within --genomeLoad NoSharedMemory \
 	--chimSegmentMin 15 --chimJunctionOverhangMin 15 \
 	--chimOutType WithinBAM SoftClip --chimMainSegmentMultNmax 1 \
 	--outSAMattributes NH HI AS nM NM ch \
 	--outSAMattrRGline ID:rg1 SM:sm1

MarkDuplicates

Click to expand!

 # Sort reads by coordinate
 samtools sort -@ 0 -o output_hg38.sortedByCoordinate.bam \
 	output_hg38Aligned.out.bam
 
 # Make bam index
 samtools index -b -@ 1 output_hg38.sortedByCoordinate.bam
 
 # MarkDuplicates
 java -Xmx4g -jar picard.jar MarkDuplicates \
 	INPUT=output_hg38.sortedByCoordinate.bam \
 	OUTPUT=output_hg38.sortedByCoordinate.md.bam \
 	M=out.marked_dup_metrics.txt ASSUME_SORT_ORDER=coordinate

Process post-markduplicate sorted bam

 # Count num reads in bam
 samtools view -c -@ 0 output_hg38.sortedByCoordinate.md.bam
 
 # Re-index bam
 samtools index -b -@ 1 output_hg38.sortedByCoordinate.md.bam

Get TReC and ASReC

Download asSeq source package

Click to expand!

 url=https://github.com/Sun-lab/asSeq/raw
 url=$url/master/asSeq_0.99.501.tar.gz
 
 wget $url

Install R package and run asSeq to get unique reads and filter

Click to expand!

 install.packages(pkgs = "asSeq_0.99.501.tar.gz",
 	type = "source",repos = NULL)
 
 PE = TRUE 
 	# set TRUE for paired-end samples
 	# set FALSE for single-end
 
 flag1 = Rsamtools::scanBamFlag(isUnmappedQuery = FALSE,
 	isSecondaryAlignment = FALSE,isDuplicate = FALSE,
 	isNotPassingQualityControls = FALSE,
 	isSupplementaryAlignment = FALSE,isProperPair = PE)
 
 param1 = Rsamtools::ScanBamParam(flag = flag1,
 	what = "seq",mapqFilter = 255)
 
 bam_file = "output_hg38.sortedByCoordinate.md.bam"
 bam_filt_fn = "output.filtered.asSeq.bam"
 Rsamtools::filterBam(file = bam_file,
 	destination = bam_filt_fn,
 	param = param1)

Create exon image file

Click to expand!

 gtf_fn = "gencode.v26.GRCh38.ERCC.genes.gtf.gz"
 exdb = GenomicFeatures::makeTxDbFromGFF(file = gtf_fn,
 	format = "gtf")
 exons_list_per_gene = GenomicFeatures::exonsBy(exdb,
 	by = "gene")
 
 gtf_rds_fn = "exon_by_genes_gencode_v26.rds"
 saveRDS(exons_list_per_gene,gtf_rds_fn)

Get total read count (TReC)

Click to expand!

 genes = readRDS(gtf_rds_fn)
 bamfile = Rsamtools::BamFileList(bam_filt_fn,
 	yieldSize = 1000000)
 se = GenomicAlignments::summarizeOverlaps(features = genes,
 	reads = bamfile,mode = "Union",singleEnd = !PE,
 	ignore.strand = TRUE,fragments = PE)
 ct = as.data.frame(SummarizedExperiment::assay(se))

Filter reads by Qname

Click to expand!

 samtools sort -n -o output.filtered.asSeq.sortQ.bam \
 	output.filtered.asSeq.bam

Extract allele-specific reads, outputs hap1.bam, hap2.bam, hapN.bam

Click to expand!

 het_snp_fn = "<tab delimited filename of heterozygous SNPs for sample>"
 	# Columns: chr, position, hap1 allele, hap2 allele
 	# no header
 
 bam_filt_sortQ_fn = "output.filtered.asSeq.sortQ"
 asSeq::extractAsReads(input = sprintf("%s.bam",bam_filt_sortQ_fn),
 	snpList = het_snp_fn,min.avgQ = 20,min.snpQ = 20)

Count allele-specific read counts (ASReC)

Click to expand!

 se1 = GenomicAlignments::summarizeOverlaps(features = genes,
 	reads = sprintf("%s_hap1.bam",bam_filt_sortQ_fn),mode = "Union",
 	singleEnd = !PE,ignore.strand = TRUE,fragments = PE)
 se2 = GenomicAlignments::summarizeOverlaps(features = genes,
 	reads = sprintf("%s_hap2.bam",bam_filt_sortQ_fn),mode = "Union",
 	singleEnd = !PE,ignore.strand = TRUE,fragments = PE)
 seN = GenomicAlignments::summarizeOverlaps(features = genes,
 	reads = sprintf("%s_hapN.bam",bam_filt_sortQ_fn),mode = "Union",
 	singleEnd = !PE,ignore.strand = TRUE,fragments = PE)

Save read counts

Click to expand!

 ct1 = as.data.frame(SummarizedExperiment::assay(se1))
 ct2 = as.data.frame(SummarizedExperiment::assay(se2))
 ctN = as.data.frame(SummarizedExperiment::assay(seN))
 cts = cbind(ct,ct1,ct2,ctN) # trec, hap1, hap2, hapN
 dim(cts); cts[1:2,]
 out_fn = "output.trecase.txt"
 write.table(cts,file = out_fn,quote = FALSE,
 	sep = "\t", eol = "\n")

Deconvolution

Click to expand!

Signature expression derived from single cell RNAseq
- Middle Temporaral Gyrus
- Pipeline/Workflow to perform quality control on samples and genes, perform dimension reduction, clustering, and compare cluster assignments to existing cell type labeling. The signature matrix for brain that we have used is signature_MTG.rds
- Downstream differential expression analysis
Comments:
- Transcripts per million (TPM), gene count normalized by exonic gene length and then all genes are normalized by total normalized gene count, then multipled by 1 million
- Cell sizes: For brain, summation of gene count normalized by exonic gene length. For blood, obtained from EPIC and ICeDT papers.
Deconvolution Inputs
- Bulk RNA-seq TPM (bulk_tpm),
- scRNA-seq TPM (sig_tpm),
- cell sizes

Template code

Input variables and objects. Make sure the genes are ordered consistently between sig_tpm and bulk_tpm

 work_dir = "." # working directory
 setwd(work_dir)
 
 sig_tpm 	# TPM signature expression matrix
 bulk_tpm 	# TPM bulk expression matrix

CIBERSORT: [Paper, Software]

 sig_fn = file.path(work_dir,"signature.txt")
 mix_fn = file.path(work_dir,"mixture.txt")
 write.table(cbind(rowname=rownames(sig_tpm),sig_tpm),
 	file = sig_fn,sep = "\t",quote = FALSE,row.names = FALSE)
 write.table(cbind(rowname=rownames(bulk_tpm),bulk_tpm),
 	file = mix_fn,sep = "\t",quote = FALSE,row.names = FALSE)
 
 source("CIBERSORT.R") # obtained from CIBERSORT website
 results = CIBERSORT(sig_matrix = sig_fn,mixture_file = mix_fn,
 	perm = 0,QN = FALSE,absolute = FALSE,abs_method = 'sig.score',
 	filename = "DECON")
 unlink(x = c(sig_fn,mix_fn))
 ciber_fn = sprintf("CIBERSORT-Results_%s.txt","DECON")
 unlink(ciber_fn)
 QQ = ncol(sig_tpm)
 
 # Extract proportion of transcripts per cell type per sample
 pp_bar_ciber = results[,seq(QQ)]
 
 # Calculate proportion of cell types per sample
 pp_hat_ciber = t(apply(pp_bar_ciber,1,function(xx){
 	yy = xx / cell_sizes; yy / sum(yy)
 }))

ICeDT: [Paper, Software]

 fit = ICeDT::ICeDT(Y = bulk_tpm,Z = sig_tpm,
 	tumorPurity = rep(0,ncol(bulk_tpm)),refVar = NULL,
 	rhoInit = NULL,maxIter_prop = 4e3,
 	maxIter_PP = 4e3,rhoConverge = 1e-2)
 
 # Extract proportion of transcripts per cell type per sample
 pp_bar_icedt = t(fit$rho)[,-1]
 
 # Calculate proportion of cell type per sample
 pp_hat_icedt = t(apply(pp_bar_icedt,1,function(xx){
 	yy = xx / cell_sizes; yy / sum(yy)
 }))

eQTL mapping

Click to expand!

BULK mode: If running bulk analyses, set RHO to a matrix with N rows and 1 column and set XX_trecPC to residual TReC PCs calculated without accounting for cell types.
Cell type-specific (CTS) mode: If running cell type-specific analyses, set RHO to estimated cell type proportions and set XX_trecPC to proportion-adjusted residual TReC PCs.

Example codes

Click to expand!

Inputs

 devtools::install_github("pllittle/smarter")
 devtools::install_github("pllittle/CSeQTL")
 
 # Sample-specific variables
 RHO 			# cell type proportions matrix
 XX_base 	# baseline covariates, continuous variables centered
 XX_genoPC 	# genotype PCs, centered
 XX_trecPC 	# residual TReC PCs, centered
 XX = cbind(Int = 1,XX_base,XX_genoPC,XX_trecPC)
 
 # Gene and sample variables
 TREC 	# TReC vector
 SNP	# phased genotype vector
 hap2	# 2nd haplotype counts
 ASREC 	# total haplotype counts = hap1 + hap2
 PHASE 	# Indicator vector of whether or not to use haplotype counts
 
 # Tuning arguments
 trim 		# TRUE for trimmed analysis, FALSE for untrimmed
 thres_TRIM 	# if trim = TRUE, the Cooks' distance cutoff to trim sample TReCs
 ncores 	# number of threads/cores to parallelize the loop, improve runtime
 
 # ASREC-related cutoffs to satisfy to use allele-specific read counts
 numAS 		# cutoff to determine if a sample has sufficient read counts
 numASn 	# cutoff of samples having ASREC >= numAS
 numAS_het 	# minimum number for sum(ASREC >= numAS & SNP %in% c(1,2))
 cistrans 	# p-value cutoff on cis/trans testing to determine which p-value 
 		# (TReC-only or TReC+ASReC) to report

eQTL mapping for one gene

 gs_out = CSeQTL_GS(XX = XX,TREC = TREC,SNP = SNP,hap2 = hap2,
 	ASREC = ASREC,PHASE = PHASE,RHO = RHO,trim = trim,
 	thres_TRIM = 20,numAS = 5,numASn = 5,numAS_het = 5,
 	cistrans = 0.01,ncores = ncores,show = TRUE)

Hypothesis testing output. Matrices where rows are genomic loci and columns are cell types for CTS mode or a single column for BULK mode.

 # TReC-only likelihood ratio test statistics with 1 DF
 gs_out$LRT_trec
 
 # TReC+ASReC likelihood ratio test statistics with 1 DF
 gs_out$LRT_trecase
 
 # Cis/Trans likelihood ratio test statistics with 1 DF
 gs_out$LRT_cistrans

Enrichment

Click to expand!

Functional Enrichment with Torus with example code provided below based on this markdown with examples and documentation.
```
 torus -d eqtls.gz \
 	-smap snpMap.gz \
 	-gmap geneMap.gz \
 	-annot annot.gz \
 	-est > output_enrich
```

GWAS Enrichment with Jackknife-based inference

 library(CSeQTL)
 
 work_dir = "." # specify a work directory
 
 # Download GWAS catalog
 gwas = CSeQTL:::get_GWAS_catalog(work_dir = work_dir)

Inputs

DATA: R data.frame containing headers 'Chr', 'POS', and columns leading with 'EE_' such as 'EE_Astrocyte', 'EE_Excitatory' corresponding to eQTL binary indicators 0 or 1. Rows correspond to gene/SNP pairings
which_gwas: Either 'gwas_catalog' or some specific grouped phenotype
nBLOCKS: Specify the block size for jackknife estimation and inference

Code

 # Run GWAS enrichment
 enrich_final = c()
 
 ## Across all phenotypes
 enrich_catalog = CSeQTL:::run_gwasEnrich_analysis(DATA = DATA,
 	work_dir = work_dir,which_gwas = "gwas_catalog",
 	nBLOCKS = 200,verbose = TRUE)
 enrich_final = rbind(enrich_final,enrich_catalog)
 
 ## Per grouped phenotype
 all_phenos = unique(gwas$myPHENO)
 all_phenos = all_phenos[all_phenos != "gwas_catalog"]
 for(pheno in all_phenos){
 	enrich_pheno = CSeQTL:::run_gwasEnrich_analysis(DATA = DATA,
 		work_dir = work_dir,which_gwas = pheno,
 		nBLOCKS = 200,verbose = TRUE)
 	enrich_final = rbind(enrich_final,enrich_pheno)
 }

pllittle/CSeQTLworkflow

CSeQTLworkflow

Introduction and Outline

Reference Data

GTEx Brain and Blood

CommonMind Consortium

Blueprint

BAM workflow

Deconvolution

eQTL mapping

Enrichment