Extract new clinical gene sets from the literature

snRNAseq MDD https://pubmed.ncbi.nlm.nih.gov/32341540/
snRNAseq ASD https://pubmed.ncbi.nlm.nih.gov/31097668/
DOUBLE CHECK: bulk sex differences SCZ https://www.biologicalpsychiatryjournal.com/article/S0006-3223(21)01180-X/fulltext
snRNAseq SCZ https://www.medrxiv.org/content/10.1101/2020.11.06.20225342v1.full
snRNAseq and spatial SCZ https://www.biorxiv.org/content/10.1101/2020.11.17.386458v2
Literature search for other new datasets that have been published since 2020 (single cell or bulk)
Find relevant supplementary tables for differentially expressed genes between cases and controls (and by cell type for snRNAseq)
Extract gene lists and decide on a cutoff value for significant genes to include for registration with the spatial data. @lcolladotor might be able to help with choosing an appropriate cutoff (this might also be relevant to @shkwon17 for the AD project).
How do we use a gene set when the authors don't provide Ensembl IDs? Use the maps_ids function?
Read the ASD study I added and figure out how they generated their gene sets. Do they have them by cluster, and could we then make an entirely separate heat map comparing our data to theirs?
drop double white matter clusters and meninges cluster?? Does the enrichment use the top 100 genes and should we use more?

@abspangler13 where were you doing this and how far along are you?

I performed it for the k = 9 dataset against all of the datasets from the pilot study and two new datasets that I added. Here's the code for the two new datasets I added, as well as some comments about two sets we were interested in adding.

spatialDLPFC/code/analysis/10_Clinical_Gene_Set_Enrichment/01_Clinical_Gene_Set_Enrichment.R

Lines 251 to 306 in ea6c27c

    
           ##############
           #### Nagy sn_rna_seq in dlpfc in MDD
           #### https://www.nature.com/articles/s41593-020-0621-y
           #### sup table 6 is marker genes
           #### sup table 32 is DEGs
           #### file = /dcs04/lieber/lcolladotor/spatialDLPFC_LIBD4035/spatialDLPFC/processed-data/rdata/spe/10_clinical_gene_set_enrichment/41593_2020_621_MOESM3_ESM.xlsx
           #############

           # Read the DEG sheet ("Supplementary Table 32"), skipping the first two
           # rows of the sheet, and convert the result to a base data.frame.
           mdd <- as.data.frame(read_excel(
               "/dcs04/lieber/lcolladotor/spatialDLPFC_LIBD4035/spatialDLPFC/processed-data/rdata/spe/10_clinical_gene_set_enrichment/41593_2020_621_MOESM3_ESM.xlsx",
               sheet = "Supplementary Table 32",
               skip = 2
           ))

           # The table is keyed by gene symbol (mdd$Gene), so look up the matching
           # ENSEMBL / ENTREZ IDs. The argument is `keytype` (singular): a misspelled
           # `keytypes` is not matched to it, so the lookup would not be keyed on
           # SYMBOL. NA symbols are dropped because they cannot be looked up.
           mdd_symbols <- unique(as.character(mdd$Gene))
           mdd_symbols <- mdd_symbols[!is.na(mdd_symbols)]
           ens4 <- select(org.Hs.eg.db,
               keys = mdd_symbols,
               columns = c("ENSEMBL", "ENTREZID", "SYMBOL"),
               keytype = "SYMBOL"
           )

           # Attach an ENSEMBL ID to every row of the DEG table. A symbol can map to
           # several ENSEMBL IDs; match() keeps the first one returned and leaves NA
           # for symbols without a mapping.
           # NOTE(review): confirm that "first match" is acceptable for multi-mapping symbols.
           mdd$ensembl_gene_id <- ens4$ENSEMBL[match(mdd$Gene, ens4$SYMBOL)]

           # The entries below are a commented-out template that references ASD/BD/SCZ
           # columns, so this currently evaluates to an empty list.
           # TODO: replace with MDD-specific up/down sets once a cutoff is chosen.
           mdd_geneList <- with(
               mdd,
               list(
                   # DE_PE_ASD.Up = ensembl_gene_id[ASD.t.value > 0 & ASD.fdr < 0.05],
                   # DE_PE_ASD.Down = ensembl_gene_id[ASD.t.value < 0 & ASD.fdr < 0.05],
                   # DE_PE_BD.Up = ensembl_gene_id[BD.t.value > 0 & BD.fdr < 0.05],
                   # DE_PE_BD.Down = ensembl_gene_id[BD.t.value < 0 & BD.fdr < 0.05],
                   # DE_PE_SCZ.Up = ensembl_gene_id[SCZ.t.value > 0 & SCZ.fdr < 0.05],
                   # DE_PE_SCZ.Down = ensembl_gene_id[SCZ.t.value < 0 & SCZ.fdr < 0.05]
               )
           )
        
           ##############
           ### snRNAseq ASD
           ### https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7678724/#SD5
           ### data S4 is DEGS
           ### data S3 is marker genes
           ### file = /dcs04/lieber/lcolladotor/spatialDLPFC_LIBD4035/spatialDLPFC/processed-data/rdata/spe/10_clinical_gene_set_enrichment/NIHMS1053005-supplement-Data_S4.xls
           #############

           # Read the "ASD_DEGs" sheet as a base data.frame, then standardize the
           # column names so they can be accessed as gene_id, fold_change and q_value.
           asd_rnaseq <- as.data.frame(read_excel(
               "/dcs04/lieber/lcolladotor/spatialDLPFC_LIBD4035/spatialDLPFC/processed-data/rdata/spe/10_clinical_gene_set_enrichment/NIHMS1053005-supplement-Data_S4.xls",
               sheet = "ASD_DEGs"
           ))
           asd_rnaseq <- clean_names(asd_rnaseq)

           # Split the significant DEGs (q < 0.05) by direction of the fold change.
           # The column is spelled out in full (`fold_change`) rather than relying on
           # `$` partial matching, and which() keeps rows with an NA fold change or
           # q-value from adding NA gene IDs to the lists.
           asdRNA_geneList <- list(
               DE_ASD_RNA.Up = asd_rnaseq$gene_id[
                   which(asd_rnaseq$fold_change > 0 & asd_rnaseq$q_value < 0.05)
               ],
               DE_ASD_RNA.Down = asd_rnaseq$gene_id[
                   which(asd_rnaseq$fold_change < 0 & asd_rnaseq$q_value < 0.05)
               ]
           )
        
           ############ 
        
           ### https://www.medrxiv.org/content/10.1101/2020.11.06.20225342v1.full 
        
           ### snRNAseq SCZ 
        
           ### supplementary table 6 is DEGS, can't figure out how to download 
        
           ########## 
        
           ########## 
        
           ### snRNAseq and spatial SCZ 
        
           ### https://www.biorxiv.org/content/10.1101/2020.11.17.386458v2 
        
           ### supplementary table 2 is marker genes 
        
           ### supplementary table 4 is DEGs, can't figure out how to download 
        
           ###########

@kmaynard12 is going to work on this. @lahuuki, I wrote https://github.com/LieberInstitute/spatialDLPFC/tree/main/code/analysis/10_clinical_gene_set_enrichment in such a way that you would need to make 2 new scripts. One for extracting the gene IDs (ENSEMBL IDs) from the different tables @kmaynard12 will select, then another one for computing the odds ratio + making the heatmaps.

@lahuuki check https://jhu-genomics.slack.com/archives/C044681R5H6/p1669916306967689

@kmaynard12 are there other case/control snRNA-seq datasets beyond the PEC ones we should be looking at?

Related to https://jhu-genomics.slack.com/archives/C01EA7VDJNT/p1673285746357859

@lahuuki I think that we can close this issue, right?

	##############
	#### Nagy sn_rna_seq in dlpfc in MDD
	#### https://www.nature.com/articles/s41593-020-0621-y
	#### sup table 6 is marker genes
	#### sup table 32 is DEGs
	#### file = /dcs04/lieber/lcolladotor/spatialDLPFC_LIBD4035/spatialDLPFC/processed-data/rdata/spe/10_clinical_gene_set_enrichment/41593_2020_621_MOESM3_ESM.xlsx
	#############

	# Read the DEG sheet ("Supplementary Table 32"), skipping the first two
	# rows of the sheet, and convert the result to a base data.frame.
	mdd <- as.data.frame(read_excel(
		"/dcs04/lieber/lcolladotor/spatialDLPFC_LIBD4035/spatialDLPFC/processed-data/rdata/spe/10_clinical_gene_set_enrichment/41593_2020_621_MOESM3_ESM.xlsx",
		sheet = "Supplementary Table 32",
		skip = 2
	))

	# The table is keyed by gene symbol (mdd$Gene), so look up the matching
	# ENSEMBL / ENTREZ IDs. The argument is `keytype` (singular): a misspelled
	# `keytypes` is not matched to it, so the lookup would not be keyed on
	# SYMBOL. NA symbols are dropped because they cannot be looked up.
	mdd_symbols <- unique(as.character(mdd$Gene))
	mdd_symbols <- mdd_symbols[!is.na(mdd_symbols)]
	ens4 <- select(org.Hs.eg.db,
		keys = mdd_symbols,
		columns = c("ENSEMBL", "ENTREZID", "SYMBOL"),
		keytype = "SYMBOL"
	)

	# Attach an ENSEMBL ID to every row of the DEG table. A symbol can map to
	# several ENSEMBL IDs; match() keeps the first one returned and leaves NA
	# for symbols without a mapping.
	# NOTE(review): confirm that "first match" is acceptable for multi-mapping symbols.
	mdd$ensembl_gene_id <- ens4$ENSEMBL[match(mdd$Gene, ens4$SYMBOL)]

	# The entries below are a commented-out template that references ASD/BD/SCZ
	# columns, so this currently evaluates to an empty list.
	# TODO: replace with MDD-specific up/down sets once a cutoff is chosen.
	mdd_geneList <- with(
		mdd,
		list(
			# DE_PE_ASD.Up = ensembl_gene_id[ASD.t.value > 0 & ASD.fdr < 0.05],
			# DE_PE_ASD.Down = ensembl_gene_id[ASD.t.value < 0 & ASD.fdr < 0.05],
			# DE_PE_BD.Up = ensembl_gene_id[BD.t.value > 0 & BD.fdr < 0.05],
			# DE_PE_BD.Down = ensembl_gene_id[BD.t.value < 0 & BD.fdr < 0.05],
			# DE_PE_SCZ.Up = ensembl_gene_id[SCZ.t.value > 0 & SCZ.fdr < 0.05],
			# DE_PE_SCZ.Down = ensembl_gene_id[SCZ.t.value < 0 & SCZ.fdr < 0.05]
		)
	)

	##############
	### snRNAseq ASD
	### https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7678724/#SD5
	### data S4 is DEGS
	### data S3 is marker genes
	### file = /dcs04/lieber/lcolladotor/spatialDLPFC_LIBD4035/spatialDLPFC/processed-data/rdata/spe/10_clinical_gene_set_enrichment/NIHMS1053005-supplement-Data_S4.xls
	#############

	# Read the "ASD_DEGs" sheet as a base data.frame, then standardize the
	# column names so they can be accessed as gene_id, fold_change and q_value.
	asd_rnaseq <- as.data.frame(read_excel(
		"/dcs04/lieber/lcolladotor/spatialDLPFC_LIBD4035/spatialDLPFC/processed-data/rdata/spe/10_clinical_gene_set_enrichment/NIHMS1053005-supplement-Data_S4.xls",
		sheet = "ASD_DEGs"
	))
	asd_rnaseq <- clean_names(asd_rnaseq)

	# Split the significant DEGs (q < 0.05) by direction of the fold change.
	# The column is spelled out in full (`fold_change`) rather than relying on
	# `$` partial matching, and which() keeps rows with an NA fold change or
	# q-value from adding NA gene IDs to the lists.
	asdRNA_geneList <- list(
		DE_ASD_RNA.Up = asd_rnaseq$gene_id[
			which(asd_rnaseq$fold_change > 0 & asd_rnaseq$q_value < 0.05)
		],
		DE_ASD_RNA.Down = asd_rnaseq$gene_id[
			which(asd_rnaseq$fold_change < 0 & asd_rnaseq$q_value < 0.05)
		]
	)


	############
	### https://www.medrxiv.org/content/10.1101/2020.11.06.20225342v1.full
	### snRNAseq SCZ
	### supplementary table 6 is DEGS, can't figure out how to download
	##########

	##########
	### snRNAseq and spatial SCZ
	### https://www.biorxiv.org/content/10.1101/2020.11.17.386458v2
	### supplementary table 2 is marker genes
	### supplementary table 4 is DEGs, can't figure out how to download
	###########