This repository contains the R code used to analyse the single-cell RNA-seq dataset shown in:
Delile, J., Rayon, T., Melchionda, M., Edwards, A., Briscoe, J., & Sagner, A. (2019). Single cell transcriptomics reveals spatial and temporal dynamics of gene expression in the developing mouse spinal cord. Development, dev-173807. https://doi.org/10.1242/dev.173807
- Data availability
- Figure shortcuts
- Analysis preliminaries
- 1. Load and hygienize dataset
- 2. Knowledge-based identification of all cell populations
- 3. Population size dynamics
- 4. Combinatorial DE tests
- 5. Neuronal populations clustering
- 6. Neurogenesis dynamics
- 7. Export annotations
Those interested in processing the dataset independently should consider downloading:
- the UMI raw count matrix as outputted by the 10X Genomics cellranger pipeline.
- the Annotated cell meta-data indicating the cells’ sample times, replicate ids and types as determined by the following pipeline. “Type_step1” and “Type_step2” stands for the outcome of the 2-step partitioning algorithm (Section 2). “Neuron_subtypes” indicates the result of the per-neuronal-type subclustering (Section 5). “Pseudotime” contains the neurogenesis ordering coordinates (Section 6).
Figure 1 | Figure 2 | Figure 3 | Figure 4 |
---|---|---|---|
B C D-E | A-D E | A B | A |
Figure 5 | Figure 6 | Figure 7 | Figure S1 |
---|---|---|---|
A | A | A-B C D | A-C D E-F G |
Figure S2 | Figure S3 | Figure S4 | Figure S6 |
---|---|---|---|
A | A | A-H | A B-D |
To reproduce the analysis, the files contained in R_files, input_files and dataset must to be downloaded from this repository (most conveniently using git clone). The UMI count matrix is zipped and must be uncompressed into the dataset folder.
unzip("./dataset/UMI_count.tsv.zip", exdir="./dataset/")
The Antler package as it was at the time of publication is required and can be installed with devtools.
devtools::install_github("juliendelile/Antler", ref = "Development2019")
library(Antler)
Most functions not provided by Antler are stored in MouseSpinalCordAtlas_tools.R
source('./R_files/MouseSpinalCordAtlas_tools.R')
The output path can be changed to any existing directory path
output_path = './output/'
m = Antler$new(plot_folder=output_path, num_cores=4)
m$loadDataset(folderpath="./dataset/", phenoData_filename="phenoData.csv", assayData_filename="UMI_count.tsv")
Annotate gene names
m$setCurrentGeneNames(geneID_mapping_file=system.file("extdata", "Annotations/biomart_ensemblid_genename_mmusculus.csv", package="Antler"))
Display counts pre-QC
m$plotReadcountStats(data_status="Raw", by="timepoint", category="replicate_id", basename="preQC")
Remove cells having more than 6% of mitochondrial UMI counts
m$removeGenesFromRatio(
candidate_genes=grep('^mt-', grep('^mt-', m$getGeneNames(), value=T), value=T),
threshold = 0.06,
)
Remove outliers genes and cells
m$removeOutliers( lowread_thres = -Inf,
highread_thres = Inf,
genesmin = 500,
cellmin = 3,
data_status = 'Raw')
Display counts post-QC
m$plotReadcountStats(data_status="Raw", by="timepoint", category="replicate_id", basename="postQC")
Cell identities are determined by associating each cell to the closest target population state defined by a list of known marker genes
cell_partition = doCellPartition(known_template_file="./input_files/partitioning_table.csv", readcounts=m$getReadcounts(data_status='Raw'))
pop_colors = getPopulationColors(known_template_file="./input_files/partitioning_table.csv")
for(md in c("Type_step1", "Type_step2", "Type_step2_unique", "DV")){pData(m$expressionSet)[[md]] <- cell_partition[[md]]}
cellcluster_sizes_cumsum_step1 = setNames(
c(0, cumsum(table(pData(m$expressionSet)$Type_step1))),
c(levels(pData(m$expressionSet)$Type_step1), ""))
m$readcounts_norm=m$readcounts_raw
m$readcounts_norm[m$readcounts_norm >= 2] <- 1 # threshold used in doCellPartition
pop_def_mask_step1 = markersToMask(cell_partition$step1_markers)
m$dR$genemodules = as.list(rownames(pop_def_mask_step1))
m$plotGeneModules(
basename='FIG1_Map_Step1',
displayed.gms = c('dR.genemodules'),
displayed.geneset=NA,
use.dendrogram=NA,
display.clusters=NULL,
file_settings=list(list(type='pdf', width=10, height=3)),
data_status=c('Normalized'),
gene_transformations='none',
display.legend=TRUE,
cell.ordering=order(pData(m$expressionSet)$Type_step1), # works iff Type_step1 are factors
extra_colors=cbind(
NA
"UMI counts"=colorRampPalette(c("white", "black"))(n = 101)[as.integer(1+100*(colSums(m$readcounts_raw) / max(colSums(m$readcounts_raw))))]
),
extra_legend=list("text"=c("", levels(pData(m$expressionSet)$Type_step1)), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m$expressionSet)$Type_step1)))])),
genemodules.palette=rep("white", length(m$dR$genemodules)),
rect_overlay=apply(which(pop_def_mask_step1==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_step1[[x[2]]],
xright=cellcluster_sizes_cumsum_step1[[x[2]+1]],
ytop=length(unlist(m$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m$dR$genemodules)) - x[1]+1 + 0.5
)
}),
pretty.params=list("size_factor"=3, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
white_rects = c(lapply(
list('Neuron', c('Blood', 'Hematopoeitic', 'Erythropoeitic', 'Erythrocytes', 'Erythrocytes II'), c('Mesoderm I', 'Mesoderm II', 'Mesoderm III', 'Mesoderm IV', 'Mesoderm V', 'Mesoderm VI', 'Myoblast'), c('Neural Crest I', 'Neural Crest II'), c('Neural crest neurons I', 'Neural crest neurons II', 'Neural crest neurons III')),
# list('Progenitor', 'Neuron', c('Blood', 'Hematopoeitic', 'Erythropoeitic', 'Erythrocytes', 'Erythrocytes II'), c('Mesoderm I', 'Mesoderm II', 'Mesoderm III', 'Mesoderm IV', 'Mesoderm V', 'Mesoderm VI', 'Myoblast'), c('Neural Crest I', 'Neural Crest II'), c('Neural crest neurons I', 'Neural crest neurons II', 'Neural crest neurons III'), c('Skin', 'NULL')),
function(l){
list(
xleft=min(which(names(cellcluster_sizes_cumsum_step1) %in% l)) %>% cellcluster_sizes_cumsum_step1[[.]],
xright= (max(which(names(cellcluster_sizes_cumsum_step1) %in% l))+1) %>% cellcluster_sizes_cumsum_step1[.],
ytop=0.5,
ybottom=length(unlist(m$dR$genemodules)) + 0.5,
color="white",
width=1
)
}),
lapply(
list(c('Tubb3', 'Elavl3'), c('Sox17', 'Fermt3', 'Klf1', 'Hemgn', 'Car2'), c('Foxc1', 'Foxc2', 'Twist1', 'Twist2', 'Meox1', 'Meox2', 'Myog'), 'Sox10', c('Tlx2', 'Six1')), function(l){
# list('Sox2', c('Tubb3', 'Elavl3'), c('Sox17', 'Fermt3', 'Klf1', 'Hemgn', 'Car2'), c('Foxc1', 'Foxc2', 'Twist1', 'Twist2', 'Meox1', 'Meox2', 'Myog'), 'Sox10', c('Tlx2', 'Six1'), 'Krt8'), function(l){
list(
xleft=0,
xright=max(cellcluster_sizes_cumsum_step1),
ytop=min(which(unlist(m$dR$genemodules) %in% l)) %>% {length(unlist(m$dR$genemodules)) - . + 1.5},
ybottom=max(which(unlist(m$dR$genemodules) %in% l)) %>% {length(unlist(m$dR$genemodules)) - . + 2.5},
color="white",
width=1
)
}))
m$plotGeneModules(
basename='FIG1_Map_Step1_whitelines',
displayed.gms = c('dR.genemodules'),
displayed.geneset=NA,
use.dendrogram=NA,
display.clusters=NULL,
file_settings=list(list(type='pdf', width=10, height=3)),
data_status=c('Normalized'),
gene_transformations='none',
display.legend=TRUE,
cell.ordering=order(pData(m$expressionSet)$Type_step1), # works iff Type_step1 are factors
extra_colors=cbind(
NA
),
extra_legend=list("text"=c("", levels(pData(m$expressionSet)$Type_step1)), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m$expressionSet)$Type_step1)))])),
genemodules.palette=rep("white", length(m$dR$genemodules)),
rect_overlay=c(white_rects, apply(which(pop_def_mask_step1==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_step1[[x[2]]],
xright=cellcluster_sizes_cumsum_step1[[x[2]+1]],
ytop=length(unlist(m$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m$dR$genemodules)) - x[1]+1 + 0.5,
color="white", #"gray90",
width=0.5
)
})),
pretty.params=list("size_factor"=3, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
cellcluster_sizes_cumsum_step2 = setNames(
c(0, cumsum(table(pData(m$expressionSet)$Type_step2_unique))),
c(levels(pData(m$expressionSet)$Type_step2_unique), ""))
bothSteps_mask = markersToMask(cell_partition$bothSteps_markers)
m$dR$genemodules = as.list(rownames(bothSteps_mask))
m$plotGeneModules(
basename='FIGS1_Map_Step2',
displayed.gms = c('dR.genemodules'),
displayed.geneset="naked", # NA, plot all genes
use.dendrogram=NA,
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=15, height=8)),
data_status=c('Raw'),
gene_transformations='logscaled',
display.legend=TRUE,
cell.ordering=order(pData(m$expressionSet)$Type_step2_unique),
extra_colors=cbind(
"Timepoint"=pop_colors$timepoint[as.character(pData(m$expressionSet)$timepoint)],
"Step 1 Type"=pop_colors$Step1[as.character(pData(m$expressionSet)$Type_step1)],
"Step 2 Type"=pop_colors$Step2[as.character(pData(m$expressionSet)$Type_step2)]
),
extra_legend=list("text"=c("", levels(pData(m$expressionSet)$Type_step2)), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m$expressionSet)$Type_step2)))])),
rect_overlay=apply(which(bothSteps_mask==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_step2[[x[2]]],
xright=cellcluster_sizes_cumsum_step2[[x[2]+1]],
ytop=length(unlist(m$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m$dR$genemodules)) - x[1]+1 + 0.5
)
}),
pretty.params=list("size_factor"=3, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
Estimate quantity of cell doublets by calculating the ratio of the expressed cell type markers in other populations
dcounts = estimateDoublets()
write.csv(dcounts, file=paste0(output_path, "/FIGS1_doublet_estimates.csv"))
Export markers to csv
write.table(markersToMask(cell_partition$step2_markers) %>% cbind("Gene"=rownames(.), .), file=paste0(output_path,'/Table_S1.csv'), row.names=F, sep=";", quote=FALSE)
Select and create structures for 1. neural cells, 2. neuronal progenitors and 3. Neurons
# 1. Neural cells
m_neural = m$copy()
m_neural$excludeCellsFromIds(which(!pData(m_neural$expressionSet)$Type_step1 %in% c('Progenitor', 'Neuron')))
m_neural$excludeCellsFromIds(which(pData(m_neural$expressionSet)$Type_step2 %in% c('Null_Progenitor', 'Null_Neuron')))
m_neural$excludeUnexpressedGenes(min.cells=1, min.level=1, verbose=TRUE, data_status="Raw")
pData(m_neural$expressionSet)$Type_step1 = factor(pData(m_neural$expressionSet)$Type_step1, levels=levels(droplevels(pData(m_neural$expressionSet)$Type_step1)))
pData(m_neural$expressionSet)$Type_step2 = factor(pData(m_neural$expressionSet)$Type_step2, levels=levels(droplevels(pData(m_neural$expressionSet)$Type_step2)))
pData(m_neural$expressionSet)$Type_step2_unique = factor(pData(m_neural$expressionSet)$Type_step2_unique, levels=levels(droplevels(pData(m_neural$expressionSet)$Type_step2_unique)))
# 2. Neuronal progenitors
m_prog = m_neural$copy()
m_prog$excludeCellsFromIds(which(!pData(m_prog$expressionSet)$Type_step1 == 'Progenitor'))
m_prog$excludeUnexpressedGenes(min.cells=1, min.level=1, verbose=TRUE, data_status="Raw")
pData(m_prog$expressionSet)$Type_step1 = factor(pData(m_prog$expressionSet)$Type_step1, levels=levels(droplevels(pData(m_prog$expressionSet)$Type_step1)))
pData(m_prog$expressionSet)$Type_step2 = factor(pData(m_prog$expressionSet)$Type_step2, levels=levels(droplevels(pData(m_prog$expressionSet)$Type_step2)))
m_prog$dR$genemodules = as.list(unique(unlist(cell_partition$bothSteps_markers_neural[levels(droplevels(pData(m_prog$expressionSet)$Type_step2))])))
# 3. Neurons
m_neuron = m_neural$copy()
m_neuron$excludeCellsFromIds(which(!pData(m_neuron$expressionSet)$Type_step1 == 'Neuron'))
m_neuron$excludeUnexpressedGenes(min.cells=1, min.level=1, verbose=TRUE, data_status="Raw")
pData(m_neuron$expressionSet)$Type_step1 = factor(pData(m_neuron$expressionSet)$Type_step1, levels=levels(droplevels(pData(m_neuron$expressionSet)$Type_step1)))
pData(m_neuron$expressionSet)$Type_step2 = factor(pData(m_neuron$expressionSet)$Type_step2, levels=levels(droplevels(pData(m_neuron$expressionSet)$Type_step2)))
m_neuron$dR$genemodules = as.list(unique(unlist(cell_partition$bothSteps_markers_neural[levels(droplevels(pData(m_neuron$expressionSet)$Type_step2))])))
# Save structures to file
saveRDS(m_neural, file=paste0(output_path, '/m_neural.rds'))
# m_neural=readRDS(paste0(output_path, '/m_neural.rds'))
saveRDS(m_neuron, file=paste0(output_path, '/m_neuron.rds'))
# m_neuron=readRDS(paste0(output_path, '/m_neuron.rds'))
saveRDS(m_prog, file=paste0(output_path, '/m_prog.rds'))
# m_prog=readRDS(paste0(output_path, '/m_prog.rds'))
saveRDS(m, file=paste0(output_path, '/m_typed.rds'))
# m = readRDS(paste0(output_path, '/m_typed.rds'))
bubble_chart.df = data.frame(t(m_neural$getReadcounts('Raw')[unique(unlist(cell_partition$bothSteps_markers_neural)), ]), check.names=F) %>%
tibble::rownames_to_column('cellname') %>%
tidyr::gather(genename, value, -cellname) %>%
# dplyr::mutate(genename=factor(genename, levels=rev(unique(unlist(cell_partition$bothSteps_markers_neural))))) %>%
dplyr::mutate(genename=factor(genename, levels=(unique(unlist(cell_partition$bothSteps_markers_neural))))) %>%
dplyr::left_join(pData(m_neural$expressionSet) %>% tibble::rownames_to_column("cellname"), by="cellname") %>%
dplyr::group_by(Type_step2, genename) %>%
dplyr::summarise(mean=mean(value)) %>%
dplyr::group_by(genename) %>%
dplyr::mutate(mean_norm_max=mean/max(mean),
Type_step2 = factor(Type_step2, levels=rev(levels(Type_step2)))) %>%
dplyr::ungroup() %>%
dplyr::mutate(partitionning=factor(
(function(x,y){
unlist(lapply(seq(length(x)),
function(i){
markersToMask(cell_partition$bothSteps_markers_neural)[x[[i]],y[i]]}))
}
)(as.character(genename), as.character(Type_step2)),
levels=c(0,1))) %>%
dplyr::mutate(fillcol = (as.integer(partitionning)-1) * as.integer(Type_step2))
unfocus_grey = "grey"
pdf(paste0(output_path, '/FIG1_Progenitors_bubble_chart.pdf'), height=3.5, width=7, useDingbats=FALSE)
p <- ggplot(bubble_chart.df %>% dplyr::filter(Type_step2 %in% levels(droplevels(pData(m_prog$expressionSet)$Type_step2)), genename %in% unlist(m_prog$dR$genemodules))) +
geom_count(aes(x=genename, y=Type_step2, size=mean_norm_max, fill=factor(fillcol)), color="black", shape=21) + scale_size_area(max_size=5) + scale_fill_manual(breaks=seq(0, length(levels(bubble_chart.df$Type_step2))), values=c(unfocus_grey, rev(unname(pop_colors$Step2[levels(pData(m_prog$expressionSet)$Type_step2)])))) + scale_x_discrete(position = "top") + xlab("") + ylab("") + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 0))
print(p)
graphics.off()
pdf(paste0(output_path, '/FIG1_Neurons_bubble_chart.pdf'), height=3.5, width=10, useDingbats=FALSE)
p <- bubble_chart.df %>%
dplyr::filter(Type_step2 %in% levels(droplevels(pData(m_neuron$expressionSet)$Type_step2)), genename %in% unlist(m_neuron$dR$genemodules)) %>%
dplyr::mutate(genename=factor(genename, levels=unique(unlist(cell_partition$bothSteps_markers_neural[levels(droplevels(pData(m_neuron$expressionSet)$Type_step2))])))) %>%
ggplot(.) + geom_count(aes(x=genename, y=Type_step2, size=mean_norm_max, fill=factor(fillcol)), color="black", shape=21) + scale_size_area(max_size=5) + scale_fill_manual(breaks=seq(0, length(levels(bubble_chart.df$Type_step2))), values=c(unfocus_grey, rev(unname(pop_colors$Step2[levels(pData(m_neuron$expressionSet)$Type_step2)])))) +
# scale_fill_manual(breaks=c(0,1), values=c(unfocus_grey, pop_colors$Step2)) +
scale_x_discrete(position = "top") + xlab("") + ylab("") + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 0))
print(p)
graphics.off()
tSNE computation takes about 30min, skip to next section if the code has already run once.
m$normalize("geometric_mean_sizeFactors")
step1.tsne.data = m$getReadcounts('Normalized')[unique(unlist(cell_partition$step1_markers)), ]
set.seed(1)
t0=Sys.time()
step1.tsne.coords = Rtsne::Rtsne(t(step1.tsne.data), pca=FALSE, perplexity=100, max_iter=2000, verbose=T, check_duplicates = FALSE)$Y
print(Sys.time()-t0)
saveRDS(step1.tsne.coords, file=paste0(output_path, '/step1.tsne.coords.rds'))
Load pre calculated tsne coordinates
step1.tsne.coords = readRDS(paste0(output_path, '/step1.tsne.coords.rds'))
png(paste0(output_path, '/FIG1_Map_Step1_tSNE_Normalized_Types.png'), width=4000, height=4000, pointsize=75)
op <- par(mfrow = c(1,1),
oma = c(5,4,0,0) + 0.1,
mar = c(0,0,1,1) + 0.1)
vertex.plot.order = rev(order(pData(m$expressionSet)$Type_step1))
plot(step1.tsne.coords[vertex.plot.order,], , col=pop_colors$Step1[as.character(pData(m$expressionSet)$Type_step1)][vertex.plot.order], pch=16, cex=.7, main='Step 1 Types', xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
par(op)
par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE)
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n")
legend(x="left",
legend=names(pop_colors$Step1),
col=pop_colors$Step1,
pch=15,
y.intersp = 1.2, cex=.8,
bg='transparent',
box.col='transparent'
)
graphics.off()
png(paste0(output_path, '/FIG1_Map_Step1_tSNE_Normalized_Types_naked.png'), width=4000, height=4000, pointsize=75)
vertex.plot.order = rev(order(pData(m$expressionSet)$Type_step1))
plot(step1.tsne.coords[vertex.plot.order,], , col=pop_colors$Step1[as.character(pData(m$expressionSet)$Type_step1)][vertex.plot.order], pch=16, cex=.7, xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
Progenitor only / Neuron only
png(paste0(output_path, '/FIG1_Map_Step1_tSNE_Normalized_Progenitor_Focus.png'), width=2000, height=2000, pointsize=40)
plot(step1.tsne.coords, col=ifelse(pData(m$expressionSet)$Type_step1=="Progenitor", "#D63624", "grey"), pch=16, cex=.7, main='Progenitors', xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
png(paste0(output_path, '/FIG1_Map_Step1_tSNE_Normalized_Neuron_Focus.png'), width=2000, height=2000, pointsize=40)
plot(step1.tsne.coords, col=ifelse(pData(m$expressionSet)$Type_step1=="Neuron", "#D63624", "grey"), pch=16, cex=.7, main='Neurons', xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
Replicate per timepoint plots
# relabel replicate id to remove gaps
rep_colors = RColorBrewer::brewer.pal(9, "Set1")[1:3]
for(tp in unique(pData(m$expressionSet)$timepoint)) {
rep_ids = unique(pData(m$expressionSet)$replicate_id[pData(m$expressionSet)$timepoint==tp])
cell_colors = ifelse(
pData(m$expressionSet)$timepoint != tp,
'grey',
rep_colors[pData(m$expressionSet)$replicate_id]
)
tp_cell_ids = sample(which((m$expressionSet)$timepoint == tp))
vertex.plot.order=c(setdiff(seq(m$getNumberOfCells()), tp_cell_ids), tp_cell_ids)
png(paste0(output_path, '/FIGS1_Map_Step1_tSNE_Normalized_ReplicateIDs_', tp, '.png'), width=2000, height=2000, pointsize=40)
plot(step1.tsne.coords[vertex.plot.order, ], col=cell_colors[vertex.plot.order], pch=16, cex=.7, main=paste0('Replicates E', tp, ' (n=', length(rep_ids), ')'), xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
}
Plot Sex
sex.df=cbind.data.frame(Xist=m$getReadcounts('Raw')['Xist',], pData(m$expressionSet)) %>%
dplyr::group_by(replicate_id, timepoint) %>%
dplyr::summarize(Xist_mean=mean(Xist)) %>%
dplyr::mutate(Sex=ifelse(Xist_mean < 1, "Male", "Female")) %>%
dplyr::arrange(timepoint)
cell_sex = pData(m$expressionSet) %>%
dplyr::select(replicate_id, timepoint) %>%
dplyr::left_join(sex.df, by=c("timepoint"="timepoint", "replicate_id"="replicate_id")) %>%
.$Sex
sex_colors = c("Male"="cornflowerblue", "Female"="coral1")
png(paste0(output_path, '/FIGS1_Map_Step1_tSNE_Normalized_Sex.png'), width=2000, height=2000, pointsize=40)
plot(step1.tsne.coords, col=sex_colors[cell_sex], pch=16, cex=.4, main='Sex', xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
count.df = pData(m_neural$expressionSet) %>%
dplyr::select(timepoint, Type_step1, Type_step2) %>%
dplyr::mutate(
timepoint=factor(timepoint, levels=sort(unique(timepoint))),
Type_step1=factor(Type_step1, levels=levels(droplevels(Type_step1))),
Type_step2=factor(Type_step2, levels=levels(droplevels(Type_step2))),
) %>%
dplyr::group_by(timepoint, Type_step1, Type_step2) %>%
dplyr::tally() %>%
dplyr::group_by(timepoint) %>%
dplyr::add_tally(name="nn") %>%
dplyr::mutate(freq_norm=n/nn) %>%
dplyr::group_by(timepoint, Type_step1) %>%
dplyr::add_tally(name="nnn") %>%
dplyr::mutate(freq_norm_type=n/nnn) %>%
dplyr::ungroup()
Display population size dynamics
legacy_color = setNames(
c('#d3aed6ff', '#b0cbd8ff', '#c0d2d9ff', '#d1d9daff', '#e3e2dbff', '#f4e9dcff', '#ffeeddff', '#f3e8d4ff', '#daddc3ff', '#c2d1b1ff', '#a7c59eff', '#8fb98cff', '#b2aed6ff', '#FED908', '#b0cbd8ff', '#c0d2d9ff', '#d1d9daff', '#e3e2dbff', '#f4e9dcff', '#ffeeddff', '#f3e8d4ff', '#daddc3ff', '#c2d1b1ff', '#c2d1b1ff', '#a7c59eff', '#8fb98cff'),
c('RP', 'dp1', 'dp2', 'dp3', 'dp4', 'dp5', 'dp6', 'p0', 'p1', 'p2', 'pMN', 'p3', 'FP', 'Null_Progenitor', 'dl1', 'dl2', 'dl3', 'dl4', 'dl5', 'dl6', 'V0', 'V1', 'V2a', 'V2b', 'MN', 'V3')
)
p2 <- ggplot(count.df[which(count.df$Type_step1=="Progenitor"),], aes(x=timepoint, y=freq_norm, fill=Type_step2, group=interaction(Type_step2, Type_step1))) + geom_area() + ggtitle("Progenitors") + ylab("") + xlab('') + theme(legend.position="none") + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + theme_minimal() + theme(legend.position="none")
p3 <- ggplot(count.df[which(count.df$Type_step1=="Neuron"),], aes(x=timepoint, y=freq_norm, fill=Type_step2, group=interaction(Type_step2, Type_step1))) + geom_area() + ggtitle("Neurons") + ylab("") + xlab('') + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + theme_classic() + theme_minimal() + theme(legend.position="none")
p4 <- ggplot(count.df[which(count.df$Type_step1=="Progenitor"),], aes(x=timepoint, y=freq_norm_type, fill=Type_step2, group=interaction(Type_step2, Type_step1))) + geom_area() + ggtitle("Progenitor Ratios") + ylab("") + xlab('') + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + theme_minimal() + theme(legend.position="none")
p5 <- ggplot(count.df[which(count.df$Type_step1=="Neuron"),], aes(x=timepoint, y=freq_norm_type, fill=Type_step2, group=interaction(Type_step2, Type_step1))) + geom_area() + ggtitle("Neuron Ratios") + ylab("") + xlab('') + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + theme_minimal() + theme(legend.position="none")
p1 <- ggplot(count.df[which(count.df$Type_step1 %in% c("Progenitor", "Neuron")),], aes(x=timepoint, y=freq_norm, fill=Type_step2, group=interaction(Type_step1, Type_step2), alpha=Type_step1)) + geom_area(color='black', size=.05) + ggtitle("Neural Ratios") + ylab("") + xlab('') + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + scale_alpha_manual(breaks=c("Progenitor", "Neuron"), values=c(0.5, 1)) + theme(legend.key.width=unit(0.15,"cm"), legend.key.height=unit(0.15,"cm"), legend.position = c(0.6, 0.6)) + guides(fill=guide_legend(ncol=3))
# print(p1)
# graphics.off()
g_legend <- function(a.gplot){
pdf(file=NULL)
tmp <- ggplot_gtable(ggplot_build(a.gplot))
graphics.off()
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)}
lgd <- g_legend(p1)
pdf(paste0(output_path, 'FIG2_DV_Population_Ratios.pdf'), width=7, height=4, useDingbats=FALSE)
gridExtra::grid.arrange(grobs=list(p1 + theme(legend.position="none"),p2,p3,lgd,p4,p5), layout_matrix=matrix(seq(6), ncol=3, byrow=T))
graphics.off()
Comparison with Kicheva et al., 2014
kicheva_vals = read.table(file='./input_files/kicheva2014_Fig1G.csv', sep='\t', stringsAsFactors=F, header=FALSE, skipNul=T)
colnames(kicheva_vals) <- kicheva_vals[1,]
kicheva_vals <- kicheva_vals[-1,]
kicheva_vals <- kicheva_vals %>%
dplyr::filter(Type %in% c("average", "sd")) %>%
tidyr::gather(Timepoint, value, -Type, -Domain) %>%
dplyr::group_by(Timepoint) %>%
dplyr::mutate(value=value/max(value, na.rm=T)) %>% # works because max is always a mean value, not a sd
dplyr::ungroup() %>%
dplyr::mutate(dataset="Kicheva et al., 2014", Timepoint=as.numeric(Timepoint)) %>%
dplyr::filter(Domain != "total") %>%
tidyr::spread(Type, value)
baseline=90
both_dataset.df = count.df %>%
dplyr::filter(Type_step1=="Progenitor" & Type_step2 %in% levels(pData(m_prog$expressionSet)$Type_step2)) %>%
dplyr::filter(!Type_step2 %in% c(NULL, "RP")) %>%
dplyr::select(timepoint, Type_step2, freq_norm_type) %>%
dplyr::rowwise() %>%
dplyr::mutate(
# Timepoint=switch(as.numeric(timepoint), "E9.5"=60, "10.5"=80, "11.5"=100), # James email values
Timepoint=switch(as.numeric(timepoint), "E9.5"=baseline-48, "10.5"=baseline-24, "11.5"=baseline, "12.5"=baseline+24, "13.5"=baseline+48), # extrapolated from manuscript (E11.5 ~ 90hph)
Domain=switch(as.character(Type_step2), "FP"="FP", "p3"="p3", "pMN"="pMN", "p2"="pI", "p1"="pI", "p0"="pI", "dp6"="pD", "dp5"="pD", "dp4"="pD", "dp3"="pD", "dp2"="pD", "dp1"="pD"),
average=freq_norm_type,
sd=NA,
dataset="scRNA-seq"
) %>%
dplyr::ungroup() %>%
dplyr::group_by(Domain, Timepoint) %>%
dplyr::summarise(average=sum(average), dataset=unique(dataset)) %>%
dplyr::bind_rows(kicheva_vals) %>%
dplyr::mutate(dataset=factor(dataset, levels=c('scRNA-seq', 'Kicheva et al., 2014')))
p <- ggplot(both_dataset.df) +
geom_line(aes(x=Timepoint, y=average, color=Domain, linetype=dataset, group=interaction(dataset, Domain)), size=.2) +
geom_point(aes(x=Timepoint, y=average, color=Domain), size=.5) +
geom_errorbar(aes(x=Timepoint, ymin=average-sd, ymax=average+sd, color=Domain, linetype=dataset)) +
xlab("Time (hph)") + ylab("Progenitors (fraction)") + xlim(0,150) +
scale_colour_manual(breaks=c("FP", "p3", "pMN", "pI", "pD"), values=c("#B2AED6", "#82B383", "#F59498", "#D4D4D4", "#A7C7D7")) +
theme_minimal()
pdf(paste0(output_path, 'FIG2_DV_Population_Ratios_Comparison_Kicheva.pdf'), width=7, height=4, useDingbats=FALSE)
print(p)
graphics.off()
Resample dataset and select populations
m_neuron_combDE = resampleDataForCombinationDE(
orig_m_neural_path = paste0(output_path, '/m_neural.rds'),
num_min_cells_per_type2 = 40, # remove cell types with less than X cells
selected_type1 = "Neuron", # "Both", "Neuron", "Progenitor"
sampling_categories = c('Type_step2'),
num_cells_per_category = 200, # number of cells sampled per timepoint X Type_step2
dispersion_prefiltering_zscore = 0 # NULL, pre_select genes with sufficient dispersion score
)
neuron_partitionning_genes = unique(unlist(cell_partition$step2_markers[levels(pData(m_neuron_combDE$expressionSet)$Type_step2)]))
Pre-select type patterned genes
neuron_patterned_genes_test = m_neuron_combDE$runMonocle2DETest(
fullModelFormulaStr="~Type_step2",
reducedModelFormulaStr = "~1",
expressionFamily=VGAM::negbinomial.size()
)
neuron_patterned_genes = neuron_patterned_genes_test %>% as_tibble %>% dplyr::arrange(qval) %>% dplyr::filter(qval < 1e-5) %>% .$gene_short_name %>% as.character
neuron_partitionning_genes %>% {length(intersect(., neuron_patterned_genes))/length(.)} %>% {sprintf("%.2f%% of the markers used to partition neurons are included in the patterned gene list", .*100)}
m_neuron_combDE$excludeGenesFromIds(which(!m_neuron_combDE$getGeneNames() %in% neuron_patterned_genes))
Generate tested models
neuronCombinatorialModels = generateCombinatorialModels(m_neuron_combDE)
Run tests (4h45 with 8 cores)
Monocle_run_neuron = runCombinatorialDETests(m_neuron_combDE, neuronCombinatorialModels, numcores=8)
save(Monocle_run_neuron, file=paste0(output_path,'/Monocle_run_neuron_NegBinFamilyResample'))
# load(paste0(output_path,'/Monocle_run_neuron_NegBinFamilyResample.Rdata'))
Filter best model, for each gene
Monocle_run_neuron_topModel = getTopModel(m_neuron_combDE, Monocle_run_neuron, temporal_pattern_thres=.2)
saveRDS(Monocle_run_neuron_topModel, file=paste0(output_path, '/Monocle_run_neuron_topModel.rds'))
# Monocle_run_neuron_topModel = readRDS(file=paste0(output_path, '/Monocle_run_neuron_topModel.rds'))
Add additional measures and filter models (fold change, average level, time correlation…)
pval_thres_neuron = 1e-9
neuron_models_annotations = getModelAnnotations(m_neuron_combDE, model_list=unique(Monocle_run_neuron_topModel$model))
m_neuron_combDE$dR$genemodules = makeGeneModules(Monocle_run_neuron_topModel, neuron_models_annotations,
pval_thres = pval_thres_neuron, pos_pop_level_threshold = .15,
pos_pop_ratio_threshold = .08, log2fc_thres = 2,
gm_grouping_vars = c("model", "temporal_pattern"),
gm_ordering_vars = c("temporal_pattern", "first_domain_factor", "num_domain")
)
length(unlist(m_neuron_combDE$dR$genemodules))
setdiff(neuron_partitionning_genes, unlist(m_neuron_combDE$dR$genemodules))
print(paste0("Number of neuronal categories: ", length(which(unlist(lapply(m_neuron_combDE$dR$genemodules, length)) > 0))))
saveRDS(m_neuron_combDE$dR$genemodules, file=paste0(output_path, '/neurons_combDE_genelists.rds'))
save.image(paste0(output_path, '/workspace_postCombDE_neurons.Rdata'))
# For figure 3A, we do not use temporal pattern categories
gms.ordered = makeGeneModules(Monocle_run_neuron_topModel, neuron_models_annotations,
pval_thres = pval_thres_neuron, pos_pop_level_threshold = .15,
pos_pop_ratio_threshold = .08, log2fc_thres = 2,
gm_grouping_vars = c("model"),
gm_ordering_vars = c("first_domain_factor", "num_domain")
)
saveRDS(gms.ordered, file=paste0(output_path, '/neurons_combDE_genelists_ordered.rds'))
Plot predicted genes
neuron_combDE_by_domain = list()
for(x in setdiff(names(m_neuron_combDE$dR$genemodules), "Missing")) {
for(p in strsplit(strsplit(x, " / ")[[1]][1], '-')[[1]]) {
neuron_combDE_by_domain[[p]] <- c(neuron_combDE_by_domain[[p]], m_neuron_combDE$dR$genemodules[[x]])
}
}
neuron_combDE_by_domain_mask = markersToMask(neuron_combDE_by_domain[rev(levels(pData(m_neuron_combDE$expressionSet)$Type_step2))])[unlist(m_neuron_combDE$dR$genemodules),]
gene_pValues = (Monocle_run_neuron_topModel %>% {"names<-"(.$pval, .$genename)})[unlist(m_neuron_combDE$dR$genemodules)]
gene_pValues_colors = colorRampPalette(c("white", "black"))(n = 10)[cut(log10(1e-100 + gene_pValues), 10)]
gene_pValues_colors[which(gene_pValues == 0)] <- "orange"
cellcluster_sizes_combDE = table(pData(m_neuron_combDE$expressionSet)$Type_step2)
cellcluster_sizes_cumsum_neuron_combDE = c(0, cumsum(cellcluster_sizes_combDE))
names(cellcluster_sizes_cumsum_neuron_combDE) <- c(names(cellcluster_sizes_combDE), "")
tf_list = getTFs()
interesting_genes = sort(setdiff(unlist(m_neuron_combDE$dR$genemodules), sort(unique(unlist(cell_partition$bothSteps_markers_neural)))))
plot_basename_neuron = paste0('FIGS3_Neural_DE_Combinatorial_Neurons_NegBinResampled')
m_neuron_combDE$plotGeneModules(
basename=plot_basename_neuron,
displayed.gms = c('dR.genemodules'),
displayed.geneset=NA, # plot all genes
use.dendrogram=NA,
display.clusters=NULL, #"State",
# file_settings=list(list(type='png', width=1500, height=4000)),
file_settings=list(list(type='pdf', width=7, height=as.integer(0.01*length(unlist(m_neuron_combDE$dR$genemodules)) + 6))),
data_status='Raw',
gene_transformations=c('log', 'logscaled'),
display.legend=TRUE,
cell.ordering=order(rev(pData(m_neuron_combDE$expressionSet)$Type_step2), pData(m_neuron_combDE$expressionSet)$timepoint),
extra_colors=cbind(
"Neural types"=pop_colors$Step2[as.character(pData(m_neuron_combDE$expressionSet)$Type_step2)],
"timepoint"=pop_colors$timepoint[as.character(pData(m_neuron_combDE$expressionSet)$timepoint)]
),
genemodules.palette=rep(c("gray80", "gray90"), length(m_neuron_combDE$dR$genemodules)),
genes.extra_colors=cbind(
"Partitionning"=unname(unlist(lapply(unlist(m_neuron_combDE$dR$genemodules), function(l) if(l %in% neuron_partitionning_genes) 'red' else 'white'))),
"pval"=gene_pValues_colors,
"TF"=unname(unlist(lapply(unlist(m_neuron_combDE$dR$genemodules), function(l) if(l %in% tf_list) 'yellow' else 'white'))),
"Time behavior"=c("blue", "red", "white")[(Monocle_run_neuron_topModel %>% {"names<-"(.$temporal_pattern, .$genename)}) [unlist(m_neuron_combDE$dR$genemodules)]],
"+"=unname(unlist(lapply(unlist(m_neuron_combDE$dR$genemodules), function(l) if(l %in% interesting_genes) 'green' else 'white')))
),
extra_legend=list("text"=c("", levels(droplevels(pData(m_neuron_combDE$expressionSet)$Type_step2))), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m_neuron_combDE$expressionSet)$Type_step2)))])),
rect_overlay=c(lapply(seq(rev(levels(pData(m_neuron_combDE$expressionSet)$Type_step2))), function(i){
list(
xleft=cellcluster_sizes_cumsum_neuron_combDE[[i]],
xright=cellcluster_sizes_cumsum_neuron_combDE[[i+1]],
ytop=0.5,
ybottom=length(unlist(m_neuron_combDE$dR$genemodules)) + 0.5,
width=.2,
color="grey"
)
}),
apply(which(neuron_combDE_by_domain_mask==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_neuron_combDE[[x[2]]],
xright=cellcluster_sizes_cumsum_neuron_combDE[[x[2]+1]],
ytop=length(unlist(m_neuron_combDE$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m_neuron_combDE$dR$genemodules)) - x[1]+1 + 0.5,
color='black'
)
})
),
pretty.params=list("size_factor"=1, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
Export predicted genes to file
m_neuron_combDE$writeGeneModules(basename=plot_basename_neuron, gms='dR.genemodules')
Resample dataset and select populations
m_prog_combDE = resampleDataForCombinationDE(
orig_m_neural_path = paste0(output_path, '/m_neural.rds'),
num_min_cells_per_type2 = 40, # remove cell types with less than X cells
selected_type1 = "Progenitor", # "Both", "Neuron", "Progenitor"
sampling_categories = c('Type_step2'),
num_cells_per_category = 200, # number of cells sampled per timepoint X Type_step2
dispersion_prefiltering_zscore = NULL # 0, pre_select genes with sufficient dispersion score
)
prog_partitionning_genes = unique(unlist(cell_partition$step2_markers[levels(pData(m_prog_combDE$expressionSet)$Type_step2)]))
Pre-select type patterned genes
prog_patterned_genes_test = m_prog_combDE$runMonocle2DETest(
fullModelFormulaStr="~Type_step2",
reducedModelFormulaStr = "~1",
expressionFamily=VGAM::negbinomial.size()
)
prog_patterned_genes = prog_patterned_genes_test %>% as_tibble %>% dplyr::arrange(qval) %>% dplyr::filter(qval < 1e-5) %>% .$gene_short_name %>% as.character
prog_partitionning_genes %>% {length(intersect(., prog_patterned_genes)) / length(.)} %>% {sprintf("%.2f%% of the markers used to partition progenitors are included in the patterned gene list", .*100)}
m_prog_combDE$excludeGenesFromIds(which(!m_prog_combDE$getGeneNames() %in% prog_patterned_genes))
Generate tested models
progCombinatorialModels = generateCombinatorialModels(m_prog_combDE)
Run tests (about 24h with 8 cores)
Monocle_run_prog = runCombinatorialDETests(m_prog_combDE, progCombinatorialModels, numcores=8)
save(Monocle_run_prog, file=paste0(output_path,'/Monocle_run_prog_NegBinFamilyResample.Rdata'))
# load(paste0(output_path,'/Monocle_run_prog_NegBinFamilyResample.Rdata'))
Filter best model, for each gene
Monocle_run_prog_topModel = getTopModel(m_prog_combDE, Monocle_run_prog, temporal_pattern_thres=.2)
saveRDS(Monocle_run_prog_topModel, file=paste0(output_path, '/Monocle_run_prog_topModel.rds'))
# Monocle_run_prog_topModel = readRDS(file=paste0(output_path, '/Monocle_run_prog_topModel.rds'))
Add additional measures and filter models (fold change, average level, time correlation…)
pval_thres_prog = 1e-9
prog_models_annotations = getModelAnnotations(m_prog_combDE, model_list=unique(Monocle_run_prog_topModel$model))
m_prog_combDE$dR$genemodules = makeGeneModules(Monocle_run_prog_topModel, prog_models_annotations,
pval_thres = pval_thres_prog, pos_pop_level_threshold = .2,
pos_pop_ratio_threshold = .1, log2fc_thres = 2,
gm_grouping_vars = c("model", "temporal_pattern"),
gm_ordering_vars = c("temporal_pattern", "first_domain_factor", "num_domain")
)
length(unlist(m_prog_combDE$dR$genemodules))
setdiff(prog_partitionning_genes, unlist(m_prog_combDE$dR$genemodules))
print(paste0("Number of progenitor categories: ", length(which(unlist(lapply(m_prog_combDE$dR$genemodules, length)) > 0))))
save.image(paste0(output_path, '/workspace_postCombDE_progenitors.rds'))
saveRDS(m_prog_combDE$dR$genemodules, file=paste0(output_path, '/progenitors_combDE_genelists.Rdata'))
Plot predicted genes
prog_combDE_by_domain = list()
for(x in setdiff(names(m_prog_combDE$dR$genemodules), "Missing")) {
for(p in strsplit(strsplit(x, " / ")[[1]][1], '-')[[1]]) {
prog_combDE_by_domain[[p]] <- c(prog_combDE_by_domain[[p]], m_prog_combDE$dR$genemodules[[x]])
}
}
# we drop the Type_step2 in case a type has been excluded from the tests
prog_combDE_by_domain_mask = markersToMask(prog_combDE_by_domain[rev(levels(droplevels(pData(m_prog_combDE$expressionSet)$Type_step2)))])[unlist(m_prog_combDE$dR$genemodules),]
gene_pValues = (Monocle_run_prog_topModel %>% {"names<-"(.$pval, .$genename)})[unlist(m_prog_combDE$dR$genemodules)]
gene_pValues_colors = colorRampPalette(c("white", "black"))(n = 10)[cut(log10(1e-100 + gene_pValues), 10)]
gene_pValues_colors[which(gene_pValues == 0)] <- "orange"
cellcluster_sizes_combDE = table(droplevels(pData(m_prog_combDE$expressionSet)$Type_step2))
cellcluster_sizes_cumsum_prog_combDE = c(0, cumsum(cellcluster_sizes_combDE))
names(cellcluster_sizes_cumsum_prog_combDE) <- c(names(cellcluster_sizes_combDE), "")
tf_list = getTFs()
interesting_genes = sort(setdiff(unlist(m_prog_combDE$dR$genemodules), sort(unique(unlist(cell_partition$bothSteps_markers_neural)))))
plot_basename_prog = paste0('FIGS2_Neural_DE_Combinatorial_Progenitors_NegBinResampled')
m_prog_combDE$plotGeneModules(
basename=plot_basename_prog,
displayed.gms = c('dR.genemodules'),
displayed.geneset=NA, # plot all genes
use.dendrogram=NA,
display.clusters=NULL, #"State",
# file_settings=list(list(type='png', width=1500, height=4000)),
file_settings=list(list(type='pdf', width=7, height=as.integer(0.01*length(unlist(m_prog_combDE$dR$genemodules)) + 6))),
data_status='Raw',
gene_transformations=c('log', 'logscaled'),
display.legend=TRUE,
cell.ordering=order(rev(pData(m_prog_combDE$expressionSet)$Type_step2), pData(m_prog_combDE$expressionSet)$timepoint),
extra_colors=cbind(
"Neural types"=pop_colors$Step2[as.character(pData(m_prog_combDE$expressionSet)$Type_step2)],
"timepoint"=pop_colors$timepoint[as.character(pData(m_prog_combDE$expressionSet)$timepoint)]
),
genemodules.palette=rep(c("gray80", "gray90"), length(m_prog_combDE$dR$genemodules)),
genes.extra_colors=cbind(
"Partitionning"=unname(unlist(lapply(unlist(m_prog_combDE$dR$genemodules), function(l) if(l %in% prog_partitionning_genes) 'red' else 'white'))),
"pval"=gene_pValues_colors,
"TF"=unname(unlist(lapply(unlist(m_prog_combDE$dR$genemodules), function(l) if(l %in% tf_list) 'yellow' else 'white'))),
"Time behavior"=c("blue", "red", "white")[(Monocle_run_prog_topModel %>% {"names<-"(.$temporal_pattern, .$genename)}) [unlist(m_prog_combDE$dR$genemodules)]],
"+"=unname(unlist(lapply(unlist(m_prog_combDE$dR$genemodules), function(l) if(l %in% interesting_genes) 'green' else 'white')))
),
extra_legend=list("text"=c("", levels(droplevels(pData(m_prog_combDE$expressionSet)$Type_step2))), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m_prog_combDE$expressionSet)$Type_step2)))])),
rect_overlay=c(lapply(seq(rev(levels(droplevels(pData(m_prog_combDE$expressionSet)$Type_step2)))), function(i){
list(
xleft=cellcluster_sizes_cumsum_prog_combDE[[i]],
xright=cellcluster_sizes_cumsum_prog_combDE[[i+1]],
ytop=0.5,
ybottom=length(unlist(m_prog_combDE$dR$genemodules)) + 0.5,
width=.2,
color="grey"
)
}),
apply(which(prog_combDE_by_domain_mask==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_prog_combDE[[x[2]]],
xright=cellcluster_sizes_cumsum_prog_combDE[[x[2]+1]],
ytop=length(unlist(m_prog_combDE$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m_prog_combDE$dR$genemodules)) - x[1]+1 + 0.5,
color='black'
)
})
),
pretty.params=list("size_factor"=1, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
Export predicted genes to file
m_prog_combDE$writeGeneModules(basename=plot_basename_prog, gms='dR.genemodules')
# ###############################
library(MSigDB)
gene_categories = list(
"Adhesion" = m$getGeneNames() %>% .[toupper(.) %in% unique(unlist(MSigDB[["C5_GENE_ONTOLOGY"]][grep('ADHESION', names(MSigDB[["C5_GENE_ONTOLOGY"]]), value=T)]["GO_CELL_CELL_ADHESION"]))],
"TF" = tf_list,
"NeuroTransmitters" = m$getGeneNames() %>% .[toupper(.) %in% unique(unlist(MSigDB[["C5_GENE_ONTOLOGY"]][["GO_NEUROTRANSMITTER_TRANSPORT"]]))]
)
neuron_markers_selection = names(gene_categories) %>% {setNames(lapply(., function(gc){intersect(unname(unlist(readRDS(paste0(output_path, '/neurons_combDE_genelists_ordered.rds')))), gene_categories[[gc]])}), .)}
# remove gene whose topmodel is in dl6 (population is very small)
dl6_only_genes = Monocle_run_neuron_topModel %>% dplyr::filter(model=="m000001000000" & genename %in% unname(unlist(readRDS(paste0(output_path, '/neurons_combDE_genelists_ordered.rds'))))) %>% .$genename
neuron_markers_selection$Adhesion <- setdiff(neuron_markers_selection$Adhesion, unique(c(tf_list, unlist(cell_partition$bothSteps_markers_neural), dl6_only_genes)))
neuron_markers_selection$TF <- setdiff(neuron_markers_selection$TF, unique(c(unlist(cell_partition$bothSteps_markers_neural), dl6_only_genes)))
neuron_markers_selection$NeuroTransmitters <- setdiff(neuron_markers_selection$NeuroTransmitters, unlist(neuron_markers_selection[c('Adhesion', 'TF')]))
# Order neurotransmitter by category (inhibitory, excitatory, ...etc)
neuro_trans_order = c("Gad1", "Gad2", "Slc6a5", "Slc32a1", "Sv2c", "Slc5a7", "Stx1a", "Slc18a3", "Slc17a6", "Rims4", "Rph3a")
neuron_markers_selection$NeuroTransmitters <- c(intersect(neuro_trans_order, neuron_markers_selection$NeuroTransmitters), setdiff(neuron_markers_selection$NeuroTransmitters, neuro_trans_order))
write(jsonlite::toJSON(neuron_markers_selection, pretty=TRUE), paste0(output_path, '/FIG3_Prediction_NeuronHighlights.json'))
# genelist is a 3-level list: 1. plot 2. plot categories 3. genes
plotPredictedSelection(currm=m_neuron,
genelist=list("FIG3_Neuron_Marker_Prediction"=neuron_markers_selection),
selected_genes=unname(unlist(readRDS(paste0(output_path, '/neurons_combDE_genelists.rds')))),
monocle_run_topModel=Monocle_run_neuron_topModel,
pc = pop_colors$Step2,
zscore=T,
height=7, width=15
)
Plot Claudin3 (gene average per type1 X Dv X timepoint)
plotAveragePerTypeTimeDV(m_neural, gene = "Cldn3", basename="FIG3_Gene_expression_dynamics_")
processed_subsets = list()
denovo_folder = paste0(output_path, "/subClustering_Neuron_by_TypeStep2/")
dir.create(denovo_folder, showWarnings = FALSE)
for(p in levels(droplevels(pData(m$expressionSet) %>% dplyr::filter(Type_step1=="Neuron") %>% .$Type_step2))){
selected_cells = which(pData(m$expressionSet)$Type_step2 %in% p)
if(length(selected_cells) >= 50)
processed_subsets[[p]] <- selected_cells
}
Identify gene modules in each domain
m_pops = list()
for(p in names(processed_subsets)) {
currm = m$copy()
currm$plot_folder = denovo_folder
currm$excludeCellsFromIds(setdiff(seq(m$getNumberOfCells()), processed_subsets[[p]]))
currm$excludeUnexpressedGenes(min.cells=20, min.level=1, verbose=T, data_status='Raw')
corr.curr.mat = fastCor(t(currm$getReadcounts(data_status='Raw')), method="spearman")
# Hack identifyGeneModules as it requires Normalized readcounts (TODO: remove this requirement)
currm$readcounts_norm = currm$readcounts_raw
currm$identifyGeneModules(
method="TopCorr_DR",
corr=corr.curr.mat,
corrgenes_t=2000,
topcorr_num_initial_gms=200,
topcorr_num_final_gms=200,
topcorr_mod_min_cell=5,
topcorr_mod_consistency_thres=0.4,
topcorr_mod_skewness_thres=-Inf,
topcorr_min_cell_level=1,
data_status='Raw'
)
if(length(unlist(currm$topCorr_DR$genemodules)) > 0){
m_pops[[p]] <- currm$copy()
}
}
saveRDS(m_pops, file=paste0(output_path, '/All_Clusters_topCorrDR_Neurons.rds'))
# m_pops = readRDS(file=paste0(output_path, '/All_Clusters_topCorrDR_Neurons.rds'))
subclustering_params = rjson::fromJSON(file='./input_files/subclustering_typestep2_181106.json')
# Add cluster names / colors
for(d in names(subclustering_params)){
subclustering_params[[d]]$populations_names = paste0(d, '.', seq(subclustering_params[[d]]$num_clusters))
subclustering_params[[d]]$populations_colors = colorRampPalette(c("gray80", "black"))(subclustering_params[[d]]$num_clusters)
}
# Extra gene level are normalized globally
extra_genelevels <- apply(m$getReadcounts('Raw')[unique(c(unlist(cell_partition$bothSteps_markers_neural_unique), unlist(lapply(subclustering_params, function(x) x$extra_markers)))), ], 1, function(x) log(1+x)/max(log(1+x)))
extra_genelevels_colors = apply(extra_genelevels, 2, function(x) colorRampPalette(c("#0464DF", "#FFE800"))(n = 1000)[1+as.integer(999*x)])
rownames(extra_genelevels_colors) <- rownames(extra_genelevels)
plotSubC = TRUE
m_pops2 = list()
for(p in names(subclustering_params)){
print(p)
m_pops[[p]]$identifyCellClusters(method='hclust', used_genes="topCorr_DR.genemodules", data_status='Raw')
clustering_order = m_pops[[p]]$cellClusters[['hclust']]$res$order
if(plotSubC)
m_pops[[p]]$plotGeneModules(
main=p,
basename=paste0(p, '_deNOVO_Hclust'),
displayed.gms = c('topCorr_DR.genemodules'),
displayed.geneset=getGenesFromGOterms(c('GO:0007399', 'GO:0030154', 'GO:0016477')), #NA
use.dendrogram=NA, #'hclust',
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=10, height=10)),
data_status='Raw',
gene_transformations=c('logscaled'), # 'log'
display.legend=TRUE,
cell.ordering=unlist(clustering_order),
extra_colors=cbind(
"Neural types"=getClusterColors()[pData(m_pops[[p]]$expressionSet)$Type_step2],
"DV position"=c(colorRampPalette(c("grey", "darkgreen"))(n = 13), 'white')[pData(m_pops[[p]]$expressionSet)$DV],
extra_genelevels_colors[m_pops[[p]]$getCellsNames(), ]
),
genemodules.text_colors=unlist(lapply(m_pops[[p]]$topCorr_DR$genemodules, function(l) if(any(unique(unlist(cell_partition$bothSteps_markers_neural)) %in% l)) 'red' else 'black')),
pretty.params=(list("size_factor"=4, "ngenes_per_lines" = 10, "side.height.fraction"=.6))
)
if(p %in% names(subclustering_params)){
m_pops$cellClusters[['curatedSelection']] <- NULL
m_pops[[p]]$dR$genemodules <- Filter(function(x){any(unlist(subclustering_params[[p]]$populations_markers) %in% x)}, m_pops[[p]]$topCorr_DR$genemodules)
if(length(m_pops[[p]]$dR$genemodules) > 1){
m_pops[[p]]$identifyCellClusters(method='hclust', clust_name="curatedSelection" , used_genes="dR.genemodules", data_status='Raw', numclusters = subclustering_params[[p]][["num_clusters"]])
timepoint_clusters_stats = cbind.data.frame(
cell_ids=m_pops[[p]]$cellClusters[['curatedSelection']]$cell_ids,
timepoint=pData(m_pops[[p]]$expressionSet)$timepoint,
timepoint_norm=
(pData(m_pops[[p]]$expressionSet)$timepoint-min(pData(m$expressionSet)$timepoint)) /
(max(pData(m$expressionSet)$timepoint)-min(pData(m$expressionSet)$timepoint))
) %>%
dplyr::group_by(cell_ids) %>%
dplyr::summarise(
mean=mean(timepoint),
sd=sd(timepoint),
mean_norm=mean(timepoint_norm),
sd_norm=sd(timepoint_norm)
)
cell_ids_timepoint_color = timepoint_clusters_stats %>% {"names<-"(hsv(1, .$mean_norm, 1, 1), .$cell_ids)}
if(plotSubC)
m_pops[[p]]$plotGeneModules(
main=p,
basename=paste0(p, '_deNOVO_Curated_Hclust'),
displayed.gms = c('dR.genemodules'),
displayed.geneset=unique(c(getGenesFromGOterms(c('GO:0007399', 'GO:0030154', 'GO:0016477')), unlist(subclustering_params[[p]]$populations_markers))), #NA
use.dendrogram="curatedSelection", #'hclust',
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=10, height=10)),
data_status='Raw',
gene_transformations=c('logscaled'), # 'log'
display.legend=TRUE,
extra_legend=list("text"=c("", subclustering_params[[p]]$populations_names), "colors"=c('white', subclustering_params[[p]]$populations_colors)),
extra_colors=cbind(
cell_ids_timepoint_color[m_pops[[p]]$cellClusters[['curatedSelection']]$cell_ids],
subclustering_params[[p]]$populations_colors[m_pops[[p]]$cellClusters[['curatedSelection']]$cell_ids],
extra_genelevels_colors[m_pops[[p]]$getCellsNames(), unlist(subclustering_params[[p]]$extra_markers)]
),
pretty.params=(list("size_factor"=4, "ngenes_per_lines" = 5, "side.height.fraction"=.6))
)
if(plotSubC)
m_pops[[p]]$plotGeneModules(
main=p,
basename=paste0(p, '_deNOVO_Hclust_CuratedOrdering'),
displayed.gms = c('topCorr_DR.genemodules'),
displayed.geneset=getGenesFromGOterms(c('GO:0007399', 'GO:0030154', 'GO:0016477')), #NA
use.dendrogram="curatedSelection", #'hclust',
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=10, height=15)),
data_status='Raw',
gene_transformations=c('logscaled'), # 'log'
display.legend=TRUE,
# cell.ordering=,
extra_colors=cbind(
"Neural types"=getClusterColors()[pData(m_pops[[p]]$expressionSet)$Type_step2],
"DV position"=c(colorRampPalette(c("grey", "darkgreen"))(n = 13), 'white')[pData(m_pops[[p]]$expressionSet)$DV],
extra_genelevels_colors[m_pops[[p]]$getCellsNames(), ]
),
genemodules.text_colors=unlist(lapply(m_pops[[p]]$topCorr_DR$genemodules, function(l) if(any(unique(unlist(cell_partition$bothSteps_markers_neural)) %in% l)) 'red' else 'black')),
genemodules.extra_colors = cbind(
# colorRampPalette(c("white", "black"))(n = 10)[cut(genemodules.skewness, 10)],
# c("white", "green")[1+1*(genemodules.skewness > 2)]
c("white", "green")[1+1*unlist(lapply(m_pops[[p]]$topCorr_DR$genemodules,function(x){length(intersect(x, unlist(subclustering_params[[p]]$populations_markers))) > 0}))],
c("white", "green")[1+1*unlist(lapply(m_pops[[p]]$topCorr_DR$genemodules,function(x){length(intersect(x, unlist(subclustering_params[[p]]$populations_markers))) > 0}))]
# c("white", "red")[1+1*(seq(length(m_pops[[p]]$topCorr_DR$genemodules)) %in% curated_list)]
),
pretty.params=(list("size_factor"=2, "ngenes_per_lines" = 15, "side.height.fraction"=.6))
)
# average dataset from hclust
# ! we should logscale the dataset globally before doing a domain specific gm/cluster average
domain_clust_gm_avg = data.frame(t(m_pops[[p]]$getReadcounts('Raw')[unlist(m_pops[[p]]$dR$genemodules), ]), check.names=F) %>%
tibble::rownames_to_column('cellname') %>%
tidyr::gather(genename, value, -cellname) %>%
dplyr::left_join(cbind.data.frame(cell_ids=m_pops[[p]]$cellClusters[['curatedSelection']]$cell_ids, cellname=m_pops[[p]]$getCellsNames()), by="cellname") %>%
dplyr::left_join(m_pops[[p]]$dR$genemodules %>% {cbind.data.frame("gm_id"=rep(seq(length(.)), lapply(.,length)), genename=unlist(.))}, by="genename") %>%
dplyr::group_by(genename) %>%
dplyr::mutate(logscaled = scale(log(1+value), center=T, scale=T)) %>%
dplyr::group_by(cell_ids, gm_id) %>% # average per gm
dplyr::summarise(mean_logscaled=mean(logscaled)) %>%
tidyr::spread(cell_ids, mean_logscaled) %>%
tibble::column_to_rownames("gm_id")
# Remove "excluded" cell clusters
domain_clust_gm_avg <- domain_clust_gm_avg[, which(subclustering_params[[p]]$populations_names != "Excluded")]
colnames(domain_clust_gm_avg) <- setdiff(subclustering_params[[p]]$populations_names, "Excluded")
m_pops2[[p]] <- m_pops[[p]]$copy()
}
# if(is.null(m_pops$cellClusters[['curatedSelection']])){
# m_pops$cellClusters[['curatedSelection']] <- list("num_clusters"=1)
# }
}
m_pops[[p]]$writeGeneModules(p, gms="topCorr_DR.genemodules")
m_pops[[p]]$writeGeneModules(p, gms="dR.genemodules")
}
saveRDS(m_pops2, file=paste0(output_path, '/All_Clusters_topCorrDR_Neurons_v2.rds'))
# m_pops2 = readRDS(file=paste0(output_path, '/All_Clusters_topCorrDR_Neurons_v2.rds'))
Export curated gene module list
curated_gms.df = do.call(rbind.data.frame, lapply(names(m_pops2), function(p){
do.call(rbind.data.frame,
lapply(seq(length(m_pops2[[p]]$dR$genemodules)), function(i){
list("domain"=p, "GM_id"=i, "Genes"=paste0(m_pops2[[p]]$dR$genemodules[[i]], collapse=","))
}))
}))
write.table(curated_gms.df, file=paste0(output_path,'/Table_S2.csv'), row.names=F, sep=";", quote=FALSE)
Clean-up V2a and MN subclusters
excl_subtypes = list(
"V2a"=list("cl"=c(5), "gm"=2:5),
"MN"=list("cl"=c(5), "gm"=1:10)
)
excl_subtypes_cl = unlist(lapply(names(excl_subtypes), function(i){paste0(i,'.',excl_subtypes[[i]]$cl)}))
# average cluster sample time
cluster_avg_sample_time = setNames(
lapply(names(m_pops2), function(p){
unlist(lapply(seq(m_pops2[[p]]$cellClusters[['curatedSelection']]$num_clusters), function(i){
mean(pData(m_pops[[p]]$expressionSet)$timepoint[which(m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids == i)])
})) %>% {(.-min(pData(m$expressionSet)$timepoint)) / (max(pData(m$expressionSet)$timepoint)-min(pData(m$expressionSet)$timepoint))}
}),
names(m_pops2))
for(p in names(excl_subtypes)){
print(p)
currm = m_pops2[[p]]$copy()
excl_cells = which(currm$cellClusters[['curatedSelection']]$cell_ids %in% excl_subtypes[[p]]$cl)
currm$excludeCellsFromIds(excl_cells)
currm$cellClusters[['curatedSelection']]$cell_ids <- currm$cellClusters[['curatedSelection']]$cell_ids[-excl_cells]
currm$cellClusters[['curatedSelection']]$res <- currm$cellClusters[['curatedSelection']]$res %>% as.dendrogram %>% dendextend::prune(m_pops2[[p]]$getCellsNames()[excl_cells]) %>% as.hclust
currm$dR$genemodules.selected <- currm$dR$genemodules[excl_subtypes[[p]]$gm]
currm$plotGeneModules(
main=p,
basename=paste0(p, '_deNOVO_Curated_Hclust_CLEAN'),
displayed.gms = c('dR.genemodules.selected'),
displayed.geneset=unique(c(getGenesFromGOterms(c('GO:0007399', 'GO:0030154', 'GO:0016477')), unlist(subclustering_params[[p]]$populations_markers))), #NA
use.dendrogram="curatedSelection", #'hclust',
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=10, height=10)),
data_status='Raw',
gene_transformations=c('logscaled'), # 'log'
display.legend=TRUE,
extra_legend=list("text"=c("", subclustering_params[[p]]$populations_names), "colors"=c('white', subclustering_params[[p]]$populations_colors)),
extra_colors=cbind(
cluster_avg_sample_time[[p]] %>% {.[currm$cellClusters[['curatedSelection']]$cell_ids]} %>% {hsv(1, ., 1, 1)},
subclustering_params[[p]]$populations_colors[currm$cellClusters[['curatedSelection']]$cell_ids],
extra_genelevels_colors[currm$getCellsNames(), unlist(subclustering_params[[p]]$extra_markers)]
),
pretty.params=(list("size_factor"=4, "ngenes_per_lines" = 5, "side.height.fraction"=.6))
)
}
# Summary table indicating the number of genes and modules for each step of the gene module identification method used to cluster the neuronal subtypes.
domain_stats = cbind.data.frame(
"Domain"=names(m_pops2),
"Cells"=unlist(lapply(m_pops2, function(x) x$getNumberOfCells())),
"Expressed Genes"=unlist(lapply(m_pops2, function(x) x$getNumberOfGenes())),
"Correlated Genes"=unlist(lapply(m_pops2, function(x) length(unlist(x$topCorr_DR$genemodules.history[['corrgenes']])))),
"Unbiased genes"=unlist(lapply(m_pops2, function(x) length(unlist(x$topCorr_DR$genemodules)))),
"Curated genes"=unlist(lapply(m_pops2, function(x) length(unlist(x$dR$genemodules)))),
"Unbiased gene modules"=unlist(lapply(m_pops2, function(x) length(x$topCorr_DR$genemodules))),
"Curated gene modules"=unlist(lapply(m_pops2, function(x) length(x$dR$genemodules))),
"Neuron subtypes"=unlist(lapply(m_pops2, function(x) x$cellClusters[['curatedSelection']]$num_clusters))
)
write.table(domain_stats, file=paste0(output_path,'/Table_S4.csv'), row.names=F, sep=";", quote=FALSE)
Build aggregated dendrogram
data_time = unlist(lapply(names(m_pops2), function(p){
print(p)
m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids %>% .[!duplicated(.)] %>% {setNames(cluster_avg_sample_time[[p]][.], paste0(p, '.', .))}
})) %>% rbind(.,.)
data_col = unlist(lapply(names(m_pops2), function(p){
print(p)
m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids %>% .[!duplicated(.)] %>% subclustering_params[[p]]$populations_colors[.]
}))
library(ape)
pruned_dendrograms = lapply(names(m_pops2), function(p){
print(p)
redondant_cell_names = m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids %>% .[duplicated(.)] %>% names
m_pops2[[p]]$cellClusters[['curatedSelection']]$res %>% as.phylo %>% ape::drop.tip(redondant_cell_names) %>% as.dendrogram
})
dend_merge_all = Reduce(function(x,y) merge(x, y, height=1000, adjust="auto"), pruned_dendrograms)
pdf(paste0(output_path, '/FIG4_subclusters_dendrogram_full.pdf'), height=3, pointsize=2, useDingbats=FALSE)
heatmap.3(
data_time,
Colv=dend_merge_all,
Rowv=FALSE,
dendrogram="column",
labRow=FALSE,
ColSideColors=t(rbind(
data_col,
hsv(1, data_time[1,], 1, 1)
)),
useRaster=FALSE,
lmat = rbind(c(5, 4, 0), c(0,1,0), c(3, 2,0)),
lhei = c(1, .5, .2),
lwid = c(1, 10, 1),
key=F
)
graphics.off()
Aggregate all domains heatmaps in a single pdf
agg_subclust_name = paste0(output_path, "/FIG4_Domain_sub_clustering.pdf")
subclust_path = grep("pdf", list.files(m_pops2[[1]]$plot_folder, full.names=T), value=T)
system2(command="gs", args=sprintf("-dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=%s %s", agg_subclust_name, paste0(subclust_path, collapse=" ")))
Download all neuronal subclusters
Gene average per cell type
subtypes.df = do.call(rbind, lapply(names(subclustering_params), function(p){
cbind.data.frame(
subtype_name = paste0(p, '.', m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids),
cellname = names(m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids),
subtype_cluster_id = m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids,
type = p
)
}))
subtype_map_genes = list(
"all"=unlist(lapply(subclustering_params, function(x) unlist(x$populations_markers))),
"custom"=c("Onecut2","Pou2f2","Zfhx2","Zfhx3","Zfhx4","Pou3f1","Nfia","Nfib","Nfix","Neurod2","Neurod6","Tcf4","Enc1","Olig3","Hes6","Neurog1","Neurog2","Ascl1","Gadd45g","Neurod1","Neurod4")
)
subtypes_gene_avg = data.frame(t(m$getReadcounts('Raw')[unique(unlist(subtype_map_genes)), which(pData(m$expressionSet)$Type_step2 %in% intersect(names(subclustering_params), names(m_pops2)))]), check.names=F) %>%
tibble::rownames_to_column('cellname') %>%
tidyr::gather(genename, value, -cellname) %>%
dplyr::left_join(subtypes.df, by="cellname") %>%
dplyr::group_by(genename, subtype_name) %>%
dplyr::mutate(mean=mean(value)) %>%
dplyr::distinct(genename, subtype_name, .keep_all=TRUE) %>%
dplyr::group_by(genename) %>%
dplyr::mutate(mean_norm_max=mean/max(mean),
type = factor(type, levels=names(subclustering_params))
) %>%
dplyr::ungroup()
mgnames = "custom"
print(mgnames)
p <- subtypes_gene_avg %>%
dplyr::mutate(
genename=factor(genename, levels=unique(subtype_map_genes[[mgnames]]))
) %>%
dplyr::filter(
genename %in% unique(subtype_map_genes[[mgnames]]) &
!subtype_name %in% excl_subtypes_cl
) %>%
ggplot(.) +
geom_count(aes(x=subtype_name, y=genename, size=mean_norm_max, fill=type), color='gray80', stroke=0, shape=21) +
scale_size_area(max_size=5) +
scale_fill_manual(breaks=names(pop_colors$Step2), values=pop_colors$Step2) +
# scale_color_manual(breaks=names(pop_colors$Step2), values=pop_colors$Step2) +
xlab("") + ylab("") +
coord_flip() +
# scale_x_discrete(position = "bottom") +
scale_y_discrete(position = "bottom") +
theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 0))#, size=8))
pdf(paste0(output_path, '/FIG5_subclustering_map_', mgnames,'_vertical.pdf'), height=10, width=6, useDingbats=FALSE)
print(p)
graphics.off()
mgnames = "all"
print(mgnames)
p <- subtypes_gene_avg %>%
dplyr::mutate(
genename=factor(genename, levels=unique(subtype_map_genes[[mgnames]]))
) %>%
dplyr::filter(
genename %in% unique(subtype_map_genes[[mgnames]]) &
!subtype_name %in% excl_subtypes_cl
) %>%
ggplot(.) +
geom_count(aes(x=genename, y=subtype_name, size=mean_norm_max, fill=type), color='gray80', stroke=0, shape=21) +
scale_size_area(max_size=3) +
scale_fill_manual(breaks=names(pop_colors$Step2), values=pop_colors$Step2) +
# scale_color_manual(breaks=names(pop_colors$Step2), values=pop_colors$Step2) +
scale_x_discrete(position = "top") + xlab("") + ylab("") +
theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 0, size=7))
pdf(paste0(output_path, '/FIG5_subclustering_map_', mgnames,'.pdf'), height=7, width=12, useDingbats=FALSE)
print(p)
graphics.off()
Export subcluster ids in new m_neural structure
m_neural = readRDS(paste0(output_path, '/m_neural.rds'))
pData(m_neural$expressionSet)$subtypes = rep(NA, m_neural$getNumberOfCells())
subtypes=unlist(lapply(grep("Null_", names(subclustering_params), value=T, invert=T), function(p){
# subtypes=unlist(lapply(grep("Null_", names(m_pops2), value=T, invert=T), function(p){
setNames(paste0(p, '.', m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids), names(m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids))
}))
pData(m_neural$expressionSet)[names(subtypes), "subtypes"] <- subtypes
pData(m_neural$expressionSet)$subtypes[which(pData(m_neural$expressionSet)$subtypes %in% excl_subtypes_cl)] <- NA
saveRDS(m_neural, file=paste0(output_path, '/m_neural_subtypes.rds'))
# m_neural = readRDS(file=paste0(output_path, '/m_neural_subtypes.rds'))
Plot the two waves of neurogenesis in selected domains (Figure 6A)
gene_list = c('Onecut2', 'Pou2f2', 'Zfhx3', 'Zfhx4', 'Nfia', 'Nfib', 'Neurod2', 'Neurod6')
domain_list = rev(c('V3', 'MN', 'V2b', 'V2a', 'V1', 'V0', "dl6", "dl5", "dl4", "dl3", "dl2", "dl1"))
dot.input2 = pData(m_neural$expressionSet) %>%
tibble::rownames_to_column("cell_name") %>%
dplyr::filter(Type_step2 %in% domain_list) %>%
dplyr::select(cell_name, timepoint, Type_step2) %>%
dplyr::left_join(
as.data.frame(t(m_neural$getReadcounts('Raw')[gene_list,])) %>% tibble::rownames_to_column("cell_name") %>% tidyr::gather(gene_name, level, -cell_name),
by="cell_name"
) %>%
dplyr::group_by(timepoint, gene_name, Type_step2) %>%
dplyr::summarize(mean=mean(level), count=n()) %>%
dplyr::ungroup() %>%
dplyr::mutate(
gene_name=factor(gene_name, levels=gene_list),
Type_step2=factor(Type_step2, levels=domain_list)
)
# dot.input2[which(dot.input2$count < min.cells), "mean"] <- NA
dot.input2 <- dot.input2 %>% dplyr::filter(Type_step2 %in% domain_list)
gg <- ggplot(dot.input2, aes(x = factor(timepoint), y = gene_name)) +
geom_count(aes(size = mean, fill = factor(timepoint)), color="black", shape=21) +
facet_wrap(~ unlist(dot.input2$Type_step2), nrow=2) +
scale_size_area(max_size = 5) +
scale_fill_manual(values=pop_colors$timepoint) +
theme_bw() +
ylab("Genes") +
xlab("Embryonic days") +
guides(color = 'none')
pdf(paste0(output_path, "FIG6_Domain_dynamics.pdf"), width = 10, height = 4, useDingbats = FALSE)
print(gg)
graphics.off()
Resample dataset to have as many cells at each pairs of (timepoint x domain) conditions
m_RP = m_neural$copy()
RNGkind("Mersenne-Twister")
set.seed(1)
equal_sample_size = 500
selected_cell_ids = unlist(lapply(
2:12, function(dv){
dv_ids = which(pData(m_RP$expressionSet)$DV==dv)
p = 1 / table(pData(m_RP$expressionSet)$timepoint[dv_ids])
sample(
dv_ids,
equal_sample_size,
prob=p[as.character(pData(m_RP$expressionSet)$timepoint[dv_ids])],
replace=T)
}))
m_RP$excludeCellsFromIds(setdiff(seq(m_RP$getNumberOfCells()), selected_cell_ids))
m_RP$excludeUnexpressedGenes(min.cells=1, min.level=1, verbose=TRUE, data_status="Raw")
# ### Run GM identification with subsample cells
Identify gene modules with Antler’s topCorr dimensional reduction technique
corr.mat.RP = fastCor(t(m_RP$getReadcounts(data_status='Raw')), method="spearman")
# Hack identifyGeneModules as it requires Normalized readcounts (TODO: remove this requirement)
m_RP$readcounts_norm = m_RP$readcounts_raw
m_RP$identifyGeneModules(
method="TopCorr_DR",
corr=corr.mat.RP,
topcorr_corr_min=1, # lower than usual
corr_t = 0.3,
topcorr_num_min_final_gms=200,
topcorr_mod_min_cell=10, # default
topcorr_mod_consistency_thres=0.4, # default
topcorr_mod_skewness_thres=-Inf, # default
topcorr_min_cell_level=.5, # default
data_status='Raw'
)
Select gene modules with common neuro-glio bifurcation markers
bif_genelist = list(
"early"=c('Lin28a'),
"glial"=c('Fabp7'),
"progenitor"=c('Sox2'),
"neuron"=c("Tubb3", "Elavl3")
)
m_RP$dR$genemodules = lapply(bif_genelist, function(l){
unlist(Filter(function(x){any(l %in% x)}, m_RP$topCorr_DR$genemodules))
})
names(m_RP$dR$genemodules) = names(bif_genelist)
m_RP$plotGeneModules(
basename='GM_RP_byDV_Resampled_Selected',
displayed.gms = c('dR.genemodules'),
displayed.geneset="naked",
use.dendrogram=NA,
display.clusters=NULL,
display.legend=T,
file_settings=
list(
list(type='pdf', width=30, height=15)
# list(type='png', width=2000, height=1500)
),
cell.ordering = order(pData(m_RP$expressionSet)$DV),
data_status='Raw',
gene_transformations='logscaled',
extra_colors=cbind(
pop_colors$timepoint[as.character(pData(m_RP$expressionSet)$timepoint)],
"Neuron/Progenitor"=ifelse(pData(m_RP$expressionSet)$Type_step1 == "Progenitor", pop_colors$Step1[['Progenitor']], pop_colors$Step1[['Neuron']]),
"DV position"=pop_colors$DV[as.character(pData(m_RP$expressionSet)$DV)]
),
pretty.params=(list("size_factor"=6, "ngenes_per_lines" = 10, "side.height.fraction"=.3))
)
# Remove DV bias with monocle
candidate_gms = m_RP$dR$genemodules
gm_genes_DV_test = m_RP$runMonocle2DETest(
fullModelFormulaStr="~DV",
reducedModelFormulaStr = "~1",
expressionFamily=VGAM::negbinomial.size(),
tested_genes=unlist(candidate_gms)
)
logpval_thres = 15
pdf(paste0(output_path, '/gene_DV_pval.pdf'), useDingbats=FALSE)
plot(-log10(gm_genes_DV_test$pval), seq(nrow(gm_genes_DV_test)), cex=0)
text(-log10(gm_genes_DV_test$pval), seq(nrow(gm_genes_DV_test)), gm_genes_DV_test$current_gene_names, cex=.3)
abline(v=logpval_thres, col="red")
dev.off()
biased_genes = gm_genes_DV_test %>% dplyr::mutate(logpval=-log10(pval)) %>% dplyr::filter(logpval > logpval_thres) %>% .$gene_short_name %>% as.character
selected_gms = candidate_gms %>% {lapply(., function(l) setdiff(l, biased_genes))}
saveRDS(selected_gms, paste0(output_path, '/DV_debiased_GMs.rds'))
# selected_gms = readRDS(paste0(output_path, '/DV_debiased_GMs.rds'))
m_RP$normalize("geometric_mean_sizeFactors")
m_neural$normalize("geometric_mean_sizeFactors")
pca.input=log(m_RP$getReadcounts('Normalized')[unlist(selected_gms), ]+1)
pca_res= prcomp(t(pca.input), center=TRUE, scale=TRUE)
pca_coord = pca_res$x[,1:2]
summary(pca_res)$importance[,1:2]
apply(pca_coord, 2, var)
table(abs(t(t(pca_res$x %*% t(pca_res$rotation)) * pca_res$scale + pca_res$center) - t(pca.input)) < 1e-10)
table(abs( pca_res$x - t((pca.input - pca_res$center) / pca_res$scale) %*% pca_res$rotation ) < 1e-10)
# Rotate all cells with PCA
pca.input_all=log(m_neural$getReadcounts('Normalized')[unlist(selected_gms), ]+1)
pca_res_all_x = t((pca.input_all - pca_res$center) / pca_res$scale) %*% pca_res$rotation
# Manual x-axis flip to have neuron on the right hand side
pca_res_all_x[, 1] <- - pca_res_all_x[, 1]
pData(m_neural$expressionSet)$Pseudotime = pca_res_all_x[, 1] %>% {100*((. - min(.))/(max(.) - min(.)))}
plotNeuroSpace(cell_domains=2:12, genelist=c('Lin28a', 'Fabp7', 'Sox2', "Tubb3", "Elavl3"), basename="Fig7A_")
# plotNeuroSpace(cell_domains=2, genelist=c('Sox2', "Tubb3"), basename="Fig7B_2_")
# plotNeuroSpace(cell_domains=3, genelist=c('Sox2', "Tubb3"), basename="Fig7B_3_")
# plotNeuroSpace(cell_domains=12, genelist=c('Sox2', "Tubb3"), basename="Fig7B_12_")
library(monocle)
df_pheno = pData(m_neural$expressionSet)
df_feature = cbind(fData(m_neural$expressionSet), 'gene_short_name'=fData(m_neural$expressionSet)$current_gene_names) # "gene_short_name" may be required by monocle
rownames(df_feature) <- df_feature$gene_short_name
currHSMM <- monocle::newCellDataSet(
as.matrix(m_neural$getReadcounts(data_status='Raw')),
phenoData = new("AnnotatedDataFrame", data = df_pheno),
featureData = new('AnnotatedDataFrame', data = df_feature),
lowerDetectionLimit = .5,
expressionFamily=VGAM::negbinomial.size()
)
currHSMM <- estimateSizeFactors(currHSMM)
currHSMM <- estimateDispersions(currHSMM) # only with VGAM::negbinomial.size()
smoothed_genes = getDispersedGenes(m_neural$getReadcounts('Normalized'), 0)
# smoothed_genes = sort(unique(c(
# (pt_genes.df %>% .$gene_list %>% unlist %>% unique %>% sort),
# unlist(lapply(highlighted_genes, function(x) x[[2]]))
# )))
num_pt = 100
mincells = 20
Smoothing runtime takes about 20 mins
t0=Sys.time()
bif_dataset_vgam = do.call(cbind, lapply(2:12, function(dv){
print(dv)
cds_subset <- currHSMM[smoothed_genes, which(pData(currHSMM)$DV == dv)]
newdata <- data.frame(Pseudotime = seq(min(pData(cds_subset)$Pseudotime), max(pData(cds_subset)$Pseudotime), length.out = num_pt))
sc <- monocle::genSmoothCurves(cds_subset, cores=1, trend_formula = '~sm.ns(Pseudotime, df=3)', relative_expr = T, new_data = newdata)
insufficiently_covered_genes = apply(m_neural$getReadcounts('Normalized')[rownames(exprs(cds_subset)), colnames(exprs(cds_subset))], 1, function(x){length(which(x>0))<mincells})
sc[insufficiently_covered_genes, ] <- NA
colnames(sc) <- paste0(dv, "_1_", seq(num_pt))
sc
}))
t1=Sys.time()
print(t1-t0)
bif_dataset_whole_log = log(1+bif_dataset_vgam)
pt_genes.df = read.table(file='./input_files/pseudotime_genes_short.csv', sep='\t', stringsAsFactors=F, header=TRUE, skipNul=T) %>%
tidyr::gather(gene_type, gene_list, -domain, -DV) %>%
dplyr::rowwise() %>%
dplyr::mutate(
gene_list=strsplit(gene_list, split=", ")
)
pt_genes.list = pt_genes.df %>% dplyr::arrange(desc(DV)) %>% .$gene_list %>% unlist %>% unique %>% as.list
plotNeurogenesisHeatmap(
bif_dataset_whole_log %>% is.na %>% replace(bif_dataset_whole_log, ., 0),
pt_clusters_reordered = pt_genes.list,
plotfullname=paste0(output_path, '/FIG7C_Neurogenesis_heatmap.pdf'),
num_pt=num_pt,
plotted_DV=2:12,
zscore_cap = NULL,
dv_colors=pop_colors$DV
)
highlighted_genes = list(
"dp2-dl2"=list(11, c("Foxd3", "Neurog1", "Olig3", "Msx1", "Sox2", "Tubb3")),
"dp3-dl3"=list(10, c("Isl1", "Neurog2", "Olig3", "Msx1", "Pax3", "Tubb3")),
"p0-V0"=list(6, c("Dbx1", "Evx1", "Neurog1", "Pax6", "Sox2", "Tubb3")),
"p3-V3"=list(2, c("Nkx2-2", "Neurog3", "Sim1", "Olig3", "Sox2"))
)
plot.data = t(apply(bif_dataset_whole_log[, grep(paste0('_1_'), colnames(bif_dataset_whole_log), value=T)], 1, function(x) x/max(x, na.rm=T)))
plot.data[is.na(plot.data)] <- 0
for(n in names(highlighted_genes)){
print(n)
NA
selected_domain_cols = paste0(highlighted_genes[[n]][[1]],'_1_', 1:num_pt)
# select genes
genes = highlighted_genes[[n]][[2]]
print(paste0("Missing genes:", paste0(genes %>% .[!. %in% rownames(plot.data)], collapse=', ')))
genes <- genes %>% .[. %in% rownames(plot.data)]
# set up dataset for ggplot
data = as.data.frame(plot.data[genes, selected_domain_cols]) %>%
tibble::rownames_to_column("genename") %>%
tidyr::gather(PT, level, -genename) %>%
dplyr::mutate(PT=factor(PT, levels=selected_domain_cols))
data <- data %>%
dplyr::group_by(genename) %>%
dplyr::mutate(level=level/max(level))
# ggplot
p <- ggplot(data) +
geom_line(aes(x=as.numeric(PT), y=level, color=genename, group=genename)) +
# scale_x_continuous(pretty(adata$PT)) +
scale_colour_manual(values = getClusterColors(v=2)) + ggtitle(n) + xlab("Pseudotime") +
ylab("Relative expression") + theme_classic() +
theme(axis.text.x = element_blank())
pdf(paste0(output_path, '/PT_profile_', n, '.pdf'), height=4, useDingbats=FALSE)
print(p)
graphics.off()
}
Export complete dataset meta-data, including all cell types
full_pData = read.table('./dataset/phenoData.csv', header=TRUE, sep="\t", row.names=1, as.is=TRUE, stringsAsFactors=F, check.names=FALSE)
full_pData$Type_step1 = rep("Outliers", nrow(full_pData))
full_pData[rownames(pData(m$expressionSet)), "Type_step1"] <- as.character(pData(m$expressionSet)$Type_step1)
full_pData$Type_step2 = rep("Outliers", nrow(full_pData))
full_pData[rownames(pData(m$expressionSet)), "Type_step2"] <- as.character(pData(m$expressionSet)$Type_step2)
full_pData[rownames(pData(m$expressionSet)), "Type_step2_unique"] <- as.character(pData(m$expressionSet)$Type_step2_unique)
full_pData$DV = rep(NA, nrow(full_pData))
full_pData[rownames(pData(m_neural$expressionSet)), "DV"] <- as.character(pData(m_neural$expressionSet)$DV)
full_pData$Neuron_subtypes = rep(NA, nrow(full_pData))
full_pData[rownames(pData(m_neural$expressionSet)), "Neuron_subtypes"] <- as.character(pData(m_neural$expressionSet)$subtypes)
full_pData$Pseudotime = rep(NA, nrow(full_pData))
full_pData[rownames(pData(m_neural$expressionSet)), "Pseudotime"] <- as.character(pData(m_neural$expressionSet)$Pseudotime)
write.table(x=full_pData[, c('timepoint', 'replicate_id', 'Type_step1', 'Type_step2', 'Type_step2_unique', 'DV', 'Neuron_subtypes', 'Pseudotime')], file='./output/phenoData_annotated.csv', sep='\t', row.names=TRUE, quote=TRUE, col.names=NA)