The R package WMRCA
integrates commonly used computational biology methods for cancer subtype identification and provides a standardized framework for cancer subtype analysis based on multi-omics data, such as gene expression, miRNA expression, DNA methylation and others. See vignette('WMRCA')
for the comprehensive user guide.
To install the WMRCA package in R, simply type:
# Install devtools, attach it, then install WMRCA from its GitHub repository.
install.packages("devtools")
library("devtools")
install_github("guojunliu7/WMRCA")
If the installation invokes the i386 architecture, try:
# Same install, passing "--no-multiarch" through INSTALL_opts.
install_github("guojunliu7/WMRCA", INSTALL_opts = "--no-multiarch")
To make the WMRCA package fully functional (especially the Open Babel and rJava related functionalities), we recommend the following:
Several dependencies of the WMRCA package may require system-level libraries; check the corresponding manuals of these packages for detailed installation guides.
Code snippet: example for mRNA data preprocessing
# Preprocessing pipeline. The same steps should be used for mRNA, miRNA,
# lncRNA, CNV and methylation datasets.
mRNA <- read.csv(
  "nsclc_mRNA_expr.csv",
  header = TRUE, sep = ",", row.names = 1, check.names = FALSE
)

# Data filter
mRNA1 <- data.filter(mRNA, percentage = 0.8)

# Data filling: recode zeros as NA, convert to a matrix, impute with fun = "mean"
mRNA1[mRNA1 == 0] <- NA
mRNA1 <- as.matrix(mRNA1)
mRNA2 <- data.imputation(mRNA1, fun = "mean")

# Feature selection (cut.type = "topk", value = 1000)
mRNA3 <- SbyVar(mRNA2, cut.type = "topk", value = 1000)

# Data standardization, then write the result to CSV
mRNA4 <- data.normalization(mRNA3, type = "feature_Median", log2 = TRUE)
write.csv(mRNA4, file = "preprocessed_nsclc_mRNA.csv")
Code snippet: feature selection based on metabolism dataset
# Load the mRNAexp dataset and run metabolism() on it with type = "Amino_acid".
# NOTE(review): the result is stored in `data_lipid` although the requested
# type is "Amino_acid" -- confirm which one is intended.
data("mRNAexp")
data_lipid <- metabolism(mRNAexp, type = "Amino_acid")
Code snippet: Execute WMRCA based on five datasets
# Load the five example datasets.
data(mRNAexp, miRNAexp, lncRNAexp, methylation, CNVmatrix)

# Per-dataset preparation, in the same order for every dataset:
# filter (percentage = 0.6) -> recode zeros as NA -> matrix -> impute (mean).
prepare_omics <- function(omics) {
  omics <- data.filter(omics, percentage = 0.6)
  omics[omics == 0] <- NA
  data.imputation(as.matrix(omics), fun = "mean")
}

mRNAexp <- prepare_omics(mRNAexp)
miRNAexp <- prepare_omics(miRNAexp)
lncRNAexp <- prepare_omics(lncRNAexp)
methylation <- prepare_omics(methylation)
CNVmatrix <- prepare_omics(CNVmatrix)

# Run WMRCA on the five prepared matrices.
results <- WMRCA(
  mRNAexp, miRNAexp, lncRNAexp, methylation, CNVmatrix,
  maxK = 6, reps = 50, pItem = 0.8, pFeature = 1,
  clusterAlg = "hc", distance = "pearson", innerLinkage = "complete",
  seed = 1262118388.71279, plot = "pdf"
)
Code snippet: example of running Similarity Network Fusion (SNF)
# Run SNF on the five example datasets.
data(mRNAexp, miRNAexp, lncRNAexp, methylation, CNVmatrix)
result <- runSNF(
  mRNAexp, miRNAexp, lncRNAexp, methylation, CNVmatrix,
  clusterNum = 3, K = 20, alpha = 0.5, t = 20
)
# Show the `group` element of the returned object.
result$group
# runiCluster, runCC and runCNMF are called in a similar way.
Code snippet: Visualization
# survival analysis
data(results)
data(meta)

# Proportion of ambiguous clustering (PAC) for every candidate K in 2..maxK:
# the share of consensus-matrix entries (lower triangle) that fall in the
# intermediate interval (x1, x2].
maxK <- 6
Kvec <- 2:maxK
x1 <- 0.1 # lower threshold of the intermediate sub-interval
x2 <- 0.9 # upper threshold of the intermediate sub-interval
PAC <- vapply(Kvec, function(k) {
  M <- results[[k]]$consensusMatrix
  Fn <- ecdf(M[lower.tri(M)])
  Fn(x2) - Fn(x1)
}, numeric(1))
names(PAC) <- paste0("K=", Kvec) # from 2 to maxK

# Pick the K with the smallest PAC and use it for the survival plot
# (previously the computed value was ignored in favour of a hard-coded 3).
optK <- Kvec[which.min(PAC)]
Surplot(results, meta, optK = optK)
# differential expression analysis
data(mRNAexp)
mRNAexp <- as.matrix(mRNAexp)
# Randomly pick 20 of the first 100 columns to pass as Normal_Data.
normal_cols <- sample(seq_len(100), 20)
Normal_Data <- as.matrix(mRNAexp[, normal_cols])
result <- Dif.limma(
  Tumor_Data = mRNAexp, Normal_Data = Normal_Data,
  group = NULL, topk = NULL, RNAseq = FALSE
)
# PAC (proportion of ambiguous clustering) plot from the dot_df dataset
data("dot_df")
PACplot(dot_df)
# drug sensitivity (in development)
data(exprData)
data(studyResponse)
# NOTE(review): `bortIndex` is not created in this snippet -- presumably it is
# provided by one of the data() calls above; confirm.
drugsensitivity(
  testMatrix = exprData,
  studyResponse = studyResponse,
  bortIndex = bortIndex,
  drug = "Bosutinib",
  tissueType = "all",
  batchCorrect = "eb",
  selection = 1,
  dataset = "cgp2014",
  colours = c("#e94753", "#47a4e9")
)
# calculate indices
setwd("/Users/LGJ-15/Manuscripts/CFC_20230304")
inteMatrix <- read.csv(
  "integrated_results_12-Dec-2023 19.30.csv",
  header = TRUE, sep = ",", row.names = 1, check.names = FALSE
)
# NOTE(review): `results_CoC` is not defined in this snippet; it must already
# exist in the session -- confirm where it comes from.
a <- indices(
  inteMatrix, results_CoC,
  stas_all = TRUE, plot = TRUE, clusterNum = 6, vote = TRUE
)
# Check distribution: open a 4 x 4 PDF device, run data.checkDistribution()
# on mRNAexp, then close the device.
pdf(file = "checkDistribution.pdf", height = 4, width = 4)
data.checkDistribution(mRNAexp)
# Close the device so the PDF is finalised and later plots do not end up in it.
dev.off()
# plot ROC -- step 1: confusion matrix and classification metrics
# library() stops immediately if ROCR is missing; require() would only
# return FALSE and let the script fail later.
library(ROCR)
setwd("/Users/LGJ-15/Manuscripts/nsclc/results/cluster_subtype_info")
combined_results <- read.csv(
  "/Users/LGJ-15/Manuscripts/CFC_20230304/nsclc_mRNA_results_by_ConsensusClusterPlus_31-Dec-2023 03.22.csv",
  header = TRUE, sep = ",", check.names = FALSE
)

# Rows: predicted label (mRNA_cc); columns: actual label (sample).
confusion_matrix <- table(combined_results$mRNA_cc, combined_results$sample)
print(confusion_matrix)

# The fixed [1, 1] .. [2, 2] indexing below is only meaningful for a 2 x 2 table.
stopifnot(identical(dim(confusion_matrix), c(2L, 2L)))
tn <- confusion_matrix[1, 1]
tp <- confusion_matrix[2, 2]
fp <- confusion_matrix[2, 1]
fn <- confusion_matrix[1, 2]
# # only for cnmf, The correct predictions are all on the anti-diagonal line.
# tn <- confusion_matrix [2, 1]
# tp <- confusion_matrix [1, 2]
# fp <- confusion_matrix [1, 1]
# fn <- confusion_matrix [2, 2]

sensitivity <- tp / (tp + fn) # true positive rate (recall)
specificity <- tn / (tn + fp) # true negative rate
ppp <- tp / (tp + fp) # positive predictive value
npp <- tn / (tn + fn) # negative predictive value
accuracy <- (tp + tn) / (tp + tn + fp + fn)

n <- 3 # number of decimal places to report
result <- paste0(
  "Sensitivity = ", round(sensitivity, n),
  "\nSpecificity = ", round(specificity, n), # "\n" starts a new line
  "\nPositive Predictive Value = ", round(ppp, n),
  "\nNegative Predictive Value = ", round(npp, n),
  "\nAccuracy = ", round(accuracy, n), "\n"
)
cat(result)
# Draw the confusion matrix twice into a single 4 x 4 PDF.
pdf(file = "confusion matrix_mRNA_CC.pdf", height = 4, width = 4)
# First plot: plot() of the table
plot(
  confusion_matrix,
  col = c("#FF0000", "#0000FF"),
  main = "Confusion Matrix",
  xlab = "Predicted",
  ylab = "Actual"
)
# Second plot: fourfold display of the same table
fourfoldplot(
  confusion_matrix,
  color = c("green", "red"),
  main = "Confusion Matrix"
)
dev.off()
# Plot ROC curve
library(ROCR)
library(RColorBrewer)

pred <- prediction(combined_results$mRNA_cc, combined_results$sample)
perf <- performance(pred, "tpr", "fpr") # for CNMF, need to swap "tpr" and "fpr"

# Compute the AUC once: print it, then reuse the value for the legend.
auc_perf <- performance(pred, "auc")
auc_perf@y.values # AUC value
auc <- unlist(slot(auc_perf, "y.values"))

# Colour palette; only the second colour is used below.
set1 <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Dark2"))

pdf(file = "ROC_mRNA_CC.pdf", height = 4, width = 4)
# No trailing comma after the last argument (it would pass an empty argument).
plot(
  perf,
  xlim = c(0, 1), ylim = c(0, 1), col = set1[2],
  main = "",
  lwd = 3,
  cex.main = 1.3,
  cex.lab = 1.2,
  cex.axis = 1.2,
  font = 1 # `font` is an integer code; the former 1.2 was truncated to 1
)
#plot(pred,col=set1[2],add = TRUE,lwd = 3)
legend(
  "bottomright",
  legend = paste("AUC value =", round(auc, 3)),
  bty = "n", col = set1[2], lty = 1, lwd = 3
)
# Dashed diagonal: reference line with intercept 0 and slope 1.
graphics::abline(a = 0, b = 1, lwd = 2, lty = 2)
dev.off()
# plot t-SNE
library(Rtsne)
library(ggplot2) # ggplot(), aes(), geom_point(), stat_ellipse(), theme_*() are used below
set.seed(123456)

# Coerce every column to numeric. as.data.frame() comes first so this also
# works when mRNAexp is already a matrix (lapply() on a bare matrix would
# iterate over single cells instead of columns).
intersection_tumor <- as.data.frame(lapply(as.data.frame(mRNAexp), as.numeric))
row.names(intersection_tumor) <- row.names(mRNAexp)
colnames(intersection_tumor) <- colnames(mRNAexp)
# Drop rows containing NA, then convert to a matrix.
intersection_tumor <- as.matrix(na.omit(intersection_tumor))

# Rtsne is run on the transposed matrix.
tSNE_res <- Rtsne(
  t(intersection_tumor),
  dims = 2, perplexity = 10, verbose = FALSE,
  max_iter = 500, check_duplicates = FALSE
)

# NOTE(review): `annCol` is not defined in this snippet; `annCol$results` is
# presumably the per-sample cluster assignment -- confirm.
tsne <- data.frame(
  tSNE1 = tSNE_res[["Y"]][, 1],
  tSNE2 = tSNE_res[["Y"]][, 2],
  cluster = annCol$results
)

# theme_bw() is a complete theme and must come BEFORE theme(); added after,
# it would override legend.position and the legend would not be at the top.
ggplot(tsne, aes(x = tSNE1, y = tSNE2, color = cluster)) +
  geom_point(size = 4.5, alpha = 0.5) +
  stat_ellipse(level = 0.85, show.legend = FALSE) +
  theme_bw() +
  theme(legend.position = "top")
WMRCA implemented and integrated the state-of-the-art clustering algorithms and multi-omics data modeling analysis functionalities with R.
- We have refined the cluster-of-clusters method into the Weighted Majority Rule-based Cluster-of-clusters Approach (WMRCA), which integrates multi-omics data to accurately predict tumor subtypes.
- The WMRCA model uses ten evaluation metrics to assess clustering performance and applies a weighted majority voting rule to automatically determine the optimal number of clusters.
- The WMRCA+ model, which features genes related to lipid metabolism, demonstrates superior performance compared to other commonly used clustering algorithms.
Guojun Liu, et al. WMRCA: A weighted majority rule-based cluster-of-clusters approach for cancer subtype prediction using multi-omics data. Briefings in Bioinformatics, submitted.
The WMRCA package is developed by the Inner Mongolia Key Laboratory of Functional Genomics and Bioinformatics, Inner Mongolia University of Science and Technology, Baotou, China.
- Guojun Liu gjliu77@gmail.com