Error occurred in running test data
Opened this issue · 1 comments
Hello NetMoss team, what a wonderful job!
When I test the NetMoss pipeline according to your manual, some errors occurred in netROC():
myROC = netROC(case_dir,control_dir,marker,metadata)
[1] "training datasets..."
Error in y - ymean : non-numeric argument to binary operator
In addition: Warning messages:
1: In randomForest.default(m, y, ...) :
The response has five or fewer unique values. Are you sure you want to do regression?
2: In mean.default(y) : argument is not numeric or logical: returning NA
How could I fix it?
Hello NetMoss team, what a wonderful job! When I test the NetMoss pipeline according to your manual, some errors occurred in netROC():
myROC = netROC(case_dir,control_dir,marker,metadata)
[1] "training datasets..."
Error in y - ymean : non-numeric argument to binary operator
In addition: Warning messages:
1: In randomForest.default(m, y, ...) :
The response has five or fewer unique values. Are you sure you want to do regression?
2: In mean.default(y) : argument is not numeric or logical: returning NAHow could I fix it?
Hi I am not a team member but I polished the netROC
function and run it successfully. Below is the code:
netROC <- function (case_dir, control_dir, marker, metadata, plot.roc = T,
train.num = 20)
{
my.wd2 = getwd()
print("training datasets...")
case_data_list <- list()
case_samples <- c()
setwd(case_dir)
dir3 = list.files()
n3 = length(dir3)
for (m in 1:n3) {
x.case.net = read.table(file = dir3[m], header = T, sep = "\t",
row.names = 1)
case_data_list[[m]] = x.case.net
case_samples <- c(case_samples, length(colnames(x.case.net)) -
1)
}
control_data_list <- list()
control_samples <- c()
setwd(control_dir)
dir4 = list.files()
n4 = length(dir4)
for (m in 1:n4) {
x.control.net = read.table(file = dir4[m], header = T,
sep = "\t", row.names = 1)
control_data_list[[m]] = x.control.net
control_samples <- c(control_samples, length(colnames(x.control.net)) -
1)
}
roc.all = list()
for (kk in 1:n3) {
eval(parse(text = paste("h = control_data_list[[", kk,
"]]", sep = "")))
eval(parse(text = paste("d = case_data_list[[", kk, "]]",
sep = "")))
hh = h
hh$genus = rownames(hh)
hh = melt(hh, id.vars = "genus")
dd = d
dd$genus = rownames(dd)
dd = melt(dd, id.vars = "genus")
all1 = rbind(hh, dd)
all2 = dcast(all1, genus ~ variable, mean, fill = 0)
sample.all = all2[, -1]
rownames(sample.all) = all2$genus
aa = data.frame(t(sample.all))
aa$sum = rowSums(aa)
aa = aa[which(aa$sum != 0), ]
sample.all2 = data.frame(aa[, -ncol(aa)])
m.marker = intersect(as.character(colnames(sample.all2)),
as.character(rownames(marker)))
m.meta = metadata[as.character(rownames(metadata)) %in%
as.character(rownames(sample.all2)), ]
eval(parse(text = paste("auc_crc", kk, "_all = data.frame()",
sep = "")))
for (mm in 1:train.num) {
train.data = sample(rownames(sample.all2), nrow(sample.all2) *
0.7, replace = F)
sample.train = sample.all2[as.character(rownames(sample.all2)) %in%
as.character(train.data), ]
sample.train$group = m.meta[as.character(rownames(sample.train)),
"type"]
test.data = setdiff(rownames(sample.all2), train.data)
sample.test = sample.all2[as.character(rownames(sample.all2)) %in%
as.character(test.data), ]
sample.test$group = m.meta[as.character(rownames(sample.test)),
"type"]
if (length(intersect(m.marker, colnames(sample.train))) ==
1) {
sample.train2 = data.frame(sample.train[, as.character(colnames(sample.train)) %in%
as.character(m.marker)])
colnames(sample.train2) = as.character(intersect(m.marker,
colnames(sample.train)))
rownames(sample.train2) = rownames(sample.train)
}
else {
sample.train2 = sample.train[, as.character(colnames(sample.train)) %in%
as.character(m.marker)]
}
sample.train2$group = as.factor(sample.train$group) # Factorise sample.train$group
rf.train = randomForest(group ~ ., sample.train2,
ntree = 1000, nperm = 100, importance = T)
importance_rf = data.frame(importance(rf.train))
importance_rf = importance_rf[order(importance_rf$MeanDecreaseAccuracy,
decreasing = T), ]
train.5_10 = replicate(5, rfcv(sample.train2[-ncol(sample.train2)],
sample.train2$group, cv.fold = 10, step = 1.5),
simplify = F)
train.5_10.2 = data.frame(sapply(train.5_10, "[[",
"error.cv"))
train.5_10.2$names = rownames(train.5_10.2)
train.5_10.2 = melt(train.5_10.2, id = "names")
train.5_10.2$names = as.numeric(as.character(train.5_10.2$names))
train.5_10.2 = summaryBy(value ~ names, train.5_10.2,
FUN = mean)
marker.num = min(train.5_10.2[which(train.5_10.2$value.mean ==
min(train.5_10.2$value.mean)), 1])
marker.re = data.frame(rownames(importance_rf[1:marker.num,
]))
colnames(marker.re) = "Name"
if (length(intersect(marker.re$Name, colnames(sample.train))) ==
1) {
sample.train3 = data.frame(sample.train[, as.character(colnames(sample.train)) %in%
as.character(marker.re$Name)])
colnames(sample.train3) = as.character(intersect(marker.re$Name,
colnames(sample.train)))
rownames(sample.train3) = rownames(sample.train)
} else {
sample.train3 = sample.train[, as.character(colnames(sample.train)) %in%
as.character(marker.re$Name)]
}
sample.train3$group = as.factor(sample.train$group) # Factorise sample.train$group
rf.train = randomForest(group ~ ., sample.train3,
ntree = 1000, nperm = 100, importance = T)
test_pred.select = predict(rf.train, sample.test)
test_freq.select = table(test_pred.select, sample.test$group)
group_select = data.frame(test_pred.select)
group_select$test_orig.select = sample.test[as.character(rownames(group_select)),
"group"] %>% as.factor() ## Factorise test_orig.select otherwise it cannot be converted to a numeric variable.
e11 = group_select
e11$test_pred.select = as.numeric(e11$test_pred.select)
e11$test_orig.select = as.numeric(e11$test_orig.select)
e11[which(e11$test_orig.select != 1), 2] = 0
e11[which(e11$test_pred.select != 1), 1] = 0
for (i in 1:nrow(e11)) ifelse(e11[i, 1] == e11[i,
2], e11[i, 3] <- 1, e11[i, 3] <- 0)
b = data.frame(predict(rf.train, sample.test, type = "prob"))
e11$prob = b[as.character(rownames(e11)), "disease"]
p = ggplot(e11, aes(d = test_orig.select, m = prob)) +
geom_roc(n.cuts = 0) + style_roc()
auc <- calc_auc(p)
eval(parse(text = paste("roc_crc", kk, "_", mm, " = e11",
sep = "")))
eval(parse(text = paste("auc_crc", kk, "_all[", mm,
",1] = ", mm, sep = "")))
eval(parse(text = paste("auc_crc", kk, "_all[", mm,
",2] = auc$AUC", sep = "")))
if (auc$AUC > 0.7) {
eval(parse(text = paste("roc.all[[", kk, "]] = roc_crc",
kk, "_", mm, sep = "")))
break
}
if (mm == train.num) {
eval(parse(text = paste("m.auc = which(auc_crc",
kk, "_all$V2 == max(auc_crc", kk, "_all$V2))",
sep = "")))
eval(parse(text = paste("roc.all[[", kk, "]] = roc_crc",
kk, "_", m.auc, sep = "")))
}
}
}
for (nn in 1:n3) {
eval(parse(text = paste("crc", nn, " = roc.all[[", nn,
"]]", sep = "")))
eval(parse(text = paste("crc", nn, "$type = 'CRC", nn,
"'", sep = "")))
}
if (n3 > 1) {
mydata = crc1
for (nn in 2:n3) {
eval(parse(text = paste("mydata = rbind(mydata,crc",
nn, ")", sep = "")))
}
w.case = data.frame(sample = case_samples)
w.case$w1 = 1/w.case$sample * sum(w.case$sample)
w.case$w2 = w.case$w1/sum(w.case$w1)
y <- mydata$test_orig.select
w = rep(w.case[1, 3], nrow(crc1))
for (nn in 2:n3) {
eval(parse(text = paste("w = c(w,rep(w.case[", nn,
",3],nrow(crc", nn, ")))", sep = "")))
}
} else {
mydata = crc1
w.case = data.frame(sample = case_samples)
w.case$w1 = 1/w.case$sample * sum(w.case$sample)
w.case$w2 = w.case$w1/sum(w.case$w1)
y <- mydata$test_orig.select
w = rep(w.case[1, 3], nrow(crc1))
}
y.hat <- mydata$prob
tp.fp <- WeightedROC(y.hat, y, w)
if (plot.roc)
p2 = ggplot() + geom_path(aes(FPR, TPR), data = tp.fp) +
# coord_equal() + # I muted coord_equal() because it would result in error for me.
annotate("text", x = 0.75, y = 0.15,
label = paste("AUC =", round(WeightedAUC(tp.fp),
2))) + labs(x = "False positive fraction", y = "True positive fraction",
title = "Combined ROC") + theme_bw()
setwd(my.wd2)
ggsave("NetMoss_ROC.png", p2)
tp.fp2 = tp.fp[, c(3, 1, 2)]
return(tp.fp2)
}