#' @title DNB
#'
#' @description Compute the Dynamic Network Biomarkers(DNB) model
#'
#' @details return a DNB_obj object
#' @details write score results into DNB_score_matrix.txt
#' @details write gene list from the Module of max CI score into DNB_maxModule.txt
#'
#' @param data the gene expression matrix,
#' which can be a single-cell RNA-seq GEM with at least three group/clusters
#' or a matrix merging bulk GEMs from at least three different sample
#' @param meta a data.frame with rownames as cell-id as well as one column of group infomation
#' @param diffgenes which genes we're interested in
#' @param allgenes the whole genes that ordered by expression, or the rownames of GEM (default)
#' @param meta_levels the order of meta group, default ordered by decreasing if be null
#' @param high_method the method to select genes for the first step, by either high_cv (default) or top_gene
#' @param high_cutoff the cutoff value corresponding to the high_method,
#' with the range between 0-1 for high_cv and 1-length(allgenes) for top_gene
#' @param cutree_method the method to select tree(module) numbers, by either h (height, default) or k (number K)
#' @param cutree_cutoff the cutoff value corresponding to the cutree_method,
#' with the range between 0-1 for h and a number greater than 0 for k
#' @param minModule the min number of genes of the module meeting requirements
#' @param maxModule the max number of genes of the module meeting requirements
#' @param quiet do not print output of process during calculation, default FALSE
#'
#' @return DNB_obj
#'
#' @examples DNB(...)
#'
#' @author Kaiyu Wang, in ChenLab of CAS, Shanghai, China
#'
#' @export
#'
DNB <- function(
data,
meta,
diffgenes,
allgenes = NULL,
meta_levels = NULL,
high_method = c('high_cv', 'top_gene'),
high_cutoff = 0.6,
cutree_method = c('h', 'k'),
cutree_cutoff = 0.98,
minModule = 7,
maxModule = 60,
quiet = FALSE
) {
if (!is.matrix(data) && !is.data.frame(data))
stop("ERROR data input! Should be matrix or data.frame!")
if (is.null(allgenes))
allgenes <- rownames(data)
if (!all(diffgenes %in% allgenes))
stop("ERROR genes! Please check the allgenes or diffgenes input!")
if (!is.data.frame(meta))
meta <- as.data.frame(meta) # if meta is a vector with names
if (!all(colnames(data) %in% rownames(meta)))
stop("ERROR meta! Please check the rownames of meta df or the colnames of data input!")
if (ncol(meta) != 1)
stop("ERROR meta! Meta df only has one column!")
if (!all(meta_levels %in% meta[, 1]))
stop("ERROR meta_levels! Should be equal to meta!")
match.arg(high_method)
match.arg(cutree_method)
if (length(high_method != 1))
high_method <- high_method[1]
if (length(cutree_method != 1))
cutree_method <- cutree_method[1]
if (high_method == "high_cv") {
if (high_cutoff > 1 || high_cutoff <= 0)
stop("ERROR high_cutoff! Should be between 0 and 1 when high_method is high_cv!")
} else {
if (high_cutoff < 1 || high_cutoff > length(allgenes))
stop("ERROR high_cutoff! Should be between less than length of allgenes when high_method is top_gene!")
}
if (cutree_method == "h") {
if (cutree_cutoff > 1 || cutree_cutoff <= 0)
stop("ERROR cutree_cutoff! Should be between 0 and 1 when cutree_method is h!")
} else {
message("select tree by k! Carefully choose a proper value, or error may occur later.")
if (cutree_cutoff < 1)
stop("ERROR cutree_cutoff! Should be between greater than 1 when cutree_method is k!")
}
mycat <- function(..., quiet = F) {
if (!quiet) cat(...)
}
if (is.null(meta_levels)) {
meta <- meta[order(meta[, 1], decreasing = T), , drop = F]
group <- unique(meta[, 1])
} else {
meta_tmp <- meta[1, , drop = F]
rownames(meta_tmp) <- 'NA'
for (i in meta_levels) {
meta_tmp_tmp <- meta[meta[, 1] == i, , drop = F]
meta_tmp_tmp <- meta_tmp_tmp[order(rownames(meta_tmp_tmp), decreasing = T), , drop = F]
meta_tmp <- rbind(meta_tmp, meta_tmp_tmp)
}
meta <- meta_tmp[-1, , drop = F]
group <- meta_levels
}
data <- data[, rownames(meta), drop = F]
SCORE <- c()
cv <- c()
SD <- c()
MEAN <- c()
PCC_IN <- c()
PCC_OUT <- c()
MODULE <- list()
bestMODULE <- c()
for (group_tmp in group) {
cat('Now run ', group_tmp, '\n', sep = '')
data_tmp <- t(data[, rownames(meta)[meta[, 1] == group_tmp]])
data_tmp <- data.frame(data_tmp, check.names = FALSE)
mycat("\tProcess ...\n", quiet = quiet)
sd_all <- sapply(data_tmp, sd)
mean_all <- sapply(data_tmp, mean)
cv_all <- sd_all / mean_all
# cv_all[is.na(cv_all)] <- 0
if (high_method == 'top_gene') {
hig_gene <- allgenes[1:high_cutoff]
} else if (high_method == 'high_cv') {
hig_num <- ceiling(length(cv_all) * high_cutoff) #cv cutoff
cv_rank <- sort(cv_all, decreasing = T, na.last = T)[1:hig_num]
hig_gene <- names(cv_rank)
}
if (length(hig_gene) <= 1)
stop("ERROR number of high gene! Only find one high gene or less!")
sle_tab <- data_tmp[, hig_gene, drop = F]
#calculate pcc
cor_sle <- mycor(sle_tab)
diag(cor_sle) <- 0
dis_sle <- 1 - abs(cor_sle)
#clustering
mycat("\tCluster...\n", "\tcutree method be ", cutree_method, " with cutoff value ", cutree_cutoff, "\n", sep = "", quiet = quiet)
dist <- as.dist(dis_sle)
dist[is.na(dist)] <- 1
clu_sle <- hclust(dist)
if (cutree_method == 'h')
cut_sle <- cutree(clu_sle, h = cutree_cutoff) #pcc cutoff
else if (cutree_method == 'k')
tryCatch(
cut_sle <- cutree(clu_sle, k = cutree_cutoff),
error = function(x) stop("Not proper value of k!")
)
#calcute score
count <- 1
used_vecto <- NULL
pcc_in <- c()
pcc_out <- c()
score <- c()
cv_mean <- c()
sd_mean <- c()
mean_mean <- c()
mycat("\tCompute score ...\n", quiet = quiet)
for (i in 1:max(cut_sle)) {
if (i %% 50 == 0)
mycat("\t\tNow run ", i, "\n", sep = "", quiet = quiet)
###get out group data from whole gene base/high sd gene###
group_name <- names(which(cut_sle == i))
group_out <- setdiff(hig_gene, group_name) ##get out group from high sd gene
num_in <- length(group_name)
size <- length(group_name)
if (size > minModule) {
###make in group & out group table###
tab_in <- sle_tab[, group_name]
tab_out <- sle_tab[, group_out]
###calculate parameters####
cv_mean[count] <- mean(cv_all[group_name], na.rm = T)
sd_mean[count] <- mean(sd_all[group_name])
mean_mean[count] <- mean(mean_all[group_name])
pcc_in[count] <- mean(abs(as.dist(mycor(tab_in))), na.rm = T)
pcc_out[count] <- mean(abs(mycor(tab_in, tab_out)), na.rm = T) ###correlation of in 2 out
score[count] <- ifelse(
pcc_out[count] == 0,
0,
sqrt(size) * sd_mean[count] * pcc_in[count] / pcc_out[count] ###score for ci
)
n <- num_in ##try n = 1 or n = num of module
count <- count + 1
used_vecto <- c(used_vecto, i)
}
}
mycat("\tFind total of ", length(score), " scores! \n", quiet = quiet)
mycat("\tCompute Modules ...\n", quiet = quiet)
MODULE_names <- list()
fit_n <- 1
if (length(score) > 0) {
for (mm in 1:length(score)) {
MODULE_names[[mm]] <- names(which(cut_sle == used_vecto[mm]))
intersctDiff <- intersect(diffgenes, MODULE_names[[mm]])
if (length(intersctDiff) > 0) {
mycat("\t\tFind Module_", fit_n, ": ", sep = "", quiet = quiet)
mycat("CI->", score[mm], "; ", sep = "", quiet = quiet)
mycat("number of gene->", length(MODULE_names[[mm]]), "; ", sep = "", quiet = quiet)
mycat("intersested genes->", paste0(intersctDiff, collapse = ","), "\n", sep = "", quiet = quiet)
fit_n <- fit_n + 1
}
}
} else {
mycat("\t\tFind no module!\n", quiet = quiet)
}
mycat("\tFilter Modules ...\n\t\tProcess: ", quiet = quiet)
index <- 0
while (index >= 0) {
if (length(score) == 0) {
bestModule <- 0
SCORE_tmp <- 0
PCC_IN_tmp <- 0
PCC_OUT_tmp <- 0
cv_tmp <- 0
sd_tmp <- 0
mean_tmp <- 0
MODULE_tmp <- ""
mycat("\n\t\tNo Module meets requirements!", quiet = quiet)
break
}
if (index %% 100 == 0)
mycat(index, " ", sep = "", quiet = quiet)
max_num <- which.max(score)
SCORE_tmp <- max(score)
PCC_IN_tmp <- pcc_in[max_num]
PCC_OUT_tmp <- pcc_out[max_num]
cv_tmp <- cv_mean[max_num]
sd_tmp <- sd_mean[max_num]
mean_tmp <- mean_mean[max_num]
MODULE_tmp <- names(which(cut_sle == used_vecto[max_num]))
LenMODULE <- length(MODULE_tmp)
Lendiff <- length(intersect(diffgenes, MODULE_tmp))
if (Lendiff > 0 && LenMODULE > minModule && LenMODULE < maxModule) {
bestModule <- 1
mycat("\n\t\tFind the Module meeting requirements!", quiet = quiet)
break
} else if (SCORE_tmp == 0) {
bestModule <- 0
MODULE_tmp <- ""
mycat("\n\t\tNo Module meets requirements!", quiet = quiet)
break
} else {
score[max_num] <- 0
}
index <- index + 1
}
mycat("\n", quiet = quiet)
SCORE <- append(SCORE, SCORE_tmp)
PCC_IN <- append(PCC_IN, PCC_IN_tmp)
PCC_OUT <- append(PCC_OUT, PCC_OUT_tmp)
cv <- append(cv, cv_tmp)
SD <- append(SD, sd_tmp)
MEAN <- append(MEAN, mean_tmp)
MODULE <- append(MODULE, list(MODULE_tmp))
bestMODULE <- append(bestMODULE, bestModule)
mycat(group_tmp, ' ends up successfully!\n', sep = '', quiet = quiet)
}
MODULEs <- lapply(MODULE, function(x) paste0(x, collapse = ","))
cat("Now write output into DNB_score_matrix.txt ...\n")
out_table <- rbind(SCORE, PCC_IN, PCC_OUT, cv, SD, MEAN, bestMODULE, MODULEs)
rownames(out_table)[4] <- "CV"
rownames(out_table)[1] <- "CI"
colnames(out_table) <- group
write.table(out_table, "DNB_score_matrix.txt", quote = F)
cat("Now write MODULE of max CI score into DNB_maxModule.txt ...\n")
Module_index <- which.max(SCORE)
if (length(Module_index) > 1) {
warning("There are more than one Module sharing same max CI score. Choose one randomly.")
Module_index <- Module_index[1]
}
write.table(MODULE[[Module_index]], "DNB_maxModule.txt", quote = F)
res <- list(
data = data,
group = group,
meta = meta,
CI = SCORE,
PCC_IN = PCC_IN,
PCC_OUT = PCC_OUT,
CV = cv,
SD = SD,
MEAN = MEAN,
MODULE = MODULE
)
class(res) <- "DNB_obj"
cat("ALL run over!\n")
return(res)
}
#' @title maxDNB
#'
#' @description Compute the Module of max CI score from DNB model
#'
#' @details return a maxDNB_obj object
#' @details write score results into maxDNB_score_matrix.txt
#'
#' @param DNB_obj output of function DNB
#'
#' @return maxDNB_obj
#'
#' @examples maxDNB_obj <- maxDNB(DNB_obj)
#'
#' @author Kaiyu Wang, in ChenLab of CAS, Shanghai, China
#'
#' @export
#'
maxDNB <- function(DNB_obj) {
if (class(DNB_obj) != "DNB_obj")
stop("ERROR input! Should be an 'DNB_obj' object")
Module_index <- which.max(DNB_obj$CI)
Module_SCORE <- max(DNB_obj$CI)
if (Module_SCORE == 0)
stop("No Module meet requirement! CI score should be greater than 0!")
if (length(Module_index) > 1) {
warning("There are more than one Module sharing same max CI score.")
Module_index <- Module_index[1]
}
dnb_name <- DNB_obj$MODULE[[Module_index]]
all <- rownames(DNB_obj$data)
dnb_out <- all[which(!all %in% dnb_name)]
size <- length(dnb_name)
score <- c()
pcc_in <- c()
pcc_out <- c()
cv <- c()
Sd <- c()
Mean <- c()
for (index_tmp in seq(DNB_obj$group)) {
cat('Now run ', DNB_obj$group[index_tmp], '\n', sep = '')
data_tmp <- t(DNB_obj$data[, rownames(DNB_obj$meta)[DNB_obj$meta[, 1] == DNB_obj$group[index_tmp]]])
data_tmp <- data.frame(data_tmp, check.names = FALSE)
sd_all <- sapply(data_tmp, sd)
mean_all <- sapply(data_tmp, mean)
cv_all <- sd_all / mean_all
# cv_all[is.na(cv_all)] <- 0
cv[index_tmp] <- mean(cv_all[dnb_name], na.rm = T) # if mean is zero, cv is not defined, so do not consider this one.
Sd[index_tmp] <- mean(sd_all[dnb_name])
Mean[index_tmp] <- mean(mean_all[dnb_name])
tab_in <- data_tmp[, dnb_name]
tab_out <- data_tmp[, dnb_out]
pcc_in[index_tmp] <- mean(abs(as.dist(mycor(tab_in))), na.rm = T)
pcc_out[index_tmp] <- mean(abs(mycor(tab_in, tab_out)), na.rm = T)
score[index_tmp] <- ifelse(
pcc_out[index_tmp] == 0,
0,
sqrt(size) * mean(sd_all[dnb_name]) * pcc_in[index_tmp] / pcc_out[index_tmp]
)
}
DNB_table <- rbind(score, pcc_in, pcc_out, cv, Sd, Mean)
colnames(DNB_table) <- DNB_obj$group
rownames(DNB_table) <- c('CI', 'PCC_IN', 'PCC_OUT', 'CV', 'SD', 'MEAN')
cat("Now write output into maxDNB_score_matrix.txt ...\n")
write.table(DNB_table, "maxDNB_score_matrix.txt", quote = F)
res <- list(
CI = score,
PCC_IN = pcc_in,
PCC_OUT = pcc_out,
CV = cv,
SD = Sd,
MEAN = Mean,
group = DNB_obj$group
)
class(res) <- "maxDNB_obj"
cat("ALL run over!\n")
return(res)
}
#' @title DNBplot
#'
#' @description Visualize the result of DNB model
#'
#' @details plot by 1) ggplot2 if installed, or 2) R built-in plot
#' @details plot into \{file_prefix\}(_)(max)DNB_(gg)plot.pdf
#'
#' @param DNB_obj output of function 1) DNB or 2) maxDNB
#' @param file_prefix prefix of save pdf file, default NULL
#' @param show whether if plot be shown in Console, default FALSE
#'
#' @return NULL
#'
#' @examples DNB_obj <- list(CI = runif(3, 0, 1),
#' PCC_IN = runif(3, 0, 1),
#' PCC_OUT = runif(3, 0, 1),
#' SD = runif(3, 0, 1),
#' group = c("a","b","c"))
#' @examples class(DNB_obj) <- 'DNB_obj' # or 'maxDNB_obj'
#' @examples DNBplot(DNB_obj, show = TRUE)
#'
#' @author Kaiyu Wang, in ChenLab of CAS, Shanghai, China
#'
#' @export
#'
DNBplot <- function(DNB_obj, file_prefix = NULL, show = FALSE) {
if (!class(DNB_obj) %in% c('DNB_obj', 'maxDNB_obj'))
stop("ERROR input! Should be an 'DNB_obj' or 'maxDNB_obj' object")
df_score <- data.frame(
CI = DNB_obj$CI,
PCC_IN = DNB_obj$PCC_IN,
PCC_OUT = DNB_obj$PCC_OUT,
SD = DNB_obj$SD,
Names = factor(DNB_obj$group, levels = DNB_obj$group)
)
ggplot_exist <- suppressPackageStartupMessages(require(ggplot2)) && suppressPackageStartupMessages(require(gridExtra))
if (ggplot_exist) {
pp <- function(df, y) {
df_score <- df[, c("Names", y)]
colnames(df_score)[2] <- 'y'
ggplot(df_score, aes(x = Names, y = y, group = 1)) +
geom_point() +
geom_line() +
theme_classic(base_size = 15) +
ggtitle(y) +
theme(axis.text.x = element_text(face ="bold", size = 12, color = "black"),
axis.text.y = element_text(face ="bold", size = 12, color = "black"),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
plot.title = element_text(hjust = 0.5))
}
pdf_file <- paste0(sub("_obj$", "", class(DNB_obj)), "_ggplot.pdf")
} else {
pp <- function(df_score, y) {
plot(df_score[, y], type = "l", col = "blue", main = y, xaxt = "n", xlab = "group")
points(df_score[, y], col = "black")
axis(1, seq(df_score$Names), df_score$Names)
}
pdf_file <- paste0(sub("_obj$", "", class(DNB_obj)), "_plot.pdf")
}
pdf_file <- ifelse(
is.null(file_prefix),
pdf_file,
paste(file_prefix, pdf_file, sep = "_")
)
cat("Plot into ", pdf_file, " ...\n", sep = "")
pdf(pdf_file, height = 10, width = 10)
if (!ggplot_exist)
layout(matrix(1:4, ncol = 2))
p1 <- pp(df_score, 'CI')
p2 <- pp(df_score, 'PCC_IN')
p3 <- pp(df_score, 'PCC_OUT')
p4 <- pp(df_score, 'SD')
if (ggplot_exist)
grid.arrange(p1, p2, p3, p4)
dev.off()
if (!is.null(dev.list()))
while (names(dev.list())[length(dev.list())] == 'pdf') {
dev.off() # error often occurs with unknown reason
if (is.null(dev.list())) break
}
if (show) {
if (ggplot_exist) {
grid.arrange(p1, p2, p3, p4)
} else {
layout(matrix(1:4, ncol = 2))
p1; p2; p3; p4
}
}
cat("Plot over!\n")
}
#' @title mycor
#'
#' @description custom function of cor(correlation)
#'
#' @details ignore the warning and error of correlation computation
#'
#' @param ... same to parameters of cor()
#'
#' @author Kaiyu Wang, in ChenLab of CAS, Shanghai, China
#'
mycor <- function(...) {
# to ignore the warning and error of correlation computation
obj <- suppressWarnings(cor(...))
obj[is.na(obj)] <- 1
obj
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.