DNB: Compute the Dynamic Network Biomarkers(DNB) model

Documented in DNB DNBplot maxDNB mycor

#' @title DNB
#'
#' @description Compute the Dynamic Network Biomarkers(DNB) model
#'
#' @details return a DNB_obj object
#' @details write score results into DNB_score_matrix.txt
#' @details write gene list from the Module of max CI score into DNB_maxModule.txt
#'
#' @param data the gene expression matrix,
#'      which can be a single-cell RNA-seq GEM with at least three group/clusters
#'      or a matrix merging bulk GEMs from at least three different sample
#' @param meta a data.frame with rownames as cell-id as well as one column of group infomation
#' @param diffgenes which genes we're interested in
#' @param allgenes the whole genes that ordered by expression, or the rownames of GEM (default)
#' @param meta_levels the order of meta group, default ordered by decreasing if be null
#' @param high_method the method to select genes for the first step, by either high_cv (default) or top_gene
#' @param high_cutoff the cutoff value corresponding to the high_method,
#'      with the range between 0-1 for high_cv and 1-length(allgenes) for top_gene
#' @param cutree_method the method to select tree(module) numbers, by either h (height, default) or k (number K)
#' @param cutree_cutoff the cutoff value corresponding to the cutree_method,
#'      with the range between 0-1 for h and a number greater than 0 for k
#' @param minModule the min number of genes of the module meeting requirements
#' @param maxModule the max number of genes of the module meeting requirements
#' @param quiet do not print output of process during calculation, default FALSE
#'
#' @return DNB_obj
#'
#' @examples DNB(...)
#'
#' @author Kaiyu Wang, in ChenLab of CAS, Shanghai, China
#'
#' @export
#'
DNB <- function(
    data,
    meta,
    diffgenes,
    allgenes = NULL,
    meta_levels = NULL,
    high_method = c('high_cv', 'top_gene'),
    high_cutoff = 0.6,
    cutree_method = c('h', 'k'),
    cutree_cutoff = 0.98,
    minModule = 7,
    maxModule = 60,
    quiet = FALSE
) {
    if (!is.matrix(data) && !is.data.frame(data))
        stop("ERROR data input! Should be matrix or data.frame!")
    if (is.null(allgenes))
        allgenes <- rownames(data)
    if (!all(diffgenes %in% allgenes))
        stop("ERROR genes! Please check the allgenes or diffgenes input!")
    if (!is.data.frame(meta))
        meta <- as.data.frame(meta) # if meta is a vector with names
    if (!all(colnames(data) %in% rownames(meta)))
        stop("ERROR meta! Please check the rownames of meta df or the colnames of data input!")
    if (ncol(meta) != 1)
        stop("ERROR meta! Meta df only has one column!")
    if (!all(meta_levels %in% meta[, 1]))
        stop("ERROR meta_levels! Should be equal to meta!")
    match.arg(high_method)
    match.arg(cutree_method)
    if (length(high_method != 1))
        high_method <- high_method[1]
    if (length(cutree_method != 1))
        cutree_method <- cutree_method[1]
    if (high_method == "high_cv") {
        if (high_cutoff > 1 || high_cutoff <= 0)
            stop("ERROR high_cutoff! Should be between 0 and 1 when high_method is high_cv!")
    } else {
        if (high_cutoff < 1 || high_cutoff > length(allgenes))
            stop("ERROR high_cutoff! Should be between less than length of allgenes when high_method is top_gene!")
    }
    if (cutree_method == "h") {
        if (cutree_cutoff > 1 || cutree_cutoff <= 0)
            stop("ERROR cutree_cutoff! Should be between 0 and 1 when cutree_method is h!")
    } else {
        message("select tree by k! Carefully choose a proper value, or error may occur later.")
        if (cutree_cutoff < 1)
            stop("ERROR cutree_cutoff! Should be between greater than 1 when cutree_method is k!")
    }
    
    mycat <- function(..., quiet = F) {
        if (!quiet) cat(...)
    }
    
    if (is.null(meta_levels)) {
        meta <- meta[order(meta[, 1], decreasing = T), , drop = F]
        group <- unique(meta[, 1])
    } else {
        meta_tmp <- meta[1, , drop = F]
        rownames(meta_tmp) <- 'NA'
        for (i in meta_levels) {
            meta_tmp_tmp <- meta[meta[, 1] == i, , drop = F]
            meta_tmp_tmp <- meta_tmp_tmp[order(rownames(meta_tmp_tmp), decreasing = T), , drop = F]
            meta_tmp <- rbind(meta_tmp, meta_tmp_tmp)
        }
        meta <- meta_tmp[-1, , drop = F]
        group <- meta_levels
    }
    data <- data[, rownames(meta), drop = F]

    SCORE <- c()
    cv <- c()
    SD <- c()
    MEAN <- c()
    PCC_IN <- c()
    PCC_OUT <- c()
    MODULE <- list()
    bestMODULE <- c()

    for (group_tmp in group) {
        cat('Now run ', group_tmp, '\n', sep = '')
        data_tmp <- t(data[, rownames(meta)[meta[, 1] == group_tmp]])
        data_tmp <- data.frame(data_tmp, check.names = FALSE)

        mycat("\tProcess ...\n", quiet = quiet)
        sd_all <- sapply(data_tmp, sd)
        mean_all <- sapply(data_tmp, mean)
        cv_all <- sd_all / mean_all
        # cv_all[is.na(cv_all)] <- 0

        if (high_method == 'top_gene') {
            hig_gene <- allgenes[1:high_cutoff]
        } else if (high_method == 'high_cv') {
            hig_num <- ceiling(length(cv_all) * high_cutoff) #cv cutoff
            cv_rank <- sort(cv_all, decreasing = T, na.last = T)[1:hig_num]
            hig_gene <- names(cv_rank)
        }
        if (length(hig_gene) <= 1)
            stop("ERROR number of high gene! Only find one high gene or less!")
        sle_tab <- data_tmp[, hig_gene, drop = F]

        #calculate pcc
        cor_sle <- mycor(sle_tab)
        diag(cor_sle) <- 0
        dis_sle <- 1 - abs(cor_sle)

        #clustering
        mycat("\tCluster...\n", "\tcutree method be ", cutree_method, " with cutoff value ", cutree_cutoff, "\n", sep = "", quiet = quiet)
        dist <- as.dist(dis_sle)
        dist[is.na(dist)] <- 1
        clu_sle <- hclust(dist)
        if (cutree_method == 'h')
            cut_sle <- cutree(clu_sle, h = cutree_cutoff)    #pcc cutoff
        else if (cutree_method == 'k')
            tryCatch(
                cut_sle <- cutree(clu_sle, k = cutree_cutoff),
                error = function(x) stop("Not proper value of k!")
            )

        #calcute score
        count <- 1
        used_vecto <- NULL
        pcc_in <- c()
        pcc_out <- c()
        score <- c()
        cv_mean <- c()
        sd_mean <- c()
        mean_mean <- c()

        mycat("\tCompute score ...\n", quiet = quiet)
        for (i in 1:max(cut_sle)) {
            if (i %% 50 == 0)
                mycat("\t\tNow run ", i, "\n", sep = "", quiet = quiet)
            ###get out group data from whole gene base/high sd gene###
            group_name <- names(which(cut_sle == i))
            group_out <- setdiff(hig_gene, group_name) ##get out group from high sd gene
            num_in <- length(group_name)
            size <- length(group_name)
            
            if (size > minModule) {
                ###make in group & out group table###
                tab_in <- sle_tab[, group_name]
                tab_out <- sle_tab[, group_out]

                ###calculate parameters####
                cv_mean[count] <- mean(cv_all[group_name], na.rm = T)
                sd_mean[count] <- mean(sd_all[group_name])
                mean_mean[count] <- mean(mean_all[group_name])
                pcc_in[count] <- mean(abs(as.dist(mycor(tab_in))), na.rm = T)
                pcc_out[count] <- mean(abs(mycor(tab_in, tab_out)), na.rm = T) ###correlation of in 2 out

                score[count] <- ifelse(
                    pcc_out[count] == 0,
                    0,
                    sqrt(size) * sd_mean[count] * pcc_in[count] / pcc_out[count] ###score for ci
                )
                n <- num_in ##try n = 1 or n = num of module
                count <- count + 1
                used_vecto <- c(used_vecto, i)
            }
        }
        mycat("\tFind total of ", length(score), " scores! \n", quiet = quiet)

        mycat("\tCompute Modules ...\n", quiet = quiet)
        MODULE_names <- list()
        fit_n <- 1
        if (length(score) > 0) {
            for (mm in 1:length(score)) {
                MODULE_names[[mm]] <- names(which(cut_sle == used_vecto[mm]))
                intersctDiff <- intersect(diffgenes, MODULE_names[[mm]])
                if (length(intersctDiff) > 0) {
                    mycat("\t\tFind Module_", fit_n, ": ", sep = "", quiet = quiet)
                    mycat("CI->", score[mm], "; ", sep = "", quiet = quiet)
                    mycat("number of gene->", length(MODULE_names[[mm]]), "; ", sep = "", quiet = quiet)
                    mycat("intersested genes->", paste0(intersctDiff, collapse = ","), "\n", sep = "", quiet = quiet)
                    fit_n <- fit_n + 1
                }
            }
        } else {
            mycat("\t\tFind no module!\n", quiet = quiet)
        }

        mycat("\tFilter Modules ...\n\t\tProcess: ", quiet = quiet)
        index <- 0
        while (index >= 0) {
            if (length(score) == 0) {
                bestModule <- 0
                SCORE_tmp <- 0
                PCC_IN_tmp <- 0
                PCC_OUT_tmp <- 0
                cv_tmp <- 0
                sd_tmp <- 0
                mean_tmp <- 0
                MODULE_tmp <- ""
                mycat("\n\t\tNo Module meets requirements!", quiet = quiet)
                break
            }
            if (index %% 100 == 0)
                mycat(index, " ", sep = "", quiet = quiet)
            
            max_num <- which.max(score)
            SCORE_tmp <- max(score)
            PCC_IN_tmp <- pcc_in[max_num]
            PCC_OUT_tmp <- pcc_out[max_num]
            cv_tmp <- cv_mean[max_num]
            sd_tmp <- sd_mean[max_num]
            mean_tmp <- mean_mean[max_num]
            MODULE_tmp <- names(which(cut_sle == used_vecto[max_num]))
            
            LenMODULE <- length(MODULE_tmp)
            Lendiff <- length(intersect(diffgenes, MODULE_tmp))
            if (Lendiff > 0 && LenMODULE > minModule && LenMODULE < maxModule) {
                bestModule <- 1
                mycat("\n\t\tFind the Module meeting requirements!", quiet = quiet)
                break
            } else if (SCORE_tmp == 0) {
                bestModule <- 0
                MODULE_tmp <- ""
                mycat("\n\t\tNo Module meets requirements!", quiet = quiet)
                break
            } else {
                score[max_num] <- 0
            }
            index <- index + 1
        }
        mycat("\n", quiet = quiet)
        SCORE <- append(SCORE, SCORE_tmp)
        PCC_IN <- append(PCC_IN, PCC_IN_tmp)
        PCC_OUT <- append(PCC_OUT, PCC_OUT_tmp)
        cv <- append(cv, cv_tmp)
        SD <- append(SD, sd_tmp)
        MEAN <- append(MEAN, mean_tmp)
        MODULE <- append(MODULE, list(MODULE_tmp))
        bestMODULE <- append(bestMODULE, bestModule)
        mycat(group_tmp, ' ends up successfully!\n', sep = '', quiet = quiet)
    }

    MODULEs <- lapply(MODULE, function(x) paste0(x, collapse = ","))
    cat("Now write output into DNB_score_matrix.txt ...\n")
    out_table <- rbind(SCORE, PCC_IN, PCC_OUT, cv, SD, MEAN, bestMODULE, MODULEs)
    rownames(out_table)[4] <- "CV"
    rownames(out_table)[1] <- "CI"
    colnames(out_table) <- group
    write.table(out_table, "DNB_score_matrix.txt", quote = F)

    cat("Now write MODULE of max CI score into DNB_maxModule.txt ...\n")
    Module_index <- which.max(SCORE)
    if (length(Module_index) > 1) {
        warning("There are more than one Module sharing same max CI score. Choose one randomly.")
        Module_index <- Module_index[1]
    }
    write.table(MODULE[[Module_index]], "DNB_maxModule.txt", quote = F)

    res <- list(
        data = data, 
        group = group, 
        meta = meta, 
        CI = SCORE,
        PCC_IN = PCC_IN, 
        PCC_OUT = PCC_OUT, 
        CV = cv, 
        SD = SD,
        MEAN = MEAN, 
        MODULE = MODULE
    )
    class(res) <- "DNB_obj"

    cat("ALL run over!\n")
    return(res)
}

#' @title maxDNB
#'
#' @description Compute the Module of max CI score from DNB model
#'
#' @details return a maxDNB_obj object
#' @details write score results into maxDNB_score_matrix.txt
#'
#' @param DNB_obj output of function DNB
#'
#' @return maxDNB_obj
#'
#' @examples maxDNB_obj <- maxDNB(DNB_obj)
#'
#' @author Kaiyu Wang, in ChenLab of CAS, Shanghai, China
#'
#' @export
#'
maxDNB <- function(DNB_obj) {
    if (class(DNB_obj) != "DNB_obj")
        stop("ERROR input! Should be an 'DNB_obj' object")
    
    Module_index <- which.max(DNB_obj$CI)
    Module_SCORE <- max(DNB_obj$CI)
    if (Module_SCORE == 0) 
        stop("No Module meet requirement! CI score should be greater than 0!")
    if (length(Module_index) > 1) {
        warning("There are more than one Module sharing same max CI score.")
        Module_index <- Module_index[1]
    }
    dnb_name <- DNB_obj$MODULE[[Module_index]]

    all <- rownames(DNB_obj$data)
    dnb_out <- all[which(!all %in% dnb_name)]
    size <- length(dnb_name)

    score <- c()
    pcc_in <- c()
    pcc_out <- c()
    cv <- c()
    Sd <- c()
    Mean <- c()

    for (index_tmp in seq(DNB_obj$group)) {
        cat('Now run ', DNB_obj$group[index_tmp], '\n', sep = '')
        data_tmp <- t(DNB_obj$data[, rownames(DNB_obj$meta)[DNB_obj$meta[, 1] == DNB_obj$group[index_tmp]]])
        data_tmp <- data.frame(data_tmp, check.names = FALSE)
        sd_all <- sapply(data_tmp, sd)
        mean_all <- sapply(data_tmp, mean)
        cv_all <- sd_all / mean_all
        # cv_all[is.na(cv_all)] <- 0
        cv[index_tmp] <- mean(cv_all[dnb_name], na.rm = T) # if mean is zero, cv is not defined, so do not consider this one.
        Sd[index_tmp] <- mean(sd_all[dnb_name])
        Mean[index_tmp] <- mean(mean_all[dnb_name])
        tab_in <- data_tmp[, dnb_name]
        tab_out <- data_tmp[, dnb_out]
        pcc_in[index_tmp] <- mean(abs(as.dist(mycor(tab_in))), na.rm = T)
        pcc_out[index_tmp] <- mean(abs(mycor(tab_in, tab_out)), na.rm = T)
        score[index_tmp] <- ifelse(
            pcc_out[index_tmp] == 0,
            0,
            sqrt(size) * mean(sd_all[dnb_name]) * pcc_in[index_tmp] / pcc_out[index_tmp]
        )
    }

    DNB_table <- rbind(score, pcc_in, pcc_out, cv, Sd, Mean)
    colnames(DNB_table) <- DNB_obj$group
    rownames(DNB_table) <- c('CI', 'PCC_IN', 'PCC_OUT', 'CV', 'SD', 'MEAN')
    cat("Now write output into maxDNB_score_matrix.txt ...\n")
    write.table(DNB_table, "maxDNB_score_matrix.txt", quote = F)

    res <- list(
        CI = score, 
        PCC_IN = pcc_in, 
        PCC_OUT = pcc_out, 
        CV = cv, 
        SD = Sd, 
        MEAN = Mean, 
        group = DNB_obj$group
    )
    class(res) <- "maxDNB_obj"

    cat("ALL run over!\n")
    return(res)
}

#' @title DNBplot
#'
#' @description Visualize the result of DNB model
#'
#' @details plot by 1) ggplot2 if installed, or 2) R built-in plot
#' @details plot into \{file_prefix\}(_)(max)DNB_(gg)plot.pdf
#'
#' @param DNB_obj output of function 1) DNB or 2) maxDNB
#' @param file_prefix prefix of save pdf file, default NULL
#' @param show whether if plot be shown in Console, default FALSE
#'
#' @return NULL
#'
#' @examples DNB_obj <- list(CI = runif(3, 0, 1),
#'                 PCC_IN = runif(3, 0, 1),
#'                 PCC_OUT = runif(3, 0, 1),
#'                 SD = runif(3, 0, 1),
#'                 group = c("a","b","c"))
#' @examples class(DNB_obj) <- 'DNB_obj' # or 'maxDNB_obj'
#' @examples DNBplot(DNB_obj, show = TRUE)
#'
#' @author Kaiyu Wang, in ChenLab of CAS, Shanghai, China
#'
#' @export
#'
DNBplot <- function(DNB_obj, file_prefix = NULL, show = FALSE) {
    if (!class(DNB_obj) %in% c('DNB_obj', 'maxDNB_obj'))
        stop("ERROR input! Should be an 'DNB_obj' or 'maxDNB_obj' object")

    df_score <- data.frame(
        CI = DNB_obj$CI,
        PCC_IN = DNB_obj$PCC_IN,
        PCC_OUT = DNB_obj$PCC_OUT,
        SD = DNB_obj$SD,
        Names = factor(DNB_obj$group, levels = DNB_obj$group)
    )
    
    ggplot_exist <- suppressPackageStartupMessages(require(ggplot2)) && suppressPackageStartupMessages(require(gridExtra))
    if (ggplot_exist) {
        pp <- function(df, y) {
            df_score <- df[, c("Names", y)]
            colnames(df_score)[2] <- 'y'

            ggplot(df_score, aes(x = Names, y = y, group = 1)) +
            geom_point() +
            geom_line() +
            theme_classic(base_size = 15) +
            ggtitle(y) +
            theme(axis.text.x  = element_text(face ="bold", size = 12, color = "black"),
                  axis.text.y  = element_text(face ="bold", size = 12, color = "black"),
                  axis.title.x = element_blank(),
                  axis.title.y = element_blank(),
                  plot.title = element_text(hjust = 0.5))
        }
        pdf_file <- paste0(sub("_obj$", "", class(DNB_obj)), "_ggplot.pdf")
    } else {
        pp <- function(df_score, y) {
            plot(df_score[, y], type = "l", col = "blue", main = y, xaxt = "n", xlab = "group")
            points(df_score[, y], col = "black")
            axis(1, seq(df_score$Names), df_score$Names)
        }
        pdf_file <- paste0(sub("_obj$", "", class(DNB_obj)), "_plot.pdf")
    }
    
    pdf_file <- ifelse(
        is.null(file_prefix),
        pdf_file,
        paste(file_prefix, pdf_file, sep = "_")
    )
    
    cat("Plot into ", pdf_file, " ...\n", sep = "")
    pdf(pdf_file, height = 10, width = 10)
    if (!ggplot_exist) 
        layout(matrix(1:4, ncol = 2))
    p1 <- pp(df_score, 'CI')
    p2 <- pp(df_score, 'PCC_IN')
    p3 <- pp(df_score, 'PCC_OUT')
    p4 <- pp(df_score, 'SD')
    if (ggplot_exist)
        grid.arrange(p1, p2, p3, p4)
    dev.off()

    if (!is.null(dev.list()))
        while (names(dev.list())[length(dev.list())] == 'pdf') {
            dev.off() # error often occurs with unknown reason
            if (is.null(dev.list())) break
        }
    
    if (show) {
        if (ggplot_exist) {
            grid.arrange(p1, p2, p3, p4)
        } else { 
            layout(matrix(1:4, ncol = 2))
            p1; p2; p3; p4
        }
    }
    
    cat("Plot over!\n")
}

#' @title mycor
#'
#' @description custom function of cor(correlation)
#'
#' @details ignore the warning and error of correlation computation
#'
#' @param ... same to parameters of cor()
#'
#' @author Kaiyu Wang, in ChenLab of CAS, Shanghai, China
#'
mycor <- function(...) {
  # to ignore the warning and error of correlation computation
  obj <- suppressWarnings(cor(...))
  obj[is.na(obj)] <- 1
  obj
}