R/univariate_extension.R
In CePa: Centrality-Based Pathway Enrichment

Documented in cepa.univariate cepa.univariate.all

# == title
# Apply centrality-extented GSA on a list of pathways
#
# == param
# -mat expression matrix in which rows are genes and columns are samples
# -label a `sampleLabel` object identify the design of the microarray experiment
# -pc a ``pathway.catalogue`` object storing information of pathways
# -cen centrality measuments, it can ce a string, or a function
# -cen.name centrality measurement names. By default it is parsed from ``cen`` argument
# -nlevel node level transformation, should be one of "tvalue", "tvalue_sq", "tvalue_abs".
#            Also self-defined functions are allowed
# -plevel pathway level transformation, should be one of "max", "min", "median", "sum", "mean", "rank".
#           Also, self-defined functions are allowed
# -iter number of simulations
#
# == details
# The traditional gene-set analysis (GSA) to find significant pathways 
# uses the whole expression matrix. GSA methods are implemented via either a univariate 
# or a multivariate procedure. In univariate analysis, node level statistics are 
# initially calculated from fold changes or statistical tests (e.g., t-test). 
# These statistics are then combined into a pathway level statistic by summation or 
# averaging. Multivariate analysis considers the correlations between genes in the 
# pathway and calculates the pathway level statistic directly from the expression 
# value matrix using Hotelling's T^2 test or MANOVA models. The function implement
# univariate procedure of GSA with network centralities.
#
# If users use the `PID.db` data, all genes should be formatted in gene symbol.
#
# If the centrality measurement is set as a string, only pre-defined "equal.weight",
# "in.degree", "out.degree", "degree", "betweenness", "in.reach", "out.reach",
# "reach", "in.spread", "out.spread" and "spread" are allowed. More centrality
# measurements can be used by setting it as a function (such as closeness,
# cluster coefficient). In the function, we recommand users choose
# at least two centrality measurements. Note that the self-defined function should
# only contain one argument which is an igraph object. The default centralities are "equal.weight",
# "in.degree", "out.degree", "betweenness", "in.reach" and "out.reach".
#
# The node level statistic can be self-defined. The self-defined function should contain
# two arguments: a vector for expression value in treatment class and a vector for
# expression value in control class.
#
# The pathway level statistic can be self-defined. The self-defined function should
# only contain one argument: the vector of node-level statistic.
#
# However, in most circumstance, the function is called by `cepa.all`.
#
# We are sorry that only the univariate procedures in GSA are extended. We are still
# trying to figure out the extension for the multivariate procedures in GSA.
#
# == value
# A `cepa.all` class object
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == seealso
# `cepa`
#
# == example
# \dontrun{
# data(PID.db)
# # GSA extension
# # P53_symbol.gct and P53.cls can be downloaded from
# # http://mcube.nju.edu.cn/jwang/lab/soft/cepa/
# eset = read.gct("http://mcube.nju.edu.cn/jwang/lab/soft/cepa/P53_symbol.gct")
# label = read.cls("http://mcube.nju.edu.cn/jwang/lab/soft/cepa/P53.cls", 
#     treatment="MUT", control="WT")
# # will spend about 45 min
# res.gsa = cepa.univariate.all(mat = eset, label = label, pc = PID.db$NCI)
# }
cepa.univariate.all = function(mat, label, pc, cen = default.centralities,
           cen.name = sapply(cen, function(x) ifelse(mode(x) == "name", deparse(x), x)), 
           nlevel = "tvalue_abs", plevel = "mean", iter = 1000) {
    
    nlevelFun = find.nlevelFun(nlevel)  # if nlevelFun is binary, the adj.method and cutoff have been hard coded in the functions
    plevelFun = find.plevelFun(plevel)
    
    # if cen argument is a function, the function should be quoted or substituted
    # because we need the function name
    for(ce in cen) {
        if(is.function(ce)) {
            stop("Functions cannot be used directly because we need the function name, use quote or substitute.\n")
        }
    }
    
    if(!inherits(pc, "pathway.catalogue")) {
        stop("pc argument should be a pathway.catalogue object.")
    }
    
    # if binary transformation
    # function need a whole expression matrix to calculate adjust p-values
    cat("  Calculate gene level values.\n")
    
    n.pathway = length(pc$pathList)
    
    # initialize pathway.result
    pathway.name = names(pc$pathList)    
    pathway.result = list()
    length(pathway.result) = n.pathway
    # pathway.result is like a two layer list
    pathway.result = lapply(pathway.result, function(x) {
                              y = list()
                              length(y) = length(cen.name)
                              names(y) = cen.name
                              return(y)
                            })
    names(pathway.result) = pathway.name
    
    cat("  Calculate pathway score...\n")
    
    for(i in 1:length(pc$pathList)) {
        
        cat("    ", i, "/", length(pc$pathList), ", ", pathway.name[i], "...\n", sep="")
        
        path = pc$pathList[[i]]
        inter = pc$interactionList[pc$interactionList[, 1] %in% path, 2:3]
        
        pathway = generate.pathway(as.matrix(inter))
        
        
        
        
        cat("      Calculate node level value and permutate sample labels...\n")
        # nodes in the pathway
        node = pathway.nodes(pathway)
        
         # only the mapping in the pathway
        mapping = pc$mapping[pc$mapping[, 1] %in% node, ]
    
        # gene expression just in the pathway
        l = rownames(mat) %in% mapping[, 2]
        cat("      ", sum(l), " genes measured in the pathway...\n", sep = "")
        # if no genes are measure in the pathway
        if(sum(l) == 0) {
            node.level.from.expr = rep(0, length(node))
            node.level.t.value = rep(0, length(node))
            r.node.level.from.expr = matrix(0, nrow = length(node), ncol = iter)
        } else {
            mat.gene = mat[l, ,drop = FALSE]
            mat.gene = t(apply(mat.gene, 1, scale))  # really need to be scaled?
            
            # generate node expression from its member genes
            # if there is no member gene, the expression is zero
            mat.node = matrix(0, nrow=length(node), ncol = dim(mat)[2])
            rownames(mat.node) = node
            for(k in 1:length(node)) {
                l = mapping[, 1] == node[k]
                gene.in.node = unique(mapping[l, 2])
                gene.in.node.in.mat = gene.in.node[gene.in.node %in% rownames(mat.gene)]
                if(length(gene.in.node.in.mat) == 1) {
                    mat.node[k, ] = mat.gene[gene.in.node.in.mat, ]
                } else if(length(gene.in.node.in.mat) > 1) {  # use the biggeset component of member gene expression matrix
                    mm = t(mat.gene[gene.in.node.in.mat, ])
                    pcar = prcomp(mm)
                    mat.node[k, ] = predict(pcar, mm)[, 1]
                }
            }
            
            node.level.from.expr = apply(mat.node, 1, function(x) nlevelFun(x[.treatment(label)], x[.control(label)]))
            node.level.t.value = apply(mat.node, 1, function(x) nodeLevelFun.tvalue(x[.treatment(label)], x[.control(label)]))
            
            node.level.from.expr[is.na(node.level.from.expr)] = 0
            node.level.t.value[is.na(node.level.t.value)] = 0
    
            r.node.level.from.expr = matrix(0, nrow = length(node), ncol = iter)
            for(k in 1:iter) {
                r.label = .permutate(label)
                r.node.level.from.expr[, k] = apply(mat.node, 1, function(x) nlevelFun(x[.treatment(r.label)], x[.control(r.label)]))
                r.node.level.from.expr[is.na(r.node.level.from.expr[, k]), k] = 0
            }
        }
        
        j = 0
        for(ce in cen) {
            j = j + 1
            pathway.result[[i]][[j]] = cepa.univariate(mat = mat, label = label, pc = pc, pathway = pathway,
                                                    cen = ce, iter = iter, nlevel = nlevel, plevel = plevel,
                                                    node.level.from.expr = node.level.from.expr, node.level.t.value = node.level.t.value,
                                                    r.node.level.from.expr = r.node.level.from.expr)
            
            cat("      - ", ce, ": ", round(pathway.result[[i]][[j]]$p.value, 3), "\n", sep = "")
        }
    }

    class(pathway.result) = "cepa.all"
    return(pathway.result)

}

# == title
# Apply centrality-extended GSA on a single pathway
#
# == param
# -mat expression matrix in which rows are genes and columns are samples
# -label a `sampleLabel` object identify the design of the microarray experiment
# -pc a ``pathway.catalogue`` object storing information of pathways
# -pathway `igraph::igraph` object or edge list
# -id identify the number of the pathway in the catalogue
# -cen centrality measuments, it can ce a string, or function has been quote
# -cen.name centrality measurement names
# -nlevel node level transformation, should be one of "tvalue", "tvalue_sq", "tvalue_abs".
#             Also self-defined functions are allowed, see `cepa.univariate.all` for detail.
# -plevel pathway level transformation, should be one of "max", "min", "median", "sum", "mean", "rank".
#             Also, self-defined functions are allowed, see `cepa.univariate.all` for detail.
# -node.level.from.expr for simplicity of computing
# -node.level.t.value for simplicity of computing
# -r.node.level.from.expr for simplicity of computing
# -iter number of simulations
#
# == details
# The function is always called by `cepa.univariate.all`. But you can still
# use it if you realy want to analysis just one pathway under one centrality.
#
# == value
# A `cepa` class object
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# \dontrun{
#
# data(PID.db)
#
# # GSA extension
# # P53_symbol.gct and P53_cls can be downloaded from
# # http://mcube.nju.edu.cn/jwang/lab/soft/cepa/
# eset = read.gct("P53_symbol.gct")
# label = read.cls("P53.cls", treatment="MUT", control="WT")
# # will spend about 45 min
# res.gsa = cepa.univariate(mat = eset, label = label, pc = PID.db$NCI, id = 2)
# }
cepa.univariate = function(mat, label, pc, pathway = NULL, id = NULL, cen = "equal.weight",
                   cen.name = if(is.function(cen)) deparse(substitute(cen)) 
                              else if(mode(cen) == "name") deparse(cen) 
                              else cen,
                   iter = 1000, nlevel = "tvalue_abs", plevel = "mean",
                   node.level.from.expr = NULL, node.level.t.value = NULL,
                   r.node.level.from.expr = NULL) {
    
    # in this version, we do not allow user-defined nlevel, nlevel and plevel functions
    nlevelFun = find.nlevelFun(nlevel)  # if nlevelFun is binary, the adj.method and cutoff have been hard coded in the functions
    plevelFun = find.plevelFun(plevel)
    
    
    # you can specify a pathway object or a pathway id
    if(! is.null(pathway)) {   # a pathway is specified
        if(is.matrix(pathway) || is.data.frame(pathway)) {   # only edge list
            if(length(dim(pathway)) != 2 || dim(pathway)[2] != 2) {
                stop("if pathway is a matrix or data frame, it should be 2 dimension and the number of columns is 2.\n")
            }
            pathway = generate.pathway(pathway)    # generate an igraph object
        } else if(!inherits(pathway, "igraph")) {    # it should be an igraph object
            stop("Since pathway is not formatted as edge list, it should be an igraph object.")
        }
    } else if(! is.null(id)) {  # if the pathway is not specified, but the pathway ID is available
        # get interactions in the pathway
        path = pc$pathList[[id]]
        inter = pc$interactionList[pc$interactionList[, 1] %in% path, 2:3]
        # generate graph from edge list
        pathway = generate.pathway(inter)
    } else {  # one of pathway and id should be set
        stop("You should specify pathway argument or id argument.")
    }

    if(iter < 100) {
        stop("Iterations should not be smaller than 100.\n")
    }
    
    # single centrality!
    if(length(cen) != 1) {
        stop("Length of cen must be equal to 1.\n") 
    }
    
    weight = centrality(pathway, cen)

    add = 0
    # if there are none-zero weight values
    if(sum(weight == 0) != length(weight)) {
        add = min(weight[weight > 0])/100
    }
    weight = weight + ifelse(sum(weight == weight[1]) == length(weight), 0, add)
    
    # nodes in the pathway
    node = pathway.nodes(pathway)
    
     # only the mapping in the pathway
    mapping = pc$mapping[pc$mapping[, 1] %in% node, ]
    
    # get node names formatted with genes
    node.name = node
    member = character(0)
    for(i in 1:length(node)) {
        # genes that exsit in the node
        l = mapping[, 1] == node[i]
        
        # if find nodes with genes mapped
        if(sum(l)) {
            member = sort(unique(mapping[l, 2]))
            node.name[i] = paste(member, collapse = "\n")
        }
    }
    
    if(is.null(node.level.from.expr)) {
        # gene expression just in the pathway
        l = rownames(mat) %in% mapping[, 2]
        if(sum(l) == 0) {
            node.level.from.expr = rep(0, length(node))
            node.level.t.value = rep(0, length(node))
        } else {
            mat.gene = mat[l, ,drop=FALSE]
            mat.gene = t(apply(mat.gene, 1, scale))  # really need to be scaled?
            
            # generate node expression from its member genes
            # if there is no member gene, the expression is zero
            mat.node = matrix(0, nrow=length(node), ncol = dim(mat)[2])
            rownames(mat.node) = node
            for(i in 1:length(node)) {
                l = mapping[, 1] == node[i]
                gene.in.node = unique(mapping[l, 2])
                gene.in.node.in.mat = gene.in.node[gene.in.node %in% rownames(mat.gene)]
                if(length(gene.in.node.in.mat) == 1) {
                    mat.node[i, ] = mat.gene[gene.in.node.in.mat, ]
                } else if(length(gene.in.node.in.mat) > 1) {  # use the biggeset component of member gene expression matrix
                    mm = t(mat.gene[gene.in.node.in.mat, ])
                    pcar = prcomp(mm)
                    mat.node[i, ] = predict(pcar, mm)[, 1]
                }
            }
            
            node.level.from.expr = apply(mat.node, 1, function(x) nlevelFun(x[.treatment(label)], x[.control(label)]))
            node.level.t.value = apply(mat.node, 1, function(x) nodeLevelFun.tvalue(x[.treatment(label)], x[.control(label)]))
        }
    }
    
    node.level.from.expr[is.na(node.level.from.expr)] = 0
    node.level.t.value[is.na(node.level.t.value)] = 0
    
    node.level = weight * node.level.from.expr
    ds = quantile(node.level, c(1, 0.75, 0.5, 0))
    names(ds) = c("max", "q75", "median", "min")
    
    s = plevelFun(node.level)    # calculate nodes
    
    # now the permutation
    s.random = numeric(iter)
    ds.random = matrix(0, iter, 4)   # descriptive statistic of the node
    colnames(ds.random) = c("max", "q75", "median", "min")
    if(sum(rownames(mat) %in% mapping[, 2]) == 0) {
        s.random = rep(0, iter)
    } else {
        for(i in 1:iter) {
            
            if(is.null(r.node.level.from.expr)) {
                r.label = .permutate(label)
                r.node.level.from.expr.current = apply(mat.node, 1, function(x) nlevelFun(x[.treatment(r.label)], x[.control(r.label)]))
            } else {
                r.node.level.from.expr.current = r.node.level.from.expr[, i]
            }
            
            r.node.level.from.expr.current[is.na(r.node.level.from.expr.current)] = 0
    
            r.node.level = weight * r.node.level.from.expr.current
            s.random[i] = plevelFun(r.node.level)

            ds.random[i, ] = quantile(r.node.level, c(1, 0.75, 0.5, 0))
        }
    }
    
    p.value = (sum(s.random >= s) + 1) / (iter + 1)
    
    res = list("score" = s,                                  # pathway score
               "score.distribution" = ds,                    # distribution of node value in the pathway
               "score.random" = s.random,                    # simulated pathway scores
               "score.distribution.random" = ds.random,      # distribution of node value in the pathway in each simulation
               "p.value" = p.value,                          # p value
               "centrality" = cen.name,                      # centrality name
               "weight" = weight,                            # weight for each node
               "node.level.t.value" = node.level.t.value,    # value for each node, exclude the centrality part
               "node.level" = node.level,                    # value for each node, exclude the centrality part
               "node.name" = node.name,                      # node names
               "pathway" = pathway,                          # pathway in igraph format
               "framework" = "gsa.univariate")                          
               
    class(res) = "cepa"
    
    return(invisible(res))

}

# calculate node.level.only.from.expr by expression of member genes
node.score = function(gene.level = NULL, gene.in.node = NULL, nlevelFun = NULL) {

    node.level.from.expr = numeric(length(gene.in.node))
    for(i in 1:length(gene.in.node)) {
        node.level.from.expr[i] = ifelse(length(gene.in.node[[i]]), nlevelFun(gene.level[names(gene.level) %in% gene.in.node[[i]]]), 0)
    }
    return(node.level.from.expr)
}

nodeLevelFun.tvalue = function(x1, x2, ...) {
    n1 = length(x1)
    n2 = length(x2)
    v1 = var(x1)
    v2 = var(x2)
    ifelse(v1 + v2 == 0, 0, (mean(x1) - mean(x2)) / sqrt(v1/n1 + v2/n2))
}
nodeLevelFun.tvalue_sq = function(x1, x2, ...) {
    nodeLevelFun.tvalue(x1, x2)^2
}
nodeLevelFun.tvalue_abs = function(x1, x2, ...) {
    abs(nodeLevelFun.tvalue(x1, x2))
}


pathwayLevelFun.max = function(x) {
    max(x, na.rm = TRUE)
}
pathwayLevelFun.min = function(x) {
    min(x, na.rm = TRUE)
}
pathwayLevelFun.median = function(x) {
    median(x, na.rm = TRUE)
}
pathwayLevelFun.sum = function(x) {
    sum(x, na.rm = TRUE)
}
pathwayLevelFun.mean = function(x) {
    mean(x, na.rm = TRUE)
}
pathwayLevelFun.rank = function(x) {
    wilcox.test(x, exact = FALSE)$statistic
}


# arguments are different according to different nlevel functions
find.nlevelFun = function(method) {
    if(is.character(method)) {
        f = switch(method,
                   tvalue     = nodeLevelFun.tvalue,
                   tvalue_sq  = nodeLevelFun.tvalue_sq,
                   tvalue_abs = nodeLevelFun.tvalue_abs,
                   stop("Default node-level functions in string format are only tvalue, tvalue_sq and tvalue_abs."))
    } else if ( is.function(method) ) {
        if(length(as.list(args(method))) != 3) {
            stop("Self-defined node-level function only can have two arguments.")
        }
        f = method
    }
    return(f)
}

find.plevelFun = function(method) {
    if(is.character(method)) {
        f = switch(method,
               max    = pathwayLevelFun.max,
               min    = pathwayLevelFun.min,
               median = pathwayLevelFun.median,
               sum    = pathwayLevelFun.sum,
               mean   = pathwayLevelFun.mean,
               rank   = pathwayLevelFun.rank,
               stop("Wrong pathway level method."))
    } else if ( is.function(method) ) {
        if(length(as.list(args(method))) != 2) {
            stop("Self-defined pathway-level function only can have one argument.")
        }
        f = method
    }
    return(f)
}