R/pathRank.R

Defines functions processNetwork samplePaths rankPvalue rankShortestPaths pathRanker getPaths getPathsAsEIDs extractPathNetwork

Documented in extractPathNetwork getPathsAsEIDs pathRanker samplePaths

###############################################################################
#
# pathRank.R:     This file contains functions to rank paths provided with a w
# weigthed igraph object.
#
# author: Ahmed Mohamed <mohamed@kuicr.kyoto-u.ac.jp>
#
# This is released under GPL-2.
#
# Documentation was created using roxygen
#
###############################################################################

#' Creates a subnetwork from a ranked path list
#'
#' Creates a subnetwork from a ranked path list generated by \code{\link{pathRanker}}.
#'
#' @param paths The paths extracted by \code{\link{pathRanker}}.
#' @param graph A annotated igraph object.
#'
#' @return A subnetwork from all paths provided. If paths are computed for several
#' labels (sample categories), a subnetwork is returned for each label.
#'
#' @author Ahmed Mohamed
#' @family Path ranking methods
#' @export
#' @examples
#' 	## Prepare a weighted reaction network.
#' 	## Conver a metabolic network to a reaction network.
#'  data(ex_sbml) # bipartite metabolic network of Carbohydrate metabolism.
#'  rgraph <- makeReactionNetwork(ex_sbml, simplify=TRUE)
#'
#' 	## Assign edge weights based on Affymetrix attributes and microarray dataset.
#'  # Calculate Pearson's correlation.
#' 	data(ex_microarray)	# Part of ALL dataset.
#' 	rgraph <- assignEdgeWeights(microarray = ex_microarray, graph = rgraph,
#' 		weight.method = "cor", use.attr="miriam.uniprot",
#' 		y=factor(colnames(ex_microarray)), bootstrap = FALSE)
#'
#' 	## Get ranked paths using probabilistic shortest paths.
#'  ranked.p <- pathRanker(rgraph, method="prob.shortest.path",
#' 					K=20, minPathSize=6)
#'
#' 	## Get the subnetwork of paths in reaction graph.
#' 	reaction.sub <- getPathsAsEIDs(ranked.p, rgraph)
#'
#' 	## Get the subnetwork of paths in the original metabolic graph.
#' 	metabolic.sub <- getPathsAsEIDs(ranked.p, ex_sbml)
#'
extractPathNetwork <- function(paths, graph){
    p.eids <- getPathsAsEIDs(paths, graph)
    if(length(paths$y.labels)>1){
        graph.ls <- lapply(p.eids, function(x)
                    subgraph.edges(graph, as.integer(unlist(x))) )
        names(graph.ls) <- paths$y.labels
        return(graph.ls)
    }else
        return(subgraph.edges(graph,as.integer(unlist(p.eids))))
}

#' Convert a ranked path list to edge ids of a graph
#'
#' Convert a ranked path list to Edge ids of a graph, where paths can come from
#' a different representation (for example matching path from a reaction network to
#' edges on a metabolic network).
#'
#' @param paths The paths extracted by \code{\link{pathRanker}}.
#' @param graph A annotated igraph object.
#'
#' @return A list of edge ids on the provided graph.
#'
#' @author Ahmed Mohamed
#' @family Path ranking methods
#' @export
#' @examples
#' 	## Prepare a weighted reaction network.
#' 	## Conver a metabolic network to a reaction network.
#'  data(ex_sbml) # bipartite metabolic network of Carbohydrate metabolism.
#'  rgraph <- makeReactionNetwork(ex_sbml, simplify=TRUE)
#'
#' 	## Assign edge weights based on Affymetrix attributes and microarray dataset.
#'  # Calculate Pearson's correlation.
#' 	data(ex_microarray)	# Part of ALL dataset.
#' 	rgraph <- assignEdgeWeights(microarray = ex_microarray, graph = rgraph,
#' 		weight.method = "cor", use.attr="miriam.uniprot",
#' 		y=factor(colnames(ex_microarray)), bootstrap = FALSE)
#'
#' 	## Get ranked paths using probabilistic shortest paths.
#'  ranked.p <- pathRanker(rgraph, method="prob.shortest.path",
#' 					K=20, minPathSize=6)
#'
#' 	## Get the edge ids along paths in the reaction graph.
#' 	path.eids <- getPathsAsEIDs(ranked.p, rgraph)
#'
#' 	## Get the edge ids along paths in the original metabolic graph.
#' 	path.eids <- getPathsAsEIDs(ranked.p, ex_sbml)
#'
getPathsAsEIDs <- function(paths, graph){
    if(length(paths$y.labels)>1){
        eids <- lapply(paths$paths, getPaths, graph, paths$source.net)
        names(eids) = paths$y.labels

    }else{
        eids <- getPaths(paths$paths, graph, paths$source.net)
    }
    return(eids)
}


getPaths <- function(paths, graph, source.net){
    if(length(paths)==0) return(list())

    # graph source unknown
    if(is.null(graph$type) || is.null(source.net))
        return(lapply(paths, function(x) E(graph,path=x$genes)))

    # (G.graph,G.graph) or (R.graph, R.graph)
    if(graph$type == source.net)
        return(lapply(paths, function(x) E(graph,path=x$genes)))

    # (G.graph, R.graph)
    if(graph$type=="G.graph" && source.net=="R.graph"){
        gene.nodes <- lapply(paths, function(x) V(graph)[sub("^(.*)##", "",V(graph)$name) %in% x$genes])
        eid <- lapply(gene.nodes,function(x) E(graph)[x %--% x])

        return(eid)
    }

    # (R.graph, G.graph) or (MR.graph, G.graph) else (MR.graph, R.graph)
    if(source.net=="G.graph"){
        gene.nodes = lapply(paths, function(x) sub("^(.*)##", "",x$genes))
    }else{
        gene.nodes = lapply(paths, "[[", "genes")
    }

    # (R.graph, G.graph)
    if(graph$type=="R.graph")
        return(lapply(gene.nodes, function(x) E(graph, path=x)))

    # It must be MR.graph
    eid <- list()
    for(i in 1:length(paths)){
        deleted.edges <- unlist(lapply(lapply(grep("->",paths[[i]]$compounds),
                                function(x) unlist(strsplit(paths[[i]]$compounds[[x]], "->"))),
                        function(y) mapply(
                                    function(from,to)get.shortest.paths(graph, from, to, mode="out", output="epath"),
                                    head(y,-1), y[-1])
                ))

        paths[[i]]$compounds <- unlist(strsplit(paths[[i]]$compounds, "(->)|( \\+ )"))
        paths[[i]]$genes <- gene.nodes[[i]]
        eid[[i]] = c(as.numeric(deleted.edges),
                    E(graph)[.to(paths[[i]]$genes[1]) |
                        paths[[i]]$genes %--% paths[[i]]$compounds|
                        .from(tail(paths[[i]]$genes,1)) ]
                    )
    }
    return(eid)


}

# TODO: pvalue description
#' Extracting and ranking paths from a network
#'
#' Given a weighted igraph object, path ranking finds a set of node/edge sequences (paths) to
#' maximize the sum of edge weights.
#' \code{pathRanker(method="prob.shortest.path")} extracts the K most probable paths within
#' a weighted network.
#' \code{pathRanker(method="pvalue")} extracts a list of paths whose sum of edge weights are
#' significantly higher than random paths of the same length.
#'
#'
#' The input here is \code{graph}. A weight must be assigned to each edge. Bootstrapped Pearson correlation edge weights
#' can be assigned to each edge by \code{\link{assignEdgeWeights}}. However the specification of the edge weight is flexible
#' with the condition that increasing values indicate stronger relationships between vertices.
#'
#' \subsection{Probabilistic Shortest Paths}{
#' \code{pathRanker(method="prob.shortest.path")} finds the K most probable loopless paths given a weighted network.
#' Before the paths are ranked the edge weights are converted into probabilistic edge weights using the Empirical
#' Cumulative Distribution (ECDF) over all edge weights.  This is called ECDF edge weight. The ECDF edge weight
#' serves as a probabilistic rank of the most important gene-gene interactions.  The probabilistic nature of the ECDF
#' edge weights allow for a significance test to determine if a path contains any functional structure or is
#' simply a random walk.  The probability of a path is simily the product of all ECDF weights along the path.
#' This is computed as a sum of the logs of the ECDF edge weights.
#'
#' The follwing arguments can be passed to \code{pathRanker(method="prob.shortest.path")}:
#' \describe{
#' \item{\code{K}}{Maximum number of paths to extract. Defaults to 10.}
#' \item{\code{minPathSize}}{The minimum number of edges for each extracted path. Defualts to 1.}
#' \item{\code{normalize}}{Specify if you want to normalize the probabilistic edge weights (across different labels)
#' before extracting the paths. Defaults to TRUE.}
#' }
#' }
#'
#' \subsection{P-value method}{
#' \code{pathRanker(method="pvalue")} searches all paths between the specified start and end vertices, and if a
#' significant path is found it returns it. However, It doesn't search for the best path between the start and
#' terminal vertices,  as there could be many paths which lead to the same terminal vertex, and searching through
#' all of them is time comsuming.  We just stop when the first significant path is found.
#'
#' All provided edge weights are recaled from 0-1. Path significance is calculated based on the empirical distribution
#' of random paths of the same length. This can be estimated using \code{\link{samplePaths}} and passed as an argument.
#'
#' The follwing arguments can be passed to \code{pathRanker(method="pvalue")}:
#' \describe{
#' \item{\code{sampledpaths}}{The emripical results from \code{\link{samplePaths}}.}
#' \item{\code{alpha}}{The P value cut-off. Defualts to 0.01}
#' }
#' }
#' @param graph A weighted igraph object. Weights must be in \code{edge.weights} or \code{weight}
#' edge attributes.
#' @param method Which path ranking method to use.
#' @param start A list of start vertices, given by their vertex id.
#' @param end  A list of terminal vertices, given by their vertex id.
#' @param verbose Whether to display the progress of the function.
#' @param ... Method-specific parameters. See Details section.
#'
#' @return
#' A list of paths where each path has the following items:
#' \item{gene}{The ordered sequence of genes visited along the path.}
#' \item{compounds}{The ordered sequence of compounds visited along the path.}
#' \item{weights}{The ordered sequence of the log(ECDF edge weights) along the path.}
#' \item{distance}{The sum of the log(ECDF edge weights) along each path.  (a sum of logs is a product)}
#'
#'
#' @author Timothy Hancock, Ichigaku Takigawa, Nicolas Wicker and Ahmed Mohamed
#' @family Path ranking methods
#' @seealso getPathsAsEIDs, extractPathNetwork
#' @export
#' @examples
#' 	## Prepare a weighted reaction network.
#' 	## Conver a metabolic network to a reaction network.
#'  data(ex_sbml) # bipartite metabolic network of Carbohydrate metabolism.
#'  rgraph <- makeReactionNetwork(ex_sbml, simplify=TRUE)
#'
#' 	## Assign edge weights based on Affymetrix attributes and microarray dataset.
#'  # Calculate Pearson's correlation.
#' 	data(ex_microarray)	# Part of ALL dataset.
#' 	rgraph <- assignEdgeWeights(microarray = ex_microarray, graph = rgraph,
#' 		weight.method = "cor", use.attr="miriam.uniprot",
#' 		y=factor(colnames(ex_microarray)), bootstrap = FALSE)
#'
#' 	## Get ranked paths using probabilistic shortest paths.
#'  ranked.p <- pathRanker(rgraph, method="prob.shortest.path",
#' 					K=20, minPathSize=6)
#'
#' 	## Get significantly correlated paths using "p-valvue" method.
#' 	##   First, establish path score distribution by calling "samplePaths"
#'  pathsample <- samplePaths(rgraph, max.path.length=10,
#'                         num.samples=100, num.warmup=10)
#'
#' 	##   Get all significant paths with p<0.1
#' 	significant.p <- pathRanker(rgraph, method = "pvalue",
#'                 sampledpaths = pathsample ,alpha=0.1)
#'
pathRanker <- function(graph, method=c("prob.shortest.path", "pvalue") ,start, end, verbose=TRUE, ...){
    # Checking the graph
    if(is.null(E(graph)$edge.weights)){
        if(!is.null(E(graph)$weight))
            E(graph)$edge.weights <- E(graph)$weight
        else{
            stop("No edge weights provided.")
        }
    }

    if(length(method)>1 || !method %in% c("prob.shortest.path", "pvalue"))
        stop("Please specify the ranking method 'prob.shortest.path' or 'pvalue'.")

    if(method == "prob.shortest.path")
        return(rankShortestPaths(graph, start=start, end=end, verbose=verbose, ...))
    else return(rankPvalue(graph, start=start, end=end, verbose=verbose, ...))

}

rankShortestPaths <- function(graph, K=10, minPathSize=1, start, end, normalize = TRUE, verbose) {
    pg = processNetwork(graph, start, end, scale="ecdf" ,normalize)

    zret <- list()
    for (i in 1:ncol(pg$weights)) {
        if(verbose){
            if (ncol(pg$weights) > 1) message("Extracting the ",K," most probable paths for ",graph$y.labels[i])
            else message("Extracting the ",K," most probable paths.")
        }

        # min path size is increased by 2 to include start and end compounds.
        ps <- .Call("pathranker",
                pg$nodes,     #nodes
                pg$edges,                #Edges
                pg$weights[,i],            #Edge weights
                K=K,minpathsize = minPathSize+2)    #add 2 nodes to min path length ("s" & "t")



        idx <- which(sapply(ps,is.null))
        if (length(idx)>0) ps <- ps[-idx]
        if(length(ps)==0){
            if(ncol(pg$weights) > 1)
                message("  Warning:Counldn't find paths matching the criteria for ",graph$y.labels[i])
            else message("  Warning:Counldn't find paths matching the criteria.")
        }

        ps <- ps[order(sapply(ps,"[[", "distance"))] #order paths by distance

        if (ncol(pg$weights) > 1){
            zret[[i]] <- ps
        }else zret <- ps
    }

    if(length(zret)==0)return(NULL)

    if (ncol(pg$weights) > 1) {
        names(zret) <- graph$y.labels
        colnames(pg$weights) <- paste("prob",graph$y.labels,sep = ":")
    } else colnames(pg$weights) <- "prob"

    return(list(paths = zret,edge.weights = pg$weights, y.labels=graph$y.labels, source.net=graph$type))
}

rankPvalue <- function(graph, sampledpaths, start, end, alpha = 0.01, verbose) {
    # Checking the graph
    if(is.null(E(graph)$edge.weights)){
        if(!is.null(E(graph)$weight))
            E(graph)$edge.weights <- E(graph)$weight
        else{
            stop("No edge weights provided.")
        }
    }

    range <- ifelse(missing(sampledpaths), ecount(graph),
                    if(length(graph$y.labels)==1)nrow(sampledpaths)-1 else nrow(sampledpaths[[1]])-1)

    # Defining start reactions end their scopes.
    if(missing(start)){
        if(verbose) message("Start nodes weren't provided. Using entry nodes as start points.")
        start <- V(graph)[degree(graph,mode="in")==0]$name
    }
    if(missing(end)){
        end <- V(graph)[ unique(unlist(neighborhood(graph, order=range, nodes=start, mode="out"))) ]
    }

    pg = processNetwork(graph, start, end, scale="rescale", normalize=FALSE)

    zret=NULL
    for (i in 1:ncol(pg$weights)) {
        if(verbose){
            if (ncol(pg$weights) > 1) message("Extracting non-random path p<",alpha," for ",graph$y.labels[i])
            else message("Extracting non-random path p<",alpha)
        }

        p.sample <- if(missing(sampledpaths)) NULL
                    else if(ncol(pg$weights) > 1) t(sampledpaths[[i]])
                    else t(sampledpaths)

        ps <- .Call("scope",
                pg$nodes,
                pg$edges,
                pg$weights[,i],    #add column subscript
                SAMPLEDPATHS = p.sample,
                ALPHA = alpha,
                ECHO = as.integer(verbose))

        idx <- which(sapply(ps$paths,is.null))
        ps$paths <- ps$paths[-idx]

        if(length(ps)==0) stop("couldn't find any significant paths")

        ps$paths <- ps$paths[order(sapply(ps$paths,"[[", "pvalue"))] #order paths by pvalue

        if (ncol(pg$weights) > 1) zret[[i]] <- ps
        else zret <- ps
    }
    if (ncol(pg$weights) > 1)
        names(zret) <- graph$y.labels

    zret$y.labels=graph$y.labels
    zret$source.net = graph$type

    return(zret)
}

#' Creates a set of sample path p-values for each length given a weighted network
#'
#' Randomly traverses paths of increasing lengths within a set network to create an
#' empirical pathway distribution for more accurate determination of path significance.
#'
#' Can take a bit of time.
#'
#' @param graph A weighted igraph object. Weights must be in \code{edge.weights} or \code{weight}
#' edge attributes.
#' @param max.path.length The maxmimum path length.
#' @param num.samples The numner of paths to sample
#' @param num.warmup The number of warm up paths to sample.
#' @param verbose Whether to display the progress of the function.
#'
#' @return A matrix where each row is a path length and each column is the number of paths sampled.
#'
#' @author Timothy Hancock
#' @author Ahmed Mohamed
#' @family Path ranking methods
#' @export
#' @examples
#' 	## Prepare a weighted reaction network.
#' 	## Conver a metabolic network to a reaction network.
#'  data(ex_sbml) # bipartite metabolic network of Carbohydrate metabolism.
#'  rgraph <- makeReactionNetwork(ex_sbml, simplify=TRUE)
#'
#' 	## Assign edge weights based on Affymetrix attributes and microarray dataset.
#'  # Calculate Pearson's correlation.
#' 	data(ex_microarray)	# Part of ALL dataset.
#' 	rgraph <- assignEdgeWeights(microarray = ex_microarray, graph = rgraph,
#' 		weight.method = "cor", use.attr="miriam.uniprot",
#' 		y=factor(colnames(ex_microarray)), bootstrap = FALSE)
#'
#' 	## Get significantly correlated paths using "p-valvue" method.
#' 	##   First, establish path score distribution by calling "samplePaths"
#'  pathsample <- samplePaths(rgraph, max.path.length=10,
#'                         num.samples=100, num.warmup=10)
#'
#' 	##   Get all significant paths with p<0.1
#' 	significant.p <- pathRanker(rgraph, method = "pvalue",
#'                 sampledpaths = pathsample ,alpha=0.1)
#'
samplePaths <- function(graph, max.path.length, num.samples=1000, num.warmup=10, verbose=TRUE){
    # Checking the graph
    if(is.null(E(graph)$edge.weights)){
        if(!is.null(E(graph)$weight))
            E(graph)$edge.weights <- E(graph)$weight
        else{
            stop("No edge weights provided.")
        }
    }

    if (max.path.length > vcount(graph)-1) {
        if(verbose) message("Maximum Path Length is larger than Number of Network Vertices ... setting them to be equal.")
        max.path.length <-  vcount(graph)-1
    }

    # Get edge.weights and apply ecdf on each column
    edge.weights = do.call("rbind", E(graph)$edge.weights)
    edge.probs <- apply(edge.weights, 2, rescale, c(1,0))


    # Prepare edgelist as required by PathRanker method.
    label = lapply(E(graph)$attr, paste, collapse=' + ')    # For edges annotated with >1 compound, concatenate.
    edgelist = get.edgelist(graph, names=FALSE)
    edgelist = data.frame(from=edgelist[,1], to=edgelist[,2], label=unlist(label), stringsAsFactors=FALSE)

    pg = list(nodes=V(graph)$name, edges=edgelist, weights=edge.probs)

    if(ncol(pg$weights) == 1){
        ps <- .Call("samplepaths",
                pg$nodes,
                pg$edges,
                pg$weights,
                MAXPATHLENGTH = as.integer(max.path.length),
                SAMPLEPATHS = as.integer(num.samples),
                WARMUPSTEPS = as.integer(num.warmup)
              )

        return( matrix(ps,max.path.length+1,num.samples,byrow = TRUE) )
    }else{
        ps <- list()
        for (i in 1:ncol(pg$weights)) {
            if(verbose) message("Sampling",num.samples," for",graph$y.labels[i],"\n")
            p.samplei <- .Call("samplepaths",
                    pg$nodes,
                    pg$edges,
                    pg$weights,
                    MAXPATHLENGTH = as.integer(max.path.length),
                    SAMPLEPATHS = as.integer(num.samples),
                    WARMUPSTEPS = as.integer(num.warmup)
            )
            ps[[i]] <- matrix(p.samplei, max.path.length+1, num.samples, byrow = TRUE)
        }
        names(ps) <- graph$y.labels
        return(ps)
    }
}

processNetwork <- function(graph, start, end, scale=c("ecdf", "rescale"), normalize){
    # Add S, T vetrices for the shortest path algorithm.
    snodes <- if(missing(start)) V(graph)[degree(graph,mode="in")==0]$name else V(graph)[start]$name
    enodes <- if(missing(end)) V(graph)[degree(graph,mode="out")==0]$name else V(graph)[end]$name
    enodes <- enodes[!enodes %in% snodes]

    graph <- graph + vertices("s", "t")
    graph <- add.edges(graph,
            c(rbind("s",snodes),
              rbind(enodes,"t")),
            attr=list(edge.weights=list(rep(1, length(unlist(graph$y.labels)) )),
                    compound=""))

    # Get edge.weights and apply ecdf on each column
    edge.weights <- do.call("rbind", as.list(E(graph)$edge.weights))

    if(sum(!is.finite(edge.weights))>0){
        warning("Edge weights contain non-finite numbers. Setting them to the minimum edge weight")

        edge.weights <- apply(edge.weights, 2,
                            function(x){
                                x[ !is.finite(x) ] <- min( x[ is.finite(x) ] )
                                return(x)
                            })
    }
    if(scale=="ecdf")
        edge.probs <- apply(edge.weights, 2, function(x) ecdf(x[1:ecount(graph)])(x))
    else edge.probs <- apply(edge.weights, 2, rescale, c(1,0))


    # Normalize across Y labels
    if (ncol(edge.probs) > 1 & normalize == TRUE) {
        edge.probs <- edge.probs / rowSums(edge.probs)
    }

    if(scale=="ecdf")
        edge.probs <- -log(edge.probs)

    # Prepare edgelist as required by PathRanker method.
    label = lapply(E(graph)$compound, paste, collapse=' + ')    # For edges annotated with >1 compound, concatenate.
    edgelist = get.edgelist(graph, names=FALSE)
    edgelist = data.frame(from=edgelist[,1], to=edgelist[,2], label=unlist(label), stringsAsFactors=FALSE)


    return(list(nodes=V(graph)$name, edges=edgelist, weights=edge.probs))
}

#Adapted from scales package
rescale <- function (x, to = c(0, 1), from = range(x, na.rm = TRUE)){
    (x - from[1])/diff(from) * diff(to) + to[1]
}
ahmohamed/NetPathMiner documentation built on May 15, 2023, 12:53 p.m.