R/RWR_shortestpaths.R

Defines functions RWR_ShortestPaths extract_highest_scoring_path extract_node_from_row open_cytoscape save_results get_shortest_paths calculate_path_df create_individual_path_df

Documented in extract_highest_scoring_path RWR_ShortestPaths

########################################################################
# Find shortest paths between genes in gene sets. Given a single gene set,
#   find the shortest paths between the genes in that gene set. Given two
#   gene sets, find the shortest paths for pairs of genes between gene sets.
# - Input: Pre-computed multiplex network and one or two genesets
# - Output: Edge list table
# Copyright (C) 2022  David Kainer
#
# This file is part of RWRtoolkit.
#
# RWRtoolkit is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# RWRtoolkit is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# RWRtoolkit. If not, see <https://www.gnu.org/licenses/>.
########################################################################

#' @importFrom dplyr %>%
#' @importFrom foreach %dopar%
#' @importFrom foreach %:%

########################################################################
# Internal Functions
########################################################################
create_individual_path_df <- function(nw_merged, vpath, g, h){
  sg_df <- igraph::as_data_frame(
      igraph::induced_subgraph(
        nw_merged,
        vids = vpath,
        impl = "create_from_scratch"
      )
    )


    sg_df$pathname <- paste(
      g,
      h,
      sep = "_"
    )

    sg_df$pathlength <- length(vpath)

    path_string <- paste(
      lapply(
        vpath,
        function(x) igraph::V(nw_merged)$name[x]
      ),
      collapse = "->"
    )

    sg_df$pathelements <- path_string
    sg_df

}


calculate_path_df <- function(nw_merged, sp, g, h){
  path_dfs <- NULL
  for (path in 1:length(sp$vpaths)){
    path_data <- sp$vpaths[[path]]
    single_path_df <- create_individual_path_df(nw_merged, path_data, g, h)
    path_dfs <- rbind(path_dfs, single_path_df)
  }
  path_dfs
}





get_shortest_paths <- function(nw_merged,
                               source_geneset,
                               target_geneset,
                               threads = 1) {
  # For each gene in source_geneset, get the
  # shortest path to each gene in target_geneset.

  targets <- which(igraph::V(nw_merged)$name %in% target_geneset$gene)
  wt <- 1 - igraph::E(nw_merged)$weightnorm

  message(sprintf(
    "Calculating Shortest paths for %d x %d = %d gene pairs\n",
    nrow(source_geneset),
    length(targets),
    nrow(source_geneset) * length(targets)
  ))

  print(nw_merged)
  doParallel::registerDoParallel(cores = threads)
  res <- foreach::foreach(g = source_geneset$gene, .combine = rbind) %:%
    foreach::foreach(t = targets, .combine = rbind) %dopar% {
      # Only get the shortest path if the start and end
      # nodes are not the same!
      h <- igraph::V(nw_merged)$name[t]
      if (h != g) {
        # print("G TO T")
        sp <- igraph::all_shortest_paths(
          nw_merged,
          from = g,
          to = h,
          weights = wt
        )
        calculate_path_df(nw_merged, sp, g, h)
      }
    }
  doParallel::stopImplicitCluster()

  message("Finished calculating Shortest paths.")
  return(res)
}

save_results <- function(rwr_result,
                         source_geneset = NULL,
                         target_geneset = NULL,
                         outdir = NULL,
                         out_path = NULL) {
  # First, generate the out_path.
  if (is.null(outdir) && is.null(out_path)) {
    warning("You must provide either `outdir` or `out_path`.")
  } else if (!is.null(out_path)) {
    out_path <- out_path
  } else {
    if (is.null(target_geneset)) {
      out_path <- get_file_path(
        "RWR-SPATHS_",
        source_geneset$setid[1],
        outdir = outdir
      )
    } else {
      out_path <- get_file_path(
        "RWR-SPATHS_",
        source_geneset$setid[1],
        target_geneset$setid[1],
        outdir = outdir
      )
    }
  }

  # If out_path is still NULL, there's nothing to do.
  if (!is.null(out_path)) {
    if (!dir.exists(dirname(out_path))) {
      dir.create(dirname(out_path), recursive = TRUE)
    }
    write_table(rwr_result, out_path)
    message(sprintf("Saved results to file:\n  %s\n", out_path))
  }
}

open_cytoscape <- function(res, source_geneset, target_geneset) {
  # Visualize in Cyto.
  ig <- igraph::graph_from_data_frame(res, directed = F)
  igraph::V(ig)$endpoint <- igraph::V(ig)$name %in%
    c(source_geneset$gene, target_geneset$gene)

  igraph::V(ig)$color <- ifelse(igraph::V(ig)$name %in% source_geneset$gene, "source",
                ifelse(igraph::V(ig)$name %in% target_geneset$gene, "target", "default"))
  
  RCy3::cytoscapePing()
  RCy3::createNetworkFromIgraph(ig, "RWR_ShortestPaths_Output")
  RCy3::setVisualStyle("RWRtoolkit_Comp_Athal_Style") 
}



extract_node_from_row <- function(path_df,
                                  node_name,
                                  known_path = NULL,
                                  is_penultimate = FALSE,
                                  ultimate = NULL) {
  rows <- path_df[(path_df$to == node_name & !path_df$from %in% known_path) |
    (path_df$from == node_name & !path_df$to %in% known_path), ]

  node_in_to_data <- if (is_penultimate) {
    node_name %in% rows$to && ultimate %in% rows$from
  } else {
    node_name %in% rows$to
  }

  from_column <- if (node_in_to_data) "to" else "from"
  from_node <- unique(rows[[from_column]])

  to_column <- if (node_in_to_data) "from" else "to"
  to_node <- unique(rows[[to_column]])

  to_node <- unlist(
    lapply(
      to_node,
      function(x) if (x %in% known_path) NULL else x
    )
  )

  from_node <- unlist(
    lapply(
      from_node,
      function(x) if (x %in% known_path) NULL else x
    )
  )

  # TODO: Refactor to be less convoluted.
  if (is_penultimate && !is.null(ultimate)) {
    # the penultimate node has nodes within the from and to as it

    if (ultimate %in% from_node) {
      from_node <- unlist(
        lapply(
          from_node,
          function(x) if (x == ultimate) x else NULL
        )
      )
    }

    if (ultimate %in% to_node) {
      to_node <- unlist(
        lapply(
          to_node,
          function(x) if (x == ultimate) x else NULL
        )
      )
    }
  }

  list(
    from_column = from_column,
    from = from_node,
    to_column = to_column,
    to = to_node
  )
}



########################################################################
# Main Function
########################################################################

#'  Extract Highest Scoring Path
#'
#' `extract_highest_scoring_path`  parses through the dataframe output of
#' RWR_ShortestPaths and extracts the highest weighted edge per layer of a
#' user defined path. The method with which these edges are extracted can
#' be either by the normalized weight (default) or by the non-normalized
#' weight. If the edges are unweighted, this will not yield any meaningful
#' results
#'
#' @param shortest_paths_df A dataframe denoting the shortest paths of
#'                          between a series of predefined genes. The output
#'                          of RWR_ShortestPaths
#' @param desired_path      A string denoting the desired path that the user
#'                          wishes to extract (must match pathname of an
#'                          existing path within the shorest_paths_df).
#' @param weight_type       A string determining the weight used to determine
#'                          the highest weighted path. Options:
#'                          "weightnorm" (Default)  uses normalized edges by
#'                                      edgeweight_i / N_vertices_in_layer
#'
#'                          "weight" uses edge weight alone.
#' @return Returns a data frame following the path with the highest edge weights
#' @examples
#'
#' # An example of Running `extract_highest_scoring_path`
#'
#' extdata.dir <- system.file("example_data", package = "RWRtoolkit")
#' multiplex_object_filepath <- paste(extdata.dir,
#'   "/string_interactions.Rdata",
#'   sep = ""
#' )
#' geneset1_filepath <- paste(extdata.dir, "/geneset1.tsv", sep = "")
#' geneset2_filepath <- paste(extdata.dir, "/geneset2.tsv", sep = "")
#' outdir <- "./rwr_shortestpath"
#'
#' rwr_shortest_path_output <- RWR_ShortestPaths(
#'   data = multiplex_object_filepath,
#'   source_geneset = geneset1_filepath,
#'   target_geneset = geneset2_filepath,
#'   write_to_file = TRUE,
#'   outdir = outdir
#' )
#'
#' optimal_path <- extract_highest_scoring_path(
#'   rwr_shortest_path_output,
#'   desired_path = "TPI1_PMM2"
#' )
#'
#' @export
extract_highest_scoring_path <- function(shortest_paths_df,
                                         desired_path,
                                         weight_type = "normalized") {
  weight_column <- if (weight_type == "weightnorm") "weightnorm" else "weight"
  path_rows_df <- shortest_paths_df[
    shortest_paths_df$pathname == desired_path,
  ]

  total_path <- unique(path_rows_df$pathelements)

  split_path <- unlist(stringr::str_split(total_path, "->"))

  known_path <- c()
  path_rows <- NULL

  for (node_idx in seq(1, length(split_path) - 1)) {
    is_penultimate <- if (node_idx == length(split_path) - 1) T else F
    ultimate <- if (is_penultimate) split_path[length(split_path)] else NULL

    node <- split_path[node_idx]
    node_information <- extract_node_from_row(
      path_rows_df,
      node,
      known_path,
      is_penultimate,
      ultimate
    )

    from_column <- node_information$from_column
    to_column <- node_information$to_column

    desired_rows <- path_rows_df[
      path_rows_df[[from_column]] == node_information$from &
        path_rows_df[[to_column]] == node_information$to,
    ]

    top_row <- desired_rows[which.max(desired_rows[[weight_column]]), ]

    top_row$from <- node_information$from
    top_row$to <- node_information$to

    known_path <- append(known_path, node)
    path_rows <- rbind(path_rows, top_row)
  }
  path_rows
}

#' RWR Shortest Paths
#'
#' `RWR_ShortestPaths` Find shortest paths between genes in the given gene
#' sets. These will be output as a table of edges. There are two ways to use
#' this:
#' 1) provide one geneset (g) and you will get all shortest paths between genes
#' in g.
#' 2) provide two genesets (g and p) and you will get the shortest paths
#' between genes in g and genes in p.
#'
#' @param data Path to the .Rdata file for your combo of
#' underlying functional networks. This file is produced by
#' RWR_make_MHobject.R
#' @param source_geneset Path to the gene set file. If given with
#' --target_geneset, find shortest paths between genes in --source_geneset and
#' --target_geneset. Otherwise, find shortest paths among genes in
#' --source_geneset only. It must have the following cols without heading:
#' {<}setid{>} {<}gene{>}.
#' @param target_geneset Path to the second geneset file. If it is not provided
#' then the shortest paths will be calculated between genes in source_geneset.
#' @param outdir Full path to the output file directory. Two output files will
#' be generated with different suffixes.  Default "."
#' @param out_path Specify the full path for output. Ignore --outdir and
#' --modname and use this instead.
#' @param threads Number of threads to use.
#' @param cyto Include this parameter if you wish to see the shortest paths in
#' Cytoscape. Cytoscape must already be running first!  If your geneset(s) are
#' large (e.g. 100+) then loading all the shortest paths fully into cytoscape
#' can take a while. Default FALSE
#' @param verbose Verbose mode.  Default FALSE
#' @param write_to_file Also write the results to a file.  Default FALSE
#' @return Returns a data frame.
#' @examples
#'
#' # An example of Running RWR_ShortestPaths
#'
#' extdata.dir <- system.file("example_data", package = "RWRtoolkit")
#' multiplex_object_filepath <- paste(extdata.dir,
#'   "/string_interactions.Rdata",
#'   sep = ""
#' )
#' geneset1_filepath <- paste(extdata.dir, "/geneset1.tsv", sep = "")
#' geneset2_filepath <- paste(extdata.dir, "/geneset2.tsv", sep = "")
#' outdir <- "./rwr_shortestpath"
#'
#' rwr_shortest_path_output <- RWR_ShortestPaths(
#'   data = multiplex_object_filepath,
#'   source_geneset = geneset1_filepath,
#'   target_geneset = geneset2_filepath,
#'   write_to_file = TRUE,
#'   outdir = outdir
#' )
#'
#' @export
RWR_ShortestPaths <- function( # nolint
                              data = NULL,
                              source_geneset = NULL,
                              target_geneset = NULL,
                              outdir = ".",
                              out_path = NULL,
                              threads = 1,
                              cyto = FALSE,
                              verbose = FALSE,
                              write_to_file = FALSE) {
  nw_mpo <- NULL
  # This is a saved version of the multiplex network and adj matrix.
  load(data)
  nw_mpo <- nw.mpo # nolint - loaded from filepath
  if (verbose) {
    message("Loaded data:")
    print(ls())
  }

  # If geneset is NULL, get shortest paths
  # from source_geneset to source_geneset.
  if (is.null(target_geneset)) {
    target_geneset <- source_geneset
  }
  # Gene set 1.
  message("Getting source gene set ...")
  source_geneset_plus_extras <- load_geneset(source_geneset, nw_mpo, verbose)
  source_genes <- source_geneset_plus_extras[[1]]
  message(head(source_genes))

  # Gene set 2.
  message("Getting target gene set ...")
  target_geneset_plus_extras <- load_geneset(target_geneset, nw_mpo, verbose)
  target_genes <- target_geneset_plus_extras[[1]]
  message(head(target_genes))

  nw_merged <- merged_with_all_edges(nw_mpo)$merged_network

  res <- get_shortest_paths(nw_merged, source_genes, target_genes, threads)

  if (write_to_file) {
    save_results(
      res,
      source_geneset = source_genes,
      target_geneset = target_genes,
      outdir = outdir,
      out_path = out_path
    )
  }

  if (cyto) {
    open_cytoscape(res, source_genes, target_genes)
  }

  return(res)
}
dkainer/RWRtoolkit documentation built on Jan. 11, 2025, 3:26 a.m.