###############################################################################
# Perform RWR from one geneset and returns either (1) ranking of the rest of
# the genes in the network or (2) ranking of genes in a provided second geneset.
# - Input: Pre-computed multiplex network and at least one geneset of seeds.
# - Output: Table with the ranking of genes from (1) network or (2) second
# geneset
# Copyright (C) 2022 David Kainer
#
# This file is part of RWRtoolkit.
#
# RWRtoolkit is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# RWRtoolkit is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# RWRtoolkit. If not, see <https://www.gnu.org/licenses/>.
###############################################################################
#' @importFrom dplyr %>%
########################################################################
# Internal Functions
########################################################################
#' @keywords internal
calc_metrics_loe <- function(res, seed_geneset, query_geneset) {
# PR example: 5 genes left out need to be detected from 10 rankings
# ranking: 1 2 3 4 5 6 7 8 9 10
# truth: 1 0 1 0 0 1 0 0 1 1
# recall: 0.2 0.2 0.4 0.4 0.4 0.6 0.6 0.6 0.8 1.0
# Prec: 1 0.5 0.67 0.5 0.4 0.5 0.43 0.38 0.44 0.5
# Avg Prec: 1 + 0 + 0.67 + 0 +0 +0.5 + 0 + 0 + 0.44 +0.5) / 5 = 0.62
# if there are genes in geneset1 that are also in geneset2...
# then manually append them to the top of RWR results with the best
# possible rank. This is because RWR_LOE does not provide ranks to
# genes in the seed geneset, but in this scenario we need them to have
# a top rank because technically they represent exact overlap between
# geneset1 and geneset2
dupes <- dplyr::slice(
query_geneset,
which(query_geneset$gene %in% seed_geneset$gene)
)
if (nrow(dupes) > 0) {
warning(sprintf("WARNING: %i genes in the query geneset are also found in the seed geneset. They will be removed from the query geneset before running eval\n", length(dupes)), appendLF = T) # nolint
res <- rbind(
data.frame(
NodeNames = dupes$gene,
Score = 1,
rank = 1,
num_in_network = dplyr::first(res$num_in_network),
num_seeds = dplyr::first(res$num_seeds),
networks = dplyr::first(res$networks),
modname = dplyr::first(res$modname),
seed_geneset = dplyr::first(res$seed_geneset),
query_geneset = dplyr::first(res$query_geneset),
InQueryGeneset = 1
),
res
)
}
res <- calc_ROCPRC(
df = res,
scorescol = "rank",
labelscol = "InQueryGeneset"
)
filtervalue <- dplyr::filter(res, rank == nrow(!!query_geneset))
outputvalue <- filtervalue %>% dplyr::pull(PREC) # nolint PREC is column name
# 1. precision @ size(query_geneset) (aka R-PREC)
output <- data.frame(
value = ifelse(length(outputvalue) > 0,
outputvalue,
NA
),
measure = "P@SizeQueryGeneSet"
)
# 2. Avg PRC (uses Average Precision, not interpolated precision)
output <- rbind(
output,
dplyr::filter(res, TP == 1) %>%
dplyr::summarise(
value = sum(PREC) / sum(InQueryGeneset),
measure = "AvgPrec"
)
)
# 3. AUPRC
output <- rbind(
output,
dplyr::summarise(
res,
value = area_under_curve(
REC,
PREC,
method = "trapezoid",
ties = "max"
),
measure = "AUPRC"
)
)
# 4. AUROC
output <- rbind(
output,
dplyr::summarise(
res,
value = sum(REC) / dplyr::n(), measure = "AUROC"
)
)
# # 5. NDCG
# output <- rbind(
# output,
# dplyr::filter(
# res, rank == nrow(!!query_geneset)
# ) %>%
# dplyr::summarise(
# value = ndcg,
# measure = "NDCG@SizeQueryGeneSet"
# )
# )
# output <- rbind(
# output,
# dplyr::summarise(
# res,
# value = area_under_curve(
# REC,
# ndcg,
# method = "trapezoid",
# ties = "max"
# ),
# measure = "AUNDCG"
# )
# )
return(list(summary = output, results = res))
}
#' @keywords internal
view_top_network_loe <- function(results,
nw_mpo,
seed_geneset,
query_geneset = NULL,
ntop = 199,
cyto = 1,
modname = "") {
if (cyto) {
#1. compress multiplex to aggregate net
merged_mp <- merged_with_all_edges(nw_mpo)$merged_network
#2. extract "ntop" results from "results" as subnet
top_rows <- results$RWRM_Results[results$RWRM_Results$rank <= ntop, ]
top_ranks <- top_rows$NodeNames
seed_genes <- results$Seed_Nodes
subgraph_nodes <- c(top_ranks, seed_genes)
topresults <- igraph::subgraph(merged_mp, subgraph_nodes)
igraph::V(topresults)$label.cex <- 0.6 # nolint: igraph method
if (!is.null(query_geneset)) {
type_data <- dplyr::case_when(
igraph::V(topresults)$name %in% seed_geneset$gene ~ "InSet1",
igraph::V(topresults)$name %in% query_geneset$gene ~ "InSet2",
TRUE ~ "Other"
)
igraph::V(topresults)$type <- type_data
} else {
type_data <- dplyr::case_when(
igraph::V(topresults)$name %in% seed_geneset$gene ~ "InSet1",
TRUE ~ "Other"
)
igraph::V(topresults)$type <- type_data
}
igraph::V(topresults)$Rank <- NA # nolint: igraph methods
tmp <- match(igraph::V(topresults)$name, results$RWRM_Results$NodeNames)
igraph::V(topresults)$Rank <- results$RWRM_Results$rank[tmp] # nolint: igraph methods
igraph::V(topresults)$color <- ifelse(igraph::V(topresults)$name %in% seed_geneset$gene, "source",
ifelse(igraph::V(topresults)$name %in% query_geneset$gene, "target", "default"))
RCy3::cytoscapePing()
RCy3::createNetworkFromIgraph(
topresults,
title = paste0("RWR_LOE Seeds with Top ", ntop),
collection = modname
)
RCy3::setVisualStyle("RWRtoolkit_Comp_Athal_Style")
}
}
#' @keywords internal
save_plots_loe <- function(metrics,
seed_geneset,
query_geneset,
outdir,
modname) {
# this uses custom scoring and avoids the use of Precrec library
if (is.null(outdir)) {
message("You must provide a path to --outdir to save the metrics and plots.") # nolint message
return(1)
}
if (!dir.exists(outdir)) {
dir.create(outdir, recursive = TRUE)
}
p1 <- ggplot2::ggplot(metrics$results) +
ggplot2::geom_path(
ggplot2::aes(x = REC, y = PREC), #nolint column names
col = "darkorange",
alpha = 0.5
) +
ggplot2::ylim(c(0, 1)) +
ggplot2::xlab("Recall") +
ggplot2::ylab("Precision")
### ROC
# plot ROC curve including all folds and average
p2 <- ggplot2::ggplot(metrics$results) +
ggplot2::geom_line(
ggplot2::aes(x = FPR, y = REC), #nolint column names
col = "darkorange",
alpha = 1
) +
ggplot2::geom_abline(ggplot2::aes(intercept = 0, slope = 1),
linetype = "dashed",
col = "darkgrey"
) +
ggplot2::xlab("FPR") +
ggplot2::ylab("TPR")
# plot ranking distribution of hits in bins of 100 for each fold
p4 <- ggplot2::ggplot(
metrics$results %>% dplyr::filter(InQueryGeneset == 1)
) +
ggplot2::geom_histogram(
ggplot2::aes(x = rank),
alpha = 1,
binwidth = 100
)
out_path <- get_file_path(seed_geneset$setid[1],
query_geneset$setid[1],
modname,
outdir = outdir,
ext = "metrics.png"
)
png(out_path, width = 1200, height = 1000)
grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 2)))
print(p1, vp = vplayout(1, 1)) # Top left
print(p2, vp = vplayout(1, 2)) # Top right
print(p4, vp = vplayout(2, 1:2)) # Bottom Right
dev.off()
message(paste("File saved:", out_path))
}
########################################################################
# Main Function
########################################################################
#' RWR LOE
#'
#' `RWR_LOE` perform RWR lines-of-evidence. RWR_LOE has two possible functions.
#' Given one geneset of seeds, rankings for all other genes in the network will
#' be returned. Given a second geneset of genes to be queried, ranking for just
#' the genes in that geneset will be returned. This can be used to build
#' multiple lines of evidence from the various input networks to relate the two
#' gene sets.
#'
#' @param data The path to the Rdata object generated from RWR_make_multiplex
#' @param seed_geneset The path to the gene set file. It must have the following
#' first two columns with no headers tab-delimited:
#' {<}setid{>} {<}gene{>} {<}weight{>}
#' @param query_geneset The path to the optional second gene set file. It must
#' have the following first two columns with no headers
#' tab-delimited:
#' {<}setid{>} {<}gene{>} {<}weight{>}
#' @param restart Sets the restart parameter [0,1). Higher value means the
#' walker will jump back to a seed node more often.
#' Defaulted to 0.7
#' @param tau Comma-separated list of values between that MUST add up to the
#' number of network layers in the .Rdata file. One value per network
#' layer that determines the probability that the random walker will
#' restart in that layer. e.g. if there are three layers (A,B,C) in
#' your multiplex network, then --tau '0.2,1.3,1.5' will mean that
#' layer A is less likely to be walked on after a restart than layers
#' B or C.
#' Default 1.0
#' @param outdir Full path to the output file directory
#' @param numranked Proprtion of ranked genes to return \[0,1\]. e.g. 0.1 will
#' return the top 10%. Default 1.0
#' @param plot Output PNG plot of ROC and PRC of the query geneset with respect the
#' seed geneset.
#' @param modname String to include in output file name. Default "default"
#' @param cyto Specify a number N > 0 if you wish to see a network of the seeds
#' and top N ranked genes (cytoscape must already be running)
#' @param verbose Verbose mode (Default False)
#' @return Returns a list of:
#' (1) a dataframe of results (scores for genes) and
#' (2) the seed genes.
#' @examples
#' # An example of a default RWR LOE with one geneset which will
#' # return a ranked list
#'
#' extdata.dir <- system.file("example_data", package = "RWRtoolkit")
#' multiplex_object_filepath <- paste(
#' extdata.dir,
#' "/string_interactions.Rdata",
#' sep = ""
#' )
#' geneset1_filepath <- paste(extdata.dir, "/geneset1.tsv", sep = "")
#' geneset2_filepath <- paste(extdata.dir, "/geneset2.tsv", sep = "")
#' outdir <- "./rwr_loe"
#'
#' # for all other genes in the network
#' loe_gene_seed_list <- RWR_LOE(
#' data = multiplex_object_filepath,
#' seed_geneset = geneset1_filepath,
#' tau = "1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0",
#' outdir = outdir
#' )
#'
#' # An example of an RWR LOE with two genesets, the first defines the seeds
#' # and the output will be rankings for the genes in the second geneset
#' loe_gene_seed_list2 <- RWR_LOE(
#' data = multiplex_object_filepath,
#' seed_geneset = geneset1_filepath,
#' query_geneset = geneset2_filepath,
#' tau = "1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0",
#' outdir = outdir
#' )
#'
#' @export
RWR_LOE <- function(data = NULL, # nolint PACKAGE FUNCTION NAME
seed_geneset = NULL,
query_geneset = NULL,
restart = 0.7,
tau = "1.0",
outdir = "./",
numranked = 1.0,
plot = FALSE,
modname = "default",
cyto = 0,
verbose = FALSE) {
# Check parameters and print key parameters on verbose mode
if (is.null(seed_geneset)) {
stop("ERROR: Mandatory arguement seed_geneset is missing.")
}
if (verbose) {
cat(
paste(
"\nRdata file: ",
data,
"\nSeed geneset: ",
seed_geneset,
"\nQuery geneset: ",
query_geneset,
"\noutput: ",
outdir,
"\nproportion of ranked nodes to be saved: ",
numranked,
"\n\n"
)
)
}
# load the data:
data_list <- load_multiplex_data(data)
nw_mpo <- data_list$nw.mpo
nw_adjnorm <- data_list$nw.adjnorm
# Set tau which must match the network layers (utility function)
tau <- get_or_set_tau(nw_mpo, optTau = tau)
# Load the gene sets, or set to NULL (utility function)
seed_geneset <- load_geneset(seed_geneset, nw_mpo, verbose)$geneset
query_geneset <- load_geneset(
query_geneset,
nw_mpo,
verbose
)$geneset # will return NULL if not set by user
# Core of method
message("\nBeginning RWR_LOE ...")
# Set network_names to be the concatenation of individual network names
network_names <- paste(
names(nw_mpo)[1:nw_mpo$Number_of_Layers],
collapse = "_"
)
# Run RWR on multiplex network starting at seed genes defined by
# the seed_geneset variable.
results <- RandomWalkRestartMH::Random.Walk.Restart.Multiplex(
x = nw_adjnorm,
MultiplexObject = nw_mpo,
Seeds = seed_geneset$gene,
r = restart,
tau = tau,
weights = seed_geneset$weight,
)
# Add a rank column to the results (min_rank gives the minimum rank on a
# tie, skips next ranking)
# Any gene that scored 0, give it worst possible rank
results$RWRM_Results <- results$RWRM_Results %>% # nolint RWRMH package
dplyr::mutate(rank = dplyr::min_rank(-Score)) %>%
dplyr::mutate(
rank = dplyr::if_else(
Score == 0,
true = nw_mpo$Number_of_Nodes_Multiplex,
false = rank
)
)
# Add in additional columns to results
results$RWRM_Results <- results$RWRM_Results %>% # nolint RWRMH package
dplyr::mutate(
num_in_network = nrow(seed_geneset),
num_seeds = length(results$Seed_Nodes),
networks = network_names,
modname = modname,
seed_geneset = seed_geneset$setid[1]
)
# If a second geneset was input, flag genes that are in query_geneset
if (!is.null(query_geneset)) {
query_message <- "A query geneset was provided. Results will be filtered for only genes from the query geneset." # nolint message
message(query_message)
results$RWRM_Results <- results$RWRM_Results %>% # nolint RWRMH package
tibble::add_column(query_geneset = query_geneset$setid[1]) %>%
tibble::add_column(
InQueryGeneset = as.numeric(
results$RWRM_Results$NodeNames %in% query_geneset$gene
)
)
} else {
message("A query geneset was not provided.")
}
message("... RWR_LOE complete.")
print(head(results$RWRM_Results))
# Define output path
if (!is.null(query_geneset)) {
out_path <- get_file_path(
"RWR-LOE",
seed_geneset$setid[1],
"vs",
query_geneset$setid[1],
modname,
outdir = outdir,
ext = ".ranks.tsv"
)
} else {
out_path <- get_file_path("RWR-LOE",
seed_geneset$setid[1],
modname,
outdir = outdir,
ext = ".ranks.tsv"
)
}
message(paste("Output path:", out_path))
# Save the table of results, ie, scores and ranks
# for each gene in the network.
result_table <- results$RWRM_Results %>% dplyr::slice_head(prop = numranked)
write_table(result_table, out_path, verbose = TRUE)
# If evaluation mode is turned on and both a query and seed geneset are present, plot ROC and PRC.
if (!is.null(query_geneset) && plot) {
message("evaluating metrics for finding query genes from seeds")
metrics <- calc_metrics_loe(
results$RWRM_Results,
seed_geneset,
query_geneset
)
out_path <- get_file_path(
seed_geneset$setid[1],
query_geneset$setid[1],
modname,
outdir = outdir,
ext = "metrics.tsv"
)
write_table(metrics$summary, out_path)
message(paste("Saved metrics summary to file:", out_path))
save_plots_loe(metrics, seed_geneset, query_geneset, outdir, modname)
} else if (is.null(query_geneset) && plot) {
warning(sprintf("Plot mode is turned ON but there is no query geneset, evaluation will not occur.\n")) # nolint message
}
# If cytoscape mode is turned on, open an interactive cytoscape session.
if (cyto > 0) {
message("opening top n network in cytoscape")
view_top_network_loe(
results,
nw_mpo,
seed_geneset,
query_geneset,
ntop = cyto,
cyto = cyto,
modname = modname
)
}
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.