# library(r4projects)
# setwd(get_project_wd())
# source("R/enrich_pathways.R")
# source("R/7_utils.R")
#
# setwd("demo_data/2_smartd_project")
# load("peak_marker")
# load("urine_metabolomics_data.rda")
# # load("hmdb_compound_ms1.rda")
# load("kegg_compound_ms1.rda")
# # load("metabolic_network.rda")
# # edge_data <- tidygraph::as_tibble(metabolic_network, active = "edges")
# # node_data <- tidygraph::as_tibble(metabolic_network, active = "nodes")
# #
# # node_data <-
# # node_data %>%
# # dplyr::filter(!is.na(KEGG_ID))
# #
# # edge_data <-
# # edge_data %>%
# # dplyr::filter(!is.na(from_compound_KEGG_ID) &
# # !is.na(to_compound_KEGG_ID)) %>%
# # dplyr::filter(
# # from_compound_KEGG_ID %in% node_data$KEGG_ID &
# # to_compound_KEGG_ID %in% node_data$KEGG_ID
# # )
# #
# # node_data <-
# # node_data %>%
# # dplyr::filter(KEGG_ID %in% unique(
# # c(
# # edge_data$from_compound_KEGG_ID,
# # edge_data$to_compound_KEGG_ID
# # )
# # ))
# #
# # node_data <-
# # node_data %>%
# # dplyr::mutate(name = KEGG_ID) %>%
# # dplyr::distinct(name, .keep_all = TRUE)
# #
# # edge_data <-
# # edge_data %>%
# # dplyr::mutate(from = from_compound_KEGG_ID, to = to_compound_KEGG_ID) %>%
# # dplyr::filter(from %in% node_data$name &
# # to %in% node_data$name) %>%
# # dplyr::distinct(from, to, .keep_all = TRUE)
# #
# # node_data <-
# # node_data %>%
# # dplyr::filter(name %in% unique(c(edge_data$from, edge_data$to)))
# #
# # metabolic_network <-
# # tidygraph::tbl_graph(nodes = node_data,
# # edges = edge_data,
# # directed = FALSE)
# #
# # save(metabolic_network, file = "metabolic_network.rda")
# # load("metabolic_network.rda")
# #
# ###remove duplicated edges
# # node_data <- tidygraph::as_tibble(metabolic_network, active = "nodes")
# # edge_data <- tidygraph::as_tibble(metabolic_network, active = "edges")
# # edge_data <-
# # edge_data %>%
# # dplyr::mutate(from = from_compound_KEGG_ID, to = to_compound_KEGG_ID) %>%
# # dplyr::filter(from != to) %>%
# # dplyr::distinct(from, to, .keep_all = TRUE)
# #
# # edge_data <-
# # edge_data %>%
# # dplyr::mutate(rpair_id = apply(cbind(from, to), 1, function(x)
# # paste(sort(as.character(
# # x
# # )), collapse = "_"))) %>%
# # dplyr::distinct(rpair_id, .keep_all = TRUE)
# #
# #
# # metabolic_network <-
# # tidygraph::tbl_graph(nodes = node_data,
# # edges = edge_data,
# # directed = FALSE)
# #
# # save(metabolic_network, file = "metabolic_network.rda")
# load("metabolic_network.rda")
# # load("hmdb_pathway.rda")
# load("kegg_hsa_pathway.rda")
#
# # ###for HMDB, only remain the primary_pathway
# # hmdb_pathway <-
# # hmdb_pathway %>%
# # dplyr::filter(stringr::str_detect(pathway_class, "primary_pathway"))
#
# library(tidymass)
# library(tidyverse)
# library(igraph)
# library(ggraph)
#
# up_marker <-
# peak_marker %>%
# dplyr::filter(score > 0 & correlation1 > 0.3 & p1_adj < 0.05) %>%
# dplyr::select("Gene ID", score, p1_adj)
#
# down_marker <-
# peak_marker %>%
# dplyr::filter(score < 0 &
# correlation1 < -0.3 & p1_adj < 0.05) %>%
# dplyr::select("Gene ID", score, p1_adj)
#
# up_marker <-
# up_marker %>%
# dplyr::left_join(urine_metabolomics_data@variable_info,
# by = c("Gene ID" = "variable_id"))
#
# down_marker <-
# down_marker %>%
# dplyr::left_join(urine_metabolomics_data@variable_info,
# by = c("Gene ID" = "variable_id"))
#
# feature_table_marker_up <-
# up_marker %>%
# dplyr::select("Gene ID", mz, rt, score, p1_adj) %>%
# dplyr::rename(variable_id = "Gene ID",
# degree = score,
# p_value = p1_adj) %>%
# dplyr::mutate(polarity = case_when(
# stringr::str_detect(variable_id, "POS") ~ "positive",
# stringr::str_detect(variable_id, "NEG") ~ "negative"
# ))
#
# feature_table_marker_down <-
# down_marker %>%
# dplyr::select("Gene ID", mz, rt, score, p1_adj) %>%
# dplyr::rename(variable_id = "Gene ID",
# degree = score,
# p_value = p1_adj) %>%
# dplyr::mutate(polarity = case_when(
# stringr::str_detect(variable_id, "POS") ~ "positive",
# stringr::str_detect(variable_id, "NEG") ~ "negative"
# ))
#
# feature_table_marker <-
# rbind(feature_table_marker_up, feature_table_marker_down)
#
# feature_table_all <-
# urine_metabolomics_data@variable_info %>%
# dplyr::mutate(polarity = case_when(
# stringr::str_detect(variable_id, "POS") ~ "positive",
# stringr::str_detect(variable_id, "NEG") ~ "negative"
# )) %>%
# dplyr::select(variable_id, mz, rt, polarity)
#
# metabolite_database <-
# kegg_compound_ms1
#
# pathway_database <-
# kegg_hsa_pathway
#
# column <- "rp"
# adduct.table <- NULL
# adduct.table = NULL
# ms1.match.ppm = 15
# rt.match.tol = 5
# mz.ppm.thr = 400
# threads = 3
# include_hidden_metabolites = FALSE
#
# fpa_result <-
# perform_fpa(
# feature_table_marker = feature_table_marker,
# feature_table_all = feature_table_all,
# metabolite_database = metabolite_database,
# column = column,
# adduct.table = adduct.table,
# ms1.match.ppm = ms1.match.ppm,
# rt.match.tol = rt.match.tol,
# mz.ppm.thr = mz.ppm.thr,
# threads = threads,
# include_hidden_metabolites = include_hidden_metabolites,
# metabolic_network = metabolic_network,
# pathway_database = pathway_database
# )
#
#
# #####network visualization
# result$enriched_pathways
#
# plot_metabolic_network_fpa(
# fpa_result,
# feature_table_marker = feature_table_marker,
# include_feature = FALSE,
# include_hidden_metabolites = FALSE,
# add_compound_name = TRUE
# )
#
# plot_metabolic_module_fpa(
# fpa_result,
# feature_table_marker = feature_table_marker,
# include_feature = FALSE,
# include_hidden_metabolites = FALSE,
# add_compound_name = TRUE,
# metabolic_module_index = 6
# )
#
#' Perform Feature-based Pathway Analysis (FPA)
#'
#' This function performs feature-based pathway analysis. It contains
#' metabolite annotation based on ms1, network-based metabolite filtering,
#' metabolic module identification.
#'
#' @param feature_table_marker A data frame of marker features.
#' @param feature_table_all A data frame containing all features.
#' @param metabolite_database A metabolite database object containing metabolite spectra.
#' @param column Character vector specifying the chromatography column type. Default is `c("rp", "hilic")`.
#' @param adduct.table Optional data frame containing adduct information. Default is `NULL`.
#' @param ms1.match.ppm Numeric; mass tolerance for MS1 matching in parts per million (ppm). Default is `10`.
#' @param rt.match.tol Numeric; retention time tolerance threshold in seconds. Default is `5`.
#' @param mz.ppm.thr Numeric; M/Z tolerance threshold for filtering in ppm. Default is `400`.
#' @param threads Integer; number of threads to use for parallel processing. Default is `3`.
#' @param include_hidden_metabolites Logical; whether to include hidden metabolites. Default is `FALSE`.
#' @param metabolic_network A metabolic network object representing metabolic reactions.
#' @param pathway_database A pathway database object containing metabolic pathways.
#'
#' @return A list containing:
#' - `metabolic_module_result`: A data frame with metabolic module information.
#' - `final_metabolic_network`: A `tbl_graph` object representing the final metabolic network.
#' - `annotation_table_final`: A data frame with final annotated features.
#'
#' @export
#'
perform_fpa <-
function(feature_table_marker,
feature_table_all,
metabolite_database,
column = c("rp", "hilic"),
adduct.table = NULL,
ms1.match.ppm = 25,
rt.match.tol = 5,
##unit is second
mz.ppm.thr = 400,
threads = 3,
include_hidden_metabolites = FALSE,
metabolic_network,
pathway_database) {
###check the feature_table_marker and feature_table_all
check_feature_table_marker(feature_table_marker)
check_feature_table_all(feature_table_all)
###metabolite annotation
message("Annotating metabolites for feature table...\n")
# Extract metabolic network data: nodes (metabolites) and edges (connections)
edge_data <-
tidygraph::as_tibble(metabolic_network, active = "edges")
node_data <-
tidygraph::as_tibble(metabolic_network, active = "nodes")
# Remove entries with missing KEGG IDs from the metabolite database
metabolite_database@spectra.info <-
metabolite_database@spectra.info %>%
dplyr::filter(!is.na(KEGG.ID)) %>%
dplyr::filter(KEGG.ID %in% node_data$KEGG_ID)
# Perform metabolite annotation based on ms1
annotation_table_all <-
annotate_metabolites_fpa(
feature_table = feature_table_all,
metabolite_database = metabolite_database,
column = column,
adduct.table = adduct.table,
ms1.match.ppm = ms1.match.ppm,
rt.match.tol = rt.match.tol,
mz.ppm.thr = mz.ppm.thr,
threads = threads
)
# Remove pathways that contain no compound
remain_idx <-
pathway_database@compound_list %>%
purrr::map(function(x) {
nrow(x)
}) %>%
unlist() %>%
`>`(0) %>%
which()
pathway_database <-
filter_pathway(object = pathway_database, remain_idx = remain_idx)
# Remove pathway databases that have no pathways left after filtering
if (length(pathway_database) == 0) {
stop("All pathway database length is zero.\n")
}
# Combine peaks into metabolite classes based on retention time similarity
####
message("Identifying metablite classes...\n")
annotation_table_all <-
unique(annotation_table_all$Lab.ID) %>%
furrr::future_map(
.f = function(temp_id) {
x = annotation_table_all %>%
dplyr::filter(Lab.ID == temp_id)
x <- x %>%
dplyr::mutate(rt = as.numeric(rt)) %>%
dplyr::arrange(rt)
rt_class <-
group_peaks_rt(rt = x$rt, rt.tol = rt.match.tol) %>%
dplyr::arrange(rt)
rt_class <- paste(x$Lab.ID[1], rt_class$class, sep = "_")
x <-
data.frame(x,
compound_class = rt_class,
stringsAsFactors = FALSE)
x <-
unique(x$compound_class) %>%
purrr::map(function(y) {
z <-
x[x$compound_class == y, , drop = FALSE]
score <- score_peak_group(z)
z <- data.frame(z, score, stringsAsFactors = FALSE)
z
}) %>%
do.call(rbind, .) %>%
as.data.frame()
x
},
.progress = TRUE
) %>%
do.call(rbind, .) %>%
as.data.frame()
# ####if our method can be consistent with standard method
# annotation_standard <-
# urine_metabolomics_data@variable_info %>%
# dplyr::filter(!is.na(SS)) %>%
# dplyr::filter(!is.na(KEGG.ID)) %>%
# dplyr::filter(KEGG.ID %in% node_data$KEGG_ID) %>%
# dplyr::filter(KEGG.ID %in% metabolite_database@spectra.info$KEGG.ID)
#
# compared_result <-
# annotation_standard %>%
# dplyr::select(variable_id, Compound.name, KEGG.ID, SS) %>%
# dplyr::left_join(annotation_table_all[, c("variable_id",
# "Compound.name",
# "KEGG.ID",
# "compound_class",
# "score")], by = "variable_id")
#
# write.csv(compared_result, "compared_result.csv")
#
# temp <-
# unique(compared_result$variable_id) %>%
# purrr::map(function(x) {
# temp <-
# compared_result %>%
# dplyr::filter(variable_id == x)
#
# accurate_score <-
# temp %>%
# dplyr::filter(KEGG.ID.x == KEGG.ID.y) %>%
# pull(score)
# max_score <- max(temp$score)
# c(accurate_score = accurate_score,
# max_score = max_score,
# diff_score = accurate_score - max_score)
# }) %>%
# do.call(rbind, .) %>%
# as.data.frame() %>%
# dplyr::filter(diff_score != 0)
#
# temp %>%
# dplyr::filter(max_score >= 80) %>%
# pull(accurate_score)
#
#
# idx = temp %>% lapply(length) %>% unlist() %>% `==`(0) %>% which()
#
# temp2 = urine_metabolomics_data@variable_info %>%
# dplyr::filter(variable_id %in% unique(compared_result$variable_id)[idx]) %>%
# dplyr::left_join(metabolite_database@spectra.info[,c("KEGG.ID", "Formula")],
# by = "KEGG.ID")
# write.csv(temp2, "temp2.csv")
#
# x = masstools::sum_formula(formula = "C9H18NO4", adduct = "M+H")
# Rdisop::getMass(molecule = Rdisop::getMolecule("C9H18NO4"))
# (156.040183 - 156.0514)*10^6/400
# Remove redundant metabolite annotations
# calculate_redundance(annotation_table = annotation_table_all)
annotation_table_all <-
remove_redundancy(annotation_table = annotation_table_all)
# calculate_redundance(annotation_table = annotation_table_all2)
# Filter the marker feature table to include only annotated markers
feature_table_marker <-
feature_table_marker %>%
dplyr::filter(variable_id %in% annotation_table_all$variable_id)
annotation_table_marker <-
annotation_table_all %>%
dplyr::filter(variable_id %in% feature_table_marker$variable_id)
message("Detecting metabolic modules from metabolic reaction network.\n")
###detected_metabolites are IDs of metabolites that matched with the metabolic features
detected_metabolites <-
unique(annotation_table_marker$KEGG.ID)
###hidden metabolites are metabolites that are not matched with the metabolic features,
###and are connected to the detected_metabolites in the metabolic network within three steps
message("Detecting hidden metabolites...\n")
if (include_hidden_metabolites) {
hidden_metabolites <-
(
metabolic_network = metabolic_network,
detected_metabolites = detected_metabolites,
threads = threads,
max.reaction = 3
)
hidden_metabolites <-
hidden_metabolites %>%
unlist() %>%
unique()
} else{
hidden_metabolites <- NULL
}
total_metabolites <-
c(detected_metabolites, hidden_metabolites)
####extract subnetwork
sub_metabolic_network <-
metabolic_network %>%
tidygraph::activate("nodes") %>%
dplyr::filter(name %in% total_metabolites) %>%
dplyr::mutate(node_class = ifelse(name %in% detected_metabolites, "Detected", "Hidden")) %>%
tidygraph::activate(what = "nodes") %>%
dplyr::mutate(degree2 = tidygraph::centrality_degree())
# sub_metabolic_network2 <-
# igraph::subgraph(graph = metabolic_network,
# vids = total_metabolites) %>%
# tidygraph::as_tbl_graph() %>%
# dplyr::mutate(node_class = ifelse(name %in% detected_metabolites, "Detected", "Hidden")) %>%
# tidygraph::activate(what = "nodes") %>%
# dplyr::mutate(degree2 = tidygraph::centrality_degree())
###metabolic_module detection
#####detecting metabolic modules
message("Detecting metabolic modules...\n")
metabolic_modules <-
identify_metabolic_modules(
sub_metabolic_network = sub_metabolic_network,
detected_metabolites = detected_metabolites,
hidden_metabolites = hidden_metabolites
)
###calculate activity scores and impacts of metabolic modules
message("Calculating quality scores of metabolic modules.\n")
module_quality_score <-
calculate_activity_socre(
metabolic_modules = metabolic_modules,
detected_metabolites = detected_metabolites,
hidden_metabolites = hidden_metabolites,
sub_metabolic_network = sub_metabolic_network,
threads = threads
)
# module_quality_score <-
# metabolic_modules %>%
# purrr::map(function(x) {
# calculate_module_quality(graph = sub_metabolic_network, nodes = x)$conductance
# }) %>%
# unlist()
##Module impact
message("Calculating impacts of metabolic modules...\n")
module_impact <-
lapply(metabolic_modules, function(temp_group) {
temp_graph <-
igraph::subgraph(graph = sub_metabolic_network, v = temp_group)
centrality <- calculate_centrality(graph = temp_graph, type = "d")
temp_idx <- which(temp_group %in% detected_metabolites)
module_impact <- sum(centrality[temp_idx]) / sum(centrality)
module_impact
})
module_impact <- unlist(module_impact)
####Hub Dominance Score (HDS) or Dominant Edge Ratio
##DER = \frac{\sum_{\text{top } K} e_i}{|E|}
# where:
# K is a small fraction (e.g., top 10-20%) of nodes sorted by degree.
# e_i is the number of edges connected to those nodes.
# |E| is the total number of edges.
#
# Rationale: If the top nodes account for almost all edges, DER → 1.
message("Calculating Dominant Edge Ratio (DER) of metabolic modules...\n")
module_dominant_edge_rate <-
lapply(metabolic_modules, function(temp_group) {
temp_graph <-
igraph::subgraph(graph = sub_metabolic_network, v = temp_group)
all_degree <- igraph::degree(temp_graph)
# Sort nodes by degree (descending)
sorted_degrees <- sort(all_degree, decreasing = TRUE)
# Define top K nodes (e.g., top 20% most connected nodes)
# K <- ceiling(0.2 * length(V(temp_graph))) # 20% of nodes
# top_nodes <- names(sorted_degrees)[1:K]
top_nodes <-
names(all_degree[all_degree > quantile(sorted_degrees, probs = 0.9)])
if (length(top_nodes) == 0) {
top_nodes <- names(sorted_degrees)[1]
}
all_edges <-
igraph::as_edgelist(temp_graph)
# Count edges where at least one node is in the top nodes
dominant_edges <-
sum(all_edges[, 1] %in% top_nodes |
all_edges[, 2] %in% top_nodes)
# Compute Dominant Edge Ratio (DER)
total_edges <- igraph::gsize(temp_graph)
dominant_edges / total_edges
}) %>%
unlist()
##get NULL distribution
message("Identify null distributation of metabolic module quality scores...\n")
null_quality_score <-
generate_null_activity_score_distribution(
annotation_table_all = annotation_table_all,
feature_table_marker = feature_table_marker,
metabolite_database = metabolite_database,
metabolic_network = metabolic_network,
permutation_times = 5,
threads = threads,
adduct.table = adduct.table,
column = column,
ms1.match.ppm = ms1.match.ppm,
mz.ppm.thr = mz.ppm.thr,
include_hidden_metabolites = include_hidden_metabolites
)
# null_quality_score <-
# generate_null_conductance_distribution(
# annotation_table_all = annotation_table_all,
# feature_table_marker = feature_table_marker,
# metabolite_database = metabolite_database,
# metabolic_network = metabolic_network,
# permutation_times = 5,
# threads = threads,
# adduct.table = adduct.table,
# column = column,
# ms1.match.ppm = ms1.match.ppm,
# mz.ppm.thr = mz.ppm.thr,
# include_hidden_metabolites = include_hidden_metabolites
# )
null_quality_score <-
unlist(null_quality_score)
# mean(module_quality_score)
# mean(null_quality_score)
# median(mean(module_quality_score))
# median(mean(null_quality_score))
####calculate the p values for each metabolic module
module_quality_score <- module_quality_score + 0.00000001
###
para <- MASS::fitdistr(x = null_quality_score, densfun = "gamma")[[1]]
# standard.distribution <- rgamma(n = length(null_quality_score), shape = para[1], rate = para[2])
standard.distribution <- rgamma(n = 100000,
shape = para[1],
rate = para[2])
# ks.test(x = null_quality_score, y = standard.distribution)
cdf <- ecdf(x = standard.distribution)
metabolic_module_p <- 1 - cdf(module_quality_score)
#order module according to p value
# metabolic_modules <- metabolic_modules[order(metabolic_module_p)]
# module_impact <- module_impact[order(metabolic_module_p)]
# metabolic_module_p <- metabolic_module_p[order(metabolic_module_p)]
# module_dominant_edge_rate <- module_dominant_edge_rate[order(metabolic_module_p)]
## get information of each metabolic module
metabolic_module_result <-
lapply(metabolic_modules, function(x) {
temp_graph <-
igraph::subgraph(graph = metabolic_network, v = x)
detected_metabolite_number <- sum(x %in% detected_metabolites)
hidden_metabolite_number <- sum(x %in% hidden_metabolites)
total_metabolite_number <- length(x)
i.id <- x[which(x %in% detected_metabolites)]
h.id <- x[which(x %in% hidden_metabolites)]
detected_metabolite_name <-
data.frame(KEGG.ID = i.id) %>%
dplyr::left_join(
annotation_table_all %>%
dplyr::select(KEGG.ID, Compound.name) %>%
dplyr::distinct(KEGG.ID, .keep_all = TRUE),
by = "KEGG.ID"
) %>%
pull(Compound.name) %>%
paste(collapse = "{}")
hidden_metabolite_name <-
data.frame(KEGG.ID = h.id) %>%
dplyr::left_join(
annotation_table_all %>%
dplyr::select(KEGG.ID, Compound.name) %>%
dplyr::distinct(KEGG.ID, .keep_all = TRUE),
by = "KEGG.ID"
) %>%
pull(Compound.name) %>%
paste(collapse = "{}")
total_metabolite_name <-
data.frame(KEGG.ID = x) %>%
dplyr::left_join(
annotation_table_all %>%
dplyr::select(KEGG.ID, Compound.name) %>%
dplyr::distinct(KEGG.ID, .keep_all = TRUE),
by = "KEGG.ID"
) %>%
pull(Compound.name) %>%
paste(collapse = "{}")
i.id <- paste(i.id, collapse = "{}")
h.id <- paste(h.id, collapse = "{}")
t.id <- paste(x, collapse = "{}")
temp_result <-
c(
total_metabolite_number,
detected_metabolite_number,
hidden_metabolite_number,
t.id,
i.id,
h.id,
total_metabolite_name,
detected_metabolite_name,
hidden_metabolite_name
)
return(temp_result)
})
metabolic_module_result <-
do.call(rbind, metabolic_module_result)
metabolic_module_result <-
data.frame(
module_impact,
module_dominant_edge_rate,
metabolic_module_p,
metabolic_module_result,
stringsAsFactors = FALSE
) %>%
dplyr::arrange(metabolic_module_p) %>%
dplyr::mutate(metabolic_module_name = paste("metabolic_module", 1:nrow(.), sep = "_")) %>%
dplyr::select(metabolic_module_name, dplyr::everything())
colnames(metabolic_module_result) <- c(
"Name",
"Impact",
"Dominant_edge_rate",
"p_value",
"Total_metabolite_number",
"Detected_metabolite_number",
"Hidden_metabolite_number",
"Total_metabolite_id",
"Detected_metabolite_id",
"hidden_metabolite_id",
"Total_metabolite_name",
"Detected_metabolite_name",
"hidden_metabolite_name"
)
metabolic_module_result$Total_metabolite_number <-
as.numeric(metabolic_module_result$Total_metabolite_number)
metabolic_module_result$Detected_metabolite_number <-
as.numeric(metabolic_module_result$Detected_metabolite_number)
metabolic_module_result$Hidden_metabolite_number <-
as.numeric(metabolic_module_result$Hidden_metabolite_number)
#####extract the final dysregulated networks
final_metabolite_id <-
metabolic_module_result %>%
dplyr::filter(p_value < 0.05) %>%
dplyr::pull(Total_metabolite_id) %>%
stringr::str_split(pattern = "\\{\\}") %>%
unlist() %>%
unique()
annotation_table_final <-
annotation_table_all %>%
dplyr::filter(
variable_id %in% feature_table_marker$variable_id &
KEGG.ID %in% final_metabolite_id
)
# ####not in the workflow
# calculate_redundance(annotation_table_final)
#
# annotation_standard <-
# urine_metabolomics_data@variable_info %>%
# dplyr::filter(!is.na(SS)) %>%
# dplyr::filter(!is.na(KEGG.ID)) %>%
# dplyr::filter(KEGG.ID %in% node_data$KEGG_ID) %>%
# dplyr::filter(KEGG.ID %in% metabolite_database@spectra.info$KEGG.ID)
#
# dim(annotation_standard)
#
# compared_result2 <-
# annotation_standard %>%
# dplyr::select(variable_id, Compound.name, KEGG.ID, SS) %>%
# dplyr::left_join(annotation_table_final[, c("variable_id",
# "Compound.name",
# "KEGG.ID",
# "compound_class",
# "score")], by = "variable_id")
#
# write.csv(compared_result2, file = "compared_result2.csv")
#
# unique(compared_result2$variable_id) %>%
# purrr::map(function(x) {
# temp <-
# compared_result2 %>%
# dplyr::filter(variable_id == x)
#
# if (all(is.na(temp$KEGG.ID.y))) {
# return("No annotation")
# }
#
# if (unique(temp$KEGG.ID.x) %in% temp$KEGG.ID.y) {
# return("Correct annotation")
# } else{
# return("Incorrect annotation")
# }
#
# }) %>%
# unlist() %>%
# table()
#
# compared_result3 <-
# compared_result2 %>%
# dplyr::filter(!is.na(KEGG.ID.y))
#
# write.csv(compared_result3, file = "compared_result3.csv")
final_metabolic_network <-
metabolic_network %>%
tidygraph::activate("nodes") %>%
dplyr::filter(name %in% final_metabolite_id) %>%
tidygraph::activate(what = "nodes") %>%
dplyr::mutate(node_class = ifelse(name %in% detected_metabolites, "Detected", "Hidden")) %>%
tidygraph::activate(what = "nodes") %>%
dplyr::mutate(degree2 = tidygraph::centrality_degree())
final_node_data <-
tidygraph::as_tibble(final_metabolic_network, active = "nodes") %>%
dplyr::select(name, KEGG_ID, HMDB_ID, Compound_name, node_class)
final_edge_data <-
tidygraph::as_tibble(final_metabolic_network, active = "edges") %>%
dplyr::mutate(from = from_compound_KEGG_ID, to = to_compound_KEGG_ID) %>%
dplyr::select(from, to) %>%
dplyr::mutate(edge_class = "metabolic_reaction")
final_edge_data2 <-
annotation_table_final %>%
dplyr::filter(isotope == "[M]") %>%
dplyr::select(variable_id, KEGG.ID) %>%
dplyr::rename(from = variable_id, to = KEGG.ID) %>%
dplyr::mutate(edge_class = "feature_metabolite")
final_node_data2 <-
data.frame(name = unique(final_edge_data2$from)) %>%
dplyr::mutate(
KEGG_ID = NA,
HMDB_ID = NA,
Compound_name = NA,
node_class = "Feature"
)
###add feature information to the final metabolic network
final_node_data <-
rbind(final_node_data, final_node_data2)
final_edge_data <-
rbind(final_edge_data, final_edge_data2)
final_node_data <-
final_node_data %>%
dplyr::filter(name %in% unique(c(
final_edge_data$from, final_edge_data$to
))) %>%
dplyr::distinct(name, .keep_all = TRUE)
final_edge_data <-
final_edge_data %>%
dplyr::filter(from %in% final_node_data$name &
to %in% final_node_data$name)
final_metabolic_network <-
tidygraph::tbl_graph(nodes = final_node_data,
edges = final_edge_data,
directed = FALSE)
####Pathway enrichment for all the metabolites
kegg_id <-
annotation_table_final$KEGG.ID
message("Enriching pathways for dysregulated metabolic network...\n")
enriched_pathways <-
enrich_pathways(
query_id = kegg_id,
query_type = "compound",
id_type = "KEGG",
pathway_database = pathway_database,
p_cutoff = 0.05,
p_adjust_method = "fdr",
threads = threads
)
message("Enriching pathways for each metabolic module...\n")
enriched_pathways_list <-
vector("list", length = nrow(metabolic_module_result))
names(enriched_pathways_list) <- metabolic_module_result$Name
for (i in 1:sum(metabolic_module_result$p_value < 0.05)) {
cat(i, " ")
x <-
metabolic_module_result$Total_metabolite_id[i] %>%
stringr::str_split(pattern = "\\{\\}") %>%
unlist() %>%
unique()
enriched_pathways_list[[i]] <-
enrich_pathways(
query_id = x,
query_type = "compound",
id_type = "KEGG",
pathway_database = pathway_database,
p_cutoff = 0.05,
p_adjust_method = "fdr",
threads = threads
)
}
return(
list(
dysregulated_metabolic_module = metabolic_module_result,
dysregulated_metabolic_network = final_metabolic_network,
annotation_table = annotation_table_final,
enriched_pathways = enriched_pathways,
enriched_pathways_list = enriched_pathways_list
)
)
}
#' Identify Metabolic Modules
#'
#' This function identifies metabolic modules from a metabolic reaction network.
#' It applies a clustering algorithm to detect modules based on network structure.
#' Large modules are recursively split, and weakly connected hidden metabolites are removed.
#'
#' @param sub_metabolic_network A metabolic network represented as a `tbl_graph` object.
#' @param detected_metabolites A character vector of detected metabolite IDs.
#' @param hidden_metabolites A character vector of hidden metabolite IDs.
#'
#' @return A list of metabolic modules, where each module is a character vector
#' containing the metabolite IDs within the module.
#'
#' @export
identify_metabolic_modules <-
function(sub_metabolic_network,
detected_metabolites,
hidden_metabolites) {
# Apply Walktrap clustering to detect metabolic modules
metabolic_modules <- igraph::cluster_walktrap(graph = sub_metabolic_network)
# Extract module membership information
node_name <- metabolic_modules$names
membership <- metabolic_modules$membership
membership_counts <- table(membership)
# Group nodes by module membership
group <- lapply(names(membership_counts), function(x) {
node_name[which(membership == as.numeric(x))]
})
# Remove modules with fewer than 3 nodes
group <- group[unlist(lapply(group, length)) > 3]
# Recursively split modules containing more than 100 nodes
large_module_idx <- which(unlist(lapply(group, length)) > 100)
while (length(large_module_idx) > 0) {
large_group <- group[large_module_idx]
group <- group[-large_module_idx]
new_group <- lapply(large_group, function(x) {
# Extract subnetwork for the large module
temp_graph <- sub_metabolic_network %>%
tidygraph::activate("nodes") %>%
dplyr::filter(name %in% x)
# Apply clustering again within the subnetwork
temp_metabolic_modules <- igraph::cluster_walktrap(graph = temp_graph)
temp_node_name <- temp_metabolic_modules$names
temp_membership <- temp_metabolic_modules$membership
temp_membership_counts <- table(temp_membership)
# Split into smaller groups
temp_group <- lapply(names(temp_membership_counts), function(y) {
temp_node_name[which(temp_membership == as.numeric(y))]
})
# Remove small groups with fewer than 3 nodes
temp_group <- temp_group[unlist(lapply(temp_group, length)) > 3]
temp_group
})
new_group <- unlist(new_group, recursive = FALSE)
group <- c(group, new_group)
# Identify any remaining large modules
large_module_idx <- which(unlist(lapply(new_group, length)) > 100)
}
# Remove weakly connected groups where the number of edges is less than the number of nodes
remove_idx <- which(unlist(lapply(group, function(x) {
temp_graph <- sub_metabolic_network %>%
tidygraph::activate("nodes") %>%
dplyr::filter(name %in% x)
node_count <- igraph::gorder(temp_graph) # Number of nodes
edge_count <- igraph::gsize(temp_graph) # Number of edges
degree_values <- igraph::degree(graph = temp_graph, v = x)
# Remove groups where edges are fewer than nodes and all nodes have degree < 3
if (node_count - edge_count > 0 & all(degree_values < 3)) {
return(TRUE)
} else {
return(FALSE)
}
})))
if (length(remove_idx) > 0) {
group <- group[-remove_idx]
}
# Remove hidden metabolites with degree < 2
group <- lapply(group, function(x) {
temp_graph <- sub_metabolic_network %>%
tidygraph::activate("nodes") %>%
dplyr::filter(name %in% x)
degree_values <- igraph::degree(graph = temp_graph, v = x)
metabolite_class <- sapply(names(degree_values), function(y) {
ifelse(y %in% detected_metabolites, "Detected", "Hidden")
})
remove_idx <- which(degree_values == 1 &
metabolite_class == "Hidden")
while (length(remove_idx) > 0) {
x <- x[-remove_idx]
temp_graph <- sub_metabolic_network %>%
tidygraph::activate("nodes") %>%
dplyr::filter(name %in% x)
degree_values <- igraph::degree(graph = temp_graph, v = x)
metabolite_class <- sapply(names(degree_values), function(y) {
ifelse(y %in% detected_metabolites, "Detected", "Hidden")
})
remove_idx <- which(degree_values == 1 &
metabolite_class == "Hidden")
}
return(x)
})
# Remove groups that still have fewer than 3 nodes
group <- group[unlist(lapply(group, length)) > 3]
return(group)
}
#' Calculate Node Centrality in a Graph
#'
#' This function computes the centrality of nodes in a graph using either degree centrality or betweenness centrality.
#'
#' @param graph An `igraph` object representing the network.
#' @param type A character string specifying the type of centrality to calculate. Options are "degree" (default) or "betweenness".
#'
#' @return A named vector of centrality values for each node in the graph.
#'
#' @export
#'
calculate_centrality <-
function(graph, type = c("degree", "betweenness")) {
# Ensure that the specified centrality type is valid
type <- match.arg(type)
# Extract node names from the graph
node <- igraph::V(graph)$name
# Compute degree centrality if selected
if (type == "degree") {
node_degree <- igraph::degree(graph = graph, v = node)
return(node_degree)
}
# Compute betweenness centrality
node_betweenness <- lapply(node, function(x) {
i <- match(x, node)
if (i == length(node))
return(NULL)
from <- x
to <- node[(i + 1):length(node)]
# Compute shortest paths between the node and others
temp <- igraph::shortest_paths(graph = graph,
from = from,
to = to)[[1]]
# Extract nodes that lie on shortest paths
temp <- lapply(temp, function(path) {
if (length(path) <= 2)
return(NULL)
names(path)[-c(1, length(path))]
})
unlist(temp)
})
node_betweenness <- unlist(node_betweenness)
node_betweenness <- table(node_betweenness)
# Ensure that all nodes are included in the result
missing_nodes <- setdiff(node, names(node_betweenness))
if (length(missing_nodes) > 0) {
add <- rep(0, length(missing_nodes))
names(add) <- missing_nodes
node_betweenness <- c(node_betweenness, add)
}
# Ensure correct ordering of results
node_betweenness <- node_betweenness[match(node, names(node_betweenness))]
return(node_betweenness)
}
#' Identify Hidden Metabolites in a Metabolic Network
#'
#' This function identifies hidden metabolites that are connected to detected metabolites
#' within a specified number of reaction steps in a metabolic network.
#'
#' @param metabolic_network An `igraph` object representing the metabolic reaction network.
#' @param detected_metabolites A character vector of detected metabolite IDs.
#' @param threads Integer; number of parallel threads to use. Default is `3`.
#' @param max.reaction Integer; maximum number of reaction steps allowed to define a hidden metabolite. Default is `3`.
#'
#' @return A list of hidden metabolites connected to detected metabolites within the specified reaction steps.
#'
#' @export
#'
<-
function(metabolic_network,
detected_metabolites,
threads = 8,
max.reaction = 3) {
# Helper function to find hidden metabolites for each detected metabolite
temp_fun_hidden_metabolites <- function(name,
metabolic_network,
detected_metabolites,
max.reaction) {
options(warn = -1) # Suppress warnings
# Identify remaining detected metabolites for distance calculation
idx <- match(name, detected_metabolites)
to <- detected_metabolites[-c(1:idx)]
# Compute shortest path distances from the current metabolite
distance <- igraph::distances(
graph = metabolic_network,
v = name,
to = to,
mode = "all"
)[1, ]
# Set infinite distances to exceed max.reaction threshold
distance[is.infinite(distance)] <- max.reaction + 10
to <- to[which(distance <= max.reaction)]
if (length(to) == 0) {
return(NULL) # No valid hidden metabolites found
}
# Extract shortest paths between the metabolite and its connected nodes
result <- igraph::shortest_paths(
graph = metabolic_network,
from = name,
to = to,
mode = "all"
)[[1]]
# Extract node names from shortest paths and remove already detected metabolites
result <- unique(unlist(lapply(result, names)))
result <- setdiff(result, detected_metabolites)
return(result)
}
# Select parallel processing backend based on OS
if (masstools::get_os() == "windows") {
bpparam <- BiocParallel::SnowParam(workers = threads, progressbar = TRUE)
} else {
bpparam <- BiocParallel::MulticoreParam(workers = threads, progressbar = TRUE)
}
# Identify hidden metabolites using parallel processing
hidden_metabolites <- BiocParallel::bplapply(
detected_metabolites[-length(detected_metabolites)],
FUN = temp_fun_hidden_metabolites,
BPPARAM = bpparam,
metabolic_network = metabolic_network,
detected_metabolites = detected_metabolites,
max.reaction = max.reaction
)
return(hidden_metabolites)
}
#' Calculate Activity Scores for Metabolic Modules
#'
#' This function calculates activity scores for metabolic modules based on their network properties,
#' including modularity, detected metabolite contributions, and overall network connectivity.
#'
#' @param metabolic_modules A list of metabolic modules, each containing metabolite IDs.
#' @param detected_metabolites A character vector of detected metabolite IDs.
#' @param hidden_metabolites A character vector of hidden metabolite IDs.
#' @param sub_metabolic_network An `igraph` object representing the metabolic subnetwork.
#' @param threads Integer; number of parallel threads to use. Default is `3`.
#'
#' @return A numeric vector of activity scores for each metabolic module.
#'
#' @export
#'
calculate_activity_score <-
function(metabolic_modules,
detected_metabolites,
hidden_metabolites,
sub_metabolic_network,
threads = 8) {
# Helper function to compute activity scores for a given metabolic module
temp_fun_activity_scores <- function(x,
detected_metabolites,
hidden_metabolites,
total_metabolite_number,
sub_metabolic_network) {
options(warn = -1) # Suppress warnings
# Extract the subgraph for the given metabolic module
temp_graph <- igraph::subgraph(graph = sub_metabolic_network, v = x)
detected_number <- sum(x %in% detected_metabolites)
hidden_number <- sum(x %in% hidden_metabolites)
total_edges <- length(igraph::E(sub_metabolic_network)) # Total edges in network
module_edges <- length(igraph::E(temp_graph)) # Edges within the module
node_names <- names(igraph::V(temp_graph))
# Compute modularity-based contribution
value <- sapply(node_names, function(node) {
temp_value <- sapply(node_names, function(x) {
degree_x <- igraph::degree(graph = sub_metabolic_network, v = x)
degree_node <- igraph::degree(graph = sub_metabolic_network, v = node)
(degree_x * degree_node) / (4 * total_edges^2)
})
sum(temp_value)
})
modularity <- module_edges / total_edges - sum(value)
# Compute activity score
q <- modularity * sqrt(total_metabolite_number / length(x))
activity <- detected_number * q / length(x)
return(activity)
}
# Choose appropriate parallelization method based on OS
if (masstools::get_os() == "windows") {
bpparam <- BiocParallel::SnowParam(workers = threads, progressbar = TRUE)
} else {
bpparam <- BiocParallel::MulticoreParam(workers = threads, progressbar = TRUE)
}
# Compute activity scores for all metabolic modules in parallel
activity_scores <- BiocParallel::bplapply(
metabolic_modules,
temp_fun_activity_scores,
BPPARAM = bpparam,
detected_metabolites = detected_metabolites,
hidden_metabolites = hidden_metabolites,
total_metabolite_number = length(detected_metabolites),
sub_metabolic_network = sub_metabolic_network
)
return(unlist(activity_scores))
}
#' Annotate Metabolites for Feature Pathway Analysis (FPA)
#'
#' This function performs metabolite annotation for a given feature table using an MS1-based approach.
#' It processes both positive and negative ionization modes separately and extracts annotation results.
#'
#' @param feature_table A data frame containing feature data, including `variable_id`, `mz`, `rt`, and `polarity` columns.
#' @param metabolite_database A metabolite database object used for annotation.
#' @param column A character string specifying the chromatography column type. Options are `"rp"` (reverse phase) or `"hilic"` (hydrophilic interaction chromatography). Default is `"rp"`.
#' @param adduct.table Optional data frame specifying possible adducts. Default is `NULL`.
#' @param ms1.match.ppm Numeric; mass tolerance for MS1 matching in parts per million (ppm). Default is `10`.
#' @param rt.match.tol Numeric; retention time tolerance threshold for matching in seconds. Default is `5`.
#' @param mz.ppm.thr Numeric; M/Z tolerance threshold for filtering in ppm. Default is `400`.
#' @param threads Integer; number of parallel threads to use for annotation. Default is `3`.
#'
#' @return A data frame containing the annotated metabolites with polarity information.
#'
#' @export
#'
annotate_metabolites_fpa <-
function(feature_table,
metabolite_database,
column = c("rp", "hilic"),
adduct.table = NULL,
ms1.match.ppm = 10,
rt.match.tol = 5,
mz.ppm.thr = 400,
threads = 3) {
column <- match.arg(column) # Ensure valid column type
##### Separate features by polarity #####
feature_table_pos <- feature_table %>%
dplyr::filter(polarity == "positive")
feature_table_neg <-
feature_table %>% dplyr::filter(polarity == "negative")
##### Create mass dataset objects for positive and negative modes #####
# Positive mode processing
if (nrow(feature_table_pos) == 0) {
object_pos <- NULL
} else {
expression_data_pos <- data.frame(sample1 = 1:nrow(feature_table_pos))
rownames(expression_data_pos) <- feature_table_pos$variable_id
sample_info_pos <- data.frame(sample_id = "sample1", class = "Subject")
object_pos <-
massdataset::create_mass_dataset(
expression_data = expression_data_pos,
sample_info = sample_info_pos,
variable_info = feature_table_pos
)
}
# Negative mode processing
if (nrow(feature_table_neg) == 0) {
object_neg <- NULL
} else {
expression_data_neg <- data.frame(sample1 = 1:nrow(feature_table_neg))
rownames(expression_data_neg) <- feature_table_neg$variable_id
sample_info_neg <- data.frame(sample_id = "sample1", class = "Subject")
object_neg <- massdataset::create_mass_dataset(
expression_data = expression_data_neg,
sample_info = sample_info_neg,
variable_info = feature_table_neg
)
}
##### Perform metabolite annotation #####
message("Annotating metabolites in positive mode...\n")
if (!is.null(object_pos)) {
object_pos <- metid::annotate_metabolites(
object = object_pos,
database = metabolite_database,
based_on = "ms1",
polarity = "positive",
column = column,
adduct.table = adduct.table,
ms1.match.ppm = ms1.match.ppm,
candidate.num = 100,
threads = threads,
mz.ppm.thr = mz.ppm.thr
)
annotation_table_pos <-
massdataset::extract_annotation_table(object_pos) %>%
dplyr::mutate(polarity = "positive")
####add feature mz, rt and annotation formula to it
annotation_table_pos <-
annotation_table_pos %>%
dplyr::left_join(feature_table_pos[, c("variable_id", "mz", "rt")], by = c("variable_id" = "variable_id")) %>%
dplyr::left_join(metabolite_database@spectra.info[, c("Lab.ID", "Formula")],
by = c("Lab.ID" = "Lab.ID")) %>%
dplyr::select(variable_id, mz, rt, Formula, Adduct, dplyr::everything()) %>%
dplyr::mutate(isotope = "[M]")
####isotope annotation for annotations
message("Annotating isotopes in positive mode...\n")
future::plan(future::multisession, workers = threads)
system.time(
isotope_pos <-
furrr::future_map(
.x = as.data.frame(t(annotation_table_pos)),
.f = function(x) {
adduct <-
stringr::str_extract(x[5], "\\(.+\\)") %>%
stringr::str_replace("\\(", "") %>%
stringr::str_replace("\\)", "")
mz = as.numeric(stringr::str_trim(x[2], side = "both"))
rt = as.numeric(stringr::str_trim(x[3], side = "both"))
temp_iso <- try(annotate_isotope(
formula = stringr::str_trim(x[4], side = "both"),
adduct = adduct,
mz = mz,
rt = rt,
peak.mz = feature_table_pos$mz,
peak.rt = feature_table_pos$rt,
rt.tol = rt.match.tol,
mz.tol = ms1.match.ppm,
max.isotope = 3
),
silent = TRUE)
if (is(temp_iso, "try-error")) {
return(NULL)
}
if (is.null(temp_iso)) {
return(NULL)
}
# return(temp_iso)
temp_iso <-
cbind(feature_table_pos[temp_iso$peakIndex, ], temp_iso) %>%
dplyr::select(-c(peakIndex))
colnames(temp_iso) <- c("variable_id",
"mz",
"rt",
"polarity",
"mz.error",
"isotope",
"RT.error")
x <- matrix(x, nrow = 1) %>% as.data.frame()
colnames(x) <- colnames(annotation_table_pos)
temp_iso$Compound.name <- x$Compound.name
temp_iso$Lab.ID <- x$Lab.ID
temp_iso$Adduct <- x$Adduct
temp_iso$Formula <- x$Formula
# temp_iso$rt <- x$rt
temp_iso$CAS.ID <- x$CAS.ID
temp_iso$HMDB.ID <- x$HMDB.ID
temp_iso$KEGG.ID <- x$KEGG.ID
temp_iso$Database <- x$Database
temp_iso$ms2_files_id <- NA
temp_iso$ms2_spectrum_id <- NA
temp_iso$CE <- NA
temp_iso$SS <- NA
temp_iso$Total.score <- NA
temp_iso$Level <- NA
temp_iso$mz.match.score <-
(ms1.match.ppm - temp_iso$mz.error) / ms1.match.ppm
temp_iso$RT.match.score <-
(rt.match.tol - temp_iso$RT.error) / rt.match.tol
temp_iso %>%
dplyr::select(colnames(x))
},
.progress = TRUE
)
)
isotope_pos <- isotope_pos %>%
do.call(rbind, .) %>%
as.data.frame()
annotation_table_pos <-
rbind(annotation_table_pos, isotope_pos) %>%
dplyr::arrange(Compound.name, isotope, rt)
} else {
annotation_table_pos <- NULL
}
message("Annotating metabolites in negative mode...\n")
if (!is.null(object_neg)) {
object_neg <- metid::annotate_metabolites(
object = object_neg,
database = metabolite_database,
based_on = "ms1",
polarity = "negative",
column = column,
adduct.table = adduct.table,
ms1.match.ppm = ms1.match.ppm,
candidate.num = 100,
threads = threads,
mz.ppm.thr = mz.ppm.thr
)
annotation_table_neg <-
massdataset::extract_annotation_table(object_neg) %>%
dplyr::mutate(polarity = "negative")
####add feature mz, rt and annotation formula to it
annotation_table_neg <-
annotation_table_neg %>%
dplyr::left_join(feature_table_neg[, c("variable_id", "mz", "rt")], by = c("variable_id" = "variable_id")) %>%
dplyr::left_join(metabolite_database@spectra.info[, c("Lab.ID", "Formula")],
by = c("Lab.ID" = "Lab.ID")) %>%
dplyr::select(variable_id, mz, rt, Formula, Adduct, dplyr::everything()) %>%
dplyr::mutate(isotope = "[M]")
####isotope annotation for annotations
message("Annotating isotopes in negative mode...\n")
future::plan(future::multisession, workers = threads)
system.time(
isotope_neg <-
furrr::future_map(
.x = as.data.frame(t(annotation_table_neg)),
.f = function(x) {
adduct <-
stringr::str_extract(x[5], "\\(.+\\)") %>%
stringr::str_replace("\\(", "") %>%
stringr::str_replace("\\)", "")
mz = as.numeric(stringr::str_trim(x[2], side = "both"))
rt = as.numeric(stringr::str_trim(x[3], side = "both"))
temp_iso <- try(annotate_isotope(
formula = stringr::str_trim(x[4], side = "both"),
adduct = adduct,
mz = mz,
rt = rt,
peak.mz = feature_table_neg$mz,
peak.rt = feature_table_neg$rt,
rt.tol = rt.match.tol,
mz.tol = ms1.match.ppm,
max.isotope = 3
),
silent = TRUE)
if (is(temp_iso, "try-error")) {
return(NULL)
}
if (is.null(temp_iso)) {
return(NULL)
}
# return(temp_iso)
temp_iso <-
cbind(feature_table_neg[temp_iso$peakIndex, ], temp_iso) %>%
dplyr::select(-c(peakIndex))
colnames(temp_iso) <- c("variable_id",
"mz",
"rt",
"polarity",
"mz.error",
"isotope",
"RT.error")
x <- matrix(x, nrow = 1) %>% as.data.frame()
colnames(x) <- colnames(annotation_table_neg)
temp_iso$Compound.name <- x$Compound.name
temp_iso$Lab.ID <- x$Lab.ID
temp_iso$Adduct <- x$Adduct
temp_iso$Formula <- x$Formula
# temp_iso$rt <- x$rt
temp_iso$CAS.ID <- x$CAS.ID
temp_iso$HMDB.ID <- x$HMDB.ID
temp_iso$KEGG.ID <- x$KEGG.ID
temp_iso$Database <- x$Database
temp_iso$ms2_files_id <- NA
temp_iso$ms2_spectrum_id <- NA
temp_iso$CE <- NA
temp_iso$SS <- NA
temp_iso$Total.score <- NA
temp_iso$Level <- NA
temp_iso$mz.match.score <-
(ms1.match.ppm - temp_iso$mz.error) / ms1.match.ppm
temp_iso$RT.match.score <-
(rt.match.tol - temp_iso$RT.error) / rt.match.tol
temp_iso %>%
dplyr::select(colnames(x))
},
.progress = TRUE
)
)
isotope_neg <- isotope_neg %>%
do.call(rbind, .) %>%
as.data.frame()
annotation_table_neg <-
rbind(annotation_table_neg, isotope_neg) %>%
dplyr::arrange(Compound.name, isotope, rt)
} else {
annotation_table_neg <- NULL
}
##### Combine annotation tables #####
annotation_table <-
rbind(annotation_table_pos, annotation_table_neg)
return(annotation_table)
}
#' Generate Null Distribution of Activity Scores
#'
#' This function generates a null distribution of activity scores by performing random sampling
#' of marker metabolites and calculating activity scores through network analysis.
#'
#' @param annotation_table_all A data frame containing all annotated metabolites.
#' @param feature_table_marker A data frame containing marker features.
#' @param metabolite_database A metabolite database object used for annotation.
#' @param metabolic_network An `igraph` object representing the metabolic network.
#' @param permutation_times Integer; number of permutations to perform. Default is `20`.
#' @param threads Integer; number of parallel threads to use. Default is `8`.
#' @param adduct.table Optional data frame specifying possible adducts. Default is `NULL`.
#' @param column Character; chromatography column type. Options are `"rp"` or `"hilic"`. Default is `"rp"`.
#' @param ms1.match.ppm Numeric; MS1 matching tolerance in ppm. Default is `25`.
#' @param mz.ppm.thr Numeric; M/Z tolerance threshold in ppm. Default is `400`.
#' @param include_hidden_metabolites Logical; whether to include hidden metabolites in the network analysis. Default is `FALSE`.
#'
#' @return A list of numeric vectors representing null activity scores for each permutation.
#'
#' @export
#'
generate_null_activity_score_distribution <-
function(annotation_table_all,
feature_table_marker,
metabolite_database,
metabolic_network,
permutation_times = 20,
threads = 8,
adduct.table = NULL,
column = c("rp", "hilic"),
ms1.match.ppm = 25,
mz.ppm.thr = 400,
include_hidden_metabolites = FALSE) {
column <- match.arg(column) # Ensure valid column type
# Initialize list to store activity scores for each permutation
null_activity_score <- vector(mode = "list", length = permutation_times)
marker_number <- nrow(feature_table_marker) # Number of markers
for (i in seq_len(permutation_times)) {
cat(i, " ")
# Randomly sample markers for permutation
if (nrow(feature_table_marker) / length(unique(annotation_table_all$variable_id)) < 0.2) {
temp_marker_variable_id <- sample(
x = setdiff(
unique(annotation_table_all$variable_id),
feature_table_marker$variable_id
),
size = marker_number,
replace = FALSE
)
} else{
temp_marker_variable_id <- sample(
x = unique(annotation_table_all$variable_id),
size = marker_number,
replace = FALSE
)
}
# Extract annotation information for sampled markers
temp_annotation_table_marker <- annotation_table_all %>%
dplyr::filter(variable_id %in% temp_marker_variable_id)
# Extract detected metabolites from annotation table
temp_detected_metabolites <- unique(temp_annotation_table_marker$KEGG.ID)
# Identify hidden metabolites connected to detected metabolites
if (include_hidden_metabolites) {
temp_hidden_metabolites <- (
metabolic_network = metabolic_network,
detected_metabolites = temp_detected_metabolites,
threads = threads,
max.reaction = 3
) %>% unlist() %>% unique()
} else {
temp_hidden_metabolites <- NULL
}
temp_total_metabolites <- c(temp_detected_metabolites, temp_hidden_metabolites)
# Extract subnetwork containing detected and hidden metabolites
temp_sub_metabolic_network <- metabolic_network %>%
tidygraph::activate("nodes") %>%
dplyr::filter(name %in% temp_total_metabolites) %>%
dplyr::mutate(node_class = ifelse(name %in% temp_detected_metabolites, "Detected", "Hidden")) %>%
tidygraph::mutate(degree2 = tidygraph::centrality_degree())
# Identify metabolic modules in the subnetwork
temp_metabolic_modules <- identify_metabolic_modules(
sub_metabolic_network = temp_sub_metabolic_network,
detected_metabolites = temp_detected_metabolites,
hidden_metabolites = temp_hidden_metabolites
)
# Calculate activity scores for the identified metabolic modules
temp_activity_score <- calculate_activity_score(
metabolic_modules = temp_metabolic_modules,
detected_metabolites = temp_detected_metabolites,
hidden_metabolites = temp_hidden_metabolites,
sub_metabolic_network = temp_sub_metabolic_network
)
# Store the computed activity score in the list
null_activity_score[[i]] <- temp_activity_score
}
return(null_activity_score)
}
#####g
#' Group Retention Time (RT) Peaks
#'
#' This function groups retention time (RT) values based on a specified tolerance,
#' clustering peaks that fall within the same RT range.
#'
#' @param rt A numeric vector of retention times.
#' @param rt.tol Numeric; retention time tolerance for grouping. Default is `10`.
#'
#' @return A data frame with two columns:
#' - `rt`: The retention times of grouped peaks.
#' - `class`: The assigned group ID.
#'
#' @export
#'
group_peaks_rt <-
function(rt, rt.tol = 10) {
# Identify clusters of retention times within the specified tolerance
rt_class <- lapply(rt, function(x) {
which(rt >= x & rt < x + rt.tol)
})
# Ensure each peak is uniquely assigned to a single class
rt_class <- lapply(seq_along(rt_class)[-1], function(i) {
setdiff(rt_class[[i]], unique(unlist(rt_class[1:(i - 1)])))
}) %>% c(rt_class[1], .)
# Remove empty groups
rt_class <- rt_class[which(lapply(rt_class, length) != 0)]
# Assign group names
names(rt_class) <- seq_along(rt_class)
# Convert grouping information into a data frame
rt_class <- purrr::map2(
.x = rt_class,
.y = names(rt_class),
.f = function(x, y) {
data.frame(rt = rt[x],
class = y,
stringsAsFactors = FALSE)
}
) %>% do.call(rbind, .)
rownames(rt_class) <- NULL
return(rt_class)
}
#' Score a Peak Group Based on Adduct and Polarity Information
#'
#' This function assigns a score to a peak group based on the detected adduct types,
#' polarity, and isotopic information. Higher scores indicate a higher likelihood
#' of correct identification.
#'
#' @param peak_group A data frame containing peak group information, including `Adduct`, `polarity`, and `isotope` columns.
#'
#' @return A numeric score representing the confidence of the peak group assignment.
#'
#' @details
#' The scoring system is based on the following criteria:
#' - `+50` if the positive adduct is `(M+H)+`.
#' - `+20` if `(M+H)+` is present and is not the `[M]` isotope.
#' - `+20` if the polarity is positive but the adduct is not `(M+H)+`.
#' - `+10` if the polarity is positive, the adduct is not `(M+H)+`, and the isotope is not `[M]`.
#' - `+50` if the negative adduct is `(M-H)-`.
#' - `+20` if `(M-H)-` is present and is not the `[M]` isotope.
#' - `+20` if the polarity is negative but the adduct is not `(M-H)-`.
#' - `+10` if the polarity is negative, the adduct is not `(M-H)-`, and the isotope is not `[M]`.
#'
#' @export
#'
score_peak_group <-
function(peak_group) {
score <- 0
## This should be optimized in the future
# Positive mode scoring
if (any(peak_group$Adduct == "(M+H)+")) {
score <- score + 50
}
if (any(peak_group$Adduct == "(M+H)+" &
peak_group$isotope != "[M]")) {
score <- score + 20
}
if (any(peak_group$polarity == "positive" &
peak_group$Adduct != "(M+H)+")) {
score <- score + 20
}
if (any(
peak_group$polarity == "positive" &
peak_group$Adduct != "(M+H)+" &
peak_group$isotope != "[M]"
)) {
score <- score + 10
}
# Negative mode scoring
if (any(peak_group$Adduct == "(M-H)-")) {
score <- score + 50
}
if (any(peak_group$Adduct == "(M-H)-" &
peak_group$isotope != "[M]")) {
score <- score + 20
}
if (any(peak_group$polarity == "negative" &
peak_group$Adduct != "(M-H)-")) {
score <- score + 20
}
if (any(
peak_group$polarity == "negative" &
peak_group$Adduct != "(M-H)-" &
peak_group$isotope != "[M]"
)) {
score <- score + 10
}
return(score)
}
#' Calculate Redundancy in Metabolite Annotation
#'
#' This function calculates redundancy metrics in a metabolite annotation table.
#' It evaluates the number of distinct compound classes per compound and the number
#' of compounds assigned to each peak.
#'
#' @param annotation_table A data frame containing metabolite annotations, including `Lab.ID`, `compound_class`, and `variable_id`.
#'
#' @return A numeric vector of length two:
#' - The first value represents the average number of compound classes per compound (`redundancy1`).
#' - The second value represents the average number of compounds assigned to each peak (`redundancy2`).
#'
#' @export
#'
calculate_redundance <-
function(annotation_table) {
# Ensure the annotation table is in a consistent format
annotation_table <- dplyr::bind_rows(annotation_table)
# Redundancy 1: Average number of compound classes per compound
redundancy1 <- annotation_table %>%
dplyr::group_by(Lab.ID) %>%
dplyr::distinct(compound_class) %>%
dplyr::summarise(n = dplyr::n()) %>%
dplyr::pull(n) %>%
mean()
# Redundancy 2: Average number of compounds assigned to each peak
redundancy2 <- annotation_table %>%
dplyr::group_by(variable_id) %>%
dplyr::summarise(n = dplyr::n()) %>%
dplyr::pull(n) %>%
mean()
return(c(redundancy1, redundancy2))
}
#' Remove Redundant Annotations in Metabolite Data
#'
#' This function removes redundant metabolite annotations by filtering out low-confidence matches.
#' It iteratively refines the annotation table based on score thresholds.
#'
#' @param annotation_table A data frame containing metabolite annotations,
#' including `Lab.ID`, `variable_id`, and `score` columns.
#'
#' @return A filtered version of the annotation table with reduced redundancy.
#'
#' @details
#' The function follows these rules:
#' - If a compound has an annotation with a score > 80, it removes annotations with scores ≤ 20.
#' - If a peak has an annotation with a score > 80, it removes other annotations with scores ≤ 20.
#' - Iterates until redundancy stabilizes, ensuring that lower-quality annotations are removed.
#'
#' @export
#'
remove_redundancy <-
function(annotation_table) {
## Track redundancy change across iterations
redundancy_diff <- c(-1, -1)
while (any(redundancy_diff < 0)) {
# Calculate redundancy before filtering
before_redundancy <- calculate_redundance(annotation_table = annotation_table)
# Filter compound annotations: Remove low-scoring annotations if high-scoring ones exist
annotation_table <- annotation_table %>%
dplyr::group_by(Lab.ID) %>%
dplyr::filter(if (any(score > 80)) {
score > 20
} else {
score > 0
}) %>%
dplyr::ungroup()
# Filter peak annotations: Remove lower-scoring annotations when a high-scoring annotation exists
annotation_table <- annotation_table %>%
dplyr::group_by(variable_id) %>%
dplyr::filter(if (any(score > 80)) {
score > 20
} else {
score > 0
}) %>%
dplyr::ungroup()
# Calculate redundancy after filtering
after_redundancy <- calculate_redundance(annotation_table = annotation_table)
# Compute change in redundancy
redundancy_diff <- after_redundancy - before_redundancy
}
return(annotation_table)
}
#' Annotate Isotopes for a Given Molecular Formula
#'
#' This function identifies isotopic peaks for a given molecular formula and adduct,
#' matching them with observed peaks based on mass-to-charge ratio (m/z) and retention time (RT).
#'
#' @param formula Character. The molecular formula of the compound (e.g., `"C9H14N2O12P2"`).
#' @param adduct Character. The adduct ion type (e.g., `"M-H"`).
#' @param charge Numeric. The charge state of the ion. Default is `1`.
#' @param mz Numeric. The accurate mass-to-charge ratio (m/z) of the monoisotopic peak.
#' @param rt Numeric. The retention time (RT) of the compound in seconds.
#' @param peak.mz Numeric vector. Observed m/z values of peaks in the dataset.
#' @param peak.rt Numeric vector. Observed retention times (RT) of peaks in the dataset.
#' @param rt.tol Numeric. The retention time tolerance (in seconds) for matching peaks. Default is `5`.
#' @param mz.tol Numeric. The m/z tolerance (in ppm) for matching peaks. Default is `15`.
#' @param max.isotope Numeric. The maximum number of isotopes to consider. Default is `4`.
#'
#' @return A data frame containing:
#' \item{peakIndex}{Index of the matched peak in the dataset.}
#' \item{mzError.ppm}{Mass error (in ppm) between the observed and theoretical isotope peak.}
#' \item{isotopes}{Label of the identified isotope (e.g., `[M+1]`, `[M+2]`).}
#' \item{rtError.s}{Retention time error (in seconds).}
#'
#' If no isotopic peaks are matched within the given tolerances, the function returns `NULL`.
#'
#' @details
#' The function calculates the theoretical isotope distribution using `Rdisop::getMolecule()`
#' and then matches observed peaks based on m/z and RT tolerances. The function ensures that
#' at least the `[M+1]` isotope is present for a valid match.
#'
#' @importFrom masstools sum_formula
#' @importFrom Rdisop getMolecule getIsotope
#' @importFrom purrr map
#' @importFrom dplyr mutate
#' @export
annotate_isotope <-
function(formula = "C9H14N2O12P2",
adduct = "M-H",
charge = 1,
mz = 402.9998,
rt = 823.1462,
## peak information
peak.mz,
peak.rt,
## other parameters
rt.tol = 5,
mz.tol = 15,
max.isotope = 4) {
formula1 <- masstools::sum_formula(formula = formula, adduct = adduct)
###should be fix latter
if (is.na(formula1)) {
formula1 <- formula
}
molecule <- Rdisop::getMolecule(formula = formula1, # z = charge,
maxisotopes = max.isotope + 1)
isotopes <- t(Rdisop::getIsotope(molecule = molecule))
rownames(isotopes) <-
c("[M]", paste("[M", "+", c(1:(nrow(
isotopes
) - 1)), "]", sep = ""))
isotopes <- data.frame(isotopes, rownames(isotopes), stringsAsFactors = FALSE)
colnames(isotopes) <- c("mz", "intensity", "isotope")
accurate.mz <- mz
isotopes <- isotopes[-1, , drop = FALSE]
###rt filtering
rt.error <- abs(rt - peak.rt)
index1 <- which(rt.error <= rt.tol)
if (length(index1) == 0)
return(NULL)
iso.info <-
purrr::map(
as.data.frame(t(isotopes)),
.f = function(x) {
temp.mz <- as.numeric(x[1])
## calculate error
peak.mz.error <-
abs(temp.mz - peak.mz) * 10^6 / ifelse(temp.mz >= 400, temp.mz, 400)
##has peak matched the mz tolerance
idx <- which(peak.mz.error <= mz.tol & rt.error < rt.tol)
###idx=0, no peaks matched
if (length(idx) == 0) {
# return(c(NA, NA, NA))
return(NULL)
}
## more than one matched, see the intensity ratio error
if (length(idx) > 0) {
idx <- idx[which.min(peak.mz.error[idx])]
return(c(idx, peak.mz.error[idx], x[3]))
}
}
) %>%
do.call(rbind, .) %>%
as.data.frame()
if (nrow(iso.info) == 0) {
return(NULL)
}
if (!"[M+1]" %in% iso.info$V3) {
return(NULL)
}
colnames(iso.info) <- c("peakIndex", "mzError.ppm", "isotopes")
if (!"[M+2]" %in% iso.info$isotopes) {
iso.info <- iso.info[1, , drop = FALSE]
}
iso.info$`rtError.s` <-
rt.error[as.numeric(iso.info$peakIndex)]
iso.info <-
iso.info %>%
dplyr::mutate(
peakIndex = as.numeric(peakIndex),
mzError.ppm = as.numeric(mzError.ppm),
rtError.s = as.numeric(rtError.s)
)
if (!"[M+2]" %in% iso.info$isotopes) {
iso.info <- iso.info[1, ]
}
# rm(list = c("peak.mz", "peak.rt", "peak.int", "cor"))
# gc()
return(iso.info)
}
#' Plot a Metabolic Module from FPA Results
#'
#' This function visualizes a selected metabolic module from the results of a
#' Functional Pathway Analysis (FPA), displaying interactions between metabolites
#' and features in the metabolic network.
#'
#' @param fpa_result List. The output of the Functional Pathway Analysis (FPA),
#' containing `dysregulated_metabolic_module`, `dysregulated_metabolic_network`,
#' and `annotation_table`.
#' @param feature_table_marker Data frame. The feature table containing metabolic
#' markers. Default is `feature_table_marker`.
#' @param include_feature Logical. Whether to include detected metabolic features
#' in the plot. Default is `FALSE`.
#' @param include_hidden_metabolites Logical. Whether to include hidden metabolites
#' in the plot. Default is `FALSE`.
#' @param add_compound_name Logical. Whether to add compound names as labels in
#' the visualization. Default is `TRUE`.
#' @param metabolic_module_index Numeric. The index of the metabolic module to
#' visualize. Default is `1`.
#' @param layout The layout of the network, such as `kk` or `fr`.
#' @param add_pathways Add pathways beside of the network or not. Default is `FALSE`.
#'
#' @return A `ggplot2` object representing the metabolic module network.
#'
#' @details
#' The function extracts and filters a metabolic subnetwork based on the selected
#' module index. It then uses `ggraph` to visualize the metabolic interactions
#' while allowing customization, such as including metabolic features or hidden
#' metabolites.
#'
#' The network layout follows the Fruchterman-Reingold force-directed algorithm
#' (`layout = "fr"`). Nodes represent metabolites and features, while edges
#' represent different types of metabolic interactions.
#'
#' @importFrom stringr str_split
#' @importFrom dplyr filter pull mutate
#' @importFrom tidygraph activate centrality_degree
#' @importFrom ggraph ggraph geom_edge_arc geom_node_point geom_node_text theme_graph
#' @importFrom ggraph scale_edge_color_manual
#' @export
plot_metabolic_module_fpa <-
function(fpa_result,
feature_table_marker = feature_table_marker,
include_feature = FALSE,
include_hidden_metabolites = FALSE,
add_compound_name = TRUE,
metabolic_module_index = 1,
layout = "fr",
add_pathways = FALSE) {
dysregulated_metabolic_module <-
fpa_result$dysregulated_metabolic_module
dysregulated_metabolic_network <-
fpa_result$dysregulated_metabolic_network
annotation_table <-
fpa_result$annotation_table
remain_compound <-
dysregulated_metabolic_module$Total_metabolite_id[metabolic_module_index] %>%
stringr::str_split(pattern = "\\{\\}") %>%
unlist() %>%
unique()
remain_feature <-
annotation_table %>%
dplyr::filter(KEGG.ID %in% remain_compound) %>%
dplyr::pull(variable_id) %>%
unique()
remain_feature <-
remain_feature[remain_feature %in% igraph::V(dysregulated_metabolic_network)$name]
temp_graph <-
dysregulated_metabolic_network %>%
dplyr::filter(name %in% c(remain_compound, remain_feature)) %>%
tidygraph::activate("nodes") %>%
dplyr::mutate(degree = tidygraph::centrality_degree()) %>%
tidygraph::activate("nodes") %>%
dplyr::filter(degree > 0)
if (!include_feature) {
temp_graph <-
temp_graph %>%
tidygraph::activate("nodes") %>%
dplyr::filter(node_class != "Feature")
}
if (!include_hidden_metabolites) {
temp_graph <-
temp_graph %>%
tidygraph::activate("nodes") %>%
dplyr::filter(node_class != "Hidden")
}
plot <-
temp_graph %>%
ggraph::ggraph(layout = layout) +
ggraph::geom_edge_arc(strength = 0.1, aes(color = edge_class)) +
ggraph::scale_edge_color_manual(values = c(
"feature_metabolite" = "#eaeaea",
"metabolic_reaction" = "#6e8da9"
)) +
ggraph::geom_node_point(aes(
color = node_class,
shape = node_class,
size = degree
)) +
scale_color_manual(values = c(
"Feature" = "#ffca3a",
"Detected" = "#ff595e",
"Hidden" = "#6e8da9"
)) +
ggraph::theme_graph()
if (add_compound_name) {
plot <- plot +
ggraph::geom_node_text(aes(label = Compound_name), repel = TRUE)
}
Total_metabolite_number <-
fpa_result$dysregulated_metabolic_module$Total_metabolite_number[metabolic_module_index]
Impact <-
fpa_result$dysregulated_metabolic_module$Impact[metabolic_module_index]
Dominant_edge_rate <-
fpa_result$dysregulated_metabolic_module$Dominant_edge_rate[metabolic_module_index]
p_value <-
fpa_result$dysregulated_metabolic_module$p_value[metabolic_module_index]
Detected_metabolite_number <-
fpa_result$dysregulated_metabolic_module$Detected_metabolite_number[metabolic_module_index]
plot <-
plot +
labs(
title = fpa_result$dysregulated_metabolic_module$Name[metabolic_module_index],
subtitle = paste0(
"Total Metabolite Number: ",
Total_metabolite_number,
"\n",
"Detected Metabolite Number: ",
Detected_metabolite_number,
"\n",
"Impact: ",
Impact,
"\n",
"Dominant Edge Rate: ",
round(Dominant_edge_rate, 3),
"\n",
"P-value: ",
p_value
)
)
if (add_pathways) {
pathway <-
fpa_result$enriched_pathways_list[[metabolic_module_index]]
if (is.null(pathway)) {
plot_pathway <-
ggplot() +
theme_bw() +
labs(title = "No enriched pathways")
} else{
plot_pathway <-
enrich_scatter_plot(object = pathway, label = TRUE)
}
plot <-
plot + patchwork::wrap_plots(plot_pathway, nrow = 1)
}
plot
}
#' Plot the Dysregulated Metabolic Network from FPA Results
#'
#' This function visualizes the dysregulated metabolic network obtained from Functional Pathway Analysis (FPA),
#' highlighting interactions between metabolites and features in the network.
#'
#' @param fpa_result List. The output of the Functional Pathway Analysis (FPA),
#' containing `dysregulated_metabolic_module`, `dysregulated_metabolic_network`,
#' and `annotation_table`.
#' @param feature_table_marker Data frame. The feature table containing metabolic markers.
#' @param include_feature Logical. Whether to include detected metabolic features in the plot.
#' Default is `FALSE`.
#' @param node_color_by_module Logical. When doesn't include features (include_feature = FALSE),
#' whether set the colors of the nodes by metabolics modules.
#' @param include_hidden_metabolites Logical. Whether to include hidden metabolites in the plot.
#' Default is `FALSE`.
#' @param add_compound_name Logical. Whether to add compound names as labels in
#' the visualization. Default is `TRUE`.
#' @param layout The layout of the network, such as `kk` or `fr`.
#'
#' @return A `ggplot2` object representing the metabolic network.
#'
#' @details
#' The function processes the metabolic network by filtering and selecting relevant nodes and edges.
#' It allows the inclusion of metabolic features and hidden metabolites based on user preference.
#' The network is visualized using the Fruchterman-Reingold force-directed algorithm (`layout = "fr"`)
#' to clearly illustrate metabolic interactions.
#'
#' @importFrom tidygraph activate centrality_degree
#' @importFrom dplyr mutate filter
#' @importFrom ggraph ggraph geom_edge_arc geom_node_point geom_node_text theme_graph
#' @importFrom ggplot2 scale_color_manual
#' @importFrom grDevices colorRampPalette
#' @importFrom stats quantile
#' @export
#'
#' @examples
#' \dontrun{
#' # Assume `fpa_result` is a precomputed list from an FPA analysis
#' plot_metabolic_network_fpa(
#' fpa_result = fpa_result,
#' feature_table_marker = feature_table_marker,
#' include_feature = TRUE,
#' include_hidden_metabolites = FALSE,
#' add_compound_name = TRUE
#' )
#' }
plot_metabolic_network_fpa <-
function(fpa_result,
feature_table_marker,
include_feature = FALSE,
node_color_by_module = FALSE,
include_hidden_metabolites = FALSE,
add_compound_name = TRUE,
layout = "fr") {
dysregulated_metabolic_module <-
fpa_result$dysregulated_metabolic_module
dysregulated_metabolic_network <-
fpa_result$dysregulated_metabolic_network
annotation_table <-
fpa_result$annotation_table
temp_graph <-
dysregulated_metabolic_network %>%
tidygraph::activate("nodes") %>%
dplyr::mutate(degree = tidygraph::centrality_degree())
if (!include_feature) {
temp_graph <-
temp_graph %>%
tidygraph::activate("nodes") %>%
dplyr::filter(node_class != "Feature")
if (node_color_by_module) {
temp_data <-
seq_len(nrow(dysregulated_metabolic_module)) %>%
purrr::map(function(i) {
data.frame(
module = dysregulated_metabolic_module$Name[i],
KEGG.ID = stringr::str_split(
dysregulated_metabolic_module$Total_metabolite_id[i],
"\\{\\}"
)[[1]]
) %>%
dplyr::distinct(KEGG.ID, .keep_all = TRUE)
}) %>%
do.call(rbind, .) %>%
as.data.frame() %>%
dplyr::mutate(module = factor(module, levels = stringr::str_sort(unique(module), numeric = TRUE)))
temp_graph <-
temp_graph %>%
tidygraph::activate("nodes") %>%
dplyr::left_join(temp_data, by = c("name" = "KEGG.ID"))
}
}
if (!include_hidden_metabolites) {
temp_graph <-
temp_graph %>%
tidygraph::activate("nodes") %>%
dplyr::filter(node_class != "Hidden")
}
if (!include_feature & node_color_by_module) {
plot <-
temp_graph %>%
ggraph::ggraph(layout = layout) +
ggraph::geom_edge_arc(strength = 0.1, aes(color = edge_class)) +
ggraph::scale_edge_color_manual(values = c(
"feature_metabolite" = "#eaeaea",
"metabolic_reaction" = "#6e8da9"
)) +
ggraph::geom_node_point(aes(
color = module,
shape = node_class,
size = degree
)) +
scale_color_manual(values =
colorRampPalette(ggsci::pal_aaas("default")(10))(length(unique(
as.character(
tidygraph::as_tibble(temp_graph, active = "nodes") %>% pull(module)
)
)))) +
ggraph::theme_graph()
} else{
plot <-
temp_graph %>%
ggraph::ggraph(layout = layout) +
ggraph::geom_edge_arc(strength = 0.1, aes(color = edge_class)) +
ggraph::scale_edge_color_manual(values = c(
"feature_metabolite" = "#eaeaea",
"metabolic_reaction" = "#6e8da9"
)) +
ggraph::geom_node_point(aes(
color = node_class,
shape = node_class,
size = degree
)) +
scale_color_manual(values = c(
"Feature" = "#ffca3a",
"Detected" = "#ff595e",
"Hidden" = "#6e8da9"
)) +
ggraph::theme_graph()
}
if (add_compound_name) {
plot <- plot +
ggraph::geom_node_text(aes(label = Compound_name), repel = TRUE)
}
plot
}
#' Calculate Node Centrality in a Graph
#'
#' This function calculates the centrality of nodes in a given network graph based on either **degree centrality** or **betweenness centrality**.
#'
#' @param graph An `igraph` object representing the network.
#' @param type A character string specifying the type of centrality to compute. Options are `"degree"` (default) or `"betweenness"`.
#'
#' @return A named numeric vector where names represent node IDs and values represent centrality scores.
#'
#' @details
#' - **Degree centrality** measures the number of direct connections a node has.
#' - **Betweenness centrality** measures the number of shortest paths passing through a node, indicating its role as a bridge in the network.
#' - The function uses `igraph::degree()` for degree centrality and a custom approach for betweenness centrality using `igraph::shortest_paths()`.
#'
#' @importFrom igraph V degree shortest_paths
#'
#' @examples
#' \dontrun{
#' library(igraph)
#'
#' # Create a sample graph
#' g <- make_ring(10)
#' V(g)$name <- paste0("Node", 1:10)
#'
#' # Calculate degree centrality
#' degree_centrality <- calculate_centrality(g, type = "degree")
#' print(degree_centrality)
#'
#' # Calculate betweenness centrality
#' betweenness_centrality <- calculate_centrality(g, type = "betweenness")
#' print(betweenness_centrality)
#' }
#'
#' @export
calculate_centrality <-
function(graph, type = c("degree", "betweenness")) {
type = match.arg(type)
node <- igraph::V(graph)$name
if (type == "degree") {
node.degree <- igraph::degree(graph = graph, v = node)
return(node.degree)
}
node.betweenness <- lapply(node, function(x) {
i <- match(x, node)
if (i == length(node))
return(NULL)
from <- x
to <- node[(i + 1):length(node)]
temp <- igraph::shortest_paths(graph = graph,
from = from,
to = to)[[1]]
temp <- lapply(temp, function(x) {
if (length(x) <= 2)
return(NULL)
names(x)[-c(1, length(x))]
})
unlist(temp)
})
node.betweenness <- unlist(node.betweenness)
node.betweenness <- table(node.betweenness)
node0 <- setdiff(node, names(node.betweenness))
if (length(node0) == 0)
return(node.betweenness)
add <- rep(0, length(node0))
names(add) <- node0
node.betweenness <- c(node.betweenness, add)
node.betweenness <- node.betweenness[match(node, names(node.betweenness))]
return(node.betweenness)
}
#' Calculate Activity Score for Metabolic Modules
#'
#' This function calculates an activity score for each metabolic module based on its detected and hidden metabolites,
#' as well as its structure within a sub-metabolic network. The calculation incorporates modularity and degree-based
#' weighting to assess the significance of the module.
#'
#' @param metabolic_modules A list of metabolic modules, where each module is represented as a vector of metabolite IDs.
#' @param detected_metabolites A character vector of detected metabolite IDs.
#' @param hidden_metabolites A character vector of hidden metabolite IDs.
#' @param sub_metabolic_network An `igraph` object representing the sub-metabolic network.
#' @param threads An integer specifying the number of threads to use for parallel computation. Default is 3.
#'
#' @return A numeric vector where each value represents the activity score of a corresponding metabolic module.
#'
#' @details
#' The activity score is computed by assessing the modularity of each metabolic module within the sub-metabolic network.
#' The function extracts the subgraph corresponding to each module and computes a modularity-based score adjusted for the
#' total number of metabolites in the dataset.
#'
#' The computation is performed in parallel using `BiocParallel`, selecting either `MulticoreParam` (for Unix-based systems)
#' or `SnowParam` (for Windows) to handle multi-threaded execution efficiently.
#'
#' @import igraph
#' @importFrom BiocParallel bplapply MulticoreParam SnowParam
#' @importFrom masstools get_os
#'
#' @examples
#' \dontrun{
#' library(igraph)
#'
#' # Example metabolic network
#' g <- make_ring(10)
#' V(g)$name <- paste0("M", 1:10)
#'
#' # Example input data
#' metabolic_modules <- list(
#' c("M1", "M2", "M3"),
#' c("M4", "M5", "M6", "M7")
#' )
#' detected_metabolites <- c("M1", "M3", "M5", "M7")
#' hidden_metabolites <- c("M2", "M6")
#'
#' # Compute activity scores
#' scores <- calculate_activity_score(
#' metabolic_modules = metabolic_modules,
#' detected_metabolites = detected_metabolites,
#' hidden_metabolites = hidden_metabolites,
#' sub_metabolic_network = g,
#' threads = 2
#' )
#'
#' print(scores)
#' }
#'
#' @export
calculate_activity_socre <-
function(metabolic_modules,
detected_metabolites,
hidden_metabolites,
sub_metabolic_network,
threads = 3) {
temp_fun_activity_scores <-
function(x,
detected_metabolites,
hidden_metabolites,
total_metabolite_number,
sub_metabolic_network = sub_metabolic_network) {
requireNamespace("igraph", quietly = TRUE)
# options(warn = -1)
temp_graph <- igraph::subgraph(graph = sub_metabolic_network, v = x)
detected_number <- sum(x %in% detected_metabolites)
hidden_number <- sum(x %in% hidden_metabolites)
m <- length(igraph::E(sub_metabolic_network))
e <- length(igraph::E(temp_graph))
node_name <- names(igraph::V(temp_graph))
value <- sapply(node_name, function(node) {
temp_value <- sapply(node_name, function(x) {
temp1 <- igraph::degree(graph = sub_metabolic_network, v = x)
temp2 <- igraph::degree(graph = sub_metabolic_network, v = node)
temp1 * temp2 / (4 * m^2)
})
sum(temp_value)
})
modularity <- e / m - sum(value)
# modularity
q <- modularity * sqrt(total_metabolite_number / length(x))
a <- detected_number * q / length(x)
a
}
if (masstools::get_os() == "windows") {
bpparam = BiocParallel::SnowParam(workers = threads, progressbar = TRUE)
} else{
bpparam = BiocParallel::MulticoreParam(workers = threads, progressbar = TRUE)
}
activity_score <-
BiocParallel::bplapply(
metabolic_modules,
temp_fun_activity_scores,
BPPARAM = bpparam,
detected_metabolites = detected_metabolites,
hidden_metabolites = hidden_metabolites,
total_metabolite_number = length(detected_metabolites),
sub_metabolic_network = sub_metabolic_network
)
activity_score <- unlist(activity_score)
return(activity_score)
}
# Function to compute module quality metrics
calculate_module_quality <-
function(graph, nodes) {
temp_graph <-
igraph::induced_subgraph(graph, vids = nodes)
internal_edges <- igraph::gsize(temp_graph)
external_edges <- sum(igraph::degree(graph, v = nodes)) - (2 * internal_edges)
# Conductance
conductance <- ifelse((internal_edges + external_edges) == 0,
0,
external_edges / (internal_edges + external_edges)
)
# Internal Density
num_nodes <- length(nodes)
internal_density <-
ifelse(num_nodes < 2, 0, (2 * internal_edges) / (num_nodes * (num_nodes - 1)))
# Edge Density Ratio (EDR)
edge_density_ratio <-
ifelse(external_edges == 0, 1, internal_edges / external_edges)
# Return metrics
return(
data.frame(
module_size = num_nodes,
internal_edges = internal_edges,
external_edges = external_edges,
conductance = conductance,
internal_density = internal_density,
edge_density_ratio = edge_density_ratio
)
)
}
generate_null_conductance_distribution <-
function(annotation_table_all,
feature_table_marker,
metabolite_database,
metabolic_network,
permutation_times = 5,
threads = 8,
adduct.table = NULL,
column = c("rp", "hilic"),
ms1.match.ppm = 25,
mz.ppm.thr = 400,
include_hidden_metabolites = FALSE) {
column <- match.arg(column) # Ensure valid column type
# Initialize list to store activity scores for each permutation
null_conductance <- vector(mode = "list", length = permutation_times)
marker_number <- nrow(feature_table_marker) # Number of markers
for (i in seq_len(permutation_times)) {
cat(i, " ")
# Randomly sample markers for permutation
if (nrow(feature_table_marker) / length(unique(annotation_table_all$variable_id)) < 0.2) {
temp_marker_variable_id <- sample(
x = setdiff(
unique(annotation_table_all$variable_id),
feature_table_marker$variable_id
),
size = marker_number,
replace = FALSE
)
} else{
temp_marker_variable_id <- sample(
x = unique(annotation_table_all$variable_id),
size = marker_number,
replace = FALSE
)
}
# Extract annotation information for sampled markers
temp_annotation_table_marker <- annotation_table_all %>%
dplyr::filter(variable_id %in% temp_marker_variable_id)
# Extract detected metabolites from annotation table
temp_detected_metabolites <- unique(temp_annotation_table_marker$KEGG.ID)
# Identify hidden metabolites connected to detected metabolites
if (include_hidden_metabolites) {
temp_hidden_metabolites <- (
metabolic_network = metabolic_network,
detected_metabolites = temp_detected_metabolites,
threads = threads,
max.reaction = 3
) %>% unlist() %>% unique()
} else {
temp_hidden_metabolites <- NULL
}
temp_total_metabolites <-
c(temp_detected_metabolites, temp_hidden_metabolites)
# Extract subnetwork containing detected and hidden metabolites
temp_sub_metabolic_network <- metabolic_network %>%
tidygraph::activate("nodes") %>%
dplyr::filter(name %in% temp_total_metabolites) %>%
dplyr::mutate(node_class = ifelse(name %in% temp_detected_metabolites, "Detected", "Hidden")) %>%
tidygraph::mutate(degree2 = tidygraph::centrality_degree())
# Identify metabolic modules in the subnetwork
temp_metabolic_modules <- identify_metabolic_modules(
sub_metabolic_network = temp_sub_metabolic_network,
detected_metabolites = temp_detected_metabolites,
hidden_metabolites = temp_hidden_metabolites
)
# Calculate activity scores for the identified metabolic modules
temp_conductance <-
temp_metabolic_modules %>%
purrr::map(function(x) {
calculate_module_quality(graph = temp_sub_metabolic_network, nodes = x)$conductance
}) %>%
unlist()
# Store the computed activity score in the list
null_conductance[[i]] <- temp_conductance
}
return(null_conductance)
}
check_feature_table_all <-
function(feature_table_all) {
####feature_table_all
# 1. Check if it's a data frame
if (!is.data.frame(feature_table_all)) {
stop("Error: feature_table_all must be a data frame.")
}
# 2. Check column names
expected_colnames <- c("variable_id", "mz", "rt", "polarity")
if (!identical(colnames(feature_table_all), expected_colnames)) {
stop("Error: Column names must be exactly: 'variable_id', 'mz', 'rt', 'polarity'.")
}
# 3. Check if variable_id is unique
if (anyDuplicated(feature_table_all$variable_id) > 0) {
stop("Error: 'variable_id' column must have unique values.")
}
# 4. Check if mz is numeric and has no NA
if (!is.numeric(feature_table_all$mz) ||
any(is.na(feature_table_all$mz))) {
stop("Error: 'mz' column must be numeric and contain no missing values.")
}
# 5. Check if rt is numeric and has no NA
if (!is.numeric(feature_table_all$rt) ||
any(is.na(feature_table_all$rt))) {
stop("Error: 'rt' column must be numeric and contain no missing values.")
}
# 6. Check if rt is in seconds (not in minutes)
max_rt <- max(feature_table_all$rt, na.rm = TRUE) # Get the max retention time
if (max_rt < 100) {
# If max RT is too small, it might be in minutes
stop("Error: 'rt' values seem to be in minutes, but they should be in seconds.")
}
# 7. Check polarity column
valid_polarity <- c("positive", "negative")
if (any(is.na(feature_table_all$polarity))) {
stop("Error: 'polarity' column must not contain NA values.")
}
if (any(grepl("\\s", feature_table_all$polarity))) {
stop("Error: 'polarity' column must not contain spaces.")
}
if (!all(feature_table_all$polarity %in% valid_polarity)) {
stop("Error: 'polarity' must only contain 'positive' or 'negative'.")
}
# If all checks pass
message("feature_table_all is correct.\n")
}
check_feature_table_marker <-
function(feature_table_marker) {
# 1. Check if it's a data frame
if (!is.data.frame(feature_table_marker)) {
stop("Error: feature_table_marker must be a data frame.")
}
# 2. Check column names
expected_colnames <- c("variable_id", "mz", "rt", "degree", "p_value", "polarity")
if (!identical(colnames(feature_table_marker), expected_colnames)) {
stop(
"Error: Column names must be exactly: 'variable_id', 'mz', 'rt', 'degree', 'p_value', 'polarity'."
)
}
# 3. Check if variable_id is unique
if (anyDuplicated(feature_table_marker$variable_id) > 0) {
stop("Error: 'variable_id' column must have unique values.")
}
# 4. Check if mz is numeric and has no NA
if (!is.numeric(feature_table_marker$mz) ||
any(is.na(feature_table_marker$mz))) {
stop("Error: 'mz' column must be numeric and contain no missing values.")
}
# 5. Check if rt is numeric, has no NA, and represents time in seconds
if (!is.numeric(feature_table_marker$rt) ||
any(is.na(feature_table_marker$rt))) {
stop("Error: 'rt' column must be numeric and contain no missing values.")
}
# 6. Ensure rt is in seconds (not minutes)
max_rt <- max(feature_table_marker$rt, na.rm = TRUE) # Get the max retention time
if (max_rt < 100) {
# If max RT is too small, it might be in minutes
stop("Error: 'rt' values seem to be in minutes, but they should be in seconds.")
}
# 7. Check if degree is numeric and has no NA
if (!is.numeric(feature_table_marker$degree) ||
any(is.na(feature_table_marker$degree))) {
stop("Error: 'degree' column must be numeric and contain no missing values.")
}
# 8. Check if p_value is numeric and has no NA
if (!is.numeric(feature_table_marker$p_value) ||
any(is.na(feature_table_marker$p_value))) {
stop("Error: 'p_value' column must be numeric and contain no missing values.")
}
# 9. Check polarity column
valid_polarity <- c("positive", "negative")
if (any(is.na(feature_table_marker$polarity))) {
stop("Error: 'polarity' column must not contain NA values.")
}
if (any(grepl("\\s", feature_table_marker$polarity))) {
stop("Error: 'polarity' column must not contain spaces.")
}
if (!all(feature_table_marker$polarity %in% valid_polarity)) {
stop("Error: 'polarity' must only contain 'positive' or 'negative'.")
}
# If all checks pass
message("feature_table_marker is correct.")
}
# Example usage:
# check_feature_table_marker(feature_table_marker)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.