knitr::opts_chunk$set(echo = TRUE)
library(BiocSet)
library(tidyverse)
library(here)
library(stringdist)
library(phyloseq)
library(ggcorrplot)
source(here("R", "concordance_functions.R"))
set.seed(160497)

First, load some sets

sets <- list(
    readRDS(here("output", "sets", "set_pathways_species.rds")), 
    readRDS(here("output", "sets", "set_substrate_species.rds"))
)

metadata <- readRDS(file = "../metadata/metacyc_parse.rds")
metadata[1:5]

Pre-process names

names(metadata) <- map(names(metadata), ~str_remove_all(.x, pattern = "\\|") %>% 
                           str_replace_all("-", "_") %>% tolower())
metadata <- map(metadata, ~str_remove_all(.x, pattern = "\\|"))
metadata[1:5]
substrate_sets <- sets[[2]] %>% es_set() %>% pull(set) 
pathway_sets <- sets[[1]] %>% es_set() %>% pull(set)

query_names <- names(metadata) %>% 
    str_remove_all(pattern = "_(syn|metabolism|biosynthesis|degredation|deg)")
compare_mat <- stringdistmatrix(a = substrate_sets, b = query_names, method = "lv")

match_substrates <- which(compare_mat == 0, arr.ind = TRUE) %>% as.data.frame() %>% 
    mutate(set = substrate_sets[row], db = names(metadata)[col])

compare_mat <- stringdistmatrix(a = pathway_sets, b = query_names, method = "lv")

match_pathways <- which(compare_mat == 0, arr.ind = TRUE) %>% as.data.frame() %>% 
    mutate(set = pathway_sets[row], db = names(metadata)[col])

p_ids <- match_pathways %>% mutate(set = paste(set, "pathways", sep = "_")) 
s_ids <- match_substrates %>% mutate(set = paste(set, "substrate", sep = "_")) 

Let's load the crc data set

trait_data <- read.csv(file = here("data", "trait_crc_wgs_feat.csv"))
path_data <- read.csv(file = here("data", "pred_pathway_crc_feat.csv"), row.names = 1)
# <- path_data %>% select(-which(colSums(path_data) <= 0.001))
met <- read.csv(file = here("data", "pred_pathway_crc_metadata.csv"), row.names = 1)
colnames(trait_data) <- colnames(trait_data) %>% str_replace("X", "") %>% str_replace("\\.", "_")
trait_data <- trait_data %>% dplyr::select(sample_ids, ends_with(c("pathways", "substrate"))) %>% 
    column_to_rownames("sample_ids")

# making sure that rownames match
trait_data <- trait_data[rownames(met),]
path_data <- path_data[rownames(met),]

names_list <- colnames(path_data) %>% str_split("\\.\\.")
path_names <- map_chr(names_list, ~.x[1] %>% str_replace_all("\\.", "-"))
annotations <- map_chr(names_list, ~.x[-1] %>% paste(collapse = "_") %>% 
                           str_replace_all("\\.", "_"))
colnames(path_data) <- path_names

Let's perform proportion assessment for CRC data

pathway_assess_crc <- assess_traits(trait_data, path_data, met, 
                                    ref_df = p_ids, o_var = "study_condition", 
                                    p_class = "CRC", annotation = "pathway", metadata = metadata)

substrate_assess_crc <- assess_traits(trait_data, path_data, met, 
                                      ref_df = s_ids, o_var = "study_condition", 
                                      p_class = "CRC", annotation = "substrate", metadata = metadata)

Let's perform spearman correlation assessment for CRC data

annotation_df <- read_delim(file = "https://github.com/picrust/picrust2/raw/master/picrust2/default_files/description_mapfiles/metacyc_pathways_info.txt.gz", delim = "\t", col_names = FALSE)

colnames(annotation_df) <- c("pathway", "annotation")

pathway_cor_crc <- assess_correlation(trait_data, path_data, met, ref_df = p_ids, 
                                      annotation = "pathway", metadata = metadata)

substrate_cor_crc <- assess_correlation(trait_data, path_data, met, ref_df = s_ids, 
                                        metadata = metadata, annotation = "substrate")


ggcorrplot(corr_mat, lab = TRUE, method = "circle", show.legend = FALSE, 
           outline.color = "white", tl.cex = 5)

Let's do the same for IBD data

trait_data <- read.csv(file = here("data", "trait_ibd_wgs_feat.csv"))
path_data <- read.csv(file = here("data", "pred_pathway_ibd_feat.csv"), row.names = 1)
# <- path_data %>% select(-which(colSums(path_data) <= 0.001))
met <- read.csv(file = here("data", "pred_pathway_ibd_metadata.csv"), row.names = 1)
colnames(trait_data) <- colnames(trait_data) %>% str_replace("X", "") %>% str_replace("\\.", "_")
trait_data <- trait_data %>% dplyr::select(sample_ids, ends_with(c("pathways", "substrate"))) %>% 
    column_to_rownames("sample_ids")

# making sure that rownames match
trait_data <- trait_data[rownames(met),]
path_data <- path_data[rownames(met),]

names_list <- colnames(path_data) %>% str_split("\\.\\.")
path_names <- map_chr(names_list, ~.x[1] %>% str_replace_all("\\.", "-"))
annotations <- map_chr(names_list, ~.x[-1] %>% paste(collapse = "_") %>% 
                           str_replace_all("\\.", "_"))
colnames(path_data) <- path_names
pathway_assess_ibd <- assess_traits(trait_data, path_data, met, 
                                    ref_df = p_ids, o_var = "study_condition", 
                                    p_class = "IBD", annotation = "pathway", metadata = metadata)

substrate_assess_ibd <- assess_traits(trait_data, path_data, met, 
                                      ref_df = s_ids, o_var = "study_condition", 
                                      p_class = "IBD", annotation = "substrate", metadata = metadata)
proportion_assessment <- bind_rows(
    bind_rows(pathway_assess_crc, substrate_assess_crc) %>% mutate(condition = "CRC"),
    bind_rows(pathway_assess_ibd, substrate_assess_ibd) %>% mutate(condition = "IBD")
)
write.csv(proportion_assessment, file = here("output", "concordance", "proportion_assessment.csv"))


qpmnguyen/microbe_set_trait documentation built on April 11, 2024, 6:07 p.m.