knitr::opts_chunk$set(echo = TRUE)
library(BiocSet) library(tidyverse) library(here) library(stringdist) library(phyloseq) library(ggcorrplot) source(here("R", "concordance_functions.R")) set.seed(160497)
First, load some sets
sets <- list( readRDS(here("output", "sets", "set_pathways_species.rds")), readRDS(here("output", "sets", "set_substrate_species.rds")) ) metadata <- readRDS(file = "../metadata/metacyc_parse.rds") metadata[1:5]
Pre-process names
names(metadata) <- map(names(metadata), ~str_remove_all(.x, pattern = "\\|") %>% str_replace_all("-", "_") %>% tolower()) metadata <- map(metadata, ~str_remove_all(.x, pattern = "\\|")) metadata[1:5]
substrate_sets <- sets[[2]] %>% es_set() %>% pull(set) pathway_sets <- sets[[1]] %>% es_set() %>% pull(set) query_names <- names(metadata) %>% str_remove_all(pattern = "_(syn|metabolism|biosynthesis|degredation|deg)") compare_mat <- stringdistmatrix(a = substrate_sets, b = query_names, method = "lv") match_substrates <- which(compare_mat == 0, arr.ind = TRUE) %>% as.data.frame() %>% mutate(set = substrate_sets[row], db = names(metadata)[col]) compare_mat <- stringdistmatrix(a = pathway_sets, b = query_names, method = "lv") match_pathways <- which(compare_mat == 0, arr.ind = TRUE) %>% as.data.frame() %>% mutate(set = pathway_sets[row], db = names(metadata)[col]) p_ids <- match_pathways %>% mutate(set = paste(set, "pathways", sep = "_")) s_ids <- match_substrates %>% mutate(set = paste(set, "substrate", sep = "_"))
Let's load the crc data set
trait_data <- read.csv(file = here("data", "trait_crc_wgs_feat.csv")) path_data <- read.csv(file = here("data", "pred_pathway_crc_feat.csv"), row.names = 1) # <- path_data %>% select(-which(colSums(path_data) <= 0.001)) met <- read.csv(file = here("data", "pred_pathway_crc_metadata.csv"), row.names = 1) colnames(trait_data) <- colnames(trait_data) %>% str_replace("X", "") %>% str_replace("\\.", "_") trait_data <- trait_data %>% dplyr::select(sample_ids, ends_with(c("pathways", "substrate"))) %>% column_to_rownames("sample_ids") # making sure that rownames match trait_data <- trait_data[rownames(met),] path_data <- path_data[rownames(met),] names_list <- colnames(path_data) %>% str_split("\\.\\.") path_names <- map_chr(names_list, ~.x[1] %>% str_replace_all("\\.", "-")) annotations <- map_chr(names_list, ~.x[-1] %>% paste(collapse = "_") %>% str_replace_all("\\.", "_")) colnames(path_data) <- path_names
Let's perform proportion assessment for CRC data
pathway_assess_crc <- assess_traits(trait_data, path_data, met, ref_df = p_ids, o_var = "study_condition", p_class = "CRC", annotation = "pathway", metadata = metadata) substrate_assess_crc <- assess_traits(trait_data, path_data, met, ref_df = s_ids, o_var = "study_condition", p_class = "CRC", annotation = "substrate", metadata = metadata)
Let's perform spearman correlation assessment for CRC data
annotation_df <- read_delim(file = "https://github.com/picrust/picrust2/raw/master/picrust2/default_files/description_mapfiles/metacyc_pathways_info.txt.gz", delim = "\t", col_names = FALSE) colnames(annotation_df) <- c("pathway", "annotation") pathway_cor_crc <- assess_correlation(trait_data, path_data, met, ref_df = p_ids, annotation = "pathway", metadata = metadata) substrate_cor_crc <- assess_correlation(trait_data, path_data, met, ref_df = s_ids, metadata = metadata, annotation = "substrate") ggcorrplot(corr_mat, lab = TRUE, method = "circle", show.legend = FALSE, outline.color = "white", tl.cex = 5)
Let's do the same for IBD data
trait_data <- read.csv(file = here("data", "trait_ibd_wgs_feat.csv")) path_data <- read.csv(file = here("data", "pred_pathway_ibd_feat.csv"), row.names = 1) # <- path_data %>% select(-which(colSums(path_data) <= 0.001)) met <- read.csv(file = here("data", "pred_pathway_ibd_metadata.csv"), row.names = 1) colnames(trait_data) <- colnames(trait_data) %>% str_replace("X", "") %>% str_replace("\\.", "_") trait_data <- trait_data %>% dplyr::select(sample_ids, ends_with(c("pathways", "substrate"))) %>% column_to_rownames("sample_ids") # making sure that rownames match trait_data <- trait_data[rownames(met),] path_data <- path_data[rownames(met),] names_list <- colnames(path_data) %>% str_split("\\.\\.") path_names <- map_chr(names_list, ~.x[1] %>% str_replace_all("\\.", "-")) annotations <- map_chr(names_list, ~.x[-1] %>% paste(collapse = "_") %>% str_replace_all("\\.", "_")) colnames(path_data) <- path_names
pathway_assess_ibd <- assess_traits(trait_data, path_data, met, ref_df = p_ids, o_var = "study_condition", p_class = "IBD", annotation = "pathway", metadata = metadata) substrate_assess_ibd <- assess_traits(trait_data, path_data, met, ref_df = s_ids, o_var = "study_condition", p_class = "IBD", annotation = "substrate", metadata = metadata)
proportion_assessment <- bind_rows( bind_rows(pathway_assess_crc, substrate_assess_crc) %>% mutate(condition = "CRC"), bind_rows(pathway_assess_ibd, substrate_assess_ibd) %>% mutate(condition = "IBD") ) write.csv(proportion_assessment, file = here("output", "concordance", "proportion_assessment.csv"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.