library(DescTools)
hierarchichal clustering in: (x,y) coordinates out: clusters column added to initial df plots x,y coordinates and clusters
plot_cluster=function(data, var_cluster, point_size) { ggplot(data, aes_string(x="V1", y="V2", color=var_cluster)) + geom_point(size=point_size) + guides(colour=guide_legend(override.aes=list(size=6))) + xlab("") + ylab("") + ggtitle("") + theme_light(base_size=20) + theme(axis.text.x=element_blank(), axis.text.y=element_blank(), legend.direction = "horizontal", legend.position = "bottom", legend.box = "horizontal") } cluster_data <- function(coords, point_size = 10, h) { names(coords) = c("V1", "V2") fit_cluster_hierarchical=hclust(dist(scale(coords))) plot(fit_cluster_hierarchical) coords$clusters = factor(cutree(fit_cluster_hierarchical, h=h)) plot(plot_cluster(coords, coords$clusters, point_size)) return(coords) }
enrichment score in: x array (unordered) in or out matrix (ions_inpath) pathway of interest plot? out: res res$x_array = ion values (ordered) res$e_score = e score at that x value res$pmin = p min (p value comparing minimum to jumbled distribution) >> currently TRUE if value above 95% interval res$pmax = p max (p value comparing maximum to jumbled distribution) >> or belot 5% interval
escore <- function(data) { sum_in <- sum((data[data$in_path == 1,]$x_array)^2) sum_out <- sum((data[data$in_path == 0,]$x_array)^2) w <- sum_out / sum_in data$e_score = 0 amp <- 0 for (ion in 1:nrow(data)) { if (data$in_path[ion] == 1) { amp <- amp + (w * (data$x_array[ion])^2) } else if (data$in_path[ion] == 0) { amp <- amp - (data$x_array[ion])^2 } data$e_score[ion] <- amp } if (abs(min(data$e_score)) > max(data$e_score)) {es <- min(data$e_score)} else {es <- max(data$e_score)} res <- list("data" = data, "es" = es) return(res) } calc_ES <- function(x_array, pathway, ions_inpath, plot, cluster_description) { in_path <- ions_inpath[,grep(paste0(pathway, "$"), names(ions_inpath))] if (!is.integer(in_path)) { print("pathway not found") } else { data <- data.frame(x_array, in_path) data <- data[order(data$x_array),] copy <- data escore_data <- escore(data) data <- escore_data$data es <- escore_data$es data$x_step <- seq(2:(nrow(data)+1)) if (plot == TRUE) { par(mar = c(5,5,2,5)) plot(c(1,data$x_step), c(0,data$e_score), type = "l", ylab = "enrichment score", xlab = "index") abline(h = 0) par(new = T) plot(data$x_step, (data$x_array^2), xlab = NA, ylab = NA, axes = F, type = "l", col = "red") axis(side = 4) mtext(side = 4, line = 3, "squared ion means") title(paste0("pathway: ", pathway, " | cluster: ", cluster_description)) } # jumble weights 1000 times and store min and max values j_es <- c() for (i in 1:1000) { jumbled <- copy jumbled$in_path <- sample(copy$in_path, size = length(copy$in_path), replace = FALSE) escore_jumbled<- escore(jumbled) jumbled <- escore_jumbled$data j_es <- c(j_es, escore_jumbled$es) } extreme <- copy extreme$in_path <- c(rep(1, sum(extreme$in_path)), rep(0, length(extreme$in_path) - sum(extreme$in_path))) extreme_val <- escore(extreme)$es if (es < quantile(j_es, .05)) {pmin = TRUE} else {pmin = FALSE} if (es > quantile(j_es, .95)) {pmax = TRUE} else {pmax = FALSE} p <- sum(abs(j_es) < es)/length(j_es) ret <- list(x_array = data$x_array, e_score = data$e_score, Q = es, Q_extreme = extreme_val, probability = p ) return(ret) } }
Before calculating enrichment score: make dataframe of ions and which pathways they are in in: annotated ions out: writes dataframe of ions and which pathways they are in as pathway_df.csv
#library(BiocManager) #BiocManager::install("hmdbQuery") find_ions_in_pathways_nopunct <- function(ann) { library(hmdbQuery) # load annotated ions ann <- read.csv("../data/CRIC_annotation_1mD_neg.csv") pathway_df <- data.frame("init" = c(1:(nrow(ann)*100)), "metabolite" = 0, "ion" = 0, "HMDBID" = 0, "pathway" = 0) pathway_df <- pathway_df[-1] line <- 1 metabolites <- c() # for every metabolite in ann (i) for (i in 1:nrow(ann)) { met_IDs <- unlist(strsplit(as.character(ann$'ï..id'[i]), '; ')) met_IDs <- met_IDs[grepl("HMDB", met_IDs)] met_IDs <- sub("HMDB", "HMDB00", met_IDs) metabolites <- c(metabolites, gsub("[[:punct:]]", "", ann$name[i])) for (id in met_IDs) { entry <- tryCatch(HmdbEntry(prefix = "http://www.hmdb.ca/metabolites/", id = id), error = function(e) entry <- NA) if (!is.na(entry)) { pathways <- store(entry)$biological_properties$pathways if (pathways != "\n ") { prev_paths <- c() for(path in 1:length(pathways)){ if (!gsub(" ", "", unlist(strsplit(as.character(pathways[path]$pathway$name), "[[:punct:]]"))[1]) %in% prev_paths){ pathway_df$metabolite[line] <- gsub("[[:punct:]]", "", ann$name[i]) pathway_df$ion[line] <- ann$ion[i] pathway_df$HMDBID[line] <- id pathway_df$pathway[line] <- gsub(" ", "", unlist(strsplit(as.character(pathways[path]$pathway$name), "[[:punct:]]"))[1]) line <- line + 1 prev_paths <- c(prev_paths, gsub(" ", "", unlist(strsplit(as.character(pathways[path]$pathway$name), "[[:punct:]]"))[1])) } } } } } print(paste0("met ", i, ": ", as.character(ann$name[i]))) } pathway_df <- pathway_df[pathway_df$metabolite != 0,] write.csv(pathway_df, "../data/pathway_df_nopunct.csv", row.names = FALSE) }
#library(BiocManager) #BiocManager::install("hmdbQuery") find_ions_in_pathways <- function(ann) { library(hmdbQuery) # load annotated ions ann <- read.csv("../data/CRIC_annotation_1mD_neg.csv") pathway_df <- data.frame("init" = c(1:(nrow(ann)*100)), "metabolite" = 0, "ion" = 0, "HMDBID" = 0, "pathway" = 0) pathway_df <- pathway_df[-1] line <- 1 metabolites <- c() # for every metabolite in ann (i) for (i in 1:nrow(ann)) { met_IDs <- unlist(strsplit(as.character(ann$'ï..id'[i]), '; ')) met_IDs <- met_IDs[grepl("HMDB", met_IDs)] met_IDs <- sub("HMDB", "HMDB00", met_IDs) metabolites <- c(metabolites, gsub("[[:punct:]]", "", ann$name[i])) for (id in met_IDs) { entry <- tryCatch(HmdbEntry(prefix = "http://www.hmdb.ca/metabolites/", id = id), error = function(e) entry <- NA) if (!is.na(entry)) { pathways <- store(entry)$biological_properties$pathways if (pathways != "\n ") { prev_paths <- c() for(path in 1:length(pathways)){ if (!gsub(" ", "", unlist(as.character(pathways[path]$pathway$name))) %in% prev_paths){ pathway_df$metabolite[line] <- gsub("[[:punct:]]", "", ann$name[i]) pathway_df$ion[line] <- ann$ion[i] pathway_df$HMDBID[line] <- id pathway_df$pathway[line] <- gsub(" ", "", unlist(as.character(pathways[path]$pathway$name) )) line <- line + 1 prev_paths <- c(prev_paths, gsub(" ", "", unlist(as.character(pathways[path]$pathway$name)))) } } } } } print(paste0("met ", i, ": ", as.character(ann$name[i]))) } pathway_df <- pathway_df[pathway_df$metabolite != 0,] return(pathway_df) write.csv(pathway_df, "../data/pathway_df_punct.csv", row.names = FALSE) }
Re format pathway_df as binary matrix in: dataframe of ions and which pathways they are in (from prev) out: writes binary matrix of ions by pathway as ion_inpath.csv
binary_pathways <- function(pathway_df, name) { ion_inpath <- as.data.frame(matrix(nrow = length(unique(ann$ion)), ncol = length(unique(pathway_df$pathway)), 0)) rownames(ion_inpath) <- unique(ann$ion) colnames(ion_inpath) <- unique(pathway_df$pathway) for (path in 1:ncol(ion_inpath)){ ion_inpath[[path]] <- as.numeric(c(rownames(ion_inpath)) %in% unique(pathway_df[pathway_df$pathway == names(ion_inpath)[path],]$ion)) } write.csv(ion_inpath, paste0("../data/ion_inpath_", name, ".csv"), row.names = FALSE) }
Find "important" pathways; those with the most ions in them in: binary matrix of ions by pathway (from prev)
find_important_pathways <- function(ion_inpath, n = 5){ pathway_summary <- data.frame("pathway" = names(ion_inpath), "total_ions" = colSums(ion_inpath)) pathway_summary <- pathway_summary[order(pathway_summary$total_ions, decreasing = TRUE),] return(as.character(pathway_summary$pathway[1:n])) }
test several options and choose optimal perplexity in: data, perplexities out: plots
library(Rtsne) library(ggplot2) find_optimal_p_tSNE <- function(perplexities, data) { for (p in perplexities) { tsne_out <- Rtsne(as.matrix(train), perplexity = p) plot(tsne_out$Y) } }
in: ion data optimal perplexity out: (x,y) coordinates
run_tSNE <- function(data, optimal_p) { all_tsne_out <- Rtsne(as.matrix(data), perplexity = optimal_p) tsne_coords <- as.data.frame(all_tsne_out$Y) return(tsne_coords) }
in: ion data out: (x,y) coordinates
run_PCA <- function(data) { res <- prcomp(data)$x return(res[,c(1,2)]) }
in: list of pathways (names) of interest coordinates and clusters (results of cluster_data) metabolite data ions_inpath out: summary of pathway, cluster, Q, Q_extreme, # sd away from mean
es_summary <- function(pathways, clusters, inpath_matrix) { summary <- data.frame(matrix(ncol = 5, nrow = length(pathways)*length(unique(clusters$clusters)))) names(summary) <- c("pathway", "cluster", "Q", "Q_extreme", "probability") i <- 1 for (p in pathways) { for (c in unique(clusters$clusters)){ df <- train[which(clusters$clusters == c),] res <- calc_ES(x_array = colMeans(df), pathway = p, ions_inpath = inpath_matrix, plot = TRUE, cluster_description = c) summary$pathway[i] <- p summary$cluster[i] <- c summary$Q[i] <- res$Q summary$Q_extreme[i] <- res$Q_extreme summary$probability[i] <- res$probability i <- i + 1 } } return(summary) }
# use untargeted, transformed data data <- read.csv("../data/zscore_untargeted_annotatedions.csv") train <- data[data$group <=2,] # remove metadata pats <- train$patientid train <- train[c(grep("V", names(train)))] ions_inpath <- read.csv("../data/ion_inpath.csv") ions_inpath_punct <- read.csv("../data/ion_inpath_punct.csv")
# only do these once #ann <- read.csv("../data/CRIC_annotation_1mD_neg.csv") #pathway_df <- find_ions_in_pathways(ann) #binary_pathways(pathway_df = pathway_df, name = "punct")
Find example pathways for enrichment analysis
top_pathways <- find_important_pathways(ions_inpath_punct, n = 20)
# reduce dimensions to (x,y) coordinates pca_coords <- run_PCA(train) write.csv(data.frame("patients" = pats, pca_coords), "../data/pca_coords.csv", row.names = FALSE) # cluster patients pca_clusters <- cluster_data(as.data.frame(pca_coords), point_size = 5, h = 2.5) write.csv(data.frame("patients" = pats, pca_clusters), "../data/pca_clusters.csv", row.names = FALSE)
pca_es_summary <- es_summary(pathways = top_pathways, clusters = pca_clusters, inpath_matrix = ions_inpath_punct) write.csv(pca_es_summary, "../data/pca_es_summary.csv", row.names = FALSE)
# reduce dimension to (x,y) coordinates #find_optimal_p_tSNE(c(5,10,20,30,40,50), train) tsne_coords <- run_tSNE(train, 50) write.csv(data.frame("patients" = pats, tsne_coords), "../data/tsne_coords.csv", row.names = FALSE) tsne_clusters <- cluster_data(tsne_coords, point_size = 5, h = 2.5) write.csv(data.frame("patients" = pats, tsne_clusters), "../data/tsne_clusters.csv", row.names = FALSE)
tsne_es_summary <- es_summary(pathways = top_pathways, clusters = tsne_clusters, inpath_matrix = ions_inpath_punct) write.csv(tsne_es_summary, "../data/tsne_es_summary", row.names = FALSE)
som_coords <- read.csv("../data/som_trainingset.csv", header = FALSE) write.csv(data.frame("patients" = pats, som_coords), "../data/som_coords.csv", row.names = FALSE) som_clusters <- cluster_data(som_coords, point_size = 5, h = 2.5) write.csv(data.frame("patients" = pats, som_clusters), "../data/som_clusters.csv", row.names = FALSE)
som_es_summary <- es_summary(pathways = top_pathways, clusters = som_clusters, inpath_matrix = ions_inpath_punct) write.csv(som_es_summary, "../data/som_es_summary.csv", row.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.