packages

library(DescTools)

hierarchichal clustering in: (x,y) coordinates out: clusters column added to initial df plots x,y coordinates and clusters

plot_cluster=function(data, var_cluster, point_size) {
  ggplot(data, aes_string(x="V1", y="V2", color=var_cluster)) +
  geom_point(size=point_size) +
  guides(colour=guide_legend(override.aes=list(size=6))) +
  xlab("") + ylab("") +
  ggtitle("") +
  theme_light(base_size=20) +
  theme(axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        legend.direction = "horizontal", 
        legend.position = "bottom",
        legend.box = "horizontal")
}



cluster_data <- function(coords, point_size = 10, h) {
  names(coords) = c("V1", "V2")
  fit_cluster_hierarchical=hclust(dist(scale(coords)))
  plot(fit_cluster_hierarchical)
  coords$clusters = factor(cutree(fit_cluster_hierarchical, h=h))
  plot(plot_cluster(coords, coords$clusters, point_size))
  return(coords)
}

enrichment score in: x array (unordered) in or out matrix (ions_inpath) pathway of interest plot? out: res res$x_array = ion values (ordered) res$e_score = e score at that x value res$pmin = p min (p value comparing minimum to jumbled distribution) >> currently TRUE if value above 95% interval res$pmax = p max (p value comparing maximum to jumbled distribution) >> or belot 5% interval

escore <- function(data) {
  sum_in <- sum((data[data$in_path == 1,]$x_array)^2)
  sum_out <- sum((data[data$in_path == 0,]$x_array)^2)

  w <- sum_out / sum_in
  data$e_score = 0
  amp <- 0

  for (ion in 1:nrow(data)) {
    if (data$in_path[ion] == 1) {
      amp <- amp + (w * (data$x_array[ion])^2)
    } else if (data$in_path[ion] == 0) {
      amp <- amp - (data$x_array[ion])^2
    }
    data$e_score[ion] <- amp
  }  

  if (abs(min(data$e_score)) > max(data$e_score)) {es <- min(data$e_score)} else {es <- max(data$e_score)}
  res <- list("data" = data,
              "es" = es)
  return(res)
}



calc_ES <- function(x_array, pathway, ions_inpath, plot, cluster_description) {

  in_path <- ions_inpath[,grep(paste0(pathway, "$"), names(ions_inpath))]
  if (!is.integer(in_path)) {
    print("pathway not found")
  } else {
    data <- data.frame(x_array, in_path)
    data <- data[order(data$x_array),]

    copy <- data
    escore_data <- escore(data)
    data <- escore_data$data
    es <- escore_data$es
    data$x_step <- seq(2:(nrow(data)+1))

    if (plot == TRUE) {
      par(mar = c(5,5,2,5))
      plot(c(1,data$x_step), c(0,data$e_score), type = "l", ylab = "enrichment score", xlab = "index")
      abline(h = 0)
      par(new = T)
      plot(data$x_step, (data$x_array^2), xlab = NA, ylab = NA, axes = F, type = "l", col = "red")
      axis(side = 4)
      mtext(side = 4, line = 3, "squared ion means")
      title(paste0("pathway: ", pathway, " | cluster: ", cluster_description))
    }


    # jumble weights 1000 times and store min and max values
    j_es <- c()

    for (i in 1:1000) {
      jumbled <- copy
      jumbled$in_path <- sample(copy$in_path, size = length(copy$in_path), replace = FALSE)
      escore_jumbled<- escore(jumbled)
      jumbled <- escore_jumbled$data
      j_es <- c(j_es, escore_jumbled$es)
    }

    extreme <- copy
    extreme$in_path <- c(rep(1, sum(extreme$in_path)), rep(0, length(extreme$in_path) - sum(extreme$in_path)))
    extreme_val <- escore(extreme)$es

    if (es < quantile(j_es, .05)) {pmin = TRUE} else {pmin = FALSE}
    if (es > quantile(j_es, .95)) {pmax = TRUE} else {pmax = FALSE} 

    p <- sum(abs(j_es) < es)/length(j_es)

    ret <- list(x_array = data$x_array, 
                e_score = data$e_score, 
                Q = es,
                Q_extreme = extreme_val,
                probability = p
                )

    return(ret)
  }

}

Generate Ion-Pathway information for enrichment analysis

Before calculating enrichment score: make dataframe of ions and which pathways they are in in: annotated ions out: writes dataframe of ions and which pathways they are in as pathway_df.csv

#library(BiocManager)
#BiocManager::install("hmdbQuery")

find_ions_in_pathways_nopunct <- function(ann) {
  library(hmdbQuery)

  # load annotated ions
  ann <- read.csv("../data/CRIC_annotation_1mD_neg.csv")

  pathway_df <- data.frame("init" = c(1:(nrow(ann)*100)),
                           "metabolite" = 0, 
                           "ion" = 0,
                           "HMDBID" = 0,
                           "pathway" = 0)
  pathway_df <- pathway_df[-1]
  line <- 1

  metabolites <- c()

  # for every metabolite in ann (i) 
  for (i in 1:nrow(ann)) { 
    met_IDs <- unlist(strsplit(as.character(ann$'ï..id'[i]), '; '))
    met_IDs <- met_IDs[grepl("HMDB", met_IDs)]
    met_IDs <- sub("HMDB", "HMDB00", met_IDs)
    metabolites <- c(metabolites, gsub("[[:punct:]]", "", ann$name[i]))

    for (id in met_IDs) {

      entry <- tryCatch(HmdbEntry(prefix = "http://www.hmdb.ca/metabolites/", id = id), error = function(e) entry <- NA)

      if (!is.na(entry)) {
        pathways <- store(entry)$biological_properties$pathways

        if (pathways != "\n    ") {
          prev_paths <- c()

          for(path in 1:length(pathways)){
            if (!gsub(" ", "", unlist(strsplit(as.character(pathways[path]$pathway$name), "[[:punct:]]"))[1]) %in% prev_paths){
              pathway_df$metabolite[line] <- gsub("[[:punct:]]", "", ann$name[i])
              pathway_df$ion[line] <- ann$ion[i]
              pathway_df$HMDBID[line] <- id
              pathway_df$pathway[line] <- gsub(" ", "", unlist(strsplit(as.character(pathways[path]$pathway$name), "[[:punct:]]"))[1])
              line <- line + 1
              prev_paths <- c(prev_paths, gsub(" ", "", unlist(strsplit(as.character(pathways[path]$pathway$name), "[[:punct:]]"))[1]))
            }
          }
        }
      }
    }
    print(paste0("met ", i, ": ", as.character(ann$name[i])))
  }

  pathway_df <- pathway_df[pathway_df$metabolite != 0,]

  write.csv(pathway_df, "../data/pathway_df_nopunct.csv", row.names = FALSE)
}
#library(BiocManager)
#BiocManager::install("hmdbQuery")

find_ions_in_pathways <- function(ann) {
  library(hmdbQuery)

  # load annotated ions
  ann <- read.csv("../data/CRIC_annotation_1mD_neg.csv")

  pathway_df <- data.frame("init" = c(1:(nrow(ann)*100)),
                           "metabolite" = 0, 
                           "ion" = 0,
                           "HMDBID" = 0,
                           "pathway" = 0)
  pathway_df <- pathway_df[-1]
  line <- 1

  metabolites <- c()

  # for every metabolite in ann (i) 
  for (i in 1:nrow(ann)) { 
    met_IDs <- unlist(strsplit(as.character(ann$'ï..id'[i]), '; '))
    met_IDs <- met_IDs[grepl("HMDB", met_IDs)]
    met_IDs <- sub("HMDB", "HMDB00", met_IDs)
    metabolites <- c(metabolites, gsub("[[:punct:]]", "", ann$name[i]))

    for (id in met_IDs) {

      entry <- tryCatch(HmdbEntry(prefix = "http://www.hmdb.ca/metabolites/", id = id), error = function(e) entry <- NA)

      if (!is.na(entry)) {
        pathways <- store(entry)$biological_properties$pathways

        if (pathways != "\n    ") {
          prev_paths <- c()

          for(path in 1:length(pathways)){
            if (!gsub(" ", "", unlist(as.character(pathways[path]$pathway$name))) %in% prev_paths){
              pathway_df$metabolite[line] <- gsub("[[:punct:]]", "", ann$name[i])
              pathway_df$ion[line] <- ann$ion[i]
              pathway_df$HMDBID[line] <- id
              pathway_df$pathway[line] <- gsub(" ", "", unlist(as.character(pathways[path]$pathway$name) ))
              line <- line + 1
              prev_paths <- c(prev_paths, gsub(" ", "", unlist(as.character(pathways[path]$pathway$name))))
            }
          }
        }
      }
    }
    print(paste0("met ", i, ": ", as.character(ann$name[i])))
  }

  pathway_df <- pathway_df[pathway_df$metabolite != 0,]

  return(pathway_df)

  write.csv(pathway_df, "../data/pathway_df_punct.csv", row.names = FALSE)
}

Re format pathway_df as binary matrix in: dataframe of ions and which pathways they are in (from prev) out: writes binary matrix of ions by pathway as ion_inpath.csv

binary_pathways <- function(pathway_df, name) {
  ion_inpath <- as.data.frame(matrix(nrow = length(unique(ann$ion)), ncol = length(unique(pathway_df$pathway)), 0))
  rownames(ion_inpath) <- unique(ann$ion)
  colnames(ion_inpath) <- unique(pathway_df$pathway)


  for (path in 1:ncol(ion_inpath)){
    ion_inpath[[path]] <- as.numeric(c(rownames(ion_inpath)) %in% unique(pathway_df[pathway_df$pathway == names(ion_inpath)[path],]$ion))
  }

  write.csv(ion_inpath, paste0("../data/ion_inpath_", name, ".csv"), row.names = FALSE)
}

Find "important" pathways; those with the most ions in them in: binary matrix of ions by pathway (from prev)

find_important_pathways <- function(ion_inpath, n = 5){
  pathway_summary <-  data.frame("pathway" = names(ion_inpath), "total_ions" = colSums(ion_inpath))
  pathway_summary <- pathway_summary[order(pathway_summary$total_ions, decreasing = TRUE),]
  return(as.character(pathway_summary$pathway[1:n]))
}

tSNE

test several options and choose optimal perplexity in: data, perplexities out: plots

library(Rtsne)
library(ggplot2)
find_optimal_p_tSNE <- function(perplexities, data) {
  for (p in perplexities) {
    tsne_out <- Rtsne(as.matrix(train), perplexity = p)
    plot(tsne_out$Y)
  }
}

in: ion data optimal perplexity out: (x,y) coordinates

run_tSNE <- function(data, optimal_p) {
  all_tsne_out <- Rtsne(as.matrix(data), perplexity = optimal_p)
  tsne_coords <- as.data.frame(all_tsne_out$Y)
  return(tsne_coords)
}

PCA

in: ion data out: (x,y) coordinates

run_PCA <- function(data) {
  res <- prcomp(data)$x
  return(res[,c(1,2)])
}

Create enrichment score summary for each method

in: list of pathways (names) of interest coordinates and clusters (results of cluster_data) metabolite data ions_inpath out: summary of pathway, cluster, Q, Q_extreme, # sd away from mean

es_summary <- function(pathways, clusters, inpath_matrix) {
  summary <- data.frame(matrix(ncol = 5, nrow = length(pathways)*length(unique(clusters$clusters))))
  names(summary) <- c("pathway", "cluster", "Q", "Q_extreme", "probability")
  i <- 1
  for (p in pathways) {
    for (c in unique(clusters$clusters)){
      df <- train[which(clusters$clusters == c),]
      res <- calc_ES(x_array = colMeans(df), pathway = p, ions_inpath = inpath_matrix, plot = TRUE, cluster_description = c)
      summary$pathway[i] <- p
      summary$cluster[i] <- c
      summary$Q[i] <- res$Q
      summary$Q_extreme[i] <- res$Q_extreme
      summary$probability[i] <- res$probability
      i <- i + 1
    }
  }

  return(summary)

}

load data

# use untargeted, transformed data  
data <- read.csv("../data/zscore_untargeted_annotatedions.csv")

train <- data[data$group <=2,]

# remove metadata
pats <- train$patientid
train <- train[c(grep("V", names(train)))]

ions_inpath <- read.csv("../data/ion_inpath.csv")
ions_inpath_punct <- read.csv("../data/ion_inpath_punct.csv")
# only do these once
#ann <- read.csv("../data/CRIC_annotation_1mD_neg.csv")
#pathway_df <- find_ions_in_pathways(ann)
#binary_pathways(pathway_df = pathway_df, name = "punct")

Find example pathways for enrichment analysis

top_pathways <- find_important_pathways(ions_inpath_punct, n = 20)

PCA dimension reduction

# reduce dimensions to (x,y) coordinates
pca_coords <- run_PCA(train)
write.csv(data.frame("patients" = pats, pca_coords), "../data/pca_coords.csv", row.names = FALSE)

# cluster patients
pca_clusters <- cluster_data(as.data.frame(pca_coords), point_size = 5, h = 2.5)
write.csv(data.frame("patients" = pats, pca_clusters), "../data/pca_clusters.csv", row.names = FALSE)

PCA enrichment analysis

pca_es_summary <- es_summary(pathways = top_pathways, clusters = pca_clusters, inpath_matrix = ions_inpath_punct)
write.csv(pca_es_summary, "../data/pca_es_summary.csv", row.names = FALSE)

TSNE dimension reduction

# reduce dimension to (x,y) coordinates
#find_optimal_p_tSNE(c(5,10,20,30,40,50), train)

tsne_coords <- run_tSNE(train, 50)
write.csv(data.frame("patients" = pats, tsne_coords), "../data/tsne_coords.csv", row.names = FALSE)

tsne_clusters <- cluster_data(tsne_coords, point_size = 5, h = 2.5)
write.csv(data.frame("patients" = pats, tsne_clusters), "../data/tsne_clusters.csv", row.names = FALSE)

TSNE enrichment analysis

tsne_es_summary <- es_summary(pathways = top_pathways, clusters = tsne_clusters, inpath_matrix = ions_inpath_punct)
write.csv(tsne_es_summary, "../data/tsne_es_summary", row.names = FALSE)

SOM dimension reduction

som_coords <- read.csv("../data/som_trainingset.csv", header = FALSE)
write.csv(data.frame("patients" = pats, som_coords), "../data/som_coords.csv", row.names = FALSE)

som_clusters <- cluster_data(som_coords, point_size = 5, h = 2.5)
write.csv(data.frame("patients" = pats, som_clusters), "../data/som_clusters.csv", row.names = FALSE)

SOM enrichment analysis

som_es_summary <- es_summary(pathways = top_pathways, clusters = som_clusters, inpath_matrix = ions_inpath_punct)
write.csv(som_es_summary, "../data/som_es_summary.csv", row.names = FALSE)


dmontemayor/Rcricvol documentation built on Sept. 9, 2021, 9:12 a.m.