In dmontemayor/Rcricvol: Data Processing Tools in R for cricvol project

packages

library(DescTools)

hierarchichal clustering in: (x,y) coordinates out: clusters column added to initial df plots x,y coordinates and clusters

plot_cluster=function(data, var_cluster, point_size) {
  ggplot(data, aes_string(x="V1", y="V2", color=var_cluster)) +
  geom_point(size=point_size) +
  guides(colour=guide_legend(override.aes=list(size=6))) +
  xlab("") + ylab("") +
  ggtitle("") +
  theme_light(base_size=20) +
  theme(axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        legend.direction = "horizontal", 
        legend.position = "bottom",
        legend.box = "horizontal")
}



cluster_data <- function(coords, point_size = 10, h) {
  names(coords) = c("V1", "V2")
  fit_cluster_hierarchical=hclust(dist(scale(coords)))
  plot(fit_cluster_hierarchical)
  coords$clusters = factor(cutree(fit_cluster_hierarchical, h=h))
  plot(plot_cluster(coords, coords$clusters, point_size))
  return(coords)
}

enrichment score in: x array (unordered) in or out matrix (ions_inpath) pathway of interest plot? out: res res$x_array = ion values (ordered) res$e_score = e score at that x value res$pmin = p min (p value comparing minimum to jumbled distribution) >> currently TRUE if value above 95% interval res$pmax = p max (p value comparing maximum to jumbled distribution) >> or belot 5% interval

escore <- function(data) {
  sum_in <- sum((data[data$in_path == 1,]$x_array)^2)
  sum_out <- sum((data[data$in_path == 0,]$x_array)^2)

  w <- sum_out / sum_in
  data$e_score = 0
  amp <- 0

  for (ion in 1:nrow(data)) {
    if (data$in_path[ion] == 1) {
      amp <- amp + (w * (data$x_array[ion])^2)
    } else if (data$in_path[ion] == 0) {
      amp <- amp - (data$x_array[ion])^2
    }
    data$e_score[ion] <- amp
  }  

  if (abs(min(data$e_score)) > max(data$e_score)) {es <- min(data$e_score)} else {es <- max(data$e_score)}
  res <- list("data" = data,
              "es" = es)
  return(res)
}



calc_ES <- function(x_array, pathway, ions_inpath, plot, cluster_description) {

  in_path <- ions_inpath[,grep(paste0(pathway, "$"), names(ions_inpath))]
  if (!is.integer(in_path)) {
    print("pathway not found")
  } else {
    data <- data.frame(x_array, in_path)
    data <- data[order(data$x_array),]

    copy <- data
    escore_data <- escore(data)
    data <- escore_data$data
    es <- escore_data$es
    data$x_step <- seq(2:(nrow(data)+1))

    if (plot == TRUE) {
      par(mar = c(5,5,2,5))
      plot(c(1,data$x_step), c(0,data$e_score), type = "l", ylab = "enrichment score", xlab = "index")
      abline(h = 0)
      par(new = T)
      plot(data$x_step, (data$x_array^2), xlab = NA, ylab = NA, axes = F, type = "l", col = "red")
      axis(side = 4)
      mtext(side = 4, line = 3, "squared ion means")
      title(paste0("pathway: ", pathway, " | cluster: ", cluster_description))
    }


    # jumble weights 1000 times and store min and max values
    j_es <- c()

    for (i in 1:1000) {
      jumbled <- copy
      jumbled$in_path <- sample(copy$in_path, size = length(copy$in_path), replace = FALSE)
      escore_jumbled<- escore(jumbled)
      jumbled <- escore_jumbled$data
      j_es <- c(j_es, escore_jumbled$es)
    }

    extreme <- copy
    extreme$in_path <- c(rep(1, sum(extreme$in_path)), rep(0, length(extreme$in_path) - sum(extreme$in_path)))
    extreme_val <- escore(extreme)$es

    if (es < quantile(j_es, .05)) {pmin = TRUE} else {pmin = FALSE}
    if (es > quantile(j_es, .95)) {pmax = TRUE} else {pmax = FALSE} 

    p <- sum(abs(j_es) < es)/length(j_es)

    ret <- list(x_array = data$x_array, 
                e_score = data$e_score, 
                Q = es,
                Q_extreme = extreme_val,
                probability = p
                )

    return(ret)
  }

}

Generate Ion-Pathway information for enrichment analysis

Before calculating enrichment score: make dataframe of ions and which pathways they are in in: annotated ions out: writes dataframe of ions and which pathways they are in as pathway_df.csv

#library(BiocManager)
#BiocManager::install("hmdbQuery")

find_ions_in_pathways_nopunct <- function(ann) {
  library(hmdbQuery)

  # load annotated ions
  ann <- read.csv("../data/CRIC_annotation_1mD_neg.csv")

  pathway_df <- data.frame("init" = c(1:(nrow(ann)*100)),
                           "metabolite" = 0, 
                           "ion" = 0,
                           "HMDBID" = 0,
                           "pathway" = 0)
  pathway_df <- pathway_df[-1]
  line <- 1

  metabolites <- c()

  # for every metabolite in ann (i) 
  for (i in 1:nrow(ann)) { 
    met_IDs <- unlist(strsplit(as.character(ann$'ï..id'[i]), '; '))
    met_IDs <- met_IDs[grepl("HMDB", met_IDs)]
    met_IDs <- sub("HMDB", "HMDB00", met_IDs)
    metabolites <- c(metabolites, gsub("[[:punct:]]", "", ann$name[i]))

    for (id in met_IDs) {

      entry <- tryCatch(HmdbEntry(prefix = "http://www.hmdb.ca/metabolites/", id = id), error = function(e) entry <- NA)

      if (!is.na(entry)) {
        pathways <- store(entry)$biological_properties$pathways

        if (pathways != "\n    ") {
          prev_paths <- c()

          for(path in 1:length(pathways)){
            if (!gsub(" ", "", unlist(strsplit(as.character(pathways[path]$pathway$name), "[[:punct:]]"))[1]) %in% prev_paths){
              pathway_df$metabolite[line] <- gsub("[[:punct:]]", "", ann$name[i])
              pathway_df$ion[line] <- ann$ion[i]
              pathway_df$HMDBID[line] <- id
              pathway_df$pathway[line] <- gsub(" ", "", unlist(strsplit(as.character(pathways[path]$pathway$name), "[[:punct:]]"))[1])
              line <- line + 1
              prev_paths <- c(prev_paths, gsub(" ", "", unlist(strsplit(as.character(pathways[path]$pathway$name), "[[:punct:]]"))[1]))
            }
          }
        }
      }
    }
    print(paste0("met ", i, ": ", as.character(ann$name[i])))
  }

  pathway_df <- pathway_df[pathway_df$metabolite != 0,]

  write.csv(pathway_df, "../data/pathway_df_nopunct.csv", row.names = FALSE)
}

#library(BiocManager)
#BiocManager::install("hmdbQuery")

find_ions_in_pathways <- function(ann) {
  library(hmdbQuery)

  # load annotated ions
  ann <- read.csv("../data/CRIC_annotation_1mD_neg.csv")

  pathway_df <- data.frame("init" = c(1:(nrow(ann)*100)),
                           "metabolite" = 0, 
                           "ion" = 0,
                           "HMDBID" = 0,
                           "pathway" = 0)
  pathway_df <- pathway_df[-1]
  line <- 1

  metabolites <- c()

  # for every metabolite in ann (i) 
  for (i in 1:nrow(ann)) { 
    met_IDs <- unlist(strsplit(as.character(ann$'ï..id'[i]), '; '))
    met_IDs <- met_IDs[grepl("HMDB", met_IDs)]
    met_IDs <- sub("HMDB", "HMDB00", met_IDs)
    metabolites <- c(metabolites, gsub("[[:punct:]]", "", ann$name[i]))

    for (id in met_IDs) {

      entry <- tryCatch(HmdbEntry(prefix = "http://www.hmdb.ca/metabolites/", id = id), error = function(e) entry <- NA)

      if (!is.na(entry)) {
        pathways <- store(entry)$biological_properties$pathways

        if (pathways != "\n    ") {
          prev_paths <- c()

          for(path in 1:length(pathways)){
            if (!gsub(" ", "", unlist(as.character(pathways[path]$pathway$name))) %in% prev_paths){
              pathway_df$metabolite[line] <- gsub("[[:punct:]]", "", ann$name[i])
              pathway_df$ion[line] <- ann$ion[i]
              pathway_df$HMDBID[line] <- id
              pathway_df$pathway[line] <- gsub(" ", "", unlist(as.character(pathways[path]$pathway$name) ))
              line <- line + 1
              prev_paths <- c(prev_paths, gsub(" ", "", unlist(as.character(pathways[path]$pathway$name))))
            }
          }
        }
      }
    }
    print(paste0("met ", i, ": ", as.character(ann$name[i])))
  }

  pathway_df <- pathway_df[pathway_df$metabolite != 0,]

  return(pathway_df)

  write.csv(pathway_df, "../data/pathway_df_punct.csv", row.names = FALSE)
}

Re format pathway_df as binary matrix in: dataframe of ions and which pathways they are in (from prev) out: writes binary matrix of ions by pathway as ion_inpath.csv

binary_pathways <- function(pathway_df, name) {
  ion_inpath <- as.data.frame(matrix(nrow = length(unique(ann$ion)), ncol = length(unique(pathway_df$pathway)), 0))
  rownames(ion_inpath) <- unique(ann$ion)
  colnames(ion_inpath) <- unique(pathway_df$pathway)


  for (path in 1:ncol(ion_inpath)){
    ion_inpath[[path]] <- as.numeric(c(rownames(ion_inpath)) %in% unique(pathway_df[pathway_df$pathway == names(ion_inpath)[path],]$ion))
  }

  write.csv(ion_inpath, paste0("../data/ion_inpath_", name, ".csv"), row.names = FALSE)
}

Find "important" pathways; those with the most ions in them in: binary matrix of ions by pathway (from prev)

find_important_pathways <- function(ion_inpath, n = 5){
  pathway_summary <-  data.frame("pathway" = names(ion_inpath), "total_ions" = colSums(ion_inpath))
  pathway_summary <- pathway_summary[order(pathway_summary$total_ions, decreasing = TRUE),]
  return(as.character(pathway_summary$pathway[1:n]))
}

tSNE

test several options and choose optimal perplexity in: data, perplexities out: plots

library(Rtsne)
library(ggplot2)
find_optimal_p_tSNE <- function(perplexities, data) {
  for (p in perplexities) {
    tsne_out <- Rtsne(as.matrix(train), perplexity = p)
    plot(tsne_out$Y)
  }
}

in: ion data optimal perplexity out: (x,y) coordinates

run_tSNE <- function(data, optimal_p) {
  all_tsne_out <- Rtsne(as.matrix(data), perplexity = optimal_p)
  tsne_coords <- as.data.frame(all_tsne_out$Y)
  return(tsne_coords)
}

PCA

in: ion data out: (x,y) coordinates

run_PCA <- function(data) {
  res <- prcomp(data)$x
  return(res[,c(1,2)])
}

Create enrichment score summary for each method

in: list of pathways (names) of interest coordinates and clusters (results of cluster_data) metabolite data ions_inpath out: summary of pathway, cluster, Q, Q_extreme, # sd away from mean

es_summary <- function(pathways, clusters, inpath_matrix) {
  summary <- data.frame(matrix(ncol = 5, nrow = length(pathways)*length(unique(clusters$clusters))))
  names(summary) <- c("pathway", "cluster", "Q", "Q_extreme", "probability")
  i <- 1
  for (p in pathways) {
    for (c in unique(clusters$clusters)){
      df <- train[which(clusters$clusters == c),]
      res <- calc_ES(x_array = colMeans(df), pathway = p, ions_inpath = inpath_matrix, plot = TRUE, cluster_description = c)
      summary$pathway[i] <- p
      summary$cluster[i] <- c
      summary$Q[i] <- res$Q
      summary$Q_extreme[i] <- res$Q_extreme
      summary$probability[i] <- res$probability
      i <- i + 1
    }
  }

  return(summary)

}

load data

# use untargeted, transformed data  
data <- read.csv("../data/zscore_untargeted_annotatedions.csv")

train <- data[data$group <=2,]

# remove metadata
pats <- train$patientid
train <- train[c(grep("V", names(train)))]

ions_inpath <- read.csv("../data/ion_inpath.csv")
ions_inpath_punct <- read.csv("../data/ion_inpath_punct.csv")

# only do these once
#ann <- read.csv("../data/CRIC_annotation_1mD_neg.csv")
#pathway_df <- find_ions_in_pathways(ann)
#binary_pathways(pathway_df = pathway_df, name = "punct")

Find example pathways for enrichment analysis

top_pathways <- find_important_pathways(ions_inpath_punct, n = 20)

PCA dimension reduction

# reduce dimensions to (x,y) coordinates
pca_coords <- run_PCA(train)
write.csv(data.frame("patients" = pats, pca_coords), "../data/pca_coords.csv", row.names = FALSE)

# cluster patients
pca_clusters <- cluster_data(as.data.frame(pca_coords), point_size = 5, h = 2.5)
write.csv(data.frame("patients" = pats, pca_clusters), "../data/pca_clusters.csv", row.names = FALSE)

PCA enrichment analysis

pca_es_summary <- es_summary(pathways = top_pathways, clusters = pca_clusters, inpath_matrix = ions_inpath_punct)
write.csv(pca_es_summary, "../data/pca_es_summary.csv", row.names = FALSE)

TSNE dimension reduction

# reduce dimension to (x,y) coordinates
#find_optimal_p_tSNE(c(5,10,20,30,40,50), train)

tsne_coords <- run_tSNE(train, 50)
write.csv(data.frame("patients" = pats, tsne_coords), "../data/tsne_coords.csv", row.names = FALSE)

tsne_clusters <- cluster_data(tsne_coords, point_size = 5, h = 2.5)
write.csv(data.frame("patients" = pats, tsne_clusters), "../data/tsne_clusters.csv", row.names = FALSE)

TSNE enrichment analysis

tsne_es_summary <- es_summary(pathways = top_pathways, clusters = tsne_clusters, inpath_matrix = ions_inpath_punct)
write.csv(tsne_es_summary, "../data/tsne_es_summary", row.names = FALSE)

SOM dimension reduction

som_coords <- read.csv("../data/som_trainingset.csv", header = FALSE)
write.csv(data.frame("patients" = pats, som_coords), "../data/som_coords.csv", row.names = FALSE)

som_clusters <- cluster_data(som_coords, point_size = 5, h = 2.5)
write.csv(data.frame("patients" = pats, som_clusters), "../data/som_clusters.csv", row.names = FALSE)

SOM enrichment analysis

som_es_summary <- es_summary(pathways = top_pathways, clusters = som_clusters, inpath_matrix = ions_inpath_punct)
write.csv(som_es_summary, "../data/som_es_summary.csv", row.names = FALSE)

dmontemayor/Rcricvol documentation built on Sept. 9, 2021, 9:12 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dmontemayor/Rcricvol
Data Processing Tools in R for cricvol project

In dmontemayor/Rcricvol: Data Processing Tools in R for cricvol project

packages

Generate Ion-Pathway information for enrichment analysis

tSNE

PCA

Create enrichment score summary for each method

load data

PCA dimension reduction

PCA enrichment analysis

TSNE dimension reduction

TSNE enrichment analysis

SOM dimension reduction

SOM enrichment analysis

R Package Documentation

Browse R Packages

We want your feedback!

dmontemayor/Rcricvol Data Processing Tools in R for cricvol project

In dmontemayor/Rcricvol: Data Processing Tools in R for cricvol project

packages

Generate Ion-Pathway information for enrichment analysis

tSNE

PCA

Create enrichment score summary for each method

load data

PCA dimension reduction

PCA enrichment analysis

TSNE dimension reduction

TSNE enrichment analysis

SOM dimension reduction

SOM enrichment analysis

R Package Documentation

Browse R Packages

We want your feedback!

dmontemayor/Rcricvol
Data Processing Tools in R for cricvol project