In dmontemayor/Rcricvol: Data Processing Tools in R for cricvol project

packages

library(DescTools)
library(ggplot2)
library(class)
library(Rtsne)
library(ggplot2)

functions

hierarchichal clustering in: (x,y) coordinates out: clusters column added to initial df plots x,y coordinates and clusters

# data = data frame of coordinates with column names of "V1" and "V2" for x and y, respectively
# var_cluster = vector of same length as data with cluster information
# point_size = number (default = 10)

plot_cluster=function(data, var_cluster, point_size = 10) {
  var_cluster <- factor(var_cluster)
  ggplot(data, aes_string(x="V1", y="V2", color=var_cluster)) +
  geom_point(size=point_size) +
  guides(colour=guide_legend(override.aes=list(size=6))) +
  xlab("") + ylab("") +
  ggtitle("") +
  theme_light(base_size=20) +
  theme(axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        legend.direction = "horizontal", 
        legend.position = "bottom",
        legend.box = "horizontal")
}

# coords = data frame of (x,y) coordinates only
# returns coords with additioal clusters column

cluster_data <- function(coords, point_size = 10, h) {
  names(coords) = c("V1", "V2")
  fit_cluster_hierarchical=hclust(dist(scale(coords)))
  plot(fit_cluster_hierarchical)
  coords$clusters = factor(cutree(fit_cluster_hierarchical, h=h))
  plot(plot_cluster(coords, coords$clusters, point_size))
  return(coords)
}

test several options for tsne and choose optimal perplexity in: data, perplexities out: plots

find_optimal_p_tSNE <- function(perplexities, data) {
  for (p in perplexities) {
    tsne_out <- Rtsne(as.matrix(train), perplexity = p)
    plot(tsne_out$Y)
  }
}

run tsne in: ion data optimal perplexity out: (x,y) coordinates

run_tSNE <- function(data, optimal_p) {
  all_tsne_out <- Rtsne(as.matrix(data), perplexity = optimal_p)
  tsne_coords <- as.data.frame(all_tsne_out$Y)
  return(tsne_coords)
}

load data

# use untargeted, transformed data  
data <- read.csv("../data/zscore_untargeted_annotatedions.csv")

train <- data[data$group <=2,]
pats <- train$patientid
train <- train[c(grep("V", names(train)))]

val <- data[data$group == 3,]
val_pats <- val$patientid
val <- val[c(grep("V", names(val)))]

ions_inpath <- read.csv("../data/ion_inpath.csv")[-1]

PCA dimension reduction with validation set

# reduce dimensions to (x,y) coordinates (train on training, use same pca model on validation)
pca_train <- prcomp(x = train)
pca_train_coords <- as.data.frame(pca_train$x[,c(1,2)])
pca_val <- predict(pca_train, val)
pca_val_coords <- as.data.frame(pca_val[,c(1,2)])
write.csv(data.frame("patients" = pats, pca_train_coords), "../data/pca_train_coords.csv", row.names = FALSE)
write.csv(data.frame("patients" = val_pats, pca_val_coords), "../data/pca_val_coords.csv", row.names = FALSE)

# cluster training patients w hierarchical clustering
pca_train_clusters <- cluster_data(pca_train_coords, point_size = 5, h = 2.5)
write.csv(data.frame("patients" = pats, pca_train_clusters), "../data/pca_train_clusters.csv", row.names = FALSE)

# use knn to cluster validation patients
p_c <- knn(train = train, test = val, cl = pca_train_clusters$clusters, k = 5)
pca_val_clusters <- data.frame("V1" = pca_val_coords$PC1, "V2" = pca_val_coords$PC2, "clusters" = p_c)
write.csv(data.frame("patients" = val_pats, pca_val_clusters), "../data/pca_val_clusters.csv", row.names = FALSE)

TSNE dimension reduction

# test several perplexity values to choose one (i chose 50)
#find_optimal_p_tSNE(c(5,10,20,30,40,50), train)

# reduce dimension to (x,y) coordinates
tsne_train_coords <- run_tSNE(train, 50)
write.csv(data.frame("patients" = pats, tsne_train_coords), "../data/tsne_train_coords.csv", row.names = FALSE)

# cluster training
tsne_train_clusters <- cluster_data(tsne_train_coords, point_size = 5, h = 2.5)
write.csv(data.frame("patients" = pats, tsne_train_clusters), "../data/tsne_train_clusters.csv", row.names = FALSE)

Use KNN to predict cluster, x, and y coord of validation data

t_c <- knn(train = train, test = val, cl = tsne_train_clusters$clusters, k = 5)
t_x <- knn(train = train, test = val, cl = tsne_train_clusters$V1, k = 5)
t_y <- knn(train = train, test = val, cl = tsne_train_clusters$V2, k = 5)
tsne_val_clusters <- data.frame("patients" = val_pats, "V1" = as.numeric(t_x), "V2" = as.numeric(t_y), "clusters" = as.numeric(t_c))

write.csv(tsne_val_clusters[-4], "../data/tsne_val_coords.csv", row.names = FALSE)
write.csv(tsne_val_clusters, "../data/tsne_val_clusters.csv", row.names = FALSE)

SOM dimension reduction

som_train_coords <- read.csv("../data/som_trainingset.csv", header = FALSE)
write.csv(data.frame("patients" = pats, som_train_coords), "../data/som_train_coords.csv", row.names = FALSE)

som_train_clusters <- cluster_data(som_train_coords, point_size = 5, h = 2.5)
write.csv(data.frame("patients" = pats, som_train_clusters), "../data/som_train_clusters.csv", row.names = FALSE)