library(DescTools) library(ggplot2) library(class) library(Rtsne) library(ggplot2)
hierarchichal clustering in: (x,y) coordinates out: clusters column added to initial df plots x,y coordinates and clusters
# data = data frame of coordinates with column names of "V1" and "V2" for x and y, respectively # var_cluster = vector of same length as data with cluster information # point_size = number (default = 10) plot_cluster=function(data, var_cluster, point_size = 10) { var_cluster <- factor(var_cluster) ggplot(data, aes_string(x="V1", y="V2", color=var_cluster)) + geom_point(size=point_size) + guides(colour=guide_legend(override.aes=list(size=6))) + xlab("") + ylab("") + ggtitle("") + theme_light(base_size=20) + theme(axis.text.x=element_blank(), axis.text.y=element_blank(), legend.direction = "horizontal", legend.position = "bottom", legend.box = "horizontal") } # coords = data frame of (x,y) coordinates only # returns coords with additioal clusters column cluster_data <- function(coords, point_size = 10, h) { names(coords) = c("V1", "V2") fit_cluster_hierarchical=hclust(dist(scale(coords))) plot(fit_cluster_hierarchical) coords$clusters = factor(cutree(fit_cluster_hierarchical, h=h)) plot(plot_cluster(coords, coords$clusters, point_size)) return(coords) }
test several options for tsne and choose optimal perplexity in: data, perplexities out: plots
find_optimal_p_tSNE <- function(perplexities, data) { for (p in perplexities) { tsne_out <- Rtsne(as.matrix(train), perplexity = p) plot(tsne_out$Y) } }
run tsne in: ion data optimal perplexity out: (x,y) coordinates
run_tSNE <- function(data, optimal_p) { all_tsne_out <- Rtsne(as.matrix(data), perplexity = optimal_p) tsne_coords <- as.data.frame(all_tsne_out$Y) return(tsne_coords) }
# use untargeted, transformed data data <- read.csv("../data/zscore_untargeted_annotatedions.csv") train <- data[data$group <=2,] pats <- train$patientid train <- train[c(grep("V", names(train)))] val <- data[data$group == 3,] val_pats <- val$patientid val <- val[c(grep("V", names(val)))] ions_inpath <- read.csv("../data/ion_inpath.csv")[-1]
# reduce dimensions to (x,y) coordinates (train on training, use same pca model on validation) pca_train <- prcomp(x = train) pca_train_coords <- as.data.frame(pca_train$x[,c(1,2)]) pca_val <- predict(pca_train, val) pca_val_coords <- as.data.frame(pca_val[,c(1,2)]) write.csv(data.frame("patients" = pats, pca_train_coords), "../data/pca_train_coords.csv", row.names = FALSE) write.csv(data.frame("patients" = val_pats, pca_val_coords), "../data/pca_val_coords.csv", row.names = FALSE) # cluster training patients w hierarchical clustering pca_train_clusters <- cluster_data(pca_train_coords, point_size = 5, h = 2.5) write.csv(data.frame("patients" = pats, pca_train_clusters), "../data/pca_train_clusters.csv", row.names = FALSE) # use knn to cluster validation patients p_c <- knn(train = train, test = val, cl = pca_train_clusters$clusters, k = 5) pca_val_clusters <- data.frame("V1" = pca_val_coords$PC1, "V2" = pca_val_coords$PC2, "clusters" = p_c) write.csv(data.frame("patients" = val_pats, pca_val_clusters), "../data/pca_val_clusters.csv", row.names = FALSE)
# test several perplexity values to choose one (i chose 50) #find_optimal_p_tSNE(c(5,10,20,30,40,50), train) # reduce dimension to (x,y) coordinates tsne_train_coords <- run_tSNE(train, 50) write.csv(data.frame("patients" = pats, tsne_train_coords), "../data/tsne_train_coords.csv", row.names = FALSE) # cluster training tsne_train_clusters <- cluster_data(tsne_train_coords, point_size = 5, h = 2.5) write.csv(data.frame("patients" = pats, tsne_train_clusters), "../data/tsne_train_clusters.csv", row.names = FALSE)
t_c <- knn(train = train, test = val, cl = tsne_train_clusters$clusters, k = 5) t_x <- knn(train = train, test = val, cl = tsne_train_clusters$V1, k = 5) t_y <- knn(train = train, test = val, cl = tsne_train_clusters$V2, k = 5) tsne_val_clusters <- data.frame("patients" = val_pats, "V1" = as.numeric(t_x), "V2" = as.numeric(t_y), "clusters" = as.numeric(t_c)) write.csv(tsne_val_clusters[-4], "../data/tsne_val_coords.csv", row.names = FALSE) write.csv(tsne_val_clusters, "../data/tsne_val_clusters.csv", row.names = FALSE)
som_train_coords <- read.csv("../data/som_trainingset.csv", header = FALSE) write.csv(data.frame("patients" = pats, som_train_coords), "../data/som_train_coords.csv", row.names = FALSE) som_train_clusters <- cluster_data(som_train_coords, point_size = 5, h = 2.5) write.csv(data.frame("patients" = pats, som_train_clusters), "../data/som_train_clusters.csv", row.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.