```{r}
knitr::opts_chunk$set(echo = TRUE, dpi = 300)
```
In this notebook we apply different approaches to reduce the dimensionality of the data and then perform clustering on the results.
```{r}
library(tidyverse)
library(Boruta)
library(corrplot)
library(caret)
```
```{r}
# Convenience negation of %in%
`%!in%` <- Negate(`%in%`)
# Load the freezing dataset and keep one row per unique sequence
data <- FascinRSCA::freezing %>%
  distinct(fasta_seq, .keep_all = TRUE)
```
We begin by removing the variables that do not provide relevant biological information about the sequences.
```{r}
# Keep the numeric variables, dropping bookkeeping columns
df.numeric <- data %>%
  select(where(is.numeric)) %>%
  select(-c(ndom, Length, nregions, niseqs, pvalue, nincluded, flags, nreported))
# Re-attach the categorical metadata needed later on
df <- df.numeric %>%
  add_column(arch = data$arch,
             phylum = data$phylum,
             kingdom = data$kingdom,
             file_label = data$file_label,
             annotation = data$annotation)
```
Next, we apply a filter to all the variables and to the variables pre-selected by the PCA. This filter removes highly correlated variables so that the maximum absolute pairwise correlation is at most 0.75.
First, we apply it to all the variables, keeping 26 of them:
```{r}
corrplot(cor(df.numeric), method = "color", is.corr = FALSE)
highlyCorrelated_all <- findCorrelation(cor(df.numeric), cutoff = 0.75)
#colnames(df.numeric[, -c(highlyCorrelated_all)])
df.numeric %>%
  select(-all_of(highlyCorrelated_all)) %>%
  cor() %>%
  corrplot(method = "color", is.corr = FALSE)
```
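As a quick sanity check, the variables flagged by `findCorrelation()` and the ones that remain can be listed explicitly. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Variables removed by the 0.75 cutoff and variables kept
removed_all <- colnames(df.numeric)[highlyCorrelated_all]
kept_all    <- setdiff(colnames(df.numeric), removed_all)
removed_all
kept_all
```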
Second, we apply it to the 10 variables that contribute most to the first 15 dimensions of the PCA (that is, those explaining at least 1% of the variance), keeping 5 of them:
```{r}
df.numeric_1per <- df.numeric %>%
  select(charged, acidic, DE, basic, residue_n, weigth, STYNQW, charge, nonPolar, polar)
corrplot(cor(df.numeric_1per), method = "color", is.corr = FALSE)
highlyCorrelated_1per <- findCorrelation(cor(df.numeric_1per), cutoff = 0.75)
#colnames(df.numeric_1per[, -c(highlyCorrelated_1per)])
df.numeric_1per %>%
  select(-highlyCorrelated_1per) %>%
  cor() %>%
  corrplot(method = "color", is.corr = FALSE)
ggsave(filename = "figures/clustering/corrplot_variables.pdf", dpi = 300)
```
This information, although useful, has the drawback that setting an "importance" threshold to discriminate between variables can be somewhat arbitrary:
```{r}
library(FSelector)
Fsel_info <- information.gain(annotation ~ ., df)
Fsel_info %>% arrange(-attr_importance)
```
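If a fixed number of attributes is preferred over an importance threshold, FSelector's `cutoff.k()` can select the top-ranked ones. A minimal sketch, not evaluated here; the choice of k = 10 is an arbitrary illustrative value:

```{r, eval=FALSE}
# Keep the 10 attributes with the highest information gain
top_attrs <- cutoff.k(Fsel_info, k = 10)
top_attrs
```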
Next, we apply two selection algorithms (recursive feature elimination and Boruta), neither of which yields a significant reduction:
```{r}
set.seed(7)
library(mlbench)
library(caret)
control <- rfeControl(functions = rfFuncs,   # random forest
                      method = "repeatedcv", # repeated cross-validation
                      repeats = 5,           # number of repeats
                      number = 10)           # number of folds
y <- data$annotation %>% as.factor()
x <- df.numeric
inTrain <- createDataPartition(y, p = .80, list = FALSE)[, 1]
x_train <- x[ inTrain, ]
x_test  <- x[-inTrain, ]
y_train <- y[ inTrain]
y_test  <- y[-inTrain]
result_rfe1 <- rfe(x = x_train,
                   y = y_train,
                   sizes = c(1:15),
                   rfeControl = control)
# Print the results
summary(result_rfe1)
# Print the selected features
predictors(result_rfe1)
# Print the results visually
ggplot(data = result_rfe1, metric = "Accuracy") + theme_bw()
ggplot(data = result_rfe1, metric = "Kappa") + theme_bw()
```
```{r}
set.seed(800)
data_boruta <- df.numeric %>% mutate(annotation = as.factor(data$annotation))
Boruta.x <- Boruta(annotation ~ ., data = data_boruta, doTrace = 0)
# Results summary
print(Boruta.x)
# Attribute statistics
attStats(Boruta.x)
```
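To extract just the attributes that Boruta confirms as important, `getSelectedAttributes()` can be used. A minimal sketch, not evaluated here; `withTentative = FALSE` drops attributes still marked as tentative:

```{r, eval=FALSE}
# Names of the attributes Boruta confirmed as important
confirmed_attrs <- getSelectedAttributes(Boruta.x, withTentative = FALSE)
confirmed_attrs
```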
We have carried out several tests, obtaining good results in every case. We have chosen to use the 5 variables that remain after removing the highly correlated ones from the 10 pre-selected by the PCA. These are: "basic", "residue_n", "STYNQW", "charge" and "nonPolar".
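The retained names can also be derived programmatically from `findCorrelation()`'s output rather than listed by hand, mirroring the `colnames(cluster_x)` check below. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Variables kept after dropping the highly correlated ones
# from the 10 PCA-preselected variables
setdiff(colnames(df.numeric_1per),
        colnames(df.numeric_1per)[highlyCorrelated_1per])
```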
```{r}
set.seed(1)
# Selected feature set; the commented lines are the alternative sets tried
#cluster_x <- df.numeric %>% select(highlyCorrelated_all)
cluster_x <- df.numeric_1per %>% select(-highlyCorrelated_1per)
#cluster_x <- df[, rownames(Fsel_info)[1:20]]
#cluster_x <- df[, predictors(result_rfe1)]
#cluster_x <- df[, rownames(attStats(Boruta.x))]
colnames(cluster_x)
```
We scale the data and run clValid:
```{r}
library(clValid)
df_cluster <- cluster_x %>% scale()
# Compare clustering methods with internal validation measures
clmethods <- c("hierarchical", "kmeans", "pam")
intern <- clValid(df_cluster, nClust = 2:6,
                  clMethods = clmethods, validation = "internal")
# Summary
summary(intern)
# Stability validation
stab <- clValid(df_cluster, nClust = 2:6,
                clMethods = clmethods, validation = "stability")
# Display only the optimal scores
knitr::kable(optimalScores(stab), caption = "Optimal scores")
# APN, ADM and FOM range from 0 to 1, with smaller values corresponding to
# more consistent clustering results. AD ranges from 0 to infinity, and
# smaller values are also preferred.
```
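The optimal internal-validation scores can be tabulated in the same way as the stability scores. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Best connectivity, Dunn index and silhouette width across methods and k
knitr::kable(optimalScores(intern), caption = "Optimal internal scores")
```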
Next, we perform hierarchical clustering and hierarchical k-means clustering.
We compute a correlation-based (Pearson) distance and visualize the dissimilarity matrix.
```{r}
library(factoextra)
# Correlation-based distance
res.dist <- get_dist(df_cluster, method = "pearson")
# Visualize the dissimilarity matrix
fviz_dist(res.dist, lab_size = 2)
ggsave(filename = "figures/clustering/dissimilarity_matrix_ward.D2_spearman.pdf", dpi = 400)
```
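The saved filename mentions Spearman; if a Spearman-based distance is preferred to Pearson, `get_dist()` supports it directly. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Alternative: Spearman-correlation-based distance
res.dist.spearman <- get_dist(df_cluster, method = "spearman")
fviz_dist(res.dist.spearman, lab_size = 2)
```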
We perform hierarchical clustering using ward.D2 and k = 2, observing that fascin and fascin 1 separate from fascin 2.
```{r}
# Compute hierarchical clustering
res.hc <- hclust(res.dist, method = "ward.D2")
# Visualize the dendrogram
plot(res.hc, cex = 0.5)
fviz_dend(res.hc, cex = 0.5)
# Show a corner of the distance matrix
head(round(as.matrix(res.dist), 2))[, 1:6]
# Cut the tree into 2 groups
grp <- cutree(res.hc, k = 2)
# Compute cophenetic distance
res.coph <- cophenetic(res.hc)
# Correlation between cophenetic distance and the original distance
cor(res.dist, res.coph)
knitr::kable(table(grp, data$annotation),
             caption = "Clusters results hierarchical ward.D2 clustering k=2")
```
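For a quicker visual read of the two groups, the dendrogram can also be drawn with the k = 2 cut coloured directly. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Dendrogram with the two clusters coloured and framed
fviz_dend(res.hc, k = 2, cex = 0.5, rect = TRUE, palette = "jco")
```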
```{r}
knitr::kable(data %>%
               select(annotation, reannotated, class, order, species, extlink) %>%
               mutate(cluster = grp) %>%
               filter((cluster == 1 & annotation == "fascin2") |
                        (cluster == 1 & annotation == "fascin2a")),
             caption = "Fascin 2 sequences in cluster 1")
knitr::kable(data %>%
               select(annotation, reannotated, Status, phylum, class, order, species, extlink) %>%
               mutate(cluster = grp) %>%
               filter(cluster == 2 & annotation %!in% c("fascin2", "fascin2a")),
             caption = "Non-fascin 2 sequences in cluster 2")
```
We perform hierarchical clustering using ward.D2 and k = 3.
```{r}
grp <- cutree(res.hc, k = 3)
knitr::kable(table(grp, data$annotation),
             caption = "Clusters results hierarchical ward.D2 clustering k=3")
```
We perform hierarchical k-means clustering using k = 2, again observing that fascin and fascin 1 separate from fascin 2.
```{r}
res.hk <- hkmeans(df_cluster, 2)
# Visualize the tree
fviz_dend(res.hk, cex = 0.6, palette = "jco")
# Visualize the hkmeans final clusters
fviz_cluster(res.hk, palette = "jco", repel = TRUE, ggtheme = theme_classic())
ggsave(filename = "figures/clustering/hierarchical_kmeans_clustering.pdf", dpi = 400)
knitr::kable(table(res.hk$cluster, data$annotation),
             caption = "Clusters results hierarchical k-means clustering")
```
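To check how closely the ward.D2 k = 2 cut and the hierarchical k-means assignments agree, the two partitions can be cross-tabulated. A minimal sketch, not evaluated here; since `grp` currently holds the k = 3 cut, the k = 2 cut is recomputed:

```{r, eval=FALSE}
# Agreement between the ward.D2 k = 2 cut and the hkmeans clusters
grp_k2 <- cutree(res.hc, k = 2)
table(hierarchical = grp_k2, hkmeans = res.hk$cluster)
```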
We perform k-medoids (PAM) using k = 6 and Euclidean distances. We will compare the clusters with the annotation, including fscn2a and fscn2b.
```{r}
library(cluster)  # pam() (already attached by clValid, made explicit here)
pam.res <- pam(df_cluster, 6, metric = "euclidean")
fviz_cluster(object = pam.res, data = df_cluster, ellipse.type = "t", repel = TRUE) +
  theme_bw() +
  labs(title = "PAM clustering results")
knitr::kable(table(pam.res$cluster, data$annotation), caption = "Clusters results PAM")
```
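A silhouette plot gives a quick quality check of the six PAM clusters. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Per-observation silhouette widths for the PAM solution;
# values close to 1 indicate well-separated observations
fviz_silhouette(pam.res, palette = "jco", ggtheme = theme_classic())
```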
We save the assignments from the different clusterings, although we will ultimately rely on the hierarchical k-means result.
```{r}
freezing_cluster <- data %>%
  mutate(cluster.hier = grp,
         cluster.kmeans = res.hk$cluster,
         cluster.pam = pam.res$cluster)
write.csv(freezing_cluster,
          paste(params$output_dir, 'freezing_cluster.csv', sep = "/"),
          row.names = TRUE)
usethis::use_data(freezing_cluster, overwrite = TRUE)
```