```{r}
knitr::opts_chunk$set(echo = TRUE, dpi = 300)
```
In this notebook we apply different approaches to reduce the dimensionality of the data and then perform clustering on the results.
```{r}
library(tidyverse)
library(Boruta)
library(corrplot)
library(caret)
```
```{r}
# Convenience negation of %in%
`%!in%` <- Negate(`%in%`)
# Load the freezing dataset and keep one row per unique sequence
data <- FascinRSCA::freezing %>%
  distinct(fasta_seq, .keep_all = TRUE)
```
We begin by removing the variables that do not provide relevant biological information about the sequences.
```{r}
# Keep the numeric variables, dropping bookkeeping columns
df.numeric <- data %>%
  select(where(is.numeric)) %>%
  select(-c(ndom, Length, nregions, niseqs, pvalue, nincluded, flags, nreported))
# Re-attach the categorical metadata needed later on
df <- df.numeric %>%
  add_column(arch = data$arch,
             phylum = data$phylum,
             kingdom = data$kingdom,
             file_label = data$file_label,
             annotation = data$annotation)
```
Next, we apply a filter to all the variables and to the variables pre-selected by the PCA. This filter removes highly correlated variables so that the maximum absolute pairwise correlation is at most 0.75.
First, we apply it to all the variables, keeping 26 of them:
```{r}
corrplot(cor(df.numeric), method = "color", is.corr = FALSE)
highlyCorrelated_all <- findCorrelation(cor(df.numeric), cutoff = 0.75)
#colnames(df.numeric[, -c(highlyCorrelated_all)])
df.numeric %>%
  select(-all_of(highlyCorrelated_all)) %>%
  cor() %>%
  corrplot(method = "color", is.corr = FALSE)
```
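As a quick sanity check, the variables flagged by `findCorrelation()` and the ones that remain can be listed explicitly. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Variables removed by the 0.75 cutoff and variables kept
removed_all <- colnames(df.numeric)[highlyCorrelated_all]
kept_all    <- setdiff(colnames(df.numeric), removed_all)
removed_all
kept_all
```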
Second, we apply it to the 10 variables that contribute most to the first 15 dimensions of the PCA (that is, those explaining at least 1% of the variance), keeping 5 of them:
```{r}
df.numeric_1per <- df.numeric %>%
  select(charged, acidic, DE, basic, residue_n, weigth, STYNQW, charge, nonPolar, polar)
corrplot(cor(df.numeric_1per), method = "color", is.corr = FALSE)
highlyCorrelated_1per <- findCorrelation(cor(df.numeric_1per), cutoff = 0.75)
#colnames(df.numeric_1per[, -c(highlyCorrelated_1per)])
df.numeric_1per %>%
  select(-highlyCorrelated_1per) %>%
  cor() %>%
  corrplot(method = "color", is.corr = FALSE)
ggsave(filename = "figures/clustering/corrplot_variables.pdf", dpi = 300)
```
This information, although useful, has the drawback that setting an "importance" threshold to discriminate between variables can be somewhat arbitrary:
```{r}
library(FSelector)
Fsel_info <- information.gain(annotation ~ ., df)
Fsel_info %>% arrange(-attr_importance)
```
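If a fixed number of attributes is preferred over an importance threshold, FSelector's `cutoff.k()` can select the top-ranked ones. A minimal sketch, not evaluated here; the choice of k = 10 is an arbitrary illustrative value:

```{r, eval=FALSE}
# Keep the 10 attributes with the highest information gain
top_attrs <- cutoff.k(Fsel_info, k = 10)
top_attrs
```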
Next, we apply two selection algorithms (recursive feature elimination and Boruta), neither of which yields a significant reduction:
```{r}
set.seed(7)
library(mlbench)
library(caret)
control <- rfeControl(functions = rfFuncs,   # random forest
                      method = "repeatedcv", # repeated cross-validation
                      repeats = 5,           # number of repeats
                      number = 10)           # number of folds
y <- data$annotation %>% as.factor()
x <- df.numeric
inTrain <- createDataPartition(y, p = .80, list = FALSE)[, 1]
x_train <- x[ inTrain, ]
x_test  <- x[-inTrain, ]
y_train <- y[ inTrain]
y_test  <- y[-inTrain]
result_rfe1 <- rfe(x = x_train,
                   y = y_train,
                   sizes = c(1:15),
                   rfeControl = control)
# Print the results
summary(result_rfe1)
# Print the selected features
predictors(result_rfe1)
# Print the results visually
ggplot(data = result_rfe1, metric = "Accuracy") + theme_bw()
ggplot(data = result_rfe1, metric = "Kappa") + theme_bw()
```
```{r}
set.seed(800)
data_boruta <- df.numeric %>% mutate(annotation = as.factor(data$annotation))
Boruta.x <- Boruta(annotation ~ ., data = data_boruta, doTrace = 0)
# Results summary
print(Boruta.x)
# Attribute statistics
attStats(Boruta.x)
```
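To extract just the attributes that Boruta confirms as important, `getSelectedAttributes()` can be used. A minimal sketch, not evaluated here; `withTentative = FALSE` drops attributes still marked as tentative:

```{r, eval=FALSE}
# Names of the attributes Boruta confirmed as important
confirmed_attrs <- getSelectedAttributes(Boruta.x, withTentative = FALSE)
confirmed_attrs
```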
We have carried out several tests, obtaining good results in every case. We have chosen to use the 5 variables that remain after removing the highly correlated ones from the 10 pre-selected by the PCA. These are: "basic", "residue_n", "STYNQW", "charge" and "nonPolar".
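The retained names can also be derived programmatically from `findCorrelation()`'s output rather than listed by hand, mirroring the `colnames(cluster_x)` check below. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Variables kept after dropping the highly correlated ones
# from the 10 PCA-preselected variables
setdiff(colnames(df.numeric_1per),
        colnames(df.numeric_1per)[highlyCorrelated_1per])
```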
```{r}
set.seed(1)
# Selected feature set; the commented lines are the alternative sets tried
#cluster_x <- df.numeric %>% select(highlyCorrelated_all)
cluster_x <- df.numeric_1per %>% select(-highlyCorrelated_1per)
#cluster_x <- df[, rownames(Fsel_info)[1:20]]
#cluster_x <- df[, predictors(result_rfe1)]
#cluster_x <- df[, rownames(attStats(Boruta.x))]
colnames(cluster_x)
```
We scale the data and run clValid:
```{r}
library(clValid)
df_cluster <- cluster_x %>% scale()
# Compare clustering methods with internal validation measures
clmethods <- c("hierarchical", "kmeans", "pam")
intern <- clValid(df_cluster, nClust = 2:6,
                  clMethods = clmethods, validation = "internal")
# Summary
summary(intern)
# Stability validation
stab <- clValid(df_cluster, nClust = 2:6,
                clMethods = clmethods, validation = "stability")
# Display only the optimal scores
knitr::kable(optimalScores(stab), caption = "Optimal scores")
# APN, ADM and FOM range from 0 to 1, with smaller values corresponding to
# more consistent clustering results. AD ranges from 0 to infinity, and
# smaller values are also preferred.
```
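The optimal internal-validation scores can be tabulated in the same way as the stability scores. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Best connectivity, Dunn index and silhouette width across methods and k
knitr::kable(optimalScores(intern), caption = "Optimal internal scores")
```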
Next, we perform hierarchical clustering and hierarchical k-means clustering.
We compute a correlation-based (Pearson) distance and visualize the dissimilarity matrix.
```{r}
library(factoextra)
# Correlation-based distance
res.dist <- get_dist(df_cluster, method = "pearson")
# Visualize the dissimilarity matrix
fviz_dist(res.dist, lab_size = 2)
ggsave(filename = "figures/clustering/dissimilarity_matrix_ward.D2_spearman.pdf", dpi = 400)
```
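The saved filename mentions Spearman; if a Spearman-based distance is preferred to Pearson, `get_dist()` supports it directly. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Alternative: Spearman-correlation-based distance
res.dist.spearman <- get_dist(df_cluster, method = "spearman")
fviz_dist(res.dist.spearman, lab_size = 2)
```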
We perform hierarchical clustering using ward.D2 and k = 2, observing that fascin and fascin 1 separate from fascin 2.
```{r}
# Compute hierarchical clustering
res.hc <- hclust(res.dist, method = "ward.D2")
# Visualize the dendrogram
plot(res.hc, cex = 0.5)
fviz_dend(res.hc, cex = 0.5)
# Show a corner of the distance matrix
head(round(as.matrix(res.dist), 2))[, 1:6]
# Cut the tree into 2 groups
grp <- cutree(res.hc, k = 2)
# Compute cophenetic distance
res.coph <- cophenetic(res.hc)
# Correlation between cophenetic distance and the original distance
cor(res.dist, res.coph)
knitr::kable(table(grp, data$annotation),
             caption = "Clusters results hierarchical ward.D2 clustering k=2")
```
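For a quicker visual read of the two groups, the dendrogram can also be drawn with the k = 2 cut coloured directly. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Dendrogram with the two clusters coloured and framed
fviz_dend(res.hc, k = 2, cex = 0.5, rect = TRUE, palette = "jco")
```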
```{r}
knitr::kable(data %>%
               select(annotation, reannotated, class, order, species, extlink) %>%
               mutate(cluster = grp) %>%
               filter((cluster == 1 & annotation == "fascin2") |
                        (cluster == 1 & annotation == "fascin2a")),
             caption = "Fascin 2 sequences in cluster 1")
knitr::kable(data %>%
               select(annotation, reannotated, Status, phylum, class, order, species, extlink) %>%
               mutate(cluster = grp) %>%
               filter(cluster == 2 & annotation %!in% c("fascin2", "fascin2a")),
             caption = "Non-fascin 2 sequences in cluster 2")
```
We perform hierarchical clustering using ward.D2 and k = 3.
```{r}
grp <- cutree(res.hc, k = 3)
knitr::kable(table(grp, data$annotation),
             caption = "Clusters results hierarchical ward.D2 clustering k=3")
```
We perform hierarchical k-means clustering using k = 2, again observing that fascin and fascin 1 separate from fascin 2.
```{r}
res.hk <- hkmeans(df_cluster, 2)
# Visualize the tree
fviz_dend(res.hk, cex = 0.6, palette = "jco")
# Visualize the hkmeans final clusters
fviz_cluster(res.hk, palette = "jco", repel = TRUE, ggtheme = theme_classic())
ggsave(filename = "figures/clustering/hierarchical_kmeans_clustering.pdf", dpi = 400)
knitr::kable(table(res.hk$cluster, data$annotation),
             caption = "Clusters results hierarchical k-means clustering")
```
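To check how closely the ward.D2 k = 2 cut and the hierarchical k-means assignments agree, the two partitions can be cross-tabulated. A minimal sketch, not evaluated here; since `grp` currently holds the k = 3 cut, the k = 2 cut is recomputed:

```{r, eval=FALSE}
# Agreement between the ward.D2 k = 2 cut and the hkmeans clusters
grp_k2 <- cutree(res.hc, k = 2)
table(hierarchical = grp_k2, hkmeans = res.hk$cluster)
```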
We perform k-medoids (PAM) using k = 6 and Euclidean distances. We will compare the clusters with the annotation, including fscn2a and fscn2b.
```{r}
library(cluster)  # pam() (already attached by clValid, made explicit here)
pam.res <- pam(df_cluster, 6, metric = "euclidean")
fviz_cluster(object = pam.res, data = df_cluster, ellipse.type = "t", repel = TRUE) +
  theme_bw() +
  labs(title = "PAM clustering results")
knitr::kable(table(pam.res$cluster, data$annotation), caption = "Clusters results PAM")
```
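A silhouette plot gives a quick quality check of the six PAM clusters. A minimal sketch, not evaluated here:

```{r, eval=FALSE}
# Per-observation silhouette widths for the PAM solution;
# values close to 1 indicate well-separated observations
fviz_silhouette(pam.res, palette = "jco", ggtheme = theme_classic())
```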
We save the assignments from the different clusterings, although we will ultimately rely on the hierarchical k-means result.
```{r}
freezing_cluster <- data %>%
  mutate(cluster.hier = grp,
         cluster.kmeans = res.hk$cluster,
         cluster.pam = pam.res$cluster)
write.csv(freezing_cluster,
          paste(params$output_dir, 'freezing_cluster.csv', sep = "/"),
          row.names = TRUE)
usethis::use_data(freezing_cluster, overwrite = TRUE)
```