In Monoxido45/PhyloHclust: What the Package Does (One Line, Title Case)

Importing needed packages

library(ape)
library(dendextend)
library(cluster)
library(tibble)
library(magrittr)
library(dplyr)
library(phytools)
library(mltools)
library(data.table)
library(factoextra)
source("C:/Users/lucru/Estatística_UFSCar/cv_cluster/modules/convert_to_parenthesis.R")
source("C:/Users/lucru/Estatística_UFSCar/cv_cluster/modules/cv_score.R")
library(tictoc)
library(mvMORPH)
library(dplyr)
library(tidyr)
library(RColorBrewer)
library(MASS)
library(kableExtra)

Importing USArrests:

data("USArrests")
arr.data = USArrests
types = sapply(arr.data, class)
types
bool = (types == "numeric" | types == "integer")
bool
arr.data[, bool]

First testing hiearchical clustering:

hc.obj = hclust(dist(scale(arr.data)), method = "ward.D2")
tree = convert_to_phylo(hc.obj)
plot(tree)

\ After this, testing each one:

tic("Running time for L_score")
score1 = L_score(tree, arr.data)
score1
toc()

tic("Running time for L_score_2")
score2 = L_score_2(tree, arr.data)
score2
toc()

tic("Running time for L_score_3")
score3 = L_score_3(tree, arr.data)
score3
toc()

tic("Running time for L_score_4")
score4 = L_score_4(tree, arr.data)
score4
toc()

types = sapply(arr.data, class)
row_computing2(types, tree, arr.data, tol = 1e-20, 1)

Now we can test for several types of clustering using score 1, 2 and 3:

d = dist(scale(arr.data))
tree.list = list()
hc.list = list(ward.hc = hclust(d, method = "ward.D"),
ward.d2.hc = hclust(d, method = "ward.D2"),
single.hc = hclust(d, method = "single"),
complete.hc = hclust(d, method = "complete"),
ave.hc = hclust(d, method = "average"),
med.hc = hclust(d, method = "median"),
centroid.hc = hclust(d, method = "centroid"),
mcquitty.hc = hclust(d, method = "mcquitty"),
agnes.hc = agnes(scale(arr.data)),
diana.hc = diana(scale(arr.data)))
for (i in (1:length(hc.list))){
  tree.list[[i]] =  convert_to_phylo(hc.list[[i]])
}
names(tree.list) = names(hc.list)

nscores = 3
score.type = factor(c(rep(1, length(tree.list)), rep(2, length(tree.list)), 
                 rep(3, length(tree.list))))
measurements = numeric(length(tree.list)*nscores)
cl.names = rep(names(hc.list), nscores)
tree.rep = rep(tree.list, nscores)
tic("Running time of scoring function for the 10 clustering methdos")
for(j in (1:(nscores*length(tree.list)))){
    if (score.type[j] == 1){
      measurements[j] = L_score(tree.rep[[j]], arr.data)
    }else{
      if (score.type[j] == 2){
      measurements[j] = L_score_2(tree.rep[[j]], arr.data)
      }else{
      measurements[j] = L_score_3(tree.rep[[j]], arr.data)
      }
    }
}
toc()
data.scores =  data.frame("measurements" = measurements,
                          "score_type" = score.type,
                          "cluster" = as.factor(cl.names))
head(data.scores)

\ Now graphing the above data frame:

nb.cols = 10
mycolors = colorRampPalette(brewer.pal(10, "Paired"))(nb.cols)
data.scores %>%
  ggplot( aes(x = score_type, y  = measurements, group = cluster)) +
  geom_line(aes(color = cluster)) +
  geom_point(aes(color = cluster)) +
   scale_colour_manual(values = mycolors) +
  theme_bw()+
  scale_y_continuous(breaks = round(seq(min(data.scores$measurements), 
                                        max(data.scores$measurements), 
                                        by = 3),1)) +
  labs(title = "Comparisson of score type with clustering method",
       x = "Score Type",
       y = "Measurements",
       colour = "Clustering methods") +
  ggsave(filename = "comps_USArrests.png",
         path = "C:/Users/lucru/Estatística_UFSCar/datasets",
                    width = 18.75, height = 11.5, units = "cm")

\ Comparing just the score type 1 and 3

mutated_data.frame = data.scores %>%
  filter(score_type != 2)
mutated_data.frame %>%
  ggplot( aes(x = score_type, y  = measurements, group = cluster)) +
  geom_line(aes(color = cluster)) +
  geom_point(aes(color = cluster)) +
   scale_colour_manual(values = mycolors) +
  theme_bw()+
  labs(title = "Comparisson of score type with clustering method",
       x = "Score Type",
       y = "Measurements",
       colour = "Clustering methods") +
  ggsave(filename = "comps_1_3_USArrests.png",
         path = "C:/Users/lucru/Estatística_UFSCar/datasets",
                    width = 18.75, height = 11.5, units = "cm")

\ Simulating multivariate normal

set.seed(6)
p = 5
# defining mu and sigma
mu_0 = c(2, 4, 6, 1, 5)
n = 5  
A = matrix(runif(n^2,1, 2)*(1.25)-1, ncol=n) 
S = t(A) %*% A
S

set.seed(99)
n =  400
cont.data = as.data.frame(mvrnorm(n, mu =  mu_0, Sigma = S))
colnames(cont.data) = c("X1", "X2", "X3", "X4", "X5")
head(cont.data)

Assessing correlation:

cor(cont.data)

First testing just to Hclust

hc.obj = hclust(dist(scale(cont.data)), method = "ward.D2")
tree = convert_to_phylo(hc.obj)

tic("Running time for L_score")
score1 = L_score(tree, cont.data)
score1
toc()

tic("Running time for L_score_3")
score3 = L_score_3(tree, cont.data)
score3
toc()


tic("Running time for L_score_2")
score2 = L_score_2(tree, cont.data)
score2
toc()

d = dist(scale(cont.data))
tree.list = list()
hc.list = list(ward.hc = hclust(d, method = "ward.D"),
ward.d2.hc = hclust(d, method = "ward.D2"),
single.hc = hclust(d, method = "single"),
complete.hc = hclust(d, method = "complete"),
ave.hc = hclust(d, method = "average"),
med.hc = hclust(d, method = "median"),
centroid.hc = hclust(d, method = "centroid"),
mcquitty.hc = hclust(d, method = "mcquitty"),
agnes.hc = agnes(scale(cont.data)),
diana.hc = diana(scale(cont.data)))
for (i in (1:length(hc.list))){
  tree.list[[i]] =  convert_to_phylo(hc.list[[i]])
}
names(tree.list) = names(hc.list)

nscores = 2
score.type = factor(c(rep(1, length(tree.list)), 
                 rep(3, length(tree.list))))
measurements = numeric(length(tree.list)*nscores)
cl.names = rep(names(hc.list), nscores)
tree.rep = rep(tree.list, nscores)
tic("Running time of scoring function for the 10 clustering methdos")
for(j in (1:(nscores*length(tree.list)))){
    if (score.type[j] == 1){
      measurements[j] = L_score(tree.rep[[j]], cont.data)
    }else{
      measurements[j] = L_score_3(tree.rep[[j]], cont.data)
    }
}
toc()
data.scores =  data.frame("measurements" = measurements,
                          "score_type" = score.type,
                          "cluster" = as.factor(cl.names))
head(data.scores)

Graph of the above data:

data.scores %>%
  ggplot( aes(x = score_type, y  = measurements, group = cluster)) +
  geom_line(aes(color = cluster)) +
  geom_point(aes(color = cluster)) +
   scale_colour_manual(values = mycolors) +
  theme_bw()+
  labs(title = "Comparisson of score type with clustering method",
       x = "Score Type",
       y = "Measurements",
       colour = "Clustering methods") +
  ggsave(filename = "normcomps.png",
         path = "C:/Users/lucru/Estatística_UFSCar/datasets",
                    width = 18.75, height = 11.5, units = "cm")

\ Generating dependent multivariate normal

set.seed(120)
n = 60
p = 2

Y = sample(c(0, 1), n, replace = TRUE, prob = c(0.6, 0.4))
mu_0 = c(2, 4)
mu_1 = c(4, 6)
S_0 = cbind(c(4, 1), c(1, 2))
S_1 = cbind(c(10, 5), c(5, 6))


X = matrix(nrow = n, ncol = p)
X[Y == 0, ] = mvrnorm(sum(Y == 0), mu_0, S_0)
X[Y == 1, ] = mvrnorm(sum(Y == 1), mu_1, S_1)

sim.data = as.data.frame(cbind(X , Y))
sim.data$Y = as.factor(Y)
colnames(sim.data) = c("X1", "X2", "Y")
row.names(sim.data) = c(1:nrow(sim.data))
head(sim.data)

hc.obj = hclust(dist(scale(sim.data[, c(1,2)])), method = "ward.D2")
tree = convert_to_phylo(hc.obj)
plot(tree)

set.seed(99)
pi = matrix(nrow = n, ncol = 2)
tic("Time of several simmap computations")
for(i in (1:n)){
  teste = sim.data$Y
  teste[i] = NA
  onehot = onehotencoder(teste)
  row.names(onehot) = row.names(sim.data)
  fit_discrete = make.simmap(as.multiPhylo(tree), onehot, model = "ER", message = FALSE)
  results = describe.simmap(fit_discrete, plot = F)
  which_row = which(rownames(results$tips) == i)
  pi[i, ] = results$tips[which_row, ]
}
toc()
head(pi)
pred = ifelse(pi[, 2] == 1, 1, 0)
pred

L_score(tree, sim.data)

tipo = sapply(sim.data, class)
row_computing(tipo, tree, sim.data, tol = 1e-20, seed = 45, col = 3)

Checking hits:

hits = sum(pred == sim.data$Y)
hits

Picking results for differents topologies in USArrests:

d = dist(scale(arr.data))
tree.list = list()
hc.list = list(ward.hc = hclust(d, method = "ward.D"),
ward.d2.hc = hclust(d, method = "ward.D2"),
single.hc = hclust(d, method = "single"),
complete.hc = hclust(d, method = "complete"),
ave.hc = hclust(d, method = "average"),
med.hc = hclust(d, method = "median"),
centroid.hc = hclust(d, method = "centroid"),
mcquitty.hc = hclust(d, method = "mcquitty"),
agnes.hc = agnes(scale(arr.data)),
diana.hc = diana(scale(arr.data)))
tree.list = lapply(hc.list, FUN = convert_to_phylo)

tic("Running time for topologie")
results = sapply(tree.list, FUN = L_score, original_data = arr.data)
round(results, 4)

tic("Running time of cross_validation for USArrests")
cross_val = L_cross_val(arr.data)
cross_val
toc()

cross_data = as.data.frame(t(cross_val))
rownames(cross_data) = c("Ward HC", "Ward D2 HC", "Single HC", "Complete HC", "Average HC",
                         "Median HC", "Centroid HC", "Mcquitty HC", "Agnes", "Diana")
marginal_data = cross_data %>%
  mutate(Sum = rowSums(cross_data))
sums = as.data.frame(marginal_data$Sum)
colnames(sums) = "Sum"

round(sums, 4) %>%
  kbl() %>%
kable_classic(full_width = F, html_font = "TimesNewRoman") %>%
  kable_styling(position = "center")

round(cross_data,4) %>%
  kbl() %>%
kable_classic(full_width = F, html_font = "TimesNewRoman") %>%
  kable_styling(position = "center")

Generating interesting graphs for the presentation

d = dist(scale(arr.data))
single.hc = hclust(d, method = "single")
fviz_dend(single.hc, title = "Dendrogram of USArrests for single HC")+
  fviz_dend()

single.tree = convert_to_phylo(single.hc)
x = setNames(arr.data[,1],gsub(" ", "", rownames(arr.data)))
reordered_x = x[single.tree$tip.label]

obj = contMap(single.tree, reordered_x, plot=FALSE)
obj = setMap(obj,invert=TRUE)
plot(obj,fsize=c(0.7,1),outline=FALSE,lwd=c(3,7),leg.txt="Murder")

\ And for Assault

x = setNames(arr.data[,2],gsub(" ", "", rownames(arr.data)))
reordered_x = x[single.tree$tip.label]

obj = contMap(single.tree, reordered_x, plot=FALSE)
obj = setMap(obj,invert=TRUE)
plot(obj,fsize=c(0.7,1),outline=FALSE,lwd=c(3,7),leg.txt="Assault")

Making simulation for IRIS dataset

flowers = iris
d = daisy(flowers, metric = "gower")
flowers.hc = hclust(d, method = "ward.D2")
fviz_dend(flowers.hc)

flowers.tree = convert_to_phylo(flowers.hc)
tic("Computation time  for iris dataset")
hc_score = L_score(flowers.tree,  flowers)
toc()

tic()
cross_val.flowers = L_cross_val(flowers)
cross_val.flowers
toc()

Using multivariate normal dataset:

set.seed(120)
n = 70
p = 2

Y = sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))
mu_0 = c(2, 4)
mu_1 = c(4, 6)
S_0 = cbind(c(4, 1), c(1, 2))
S_1 = cbind(c(10, 5), c(5, 6))


X = matrix(nrow = n, ncol = p)
X[Y == 0, ] = round(mvrnorm(sum(Y == 0), mu_0, S_0), 4)
X[Y == 1, ] = round(mvrnorm(sum(Y == 1), mu_1, S_1), 4)

sim.data = as.data.frame(cbind(X , Y))
sim.data$Y = as.factor(Y)
colnames(sim.data) = c("X1", "X2", "Y")
row.names(sim.data) = c(1:nrow(sim.data))
head(sim.data, 4)

tic("Running time of cross_validation for multivariate normal")
cross_val = L_cross_val(sim.data, cl.method = "factors")
cross_val
toc()

cross_data = as.data.frame(t(cross_val))
rownames(cross_data) = c("Ward HC", "Ward D2 HC", "Single HC", "Complete HC", "Average HC", "Mcquitty HC", "Agnes", "Diana")
marginal_data = cross_data %>%
  mutate(Sum = rowSums(cross_data))
sums = as.data.frame(marginal_data$Sum)
colnames(sums) = "Sum"

round(sums, 4) %>%
  kbl() %>%
kable_classic(full_width = F, html_font = "TimesNewRoman") %>%
  kable_styling(position = "center")

round(cross_data,4) %>%
  kbl() %>%
kable_classic(full_width = F, html_font = "TimesNewRoman") %>%
  kable_styling(position = "center")

Making graph for presentation

d = daisy(sim.data, metric = "gower")
single.hc = hclust(d, method = "single")
single.tree = convert_to_phylo(single.hc)
x = setNames(sim.data$Y, row.names(sim.data))
fitER<-ace(x, single.tree, model="ER", type="discrete")
cols<-setNames(c("red","blue"),levels(x))

plotTree(single.tree,type="fan",fsize=0.7,ftype="i",lwd=1)
nodelabels(node=1:single.tree$Nnode+Ntip(single.tree),
    pie=fitER$lik.anc,piecol=cols,cex=0.4)
tiplabels(pie=to.matrix(x[single.tree$tip.label],
    levels(x)),piecol=cols,cex=0.3)
add.simmap.legend(colors=cols,prompt=FALSE,x=0.9*par()$usr[1],
    y=0.8*par()$usr[3],fsize=0.8)