Importing needed packages

library(ape)
library(dendextend)
library(cluster)
library(tibble)
library(magrittr)
library(dplyr)
library(phytools)
library(limSolve)
library(mltools)
library(data.table)
library(factoextra)
source("C:/Users/lucru/Estatística_UFSCar/cv_cluster/modules/convert_to_parenthesis.R")
library(tictoc)
library(Rphylopars)
library(mvMORPH)

Generating some data and testing the continuous score:

set.seed(99)
data.sim =  data.frame("x1" = rnorm(20, 2, 1),
                       "x2" = runif(20, 3, 6),
                       "x3" = rexp(20))
# using hierarchical clustering
sim.agnes = agnes(scale(data.sim))
dend.agnes = to.dend(sim.agnes)
test = convert_to_par(dend.agnes)
sim.test= read.tree(text = test)

# comparing fviz_dend
fviz_dend(dend.agnes)

\ With normal plot:

plot(sim.test)

\ No difference. So, we use the above dendrogram and each $X_i$ to make the predictions.Initially we have for $X_1$

var_1 = data.sim$x1
var_1[3] = NA
species = row.names(data.sim)
data1 = data.frame("species" = species)
mat = matrix(rep(data.sim$x1, nrow(data.sim)), nrow = nrow(data.sim), ncol = nrow(data.sim))
diag(mat) = rep(NA, nrow(data.sim))
data2 =  as.data.frame(mat)
new_data = cbind(data1, data2)
row.names(data.var) = row.names(data.sim)
names(var_1) = row.names(data.sim)
var_1
new_data

Removing NA index

del.ind = match(NA, var_1)
new.var_1 = var_1[-del.ind]
new.var_1

And now, testing in anc.ML:

fit = anc.ML(sim.test, new.var_1, model = "BM")
fit$missing.x
fit2 = phylopars(new_data, sim.test, phylo_correlated = FALSE, pheno_correlated = FALSE,
                 REML = FALSE)
estim = fit2$anc_recon
estim = estim[as.character(sort(as.numeric(row.names(estim)))),]
missing = diag(estim[1:nrow(data.sim), 1:nrow(data.sim)])
missing

Designing test function:

test_cont = function(data, column, pos, dend){
  if(is.factor(data[, column]) == FALSE){
    var = scale(data[, column])
    var[pos] = NA
    names(var) = row.names(data)
    del.ind = match(NA, var)
    new.var = var[-del.ind]
    fit = anc.ML(dend, new.var, model = "BM")
    return(fit$missing.x)
  }else{
    return(NULL)
  }
}

Testing for $X_2$:

test_cont(data.sim, 2, 3, sim.test)

and for $X_3$:

test_cont(data.sim, 3, 3, sim.test)

Now simulating with a bigger $n$ and testing other functions:

set.seed(99)
n = 350
new_data.sim =  data.frame("x1" = rnorm(n, 2, 1),
                       "x2" = runif(n, 3, 6),
                       "x3" = rexp(n))

sim.agnes = agnes(scale(new_data.sim))
dend.agnes = to.dend(sim.agnes)
test = convert_to_par(dend.agnes)
sim.test= read.tree(text = test)

\ With normal plot:

plot(sim.test)

\ Testing Phylopars function:

species = row.names(new_data.sim)
data1 = data.frame("species" = species)
mat = matrix(rep(data.sim$x1, nrow(new_data.sim)), nrow = nrow(new_data.sim), 
             ncol = nrow(new_data.sim))
diag(mat) = rep(NA, nrow(new_data.sim))
data2 =  as.data.frame(mat)
new_data = cbind(data1, data2)
new_data[, c(1,2)]
tic("Phylopars time:")
fit2 = phylopars(new_data, sim.test, phylo_correlated = FALSE, pheno_correlated = FALSE,
                 REML = FALSE)
estim = fit2$anc_recon
estim = estim[as.character(sort(as.numeric(row.names(estim)))),]
estim
toc()

Testing mvMORPH fitting only one column:

# fitting a phylogenetic model
tic("Running time of fitting")
temp = new_data.sim[, 1]
temp = as.data.frame(temp)
temp[1, ] = NA
colnames(temp) = colnames(new_data.sim)[1]
rownames(temp) = rownames(new_data.sim)
fit = mvBM(sim.test, temp, model = "BM1", method = "inverse")
toc()

Then estimating some missing states:

mat = as.matrix(new_data.sim[, 1])
mat[1,1] = rep(NA, 1)
colnames(mat) = colnames(new_data.sim)[1]
head(mat)
tic("Running time of a single imputation")
imp = estim(sim.test, mat, fit)
imp$estimates[imp$NA_index, 1]
imp$var[]

Making vector of estimatives and variances

estims = numeric(nrow(new_data.sim))
vars = numeric(nrow(new_data.sim))
# assessing the running time of a loop using the above function
tic("Time of a loop in function")
for(i in (1:nrow(new_data.sim))){
  mat = as.matrix(new_data.sim[, 1])
  colnames(mat) = colnames(new_data.sim)[1]
  mat[i, 1] = NA
  imp = estim(sim.test, mat, fit)
  estims[i] = imp$estimates[imp$NA_index, 1]
  vars[i] = imp$var[imp$NA_index]
}
toc()
head(estims)
head(vars)

Now, testing the same but using more data:

tic("Running time of fitting")
fit = mvBM(sim.test, new_data.sim, model = "BMM")
toc()

Estimating missing state for column one:

estims = numeric(nrow(new_data.sim))
vars = numeric(nrow(new_data.sim))
# assessing the running time of a loop using the above function
tic("Time of a loop in function")
for(i in (1:nrow(new_data.sim))){
  mat = as.matrix(new_data.sim)
  colnames(mat) = colnames(new_data.sim)
  mat[i, 1] = NA
  imp = estim(sim.test, mat, fit)
  estims[i] = imp$estimates[imp$NA_index, 1]
  vars[i] = imp$var[imp$NA_index]
}
toc()
head(estims)
head(vars)

It seems that doing just by one state is better than multivariate in matters of time. Now

Assessing number of cores:

cores = detectCores()
cores

For $X_1$:

test_cont(new_data.sim, 1, 3, sim.test)

For $X_2$:

test_cont(new_data.sim, 2, 3, sim.test)

For $X_3$:

test_cont(new_data.sim, 3, 3, sim.test)

Simulating mixed data:

n = 250
set.seed(125)
data.sim = data.frame("x1" = rnorm(n, 2, 1),
                      "x2" = runif(n , 3, 6),
                      "x3" = factor(sample(c(1,2,3), n, replace = TRUE)))
head(data.sim)

Fitting to phylogenetic model

d =  daisy(data.sim, metric = "gower")

sim.hc = hclust(d)
dend.hc = to.dend(sim.hc)
test = convert_to_par(dend.hc)
sim.tree = read.tree(text = test)


tic("Running time of fitting")
fit = mvBM(sim.tree, data.sim, model = "BMM")
toc()
mat = data.sim
mat[1,3] = NA
head(mat)
tic("Running time of a single imputation")
imp = estim(sim.tree, mat, fit)
imp$estimates[1, 3]

It seems to work, lets test our scoring function parallelized with anc.ML, serialized and parallelized with mvMORPH:

set.seed(99)
n = 100
data.sim =  data.frame("x1" = rnorm(n, 2, 1),
                       "x2" = runif(n, 3, 6),
                       "x3" = rexp(n))

# using agnes clustering
sim.agnes = agnes(scale(data.sim))
dend.agnes = to.dend(sim.agnes)
test = convert_to_par(dend.agnes)
sim.test= read.tree(text = test)

tic("Scoring time for simulated data 3")
final_score3 = L_score_3(sim.test, data.sim)
final_score3
toc()


tic("Scoring time for simulated data 1")
final_score = L_score(sim.test, data.sim)
final_score
toc()


tic("Scoring time for simulated data 2")
final_score2 = L_score_2(sim.test,data.sim)
final_score2
toc()

So, for agnes dendrogram we have a score of $3.33658$. Trying other hierarchical methods gives:

sim.diana = diana(scale(new_data.sim))
sim.hc = hclust(dist(scale(new_data.sim)), method = "ward.D2")

dend.diana = to.dend(sim.diana)
test.diana = convert_to_par(dend.diana)
tree.diana = read.tree(text = test.diana)

dend.hc = to.dend(sim.hc)
test.hc = convert_to_par(dend.hc)
tree.hc = read.tree(text = test.hc)

# testing both
tic("Scoring time for diana clustering in simulated data")
final_score.diana = L_score(tree.diana, new_data.sim)
toc()
tic("Scoring time for hierarchical clustering in simulated data")
final_score.hc = L_score(tree.hc, new_data.sim)
toc()
final_score
final_score.hc
final_score.diana

Now, returning to our original data, lets explore more of USArrests:

arr = USArrests
head(arr)

Testing hierarchical clustering with ward distance

row.names(arr) = c(1:nrow(arr))
arr.hc = hclust(dist(scale(arr)))
dend.hc = to.dend(arr.hc)
test = convert_to_par(dend.hc)
arr.tree.hc = read.tree(text = test)

# comparing fviz_dend
fviz_dend(arr.hc)

\ Ploting the tree by default function to check the transformation:

plot(arr.tree.hc)

\ It seems to be right, lets give the anc.ML a try:

test_cont(arr, 1, 3, arr.tree.hc)

Testing the score function now:

tic("Scoring time for diana clustering in USArrests")
L_score(arr.tree.hc, arr)
toc()

Simulating large categorical dataset and then making a distance matrix with gower distance

set.seed(7*14)
n = 85
sim.data = data.frame("X1" = factor(sample(c(1,2,3), n, replace = TRUE)),
                      "X2" = factor(sample(c(1,2,3,4), n, replace = TRUE)),
                      "X3" = factor(sample(c(0, 1), n, replace =  TRUE, prob = c(0.7, 0.3))),
                      "X4" = factor(sample(c(0, 1, 2), n, replace = TRUE, prob = c(0.65, 0.05, 0.3)))
)
head(sim.data)

Making gowe matrix and fitting

d =  daisy(sim.data, metric = "gower")
sim.hc = hclust(d)
dend.hc = to.dend(sim.hc)
test = convert_to_par(dend.hc)
sim.tree = read.tree(text = test)
fviz_dend(sim.hc)
plot(sim.tree)

\ Checking effects of chunk separation and different NA positions for pi:

test = sim.data$X1
test2 = sim.data$X1
names(test) = row.names(sim.data)
names(test2) = row.names(sim.data)
test[4] = NA
test2[3] = NA
onehot1 = onehotencoder(test)
onehot2 = onehotencoder(test2)


fit_discrete = make.simmap(as.multiPhylo(sim.tree), onehot1, model = "ER",
                             message = FALSE, pi = "estimated")

fit_discrete2 = make.simmap(as.multiPhylo(sim.tree), onehot2, model = "ER",
                             message = FALSE, pi = "estimated")

results = describe.simmap(fit_discrete, plot = F)
results2  = describe.simmap(fit_discrete2, plot = F)

results$tips[row.names(results$tips) ==  "4"]
results2$tips[row.names(results$tips) ==  "3"]
Q = as.matrix(fit_discrete$Q)
dim = nrow(Q)
pi = xranges(E = matrix(1, nrow = dim, ncol = dim), F = rep(1, dim),
          G = t(Q), H = rep(0, dim))[, 1]


Q2 = as.matrix(fit_discrete2$Q)
  dim = nrow(Q2)
pi2 = xranges(E = matrix(1, nrow = dim, ncol = dim), F = rep(1, dim),
          G = t(Q2), H = rep(0, dim))[, 1]

pi
pi2
set.seed(190)
test = sim.data$X4
test2 = sim.data$X4
names(test) = row.names(sim.data)
names(test2) = row.names(sim.data)
pos = sample(1:nrow(sim.data), 2)
test[pos[1]] = rep(NA, 1)
test2[pos[2]] = rep(NA, 1)
onehot1 = onehotencoder(test)
onehot2 = onehotencoder(test2)
fit_discrete = make.simmap(as.multiPhylo(sim.tree), onehot1, model = "ER", message = FALSE)

fit_discrete2 = make.simmap(as.multiPhylo(sim.tree), onehot2, model = "ER",
                             message = FALSE)

results = describe.simmap(fit_discrete, plot = F)
results2  = describe.simmap(fit_discrete2, plot = F)

results$tips[row.names(results$tips) ==  pos[1]]
results2$tips[row.names(results$tips) ==  pos[2]]

Doing it with mixed data:

set.seed(99)
n = 85
sim.data = data.frame(
  "X1" = rnorm(n, 2, 1),
  "X2" = runif(n , 3, 6),
  "X3" = factor(sample(c(0, 1), n, replace =  TRUE, prob = c(0.7, 0.3))),
  "X4" = factor(sample(c(0, 1, 2), n, replace = TRUE, prob = c(0.65, 0.05, 0.3))))
head(sim.data)
d =  daisy(sim.data, metric = "gower")
sim.hc = hclust(d)
dend.hc = to.dend(sim.hc)
test = convert_to_par(dend.hc)
sim.tree = read.tree(text = test)
fviz_dend(sim.hc)
set.seed(190)
test = sim.data$X4
test2 = sim.data$X4
names(test) = row.names(sim.data)
names(test2) = row.names(sim.data)
test[sample(1:nrow(sim.data),10)] = rep(NA, 10)
test2[sample(1:nrow(sim.data), 10)] = rep(NA, 10)
priori = as.data.frame(table(test))[, 2]
priori_2 = as.data.frame(table(test2))[, 2]
priori = priori/(sum(priori))
priori_2 = priori_2/(sum(priori_2))

onehot1 = onehotencoder(test)
onehot2 = onehotencoder(test2)
fit_discrete = make.simmap(sim.tree, onehot1, model = "ER", pi  = priori,
                           message = F)


fit_discrete2 = make.simmap(sim.tree, onehot2, model = "ER", pi = priori_2,
                            message = F)


Q = as.matrix(fit_discrete$Q)
dim = nrow(Q)
pi = xranges(E = matrix(1, nrow = dim, ncol = dim), F = rep(1, dim),
          G = t(Q), H = rep(0, dim))[, 1]


Q2 = as.matrix(fit_discrete2$Q)
  dim = nrow(Q2)
pi_2 = xranges(E = matrix(1, nrow = dim, ncol = dim), F = rep(1, dim),
          G = t(Q2), H = rep(0, dim))[, 1]

pi
pi_2

Testing score function now

tic("Scoring time for hierarchical clustering with gower distance")
L_score_2(sim.tree, sim.data)
toc()

Testing Iris dataset:

flowers = iris
head(flowers)
row.names(flowers) = c(1:nrow(flowers))

flowers.onehot = one_hot(as.data.table(flowers))
head(flowers.onehot)
flowers.onehot = flowers.onehot %>%
  dplyr::select(-c("Species_virginica")) %>%
  scale()
flowers.hc= dist(flowers.onehot) %>%
  hclust()
dend.hc = to.dend(flowers.hc)
test = convert_to_par(dend.hc)
flowers.tree.hc = read.tree(text = test)
flowers.tree.hc$edge.length[which(flowers.tree.hc$edge.length %in% c(0))] = 10^(-8)

# comparing fviz_dend
fviz_dend(flowers.hc)

\ Ploting the tree by default function to check the transformation:

plot(flowers.tree.hc)

\ Testing continuous function:

test_cont(flowers, 1, 3, flowers.tree.hc)

Testing the score function now:

tic("Scoring time for hiearchical clustering in iris dataset")
score = L_score(flowers.tree.hc, flowers)
toc()


Monoxido45/PhyloHclust documentation built on Sept. 25, 2024, 3:17 a.m.