################################################################################
## Difficulties ################################################################
################################################################################
rm(list=ls())
WD_mac <- "/Users/paulb/Dropbox/These/Code" # Dossier de travail (Mac)
WD_unb <- "/home/bastide/Dropbox/These/Code" # Dossier de travail (Ubuntu)
WD <- ifelse(file.exists(WD_mac), WD_mac, WD_unb)
setwd(WD)
library(ape)
# library(glmnet) # For Lasso initialization
# library(robustbase) # For robust fitting of alpha
library(ggplot2) # Plot
library(scales) # plot
library(reshape2) # Plot
library(grid) # Plot
treeType <- "simulated_tree"
date <- "2014-11-29_11-01-19"
load(paste0("Results/Simulation_Estimation_Bayou_Design/simulation_ou_on_tree_bayou_design_", treeType, "-", date, ".RData"))
## Tree Structure
ntaxa <- length(tree$tip.label)
times_shared <- compute_times_ca(tree)
distances_phylo <- compute_dist_phy(tree)
## How difficult a problem is ?
compute_difficulty <- function(simest){
## Parameters of the simulation
root.state <- list(random = TRUE,
stationary.root = TRUE,
value.root = NA,
exp.root = beta_0,
var.root = simest$gamma)
params_simu <- list(variance = simest$gamma * 2 * simest$alpha,
root.state = root.state,
shifts = simest$shifts,
selection.strength = simest$alpha,
optimal.value = beta_0)
## Variance - Covariance of the tips
Sigma <- compute_variance_covariance.OU(times_shared = times_shared,
distances_phylo = distances_phylo,
params_old = params_simu)
Sigma_YY <- extract_variance_covariance(Sigma, what="YY")
Sigma_YY_inv <- solve(Sigma_YY)
## Mean at the tips
XX <- simulate_internal(phylo = tree,
process = process,
root.state = root.state,
variance = 2 * simest$alpha * simest$gamma,
shifts = simest$shifts,
selection.strength = simest$alpha,
optimal.value = beta_0)
MU <- extract_simulate_internal(XX, what="expectations", where="tips")
## Weighted mean
mu_0 <- (sum(Sigma_YY_inv))^(-1) * sum(Sigma_YY_inv%*%MU)
## Score
dif <- t(MU - mu_0) %*% Sigma_YY_inv %*% (MU - mu_0)
return(dif)
}
difficulties <- sapply(simestimations, compute_difficulty)
difficulties_alpha_known <- sapply(simestimations_alpha_known, compute_difficulty)
save.image(paste0("Results/Simulation_Estimation_Bayou_Design/simulation_ou_on_tree_bayou_design_simulated_tree-", datestamp, "-with_difficulties", ".RData"))
################################################################################
## Arrange the Data ############################################################
################################################################################
rm(list=ls())
WD_mac <- "/Users/paulb/Dropbox/These/Code" # Dossier de travail (Mac)
WD_unb <- "/home/bastide/Dropbox/These/Code" # Dossier de travail (Ubuntu)
WD <- ifelse(file.exists(WD_mac), WD_mac, WD_unb)
setwd(WD)
library(ape)
#library(glmnet) # For Lasso initialization
#library(robustbase) # For robust fitting of alpha
library(ggplot2) # Plot
library(reshape2) # Plot
library(scales)
library(grid) # Plot
library(plyr)
source("Phylogenetic-EM/R/partitionsNumber.R")
library(mclust)
library(parallel)
treeType <- "simulated_tree"
date <- "2014-11-29_11-01-19"
load(paste0("Results/Simulation_Estimation_Bayou_Design/simulation_ou_on_tree_bayou_design_", treeType, "-", date, "-with_difficulties.RData"))
## Transform list of data into data frame and add difficulties
ddd <- list(alpha_kn = NULL, alpha_un = NULL)
ddd[["alpha_un"]] <- do.call(rbind, simestimations)
ddd[["alpha_un"]] <- cbind(ddd[["alpha_un"]], difficulty = difficulties)
ddd[["alpha_kn"]] <- do.call(rbind, simestimations_alpha_known)
ddd[["alpha_kn"]] <- cbind(ddd[["alpha_kn"]], difficulty = difficulties_alpha_known)
## Transform dd to data frame
df <- list(alpha_kn = NULL, alpha_un = NULL)
for (name in c("alpha_kn", "alpha_un")){
dd <- ddd[[name]]
## Extract data frame like portion
df[[name]] <- apply(dd[ , colnames(dd) %in% c("alpha", "gamma", "K", "n",
"grp", "alpha_estim",
"gamma_estim", "EM_steps",
"DV_estim", "CV_estim",
"difficulty", "beta_0_estim",
"log_likelihood",
"mean_number_new_shifts"
,"number_equivalent_solutions"
)],
2, unlist)
df[[name]] <- as.data.frame(df[[name]])
## Inject list portion
for (column in c("shifts", "start", "shifts_estim", "edge.quality")) {
df[[name]][[column]] <- dd[ , column]
}
}
## Clusterings
subtree.list <- enumerate_tips_under_edges(tree)
tips_clustering <- function(shifts){
return(clusters_from_shifts(tree, shifts$edges, subtree.list))
}
nodes_clustering <- function(clusters){
return(extract.enumerate_parsimony(enumerate_parsimony(tree, clusters)))
}
nodes_clustering_from_shifts <- function(shifts){
return(allocate_regimes_from_shifts(tree, shifts$edges))
}
ARI_nodes <- function(tr_clus, est_clus){
fun <- function(est){
return(adjustedRandIndex(tr_clus, est))
}
apply(est_clus, 1, fun)
}
# Parallel computation of ARIs
Ncores <- 3
for (name in c("alpha_kn", "alpha_un")){
# True
df[[name]][["tips_clusters"]] <- lapply(df[[name]][["shifts"]], tips_clustering)
df[[name]][["nodes_clusters"]] <- mclapply(df[[name]][["shifts"]], nodes_clustering_from_shifts, mc.cores = Ncores)
# Estim
df[[name]][["tips_clusters_estim"]] <- lapply(df[[name]][["shifts_estim"]], tips_clustering)
df[[name]][["nodes_clusters_estim_default"]] <- mclapply(df[[name]][["shifts_estim"]], nodes_clustering_from_shifts, mc.cores = Ncores)
df[[name]][["nodes_clusters_estim"]] <- mclapply(df[[name]][["tips_clusters_estim"]], nodes_clustering, mc.cores = Ncores)
# ARI
df[[name]][["ARI_tips"]] <- mcmapply(adjustedRandIndex, df[[name]][["tips_clusters"]], df[[name]][["tips_clusters_estim"]], mc.cores = Ncores)
df[[name]][["ARI_nodes_default"]] <- mcmapply(adjustedRandIndex, df[[name]][["nodes_clusters"]], df[[name]][["nodes_clusters_estim_default"]], mc.cores = Ncores)
df[[name]][["ARI_nodes_all"]] <- mcmapply(ARI_nodes, df[[name]][["nodes_clusters"]], df[[name]][["nodes_clusters_estim"]], mc.cores = 3)
df[[name]][["ARI_nodes_max"]] <- sapply(df[[name]][["ARI_nodes_all"]], max)
df[[name]][["ARI_nodes_min"]] <- sapply(df[[name]][["ARI_nodes_all"]], min)
}
## Scores Functions
RMSE <- function(y, true.value) {
return(sqrt(mean((y - true.value)^2)))
}
RMSE_initial<- function(df, parameter, true.value) {
## df is a data.frame
y <- lapply(df, function(z) z[[1]][parameter])
## browser()
y <- unlist(y)
return(sqrt(mean((y - true.value)^2, na.rm = TRUE)))
}
RMSE_initial_bis<- function(df, parameter1, parameter2, true.value) {
## df is a data.frame
y <- lapply(df, function(z) z[[1]][[parameter1]][parameter2])
## browser()
y <- unlist(y)
return(sqrt(mean((y - true.value)^2, na.rm = TRUE)))
}
match.edges <- function(tr.edges, est.edges) {
loss.fun <- function(tr, est) {
mean(est$edges %in% tr$edges)
}
z <- mapply(loss.fun, tr.edges, est.edges)
return(mean(z))
}
match.edges.init <- function(tr.edges, start) {
est.edges <- lapply(start, function(z) z[[1]][["shifts"]])
## browser()
return(match.edges(tr.edges, est.edges))
}
match.partitions <- function(tr.edges, est.edges, part.list = subtree.list, n = ntaxa) {
loss.fun <- function(tr, est) {
## computes ARI between partitions induced by edges in tr$edges and est$edges
if (is.null(est$edges)){
return(-1) # If lasso initialization failed
} else {
require(mclust)
part1 <- clusters_from_shifts(tree, tr$edges, part.list)
part2 <- clusters_from_shifts(tree, est$edges, part.list)
return(adjustedRandIndex(part1, part2))
}
}
## browser()
z <- mapply(loss.fun, tr.edges, est.edges)
return(mean(z))
}
match.partitions.init <- function(tr.edges, start) {
est.edges <- lapply(start, function(z) z[[1]][["shifts"]])
return(match.partitions(tr.edges, est.edges))
}
mean.loglik.init <- function(start){
log_liks <- lapply(start, function(z) attr(z[[1]], "log_likelihood")[1])
return(mean(unlist(log_liks)))
}
data <- list(alpha_kn = NULL, alpha_un = NULL)
data2 <- list(alpha_kn = NULL, alpha_un = NULL)
for (name in c("alpha_kn", "alpha_un")){
dd <- ddd[[name]]
## Combinations of "true" parameters (no n)
system.time(
## Aggregation of parameter estimates conditonal upon convergence of EM procedure
data[[name]] <- ddply(subset(df[[name]], CV_estim == 1),
.(alpha, gamma, K, grp),
summarize,
alpha.mean = mean(alpha_estim),
alpha.sd = sd(alpha_estim),
alpha.RMSE = RMSE(alpha_estim, unique(alpha)),
alpha.init.RMSE = RMSE_initial(start,
"selection.strength",
unique(alpha)),
gamma.mean = mean(gamma_estim),
gamma.sd = sd(gamma_estim),
gamma.RMSE = RMSE(gamma_estim, unique(gamma)),
gamma.init.RMSE = RMSE_initial_bis(start,
"root.state",
"exp.root",
unique(gamma)),
beta_0.mean = mean(beta_0_estim),
beta_0.sd = sd(beta_0_estim),
beta_0.RMSE = RMSE(beta_0_estim, beta_0),
beta_0.init.RMSE = RMSE_initial(start,
"optimal.value",
beta_0),
## edge quality is a list of variable size vectors
mean.edge.quality = mean(unlist(edge.quality)),
mean.difficulty = mean(difficulty),
edge.recovery = match.edges(shifts, shifts_estim),
edge.init.recovery = match.edges.init(shifts, start),
adjrandind = match.partitions(shifts, shifts_estim),
adjrandind.init = match.partitions.init(shifts, start),
ARI_tips = mean(ARI_tips),
ARI_nodes_default = mean(ARI_nodes_default),
ARI_nodes_max <- mean(ARI_nodes_max),
ARI_nodes_min <- mean(ARI_nodes_min),
log_likelihood.mean = mean(log_likelihood),
log_likelihood.mean.init = mean.loglik.init(start),
mean.mean_number_new_shifts = mean(mean_number_new_shifts),
mean.number_equivalent_solutions = mean(number_equivalent_solutions),
model_complexity = extract.partitionsNumber(partitionsNumber(tree, unique(K) + 1))
)
)
## Convergence monitoring quantities
data2[[name]] <- ddply(df[[name]],
.(alpha, gamma, K, grp),
summarize,
mean.steps = mean(EM_steps, na.rm = TRUE),
DV.rate = mean(DV_estim),
CV.rate = mean(CV_estim))
data[[name]] <- merge(data[[name]], data2[[name]])
#data[[name]]$K <- factor(data[[name]]$K)
}
## Join known and unknown data together
data[["alpha_kn"]]$alpha_known <- TRUE
data[["alpha_un"]]$alpha_known <- FALSE
data <- do.call(rbind, data)
df[["alpha_kn"]]$alpha_known <- TRUE
df[["alpha_un"]]$alpha_known <- FALSE
df <- do.call(rbind, df)
data_n <- df[, c("alpha", "gamma", "K", "n",
"grp", "alpha_estim",
"gamma_estim", "EM_steps",
"DV_estim", "CV_estim",
"difficulty", "beta_0_estim",
"log_likelihood",
"mean_number_new_shifts",
"number_equivalent_solutions", "alpha_known")]
data_n$model_complexity <- data$model_complexity[match(data_n$K, subset(data, alpha_known)$K)]
rm(ddd, df, difficulties, difficulties_alpha_known, data2, simestimations, simestimations_alpha_known)
save.image(paste0("Results/Simulation_Estimation_Bayou_Design/simulation_ou_on_tree_bayou_design_simulated_tree-", datestamp, "-aranged_bis", ".RData"))
###############################################################################
### Plots #####################################################################
###############################################################################
rm(list=ls())
WD <- "/Users/paulb/Dropbox/These/Code" # Dossier de travail (Mac)
#WD <- "/home/bastide/Dropbox/These/Code" # Dossier de travail (Ubuntu)
setwd(WD)
library(ape)
#library(glmnet) # For Lasso initialization
#library(robustbase) # For robust fitting of alpha
library(ggplot2) # Plot
library(reshape2) # Plot
library(scales)
library(grid) # Plot
library(plyr)
treeType <- "simulated_tree"
datestamp <- "2014-11-28_15-48-04"
load(paste0("Results/Simulation_Estimation_Bayou_Design_new_seg/simulation_ou_on_tree_bayou_design_", treeType, "-", datestamp, "-aranged.RData"))
## Choose known/unknown
PATH <- paste0("Results/Simulation_Estimation_Bayou_Design_new_seg/simulated_tree_", datestamp,"_")
## Other plots
my.labeller <- function(variable, value) {
value <- paste(variable, "==", as.character(value))
value <- lapply(value, function(x) parse(text = x))
return(value)
}
## Alpha RMSE
data_alpha <- subset(data, grp %in% c(2))
p <- ggplot(data_alpha, aes(x = alpha, y = alpha.RMSE / alpha))
p <- p + geom_line(aes(linetype = alpha_known))
p <- p + scale_linetype(name = "Alpha",
breaks = c("FALSE", "TRUE"),
labels = c("Estimated", "Known"))
p <- p + labs(x = expression(paste("True ", alpha)),
y = expression(paste("RMSE(", alpha, ") / ", alpha)))
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw()
p <- p + theme(axis.text = element_text(size = 12),
strip.text = element_text(size = 12))
# legend.position = c(0, 1),
# legend.justification = c(0, 1))
p
ggsave(p, file = paste0(PATH, "alpha_rmse.pdf")
, width = 210, height = 148, units="mm"
)
## Gamma RMSE
data_gamma <- subset(data, grp %in% c(1, 3))
p <- ggplot(data_gamma, aes(x = gamma, y = gamma.RMSE / gamma))
p <- p + geom_line(aes(linetype = alpha_known))
p <- p + scale_linetype(name = "Alpha",
breaks = c("FALSE", "TRUE"),
labels = c("Estimated", "Known"))
p <- p + labs(x = expression(paste("True ", gamma)),
y = expression(paste("RMSE(", gamma, ") / ", gamma)))
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw()
p <- p + theme(axis.text = element_text(size = 12),
strip.text = element_text(size = 12)
##legend.position = c(0, 1),
##legend.justification = c(0, 1)
)
p
ggsave(p, file = paste0(PATH, "gamma_rmse.pdf")
, width = 210, height = 148, units="mm"
)
## CV and DV rates
# data2$grp <- c(rep(1,6), rep(3,3), rep(2,10), rep(3,4), rep(1,4))
# p <- ggplot(data2, aes(x = alpha, y = CV.rate))
# ## p <- p + geom_bar(aes(y = 1 - DV.rate), position = "dodge", stat = "identity", alpha = 0.8) + geom_bar(stat = "identity", position = "dodge")
# p <- p + geom_line()
# p <- p + facet_grid(facets = .~grp, labeller = my.labeller, scales = "fixed")
# p <- p + labs(x = expression(alpha),
# y = "Convergence rate")
# p <- p + scale_linetype(name = "Conv. type")
# p <- p + scale_x_continuous(trans = log_trans(10))
# p <- p + theme_bw() + coord_cartesian(ylim = c(0, 1.1))
# p <- p + theme(axis.text = element_text(size = 12),
# strip.text = element_text(size = 12)
# ##legend.position = c(0, 1),
# ##legend.justification = c(0, 1)
# )
# p
# ggsave(p, file = paste0(PATH, "convergence_rate.pdf")
# , width = 210, height = 148, units="mm"
# )
## CV and DV using grid
grp_name <- c("Alpha Varying", "Gamma Varying", "K Varying")
grp_var <- c(expression(alpha), expression(gamma^2), expression(K))
pdf(paste(PATH, "CV_rate", ".pdf", sep=""),
width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
data.grp <- subset(data, grp %in% c(1, group + 1))
p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "CV.rate"))
p <- p + geom_line(aes(linetype = alpha_known))
p <- p + scale_linetype(name = "Alpha",
breaks = c("FALSE", "TRUE"),
labels = c("Estimated", "Known"))
p <- p + labs(x = grp_var[group],
y = "Convergence rate")
#p <- p + scale_linetype(name = "Conv. type")
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw() + coord_cartesian(ylim = c(0, 1.1))
p <- p + theme(axis.text = element_text(size = 12),
strip.text = element_text(size = 12)
##legend.position = c(0, 1),
##legend.justification = c(0, 1)
)
print(p, vp=vplayout(1, group))
}
dev.off()
## Edge recovery
pdf(paste(PATH, "Edge_Recovery", ".pdf", sep=""),
width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
data.grp <- subset(data, grp %in% c(1, group + 1))
p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "edge.recovery"))
p <- p + geom_line(aes(linetype = alpha_known))
p <- p + scale_linetype(name = "Alpha",
breaks = c("FALSE", "TRUE"),
labels = c("Estimated", "Known"))
p <- p + labs(x = grp_var[group],
y = "Edge Recovery")
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw() + coord_cartesian(ylim = c(0, 1))
p <- p + theme(axis.text = element_text(size = 12),
strip.text = element_text(size = 12)
##legend.position = c(0, 1),
##legend.justification = c(0, 1)
)
print(p, vp=vplayout(1, group))
}
dev.off()
## partition quality
pdf(paste(PATH, "ARI", ".pdf", sep=""),
width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
data.grp <- subset(data, grp %in% c(1, group + 1))
p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "adjrandind"))
p <- p + geom_hline(yintercept = 0)
p <- p + geom_line(aes(linetype = "Estimated")) + geom_line(aes(y = adjrandind.init, linetype = "Initial"))
p <- p + labs(x = grp_var[group],
y = "Adjusted Rand Index")
p <- p + scale_linetype(name = "Shift Allocation")
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw() + coord_cartesian(ylim = c(-0.1, 1))
p <- p + theme(axis.text = element_text(size = 12),
strip.text = element_text(size = 12)
##legend.position = c(0, 1),
##legend.justification = c(0, 1)
)
print(p, vp=vplayout(1, group))
}
dev.off()
## iteration time
pdf(paste(PATH, "Iteration_Steps", ".pdf", sep=""),
width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
data.grp <- subset(data, grp %in% c(1, group + 1))
p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "mean.steps"))
p <- p + geom_hline(yintercept = 1000)
p <- p + geom_line(aes(linetype = alpha_known))
p <- p + scale_linetype(name = "Alpha",
breaks = c("FALSE", "TRUE"),
labels = c("Estimated", "Known"))
p <- p + labs(x = grp_var[group],
y = "Iteration steps")
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw()
p <- p + theme(axis.text = element_text(size = 12),
strip.text = element_text(size = 12)
##legend.position = c(0, 1),
##legend.justification = c(0, 1)
)
print(p, vp=vplayout(1, group))
}
dev.off()
## mean log likelihood
pdf(paste(PATH, "mean_log_likelihood", ".pdf", sep=""),
width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
data.grp <- subset(data, grp %in% c(1, group + 1))
p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "log_likelihood.mean", linetype = "alpha_known"))
p <- p + geom_hline(yintercept = 0)
p <- p + scale_linetype(name = "Alpha",
breaks = c("FALSE", "TRUE"),
labels = c("Estimated", "Known"))
p <- p + geom_line(aes(colour = "Final")) + geom_line(aes(y = log_likelihood.mean.init, colour = "Initial"))
p <- p + labs(x = grp_var[group],
y = "Mean Log Likelihood")
p <- p + scale_colour_discrete(name = "Estimation")
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw()
p <- p + theme(axis.text = element_text(size = 12),
strip.text = element_text(size = 12)
##legend.position = c(0, 1),
##legend.justification = c(0, 1)
)
print(p, vp=vplayout(1, group))
}
dev.off()
## beta_0 RMSE
pdf(paste(PATH, "beta_0_RMSE", ".pdf", sep=""),
width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
data.grp <- subset(data, grp %in% c(1, group + 1))
p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "beta_0.RMSE"))
p <- p + geom_hline(yintercept = 0)
p <- p + geom_line(aes(linetype = alpha_known))
p <- p + scale_linetype(name = "Alpha",
breaks = c("FALSE", "TRUE"),
labels = c("Estimated", "Known"))
p <- p + labs(x = grp_var[group],
y = expression(paste("RMSE(", beta[0], ")")))
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw()
p <- p + theme(axis.text = element_text(size = 12),
strip.text = element_text(size = 12)
##legend.position = c(0, 1),
##legend.justification = c(0, 1)
)
print(p, vp=vplayout(1, group))
}
dev.off()
## Difficulty vs convergence rate
p <- ggplot(subset(df, K == 1), aes(x = difficulty, y = CV_estim,
color = factor(snr), group = factor(snr)))
p <- p + facet_grid(facets = gamma~alpha, labeller = my.labeller, scales = "fixed")
p <- p + geom_point() + scale_x_log10()
p <- p + geom_smooth(method="glm", family="binomial", se=F)
p <- p + ggtitle("1 shift")
p
p <- ggplot(subset(df, K == 3), aes(x = difficulty, y = CV_estim,
color = factor(snr), group = factor(snr)))
p <- p + facet_grid(facets = gamma~alpha, labeller = my.labeller, scales = "fixed")
p <- p + geom_point() + scale_x_log10()
p <- p + geom_smooth(method="glm", family="binomial", se=F)
p <- p + ggtitle("3 shifts")
p
p <- ggplot(subset(df, K == 5), aes(x = difficulty, y = CV_estim,
color = factor(snr), group = factor(snr)))
p <- p + facet_grid(facets = gamma~alpha, labeller = my.labeller, scales = "fixed")
p <- p + geom_point() + scale_x_log10()
p <- p + geom_smooth(method="glm", family="binomial", se=F)
p <- p + ggtitle("5 shifts")
plot(p)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.