## simulations_study/simulation_exploitation_bayou_design_alpha_known.R

################################################################################
## Difficulties ################################################################
################################################################################

rm(list=ls())
WD <- "/Users/paulb/Dropbox/These/Code" # Dossier de travail (Mac)
#WD <- "/home/bastide/Dropbox/These/Code" # Dossier de travail (Ubuntu)
setwd(WD)
library(ape)
# library(glmnet) # For Lasso initialization
# library(robustbase) # For robust fitting of alpha
library(ggplot2) # Plot
library(scales) # plot
library(reshape2) # Plot
library(grid) # Plot

# load("Results/Simulation_Estimation_Bayou_Design_new_seg/simulation_ou_on_tree_bayou_design_alpha_known_simulated_tree-2014-10-27_10-14-24.RData")
load("Results/Simulation_Estimation_Bayou_Design_new_seg/simulation_ou_on_tree_bayou_design_simulated_tree_alpha_known-2014-11-28_14-58-37.RData")

## Tree structure
ntaxa <- length(tree$tip.label)
times_shared <- compute_times_ca(tree)    # times of most recent common ancestors
distances_phylo <- compute_dist_phy(tree) # pairwise phylogenetic distances

## How difficult is each problem?
## The score is the squared Mahalanobis norm of the expected tip values around
## their best constant (GLS) fit: a signal-to-noise measure of the shifts.
compute_difficulty <- function(simest){
  ## Parameters of the simulation
  root.state <- list(random = TRUE,
                     stationary.root = TRUE,
                     value.root = NA,
                     exp.root = beta_0,
                     var.root = simest$gamma)
  params_simu <- list(variance = simest$gamma * 2 * simest$alpha,
                      root.state = root.state,
                      shifts = simest$shifts,
                      selection.strength = simest$alpha,
                      optimal.value = beta_0)
  ## Variance-covariance matrix of the tips
  Sigma <- compute_variance_covariance.OU(times_shared = times_shared,
                                          distances_phylo = distances_phylo,
                                          params_old = params_simu)
  Sigma_YY <- extract_variance_covariance(Sigma, what = "YY")
  Sigma_YY_inv <- solve(Sigma_YY)
  ## Expectations at the tips
  XX <- simulate_internal(phylo = tree,
                          process = process,
                          root.state = root.state,
                          variance = 2 * simest$alpha * simest$gamma,
                          shifts = simest$shifts,
                          selection.strength = simest$alpha,
                          optimal.value = beta_0)
  MU <- extract_simulate_internal(XX, what = "expectations", where = "tips")
  ## GLS-weighted mean of the tip expectations
  mu_0 <- (sum(Sigma_YY_inv))^(-1) * sum(Sigma_YY_inv %*% MU)
  ## Score
  dif <- t(MU - mu_0) %*% Sigma_YY_inv %*% (MU - mu_0)
  return(dif)
}
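
## The score above is the squared Mahalanobis norm of the expected tip values
## around their best constant fit, dif = (MU - mu_0)' Sigma_YY^{-1} (MU - mu_0),
## where mu_0 is the generalized-least-squares (GLS) intercept. A minimal
## standalone check of the mu_0 formula on toy values (hypothetical, not the
## simulation objects above):
toy_Sigma <- matrix(c(1, 0.5, 0.5, 1), 2, 2)
toy_MU <- c(1, 3)
toy_Sinv <- solve(toy_Sigma)
mu_0_manual <- sum(toy_Sinv %*% toy_MU) / sum(toy_Sinv)
## Same intercept from a whitened regression: minimize (y - m)' Sinv (y - m)
U <- chol(toy_Sinv)  # U'U = Sigma^{-1}
fit <- lm(as.vector(U %*% toy_MU) ~ as.vector(U %*% rep(1, 2)) - 1)
stopifnot(isTRUE(all.equal(unname(coef(fit)), mu_0_manual)))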

difficulties <- sapply(simestimations_alpha_known, compute_difficulty)

save.image(paste0("Results/Simulation_Estimation_Bayou_Design_new_seg/simulation_ou_on_tree_bayou_design_simulated_tree_alpha_known-", datestamp, "-with_difficulties", ".RData"))

################################################################################
## Arrange the Data ############################################################
################################################################################

rm(list=ls())
WD <- "/Users/paulb/Dropbox/These/Code" # Dossier de travail (Mac)
# WD <- "/home/bastide/Dropbox/These/Code" # Dossier de travail (Ubuntu)
setwd(WD)
library(ape)
#library(glmnet) # For Lasso initialization
#library(robustbase) # For robust fitting of alpha
library(ggplot2) # Plot
library(reshape2) # Plot
library(scales)
library(grid) # Plot
library(plyr)

load("Results/Simulation_Estimation_Bayou_Design_new_seg/simulation_ou_on_tree_bayou_design_simulated_tree_alpha_known-2014-11-03_16-54-47-with_difficulties.RData")

## Transform list of data into data frame and add difficulties
dd <- do.call(rbind, simestimations_alpha_known)
dd <- cbind(dd, difficulty = difficulties)

## Transform dd into a proper data frame
## Extract the atomic (data-frame-like) columns
df <- apply(dd[ , colnames(dd) %in% c("alpha", "gamma", "K", "n",
                                      "grp", "alpha_estim",
                                      "gamma_estim", "EM_steps", 
                                      "DV_estim", "CV_estim",
                                      "difficulty", "beta_0_estim",
                                      "log_likelihood",
                                      "mean_number_new_shifts")],
            2, unlist)
df <- as.data.frame(df)
## Inject list portion
for (column in c("shifts", "start", "shifts_estim", "edge.quality")) {
  df[[column]] <- dd[ , column]
}

## Score functions
## Pre-compute, for each edge, the set of tips lying below it
temp <- prop.part(tree)
ntaxa <- length(tree$tip.label)
subtree.list <- vector("list", nrow(tree$edge))
for (i in 1:nrow(tree$edge)) {
  node <- tree$edge[i, 2]
  if (node > ntaxa) {
    subtree.list[[i]] <- temp[[node - ntaxa]] # internal edge: tips of the clade
  } else {
    subtree.list[[i]] <- node # terminal edge: the tip itself
  }
}

RMSE <- function(y, true.value) {
  return(sqrt(mean((y - true.value)^2)))
}
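
## Tiny usage check with hypothetical estimates around a true value of 1:
RMSE(c(1.1, 0.9, 1.2), 1)  # sqrt(mean(c(0.01, 0.01, 0.04))) ~ 0.141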

RMSE_initial <- function(start, parameter, true.value) {
  ## start is a list column: each element holds an initialization whose first
  ## component stores the starting parameters
  y <- lapply(start, function(z) z[[1]][parameter])
  y <- unlist(y)
  return(sqrt(mean((y - true.value)^2, na.rm = TRUE)))
}

match.edges <- function(tr.edges, est.edges) {
  ## Proportion of estimated shift edges that are true shift edges,
  ## averaged over replications
  loss.fun <- function(tr, est) {
    mean(est$edges %in% tr$edges)
  }
  z <- mapply(loss.fun, tr.edges, est.edges)
  return(mean(z))
}
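
## Toy check of this precision-like score, with hypothetical edge indices:
## truth carries shifts on edges {3, 7}, the estimation returns {3, 8}.
match.edges(list(list(edges = c(3, 7))),
            list(list(edges = c(3, 8))))  # 0.5: one of two estimated edges is true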

match.edges.init <- function(tr.edges, start) {
  est.edges <- lapply(start, function(z) z[[1]][["shifts"]])
  return(match.edges(tr.edges, est.edges))
}

match.partitions <- function(tr.edges, est.edges, part.list = subtree.list, n = ntaxa) {
  loss.fun <- function(tr, est) {
    ## ARI between the tip partitions induced by the edges in tr$edges and est$edges
    if (is.null(est$edges)) {
      return(-1) # Flag value: the lasso initialization failed
    } else {
      require(mclust)
      part1 <- rep(0, n)
      edges <- sort(tr$edges)
      for (i in 1:length(edges)) {
        part1[part.list[[edges[i]]]] <- i
      }
      part2 <- rep(0, n)
      edges <- sort(est$edges)
      for (i in 1:length(edges)) {
        part2[part.list[[edges[i]]]] <- i
      }
      return(adjustedRandIndex(part1, part2))
    }
  }
  z <- mapply(loss.fun, tr.edges, est.edges)
  return(mean(z))
}
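
## The ARI compares the induced tip partitions directly, so it is insensitive
## to how the shifts are labelled. Toy check with hypothetical cluster labels:
## the same partition of four tips, relabelled, gets a perfect score.
library(mclust)
adjustedRandIndex(c(0, 1, 1, 2), c(5, 3, 3, 4))  # 1: identical partitions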

match.partitions.init <- function(tr.edges, start) {
  est.edges <- lapply(start, function(z) z[[1]][["shifts"]])
  return(match.partitions(tr.edges, est.edges))
}

mean.loglik.init <- function(start){
  log_liks <- lapply(start, function(z) attr(z[[1]], "log_likelihood")[1])
  return(mean(unlist(log_liks)))
}

## Combinations of "true" parameters (no n)
system.time(
  ## Aggregation of parameter estimates, conditional upon convergence of the EM procedure
  data <- ddply(subset(df, CV_estim == 1),
                .(alpha, gamma, K, grp),
                summarize,
                alpha.mean = mean(alpha_estim),
                alpha.sd = sd(alpha_estim),
                alpha.RMSE = RMSE(alpha_estim, unique(alpha)),
                alpha.init.RMSE = RMSE_initial(start,
                                               "selection.strength",
                                               unique(alpha)),
                gamma.mean = mean(gamma_estim),
                gamma.sd = sd(gamma_estim),
                gamma.RMSE = RMSE(gamma_estim, unique(gamma)),
                beta_0.mean = mean(beta_0_estim),
                beta_0.sd = sd(beta_0_estim),
                beta_0.RMSE = RMSE(beta_0_estim, beta_0),
                ## edge.quality is a list of variable-size vectors
                mean.edge.quality = mean(unlist(edge.quality)),
                mean.difficulty = mean(difficulty),
                edge.recovery = match.edges(shifts, shifts_estim),
                edge.init.recovery = match.edges.init(shifts, start),
                adjrandind = match.partitions(shifts, shifts_estim),
                adjrandind.init = match.partitions.init(shifts, start),
                log_likelihood.mean = mean(log_likelihood),
                log_likelihood.mean.init = mean.loglik.init(start)
  )
)

## Convergence monitoring quantities
data2 <- ddply(df,
               .(alpha, gamma, K, grp),
               summarize,
               mean.steps = mean(EM_steps, na.rm = TRUE),
               DV.rate = mean(DV_estim),
               CV.rate = mean(CV_estim))

data <- merge(data, data2)
# data$K <- factor(data$K) # optionally treat K as a factor for plotting

rm(dd, df, difficulties)

save.image(paste0("Results/Simulation_Estimation_Bayou_Design_new_seg/simulation_ou_on_tree_bayou_design_simulated_tree_alpha_known-", datestamp, "-arranged", ".RData"))

################################################################################
## Plots #######################################################################
################################################################################
rm(list=ls())
#WD <- "/Users/paulb/Dropbox/These/Code" # Dossier de travail (Mac)
WD <- "/home/bastide/Dropbox/These/Code" # Dossier de travail (Ubuntu)
setwd(WD)
library(ape)
#library(glmnet) # For Lasso initialization
#library(robustbase) # For robust fitting of alpha
library(ggplot2) # Plot
library(reshape2) # Plot
library(scales)
library(grid) # Plot
library(plyr)

load("Results/Simulation_Estimation_Bayou_Design_new_seg/simulation_ou_on_tree_bayou_design_simulated_tree_alpha_known-2014-11-03_16-54-47-aranged.RData")


## Output path prefix for the figures
PATH <- paste0("Results/Simulation_Estimation_Bayou_Design_new_seg/simulated_tree_alpha_known_", datestamp, "_")


## Facet labeller: renders "variable == value" as plotmath
my.labeller <- function(variable, value) {
  value <- paste(variable, "==", as.character(value))
  value <- lapply(value, function(x) parse(text = x))
  return(value)
}
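
## The two-argument labeller signature above was dropped in ggplot2 >= 2.0.
## If running under a recent ggplot2, a sketch of an equivalent for the
## facet_grid(gamma ~ alpha, ...) calls below (an assumption, not part of the
## original script):
my.labeller.modern <- label_bquote(rows = gamma == .(gamma),
                                   cols = alpha == .(alpha))
## e.g.: p + facet_grid(gamma ~ alpha, labeller = my.labeller.modern)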


## Alpha RMSE
data_alpha <- subset(data, grp %in% c(2))
p <- ggplot(data_alpha, aes(x = alpha, y = alpha.RMSE / alpha))
p <- p + geom_line() ## + geom_line(aes(y = alpha.init.RMSE), linetype = 2)
p <- p + labs(x = expression(paste("True ", alpha)),
              y = expression(paste("RMSE(", alpha, ") / ", alpha)))
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw()
p <- p + theme(axis.text = element_text(size = 12),
               strip.text = element_text(size = 12))
# legend.position = c(0, 1),
# legend.justification = c(0, 1))
p
ggsave(p, file = paste0(PATH, "alpha_rmse.pdf")
       , width = 210, height = 148, units="mm"
)

## Gamma RMSE
data_gamma <- subset(data, grp %in% c(1, 3))
p <- ggplot(data_gamma, aes(x = gamma, y = gamma.RMSE / gamma))
p <- p + geom_line() ## + geom_line(aes(y = gamma.init.RMSE), linetype = 2)
p <- p + labs(x = expression(paste("True ", gamma)),
              y = expression(paste("RMSE(", gamma, ") / ", gamma)))
p <- p + scale_x_continuous(trans = log_trans(10))
p <- p + theme_bw()
p <- p + theme(axis.text = element_text(size = 12),
               strip.text = element_text(size = 12)
               ##legend.position = c(0, 1),
               ##legend.justification = c(0, 1)
)
p
ggsave(p, file = paste0(PATH, "gamma_rmse.pdf")
       , width = 210, height = 148, units="mm"
)


## CV and DV rates
# data2$grp <- c(rep(1,6), rep(3,3), rep(2,10), rep(3,4), rep(1,4))
# p <- ggplot(data2, aes(x = alpha, y = CV.rate))
# ## p <- p + geom_bar(aes(y = 1 - DV.rate), position = "dodge", stat = "identity", alpha = 0.8) + geom_bar(stat = "identity", position = "dodge")
# p <- p + geom_line()
# p <- p + facet_grid(facets = .~grp, labeller = my.labeller, scales = "fixed")
# p <- p + labs(x = expression(alpha),
#               y = "Convergence rate")
# p <- p + scale_linetype(name = "Conv. type")
# p <- p + scale_x_continuous(trans = log_trans(10))
# p <- p + theme_bw() + coord_cartesian(ylim = c(0, 1.1))
# p <- p + theme(axis.text = element_text(size = 12),
#                strip.text = element_text(size = 12)
#                ##legend.position = c(0, 1),
#                ##legend.justification = c(0, 1)
# )
# p
# ggsave(p, file = paste0(PATH, "convergence_rate.pdf")
#        , width = 210, height = 148, units="mm"
# )

## CV and DV using grid
grp_name <- c("Alpha Varying", "Gamma Varying", "K Varying")
## grp_var doubles as plotmath axis labels and (through aes_string) as parsed
## x expressions: note the second panel hence plots against gamma^2, not gamma.
grp_var <- c(expression(alpha), expression(gamma^2), expression(K))
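
## vplayout() is called in the plotting blocks below but never defined in this
## file (it presumably lived in the saved workspace). The standard grid helper
## is restated here so the script is self-contained:
vplayout <- function(x, y) viewport(layout.pos.row = x, layout.pos.col = y)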

pdf(paste(PATH, "CV_rate", ".pdf", sep=""),
    width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
  data.grp <- subset(data2, grp %in% c(1, group + 1))
  p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "CV.rate"))
  p <- p + geom_line()
  p <- p + labs(x = grp_var[group],
                y = "Convergence rate")
  p <- p + scale_linetype(name = "Conv. type")
  p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw() + coord_cartesian(ylim = c(0, 1.1))
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12)
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  print(p, vp=vplayout(1, group))
}
dev.off()


## Edge recovery
pdf(paste(PATH, "Edge_Recovery", ".pdf", sep=""),
    width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
  data.grp <- subset(data, grp %in% c(1, group + 1))
  p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "edge.recovery"))
  p <- p + geom_line()
  p <- p + labs(x = grp_var[group],
                y = "Edge Recovery")
  p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw() + coord_cartesian(ylim = c(0, 1))
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12)
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  print(p, vp=vplayout(1, group))
}
dev.off()

## Partition quality (ARI)
pdf(paste(PATH, "ARI", ".pdf", sep=""),
    width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
  data.grp <- subset(data, grp %in% c(1, group + 1))
  p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "adjrandind"))
  p <- p + geom_hline(yintercept = 0)
  p <- p + geom_line(aes(linetype = "Estimated")) + geom_line(aes(y = adjrandind.init, linetype = "Initial"))
  p <- p + labs(x = grp_var[group],
                y = "Adjusted Rand Index")
  p <- p + scale_linetype(name = "Shift Allocation")
  p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw() + coord_cartesian(ylim = c(-0.1, 1))
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12)
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  print(p, vp=vplayout(1, group))
}
dev.off()

## Iteration steps
pdf(paste(PATH, "Iteration_Steps", ".pdf", sep=""),
    width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
  data.grp <- subset(data, grp %in% c(1, group + 1))
  p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "mean.steps"))
  p <- p + geom_hline(yintercept = 1000)
  p <- p + geom_line() 
  p <- p + labs(x = grp_var[group],
                y = "Iteration steps")
  p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw()
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12)
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  print(p, vp=vplayout(1, group))
}
dev.off()

## mean log likelihood
pdf(paste(PATH, "mean_log_likelihood", ".pdf", sep=""),
    width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
  data.grp <- subset(data, grp %in% c(1, group + 1))
  p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "log_likelihood.mean"))
  p <- p + geom_hline(yintercept = 0)
  p <- p + geom_line(aes(linetype = "Final")) + geom_line(aes(y = log_likelihood.mean.init, linetype = "Initial"))
  p <- p + labs(x = grp_var[group],
                y = "Mean Log Likelihood")
  p <- p + scale_linetype(name = "Estimation")
  p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw()
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12)
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  print(p, vp=vplayout(1, group))
}
dev.off()

## beta_0 RMSE
pdf(paste(PATH, "beta_0_RMSE", ".pdf", sep=""),
    width = 12, height = 8)
pushViewport(viewport(layout = grid.layout(1, 3, heights = unit(c(5), "null"))))
for (group in 1:3) {
  data.grp <- subset(data, grp %in% c(1, group + 1))
  p <- ggplot(data.grp , aes_string(x = grp_var[group], y = "beta_0.RMSE"))
  p <- p + geom_hline(yintercept = 0)
  p <- p + geom_line()
  p <- p + labs(x = grp_var[group],
                y = expression(paste("RMSE(", beta[0], ")")))
  p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw()
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12)
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  print(p, vp=vplayout(1, group))
}
dev.off()

## Difficulty vs convergence rate
## Note: these plots need the per-replication data frame `df` (removed before
## the "-arranged" image was saved) together with a signal-to-noise factor
## `snr`; rebuild or reload them before running this block.
for (K_val in c(1, 3, 5)) {
  p <- ggplot(subset(df, K == K_val),
              aes(x = difficulty, y = CV_estim,
                  color = factor(snr), group = factor(snr)))
  p <- p + facet_grid(facets = gamma ~ alpha, labeller = my.labeller, scales = "fixed")
  p <- p + geom_point() + scale_x_log10()
  ## method.args replaces the old family = "binomial" pass-through (ggplot2 >= 2.0)
  p <- p + geom_smooth(method = "glm", method.args = list(family = "binomial"),
                       se = FALSE)
  p <- p + ggtitle(paste0(K_val, " shift", if (K_val > 1) "s" else ""))
  print(p)
}