PhylogeneticEM: Automatic Shift Detection using a Phylogenetic EM

#############################################
## Exploitation several K
#############################################

##############
## Parameters and functions
##############

require(doParallel)
require(foreach)
require(ape)
#require(glmnet) # For Lasso initialization
require(quadrupen) # For Lasso initialization
require(robustbase) # For robust fitting of alpha
library(TreeSim) # For simulation of the tree
require(ggplot2) # Plot
library(scales) # plot
library(reshape2) # Plot
library(grid) # Plot
library(plyr)

## Alpha known - simulation model
# saveresultfile = "../Results/Simulations_Several_K/several_K_estimations"
# datestamp_day <- "2015-03-17"
# ak <- "_alpha_known"
# student <- ""
# load(paste0(saveresultfile, ak, "-", datestamp_day, "_all.RData"))

## Alpha unknown - model simulation
# saveresultfile = "../Results/Simulations_Several_K/several_K_estimations"
# datestamp_day <- "2015-03-24"
# ak <- ""
# student <- ""
# load(paste0(saveresultfile, ak, "-", datestamp_day, "_all.RData"))

# ## Alpha unknown 
# saveresultfile = "../Results/Simulations_Student/student_estimations"
# datestamp_day <- "2015-09-09"
# ak <- ""
# student <- "_df5"
# favorable <- "favorables"
# load(paste0(saveresultfile, favorable, student, ak, "-", datestamp_day, "_all.RData"))

saveresultfile = "../Results/Simulations_Several_K/several_K_estimations"
datestamp_day <- "2015-03-17" # 17 24

ak <- "_alpha_known" # ""

load(paste0(saveresultfile, ak, "-", datestamp_day, "_all.RData"))

## Source functions
source("R/simulate.R")
source("R/estimateEM.R")
source("R/init_EM.R")
source("R/E_step.R")
source("R/M_step.R")
source("R/shutoff.R")
source("R/generic_functions.R")
source("R/shifts_manipulations.R")
source("R/plot_functions.R")
source("R/parsimonyNumber.R")
source("R/partitionsNumber.R")
source("R/model_selection.R")

# saveresultfile = "../Results/Simulations_Several_K/several_K"
# datestamp_day <- "2015-03-31_0"
# 
# ak <- "_alpha_0" # "_alpha_known"
# 
# load(paste0(saveresultfile, ak, "-", datestamp_day, ".RData"))

#########################################################
## Time computation
#########################################################
add_total_time <- function(simest){
  simest$results_summary$total_time <- sum(simest$results_summary$time)
  return(simest)
}

simests <- as.name(paste0("simestimations", favorable, student, ak))

assign(paste(simests), lapply(eval(simests), add_total_time))

#########################################################
## Model Selection
#########################################################

## Compute penalty and criteria for each set of parameters
penalty <- function(K_try, complexity, ntaxa){
  return(1/2 * penalty_BaraudGiraudHuet_likelihood(K_try, 
                                                   complexity, 
                                                   ntaxa, 
                                                   C = 1.1))
} 

criteria <- function(ll, pen){
  return(-ll + pen)
}

selected_K <- function(crit_ll, K_try){
  K_try[which.min(crit_ll)]
}

add_crit_and_pen <- function(z){
  z$pen_ll <- penalty(z$K_try,
                      z$complexity,
                      unique(z$ntaxa))
  z$crit_ll <- criteria(z$log_likelihood,
                        z$pen_ll)
  z$K_select <- z$K_try[which.min(z$crit_ll)]
  return(z)
}

## Find K_select and add it to the list of results
add_K_select_to_list <- function(simest){
  simest$results_summary <- ddply(simest$results_summary,
                                  .(alpha, gamma, K, n, ntaxa, grp),
                                  add_crit_and_pen)
  simest$K_select <- unique(simest$results_summary$K_select)
  return(simest)
}

assign(paste(simests), lapply(eval(simests), add_K_select_to_list))

#########################################################
## Model Selection Birge Massart Capushe
#########################################################
library(capushe)
## Compute penalty and criteria for each set of parameters
pen_shape_BM1 <- function(K_try, complexity){
  return(penalty_BirgeMassart_shape1(K_try,
                                     p = 1,
                                     complexity,
                                     B = 0.1))
} 

pen_shape_BM2 <- function(K_try, complexity){
  return(penalty_BirgeMassart_shape2(K_try,
                                     p = 1,
                                     complexity,
                                     C = 2.5))
} 

model_selection_capushe <- function(K_try, complexity, log_likelihood, pen_shape){
  ## If not enaugh data (ntaxa = 64), return NAs
  if (length(K_try) < 11){
    return(list(pen_DDSE = NA,
                crit_DDSE = NA,
                pen_Djump = NA,
                crit_Djump = NA))
  }
  ## Format data for capushe
  data_capushe <- data.frame(names = K_try, 
                             pen_shape = pen_shape,
                             complexity = complexity,
                             contrast = -log_likelihood)
  ## Capushe
  cap_res <- capushe(data_capushe)
  ## Assign results
  # DDSE
  pen_DDSE <- 2 * cap_res@DDSE@interval$interval["max"] * pen_shape
  crit_DDSE <- data_capushe$contrast + pen_DDSE
  # Djump
  pen_Djump <- cap_res@Djump@ModelHat$Kopt * pen_shape
  crit_Djump <- data_capushe$contrast + pen_Djump
  return(list(pen_DDSE = pen_DDSE,
              crit_DDSE = crit_DDSE,
              pen_Djump = pen_Djump,
              crit_Djump = crit_Djump))
}

selected_K <- function(crit_ll, K_try){
  if (anyNA(crit_ll)) return(NA)
  return(K_try[which.min(crit_ll)])
}

add_crit_and_pen_BirgeMassart1 <- function(z){
  pen_shape <- pen_shape_BM1(z$K_try, z$complexity)
  tmp <- model_selection_capushe(z$K_try, z$complexity, z$log_likelihood,
                                 pen_shape)
  z$pen_BM1_DDSE <- tmp$pen_DDSE
  z$crit_BM1_DDSE <- tmp$crit_DDSE
  z$pen_BM1_Djump <- tmp$pen_Djump
  z$crit_BM1_Djump <- tmp$crit_Djump
  z$K_select_BM1_DDSE <- selected_K(z$crit_BM1_DDSE, z$K_try)
  z$K_select_BM1_Djump <- selected_K(z$crit_BM1_Djump, z$K_try)
  return(z)
}

add_crit_and_pen_BirgeMassart2 <- function(z){
  pen_shape <- pen_shape_BM2(z$K_try, z$complexity)
  tmp <- model_selection_capushe(z$K_try, z$complexity, z$log_likelihood,
                                 pen_shape)
  z$pen_BM2_DDSE <- tmp$pen_DDSE
  z$crit_BM2_DDSE <- tmp$crit_DDSE
  z$pen_BM2_Djump <- tmp$pen_Djump
  z$crit_BM2_Djump <- tmp$crit_Djump
  z$K_select_BM2_DDSE <- selected_K(z$crit_BM2_DDSE, z$K_try)
  z$K_select_BM2_Djump <- selected_K(z$crit_BM2_Djump, z$K_try)
  return(z)
}

## Find K_select and add it to the list of results
add_K_select_to_list_BirgeMassart <- function(simest){
  simest$results_summary <- ddply(simest$results_summary,
                                  .(alpha, gamma, K, n, ntaxa, grp),
                                  add_crit_and_pen_BirgeMassart1)
  simest$results_summary <- ddply(simest$results_summary,
                                  .(alpha, gamma, K, n, ntaxa, grp),
                                  add_crit_and_pen_BirgeMassart2)
  simest$K_select_BM1_DDSE <- unique(simest$results_summary$K_select_BM1_DDSE)
  simest$K_select_BM1_Djump <- unique(simest$results_summary$K_select_BM1_Djump)
  simest$K_select_BM2_DDSE <- unique(simest$results_summary$K_select_BM2_DDSE)
  simest$K_select_BM2_Djump <- unique(simest$results_summary$K_select_BM2_Djump)
  return(simest)
}

assign(paste(simests), lapply(eval(simests), add_K_select_to_list_BirgeMassart))

######################
## Extract inferences with K_true or K_select
######################

## Extract the results where K_try = K_e (K_true or something else)
extract_result <- function(simest, Ke){
  K_try <- 0:(length(simest$params_estim) - 1)
  if (Ke >= length(simest$params_estim)) return(NULL)
  fun <- function(l){
    if (Ke %in% names(l)) return(l[[paste(Ke)]])
    return(l)
  }
  res <- lapply(simest, fun)
  res$results_summary  <-  subset(simest$results_summary, K_try == Ke)
  return(res)
}

## Extract K_true
extract_result_K_true <- function(simest){
  K_true <- simest$K
  return(extract_result(simest, K_true))
}

simests_K_true <-  as.name(paste0("simestimations", favorable, student, ak, "_K_true"))
assign(paste(simests_K_true), lapply(eval(simests), extract_result_K_true))

## Extract K_select
extract_result_K_select <- function(simest){
  K_select <- simest$K_select
  return(extract_result(simest, K_select))
}

simests_K_select <-  as.name(paste0("simestimations", favorable, student, ak, "_K_select"))
assign(paste(simests_K_select), lapply(eval(simests), extract_result_K_select))
       
# #########################################################
# ## Computation of means at tips
# #########################################################
# compute_mean_estim <- function(simest){
#   if (is.null(simest)){
#     return(NULL)
#   } else {
#   XX <- simulate_internal(phylo = trees[[paste(simest$ntaxa)]],
#                  process = "OU",
#                  root.state = simest$params_estim$root.state, 
#                  variance = simest$params_estim$variance,
#                  shifts = simest$params_estim$shifts, 
#                  selection.strength = simest$params_estim$selection.strength, 
#                  optimal.value = simest$params_estim$optimal.value)
#   m_Y_estim <- extract_simulate_internal(XX, where="tips", what="expectations")
#   simest$m_Y_estim <- m_Y_estim
#   return(simest)
#   }
# }
# 
# simestimations_alpha_known_K_true <- lapply(simestimations_alpha_known_K_true,
#                                             compute_mean_estim)
# simestimations_alpha_known_K_select<-lapply(simestimations_alpha_known_K_select,
#                                               compute_mean_estim)

###########################################
## Scores involving lists
## (Compute complex score and add them to result summary for each element of the list)
###########################################
sensitivity <- function(tr.edges, est.edges) { # Sensibilité
  if (is.null(tr.edges) ) return(NA) # If no true shift at all, any shift found is misplaced
  if (is.null(est.edges)) return(0) # If no estimated shift, TP = 0
  return(sum(est.edges %in% tr.edges)/length(tr.edges))
}

specificity <- function(tr.edges, est.edges, nedges) { # spécificité
  edges <- 1:nedges
  if (is.null(est.edges)) est.edges <- nedges + 1
  if (is.null(tr.edges)) tr.edges <- nedges + 1
  return(sum(edges[-est.edges] %in% edges[-tr.edges])/length(edges[-tr.edges]))
}

precision <- function(tr.edges, est.edges) { # Precision
  if (is.null(tr.edges) ) return(NA) # If no true shift at all, any shift found is misplaced
  if (is.null(est.edges)) return(0) # If no estimated shift, TP = 0
  return(sum(est.edges %in% tr.edges)/length(est.edges))
}

match.partitions <- function(tr, est, nta) {
  tree <- trees[[paste0(nta)]]
  part.list <- subtree.list[[paste0(nta)]]
  ## computes ARI between partitions induced by edges in tr$edges and est$edges
  #   if (is.null(est$edges)){
  #     return(-1) # If lasso initialization failed
  #   } else {
  require(mclust)
  part1 <- clusters_from_shifts(tree, tr$edges, part.list)
  part2 <- clusters_from_shifts(tree, est$edges, part.list)
  return(adjustedRandIndex(part1, part2))        
  #   }
}

euclidian_distance <- function(m1, m2){
  return(sum((m2 - m1)^2))
}

compute_list_scores_element <- function(simest){
  if (is.null(simest)){ # || !simest$results_summary$CV_estim){
    #     return(data.frame(mean.edge.quality = NA, edge.recovery = NA,
    #                       edge.init.recovery = NA, adjrandind = NA, adjrandind.init = NA))
    return(NULL)
  } else {
    df <- data.frame(
      mean.edge.quality = mean(simest$edge.quality),
      sensitivity = sensitivity(simest$shifts$edges,
                                simest$params_estim$shifts$edges),
      sensitivity.init = sensitivity(simest$shifts$edges,
                                     simest$params_init_estim$shifts$edges),
      specificity = specificity(simest$shifts$edges,
                                simest$params_estim$shifts$edges,
                                length(simest$Z_data) + length(simest$Y_data) - 1),
      specificity.init = specificity(simest$shifts$edges,
                                     simest$params_init_estim$shifts$edges,
                                     length(simest$Z_data) + length(simest$Y_data) - 1),
      precision = precision(simest$shifts$edges,
                            simest$params_estim$shifts$edges),
      precision.init = precision(simest$shifts$edges,
                                 simest$params_init_estim$shifts$edges),
      adjrandind = match.partitions(simest$shifts, 
                                    simest$params_estim$shifts,
                                    simest$ntaxa),
      adjrandind.init = match.partitions(simest$shifts, 
                                         simest$params_init_estim$shifts,
                                         simest$ntaxa),
      distance_means = euclidian_distance(simest$m_Y_data,
                                          simest$m_Y_estim)
    )
    simest$results_summary <- cbind(simest$results_summary, df)
    return(simest)
  }
}


assign(paste(simests_K_true), 
       lapply(eval(simests_K_true), compute_list_scores_element))
assign(paste(simests_K_select), 
       lapply(eval(simests_K_select), compute_list_scores_element))

######################
## Format the data
######################

extract_data_frame <- function(simests){
  dd <- do.call(rbind, simests)
  results_summary <- as.data.frame(do.call(rbind, dd[,"results_summary"]))
  return(results_summary)
}

res_sum <-  as.name(paste0("results_summary", favorable, student, ak))
res_sum_K_true <-  as.name(paste0("results_summary", favorable, student, ak, "_K_true"))
res_sum_K_select <-  as.name(paste0("results_summary", favorable, student, ak, "_K_select"))
res_sum_K_select_true <-  as.name(paste0("results_summary", favorable, student, ak, "_K_select_true"))

# assign(paste(res_sum), extract_data_frame(eval(simests)))
assign(paste(res_sum_K_true), extract_data_frame(eval(simests_K_true)))
assign(paste(res_sum_K_select), extract_data_frame(eval(simests_K_select)))

assign(paste0(res_sum_K_true),
       transform(eval(res_sum_K_true), K_type = "K_true"))
assign(paste0(res_sum_K_select),
       transform(eval(res_sum_K_select), K_type = "K_select"))
assign(paste(res_sum_K_select_true), rbind(eval(res_sum_K_true),
                                           eval(res_sum_K_select)))

comp_time <-  as.name(paste0("computation_time", favorable, student, ak))
assign(paste(comp_time), 
       sum(eval(res_sum_K_true)$total_time))

m_time <-  as.name(paste0("mean_time", favorable, student, ak))
assign(paste(m_time), 
       mean(eval(res_sum_K_true)$total_time))

######################################################################
## Sumarizing Function
######################################################################
## Simple Scores
RMSE <- function(y, true.value) {
  return(sqrt(mean((y - true.value)^2)))
}

compute_scores <- function(res_sum){
  # Combinations of "true" parameters (no n)
  # Aggregation of parameter estimates conditonal upon convergence of EM procedure
  return(ddply(subset(res_sum, CV_estim == 1),
               .(alpha, gamma, K, ntaxa, grp),
               summarize,
               ## Simple Scores
               alpha.mean = mean(alpha_estim),
               alpha.sd = sd(alpha_estim),
               alpha.RMSE = RMSE(alpha_estim, unique(alpha)),
               alpha.init.RMSE = RMSE(alpha_estim_init, unique(alpha)),
               gamma.mean = mean(gamma_estim),
               gamma.sd = sd(gamma_estim),
               gamma.RMSE = RMSE(gamma_estim, unique(gamma)),
               gamma.init.RMSE = RMSE(gamma_estim_init, unique(gamma)),
               beta_0.mean = mean(beta_0_estim),
               beta_0.sd = sd(beta_0_estim),
               beta_0.RMSE = RMSE(beta_0_estim, beta_0),
               beta_0.init.RMSE = RMSE(beta_0_estim_init, beta_0),
               log_likelihood.mean = mean(log_likelihood),
               log_likelihood.mean.init = mean(log_likelihood_init),
               mean_number_new_shifts.mean = mean(mean_number_new_shifts),
               number_equivalent_solutions.mean = mean(number_equivalent_solutions),
               difficulty.mean = mean(difficulty),
               EM_steps.mean = mean(EM_steps, na.rm = TRUE),
               K.match = mean(K == K_try),
               K.match.leq = mean(K >= K_try),
               ## List Scores
               mean.edge.quality.mean = mean(mean.edge.quality),
               sensitivity.mean = mean(sensitivity),
               sensitivity.init.mean = mean(sensitivity.init),
               specificity.mean = mean(specificity),
               specificity.init.mean = mean(specificity.init),
               precision.mean = mean(precision),
               precision.init.mean = mean(precision.init),
               mean.adjrandind = mean(adjrandind),
               mean.adjrandind.init = mean(adjrandind.init),
               mean.distance_means = mean(distance_means),
               total_time.mean = mean(total_time)
  ))
}

sum_scores_K_true <-  as.name(paste0("summary_scores", favorable, student, ak, "_K_true"))
sum_scores_K_select <-  as.name(paste0("summary_scores", favorable, student, ak, "_K_select"))
sum_scores_K_select_true <-  as.name(paste0("summary_scores", favorable, student, ak, "_K_select_true"))

assign(paste(sum_scores_K_true), compute_scores(eval(res_sum_K_true)))
assign(paste(sum_scores_K_select), compute_scores(eval(res_sum_K_select)))

assign(paste0(sum_scores_K_true),
       transform(eval(sum_scores_K_true), K_type = "K_true"))
assign(paste0(sum_scores_K_select),
       transform(eval(sum_scores_K_select), K_type = "K_select"))
assign(paste(sum_scores_K_select_true), 
       rbind(eval(sum_scores_K_true),
             eval(sum_scores_K_select)))
rm(list = c(paste(sum_scores_K_true), paste(sum_scores_K_select)))

save.image(paste0(saveresultfile, "_aranged", favorable, student, ak, "-", datestamp_day, "_heavy.RData"))

rm(list = c("simparams_alpha", "simparams_base",
            "simparams_gamma", "simparams_K",
            "comp_time", "m_time",
            paste(simests), "simests",
            paste(simests_K_true), "simests_K_true",
            paste(simests_K_select), "simests_K_select",
            paste(res_sum_K_true), "res_sum_K_true",
            paste(res_sum_K_select), "res_sum_K_select"))

save.image(paste0(saveresultfile, "_aranged", favorable, student, ak, "-", datestamp_day, ".RData"))

##################################################################
## Merge alpha known and unknown
##################################################################
rm(list = ls())

## Alpha unknown - Model simulation
saveresultfile = "../Results/Simulations_Several_K/several_K_estimations"
datestamp_day <- "2015-03-24"
ak <- ""
load(paste0(saveresultfile, "_aranged", ak, "-", datestamp_day, ".RData"))
results_summary_K_select_true$alpha_known <- FALSE
results_summary_K_select_true$student <- Inf
summary_scores_K_select_true$alpha_known <- FALSE
summary_scores_K_select_true$student <- Inf

## Alpha known - Model simulation
saveresultfile = "../Results/Simulations_Several_K/several_K_estimations"
datestamp_day <- "2015-03-17"
ak <- "_alpha_known"
load(paste0(saveresultfile, "_aranged", ak, "-", datestamp_day, ".RData"))
results_summary_alpha_known_K_select_true$alpha_known <- TRUE
results_summary_alpha_known_K_select_true$student <- Inf
summary_scores_alpha_known_K_select_true$alpha_known <- TRUE
summary_scores_alpha_known_K_select_true$student <- Inf

## Alpha unknown - Student simulation
saveresultfile = "../Results/Simulations_Student/student_estimations"
datestamp_day <- "2015-09-09"
ak <- ""
favorable <- "favorables"
student <- "_df5"
load(paste0(saveresultfile, "_aranged", favorable, student, ak, "-", datestamp_day, ".RData"))
results_summaryfavorables_df5_K_select_true$alpha_known <- FALSE
results_summaryfavorables_df5_K_select_true$student <- 5
summary_scoresfavorables_df5_K_select_true$alpha_known <- FALSE
summary_scoresfavorables_df5_K_select_true$student <- 5

names_int <- intersect(names(results_summaryfavorables_df5_K_select_true),
                       c(names(results_summary_K_select_true),
                       names(results_summary_alpha_known_K_select_true)))

results_summary_K_select_true <- rbind(results_summary_K_select_true[, names_int],
                                       results_summary_alpha_known_K_select_true[, names_int],
                                       results_summaryfavorables_df5_K_select_true[, names_int])
rm(results_summary_alpha_known_K_select_true, results_summaryfavorables_df5_K_select_true)

names_int <- intersect(names(summary_scoresfavorables_df5_K_select_true),
                       c(names(summary_scores_K_select_true),
                         names(summary_scores_alpha_known_K_select_true)))

summary_scores_K_select_true <- rbind(summary_scores_K_select_true[, names_int],
                                      summary_scores_alpha_known_K_select_true[, names_int],
                                      summary_scoresfavorables_df5_K_select_true[, names_int])
rm(summary_scores_alpha_known_K_select_true, summary_scoresfavorables_df5_K_select_true)

datestamp_day <- format(Sys.time(), "%Y-%m-%d")
save.image(paste0(saveresultfile, "_aranged_both_alpha", "-", datestamp_day, ".RData"))

######################################################################
## Equivalent Solutions
######################################################################
saveresultfile = "../Results/Simulations_Several_K/several_K_estimations"
datestamp_day <- "2015-03-24" # 17
ak <- "" # "_alpha_known"

load(paste0(saveresultfile, "_aranged", ak, "-", datestamp_day, "_heavy.RData"))

indexes <- which((results_summary_K_select$number_equivalent_solutions > 1))
length(indexes)

simestimations_K_select_eqs <- simestimations_K_select[indexes]
results_summary_K_select_eqs <- results_summary_K_select[indexes,]

save(simestimations_K_select_eqs, results_summary_K_select_eqs, file = paste0(saveresultfile, "_equivalent_solutions", ak, "-", datestamp_day, ".RData"))

#################################################################
## Plots
#################################################################
load(paste0(saveresultfile, "_aranged", "_alpha_known-", datestamp_day, "_", inference.index, "_heavy.RData"))

## Process
conf <- 17
pars <- simestimations_alpha_known_K_select[[conf]]

## Clustering of tips and edges
plot_data.process.actual(Y.state = pars$m_Y_estim,
                         phylo = trees[[paste(pars$ntaxa)]], 
                         params = list(shifts = pars$params_estim$shifts, 
                                       optimal.value = pars$params_estim$optimal.value),
                         adj = 2,
                         automatic_colors = TRUE)
pars <- simlist[[conf]]
plot_data.process.actual(Y.state = pars$m_Y_data,
                         phylo = trees[[paste(pars$ntaxa)]], 
                         params = list(shifts = pars$shifts, 
                                       optimal.value = beta_0),
                         adj = 2,
                         automatic_colors = TRUE)


## Utility functions
gtable_filter <- function (x, pattern, fixed = FALSE, trim = TRUE, complementary = FALSE) 
{
  matches <- grepl(pattern, x$layout$name, fixed = fixed)
  if (complementary) matches <- !matches
  x$layout <- x$layout[matches, , drop = FALSE]
  x$grobs <- x$grobs[matches]
  if (trim) 
    x <- gtable_trim(x)
  x
}

g_table_replace_xlab_strips <- function(g){
  xlab <- gtable_filter(g, "xlab", trim = FALSE, complementary = FALSE)
  g <- gtable_filter(g, "xlab", trim = FALSE, complementary = TRUE)
  matches <- grepl("strip-top", g$layout$name)
  g$layout[matches, "t"] <- xlab$layout[1,"t"]
  g$layout[matches, "b"] <- xlab$layout[1,"b"]
  return(g)
}

g_table_put_right_strips <- function(g, grid){
  # Find x label and delete it
  xlab <- gtable_filter(g, "xlab", trim = FALSE, complementary = FALSE)
  g <- gtable_filter(g, "xlab", trim = FALSE, complementary = TRUE)
  # delete unused strips
  g <- gtable_filter(g, "strip_t-[4-9]", trim = FALSE, complementary = TRUE)
  # deplace used strips
  matches <- grepl("strip_t", g$layout$name)
  g$layout[matches, "t"] <- xlab$layout[1,"t"]
  g$layout[matches, "b"] <- xlab$layout[1,"b"]
  # rename used strips
  #   fun <- function(strip){
  #     index <- grep("strip.text.x.text", names(strip$children))
  #     text <- strip$children[[index]]$label
  #     strip$children[[index]]$label <- sub(".*, ", "", text)
  #     strip$children[[index]]$label <- lapply(strip$children[[index]]$label,
  #                                             function(x) parse(text = x))
  #     return(strip)
  #   }
  #   g$grobs[matches] <- lapply(g$grobs[matches], fun)
  #  g$grob[matches] <- lapply(g$grob[matches], fun)
  return(g)
}

facet_wrap_labeller <- function(g, labels = NULL) {
  gg <- g$grobs      
  strips <- grep("strip_t", names(gg))
  
  for(ii in seq_along(labels))  {
    modgrob <- getGrob(gg[[strips[ii]]], "strip.text", 
                       grep=TRUE, global=TRUE)
    gg[[strips[ii]]]$children[[modgrob$name]] <- editGrob(modgrob,label=labels[ii])
  }
  g$grobs <- gg
  return(g)
}

my.labeller <- function(variable, value) {
  value <- paste(variable, "==", as.character(value))
  value <- lapply(value, function(x) parse(text = x))
  return(value)
}

## Format data for plots
format_plot <- function(df){
  # Save true values
  df$alpha.true <- df$alpha
  df$gamma.true <- df$gamma
  df$K.true <- df$K
  
  # Transformations and names
  df[["ln(2)/alpha"]] <- log(2)/df$alpha
  df[["sigma^2 == 2*alpha*gamma^2"]] <- 2*df$alpha*df$gamma
  
  # Melt
  df_plot <- melt(df, 
                  measure.vars = c("ln(2)/alpha", "sigma^2 == 2*alpha*gamma^2", "K"),
                  value.name = "parameter.value")
  
  # Supress lines where group != variable
  grp_var <- c(alpha_var = "ln(2)/alpha", 
               gamma_var = "sigma^2 == 2*alpha*gamma^2", 
               K_var = "K")
  fun <- function(z){
    if (z["grp"] == "base") return(TRUE)
    return(unname(grp_var[z["grp"]] == z["variable"]))
  }
  masque <- apply(df_plot, 1, fun)
  df_plot <- df_plot[masque,]
  df_plot$ntaxa <- as.factor(df_plot$ntaxa)
  df_plot$parameter.value <- round(df_plot$parameter.value, 2)
  df_plot$parameter.value <- as.factor(df_plot$parameter.value)
  return(df_plot)
}

summary_scores_K_select_true_plot <- format_plot(summary_scores_K_select_true)
results_summary_K_select_true_plot <- format_plot(results_summary_K_select_true)

## Plot functions
plot_results_score <- function(score, score_name){
  p <- ggplot(summary_scores_K_select_true_plot,
              aes_string(x = "parameter.value",
                         y = score,
                         linetype = "K_type",
                         color = "alpha_known"))
  p <- p + aes(group = interaction(alpha_known, K_type))
  p <- p + facet_wrap(ntaxa ~ variable, scales = "free")
  p <- p + scale_linetype(name = "K",
                          labels = c("Known", "Estimated"))
  p <- p + geom_line()
  p <- p + scale_colour_discrete(name = expression(alpha),
                                 breaks = c(TRUE, FALSE),
                                 labels = c("Known", "Estimated"))
  p <- p + labs(x = "",
                y = score_name)
  #p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw()
  p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12),
                 strip.background = element_blank()
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  g <- ggplotGrob(p)
  g <- g_table_put_right_strips(g)
  labels <- unname(sapply(levels(summary_scores_K_select_true_plot$variable),
                          function(z) parse(text = z)))
  g <- facet_wrap_labeller(g, labels = labels)
  grid.newpage()
  grid.draw(g)
}

plot_results_score_smooth <- function(score, score_name, plot_points = TRUE){
  replace_name <- function(i){
    i <- as.integer(as.character(i))
    i <- formatC(i, width = 3, format = "d", flag = "0")
    i <- paste0("ntaxa == ", i)
    return(i)
  }
  data <- results_summary_K_select_true_plot
  data$ntaxa <- sapply(data$ntaxa, replace_name)
  data$ntaxa <- as.factor(data$ntaxa)
  p <- ggplot(data,
              aes_string(x = "parameter.value",
                         y = score,
                         color = "K_type"),
              shrink = TRUE)
  #  p <- p + aes(group = interaction(ntaxa, K_type))
  p <- p + facet_grid(ntaxa ~ variable,
                      labeller = label_parsed,
                      scales = "free",
                      shrink = TRUE)
  if (plot_points){
    p <- p + geom_point(alpha = 0.2,
                        position = position_jitter(width = 0.4, height = 0))
  } else {
    sts <- quantile(data[[score]], probs = c(0.05, 0.95))  # Compute lower and upper whisker limits   
    p = p + coord_cartesian(ylim = c(sts[1], sts[2]))
  }
  p <- p + geom_boxplot(outlier.size = 0, alpha = 0.5)
  p <- p + geom_smooth(aes(group = K_type), method = "loess", se = FALSE, size = 1.2)
  p <- p + scale_colour_discrete(name = "Estimation")
  p <- p + labs(x = "",
                y = score_name)
  #p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw()
  p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12),
                 strip.background = element_blank()
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  g <- ggplotGrob(p)
  g <- g_table_replace_xlab_strips(g)
  grid.newpage()
  grid.draw(g)
}


plot_results_K_select_true <- function(name){
  replace_name <- function(i){
    i <- as.integer(as.character(i))
    i <- formatC(i, width = 3, format = "d", flag = "0")
    i <- paste0("ntaxa == ", i)
    return(i)
  }
  data <- results_summary_K_select_true_plot
  data$ntaxa <- sapply(data$ntaxa, replace_name)
  data$ntaxa <- as.factor(data$ntaxa)
  data$parameter.value <- data$parameter.value
    p <- ggplot(data,
                aes_string(x = "parameter.value",
                           y = "K_select",
                           color = "alpha_known"))
  #  p <- p + aes(group = interaction(ntaxa, K_type))
  p <- p + facet_wrap(ntaxa ~ variable,
                      #labeller = label_parsed,
                      scales = "free",
                      shrink = TRUE)
  #   p <- p + geom_point(alpha = 0.2,
  #                       position = position_jitter(width = 0.4, height = 0))
  p <- p + geom_boxplot(outlier.size = 0.5, alpha = 0.5)
  p <- p + geom_point(aes_string(x = "parameter.value", y = "K.true",
                                 group = interaction("ntaxa", "variable")),
                      colour = "black", size = 2)
  p <- p + scale_colour_discrete(name = expression(alpha),
                                 breaks = c(TRUE, FALSE),
                                 labels = c("Known", "Estimated"))
  p <- p + labs(x = "",
                y = name)
  #p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw()
  p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12),
                 strip.background = element_blank()
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  g <- ggplotGrob(p)
  g <- g_table_put_right_strips(g)
  labels <- unname(sapply(levels(results_summary_K_select_true_plot$variable),
                          function(z) parse(text = z)))
  g <- facet_wrap_labeller(g, labels = labels)
  grid.newpage()
  grid.draw(g)
}

plot_results_estim_vs_true <- function(estim, true, name, sub, val){
  replace_name <- function(i){
    i <- as.integer(as.character(i))
    i <- formatC(i, width = 3, format = "d", flag = "0")
    i <- paste0("ntaxa == ", i)
    return(i)
  }
  data <- subset(results_summary_K_select_true_plot,
                 results_summary_K_select_true_plot[, sub] == val)
  data$ntaxa <- sapply(data$ntaxa, replace_name)
  data$ntaxa <- as.factor(data$ntaxa)
  data$parameter.value <- data$parameter.value
  if (sub == "K_type"){
    p <- ggplot(data,
                aes_string(x = "parameter.value",
                           y = estim,
                           color = "alpha_known"))
    p <- p + scale_colour_discrete(name = expression(alpha),
                                   breaks = c(TRUE, FALSE),
                                   labels = c("Known", "Estimated"))
  } else {
    p <- ggplot(data,
                aes_string(x = "parameter.value",
                           y = estim,
                           color = "K_type"))
    p <- p + scale_colour_discrete(name = "K",
                                   labels = c("Known", "Estimated"))
  }
  #  p <- p + aes(group = interaction(ntaxa, K_type))
  p <- p + facet_wrap(ntaxa ~ variable,
                      #labeller = label_parsed,
                      scales = "free",
                      shrink = TRUE)
  #   p <- p + geom_point(alpha = 0.2,
  #                       position = position_jitter(width = 0.4, height = 0))
  p <- p + geom_boxplot(outlier.size = 0.5, alpha = 0.5)
  if (estim == "log_likelihood"){
    #       p <- p + geom_boxplot(aes_string(x = "parameter.value", y = true),
    #                      colour = "red", outlier.size = 0.5, alpha = 0.1)
    p <- p + stat_summary(fun.y = median, geom = "point",
                          aes_string(x = "parameter.value", y = true,
                                     group = interaction("ntaxa", "variable")),
                          colour = "black", size = 2)
  } else {
    p <- p + geom_point(aes_string(x = "parameter.value", y = true,
                                   group = interaction("ntaxa", "variable")),
                        colour = "black", size = 2)
  }
  p <- p + labs(x = "",
                y = name)
  #p <- p + scale_x_continuous(trans = log_trans(10))
  p <- p + theme_bw()
  p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))
  p <- p + theme(axis.text = element_text(size = 12),
                 strip.text = element_text(size = 12),
                 strip.background = element_blank()
                 ##legend.position = c(0, 1),
                 ##legend.justification = c(0, 1)
  )
  g <- ggplotGrob(p)
  g <- g_table_put_right_strips(g)
  labels <- unname(sapply(levels(results_summary_K_select_true_plot$variable),
                          function(z) parse(text = z)))
  g <- facet_wrap_labeller(g, labels = labels)
  grid.newpage()
  grid.draw(g)
}

## Gamma RMSE
plot_results_score("gamma.RMSE/gamma.true", 
                   expression(paste("RMSE(", gamma^2, ")/", gamma^2)))
plot_results_score_smooth("gamma_estim", 
                          expression(paste("Estimated ", gamma^2)))

plot_results_estim_vs_true("gamma_estim", "gamma.true",
                           expression(gamma^2),
                           "alpha_known", TRUE)

plot_results_estim_vs_true("gamma_estim", "gamma.true",
                           expression(gamma^2),
                           "alpha_known", FALSE)

plot_results_score_alpha("gamma.RMSE/gamma.true", 
                   expression(paste("RMSE(", gamma^2, ")/", gamma^2)))

## beta_0 RMSE
plot_results_score("beta_0.RMSE", 
                   expression(paste("RMSE(", beta[0], ")")))
plot_results_score_smooth("beta_0_estim", 
                          expression(paste("Estimated ", beta[0])))

plot_results_score_alpha("beta_0.RMSE", 
                   expression(paste("RMSE(", beta[0], ")")))

## alpha RMSE
plot_results_score("alpha.RMSE/alpha.true", 
                   expression(paste("RMSE(", alpha, ")/", alpha)))

## log_likelihood
plot_results_score("log_likelihood.mean", "Log Likelihood")
plot_results_score_smooth("log_likelihood", "Log Likelihood")

## ARI
plot_results_score("mean.adjrandind", "ARI")
plot_results_score_smooth("adjrandind", "ARI")

## Euclidian distances
plot_results_score("mean.distance_means", "Euclidian distance")
plot_results_score_smooth("distance_means", "Euclidian distance")#, plot_points = FALSE)

## K match
plot_results_score("K.match", "Percent Exact Match")

## K_s
plot_results_K_select_true("K select")

## Sensibilité, spécificité, précision
plot_results_score_smooth("sensitivity", "Sensitivity")
plot_results_score_smooth("1 - specificity", "False Positive Rate")
plot_results_score_smooth("1 - precision", "False Discovery Rate")

## K_true vs K_select
nbr_inferences_K <- function(df){
  len <- unique(floor(sqrt(nta)))
  res <- vector(length = len + 1)
  names(res) <- 0:len
  for (K_e in 0:len){
    res[K_e + 1] <- sum(df$K_select == K_e) / length(df$K_select)
  }
  names(res) <- 0:len
  return(res)
}

incidences_K_select <- ddply(results_summary,
                             .(K, ntaxa),
                             nbr_inferences_K)

incidences_K_select <- melt(incidences_K_select,
                            id.vars = c("K", "ntaxa"), 
                            variable.name = "K_select",
                            value.name = "frequence_observed")

incidences_K_select$K_select <- as.numeric(levels(incidences_K_select$K_select))[incidences_K_select$K_select]

incidences_K_select <- merge(results_summary, incidences_K_select)

p <- ggplot(incidences_K_select, aes(x = K, y = K_select, size = frequence_observed))
p <- p + facet_grid(. ~ ntaxa, labeller = my.labeller)
p <- p + geom_abline(intercept = 0, slope=1)
p <- p + geom_hline(aes(yintercept = floor(sqrt(ntaxa))))
p <- p + geom_smooth(method = "lm", se = FALSE, show_guide=FALSE)
p <- p + geom_point()
p <- p + scale_size(name = "Frequence Estimated")
#                           breaks = c("FALSE", "TRUE"),
#                           labels = c("Estimated", "Known"))
p <- p + theme_bw()
p <- p + theme(axis.text = element_text(size = 12),
               strip.text = element_text(size = 12)
               ##legend.position = c(0, 1),
               ##legend.justification = c(0, 1)
)
p

## Faire un box plot
p <- ggplot(results_summary, aes(x = as.factor(K), y = K_select))
p <- p + facet_grid(. ~ ntaxa, labeller = my.labeller)
p <- p + geom_boxplot()
p <- p + geom_abline(intercept = 0, slope=1)
p <- p + geom_hline(aes(yintercept = floor(sqrt(ntaxa))))
p <- p + geom_smooth(aes(group = 1), method = "lm", se = FALSE, show_guide=FALSE)
p <- p + scale_size(name = "Frequence Estimated")
#                           breaks = c("FALSE", "TRUE"),
#                           labels = c("Estimated", "Known"))
p <- p + labs(x = "True K",
              y = "Selected K")
p <- p + coord_cartesian(ylim = c(-0.5, 16.5))
p <- p + theme_bw()
p <- p + theme(axis.text = element_text(size = 12),
               strip.text = element_text(size = 12)
               ##legend.position = c(0, 1),
               ##legend.justification = c(0, 1)
)
p
pbastide/PhylogeneticEM documentation built on Feb. 21, 2025, 9:46 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
pbastide/PhylogeneticEM
Automatic Shift Detection using a Phylogenetic EM

simulations_study/several_K_exploitation.R
In pbastide/PhylogeneticEM: Automatic Shift Detection using a Phylogenetic EM

R Package Documentation

Browse R Packages

We want your feedback!

pbastide/PhylogeneticEM Automatic Shift Detection using a Phylogenetic EM

simulations_study/several_K_exploitation.R In pbastide/PhylogeneticEM: Automatic Shift Detection using a Phylogenetic EM

R Package Documentation

Browse R Packages

We want your feedback!

pbastide/PhylogeneticEM
Automatic Shift Detection using a Phylogenetic EM

simulations_study/several_K_exploitation.R
In pbastide/PhylogeneticEM: Automatic Shift Detection using a Phylogenetic EM