knitr::opts_chunk$set(echo=FALSE) library(magrittr)
# Create batchtools registry # batchtools::makeRegistry(file.dir = "r_batchtools_reg/sim/TrueMod") batchtools::loadRegistry(file.dir = "r_batchtools_reg/sim/TrueMod", writeable = TRUE) rm(list = ls()) dir_output <- "results/Simulation/TrueMod" dir.create(dir_output, showWarnings = TRUE, recursive = TRUE)
# larger sample size m1 <- rep(c(1, 0, -1), each = 66) m1 <- m1 / sqrt(sum(m1^2)) s1 <- rep(1, length = 100) s1 <- s1 / sqrt(sum(s1^2)) t1 <- rep(1, length = 10) t1 <- t1 / sqrt(sum(t1^2)) lambda1 <- 1000 m2 <- rep(rep(c(-1, 0, 1), each = 22), 3) m2 <- m2 / sqrt(sum(m2^2)) s2 <- rep(c(0, 1), each = 50) s2 <- s2 / sqrt(sum(s2^2)) t2 <- rep(c(0, 1), each = 5) t2 <- t2 / sqrt(sum(t2^2)) lambda2 <- 500 Y <- lambda1 * outer(m1, outer(s1, t1)) + lambda2 * outer(m2, outer(s2, t2)) p <- apply(Y, c(2, 3), function(x) exp(x) / sum(exp(x))) depth_range <- c(0.1, 10) params <- list(m = cbind(m1, m2), s = cbind(s1, s2), t = cbind(t1, t2), lambda = c(lambda1, lambda2), Y = Y, p = p, depth_range = depth_range) save(params, file = paste0(dir_output, "/params_n1.RData")) # smaller sample size m1 <- rep(c(1, 0, -1), each = 66) m1 <- m1 / sqrt(sum(m1^2)) s1 <- rep(1, length = 30) s1 <- s1 / sqrt(sum(s1^2)) t1 <- rep(1, length = 10) t1 <- t1 / sqrt(sum(t1^2)) lambda1 <- 1000 / sqrt(10/3) m2 <- rep(rep(c(-1, 0, 1), each = 22), 3) m2 <- m2 / sqrt(sum(m2^2)) s2 <- rep(c(0, 1), each = 15) s2 <- s2 / sqrt(sum(s2^2)) t2 <- rep(c(0, 1), each = 5) t2 <- t2 / sqrt(sum(t2^2)) lambda2 <- 500 / sqrt(10/3) Y <- lambda1 * outer(m1, outer(s1, t1)) + lambda2 * outer(m2, outer(s2, t2)) p <- apply(Y, c(2, 3), function(x) exp(x) / sum(exp(x))) depth_range <- c(0.1, 10) params <- list(m = cbind(m1, m2), s = cbind(s1, s2), t = cbind(t1, t2), lambda = c(lambda1, lambda2), Y = Y, p = p, depth_range = depth_range) save(params, file = paste0(dir_output, "/params_n2.RData")) rm(list = c("m1", "m2", "s1", "s2", "t1", "t2", "lambda1", "lambda2", "Y", "p", "depth_range", "params"))
# simulation job grid tb_job <- tidyr::crossing( alpha0 = 10^seq(2.5, 4, by = 0.25), t_missing = c(0, 4), median_depth = c(1e4, 1e5), R = 3, rep = seq(1, 1000), n = c(1, 2), method = c("microTensor", "ctf") ) %>% dplyr::mutate(i_job = seq(1, dplyr::n())) save(tb_job, file = paste0(dir_output, "/tb_job.RData")) n_job <- nrow(tb_job) n_job_each <- 200
# fit decomposition methods to simulated data one_job <- function(i_job) { library(magrittr) load(paste0(dir_output, "/tb_job.RData")) l_statistics <- list() for(ii_job in seq((i_job - 1) * n_job_each + 1, i_job * n_job_each)) { i_tb_job <- tb_job[ii_job, ] load(paste0(dir_output, "/params_n", i_tb_job$n, ".RData")) ptilde <- params$p alpha0 <- i_tb_job$alpha0 set.seed(ii_job+1) for(j in seq(1, dim(ptilde)[2])) for(k in seq(1, dim(ptilde)[3])) ptilde[, j, k] <- MCMCpack::rdirichlet( n = 1, alpha = alpha0 * params$p[, j, k]) ptilde_missing <- ptilde seq_depth <- runif(n = dim(ptilde)[2] * dim(ptilde)[3], min = log(params$depth_range[1] * i_tb_job$median_depth), max = log(params$depth_range[2] * i_tb_job$median_depth)) %>% exp() %>% round() N <- matrix(seq_depth, nrow = dim(ptilde)[2], ncol = dim(ptilde)[3]) X_array <- ptilde for(j in seq(1, dim(ptilde)[2])) for(k in seq(1, dim(ptilde)[3])) X_array[, j, k] <- rmultinom(n = 1, size = N[j, k], prob = ptilde[, j, k]) # setting some time points to NA if(i_tb_job$t_missing > 0) { for(j in seq(1, dim(ptilde)[2])) { k_missing <- sample.int(n = dim(ptilde)[3], size = i_tb_job$t_missing) X_array[, j, k_missing] <- NA ptilde_missing[, j, k_missing] <- NA } } if(i_tb_job$method == "microTensor") { time <- system.time( fit_decomp <- microTensor::microTensor( X = X_array, R = i_tb_job$R, ortho_m = FALSE, nn_t = FALSE, weighted = TRUE, control = list( L_init = 3, gamma = 2, init = "ctf", abs_tol = 1e-4, rel_tol = 1e-4, maxit = 5000, verbose = FALSE))) } else if(i_tb_job$method == "ctf") { time <- system.time( fit_decomp <- microTensor::ctf( X = X_array, R = i_tb_job$R, control = list( maxit = 5000, verbose = FALSE))) } fit_decomp$time <- time ## evaluation results_summary <- microTensor::evaluate_fit( fit_decomp = fit_decomp, p = params$p, X_array = X_array, R = 2) if(i_tb_job$method == "microTensor") { results_summary_uw <- microTensor::evaluate_fit( fit_decomp = fit_decomp$unweighted, p = params$p, X_array = X_array, R = 2) } # evaluation of the simulated data zero_perc <- mean(X_array == 0) median_depth <- median(apply(X_array, c(2, 3), sum, na.rm = TRUE)) mat_L1_ptilde <- matrix(NA, nrow = dim(params$p)[2], ncol = dim(params$p)[3]) for(j in seq(1, nrow(mat_L1_ptilde))) for(k in seq(1, ncol(mat_L1_ptilde))) mat_L1_ptilde[j, k] <- mean(abs(ptilde[, j, k] - params$p[, j, k]) / params$p[, j, k]) p_norm <- apply(X_array, c(2, 3), function(x) { if(all(x == 0, na.rm = TRUE) ) return(x) else return(x / sum(x, na.rm = TRUE)) }) mat_L1_pnorm <- matrix(NA, nrow = dim(params$p)[2], ncol = dim(params$p)[3]) for(j in seq(1, nrow(mat_L1_pnorm))) for(k in seq(1, ncol(mat_L1_pnorm))) mat_L1_pnorm[j, k] <- mean(abs(p_norm[, j, k] - params$p[, j, k]) / params$p[, j, k]) # save particular runs to calculate feature level statistics if(i_tb_job$alpha0 == max(tb_job$alpha0) & i_tb_job$median_depth == 1e4 & i_tb_job$t_missing == 0 & i_tb_job$n == 1 & i_tb_job$rep %in% seq(1, 10)) { save(X_array, file = paste0(dir_output, "/X_array_", ii_job, ".RData")) save(fit_decomp, file = paste0(dir_output, "/fit_", ii_job, ".RData")) } i_l_statistics <- list(mean_L1_relative = mean(results_summary$mat_L1_relative, na.rm = TRUE), zero_percentage = zero_perc, avg_depth = median_depth, mean_L1_ptilde = mean(mat_L1_ptilde, na.rm = TRUE), mean_L1_pnorm = mean(mat_L1_pnorm, na.rm = TRUE), time = fit_decomp$time) if(i_tb_job$method == "microTensor") { i_l_statistics$mean_l1_relative_uw <- mean(results_summary_uw$mat_L1_relative, na.rm = TRUE) } l_statistics <- c(l_statistics, list(i_l_statistics)) } save(l_statistics, file = paste0(dir_output, "/l_statistics_", i_job, ".RData")) return(NULL) }
# submit jobs to cluster through rbatchtools batchtools::clearRegistry() tb_ids <- batchtools::batchMap(one_job, i_job = seq(1, n_job / n_job_each)) batchtools::batchExport(list("dir_output" = dir_output, "n_job_each" = n_job_each)) ncpus <- 1 walltime <- 3600 batchtools::submitJobs(ids = seq(1, 10), resources = list(ncpus = ncpus, walltime = walltime)) batchtools::submitJobs(ids = seq(11, n_job / n_job_each), resources = list(ncpus = ncpus, partition = partition, walltime = walltime))
library(ggplot2) load(paste0(dir_output, "/tb_job.RData")) l_statistics <- seq(1, nrow(tb_job) / n_job_each) %>% purrr::map(~ { load(paste0(dir_output, "/l_statistics_", .x, ".RData")) l_statistics }) %>% purrr::reduce(c) # Main figure tb_results <- tb_job %>% dplyr::mutate(mean_L1_relative = l_statistics %>% purrr::map_dbl("mean_L1_relative")) %>% rbind(tb_job %>% dplyr::filter(method == "microTensor") %>% {dplyr::mutate( ., mean_L1_relative = l_statistics[.$i_job] %>% purrr::map_dbl("mean_l1_relative_uw"), method = "microTensor (uw)")}) tb_plot_results <- tb_results %>% dplyr::group_by(alpha0, t_missing, median_depth, method, n) %>% dplyr::summarise(log10_L1_relative = mean(log10(mean_L1_relative)), sd_log10_L1_relative = sd(log10(mean_L1_relative)) / sqrt(dplyr::n())) %>% dplyr::ungroup() colors <- smar:::gg_color_hue( values = c("CTF", "microTensor (uw)", "microTensor")) tb_plot_results <- tb_plot_results %>% dplyr::mutate(method = method %>% dplyr::recode("ctf" = "CTF") %>% factor(levels = names(colors))) %>% dplyr::mutate(Missing = dplyr::case_when( t_missing == 0 ~ "Fully Observed", t_missing == 4 ~ "40% Samples Missing" ) %>% factor(levels = c("Fully Observed", "40% Samples Missing"))) %>% dplyr::mutate(Depth = dplyr::case_when( median_depth == 1e4 ~ "Median 10,000 Reads", TRUE ~ "Median 100,000 Reads" ) %>% factor(levels = c("Median 10,000 Reads", "Median 100,000 Reads"))) %>% dplyr::arrange(Depth, Missing) %>% dplyr::mutate(level_plot = paste0(Depth, ";\n ", Missing) %>% forcats::as_factor()) %>% dplyr::mutate(level_plot_label = c("A", "B", "C", "D")[as.integer(level_plot)] %>% paste0(") ", level_plot) %>% forcats::as_factor()) p_figure <- tb_plot_results %>% dplyr::filter(n == 1) %>% ggplot(aes(x = -log10(alpha0), y = log10_L1_relative, color = method, shape = method)) + geom_point(position = position_dodge(width = 0.05), size = 2) + geom_errorbar(aes(ymin = log10_L1_relative - sd_log10_L1_relative, ymax = log10_L1_relative + sd_log10_L1_relative), position = position_dodge(width = 0.05)) + geom_line(position = position_dodge(width = 0.05)) + scale_color_manual(values = colors) + facet_wrap(~ level_plot_label, nrow = 2) + theme_bw() + theme(legend.position = "bottom", legend.title = element_blank(), legend.direction = "horizontal", strip.background = element_blank()) + xlab(expression(paste("Dispersion (-", log[10], alpha[0], " for ", tilde(p)[ijk], ')'))) + ylab(expression(paste(log[10], " Relative ", L[1], " Difference (|", p[ijk], "-", hat(p)[ijk], "|/", p[ijk], ")"))) ggsave(p_figure, filename = "figures/figure3_simulation1/figure3.pdf", width = 6, height = 6) p_figure2 <- tb_plot_results %>% dplyr::filter(n == 2) %>% ggplot(aes(x = -log10(alpha0), y = log10_L1_relative, color = method, shape = method)) + geom_point(position = position_dodge(width = 0.05), size = 2) + geom_errorbar(aes(ymin = log10_L1_relative - sd_log10_L1_relative, ymax = log10_L1_relative + sd_log10_L1_relative), position = position_dodge(width = 0.05)) + geom_line(position = position_dodge(width = 0.05)) + scale_color_manual(values = colors) + facet_wrap(~ level_plot_label, nrow = 2) + theme_bw() + theme(legend.position = "bottom", legend.title = element_blank(), legend.direction = "horizontal", strip.background = element_blank()) + xlab(expression(paste("Dispersion (-", log[10], alpha[0], " for ", tilde(p)[ijk], ')'))) + ylab(expression(paste(log[10], " Relative ", L[1], " Difference (|", p[ijk], "-", hat(p)[ijk], "|/", p[ijk], ")"))) ggsave(p_figure2, filename = "figures/figure3_simulation1/figure3_lessSamples.pdf", width = 6, height = 6) # times plot tb_plot_time <- tb_job %>% dplyr::mutate(time = l_statistics %>% purrr::map_dbl(~.x$time[3])) %>% dplyr::filter(t_missing == 0, n == 1) %>% dplyr::mutate(method = method %>% dplyr::recode("ctf" = "CTF") %>% factor(levels = names(colors))) p_time <- tb_plot_time %>% ggplot(aes(x = method, y = time)) + geom_boxplot() + theme_bw() + xlab("Method") + ylab("Run time (seconds)") ggsave(p_time, filename = "figures/figure3_simulation1/time.pdf", width = 4.5, height = 4) # diagnostics plots tb_plot_diagnostics <- tb_job %>% dplyr::mutate( zero_percentage = l_statistics %>% purrr::map_dbl("zero_percentage"), avg_depth = l_statistics %>% purrr::map_dbl("avg_depth"), mean_L1_ptilde = l_statistics %>% purrr::map_dbl("mean_L1_ptilde"), mean_L1_pnorm = l_statistics %>% purrr::map_dbl("mean_L1_pnorm")) %>% dplyr::mutate(method = method %>% dplyr::recode("ctf" = "CTF") %>% factor(levels = names(colors))) tb_plot_diagnostics <- tb_plot_diagnostics %>% dplyr::filter(t_missing == 0, n == 1) %>% dplyr::group_by(alpha0) %>% dplyr::mutate(L1_ptilde = mean(log10(mean_L1_ptilde))) %>% dplyr::group_by(alpha0, median_depth, L1_ptilde) %>% dplyr::summarise(zero_percentage = mean(zero_percentage)) %>% dplyr::ungroup() # histogram of log p load(paste0(dir_output, "/params_n1.RData")) p_hist <- data.frame(logp = log10(as.vector(params$p))) %>% ggplot(aes(x = logp)) + geom_histogram() + theme_bw() + xlab(expression(paste(log[10], p[ijk]))) + ylab("Count") # difference between p and ptilde p_L1_ptilde <- tb_plot_diagnostics %>% dplyr::filter(median_depth == 10000) %>% ggplot(aes(x = -log10(alpha0), y = L1_ptilde)) + geom_point(position = position_dodge(width = 0.05)) + geom_line(position = position_dodge(width = 0.05)) + theme_bw() + xlab(expression(paste("Dispersion (-", log[10], alpha[0], " for ", tilde(p)[ijk], ')'))) + ylab(expression(paste(log[10], " Relative ", L[1], " Difference (|", p[ijk], "-", tilde(p)[ijk], "|/", p[ijk], ")"))) # ZI characteristics p_ZI <- tb_plot_diagnostics %>% dplyr::mutate(Depth = dplyr::case_when( median_depth == 1e4 ~ "Median 10,000 Reads", TRUE ~ "Median 100,000 Reads" ) %>% factor(levels = c("Median 10,000 Reads", "Median 100,000 Reads"))) %>% ggplot(aes(x = -log10(alpha0), y = zero_percentage)) + geom_point(position = position_dodge(width = 0.05)) + geom_line(position = position_dodge(width = 0.05)) + facet_grid(~ Depth) + theme_bw() + xlab(expression(paste("Dispersion (-", log[10], alpha[0], " for ", tilde(p)[ijk], ')'))) + ylab(expression(paste("Zero proportion in ", X[ijk]))) p_Supp <- cowplot::plot_grid(p_hist, p_L1_ptilde, nrow = 1, labels = c("A", "B")) %>% cowplot::plot_grid(p_ZI, ncol = 1, labels = c("", "C")) ggsave(p_Supp, filename = "figures/figure3_simulation1/figure3_Supp.pdf", width = 8, height = 8) # sparsity evaluation tb_eval <- tb_job %>% dplyr::filter(alpha0 == max(alpha0), median_depth == 1e4, t_missing == 0, n == 1, rep %in% seq(1, 10)) tb_eval <- tb_eval$i_job %>% purrr::map_dfr(function(i_job) { i_tb_job <- tb_job[i_job, ] load(paste0(dir_output, "/fit_", i_tb_job$i_job, ".RData")) load(paste0(dir_output, "/params_n", i_tb_job$n, ".RData")) load(paste0(dir_output, "/X_array_", i_tb_job$i_job, ".RData")) Yhat <- microTensor::get_Yhat(fit_decomp, R = 2) phat <- apply(Yhat, c(2, 3), function(x) { x <- x - max(x) return(exp(x)/sum(exp(x))) }) array_L1_relative <- abs(phat - params$p) / params$p rel_diff <- apply(log10(array_L1_relative), 1, mean) mean_x_zero <- apply(X_array == 0, 1, mean) tibble::tibble(rel_diff = rel_diff, mean_x_zero = mean_x_zero, method = i_tb_job$method) }) p_sparse <- tb_eval %>% dplyr::mutate(method = method %>% dplyr::recode("ctf" = "CTF") %>% factor(levels = names(colors))) %>% ggplot(aes(x = mean_x_zero, y = rel_diff, color = method, shape = method)) + geom_point() + scale_color_manual(values = colors[-2], name = "Method") + scale_shape_manual(values = c("CTF" = 19, "microTensor" = 15), name = "Method") + xlab("Per-feature sparsity") + ylab("Per-feature log10 relative difference") + theme_bw() ggsave(p_sparse, filename = "figures/figure3_simulation1/figure3_sparsity.pdf", width = 5, height = 4)
load(paste0(dir_output, "/params_n1.RData")) ptilde <- params$p alpha0 <- 10^4 set.seed(1) for(j in seq(1, dim(ptilde)[2])) for(k in seq(1, dim(ptilde)[3])) ptilde[, j, k] <- MCMCpack::rdirichlet( n = 1, alpha = alpha0 * params$p[, j, k]) seq_depth <- runif(n = dim(ptilde)[2] * dim(ptilde)[3], min = log(params$depth_range[1] * 1e5), max = log(params$depth_range[2] * 1e5)) %>% exp() %>% round() N <- matrix(seq_depth, nrow = dim(ptilde)[2], ncol = dim(ptilde)[3]) X_array <- ptilde for(j in seq(1, dim(ptilde)[2])) for(k in seq(1, dim(ptilde)[3])) X_array[, j, k] <- rmultinom(n = 1, size = N[j, k], prob = ptilde[, j, k]) R <- 5 fit_long <- microTensor::microTensor( X = X_array, R = R, ortho_m = TRUE, nn_t = FALSE, weighted = TRUE, control = list( L_init = 3, gamma = 2, abs_tol = 1e-4, rel_tol = 1e-4, maxit = 5000, verbose = FALSE)) negLogLiks <- seq(1, R) %>% purrr::map_dbl(function(i_R){ Yhat <- microTensor::get_Yhat( result = list( lambda = fit_long$lambda[seq(1, i_R)], m = fit_long$m[, seq(1, i_R), drop = FALSE], s = fit_long$s[, seq(1, i_R), drop = FALSE], t = fit_long$t[, seq(1, i_R), drop = FALSE] ) ) microTensor::negLogLik_cpp( X = X_array, Yhat = Yhat, wt = matrix(1, nrow = dim(X_array)[2], ncol = dim(X_array)[3]), lambda = 0, vm = rep(0, dim(X_array)[1]), vs = rep(0, dim(X_array)[2]), vt = rep(0, dim(X_array)[3]) ) }) # zero negLogLiks <- c(microTensor::negLogLik_cpp( X = X_array, Yhat = array(0, dim(X_array)), wt = matrix(1, nrow = dim(X_array)[2], ncol = dim(X_array)[3]), lambda = 0, vm = rep(0, dim(X_array)[1]), vs = rep(0, dim(X_array)[2]), vt = rep(0, dim(X_array)[3]) ), negLogLiks) tb_plot <- tibble::tibble( R = seq(0, 5), negLogLiks = negLogLiks ) p <- tb_plot %>% ggplot(aes(x = R, y = negLogLiks)) + geom_point() + geom_line() + theme_bw() + ylab("Negative log-likelihood") ggsave(p, filename = "figures/figure3_simulation1/fig3_supp_selectR.pdf", width = 4, height = 3.5)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.