#' dimsum__sample_count_distributions
#'
#' Plot sample nucleotide count distributions split by hamming distance (Nham_nt)
#'
#' @param input_dt input data table with count, WT and remaning column used for facetting (required)
#' @param output_file_prefix plot output path prefix (required)
#' @param width plot width (default: 12)
#' @param height plot height (default: 8)
#' @param title plot title (default: "")
#' @param error_rate sequencing error rate (default: 1e-3)
#' @param seq_length sequence length
#'
#' @return Nothing
#' @export
#' @import data.table
dimsum__sample_count_distributions <- function(
input_dt,
output_file_prefix,
width = 12,
height = 8,
title = "",
error_rate = 1e-3,
seq_length){
#Check if something to plot
if(input_dt[is.na(WT),.N]==0){
warning("dimsum__sample_count_distributions.R: No data to plot (empty data.table 'input_dt').", call. = FALSE, immediate. = TRUE, noBreaks. = TRUE)
return(NULL)
}
#Column used for facetting
fcols <- names(input_dt)[!grepl("_count|WT", names(input_dt))]
#Add 1 + jitter
set.seed(1)
count_cols <- names(input_dt)[grepl("_count", names(input_dt))]
for(i in count_cols){
input_dt[, paste0(i) := as.numeric(.SD[[1]]+1),,.SDcols = i]
input_dt[, paste0(i) := 10^(log10(.SD) + log10((.SD+1)/.SD) * runif(1)),1:nrow(input_dt) %% 1000,.SDcols = i]
# input_dt[, paste0(i) := .SD + runif(1),1:nrow(input_dt) %% 1000,.SDcols = i]
}
#Rename count columns
names(input_dt)[grep("_count", names(input_dt))] <- dimsum__plot_samplename(names(input_dt)[grep("_count", names(input_dt))])
#Real variants
plot_dt <- reshape2::melt(input_dt, id = c(fcols, "WT"))
plot_dt[, Hamming_distance := factor(.SD[[1]]),,.SDcols = fcols]
# plot_dt[,value := value+1]
#WT frequency
plot_dt_wt <- plot_dt[WT==T,.SD,,.SDcols = c("variable", "value")]
# #Most abundant variant frequency
# plot_dt_ma <- plot_dt[,.(value=max(value)),variable]
#Fake single expected frequency: fake_single_counts = WT_count*error_rate/4
plot_dt_fsingles <- data.table::copy(plot_dt_wt)
plot_dt_fsingles[, value := ((value - 1)*error_rate/4 + 1)]
plot_dt_fsingles[, Hamming_distance := factor(1)]
#Fake double expected frequency distribution: fake_double_counts = real_single_counts*error_rate*2/4
plot_dt_fdoubles <- plot_dt[get(fcols[1]) == 1]
for(i in unique(plot_dt_fdoubles[,variable])){
#Adjust observed single counts for outflow to doubles
plot_dt_fdoubles[variable==i,real_value := value - 1 + (value - 1 - plot_dt_fsingles[variable==i,value - 1])*seq_length*error_rate]
#Adjust observed single counts for inflow from WT
plot_dt_fdoubles[variable==i,real_value := real_value - plot_dt_fsingles[variable==i,value - 1]]
#Fake doubles
plot_dt_fdoubles[variable==i,value := real_value*error_rate*2/4 + 1]
}
plot_dt_fdoubles <- plot_dt_fdoubles[,.(value = median(value)),variable]
plot_dt_fdoubles[, Hamming_distance := factor(2)]
#Remove real variants with hamming distance ==0
plot_dt <- plot_dt[get(fcols[1])>0,]
#Plot
if(length(fcols)==1){
#Plot without subplot stratification (Nham_nt)
d <- ggplot2::ggplot(plot_dt, ggplot2::aes(value, color = Hamming_distance)) +
ggplot2::geom_density() + ggplot2::scale_x_log10() + ggplot2::theme_bw() +
ggplot2::labs(x = "variant count + 1 (log scale)", y = "Density", title = title, color='Nucleotide\nHamming dist.') +
# ggplot2::geom_vline(data = plot_dt_ma, ggplot2::aes(xintercept = value), linetype = 2) +
ggplot2::geom_vline(data = plot_dt_wt, ggplot2::aes(xintercept = value), linetype = 2) +
ggplot2::geom_vline(data = plot_dt_fsingles, ggplot2::aes(xintercept = value, color = Hamming_distance), linetype = 2) +
ggplot2::geom_vline(data = plot_dt_fdoubles, ggplot2::aes(xintercept = value, color = Hamming_distance), linetype = 2) +
ggplot2::facet_grid(Hamming_distance ~ variable, scales = "free")
dimsum__save_png(paste0(output_file_prefix, ".png"), d, width = width, height = height)
ggplot2::ggsave(paste0(output_file_prefix, ".pdf"), d, width = width, height = height)
}else{
#Plot with subplot stratification (using second facet column, Nham_aa)
plot_list <- list()
plot_indices <- unlist(plot_dt[order(get(fcols[2]), decreasing = F),unique(.SD),,.SDcols = fcols[2]])
for(i in plot_indices){
this_plot_dt <- plot_dt[get(fcols[2])==i & get(fcols[1])<=max((i*3), 3) & get(fcols[1])>=i,]
this_plot_dt[, Hamming_distance := factor(.SD[[1]], levels = 1:max((i*3), 3)),,.SDcols = fcols]
plot_list[[as.character(i)]] <- ggplot2::ggplot(this_plot_dt, ggplot2::aes(value, color = Hamming_distance)) +
ggplot2::geom_density() + ggplot2::scale_x_log10() + ggplot2::theme_bw() +
ggplot2::labs(x = "variant count + 1 (log scale)", y = "Density", title = paste0("Amino acid Hamming distance: ", i), color='Nucleotide\nHamming dist.') +
# ggplot2::geom_vline(data = plot_dt_ma, ggplot2::aes(xintercept = value), linetype = 2) +
ggplot2::geom_vline(data = plot_dt_wt, ggplot2::aes(xintercept = value), linetype = 2) +
ggplot2::geom_vline(data = plot_dt_fsingles, ggplot2::aes(xintercept = value, color = Hamming_distance), linetype = 2) +
ggplot2::geom_vline(data = plot_dt_fdoubles, ggplot2::aes(xintercept = value, color = Hamming_distance), linetype = 2) +
ggplot2::facet_grid(Hamming_distance ~ variable, scales = "free")
}
#Combine plots
temp_grid.arrange <- gridExtra::arrangeGrob
p <- do.call("temp_grid.arrange", c(plot_list, nrow = length(plot_indices)))
dimsum__save_png(paste0(output_file_prefix, ".png"), p, width = width, height = height)
ggplot2::ggsave(paste0(output_file_prefix, ".pdf"), p, width = width, height = height)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.