knitr::opts_chunk$set(echo = FALSE) library(fakeDataWithError) library(visualizationQualityControl) library(cowplot) library(dplyr) library(errorTransformation) save_plot_loc <- "~/Documents/posters/dataTransformationsError/figures/"
In many omics data analyses, a primary step is the application of a transformation to the data. Transformations are generally employed to convert proportional error (variance) to additive error, which most statistical methods appropriately handle. However, omics data frequently contain error sources that result in both additive and proportional errors [1 - 5]. To our knowledge, there has not been a systematic study on detecting the presence of proportional error in omics data, or the effect of transformations on the error structure.
In this work, we demonstrate a set of three simple graphs which facilitate the detection of proportional and mixed error in omics data when multiple replicates are available. The three graphs illustrate proportional and mixed error in a visually compelling manner that is both straight-forward to recognize and to communicate. The graphs plot the 1) absolute difference in the range, 2) standard deviation and 3) relative standard deviation against the mean signal across replicates (Fig. 1 C-E). In addition to showing the presence of different types of error, these graphs readily demonstrate the effect of various transformations on the error structure as well.
Using these graphical summaries we find that the log and hyperbolic inverse sin transforms are the most effective method of the common methods employed for removing proportional error.
set.seed(1234) n_point <- 1000 n_rep <- 100 offset <- 4 simulated_data <- c(rlnorm(n_point / 2, meanlog = 1, sdlog = 1), runif(n_point / 2, 5, 100)) simulated_data <- simulated_data + offset
Simulated data was generated by drawing points from two different distributions:
1) log-normal distribution with a mean in log-space of 1 and a standard
deviation of 1; 2) a uniform distribution over the range of 5 - 100, with
r n_point/2
points in each distribution. A value of r offset
was added to
the data purely to avoid values <= 1 following the addition
of error, making visualization easier. These data points are the pure
initial data to which different types of error are added. From this set,
r n_rep
replicates are generated with either additive (add
), proportional
(prop
) or mixed (both additive and proportional, mixed
) error.
The transformations applied include autoscaling, pareto scaling, root (2nd and 5th), log, and hyperbolic inverse sin (arcsinh).
add_sd <- 0.5 add_error <- add_uniform_noise(n_rep, simulated_data, add_sd) add_error[(add_error < 0)] <- 0 add_nozero <- filter_features(add_error, 1e-16) add_minone <- filter_features(add_error, 1)
prop_sd <- 0.1 prop_error <- add_prop_noise(n_rep, simulated_data, prop_sd) prop_error[(prop_error < 0)] <- 0 prop_nozero <- filter_features(prop_error, 1e-16) prop_minone <- filter_features(prop_error, 1)
mix_error <- add_prop_uniform(n_rep, simulated_data, add_sd, prop_sd) mix_error[(mix_error < 0)] <- 0 mix_nozero <- filter_features(mix_error, 1e-16) mix_minone <- filter_features(mix_error, 1)
Fig 1. Additive error only. A) Two replicates plotted in "original" space B) Same two replicates rotated using MA plot, where M is "minus" (rep1 - rep2) and A is average of rep1 and rep2. C) Difference between largest value and smallest value vs the mean E) Standard deviation (sd) across replicates vs the mean F) Relative standard deviation (rsd) vs the mean. Red line indicates the amount of additive error added.
pair_frame <- data.frame(rep1 = add_error[,1], rep2 = add_error[,2]) add_pair_plot <- ggplot(pair_frame, aes(x = rep1, y = rep2)) + geom_point() ma_frame <- data.frame(M = add_error[,1] - add_error[,2], A = rowMeans(add_error[, 1:2])) add_ma_plot <- ggplot(ma_frame, aes(x = A, y = M)) + geom_point() diff_error <- apply(add_error, 1, function(x){ max(x) - min(x) }) range_frame <- data.frame(difference = diff_error, mean = rowMeans(add_error)) add_diff_plot <- ggplot(range_frame, aes(x = mean, y = difference)) + geom_point() add_error_summ <- summarize_data(t(add_error)) add_sd_plot <- filter(add_error_summ, type == "sd") %>% ggplot(., aes(x = mean, y = var)) + geom_point() + ylab("sd") + geom_hline(yintercept = add_sd, color = "red") add_rsd_plot <- filter(add_error_summ, type == "rsd") %>% ggplot(., aes(x = mean, y = var)) + geom_point() + ylab("rsd")
add_rep_ma <- plot_grid(add_pair_plot, add_ma_plot, nrow = 1, labels = LETTERS[1:2]) add_summary_plot <- plot_grid(add_diff_plot, add_sd_plot, add_rsd_plot, nrow = 1, labels = LETTERS[3:5]) add_rep_ma add_summary_plot
ggsave(filename = file.path(save_plot_loc, "add_rep_ma.png"), plot = add_rep_ma, width = 8, height = 5, dpi = 300) ggsave(filename = file.path(save_plot_loc, "add_summary_plot.png"), plot = add_summary_plot, width = 12, height = 5, dpi = 300)
Fig 2. Proportional error only. A) Two replicates plotted in "original" space B) Same two replicates rotated using MA plot, where M is "minus" (rep1 - rep2) and A is average of rep1 and rep2. C) Difference between largest value and smallest value vs the mean E) Standard deviation (sd) across replicates vs the mean F) Relative standard deviation (rsd) vs the mean. Red line indicates the amount of proportional error added.
pair_frame <- data.frame(rep1 = prop_error[,1], rep2 = prop_error[,2]) prop_pair_plot <- ggplot(pair_frame, aes(x = rep1, y = rep2)) + geom_point() ma_frame <- data.frame(M = prop_error[,1] - prop_error[,2], A = rowMeans(prop_error[, 1:2])) prop_ma_plot <- ggplot(ma_frame, aes(x = A, y = M)) + geom_point() diff_error <- apply(prop_error, 1, function(x){ max(x) - min(x) }) range_frame <- data.frame(difference = diff_error, mean = rowMeans(prop_error)) prop_diff_plot <- ggplot(range_frame, aes(x = mean, y = difference)) + geom_point() prop_error_summ <- summarize_data(t(prop_error)) prop_sd_plot <- filter(prop_error_summ, type == "sd") %>% ggplot(., aes(x = mean, y = var)) + geom_point() + ylab("sd") prop_rsd_plot <- filter(prop_error_summ, type == "rsd") %>% ggplot(., aes(x = mean, y = var)) + geom_point() + ylab("rsd") + geom_hline(yintercept = prop_sd, color = "red")
prop_ma_plot <- plot_grid(prop_pair_plot, prop_ma_plot, nrow = 1, labels = LETTERS[1:2]) prop_summary_plot <- plot_grid(prop_diff_plot, prop_sd_plot, prop_rsd_plot, nrow = 1, labels = LETTERS[3:5]) prop_ma_plot prop_summary_plot
ggsave(filename = file.path(save_plot_loc, "prop_ma_plot.png"), plot = prop_ma_plot, width = 8, height = 5, dpi = 300) ggsave(filename = file.path(save_plot_loc, "prop_summary_plot.png"), plot = prop_summary_plot, width = 12, height = 5, dpi = 300)
Fig 3. Additive and proportional (mixed) error. A) Two replicates plotted in "original" space B) Same two replicates rotated using MA plot, where M is "minus" (rep1 - rep2) and A is average of rep1 and rep2. C) Difference between largest value and smallest value vs the mean E) Standard deviation (sd) across replicates vs the mean F) Relative standard deviation (rsd) vs the mean.
pair_frame <- data.frame(rep1 = mix_error[,1], rep2 = mix_error[,2]) mix_pair_plot <- ggplot(pair_frame, aes(x = rep1, y = rep2)) + geom_point() ma_frame <- data.frame(M = mix_error[,1] - mix_error[,2], A = rowMeans(mix_error[, 1:2])) mix_ma_plot <- ggplot(ma_frame, aes(x = A, y = M)) + geom_point() diff_error <- apply(mix_error, 1, function(x){ abs(max(x) - min(x)) }) range_frame <- data.frame(difference = diff_error, mean = rowMeans(mix_error)) mix_diff_plot <- ggplot(range_frame, aes(x = mean, y = difference)) + geom_point() mix_error_summ <- summarize_data(t(mix_error)) mix_sd_plot <- filter(mix_error_summ, type == "sd") %>% ggplot(., aes(x = mean, y = var)) + geom_point() + ylab("sd") mix_rsd_plot <- filter(mix_error_summ, type == "rsd") %>% ggplot(., aes(x = mean, y = var)) + geom_point() + ylab("rsd")
mix_ma_plot <- plot_grid(mix_pair_plot, mix_ma_plot, nrow = 1, labels = LETTERS[1:2]) mix_summary_plot <- plot_grid(mix_diff_plot, mix_sd_plot, mix_rsd_plot, nrow = 1, labels = LETTERS[3:5]) mix_ma_plot mix_summary_plot
ggsave(filename = file.path(save_plot_loc, "mix_ma_plot.png"), plot = mix_ma_plot, width = 8, height = 5, dpi = 300) ggsave(filename = file.path(save_plot_loc, "mix_summary_plot.png"), plot = mix_summary_plot, width = 12, height = 5, dpi = 300)
ae_as <- auto_scale(add_error) ae_as_summary <- summarize_data(t(ae_as)) ae_as_diff <- plot_diff(ae_as, log_mean = log, use_title = NULL) ae_as_sd <- plot_sd(ae_as, log_mean = log, use_title = NULL) ae_as_rsd <- plot_sd(ae_as, sd_type = "rsd", log_mean = log, use_title = NULL) plot_grid(ae_as_diff, ae_as_sd, ae_as_rsd, nrow = 1, labels = LETTERS[1:3])
pe_as <- auto_scale(prop_error) pe_as_diff <- plot_diff(pe_as, log_mean = log, use_title = NULL) pe_as_sd <- plot_sd(pe_as, log_mean = log, use_title = NULL) pe_as_rsd <- plot_sd(pe_as, sd_type = "rsd", log_mean = log, use_title = NULL) plot_grid(pe_as_diff, pe_as_sd, pe_as_rsd, nrow = 1, labels = LETTERS[1:3])
me_as <- auto_scale(mix_error) me_as_diff <- plot_diff(me_as, log_mean = NULL, use_title = NULL) me_as_sd <- plot_sd(me_as, log_mean = NULL, use_title = NULL) me_as_rsd <- plot_sd(me_as, sd_type = "rsd", log_mean = NULL, use_title = NULL) mix_auto <- plot_grid(me_as_diff, me_as_sd, me_as_rsd, nrow = 1, labels = LETTERS[1:3]) mix_auto
me_nt_summ <- summarize_data(t(mix_error)) max_me_nt <- filter(me_nt_summ, type == "rsd") %>% select(., var) %>% max() me_as_summ <- summarize_data(t(me_as)) max_me_as <- filter(me_as_summ, type == "rsd") %>% select(., var) %>% max() identical(max_me_nt, max_me_as) auto_compare <- summarize_two(mix_error, me_as, sd_type = "diff", name1 = "mix", name2 = "autoscale(mix)") #auto_compare$mean <- log(auto_compare$mean) # set up the factors properly auto_compare$data <- factor(auto_compare$data, levels = c("mix", "autoscale(mix)"), ordered = TRUE) ggplot(auto_compare, aes(x = var, fill = data)) + geom_density(alpha = 0.5) ggplot(auto_compare, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5)
me_as_plot_compare <- ggplot(auto_compare, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) + xlab("mean") + ylab("diff") + theme(legend.position = c(0.5, 0.8), legend.title = element_text(size = 0)) mix_auto_summary <- plot_grid(me_as_diff, me_as_sd, me_as_rsd, nrow = 1, labels = LETTERS[1:3]) mix_auto_compare <- plot_grid(me_as_plot_compare, nrow = 1, labels = LETTERS[4]) mix_auto_summary mix_auto_compare
ggsave(file.path(save_plot_loc, "mix_auto_summary.png"), plot = mix_auto_summary, dpi = 300, width = 12, height = 5) ggsave(file.path(save_plot_loc, "mix_auto_compare.png"), plot = mix_auto_compare, dpi = 300, width = 4, height = 5)
Fig 4. Autoscaling applied to mixed error model data. A) Maximum difference vs the mean. B) Standard deviation (sd) vs the mean. C) Relative standard deviation (rsd) vs the mean. D) Maximum difference of original and autoscaled data vs the mean.
Applying a variance or autoscaling to the data results in the removal of the additive error to a completely identical standard deviation of 1 across all samples at all abundances. This applies across all of the different error types. Although the standard deviation becomes identical and constant, the maximum relative variance is the same, and the distribution of the rsd values is identical, while the range of the mean is completely changed, as well as it's relationship to the differences.
ae_ps <- pareto_scale(add_error) ae_ps_diff <- plot_diff(ae_ps, log_mean = log, use_title = NULL) ae_ps_sd <- plot_sd(ae_ps, log_mean = log, use_title = NULL) ae_ps_rsd <- plot_sd(ae_ps, sd_type = "rsd", log_mean = log, use_title = NULL) plot_grid(ae_ps_diff, ae_ps_sd, ae_ps_rsd, nrow = 1, labels = LETTERS[1:3])
pe_ps <- pareto_scale(prop_error) pe_ps_diff <- plot_diff(pe_ps, log_mean = NULL, use_title = NULL) pe_ps_sd <- plot_sd(pe_ps, log_mean = NULL, use_title = NULL) pe_ps_rsd <- plot_sd(pe_ps, sd_type = "rsd", log_mean = NULL, use_title = NULL) plot_grid(pe_ps_diff, pe_ps_sd, pe_ps_rsd, nrow = 1, labels = LETTERS[1:3])
me_ps <- pareto_scale(mix_error) me_ps_diff <- plot_diff(me_ps, log_mean = NULL, use_title = NULL) me_ps_sd <- plot_sd(me_ps, log_mean = NULL, use_title = NULL) me_ps_rsd <- plot_sd(me_ps, sd_type = "rsd", log_mean = NULL, use_title = NULL) plot_grid(me_ps_diff, me_ps_sd, me_ps_rsd, nrow = 1, labels = LETTERS[1:3])
me_ps_summ <- summarize_data(t(me_ps)) max_me_ps <- filter(me_ps_summ, type == "rsd") %>% select(., var) %>% max() all.equal(max_me_nt, max_me_ps) pareto_compare <- summarize_two(mix_nozero, me_ps, sd_type = "rsd", name1 = "mix", name2 = "pareto(mix)") pareto_compare$data <- factor(pareto_compare$data, levels = c("mix", "pareto(mix)"), ordered = TRUE) #pareto_compare$mean <- log(pareto_compare$mean) ggplot(pareto_compare, aes(x = var, fill = data)) + geom_density(alpha = 0.5) ggplot(pareto_compare, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5)
Fig. 5 Pareto scaling applied to mixed error model data. A) Largest difference vs the mean. B) Standard deviation (sd) vs the mean. C) Relative standard deviation (rsd) vs the mean. D) Relative standard deviation of mixed error data before (mix) and after (pareto(mix)) pareto scaling.
In contrast to autoscaling, pareto scaling modifies the values by the square root of the variance. As shown in Figure 5, this changes the absolute values of the standard deviations, but does not change the distribution, or the relationship of the variance to the mean. In fact, the maximum of the RSD is virtually unchanged, it is only the range of the mean that changes.
me_ps_rsd_oomp <- ggplot(pareto_compare, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) + xlab("mean") + ylab("rsd") + theme(legend.position = c(0.5, 0.8), legend.title = element_text(size = 0)) pareto_summary_graph <- plot_grid(me_ps_diff, me_ps_sd, me_ps_rsd, nrow = 1, labels = LETTERS[1:3]) pareto_compare_graph <- plot_grid(me_ps_rsd_oomp, nrow = 1, labels = LETTERS[4]) pareto_summary_graph pareto_compare_graph ggsave(file.path(save_plot_loc, "mix_pareto_summary.png"), plot = pareto_summary_graph, dpi = 300, width = 12, height = 5) ggsave(file.path(save_plot_loc, "mix_pareto_compare.png"), plot = pareto_compare_graph, dpi = 300, width = 4, height = 5)
root_add <- root_transform(add_error) root_add2 <- root_transform(add_error, 5) root_add_comb <- cbind(root_add, root_add2) root_class <- rep(c("2", "5"), each = 100) root_summary <- summarize_data(t(root_add_comb), root_class) ae_rt_diff <- filter(root_summary, type == "diff") %>% ggplot(., aes(x = mean, y = var, color = class)) + geom_point() ae_rt_sd <- filter(root_summary, type == "sd") %>% ggplot(., aes(x = mean, y = var, color = class)) + geom_point() ae_rt_rsd <- filter(root_summary, type == "rsd") %>% ggplot(., aes(x = mean, y = var, color = class)) + geom_point() plot_grid(ae_rt_diff, ae_rt_sd, ae_rt_rsd, nrow = 1, labels = LETTERS[1:3])
root_prop1 <- root_transform(prop_error) root_prop2 <- root_transform(prop_error, root = 5) root_prop_comb <- cbind(root_prop1, root_prop2) root_pe_summary <- summarize_data(t(root_prop_comb), root_class) root_pe_diff <- filter(root_pe_summary, type == "diff") %>% ggplot(., aes(x = mean, y = var, color = class)) + geom_point() root_pe_sd <- filter(root_pe_summary, type == "sd") %>% ggplot(., aes(x = mean, y = var, color = class)) + geom_point() root_pe_rsd <- filter(root_pe_summary, type == "rsd") %>% ggplot(., aes(x = mean, y = var, color = class)) + geom_point() plot_grid(root_pe_diff, root_pe_sd, root_pe_rsd, nrow = 1, labels = LETTERS[1:3])
root_mixed <- root_transform(mix_error) root_mixed2 <- root_transform(mix_error, 5) me_rt_comb <- cbind(root_mixed, root_mixed2) me_rt_summary <- summarize_data(t(me_rt_comb), root_class) me_rt_diff <- filter(me_rt_summary, type == "diff") %>% ggplot(., aes(x = mean, y = var)) + geom_point() me_rt_sd <- filter(me_rt_summary, type == "sd") %>% ggplot(., aes(x = mean, y = var)) + geom_point() me_rt_rsd <- filter(me_rt_summary, type == "rsd") %>% ggplot(., aes(x = mean, y = var)) + geom_point() plot_grid(me_rt_diff, me_rt_sd, me_rt_rsd, nrow = 1, labels = LETTERS[1:3])
compare_rt_me_rs <- rbind( summarize_data(t(mix_error)) %>% filter(., type == "rsd") %>% mutate(., data = "mix") %>% transform(., mean = log(mean)), me_rt_summary %>% filter(., type == "sd") %>% mutate(., data = "root(mix)") ) hist_comp_rt_me_rs <- ggplot(compare_rt_me_rs, aes(x = var, fill = data)) + geom_density(alpha = 0.5) + theme(legend.position = "none") point_comp_rt_me_rs <- ggplot(compare_rt_me_rs, aes(x = mean, y = var)) + geom_point(alpha = 0.5) + theme(legend.position = c(0.5, 0.8), legend.title = element_text(size = 0)) plot_grid(hist_comp_rt_me_rs, point_comp_rt_me_rs, nrow = 1, labels = LETTERS[1:2]) compare_rt_me_rr <- rbind( summarize_data(t(mix_nozero)) %>% filter(., type == "rsd") %>% mutate(., data = "rsd(mix)") %>% transform(., mean = log(mean)), me_rt_summary %>% filter(., type == "rsd") %>% mutate(., data = "rsd(root(mix))") ) hist_comp_rt_me_rr <- ggplot(compare_rt_me_rr, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_comp_rt_me_rr <- ggplot(compare_rt_me_rr, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) + theme(legend.position = c(0.5, 0.8), legend.title = element_text(size = 0)) + ylab("rsd") plot_grid(hist_comp_rt_me_rr, point_comp_rt_me_rr, nrow = 1, labels = LETTERS[1:2])
mix_root_summary <- plot_grid(me_rt_diff, me_rt_sd, me_rt_rsd, nrow = 1, labels = LETTERS[1:3]) mix_root_compare <- plot_grid(point_comp_rt_me_rr, nrow = 1, labels = LETTERS[4]) mix_root_summary mix_root_compare ggsave(file.path(save_plot_loc, "mix_root_summary.png"), plot = mix_root_summary, dpi = 300, width = 12, height = 5) ggsave(file.path(save_plot_loc, "mix_root_compare.png"), plot = mix_root_compare, dpi = 300, width = 4, height = 5)
Fig 6. Root (2nd and 5th) transform applied to mixed error model data. A) Largest difference vs the mean. B) Standard deviation (sd) vs the mean. C) Relative standard deviation (rsd) vs the mean. D) Relative standard deviation (rsd) of mixed error data before (mix) and after (root(mix)) root transformation.
Although the root transform changes the overall amount of variance compared to the mean, as with pareto scaling, the overall relationship of variance to the mean is maintained, as all the plots show. The variance is still dependent on the mean value, and the proportional error is still present in the data.
log_add <- log(add_error) ae_lt_diff <- plot_diff(log_add, log_mean = FALSE) ae_lt_sd <- plot_sd(log_add, log_mean = FALSE) ae_lt_rsd <- plot_sd(log_add, log_mean = FALSE, sd_type = "rsd") plot_grid(ae_lt_diff, ae_lt_sd, ae_lt_rsd, nrow = 1, labels = LETTERS[1:3])
log_prop <- log(prop_error) pe_lt_diff <- plot_diff(log_prop, log_mean = FALSE, use_title = NULL) pe_lt_sd <- plot_sd(log_prop, log_mean = FALSE, use_title = NULL) pe_lt_rsd <- plot_sd(log_prop, log_mean = FALSE, sd_type = "rsd", use_title = NULL) plot_grid(pe_lt_diff, pe_lt_sd, pe_lt_rsd, nrow = 1, labels = LETTERS[1:3])
log_mix <- log(mix_error) me_lt_diff <- plot_diff(log_mix, log_mean = FALSE, use_title = NULL) me_lt_sd <- plot_sd(log_mix, log_mean = FALSE, use_title = NULL) me_lt_rsd <- plot_sd(log_mix, log_mean = FALSE, sd_type = "rsd", use_title = NULL) plot_grid(me_lt_diff, me_lt_sd, me_lt_rsd, nrow = 1, labels = LETTERS[1:3])
compare_lt_ae_sr <- rbind( summarize_data(t(add_error)) %>% filter(., type == "rsd") %>% mutate(., data = "add") %>% transform(., mean = log(mean)), summarize_data(t(log_add)) %>% filter(., type == "sd") %>% mutate(., data = "log")) hist_compare_lt_ae_sr <- ggplot(compare_lt_ae_sr, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_compare_lt_ae_sr <- ggplot(compare_lt_ae_sr, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) plot_grid(hist_compare_lt_ae_sr, point_compare_lt_ae_sr, nrow = 1, labels = LETTERS[1:2]) compare_lt_ae_rr <- rbind( summarize_data(t(add_nozero)) %>% filter(., type == "rsd") %>% mutate(., data = "add") %>% transform(., mean = log(mean)), summarize_data(t(log_add)) %>% filter(., type == "rsd") %>% mutate(., data = "log") ) hist_compare_lt_ae_rr <- ggplot(compare_lt_ae_rr, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_compare_lt_ae_rr <- ggplot(compare_lt_ae_rr, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) plot_grid(hist_compare_lt_ae_rr, point_compare_lt_ae_rr, nrow = 1, labels = LETTERS[1:2])
compare_lt_pe_rs <- rbind( summarize_data(t(prop_error)) %>% filter(., type == "rsd") %>% mutate(., data = "prop") %>% transform(., mean = log(mean)), summarize_data(t(log_prop)) %>% filter(., type == "sd") %>% mutate(., data = "prop_log") ) hist_comp_lt_pe_rs <- ggplot(compare_lt_pe_rs, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_comp_lt_pe_rs <- ggplot(compare_lt_pe_rs, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) plot_grid(hist_comp_lt_pe_rs, point_comp_lt_pe_rs, nrow = 1, labels = LETTERS[1:2]) compare_lt_pe_rr <- rbind( summarize_data(t(prop_error)) %>% filter(., type == "rsd") %>% mutate(., data = "prop") %>% transform(., mean = log(mean)), summarize_data(t(log_prop)) %>% filter(., type == "rsd") %>% mutate(., data = "prop_log") ) hist_comp_lt_pe_rr <- ggplot(compare_lt_pe_rr, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_comp_lt_pe_rr <- ggplot(compare_lt_pe_rr, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) plot_grid(hist_comp_lt_pe_rr, point_comp_lt_pe_rr, nrow = 1, labels = LETTERS[1:2])
\
compare_lt_me_rs <- rbind( summarize_data(t(mix_error)) %>% filter(., type == "rsd") %>% mutate(., data = "rsd(mix)") %>% transform(., mean = log(mean)), summarize_data(t(log_mix)) %>% filter(., type == "sd") %>% mutate(., data = "sd(log(mix))") ) compare_lt_me_rs$data <- factor(compare_lt_me_rs$data, levels = c("rsd(mix)", "sd(log(mix))"), ordered = TRUE) hist_comp_lt_me_rs <- ggplot(compare_lt_me_rs, aes(x = var, fill = data)) + geom_density(alpha = 0.5) + theme(legend.position = "none") point_comp_lt_me_rs <- ggplot(compare_lt_me_rs, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) + theme(legend.position = c(0.5, 0.8), legend.title = element_text(size = 0)) plot_grid(hist_comp_lt_me_rs, point_comp_lt_me_rs, nrow = 1, labels = LETTERS[1:2]) compare_lt_me_rr <- rbind( summarize_data(t(mix_nozero)) %>% filter(., type == "rsd") %>% mutate(., data = "mix") %>% transform(., mean = log(mean)), summarize_data(t(log_mix)) %>% filter(., type == "rsd") %>% mutate(., data = "log(mix)") ) compare_lt_me_rr$data <- factor(compare_lt_me_rr$data, levels = c("mix", "log(mix)"), ordered = TRUE) hist_comp_lt_me_rr <- ggplot(compare_lt_me_rr, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_comp_lt_me_rr <- ggplot(compare_lt_me_rr, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) + theme(legend.position = c(0.5, 0.8), legend.title = element_text(size = 0)) + ylab("rsd") plot_grid(hist_comp_lt_me_rr, point_comp_lt_me_rr, nrow = 1, labels = LETTERS[1:2])
log_mix_summary <- plot_grid(me_lt_diff, me_lt_sd, me_lt_rsd, nrow = 1, labels = LETTERS[1:3]) log_mix_compare <- plot_grid(point_comp_lt_me_rs, point_comp_lt_me_rr, nrow = 1, labels = LETTERS[4:5]) log_mix_summary log_mix_compare ggsave(file.path(save_plot_loc, "log_mix_summary.png"), plot = log_mix_summary, dpi = 300, width = 12, height = 5) ggsave(file.path(save_plot_loc, "log_mix_compare.png"), plot = log_mix_compare, dpi = 300, width = 8, height = 5)
Fig 7. Log transform applied to mixed error model data. A) Largest difference vs the mean. B) Standard deviation (sd) vs the mean. C) Relative standard deviation (rsd) vs the mean. D) Relative standard deviation of mixed error data (rsd(mix)) and standard deviation of log transformed mixed error data (sd(log(mix))) vs the mean and E) Relative standard deviation of mixed error data before (mix) and after (log(mix)) log transformation. Note that in D) and E), the log(mean) of the original mixed error is used to put the points on a comparable scale.
The log transform completely changes how the standard deviation depends on the mean. Most notably, the variance becomes constant at higher mean values, much like the standard deviation in the additive error case. As shown in Figure 7D, the standard deviation of the log transform actually looks almost exactly like the relative standard deviation of the un-transformed data. Although it also appears that in the log-transformed data the low values have more error than the high values, Figure 7E shows that overall the relative error has decreased for the data.
asinh_add <- arcsinh_transform(add_error) ae_ah_diff <- plot_diff(asinh_add, log_mean = FALSE) ae_ah_sd <- plot_sd(asinh_add, log_mean = FALSE) ae_ah_rsd <- plot_sd(asinh_add, log_mean = FALSE, sd_type = "rsd") plot_grid(ae_ah_diff, ae_ah_sd, ae_ah_rsd, nrow = 1, labels = LETTERS[1:3])
asinh_prop <- arcsinh_transform(prop_error) pe_ah_diff <- plot_diff(asinh_prop, log_mean = FALSE, use_title = NULL) pe_ah_sd <- plot_sd(asinh_prop, log_mean = FALSE, use_title = NULL) pe_ah_rsd <- plot_sd(asinh_prop, log_mean = FALSE, sd_type = "rsd", use_title = NULL) plot_grid(pe_ah_diff, pe_ah_sd, pe_ah_rsd, nrow = 1, labels = LETTERS[1:3])
asinh_mix <- arcsinh_transform(mix_error) me_ah_diff <- plot_diff(asinh_mix, log_mean = FALSE, use_title = NULL) me_ah_sd <- plot_sd(asinh_mix, log_mean = FALSE, use_title = NULL) me_ah_rsd <- plot_sd(asinh_mix, log_mean = FALSE, sd_type = "rsd", use_title = NULL) plot_grid(me_ah_diff, me_ah_sd, me_ah_rsd, nrow = 1, labels = LETTERS[1:3])
compare_ah_ae_sr <- rbind( summarize_data(t(add_error)) %>% filter(., type == "rsd") %>% mutate(., data = "add") %>% transform(., mean = log(mean)), summarize_data(t(asinh_add)) %>% filter(., type == "sd") %>% mutate(., data = "asinh")) hist_compare_ah_ae_sr <- ggplot(compare_ah_ae_sr, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_compare_ah_ae_sr <- ggplot(compare_ah_ae_sr, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) plot_grid(hist_compare_ah_ae_sr, point_compare_ah_ae_sr, nrow = 1, labels = LETTERS[1:2]) compare_ah_ae_rr <- rbind( summarize_data(t(add_error)) %>% filter(., type == "rsd") %>% mutate(., data = "add") %>% transform(., mean = log(mean)), summarize_data(t(asinh_add)) %>% filter(., type == "rsd") %>% mutate(., data = "asinh") ) hist_compare_ah_ae_rr <- ggplot(compare_ah_ae_rr, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_compare_ah_ae_rr <- ggplot(compare_ah_ae_rr, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) plot_grid(hist_compare_ah_ae_rr, point_compare_ah_ae_rr, nrow = 1, labels = LETTERS[1:2])
compare_ah_pe_rs <- rbind( summarize_data(t(prop_error)) %>% filter(., type == "rsd") %>% mutate(., data = "rsd(prop)") %>% transform(., mean = log(mean)), summarize_data(t(asinh_prop)) %>% filter(., type == "sd") %>% mutate(., data = "sd(asinh(prop))") ) hist_comp_ah_pe_rs <- ggplot(compare_ah_pe_rs, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_comp_ah_pe_rs <- ggplot(compare_ah_pe_rs, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) plot_grid(hist_comp_ah_pe_rs, point_comp_ah_pe_rs, nrow = 1, labels = LETTERS[1:2]) compare_ah_pe_rr <- rbind( summarize_data(t(prop_error)) %>% filter(., type == "rsd") %>% mutate(., data = "rsd(prop)") %>% transform(., mean = log(mean)), summarize_data(t(log_prop)) %>% filter(., type == "rsd") %>% mutate(., data = "rsd(asinh(prop))") ) hist_comp_ah_pe_rr <- ggplot(compare_ah_pe_rr, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_comp_ah_pe_rr <- ggplot(compare_ah_pe_rr, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) plot_grid(hist_comp_ah_pe_rr, point_comp_ah_pe_rr, nrow = 1, labels = LETTERS[1:2])
compare_ah_me_rs <- rbind( summarize_data(t(mix_error)) %>% filter(., type == "rsd") %>% mutate(., data = "rsd(mix)") %>% transform(., mean = log(mean)), summarize_data(t(asinh_mix)) %>% filter(., type == "sd") %>% mutate(., data = "sd(asinh(mix))") ) compare_ah_me_rs$data <- factor(compare_ah_me_rs$data, levels = c("rsd(mix)", "sd(asinh(mix))"), ordered = TRUE) hist_comp_ah_me_rs <- ggplot(compare_ah_me_rs, aes(x = var, fill = data)) + geom_density(alpha = 0.5) + theme(legend.position = "none") point_comp_ah_me_rs <- ggplot(compare_ah_me_rs, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) + theme(legend.position = c(0.5, 0.8), legend.title = element_text(size = 0)) plot_grid(hist_comp_ah_me_rs, point_comp_ah_me_rs, nrow = 1, labels = LETTERS[1:2]) compare_ah_me_rr <- rbind( summarize_data(t(mix_nozero)) %>% filter(., type == "rsd") %>% mutate(., data = "mix") %>% transform(., mean = log(mean)), summarize_data(t(asinh_mix)) %>% filter(., type == "rsd") %>% mutate(., data = "asinh(mix)") ) compare_ah_me_rr$data <- factor(compare_ah_me_rr$data, levels = c("mix", "asinh(mix)"), ordered = TRUE) hist_comp_ah_me_rr <- ggplot(compare_ah_me_rr, aes(x = var, fill = data)) + geom_density(alpha = 0.5) point_comp_ah_me_rr <- ggplot(compare_ah_me_rr, aes(x = mean, y = var, color = data)) + geom_point(alpha = 0.5) + theme(legend.position = c(0.5, 0.8), legend.title = element_text(size = 0)) + ylab("rsd") plot_grid(hist_comp_ah_me_rr, point_comp_ah_me_rr, nrow = 1, labels = LETTERS[1:2])
mix_asinh_summary <- plot_grid(me_ah_diff, me_ah_sd, me_ah_rsd, nrow = 1, labels = LETTERS[1:3]) mix_asinh_compare <- plot_grid(point_comp_ah_me_rs, point_comp_ah_me_rr, nrow = 1, labels = LETTERS[4:5]) mix_asinh_summary mix_asinh_compare ggsave(file.path(save_plot_loc, "mix_asinh_summary.png"), plot = mix_asinh_summary, dpi = 300, width = 12, height = 5) ggsave(file.path(save_plot_loc, "mix_asinh_compare.png"), plot = mix_asinh_compare, dpi = 300, width = 8, height = 5)
Fig 8. Arcsinh transform applied to the mixed error model data. A) Largest difference vs the mean. B) Standard deviation (sd) vs the mean. C) Relative standard deviation (rsd) vs the mean. D) Relative standard deviation of mixed error data (rsd(mix)) and standard deviation of arcsinh transformed mixed error data (sd(asinh(mix))) vs the mean and E) Relative standard deviation of mixed error data before (mix) and after (asinh(mix)) arcsinh transformation. Note that in D) and E), the log(mean) of the original mixed error is used to put the points on a comparable scale.
Like the log transform, the arcsinh transform completely changes how the standard deviation depends on the mean. Most notably, the variance becomes constant at higher mean values, much like the standard deviation in the additive error case. As shown in Figure 8D, the standard deviation of the arcsinh transform actually looks very much like the relative standard deviation of the un-transformed data. In contrast to the relative variance in the mixed error data, it is much decreased. In addition, the arcsinh transform is able to handle a wider range of values than the log transform.
Applying scalings and transformations to simulated replicate data with defined error models allows the investigation of the effects of the scaling and transformation on the error itself. Plotting the maximum difference, standard deviation, and relative standard deviation vs the mean illustrates how the error models differ, as well as how the error changes after applying a given scaling or transformation.
Interestingly, only autoscaling, log and arcsinh affect the proportional variance in the mixed error model. Pareto scaling and the root transform do not adjust the proportional variance in any meaningful fashion, but rather seem to only affect the overal scale of the data. Autoscaling, however, modifies the variance in a way that in many cases would be non-optimal, in that all of the variances become equal. The log and arcsinh transforms, in contrast, appear to mostly remove the proportional component of the error, moving it to the additive component. This preserves the error, while making the variance appropriate for use in the most commonly used statistical methods. Arcsinh has the advantage over the log transform in that it can be easily applied to negative values.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.