# Explorative plots for general sample reviewing
# Author: Mathias Kuhring
# Boxplot of compound concentrations per sample
plot_concentration_box <- function(data,
target = ENV$CONCENTRATION,
sample_id = "Sample.Name",
log10 = TRUE,
color = "Sample.Type",
fill = NULL){
data[[sample_id]] <- as.factor(data[[sample_id]])
if (!is.null(color)){
color = paste0("`", color, "`")
}
if (!is.null(fill)){
fill = paste0("`", fill, "`")
}
g <- ggplot(data = data,
mapping = aes_string(x = sample_id,
y = paste0("`", target, "`"),
color = color,
fill = fill)) +
labs(title = paste(target, "distribution")) +
xlab(label = gsub(".", " ", sample_id, fixed = TRUE)) +
scale_fill_brewer(palette="OrRd") +
geom_boxplot() +
coord_flip()
if (log10){
g <- g +
scale_y_log10() +
ylab(label = paste(target, "(log10)"))
} else {
g <- g +
ylab(label = target)
}
return(g)
}
# Violin plots of compound concentrations per sample
plot_concentration_violin <- function(data,
target = ENV$CONCENTRATION,
sample_id = "Sample.Name",
log10 = TRUE,
violin_width = 0.9,
color = "Sample.Type",
fill = NULL){
data[[sample_id]] <- as.factor(data[[sample_id]])
if (!is.null(color)){
color = paste0("`", color, "`")
}
if (!is.null(fill)){
fill = paste0("`", fill, "`")
}
g <- ggplot(data = data,
mapping = aes_string(x = sample_id,
y = paste0("`", target, "`"),
color = color,
fill = fill)) +
labs(title = paste(target, "distribution")) +
xlab(label = gsub(".", " ", sample_id, fixed = TRUE)) +
scale_fill_brewer(palette="OrRd") +
geom_violin(width = violin_width) +
coord_flip()
if (log10){
g <- g +
scale_y_log10() +
ylab(label = paste(target, "(log10)"))
} else {
g <- g +
ylab(label = target)
}
return(g)
}
# Overlapping boxplots/violins over all compound measurements (concentration,
# area, intensity) within a sample, but grouped by sample type and batch.
# Each box/violin represents all compounds in one sample.
plot_sample_variability_overlapped <- function(
data,
targets = c(ENV$CONCENTRATION, ENV$AREA, ENV$INTENSITY),
sample_types = c(SAMPLE_TYPE_BIOLOGICAL,
ENV$SAMPLE_TYPE_REFERENCE_QC,
SAMPLE_TYPE_POOLED_QC,
paste0("Standard L", 1:7)),
plot_type = c("boxplot", "violin")[1],
log10 = TRUE){
data_plot <- data
if (!is.null(sample_types)){
data_plot <- data_plot %>%
filter(Sample.Type %in% sample_types)
}
data_plot <- data_plot %>%
gather(key = "Target", "Value", targets) %>%
mutate(Target = factor(Target, levels = targets)) %>%
drop_na(Value)
g <- ggplot(data = data_plot,
mapping = aes_string(x = "Batch",
y = "Value",
color = "Sample.Type",
group = "Sample.Name")) +
xlab(label = NULL) +
coord_flip() +
facet_grid(as.formula("Sample.Type ~ Target"), scales = "free_x") +
theme(
strip.text.y = element_blank(),
strip.text.x = element_text(size = 6),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "bottom",
legend.box = "vertical"
)
if (length(unique(data_plot$Batch)) < 2) {
g <- g + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
}
assert_that(plot_type %in% c("boxplot", "violin"))
if (plot_type == "boxplot") {
g <- g + geom_boxplot(position = "identity", fill = NA)
} else {
g <- g + geom_violin(position = "identity", fill = NA)
}
if (log10) {
g <- g +
scale_y_log10() +
ylab(label = "Log 10 Scale")
} else {
g <- g +
ylab(label = "Unscaled")
}
return(g)
}
# Simple histogram matrix for several variables
plot_histogram_matrix <- function(data, variables,
sample_type = SAMPLE_TYPE_BIOLOGICAL){
data_var <- data %>%
filter(Sample.Type == sample_type) %>%
select("Sample.Name", variables) %>%
distinct() %>%
select(variables)
data_var_long <- gather(data_var, Key, Value)
g <- ggplot(data = data_var_long,
mapping = aes(x = Value,
fill = Key)) +
geom_histogram(stat = "count") +
labs(title = "Histograms of study variables") +
ylab(label = "Count") +
xlab(label = NULL) +
facet_wrap(~ Key, scales = "free")
return(g)
}
# Pairwise sample scatter plot matrix
plot_sample_scatter_matrix <- function(data,
target = ENV$CONCENTRATION,
sample_types = NULL,
max_cols = 9,
cat_header = NULL){
# Select samples
if (!is.null(sample_types)) {
data <- data %>% filter(Sample.Type %in% sample_types)
}
# Spread data to get Sample vs Compound matrix
data_wide <- data %>%
select(Sample.Name, Compound, UQ(sym(target))) %>%
spread(key = Compound, value = UQ(sym(target)))
count_data_matrix <- as.data.frame(data_wide)
rownames(count_data_matrix) <- count_data_matrix$Sample.Name
count_data_matrix$Sample.Name <- NULL
count_data_matrix <- t(as.matrix(count_data_matrix))
# Correlation panel based on code from Andranik Ivanov
panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
r <- trunc(r*100)/100
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste(prefix, txt, sep = "")
if (missing(cex.cor)) cex.cor <- 0.1/strwidth(txt)
text(0.5, 0.5, txt, cex = 2.5 * r)
}
# Text panel to rotate labels
panel.text <- function(x = 0.5, y = 0.5, txt, cex, font){
text(x, y, txt, cex = cex, font = font, srt = -45)
}
if (is.null(max_cols)){
# Labels size
label_max <- max(nchar(colnames(count_data_matrix)))
label_cex <- (12 / label_max)
# Scatter plot matrix based on code from Andranik Ivanov
pairs(log10(count_data_matrix + 1),
lower.panel = panel.smooth,
upper.panel = panel.cor,
text.panel = panel.text,
pch = ".", cex = 4, col = "darkblue", gap = 0.2,
cex.labels = label_cex,
main=paste("Sample correlation based on", target, "(log10)"))
} else {
# Create separate plots if more than max_cols samples
# With sliding windows of max size max_cols including one overlap
sample_num <- ncol(count_data_matrix)
window_num <- ceiling(sample_num / (max_cols - 1))
window_size <- ceiling(sample_num / window_num) + 1
# Iterate windows
for (i in seq_len(window_num)){
if (!is.null(cat_header)){
cat(paste0(cat_header, i, "\n"))
}
start_idx <- ((i - 1) * (window_size - 1)) + 1
stop_idx <- min(start_idx + window_size - 1, sample_num)
# Labels size
label_max <- max(nchar(colnames(count_data_matrix[, start_idx:stop_idx])))
label_cex <- (12 / label_max) * (max_cols / (stop_idx - start_idx + 1))
# Scatter plot matrix based on code from Andranik Ivanov
pairs(log10(count_data_matrix[, start_idx:stop_idx] + 1),
lower.panel = panel.smooth,
upper.panel = panel.cor,
text.panel = panel.text,
pch = ".", cex = 4, col = "darkblue", gap = 0.2,
cex.labels = label_cex,
main=paste("Sample correlation based on", target, "(log10)"))
}
}
}
# Plot SD vs Mean of target for each compound, within different sample types
plot_compound_sd_vs_mean <- function(data,
target = ENV$CONCENTRATION,
sample_types = c(SAMPLE_TYPE_BIOLOGICAL,
ENV$SAMPLE_TYPE_REFERENCE_QC,
SAMPLE_TYPE_POOLED_QC),
study_class = NULL,
shape = NULL,
label = "Compound",
label_number = 20,
fit = c("lm", "loess")[1],
return_with_data = FALSE){
# Select and group samples
if (!is.null(sample_types)) {
data <- data %>%
filter(Sample.Type %in% sample_types) %>%
group_by(Sample.Type)
}
if (!is.null(shape)){
data <- data %>% group_by(UQ(sym(shape)), add = TRUE)
}
# Indicate study classes
if (!is.null(study_class)){
sample_idx <- data$Sample.Type == SAMPLE_TYPE_BIOLOGICAL
data$Sample.Type[sample_idx] <- paste(data$Sample.Type[sample_idx],
data[[study_class]][sample_idx])
}
# Group by compound and calc SD and mean
data_sum <- data %>%
group_by(Compound, add = TRUE) %>%
summarize(Mean = mean(UQ(sym(target)), na.rm = TRUE),
SD = sd(UQ(sym(target)), na.rm = TRUE))
g <- ggplot(data = data_sum,
mapping = aes(x = Mean, y = SD,
color = Sample.Type)) +
labs(title = "Compound SD and Mean within different Sample Types") +
xlab(label = paste("Mean of", target, "(Log10)")) +
ylab(label = paste("SD of", target, "(Log10)")) +
theme(legend.position = "bottom",
legend.box = "vertical") +
scale_x_log10() +
scale_y_log10() +
scale_color_discrete(name = "Sample Type")
if (is.null(shape)) {
g <- g +
geom_point(alpha = 0.25, size = 4) +
geom_smooth(method = fit, alpha = 0.95)
} else {
g <- g +
geom_point(mapping = aes_string(shape = paste0("`", shape, "`")),
alpha = 0.25, size = 4) +
geom_smooth(method = fit, alpha = 0.95)
}
# Add label in low density areas
if (!is.null(label)) {
g <- g +
ggpp::stat_dens2d_filter(
mapping = aes_string(label = paste0("`", label, "`"),
color = "Sample.Type"),
geom = ggrepel::geom_text_repel()$geom, keep.number = label_number)
}
if (return_with_data) {
return(list("scatterplot" = g, "data" = data_sum))
}
else{
return(g)
}
}
# Plot %RSD vs Mean of target for each compound, within different sample types
plot_compound_rsd_vs_mean <- function(data,
target = ENV$CONCENTRATION,
sample_types = c(SAMPLE_TYPE_BIOLOGICAL,
ENV$SAMPLE_TYPE_REFERENCE_QC,
SAMPLE_TYPE_POOLED_QC),
study_class = NULL,
shape = NULL,
label = "Compound",
label_number = 20,
fit = c("lm", "loess")[1]){
# Select and group samples
if (!is.null(sample_types)) {
data <- data %>%
filter(Sample.Type %in% sample_types) %>%
group_by(Sample.Type)
}
if (!is.null(shape)){
data <- data %>% group_by(UQ(sym(shape)), add = TRUE)
}
# Indicate study classes
if (!is.null(study_class)){
sample_idx <- data$Sample.Type == SAMPLE_TYPE_BIOLOGICAL
data$Sample.Type[sample_idx] <- paste(data$Sample.Type[sample_idx],
data[[study_class]][sample_idx])
}
# Group by compound and calc SD and mean
data_sum <- data %>%
group_by(Compound, add = TRUE) %>%
summarize(Mean = mean(UQ(sym(target)), na.rm = TRUE),
`%RSD` = rsd(UQ(sym(target)), na.rm = TRUE))
g <- ggplot(data = data_sum,
mapping = aes(x = Mean, y = `%RSD`,
color = Sample.Type)) +
labs(title = "Compound %RSD and Mean within different Sample Types") +
xlab(label = paste("Mean of", target, "(Log10)")) +
ylab(label = paste("%RSD of", target, "(Log10)")) +
theme(legend.position = "bottom",
legend.box = "vertical") +
scale_x_log10() +
scale_y_log10() +
scale_color_discrete(name = "Sample Type") +
geom_smooth(method = fit, alpha = 0.5)
if (is.null(shape)){
g <- g + geom_point(alpha = 0.5, size = 4)
} else {
g <- g + geom_point(mapping = aes_string(shape = paste0("`", shape, "`")),
alpha = 0.5, size = 4)
}
# Add label in low density areas
if (!is.null(label)){
g <- g +
ggpp::stat_dens2d_filter(
mapping = aes_string(label = paste0("`", label, "`"),
color = "Sample.Type"),
geom = ggrepel::geom_text_repel()$geom, keep.number = label_number)
}
return(g)
}
# Scatter plots to compare different compound measeruments
plot_compound_scatter <- function(data,
x = ENV$CONCENTRATION,
y = "Analyte Intensity [cps]",
y_istd = "Internal Std. Intensity [cps]",
sample_types = c(SAMPLE_TYPE_BIOLOGICAL,
ENV$SAMPLE_TYPE_REFERENCE_QC,
SAMPLE_TYPE_POOLED_QC,
paste0("Standard L", 1:7)),
aspect = TRUE,
log_zero_addend = 1,
shape = NULL,
ncol = 3){
data <- data %>% filter(Sample.Type %in% sample_types)
# Remove NA-only compounds
# (shouldn't happen as long as standard or QC samples are available)
data <- data %>%
group_by(Compound) %>%
mutate(All.Conc.NA = all(is.na(UQ(sym(x)))))
na_comps <- data %>%
filter(All.Conc.NA == TRUE) %>%
summarize() %>%
pull(Compound)
# print(paste0("Skipped compounds with only NA: ",
# paste(na_comps, collapse = ", "
# )))
data <- data %>% filter(All.Conc.NA == FALSE)
# Normalize y
data[[y]] <- data[[y]]/data[[y_istd]]
# Prevent log of zero
if (log_zero_addend){
data[[x]] <- data[[x]] + log_zero_addend
data[[y]] <- data[[y]] + log_zero_addend
}
# limits <- NULL
# if (aspect){
# limits <- c(min(data_sub[[x]],data_sub[[y]], na.rm = TRUE),
# max(data_sub[[x]],data_sub[[y]], na.rm = TRUE))
# }
g <- ggplot(data = data,
mapping = aes_string(x = paste0("`", x, "`"),
y = paste0("`", y, "`"),
color = "Sample.Type")) +
xlab(label = paste(x, "(log10)")) +
ylab(label = paste("Normalized ", y, "(log10)")) +
theme(legend.position = "bottom",
# axis.text.x = element_text(angle = 90, hjust = 1),
axis.ticks.x = element_blank(), axis.text.x = element_blank(),
axis.ticks.y = element_blank(), axis.text.y = element_blank()) +
scale_color_discrete(name = "Sample Type") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(Compound ~ ., scales = "free", ncol = ncol)
# coord_fixed(xlim = limits, ylim = limits) +
# geom_abline(slope = 1)
if (is.null(shape)){
g <- g + geom_point(alpha = 0.5, size = 2)
} else {
g <- g + geom_point(mapping = aes_string(shape = paste0("`", shape, "`")),
alpha = 0.5, size = 2)
}
return(g)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.