#' placeholder title
#' for reference, standard pca (does not cope with missing values); summary(stats::prcomp(t(matrix_sample_intensities), center = T, scale. = F))
#' @param matrix_sample_intensities peptide or protein abundance matrix
#' @param samples sample metadata table
#' @param samples_colors sample colors in wide format tibble
#' @param sample_label_property labels used to represent the samples. Set to "auto" to automatically select based on number of samples (default). possible values; auto, shortname, index, index_asis
#' @param pch_as_exclude should exclude samples be represented with a distinct symbol shape? possible values; FALSE, TRUE (default)
#' @param infer_continuous_scale should the legends infer continuous scale from the data? FALSE = all categorical, TRUE = automatic (default)
#' @param pca_dims a list of integer pairs indicating PCA plots to make
#' @importFrom pcaMethods pca
#' @importFrom ggrepel geom_text_repel
#' @importFrom ggpubr ggarrange annotate_figure text_grob
#' @importFrom gtools mixedsort
#' @importFrom scales rescale
plot_sample_pca = function(matrix_sample_intensities, samples, samples_colors, sample_label_property = "auto", pch_as_exclude = TRUE, infer_continuous_scale = TRUE, pca_dims = list(1:2, c(1, 3), 2:3)) {
if(!sample_label_property %in% c("auto", "shortname", "index", "index_asis")) {
append_log(paste("invalid value for parameter 'sample_label_property':", sample_label_property), type = "error")
}
if(!all(c("sample_index", "sample_id", "shortname") %in% colnames(samples))) {
append_log(paste(paste(c("sample_index", "sample_id", "shortname"), collapse = ", "), "are required column in the samples table"), type = "error")
}
if(!(is.list(pca_dims) && all(unlist(lapply(pca_dims, function(x) length(x) == 2 && is.numeric(x) && all.equal(x, round(x)) == TRUE))) && max(unlist(pca_dims)) <= 15) ) {
append_log("pca_dims parameter must be a list that contains integer value pairs (max value: 15)", type = "error")
}
PPCA = pcaMethods::pca(base::scale(t(matrix_sample_intensities), center=TRUE, scale=FALSE), method="ppca", nPcs = as.integer(max(unlist(pca_dims))), seed = 123, maxIterations = 2000)
mat_pca = PPCA@scores # rownames = sample_id
pca_var = array(PPCA@R2, dimnames = list(colnames(PPCA@scores)))
props = grep("^(sample_id$|sample_index$|shortname$|contrast:)", colnames(samples_colors), ignore.case = T, value = T, invert = T)
# if more than 50 samples, don't ggrepel the sample labels and instead simply plot each label at the PCA x/y location (color-coded by metadata as per usual). No point/shape is drawn in this case
textonly_sample_labels = nrow(samples) > 50 # default
prop_sample_labels = "shortname" # default
if(sample_label_property == "auto") {
# only shortname if less than 25 samples AND 80% of those are actually short strings (<=30 characters)
prop_sample_labels = ifelse(nrow(samples) < 25 && sum(nchar(samples$shortname) > 30) < nrow(samples) * 0.2, "shortname", "sample_index") # alternatively; nrow(samples) * stats::median(nchar(samples$shortname)) < 25*10
}
if(sample_label_property == "index") {
prop_sample_labels = "sample_index"
}
if(sample_label_property == "index_asis") {
prop_sample_labels = "sample_index"
textonly_sample_labels = TRUE
}
pcaplots = list()
for (prop in props) { #prop=props[1]
# don't plot 'exclude' property if there are no excluded samples
if(prop == "exclude" && !any(samples %>% filter(sample_id %in% rownames(mat_pca)) %>% pull(exclude))) {
next
}
plotlist = list()
for (dims in pca_dims) { # dims=1:2
# extract data from PCA object & join with sample metadata to get color-coding and label
tib = tibble(x = mat_pca[, dims[1]], y = mat_pca[, dims[2]], sample_id = rownames(mat_pca)) %>%
left_join(samples %>% select(sample_id, label=!!prop_sample_labels, prop=!!prop), by="sample_id") %>%
left_join(samples_colors %>% select(sample_id, clr=!!prop), by="sample_id")
tib = left_join(tib, samples %>% select(sample_id, exclude), by="sample_id")
plot_as_numeric = FALSE
if(infer_continuous_scale) {
# check if current property (column in sample metadata) only contains numbers
prop_is_numeric = suppressWarnings(all(is.finite(as.numeric(na.omit(tib$prop)))))
# make a numeric plot only if there are more than 3 unique values. Also covers the case of booleans that should be plotted as categorical variables
plot_as_numeric = !prop %in% c("group", "exclude") && prop_is_numeric && n_distinct(as.numeric(tib$prop), na.rm = T) > 2
}
# mutate the actual data before making ggplot object
if(plot_as_numeric) {
tib$prop = as.numeric(tib$prop)
} else {
# for categorical variables, convert NA values to a character so they show up in the legend
tib$prop[is.na(tib$prop)] = "<NA>"
tib$prop = as.character(tib$prop)
}
# base plot
p = ggplot(tib, aes(x = x, y = y, label = label, colour = prop, fill = prop)) +
guides(alpha = "none", fill = "none") +
labs(
title = "",
x = sprintf("dimension %d (%.1f%%)", dims[1], pca_var[dims[1]] * 100),
y = sprintf("dimension %d (%.1f%%)", dims[2], pca_var[dims[2]] * 100)
) +
# coord_cartesian(clip = "off") +
theme_bw()
if(plot_as_numeric) {
tib = tib %>%
mutate(
# add alpha to colors
clr_fill = paste0(clr, "66"),
clr = paste0(clr, "BB"),
# scale color values between 0~1
prop = scales::rescale(prop)) %>%
# sort values from low to high
# doesn't matter that the order of the values submitted to ggplot2::scale_color_X is changed from when we created ggplot object above
arrange(prop)
clr_na = "#BBBBBBBB"
if(any(is.na(tib$prop))) {
# get NA color from data
clr_na = tib %>% filter(is.na(prop)) %>% pull(clr) %>% head(1)
# set color code to NA or they'll show up in ggplot's color scale / legend
tib$clr[is.na(tib$prop)] = NA
tib$clr_fill[is.na(tib$prop)] = NA
}
# for gradient colors, sorting by respective value is important (already converted the values to numeric type a few lines above)
p = p +
scale_color_gradientn(name = gsub("[ _]+", " ", prop),
values = tib$prop, # set the values parameter to use the exact value-to-color mapping we computed upstream
colours = tib$clr,
aesthetics = "colour",
na.value = clr_na) +
scale_color_gradientn(values = tib$prop,
colours = tib$clr_fill,
aesthetics = "fill",
na.value = clr_na) +
theme(legend.text = element_text(angle=90, hjust=0.5, vjust=0.5),
legend.title = element_text(size=10, face = "bold"))
} else {
uprop = gtools::mixedsort(unique(tib$prop))
clr_map = array(tib$clr[match(uprop, tib$prop)], dimnames=list(uprop))
p = p +
scale_colour_manual(
name = gsub("[ _]+", " ", prop),
values = paste0(clr_map, "BB"), # named array, value=color, name=property
breaks = names(clr_map), # sort the legend
aesthetics = "colour") +
scale_colour_manual(
values = paste0(clr_map, "66"), # named array, value=color, name=property
breaks = names(clr_map), # sort the legend
aesthetics = "fill") +
guides(colour = guide_legend(title.position = "top")) +
theme(legend.text = element_text(size = ifelse(length(clr_map) < 6,
10,
ifelse(length(clr_map) < 10, 8, 6)) ),
legend.title = element_text(size=10, face = "bold"))
}
if(textonly_sample_labels) {
p_labeled = p + geom_text(show.legend = FALSE, size = 2)
} else {
p_labeled = p +
geom_point(aes(shape = I(ifelse(exclude & pch_as_exclude, 0, 21)))) +
ggrepel::geom_text_repel(show.legend = FALSE, size = 2, segment.alpha = .3, min.segment.length = 0, na.rm = TRUE, max.time = 1, max.iter = 1e5, max.overlaps = Inf, point.padding = 0, box.padding = 0.2, seed = 123)
}
plotlist[[length(plotlist) + 1]] = p + geom_point(aes(shape = I(ifelse(exclude & pch_as_exclude, 0, 21))))
plotlist[[length(plotlist) + 1]] = p_labeled
}
# finally, collapse individual PCA plots
# importantly: if samples are labeled by index, add a warning/note
# eg; if user labels samples 1~6 as shortname and our dataset$samples table has a different order (causing sample_index to not align), users may mistake sample identities
# users labeling samples with shortnames like WT:1,2,4 KO:5,6,7 happens but all too often, so this note must be visible on same page as the plot
plotlist_merged = ggpubr::ggarrange(plotlist = plotlist, ncol = 2, nrow = length(plotlist) / 2, common.legend = T, legend = "bottom")
if(prop_sample_labels == "sample_index") {
plotlist_merged = ggpubr::annotate_figure(plotlist_merged, top = ggpubr::text_grob('samples are labeled by "sample_index" (described in samples.xlsx included with MS-DAP output)', size = 10))
}
pcaplots[[prop]] = plotlist_merged
}
### code snippet for adding PCA on any custom continuous scale. eg, some aspect that reflects sample quality such as; #detect, outliers deviation in retention time, impact on CoV
# counts = peptides %>% group_by(sample_id) %>% summarise(detect = sum(detect), quant=n()) %>% left_join(samples %>% select(sample_id, shortname), by="sample_id")
# plotlist = list()
# for (dims in list(1:2, c(1, 3), 2:3)) { # dims=1:2
# tib = tibble(x = mat_pca[, dims[1]], y = mat_pca[, dims[2]], sample_id = rownames(mat_pca)) %>% left_join(counts, by="sample_id")
#
# p = ggplot(tib, aes(x = x, y = y, label = shortname, colour = detect)) +
# geom_point() +
# scale_color_gradient(name = "detected peptides") +
# labs(
# title = "",
# x = sprintf("dimension %d (%.1f%%)", dims[1], pca_var[dims[1]] * 100),
# y = sprintf("dimension %d (%.1f%%)", dims[2], pca_var[dims[2]] * 100)
# ) +
# theme_bw() +
# theme(plot.title = element_text(hjust = 0.5),
# legend.text = element_text(angle=90, hjust=0.5, vjust=0.5))
#
# p_labeled = p + ggrepel::geom_text_repel(show.legend = FALSE, size = 2)
#
# plotlist[[length(plotlist) + 1]] = p
# plotlist[[length(plotlist) + 1]] = p_labeled
# }
# pcaplots[[length(pcaplots) + 1]] = ggarrange(plotlist = plotlist, ncol = 2, nrow = length(plotlist) / 2, common.legend = T, legend = "bottom")
return(pcaplots)
}
#' helper function for use-case beyond making a report using standard analysis_quickstart() workflow; PCA on the subset of samples-of-interest from some contrast
#'
#' @param dataset your dataset after application of dea. eg; results from analysis_quickstart()
#' @param contr the exact name of the contrast. Example; "contrast: ctrl1,ctrl2 vs phenotype1,phenotype2,phenotype3" This should be one of the columns in dataset$samples table (which are generated by the setup_contrasts() function)
#' @param sample_label_property see plot_sample_pca() function for params
#' @importFrom tidyr pivot_wider
#'
#' @export
plot_sample_pca__sample_in_contrast = function(dataset, contr, sample_label_property = "auto") {
if(length(contr) != 1 || !(contr %in% colnames(dataset$peptides) || contr %in% colnames(dataset$samples)) ) {
append_log("The contrast has to be a column name in the dataset's samples table, as created by the setup_contrasts function. Typical use-case: setup_contrasts(), analysis_quickstart(), then call this function.", type="error")
}
if(contr %in% colnames(dataset$peptides)) {
intensity_col_contr = contr
} else {
intensity_col_contr = paste0("intensity_", contr)
if(! intensity_col_contr %in% colnames(dataset$peptides)) {
append_log(sprintf("The column '%s' cannot be found in the dataset's peptides table, by_contrast filtering has to be enabled for this function to work (it relies on the subsetting of samples + filtering + normalization procedures that create a data matrix specific to some statistical contrast). Typical use-case: setup_contrasts(), analysis_quickstart(), then call this function.", intensity_col_contr), type="error")
}
}
# sample color-coding, exactly like report_as_rmarkdown.R (should compute colors based on entire dataset so color-coding is the same as the report that features the full dataset)
samples_colors_long = sample_color_coding__long_format(dataset$samples)
samples_colors = samples_colors_long %>% dplyr::select(sample_id, shortname, prop, clr) %>% tidyr::pivot_wider(id_cols = c(sample_id, shortname), names_from=prop, values_from=clr)
# case data to wide-format, then plot PCA
tibw = dataset$peptides %>%
dplyr::select(key_peptide, sample_id, intensity=!!intensity_col_contr) %>% dplyr::filter(!is.na(intensity)) %>%
tidyr::pivot_wider(id_cols = key_peptide, names_from = sample_id, values_from = intensity)
# return the results of the respective plot function
return( suppressWarnings(plot_sample_pca(as_matrix_except_first_column(tibw), samples = dataset$samples, samples_colors = samples_colors, sample_label_property = sample_label_property)) )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.