#' Compute an analysis report using principal component analysis from flashpca tool.
#'
#' The function can be used in a chunk within a Rmarkdown document/script with results="asis" to render the report.
#'
#' @param data A `vector` or `data.frame`. The numeric data on which the PCA has to be performed.
#' @param design A `data.frame`. Additional variables to be used with factorial planes.
#' @param id_var A `character`. The identifier column used to merge the data.
#' @param technical_vars A `vector(character)`. Variables from design to be used with factorial planes.
#' @param n_comp A `numeric`. The number of principal components to be computed.
#' @param fig_n_comp A `numeric`. The number of principal components to be used for figures.
#' @param outliers_threshold A `numeric`. The threshold to define outliers.
#' @param outliers_quantile A `numeric`. The upper quantile percentile to use to define outliers,
#' with `n x > quantile(x, outliers_quantile) + outliers_threshold * IQR(x)`.
#' @param title_level A `numeric`. The markdown title level, *i.e.*, the number of `#` preceding the section.
#'
#' @return A `data.frame`.
#' @export
#'
#' @examples
#'
#' if (interactive()) {
#' pca_report(
#' data = t(mtcars),
#' design = as.data.table(mtcars, keep.rownames = "Sample_ID"),
#' id_var = "Sample_ID",
#' technical_vars = c("cyl", "gear", "vs"),
#' n_comp = 5,
#' fig_n_comp = 5,
#' outliers_threshold = 3,
#' outliers_quantile = 0.75,
#' title_level = 0
#' )
#' }
#'
pca_report <- function(
data,
design,
id_var = "Sample_ID",
technical_vars,
n_comp = 10,
fig_n_comp = n_comp,
outliers_threshold = 1.5,
outliers_quantile = 0.75,
title_level = 3
) {
pc <- term <- `Pr(>F)` <- NULL # For global variable warnings
.data <- ggplot2::.data
`%>%` <- gt::`%>%`
message_prefix <- "[rain] "
message(message_prefix, "PCA started ...", appendLF = TRUE)
section_prefix <- paste(c("\n", rep("#", title_level)), collapse = "")
pca_theme <- ggplot2::theme(
plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = ggtext::element_markdown(),
plot.subtitle = ggtext::element_markdown(face = "italic", size = ggplot2::rel(0.8)),
plot.caption = ggtext::element_markdown(face = "italic", size = ggplot2::rel(0.5)),
axis.text.x = ggtext::element_markdown()
)
pca_methylation <- data[rowSums(is.na(data)) == 0, ]
design_idx <- design[[id_var]] %in% colnames(data)
pca_phenotypes <- data.table::as.data.table(design[design_idx, ])
n_comp <- min(c(n_comp, 10, ncol(pca_methylation)))
fig_n_comp <- min(c(n_comp, 3, ncol(pca_methylation)))
keep_technical <- names(which(sapply(pca_phenotypes[,
lapply(.SD, function(x) {
(data.table::uniqueN(x) > 1 & data.table::uniqueN(x) < length(x)) | is.numeric(x)
}),
.SDcols = technical_vars
], isTRUE)))
variables_excluded <- setdiff(technical_vars, keep_technical)
if (length(variables_excluded) != 0) {
cat(
"The following variables have been excluded (null variances or confounding with samples):\n",
paste("+", variables_excluded),
"\n",
sep = "\n"
)
}
pca_res <- flashpcaR::flashpca(X = t(pca_methylation), stand = "sd", ndim = n_comp)
pca_dfxy <- data.table::as.data.table(pca_res[["vectors"]], keep.rownames = id_var)
data.table::setnames(
x = pca_dfxy,
old = setdiff(names(pca_dfxy), id_var),
new = sprintf("PC%02d", as.numeric(gsub("V", "", setdiff(names(pca_dfxy), id_var))))
)
pca_dfxy <- merge(x = pca_dfxy, y = pca_phenotypes, by = id_var)
p_inertia <- ggplot2::ggplot(
data = data.table::data.table(
y = pca_res[["pve"]],
x = sprintf("PC%02d", seq_along(pca_res[["pve"]]))
)[1:fig_n_comp]
) +
ggplot2::aes(
x = paste0(
.data[["x"]],
"<br><i style='font-size:5pt;'>(",
format(.data[["y"]] * 100, digits = 2, nsmall = 2),
" %)</i>"
),
y = .data[["y"]]
) +
ggplot2::geom_col(width = 1, colour = "white", fill = "#21908CFF", na.rm = TRUE) +
ggplot2::scale_y_continuous(
labels = function(x) paste(format(x * 100, digits = 2, nsmall = 2), "%"),
expand = ggplot2::expansion(mult = c(0, 0.05))
) +
ggplot2::labs(
x = "Principal Components",
y = "Contribution"
) +
pca_theme
if (length(keep_technical) > 0) {
cat(section_prefix, "## Association Tests\n\n", sep = "")
asso_dt <- data.table::melt(
data = pca_dfxy,
measure.vars = grep("^PC[0-9]+$", names(pca_dfxy), value = TRUE),
variable.name = "pc",
value.name = "values"
)[pc %in% sprintf("PC%02d", 1:n_comp)][,
{
m <- stats::model.matrix(
object = stats::as.formula(paste0("values ~ ", paste(keep_technical, collapse = " + "))),
data = .SD
)
if (qr(m)$rank == ncol(m)) {
out <- data.table::as.data.table(
stats::anova(
stats::lm(
formula = stats::as.formula(paste0("values ~ ", paste(keep_technical, collapse = " + "))),
data = .SD
)
),
keep.rownames = "term"
)[term != "Residuals"]
} else {
out <- data.table::rbindlist(lapply(X = keep_technical, .data = .SD, FUN = function(.x, .data) {
data.table::as.data.table(
stats::anova(
stats::lm(
formula = stats::as.formula(paste0("values ~ ", .x)),
data = .SD
)
),
keep.rownames = "term"
)[term != "Residuals"]
}))
}
out[, "full_rank" := qr(m)$rank == ncol(m)]
},
by = "pc"
]
p_association <- ggplot2::ggplot(data = asso_dt) +
ggplot2::aes(
x = factor(.data[["pc"]]),
y = factor(
x = .data[["term"]],
levels = data.table::setorderv(
x = data.table::dcast(
data = asso_dt[, list(pc, term, `Pr(>F)` = data.table::fifelse(`Pr(>F)` <= 0.1, `Pr(>F)`, NA_real_))],
formula = term ~ pc,
value.var = "Pr(>F)"
),
cols = levels(asso_dt[["pc"]]),
order = -1
)[["term"]]
),
fill = .data[["Pr(>F)"]]
) +
ggplot2::geom_tile(colour = "white", na.rm = TRUE) +
ggtext::geom_richtext(
mapping = ggplot2::aes(
label = gsub(
pattern = "(.*)e([-+]*)0*(.*)",
replacement = "\\1 × 10<sup>\\2\\3</sup>",
x = format(.data[["Pr(>F)"]], digits = 2, nsmall = 2, scientific = TRUE)
)
),
colour = "white",
fill = NA,
label.colour = NA,
size = 1.25,
na.rm = TRUE
) +
ggplot2::scale_fill_viridis_c(name = "P-Value", na.value = "grey85", end = 0.75, limits = c(0, 0.1)) +
ggplot2::theme(panel.grid = ggplot2::element_blank()) +
ggplot2::scale_x_discrete(
expand = c(0, 0),
labels = function(x) {
paste0(
x, "<br><i style='font-size:5pt;'>(",
format(
x = pca_res[["pve"]][as.numeric(gsub("PC", "", x))] * 100,
digits = 2,
nsmall = 2
),
" %)</i>"
)
}
) +
ggplot2::scale_y_discrete(expand = c(0, 0), labels = toupper) +
ggplot2::labs(
x = "Principal Components",
y = "Variables",
title = "Association Tests Between Variables And Principal Components",
caption = ifelse(
test = all(asso_dt[["full_rank"]]),
yes = "Variables are tested against principal components using ANOVA.",
no = paste(
"Variables are independently tested against principal components using ANOVA",
"(*i.e.*, model matrix is not full rank)."
)
)
) +
pca_theme
print(p_association)
cat("\n")
cat(section_prefix, "## Factorial Planes\n\n", sep = "")
for (ivar in keep_technical) {
cat(section_prefix, "### `", ivar, "`\n\n", sep = "")
p <- patchwork::wrap_plots(
c(
apply(
X = utils::combn(sprintf("PC%02d", 1:fig_n_comp), 2),
MARGIN = 2,
FUN = function(x) {
ggplot2::ggplot(
data = pca_dfxy[, .SD, .SDcols = c(ivar, x)],
mapping = ggplot2::aes(x = .data[[x[1]]], y = .data[[x[2]]], colour = .data[[ivar]])
) +
ggplot2::geom_hline(yintercept = 0, linetype = 2, na.rm = TRUE) +
ggplot2::geom_vline(xintercept = 0, linetype = 2, na.rm = TRUE) +
ggplot2::geom_point(na.rm = TRUE) +
ggplot2::stat_ellipse(type = "norm", na.rm = TRUE, show.legend = FALSE) +
{
if (is.numeric(pca_dfxy[[ivar]])) {
ggplot2::scale_colour_viridis_c(
name = NULL,
begin = 0,
end = 0.75
)
} else {
ggplot2::scale_colour_viridis_d(
name = NULL,
begin = if (data.table::uniqueN(pca_dfxy[[ivar]]) == 2) 0.25 else 0,
end = 0.75,
guide = ggplot2::guide_legend(override.aes = list(size = 4))
)
}
}
}
),
list(p_inertia)
),
guides = "collect"
) +
patchwork::plot_annotation(
title = paste0("Structure Detection For: '<i>", ivar, "</i>'"),
tag_levels = "A",
theme = pca_theme
)
print(p)
cat("\n")
}
}
cat(section_prefix, "## Outliers Detection\n\n", sep = "")
pca_dfxy[,
"dist_centre" := rowSums(sapply(.SD, function(x) as.vector(scale(x))^2)),
.SDcols = sprintf("PC%02d", 1:fig_n_comp)
]
pca_dfxy[,
"is_outlier" := lapply(.SD, function(x) {
factor(
x = x > (
stats::quantile(x, outliers_quantile, na.rm = TRUE) + outliers_threshold * stats::IQR(x, na.rm = TRUE)
),
levels = c(FALSE, TRUE),
labels = c("No", "Yes")
)
}),
.SDcols = "dist_centre"
]
p <- patchwork::wrap_plots(
c(
apply(
X = utils::combn(sprintf("PC%02d", 1:fig_n_comp), 2),
MARGIN = 2,
FUN = function(x) {
ggplot2::ggplot(
data = pca_dfxy[, .SD, .SDcols = c("is_outlier", x)],
mapping = ggplot2::aes(
x = .data[[x[1]]],
y = .data[[x[2]]],
colour = .data[["is_outlier"]],
shape = .data[["is_outlier"]]
)
) +
ggplot2::geom_hline(yintercept = 0, linetype = 2, na.rm = TRUE) +
ggplot2::geom_vline(xintercept = 0, linetype = 2, na.rm = TRUE) +
ggplot2::geom_point(na.rm = TRUE) +
ggplot2::stat_ellipse(type = "norm", na.rm = TRUE, show.legend = FALSE) +
ggplot2::scale_colour_viridis_d(
name = "Outlier",
begin = 0.25,
end = 0.75,
guide = ggplot2::guide_legend(override.aes = list(size = 4))
) +
ggplot2::scale_shape_manual(name = "Outlier", values = c(1, 4))
}
),
list(p_inertia)
),
guides = "collect"
) +
patchwork::plot_annotation(
title = "Outliers Detection In Factorial Planes",
caption = paste0(
"Outliers defined for a Euclidean distance from cohort centroid (based on the principal components up to ", fig_n_comp, ")<br>",
"higher than ", outliers_threshold, " times the interquartile range above the 75<sup>th</sup> percentile."
),
tag_levels = "A",
theme = pca_theme
)
print(p)
gt::gt(
data = pca_dfxy[
pca_dfxy[["is_outlier"]] == "Yes",
.SD,
.SDcols = c(id_var, technical_vars, sprintf("PC%02d", 1:fig_n_comp))
],
auto_align = "center"
) %>%
gt::tab_header(title = "Samples Identified As Possible Outliers") %>%
gt::fmt_number(columns = sprintf("PC%02d", 1:fig_n_comp), decimals = 2) %>%
gt::opt_row_striping() %>%
gt::opt_all_caps() %>%
print()
message(message_prefix, "PCA ended.", appendLF = TRUE)
invisible(pca_dfxy)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.