#'GSPA of peptide data
#'
#'\code{pepGSPA} performs the analysis of Gene Set Probability Asymmetricity
#'(GSPA) against peptide \code{log2FC} data.
#'
#'@rdname prnGSPA
#'
#'@import purrr dplyr
#'@export
pepGSPA <- function (gset_nms = c("go_sets", "c2_msig", "kinsub"), method = "mean",
scale_log2r = TRUE, complete_cases = FALSE, impute_na = FALSE,
pval_cutoff = 5E-2, logFC_cutoff = log2(1.2),
gspval_cutoff = 5E-2, gslogFC_cutoff = log2(1.2),
min_size = 10, max_size = Inf,
min_delta = 4, min_greedy_size = 1,
use_adjP = FALSE,
fml_nms = NULL, df = NULL, filepath = NULL, filename = NULL, ...)
{
on.exit(
if (id %in% c("pep_seq", "pep_seq_mod")) {
mget(names(formals()), envir = rlang::current_env(), inherits = FALSE) %>%
c(rlang::enexprs(...)) %>%
save_call(paste0("anal", "_pepGSPA"))
} else if (id %in% c("prot_acc", "gene")) {
mget(names(formals()), envir = rlang::current_env(), inherits = FALSE) %>%
c(rlang::enexprs(...)) %>%
save_call(paste0("anal", "_prnGSPA"))
},
add = TRUE
)
check_dots(c("id", "anal_type", "var_cutoff"), ...)
check_gset_nms(gset_nms)
dat_dir <- get_gl_dat_dir()
dir.create(file.path(dat_dir, "Peptide/GSPA/log"),
recursive = TRUE, showWarnings = FALSE)
id <- match_call_arg(normPSM, group_psm_by)
stopifnot(rlang::as_string(id) %in% c("pep_seq", "pep_seq_mod"),
length(id) == 1L)
scale_log2r <- match_prnSig_scale_log2r(scale_log2r = scale_log2r,
impute_na = impute_na)
df <- rlang::enexpr(df)
filepath <- rlang::enexpr(filepath)
filename <- rlang::enexpr(filename)
method <- rlang::enexpr(method)
method <- if (rlang::is_call(method))
eval(method, envir = rlang::caller_env())
else
rlang::as_string(method)
stopifnot(all(method %in% c("mean", "limma")))
dots <- rlang::enexprs(...)
dots <- concat_fml_dots(
fmls = dots %>% .[grepl("^\\s*~", .)],
fml_nms = fml_nms,
dots = dots %>% .[!grepl("^\\s*~", .)],
anal_type = "GSPA"
)
reload_expts()
info_anal(df = !!df,
id = !!id,
filepath = !!filepath,
filename = !!filename,
scale_log2r = scale_log2r,
complete_cases = complete_cases,
impute_na = impute_na,
anal_type = "GSPA")(gset_nms = gset_nms,
var_cutoff = 1000,
pval_cutoff = pval_cutoff,
logFC_cutoff = logFC_cutoff,
gspval_cutoff = gspval_cutoff,
gslogFC_cutoff = gslogFC_cutoff,
min_size = min_size,
max_size = max_size,
min_delta = min_delta,
min_greedy_size = min_greedy_size,
use_adjP = use_adjP,
method = method,
!!!dots)
}
#'GSPA of protein data
#'
#'\code{prnGSPA} performs the analysis of Gene Set Probability Asymmetricity
#'(GSPA) against protein \code{log2FC} data.
#'
#'The significance \code{pVals} of individual proteins are first obtained from
#'\code{\link{prnSig}}, followed by log10 transformation and separation into up-
#'or down-expressed groups for each gene set. At the default of \code{method =
#'mean}, the geometric mean of \code{pVals}, \eqn{P}, are each calculated for
#'the groups of up or down regulated proteins, with a penalty-like term
#'
#'\deqn{-log10(P)=(\sum_{i=1}^{n}-log10(p)+m)/(n+m)}{-log10(P)=(-log10(p_1*p_2*...*p_n)+m)/(n+m)}
#'
#'where \eqn{n} and \eqn{m} are the numbers of entries with \eqn{p} values
#'\eqn{\le} or \eqn{>} a significance cut-off, respectively. The quotient of the
#'two \eqn{P} values is then used to represent the significance of gene set
#'enrichment. The arguments \code{pval_cutoff} and \code{logFC_cutoff} are used
#'to discriminate low influence genes. Additional subsetting of data via the
#'\code{vararg} approach of \code{filter_} is feasible. At \code{method =
#'limma}, moderated t-tests are performed against \code{-log10(pVals)} between
#'the up and the down groups via \code{\link[limma]{eBayes}}.
#'
#'@inheritParams anal_pepNMF
#'@inheritParams prnHist
#'@param impute_na Logical; if TRUE, data with the imputation of missing values
#' will be used. The default is FALSE.
#'@param gset_nms Character string or vector containing the shorthanded name(s),
#' full file path(s), or both, to gene sets for enrichment analysis. For
#' species among \code{"human", "mouse", "rat"}, the default of
#' \code{c("go_sets", "c2_msig", "kinsub")} will utilize terms from gene
#' ontology (\code{GO}), molecular signatures (\code{MSig}) and
#' kinase-substrate network (\code{PSP Kinase-Substrate}). Custom \code{GO},
#' \code{MSig} and other data bases at given species are also supported. See
#' also: \code{\link{prepGO}} for the preparation of custom \code{GO};
#' \code{\link{prepMSig}} for the preparation of custom \code{MSig}. For other
#' custom data bases, follow the same format of list as \code{GO} or
#' \code{MSig}.
#'@param method Character string or vector; the method to assess the p-values of
#' GSPA. The default is \code{mean} and the alternative is \code{limma}. See
#' also section \code{Details} for the calculations.
#'@param pval_cutoff Numeric value or vector; the cut-off in protein
#' significance \code{pVal}. Entries with \code{pVals} less significant than
#' the threshold will be excluded from enrichment analysis. The default is 0.05
#' for all formulas matched to or specified in argument \code{fml_nms}.
#' Formula-specific threshold is allowed by supplying a vector of cut-off
#' values.
#'@param logFC_cutoff Numeric value or vector; the cut-off in protein
#' \code{log2FC}. Entries with absolute \code{log2FC} smaller than the
#' threshold will be excluded from enrichment analysis. The default magnitude
#' is \code{log2(1.2)} for all formulas matched to or specified in argument
#' \code{fml_nms}. Formula-specific threshold is allowed by supplying a vector
#' of absolute values in \code{log2FC}.
#'@param gspval_cutoff Numeric value or vector; the cut-off in gene-set
#' significance \code{pVal}. Only enrichment terms with \code{pVals} more
#' significant than the threshold will be reported. The default is 0.05 for all
#' formulas matched to or specified in argument \code{fml_nms}.
#' Formula-specific threshold is allowed by supplying a vector of cut-off
#' values.
#'@param gslogFC_cutoff Numeric value or vector; the cut-off in gene-set
#' enrichment fold change. Only enrichment terms with absolute fold change
#' greater than the threshold will be reported. The default magnitude is
#' \code{log2(1.2)} for all formulas matched to or specified in argument
#' \code{fml_nms}. Formula-specific threshold is allowed by supplying a vector
#' of absolute values in \code{log2FC}.
#'@param min_size Numeric value or vector; minimum number of protein entries for
#' consideration in gene set tests. The number is after data filtration by
#' \code{pval_cutoff}, \code{logFC_cutoff} or varargs expressions under
#' \code{filter_}. The default is 10 for all formulas matched to or specified
#' in argument \code{fml_nms}. Formula-specific threshold is allowed by
#' supplying a vector of sizes.
#'@param max_size Numeric value or vector; maximum number of protein entries for
#' consideration in gene set tests. The number is after data filtration by
#' \code{pval_cutoff}, \code{logFC_cutoff} or varargs expressions under
#' \code{filter_}. The default in infinite for all formulas matched to or
#' specified in argument \code{fml_nms}. Formula-specific threshold is allowed
#' by supplying a vector of sizes.
#'@param min_delta Numeric value or vector; the minimum count difference between
#' the up- and the down-expressed group of proteins for consideration in gene
#' set tests. For example at \code{min_delta = 4}, a gene set will 6
#' upregulated proteins and 2 down-expressed proteins, or vice versa, will be
#' assessed. The number is after data filtration by \code{pval_cutoff},
#' \code{logFC_cutoff} or varargs expressions under \code{filter_}. The default
#' is 4 for all formulas matched to or specified in argument \code{fml_nms}.
#' Formula-specific threshold is allowed by supplying a vector of sizes.
#'@param min_greedy_size Numeric value or vector; minimum number of unique
#' protein entries for a gene set to be considered essential. The default in
#' \code{1} for all formulas matched to or specified in argument
#' \code{fml_nms}. Formula-specific threshold is allowed by supplying a vector
#' of sizes.
#'@param use_adjP Logical; if TRUE, use Benjamini-Hochberg pVals. The default is
#' FALSE.
#'@param fml_nms Character string or vector; the formula name(s). By default,
#' the formula(s) will match those used in \code{\link{pepSig}} or
#' \code{\link{prnSig}}.
#'@param ... \code{filter_}: Logical expression(s) for the row filtration
#' against data in a primary file of \code{/Model/Protein[_impNA]_pVals.txt}.
#' See also \code{\link{normPSM}} for the format of \code{filter_} statements.
#' \cr \cr \code{arrange_}: Variable argument statements for the row ordering
#' against data in a primary file of \code{/Model/Protein[_impNA]_pVals.txt}.
#' See also \code{\link{prnHM}} for the format of \code{arrange_} statements.
#'@import dplyr ggplot2
#'@importFrom magrittr %>% %T>% %$% %<>%
#'
#'@example inst/extdata/examples/prnGSPA_.R
#'
#'@seealso
#' \emph{Metadata} \cr
#' \code{\link{load_expts}} for metadata preparation and a reduced working example in data normalization \cr
#'
#' \emph{Data normalization} \cr
#' \code{\link{normPSM}} for extended examples in PSM data normalization \cr
#' \code{\link{PSM2Pep}} for extended examples in PSM to peptide summarization \cr
#' \code{\link{mergePep}} for extended examples in peptide data merging \cr
#' \code{\link{standPep}} for extended examples in peptide data normalization \cr
#' \code{\link{Pep2Prn}} for extended examples in peptide to protein summarization \cr
#' \code{\link{standPrn}} for extended examples in protein data normalization. \cr
#' \code{\link{purgePSM}} and \code{\link{purgePep}} for extended examples in data purging \cr
#' \code{\link{pepHist}} and \code{\link{prnHist}} for extended examples in histogram visualization. \cr
#' \code{\link{extract_raws}} and \code{\link{extract_psm_raws}} for extracting MS file names \cr
#'
#' \emph{Variable arguments of `filter_...`} \cr
#' \code{\link{contain_str}}, \code{\link{contain_chars_in}}, \code{\link{not_contain_str}},
#' \code{\link{not_contain_chars_in}}, \code{\link{start_with_str}},
#' \code{\link{end_with_str}}, \code{\link{start_with_chars_in}} and
#' \code{\link{ends_with_chars_in}} for data subsetting by character strings \cr
#'
#' \emph{Missing values} \cr
#' \code{\link{pepImp}} and \code{\link{prnImp}} for missing value imputation \cr
#'
#' \emph{Informatics} \cr
#' \code{\link{pepSig}} and \code{\link{prnSig}} for significance tests \cr
#' \code{\link{pepVol}} and \code{\link{prnVol}} for volcano plot visualization \cr
#' \code{\link{prnGSPA}} for gene set enrichment analysis by protein significance pVals \cr
#' \code{\link{gspaMap}} for mapping GSPA to volcano plot visualization \cr
#' \code{\link{prnGSPAHM}} for heat map and network visualization of GSPA results \cr
#' \code{\link{prnGSVA}} for gene set variance analysis \cr
#' \code{\link{prnGSEA}} for data preparation for online GSEA. \cr
#' \code{\link{pepMDS}} and \code{\link{prnMDS}} for MDS visualization \cr
#' \code{\link{pepPCA}} and \code{\link{prnPCA}} for PCA visualization \cr
#' \code{\link{pepLDA}} and \code{\link{prnLDA}} for LDA visualization \cr
#' \code{\link{pepHM}} and \code{\link{prnHM}} for heat map visualization \cr
#' \code{\link{pepCorr_logFC}}, \code{\link{prnCorr_logFC}}, \code{\link{pepCorr_logInt}} and
#' \code{\link{prnCorr_logInt}} for correlation plots \cr
#' \code{\link{anal_prnTrend}} and \code{\link{plot_prnTrend}} for trend analysis and visualization \cr
#' \code{\link{anal_pepNMF}}, \code{\link{anal_prnNMF}}, \code{\link{plot_pepNMFCon}},
#' \code{\link{plot_prnNMFCon}}, \code{\link{plot_pepNMFCoef}}, \code{\link{plot_prnNMFCoef}} and
#' \code{\link{plot_metaNMF}} for NMF analysis and visualization \cr
#'
#' \emph{Custom databases} \cr
#' \code{\link{Uni2Entrez}} for lookups between UniProt accessions and Entrez IDs \cr
#' \code{\link{Ref2Entrez}} for lookups among RefSeq accessions, gene names and Entrez IDs \cr
#' \code{\link{prepGO}} for \code{\href{http://current.geneontology.org/products/pages/downloads.html}{gene
#' ontology}} \cr
#' \code{\link{prepMSig}} for \href{https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/}{molecular
#' signatures} \cr
#' \code{\link{prepString}} and \code{\link{anal_prnString}} for STRING-DB \cr
#'
#' \emph{Column keys in PSM, peptide and protein outputs} \cr
#' system.file("extdata", "psm_keys.txt", package = "proteoQ") \cr
#' system.file("extdata", "peptide_keys.txt", package = "proteoQ") \cr
#' system.file("extdata", "protein_keys.txt", package = "proteoQ") \cr
#'
#'@export
prnGSPA <- function (gset_nms = c("go_sets", "c2_msig", "kinsub"), method = "mean",
scale_log2r = TRUE, complete_cases = FALSE, impute_na = FALSE,
pval_cutoff = 5E-2, logFC_cutoff = log2(1.2),
gspval_cutoff = 5E-2, gslogFC_cutoff = log2(1.2),
min_size = 10, max_size = Inf,
min_delta = 4, min_greedy_size = 1,
use_adjP = FALSE,
fml_nms = NULL, df = NULL, filepath = NULL, filename = NULL, ...)
{
on.exit(
if (id %in% c("pep_seq", "pep_seq_mod")) {
mget(names(formals()), envir = rlang::current_env(), inherits = FALSE) %>%
c(rlang::enexprs(...)) %>%
save_call(paste0("anal", "_pepGSPA"))
}
else if (id %in% c("prot_acc", "gene")) {
mget(names(formals()), envir = rlang::current_env(), inherits = FALSE) %>%
c(rlang::enexprs(...)) %>%
save_call(paste0("anal", "_prnGSPA"))
},
add = TRUE
)
check_dots(c("id", "anal_type", "var_cutoff"), ...)
check_gset_nms(gset_nms)
dat_dir <- get_gl_dat_dir()
dir.create(file.path(dat_dir, "Protein/GSPA/log"),
recursive = TRUE, showWarnings = FALSE)
id <- match_call_arg(normPSM, group_pep_by)
stopifnot(rlang::as_string(id) %in% c("prot_acc", "gene"),
length(id) == 1L)
scale_log2r <- match_prnSig_scale_log2r(scale_log2r = scale_log2r,
impute_na = impute_na)
df <- rlang::enexpr(df)
filepath <- rlang::enexpr(filepath)
filename <- rlang::enexpr(filename)
method <- rlang::enexpr(method)
method <- if (rlang::is_call(method))
eval(method, envir = rlang::caller_env())
else
rlang::as_string(method)
stopifnot(all(method %in% c("mean", "limma")))
dots <- rlang::enexprs(...)
dots <- concat_fml_dots(
fmls = dots %>% .[grepl("^\\s*~", .)],
fml_nms = fml_nms,
dots = dots %>% .[!grepl("^\\s*~", .)],
anal_type = "GSPA"
)
reload_expts()
info_anal(df = !!df,
id = !!id,
filepath = !!filepath,
filename = !!filename,
scale_log2r = scale_log2r,
complete_cases = complete_cases,
impute_na = impute_na,
anal_type = "GSPA")(gset_nms = gset_nms,
var_cutoff = 1000,
pval_cutoff = pval_cutoff,
logFC_cutoff = logFC_cutoff,
gspval_cutoff = gspval_cutoff,
gslogFC_cutoff = gslogFC_cutoff,
min_size = min_size,
max_size = max_size,
min_delta = min_delta,
min_greedy_size = min_greedy_size,
use_adjP = use_adjP,
method = method,
!!!dots)
}
#' Perform GSPA tests
#'
#' logFC_cutoff Numeric A threshold for the subset of data before the calculation
#' of adjusted pvals
#'
#' @inheritParams prnHist
#' @inheritParams prnHM
#' @inheritParams prnGSPA
#' @inheritParams prnGSVA
#' @inheritParams info_anal
#' @param id Currently only "entrez".
#' @param label_scheme_sub A data frame. Subset entries from \code{label_scheme}
#' for selected samples.
#' @import limma stringr purrr tidyr dplyr
#' @importFrom magrittr %>% %T>% %$% %<>% is_greater_than not
gspaTest <- function(df = NULL, id = "entrez", label_scheme_sub = NULL,
scale_log2r = TRUE, complete_cases = FALSE, impute_na = FALSE,
filepath = NULL, filename = NULL,
gset_nms = "go_sets", var_cutoff = 0.5,
pval_cutoff = 5E-2, logFC_cutoff = log2(1.2),
gspval_cutoff = 5E-2, gslogFC_cutoff = log2(1),
min_size = 6, max_size = Inf, min_delta = 4, min_greedy_size = 1,
use_adjP = FALSE,
method = "mean", anal_type = "GSPA", ...)
{
# `GSPA`: use `pVals` instead of `N_log2R` or `Z_log2R`;
# Protein[_impNA]_pVals.txt does not contain `scale_log2_r` info in the file name,
# and cannot tell `pVals` are from `N_log2R` or `Z_log2R`;
# therefore, `scale_log2r` will be matched to those in `prnSig()` and indicated in
# the names of out files as `_N[_impNA].txt` or `_Z[_impNA].txt`
# to inform user the `scale_log_r` status
# `complete_cases` is for entrez IDs, pVals, log2FC
# "id" for tibbling rownames
stopifnot(vapply(c(pval_cutoff,
logFC_cutoff,
min_size,
max_size,
min_delta,
min_greedy_size),
rlang::is_double, logical(1L)))
if (!nrow(label_scheme_sub))
stop("Empty metadata.")
if (!"entrez" %in% names(df))
stop("Column \"entrez\" not found.")
if (all(is.na(df[["entrez"]])))
stop("All values are NA under the column `entrez` in the input data.")
id <- rlang::as_string(rlang::enexpr(id))
dots = rlang::enexprs(...)
fmls <- dots %>% .[grepl("^\\s*~", .)]
dots <- dots %>% .[! names(.) %in% names(fmls)]
filter_dots <- dots %>%
.[purrr::map_lgl(., is.language)] %>%
.[grepl("^filter_", names(.))]
arrange_dots <- dots %>%
.[purrr::map_lgl(., is.language)] %>%
.[grepl("^arrange_", names(.))]
select_dots <- dots %>%
.[purrr::map_lgl(., is.language)] %>%
.[grepl("^select_", names(.))]
dots <- dots %>%
.[! . %in% c(filter_dots, arrange_dots, select_dots)]
df <- df %>%
filters_in_call(!!!filter_dots) %>%
arrangers_in_call(!!!arrange_dots)
if (!nrow(df))
stop("No data available after row filtration.", call. = FALSE)
if (!length(fmls))
stop("Formula(s) of contrasts not available.", call. = FALSE)
species <- unique(df$species) %>% .[!is.na(.)] %>% as.character()
gsets <- load_dbs(gset_nms = gset_nms, species = species)
fml_nms <- names(df) %>%
.[grepl("pVal\\s*\\(", .)] %>%
gsub("(.*)\\.pVal.*", "\\1", .) %>%
unique() %>%
.[. %in% names(fmls)]
fmls <- fmls %>% .[names(.) %in% fml_nms]
fml_nms <- fml_nms %>% .[purrr::map_dbl(., ~ which(.x == names(fmls)))]
if (!length(fml_nms)) {
stop("No formula matached; ",
"compare the formula name(s) with those in \"prnSig(..)\"")
}
# Note: wrong number of arguments to '>' at devtools building of a package
# (https://github.com/Mouse-Imaging-Centre/RMINC/issues/226)
col_ind <- fml_nms %>%
purrr::map(~ grepl(paste0("^", .x, "\\."), names(df))) %>%
`names<-`(paste0("nm_", seq_along(.))) %>%
dplyr::bind_cols() %>%
rowSums() %>%
magrittr::is_greater_than(0)
stopifnot(sum(col_ind) > 0L)
switch(anal_type,
GSPA = purrr::pwalk(list(fmls,
fml_nms,
pval_cutoff,
logFC_cutoff,
gspval_cutoff,
gslogFC_cutoff,
min_size,
max_size,
min_delta,
min_greedy_size,
method
),
fml_gspa,
df = df,
col_ind = col_ind,
id = !!id,
gsets = gsets,
label_scheme_sub = label_scheme_sub,
complete_cases = complete_cases,
scale_log2r = scale_log2r,
filepath = filepath,
filename = filename,
use_adjP = use_adjP,
!!!filter_dots),
GSEA = purrr::pwalk(list(fmls,
fml_nms,
var_cutoff,
pval_cutoff,
logFC_cutoff,
gspval_cutoff,
gslogFC_cutoff,
min_size,
max_size
),
fml_gsea,
df = df,
col_ind = col_ind,
id = !!id,
gsets = gsets,
label_scheme_sub = label_scheme_sub,
complete_cases = complete_cases,
scale_log2r = scale_log2r,
filepath = filepath,
filename = filename,
!!!dots),
stop("Unhandled task.")
)
invisible(gset_nms)
}
#' Helper of GSPA.
#'
#' @param fml A character string; the formula used in \link{prnSig}.
#' @param fml_nm A character string; the name of \code{fml}.
#' @param col_ind Numeric vector; the indexes of columns for the ascribed \code{fml_nm}.
#' @inheritParams prnHist
#' @inheritParams prnGSPA
#' @inheritParams gspaTest
#' @inheritParams gsVolcano
#' @import purrr dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
fml_gspa <- function (fml, fml_nm, pval_cutoff, logFC_cutoff,
gspval_cutoff, gslogFC_cutoff,
min_size, max_size, min_delta, min_greedy_size, method,
df, col_ind, id, gsets,
label_scheme_sub, complete_cases, scale_log2r,
filepath, filename, use_adjP = FALSE, ...)
{
on.exit(
pars <- mget(names(formals()), envir = rlang::current_env(),
inherits = FALSE) %>%
.[! names(.) %in% c("gsets", "df", "label_scheme_sub", "col_ind")] %>%
c(rlang::enexprs(...)) %>%
save_call(paste0(fn_prefix, "@", fml_nm))
)
if (!length(gsets))
stop("Zero entries in \"gsets\".")
dir.create(file.path(filepath, fml_nm), recursive = TRUE, showWarnings = FALSE)
id <- rlang::as_string(rlang::enexpr(id))
fn_prefix <- gsub("\\.[^.]*$", "", filename)
df <- df %>%
prep_gspa(id = "entrez", fml_nm = fml_nm, col_ind = col_ind,
pval_cutoff = pval_cutoff, logFC_cutoff = logFC_cutoff,
use_adjP = use_adjP)
if (complete_cases) {
df <- df[complete.cases(df), ]
nms <- names(df)
if (length(nms))
message("Complete cases against columns: ", paste(nms, collapse = ", "))
rm(list = "nms")
}
df <- df %>%
dplyr::mutate(p_val = -log10(p_val)) %>%
dplyr::mutate(valence = ifelse(.$log2Ratio > 0, "pos", "neg")) %>%
dplyr::mutate(valence = factor(valence, levels = c("neg", "pos"))) %>%
tidyr::complete(entrez, contrast, valence)
# do not re-`arrange` df, df_sub for limma after this point
df <- df %>% dplyr::arrange(entrez, contrast, valence)
gsets <- gsets %>%
.[!grepl("molecular_function$", names(.))] %>%
.[!grepl("cellular_component$", names(.))] %>%
.[!grepl("biological_process$", names(.))] %>%
.[purrr::map_lgl(., ~ length(.x) >= min_size)] %>%
.[purrr::map_lgl(., ~ length(.x) <= max_size)]
res_pass <- local({
contrast_groups <- unique(df$contrast) %>% as.character()
contrast_table <- tibble::tibble(
contrast = rep(contrast_groups, 2),
valence = rep(c("neg", "pos"), length(contrast_groups)),
p_val = rep(-log10(0.05), 2 * length(contrast_groups))) %>%
dplyr::mutate(contrast = factor(contrast, levels = levels(df$contrast)),
valence = factor(valence, levels(df$valence)))
res <- switch(method,
limma = purrr::map(gsets,
gspa_summary_limma,
df = df,
min_size = min_size,
min_delta = min_delta,
gspval_cutoff = gspval_cutoff,
gslogFC_cutoff = gslogFC_cutoff),
mean = purrr::map(gsets,
gspa_summary_mean,
df = df,
min_size = min_size,
min_delta = min_delta,
gspval_cutoff = gspval_cutoff,
gslogFC_cutoff = gslogFC_cutoff),
rlang::abort("Unknown `method`."))
idx <- purrr::map_dbl(res, is.null)
if (all(idx == 1L))
stop("No GSPA results available.")
res <- res[!idx] %>%
do.call(rbind, .) %>%
tibble::rownames_to_column("term") %>%
dplyr::mutate(term = factor(term)) %>%
dplyr::mutate_at(.vars = grep("^pVal\\s+\\(", names(.)), ~ abs(.x)) %>%
dplyr::group_by(term) %>%
dplyr::arrange(term) %>%
data.frame(check.names = FALSE)
pass <- purrr::map(res[paste0("pVal (", contrast_groups, ")")],
~ .x <= gspval_cutoff) %>%
dplyr::bind_cols() %>%
rowSums() %>%
magrittr::is_greater_than(0)
res_pass <- res[pass, ] %>%
dplyr::mutate_at(vars(grep("^pVal", names(.))),
~ replace(.x, .x < 1E-50, 1E-50))
})
if (!nrow(res_pass))
return(NULL)
out <- local({
adjp <- res_pass %>%
dplyr::select(grep("^pVal\\s+\\(", names(.))) %>%
purrr::map(~ p.adjust(., "BH")) %>%
data.frame(check.names = FALSE) %>%
`names<-`(gsub("pVal", "q_val", colnames(.))) %>%
`names<-`(gsub("^q_val\\s+\\((.*)\\)", "\\1", names(.))) %>%
tidyr::gather(key = contrast, value = q_val) %>%
dplyr::select(-contrast)
log2fc <- res_pass %>%
dplyr::select(grep("^log2Ratio\\s+\\(", names(.))) %>%
`names<-`(gsub("^log2Ratio\\s+\\((.*)\\)", "\\1", names(.))) %>%
tidyr::gather(key = contrast, value = log2fc) %>%
dplyr::select(-contrast)
pval <- res_pass %>%
dplyr::select(-grep("^log2Ratio\\s+\\(", names(.))) %>%
`names<-`(gsub("^pVal\\s+\\((.*)\\)", "\\1", names(.))) %>%
tidyr::gather(key = contrast, value = p_val, -term)
dplyr::bind_cols(pval, adjp, log2fc) %>%
dplyr::mutate_at(.vars = grep("^p_val$|^adjP$", names(.)), format,
scientific = TRUE, digits = 2) %>%
dplyr::mutate_at(.vars = grep("^log2Ratio|^FC\\s*\\(", names(.)), round, 2)
})
sig_sets <- purrr::map2(gsets, names(gsets), ~ {
tibble::tibble(id = as.character(.x)) %>%
dplyr::mutate(set = .y)
}) %>%
dplyr::bind_rows() %>%
`names<-` (c("id", "term")) %>%
dplyr::filter(id %in% unique(df$entrez)) %>%
dplyr::select(c("term", "id")) %>%
dplyr::filter(term %in% res_pass$term)
out <- out %>%
dplyr::mutate(term = as.character(term))
out <- sig_sets %>%
dplyr::group_by(term) %>%
dplyr::summarise(size = n()) %>% # the number of entries matched to input `df`
dplyr::right_join(out, by = "term")
res_greedy <- sig_sets %>%
greedysetcover() %T>%
write.table(file.path(filepath, fml_nm, paste0(fn_prefix, "_resgreedy.txt")),
sep = "\t", col.names = TRUE,
row.names = FALSE, quote = FALSE) %>%
dplyr::group_by(term) %>%
dplyr::summarise(ess_size = n()) %>%
dplyr::filter(ess_size >= min_greedy_size)
sig_sets <- res_greedy %>%
dplyr::right_join(out, by = "term") %>%
dplyr::mutate(is_essential = ifelse(!is.na(ess_size), TRUE, FALSE)) %>%
dplyr::select(term, is_essential, size, ess_size,
contrast, p_val, q_val, log2fc) %T>%
write.table(file.path(filepath, fml_nm, paste0(fn_prefix, ".txt")),
sep = "\t", col.names = TRUE,
row.names = FALSE, quote = FALSE) %>%
dplyr::filter(is_essential) %>%
dplyr::filter(!duplicated(term)) %>%
dplyr::select(-contrast, -p_val, -q_val, -log2fc) %T>%
write.table(file.path(filepath, fml_nm, paste0(fn_prefix, "_essmeta.txt")),
sep = "\t", col.names = TRUE,
row.names = FALSE, quote = FALSE) %>%
dplyr::select(term, ess_size) %>%
dplyr::right_join(sig_sets, by = "term")
map_essential(sig_sets) %>%
write.table(file.path(filepath, fml_nm, paste0(fn_prefix, "_essmap.txt")),
sep = "\t", col.names = TRUE,
row.names = FALSE, quote = FALSE)
}
#' checks the size of a gene set
#'
#' @inheritParams prnHist
#' @inheritParams prnGSPA
ok_min_size <- function (df, min_delta, gspval_cutoff = 1, gslogFC_cutoff = 0)
{
data <- df %>%
dplyr::group_by(contrast, valence) %>%
dplyr::filter(!is.na(log2Ratio), !is.na(p_val)) %>%
dplyr::mutate(n = n()) %>%
tidyr::unite(con_val, contrast, valence, sep = ".", remove = FALSE)
delta_p <- data %>%
dplyr::summarise(p_val = mean(p_val, na.rm = TRUE)) %>%
dplyr::mutate(p_val = ifelse(is.nan(p_val), 0, p_val)) %>%
tidyr::spread(key = contrast, value = p_val) %>%
dplyr::select(-valence)
delta_p[is.na(delta_p)] <- 0
if (nrow(delta_p) == 2L)
delta_p <- delta_p[2, ] - delta_p[1, ]
delta_fc <- data %>%
dplyr::summarise(log2Ratio = mean(log2Ratio, na.rm = TRUE)) %>%
dplyr::mutate(log2Ratio = ifelse(is.nan(log2Ratio), 0, log2Ratio)) %>%
tidyr::spread(key = contrast, value = log2Ratio) %>%
dplyr::select(-valence)
delta_fc[is.na(delta_fc)] <- 0
if (nrow(delta_fc) == 2L)
delta_fc <- delta_fc[2, ] + delta_fc[1, ]
delta_n <- data %>%
dplyr::summarise(n = mean(n, na.rm = TRUE)) %>% # empty levels will be kept
tidyr::spread(key = contrast, value = n) %>%
dplyr::select(-valence)
delta_n[is.na(delta_n)] <- 0
if (nrow(delta_n) == 2)
delta_n <- delta_n[2, ] - delta_n[1, ]
oks <- sign(delta_p) == sign(delta_n) &
abs(delta_p) >= -log10(gspval_cutoff) &
abs(delta_n) >= min_delta &
abs(delta_fc) >= gslogFC_cutoff
# pad missing contrast(s)
if (length(oks) < nlevels(df$contrast)) {
levs <- levels(df$contrast)
missing_nms <- levs[!levs %in% colnames(oks)]
if (length(missing_nms)) {
for (i in seq_along(missing_nms)) {
oks <- dplyr::bind_cols(!!missing_nms[i] := FALSE, oks)
}
}
oks <- oks[, levs, drop = FALSE]
}
invisible(oks)
}
#' gspa pVal calculations using geomean
#'
#' @param gset Character string; a gene set indicated by \code{gset_nms}.
#' @inheritParams prnHist
#' @inheritParams prnGSPA
gspa_summary_mean <- function(gset, df, min_size = 10, min_delta = 4,
gspval_cutoff = 0.05,
gslogFC_cutoff = log2(1))
{
df <- df %>%
dplyr::filter(entrez %in% gset)
if (length(unique(df$entrez)) < min_size)
return(NULL)
df <- df %>% dplyr::group_by(contrast, valence)
ok_delta_n <- ok_min_size(df = df,
min_delta = min_delta,
gspval_cutoff = gspval_cutoff,
gslogFC_cutoff = gslogFC_cutoff)
# penalty term
dfw <- df %>% dplyr::mutate(p_val = ifelse(is.na(p_val), 1, p_val))
delta_p <- dfw %>%
dplyr::summarise(p_val = mean(p_val, na.rm = TRUE)) %>%
dplyr::mutate(p_val = replace(p_val, is.nan(p_val), 1)) %>%
tidyr::spread(key = contrast, value = p_val) %>%
dplyr::select(-valence) %>%
`colnames<-`(paste0("pVal (", colnames(.), ")"))
if (nrow(delta_p) == 2)
delta_p <- delta_p[2, ] - delta_p[1, ]
delta_p <- abs(delta_p)
delta_p <- 1/10^delta_p
delta_p[] <- purrr::map2(as.list(delta_p), as.list(ok_delta_n),
~ {if (.y) .x else 1})
delta_fc <- dfw %>%
dplyr::summarise(log2Ratio = mean(log2Ratio, na.rm = TRUE)) %>%
dplyr::mutate(log2Ratio = replace(log2Ratio, is.nan(log2Ratio), 0)) %>%
tidyr::spread(key = contrast, value = log2Ratio) %>%
dplyr::select(-valence) %>%
`colnames<-`(paste0("log2Ratio (", colnames(.), ")")) %>%
abs(.)
if (nrow(delta_fc) == 2L)
delta_fc <- delta_fc[2, ] - delta_fc[1, ]
invisible(dplyr::bind_cols(delta_p, delta_fc))
}
#' GSPA pVal calculations using limma
#'
#' @inheritParams prnHist
#' @inheritParams prnGSPA
#' @inheritParams gspa_summary_mean
gspa_summary_limma <- function(gset, df, min_size = 10, min_delta = 4,
gspval_cutoff = 0.05, gslogFC_cutoff = 1.2)
{
df <- df %>%
dplyr::filter(entrez %in% gset)
if (!nrow(df))
stop("No entrez IDs matched to the provided gene sets.")
if (length(unique(df$entrez)) < min_size)
return(NULL)
lm_gspa(df, min_delta, gspval_cutoff, gslogFC_cutoff)
}
#' significance tests of pVals between the up and the down groups
#'
#' @inheritParams prnHist
#' @inheritParams prnGSPA
lm_gspa <- function(df, min_delta, gspval_cutoff, gslogFC_cutoff)
{
delta_fc <- df %>%
dplyr::group_by(contrast, valence) %>%
dplyr::summarise(log2Ratio = mean(log2Ratio, na.rm = TRUE)) %>%
dplyr::mutate(log2Ratio = replace(log2Ratio, is.nan(log2Ratio), 0)) %>%
tidyr::spread(key = contrast, value = log2Ratio) %>%
dplyr::select(-valence)
nms <- colnames(delta_fc)
delta_fc <- delta_fc %>%
`colnames<-`(paste0("log2Ratio (", colnames(.), ")")) %>%
abs()
if (nrow(delta_fc) == 2L)
delta_fc <- delta_fc[2, ] - delta_fc[1, ]
ok_delta_n <- ok_min_size(df = df,
min_delta = min_delta,
gspval_cutoff = gspval_cutoff,
gslogFC_cutoff = gslogFC_cutoff)
data <- df %>%
dplyr::group_by(contrast, valence) %>%
dplyr::select(-log2Ratio) %>%
tidyr::unite(sample, entrez, contrast, valence, sep = ".",
remove = TRUE) %>%
tidyr::spread(key = sample, value = p_val)
label_scheme_gspa <- tibble::tibble(Sample_ID = names(data),
Group = Sample_ID) %>%
dplyr::mutate(Group = gsub("^\\d+\\.", "", Group)) %>%
dplyr::mutate(Group = factor(Group))
design <- model.matrix(~0+label_scheme_gspa$Group) %>%
`colnames<-`(levels(label_scheme_gspa$Group)) %>%
data.frame(check.names = TRUE)
neg_cols <- colnames(design)[as.logical(seq_along(design) %% 2)]
pos_cols <- colnames(design)[!(seq_along(design) %% 2)]
contrs <- paste(pos_cols, neg_cols, sep = "-")
contr_mat <- makeContrasts(contrasts = contrs, levels = data.frame(design)) %>%
`colnames<-`(contrs) %>%
`rownames<-`(colnames(design))
fit <- data %>%
lmFit(design = design) %>%
contrasts.fit(contr_mat) %>%
eBayes()
delta_p <- fit$p.value %>%
replace(., is.na(.), 1) %>%
data.frame(check.names = FALSE) %>%
`colnames<-`(paste0("pVal (", nms, ")"))
delta_p[] <- purrr::map2(as.list(delta_p), as.list(ok_delta_n),
~ {if (.y) .x else 1})
invisible(dplyr::bind_cols(delta_p, delta_fc))
}
#' Helper of GSPA
#'
#' @inheritParams prnHist
#' @inheritParams prnGSPA
#' @inheritParams fml_gspa
#'
#' @import purrr dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
#' @importFrom readr read_tsv
prep_gspa <- function(df, id, fml_nm, col_ind, pval_cutoff = 5E-2,
logFC_cutoff = log2(1.2), use_adjP = FALSE)
{
id <- rlang::as_string(rlang::enexpr(id))
df <- df %>%
dplyr::select(grep(paste0("^", fml_nm, "\\."), names(.))) %>%
`colnames<-`(gsub(paste0("^", fml_nm, "\\."), "", names(.))) %>%
dplyr::bind_cols(df[, !col_ind, drop = FALSE], .) %>%
rm_pval_whitespace() %>%
dplyr::select(id, grep("^pVal|^adjP|^log2Ratio", names(.))) %>%
dplyr::mutate(!!id := as.character(.[[id]]))
if (use_adjP) {
df <- df %>%
dplyr::select(-grep("pVal", names(.))) %>%
`names<-`(gsub("adjP", "pVal", names(.)))
}
else {
df <- df %>%
dplyr::select(-grep("adjP", names(.)))
}
contrast_groups <- names(df[grep("^log2Ratio\\s+\\(", names(df))]) %>%
gsub("^log2Ratio\\s+\\(|\\)$", "", .)
pvals <- df %>%
dplyr::select(-grep("^log2Ratio\\s+\\(", names(.))) %>%
`names<-`(gsub("^pVal\\s+\\((.*)\\)$", "\\1", names(.))) %>%
tidyr::gather(key = contrast, value = p_val, -id)
log2rs <- df %>%
dplyr::select(-grep("^pVal\\s+\\(", names(.))) %>%
`names<-`(gsub("^log2Ratio\\s+\\((.*)\\)$", "\\1", names(.))) %>%
tidyr::gather(key = contrast, value = log2Ratio, -id) %>%
dplyr::select(-id, -contrast)
df <- dplyr::bind_cols(pvals, log2rs) %>%
dplyr::filter(p_val <= pval_cutoff) %>% # NA will be removed
dplyr::filter(abs(log2Ratio) >= logFC_cutoff) %>%
dplyr::filter(!is.na(id)) %>%
dplyr::mutate(contrast = factor(contrast, levels = contrast_groups)) %>%
dplyr::arrange(contrast)
invisible(df)
}
#' Helper for mapping between gene sets and essential gene sets
#'
#' @param sig_sets A data frame containing the gene sets that are significant
#' under given criteria.
#' @import purrr dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
map_essential <- function (sig_sets)
{
ess_terms <- sig_sets %>%
dplyr::filter(!is.na(ess_size), !duplicated(term)) %>%
dplyr::select(term) %>%
unlist
sig_greedy_sets <- sig_sets %>%
dplyr::filter(term %in% ess_terms)
res <- purrr::map(unique(sig_sets$term), ~ {
curr_sig_set <- sig_sets %>%
dplyr::filter(term %in% .x)
sig_greedy_sets %>%
dplyr::filter(id %in% curr_sig_set$id) %>%
dplyr::group_by(term) %>%
dplyr::summarise(size = n()) %>%
dplyr::left_join(curr_sig_set %>%
dplyr::select(c("term", "ess_size")) %>%
dplyr::filter(!duplicated(term)),
by = "term") %>%
dplyr::mutate(fraction = size/nrow(curr_sig_set)) %>%
dplyr::mutate(curr_sig_set = .x)
}, sig_sets, sig_greedy_sets) %>%
dplyr::bind_rows() %>%
dplyr::group_by(curr_sig_set) %>%
dplyr::rename(ess_term = "term", term = "curr_sig_set") %>%
dplyr::select(c("term", "ess_term", "size", "ess_size", "fraction")) %>%
dplyr::mutate(fraction = round(fraction, digits = 3)) %>%
# dplyr::ungroup(ess_term) %>%
dplyr::mutate(distance = 1 - fraction) %>%
dplyr::mutate(term = factor(term, levels = unique(as.character(term)))) %>%
dplyr::mutate(ess_term = factor(ess_term, levels = levels(term))) %>%
dplyr::mutate(idx = as.numeric(term), ess_idx = as.numeric(ess_term))
}
#' Heat map visualization of GSPA results
#'
#' \code{prnGSPAHM} visualizes distance heat maps and networks between essential
#' and all gene sets.
#'
#' The list of gene sets and the associative quality metrics of \code{size} and
#' \code{ess_size} are assessed after data filtration with the criteria
#' specified by arguments \code{pval_cutoff} and \code{logFC_cutoff}, as well as
#' optional varargs of \code{filter_}.
#'
#' @section \code{Protein_GSPA_[...].txt}:
#'
#' \tabular{ll}{ \strong{Key} \tab \strong{Description}\cr term \tab a gene
#' set term \cr is_essential \tab a logical indicator of gene set essentiality
#' \cr size \tab the number of IDs under a \code{term} \cr ess_size \tab the
#' number of IDs that can be found under a corresponding essential set \cr
#' contrast \tab a contrast of sample groups \cr p_val \tab significance p
#' values \cr q_val \tab \code{p_val} with \code{BH} adjustment of multiple
#' tests \cr log2fc \tab the fold change of a gene set at logarithmic base of 2
#' \cr }
#'
#' @section \code{Protein_GSPA_[...]essmap.txt}:
#'
#' \tabular{ll}{ \strong{Key} \tab \strong{Descrption}\cr term \tab a gene
#' set term \cr ess_term \tab an essential gene set term \cr size \tab the
#' number of IDs under a \code{term} with matches to an \code{ess_term} \cr
#' ess_size \tab the number of essential IDs under a \code{term} with matches
#' to an \code{ess_term} \cr fraction \tab a fraction of matches in IDs between
#' a \code{term} and a \code{ess_term} \cr distance \tab 1 - \code{fraction}
#' \cr idx \tab a numeric index of \code{term} \cr ess_idx \tab a numeric index
#' of \code{ess_term} \cr }
#'
#' @inheritParams plot_prnTrend
#' @inheritParams prnEucDist
#' @param impute_na Logical; at TRUE, input files with \code{_impNA[...].txt} in
#' name will be loaded. Otherwise, files without \code{_impNA} in name will be
#' taken. An error will be thrown if no files are matched under given
#' conditions. The default is FALSE.
#' @param fml_nms Character string or vector; the formula name(s). By default,
#' the formula(s) will match those used in \code{\link{pepSig}} or
#' \code{\link{prnSig}}.
#' @param annot_cols A character vector of column keys that can be found in
#' \code{_essmap.txt}. The values under the selected keys will be used to
#' color-code enrichment terms on the top of heat maps. The default is NULL
#' without column annotation.
#' @param annot_rows A character vector of column keys that can be found from
#' \code{_essmeta.txt} . The values under the selected keys will be used to
#' color-code essential terms on the side of heat maps. The default is NULL
#' without row annotation.
#' @param ... \code{filter2_}: Variable argument statements for the row
#' filtration against data in secondary file(s) of \code{_essmap.txt}. Each
#' statement contains to a list of logical expression(s). The \code{lhs} needs
#' to start with \code{filter2_}. The logical condition(s) at the \code{rhs}
#' needs to be enclosed in \code{exprs} with round parenthesis. For example,
#' \code{distance} is a column key in \code{Protein_GSPA_Z_essmap.txt}. The
#' statement \code{filter2_ = exprs(distance <= .95),} will remove entries
#' with \code{distance > 0.95}. See also \code{\link{normPSM}} for the format
#' of \code{filter_} statements against primary data. \cr \cr
#' \code{arrange2_}: Variable argument statements for the row ordering against
#' data in secondary file(s) of \code{_essmap.txt}. The \code{lhs} needs to
#' start with \code{arrange2_}. The expression(s) at the \code{rhs} needs to
#' be enclosed in \code{exprs} with round parenthesis. For example,
#' \code{distance} and \code{size} are column keys in
#' \code{Protein_GSPA_Z_essmap.txt}. The statement \code{arrange2_ =
#' exprs(distance, size),} will order entries by \code{distance}, then by
#' \code{size}. See also \code{\link{prnHM}} for the format of \code{arrange_}
#' statements against primary data. \cr \cr Additional arguments for
#' \code{\link[pheatmap]{pheatmap}}, i.e., \code{fontsize }... \cr \cr Note
#' arguments disabled from \code{pheatmap}: \cr \code{annotation_col}; instead
#' use keys indicated in \code{annot_cols} \cr \code{annotation_row}; instead
#' use keys indicated in \code{annot_rows}
#' @import purrr
#'
#' @example inst/extdata/examples/prnGSPAHM_.R
#'
#' @seealso
#' \emph{Metadata} \cr
#' \code{\link{load_expts}} for metadata preparation and a reduced working example in data normalization \cr
#'
#' \emph{Data normalization} \cr
#' \code{\link{normPSM}} for extended examples in PSM data normalization \cr
#' \code{\link{PSM2Pep}} for extended examples in PSM to peptide summarization \cr
#' \code{\link{mergePep}} for extended examples in peptide data merging \cr
#' \code{\link{standPep}} for extended examples in peptide data normalization \cr
#' \code{\link{Pep2Prn}} for extended examples in peptide to protein summarization \cr
#' \code{\link{standPrn}} for extended examples in protein data normalization. \cr
#' \code{\link{purgePSM}} and \code{\link{purgePep}} for extended examples in data purging \cr
#' \code{\link{pepHist}} and \code{\link{prnHist}} for extended examples in histogram visualization. \cr
#' \code{\link{extract_raws}} and \code{\link{extract_psm_raws}} for extracting MS file names \cr
#'
#' \emph{Variable arguments of `filter_...`} \cr
#' \code{\link{contain_str}}, \code{\link{contain_chars_in}}, \code{\link{not_contain_str}},
#' \code{\link{not_contain_chars_in}}, \code{\link{start_with_str}},
#' \code{\link{end_with_str}}, \code{\link{start_with_chars_in}} and
#' \code{\link{ends_with_chars_in}} for data subsetting by character strings \cr
#'
#' \emph{Missing values} \cr
#' \code{\link{pepImp}} and \code{\link{prnImp}} for missing value imputation \cr
#'
#' \emph{Informatics} \cr
#' \code{\link{pepSig}} and \code{\link{prnSig}} for significance tests \cr
#' \code{\link{pepVol}} and \code{\link{prnVol}} for volcano plot visualization \cr
#' \code{\link{prnGSPA}} for gene set enrichment analysis by protein significance pVals \cr
#' \code{\link{gspaMap}} for mapping GSPA to volcano plot visualization \cr
#' \code{\link{prnGSPAHM}} for heat map and network visualization of GSPA results \cr
#' \code{\link{prnGSVA}} for gene set variance analysis \cr
#' \code{\link{prnGSEA}} for data preparation for online GSEA. \cr
#' \code{\link{pepMDS}} and \code{\link{prnMDS}} for MDS visualization \cr
#' \code{\link{pepPCA}} and \code{\link{prnPCA}} for PCA visualization \cr
#' \code{\link{pepLDA}} and \code{\link{prnLDA}} for LDA visualization \cr
#' \code{\link{pepHM}} and \code{\link{prnHM}} for heat map visualization \cr
#' \code{\link{pepCorr_logFC}}, \code{\link{prnCorr_logFC}}, \code{\link{pepCorr_logInt}} and
#' \code{\link{prnCorr_logInt}} for correlation plots \cr
#' \code{\link{anal_prnTrend}} and \code{\link{plot_prnTrend}} for trend analysis and visualization \cr
#' \code{\link{anal_pepNMF}}, \code{\link{anal_prnNMF}}, \code{\link{plot_pepNMFCon}},
#' \code{\link{plot_prnNMFCon}}, \code{\link{plot_pepNMFCoef}}, \code{\link{plot_prnNMFCoef}} and
#' \code{\link{plot_metaNMF}} for NMF analysis and visualization \cr
#'
#' \emph{Custom databases} \cr
#' \code{\link{Uni2Entrez}} for lookups between UniProt accessions and Entrez IDs \cr
#' \code{\link{Ref2Entrez}} for lookups among RefSeq accessions, gene names and Entrez IDs \cr
#' \code{\link{prepGO}} for \code{\href{http://current.geneontology.org/products/pages/downloads.html}{gene
#' ontology}} \cr
#' \code{\link{prepMSig}} for \href{https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/}{molecular
#' signatures} \cr
#' \code{\link{prepString}} and \code{\link{anal_prnString}} for STRING-DB \cr
#'
#' \emph{Column keys in PSM, peptide and protein outputs} \cr
#' system.file("extdata", "psm_keys.txt", package = "proteoQ") \cr
#' system.file("extdata", "peptide_keys.txt", package = "proteoQ") \cr
#' system.file("extdata", "protein_keys.txt", package = "proteoQ") \cr
#'
#' @export
prnGSPAHM <- function (scale_log2r = TRUE, complete_cases = FALSE,
impute_na = FALSE, fml_nms = NULL,
annot_cols = NULL, annot_colnames = NULL, annot_rows = NULL,
df2 = NULL, filename = NULL, ...)
{
old_opts <- options()
on.exit(options(old_opts), add = TRUE)
options(warn = 1)
check_dots(c("id", "anal_type", "df", "filepath"), ...)
dat_dir <- get_gl_dat_dir()
dir.create(file.path(dat_dir, "Protein/GSPA/log"),
recursive = TRUE, showWarnings = FALSE)
id <- match_call_arg(normPSM, group_pep_by)
stopifnot(rlang::as_string(id) %in% c("prot_acc", "gene"), length(id) == 1L)
scale_log2r <- match_prnSig_scale_log2r(scale_log2r = scale_log2r,
impute_na = impute_na)
df2 <- rlang::enexpr(df2)
filename <- rlang::enexpr(filename)
annot_cols <- rlang::enexpr(annot_cols)
annot_colnames <- rlang::enexpr(annot_colnames)
annot_rows <- rlang::enexpr(annot_rows)
dots <- rlang::enexprs(...)
dots <- concat_fml_dots(
fmls = dots %>% .[grepl("^\\s*~", .)],
fml_nms = fml_nms,
dots = dots %>% .[!grepl("^\\s*~", .)],
anal_type = "GSPA"
)
reload_expts()
info_anal(id = !!id,
scale_log2r = scale_log2r,
complete_cases = complete_cases,
impute_na = impute_na,
df = NULL,
df2 = !!df2,
filepath = NULL,
filename = !!filename,
anal_type = "GSPA_hm")(annot_cols = !!annot_cols,
annot_colnames = !!annot_colnames,
annot_rows = !!annot_rows,
!!!dots)
}
#' Plots distance heat map of GSPA
#'
#' @inheritParams prnHist
#' @inheritParams prnHM
#' @inheritParams gspaMap
gspaHM <- function(scale_log2r = TRUE, complete_cases = FALSE, impute_na = FALSE,
df2, filepath, filename, ...)
{
dots <- rlang::enexprs(...)
fmls <- dots %>% .[grepl("~", .)]
dots <- dots %>% .[! names(.) %in% names(fmls)]
if (!length(fmls))
stop("No formula(s) of contrasts available.")
fml_nms <- names(fmls)
if (length(fml_nms)) {
purrr::walk(fml_nms, byfml_gspahm, df2, filepath, filename,
scale_log2r, impute_na, !!!dots)
}
}
#' A helper function for gspaHM by formula names
#'
#' @inheritParams prnHist
#' @inheritParams prnHM
#' @inheritParams gspaMap
#' @inheritParams fml_gspa
#' @import purrr dplyr pheatmap
#' @importFrom magrittr %>% %T>% %$% %<>%
byfml_gspahm <- function (fml_nm, df2, filepath, filename, scale_log2r = TRUE,
impute_na = FALSE, ...)
{
ins <- list.files(path = file.path(filepath, fml_nm),
pattern = "GSPA_[ONZ]_essmap\\.txt$")
if (!length(ins)) {
message("No GSPA results at ", fml_nm)
return(NULL)
}
if (is.null(df2)) {
ins <- ins %>%
{if (impute_na) .[grepl("_impNA", .)] else .[!grepl("_impNA", .)]}
ins <- if (is.na(scale_log2r))
ins[grepl("_GSPA_O_", ins)]
else if (scale_log2r)
ins[grepl("_GSPA_Z_", ins)]
else
ins[grepl("_GSPA_N_", ins)]
if (!length(ins)) {
stop("No GSPA inputs correspond to impute_na = ", impute_na,
", scale_log2r = ", scale_log2r)
}
}
else {
local({
non_exists <- df2 %>% .[! . %in% ins]
if (length(non_exists))
stop("Missing _essmap file(s): ", paste(non_exists, collapse = ", "))
})
if (!length(df2))
stop("Input file(s) not found.")
ins <- ins %>% .[. %in% df2]
}
meta_ins <- list.files(path = file.path(filepath, fml_nm),
pattern = "_essmeta\\.txt$")
meta_ins <- local({
required <- gsub("_essmap\\.txt$", "_essmeta.txt", ins)
meta_ins %>% .[. %in% required]
})
purrr::walk2(ins, meta_ins, byfile_gspahm, fml_nm, filepath,
filename, scale_log2r, impute_na, ...)
}
#' A helper function for gspaHM by input file names
#'
#' @param ess_in A list of file names for input data
#' @param meta_in A list of file names for input metadata
#' @inheritParams prnHist
#' @inheritParams prnHM
#' @inheritParams fml_gspa
#' @import purrr dplyr pheatmap
#' @importFrom magrittr %>% %T>% %$% %<>%
byfile_gspahm <- function (ess_in, meta_in, fml_nm, filepath, filename,
scale_log2r, impute_na, ...)
{
custom_prefix <- gsub("(.*_{0,1})Protein_GSPA.*", "\\1", ess_in)
fn_suffix <- gsub("^.*\\.([^.]*)$", "\\1", filename) # %>% .[1]
fn_prefix <- gsub("\\.[^.]*$", "", filename)
filename <- paste0(custom_prefix, fn_prefix, ".", fn_suffix)
dots <- rlang::enexprs(...)
if (length(dots)) {
if (any(grepl("^filter_", names(dots))))
stop("Primary `filter_` depreciated; use secondary `filter2_`.")
}
filter2_dots <- dots %>%
.[purrr::map_lgl(., is.language)] %>%
.[grepl("^filter2_", names(.))]
arrange2_dots <- dots %>%
.[purrr::map_lgl(., is.language)] %>%
.[grepl("^arrange2_", names(.))]
select2_dots <- dots %>%
.[purrr::map_lgl(., is.language)] %>%
.[grepl("^select2_", names(.))]
dots <- dots %>%
.[! . %in% c(filter2_dots, arrange2_dots, select2_dots)]
all_by_greedy <- tryCatch(readr::read_tsv(file.path(filepath, fml_nm, ess_in),
col_types = cols(ess_term = col_factor(),
size = col_double(),
ess_size = col_double(),
fraction = col_double(),
distance = col_double(),
idx = col_double(),
ess_idx = col_double())),
error = function(e) NA) %>%
filters_in_call(!!!filter2_dots) %>%
arrangers_in_call(!!!arrange2_dots)
rm(list = c("filter2_dots", "arrange2_dots", "select2_dots"))
if (!nrow(all_by_greedy))
stop("No GSPA terms available after data filtration.")
if (max(all_by_greedy$distance) == 0) {
warning("Identical, all-zero distance detected; ",
"try lower the criteria in data filtrations or ",
"rerun `prnGSPA` at more relaxed `gspval_cutoff` threshold.",
call. = FALSE)
return(NULL)
}
if (!is.null(dim(all_by_greedy)))
message(paste("File loaded:", ess_in, "at", fml_nm))
else
stop("Essential GSPA not found.", call. = FALSE)
ess_vs_all <- all_by_greedy %>%
dplyr::select(-which(names(.) %in% c("size", "ess_size",
"idx", "ess_idx", "distance"))) %>%
tidyr::spread(term, fraction) %>%
dplyr::mutate_at(vars(which(names(.) != "ess_term")), ~ {1 - .x}) %>%
`rownames<-`(NULL) %>%
tibble::column_to_rownames("ess_term")
d_row <- dist(ess_vs_all)
d_col <- dist(t(ess_vs_all))
max_d_row <- max(d_row, na.rm = TRUE)
max_d_col <- max(d_col, na.rm = TRUE)
d_row[is.na(d_row)] <- 1.2 * max_d_row
d_col[is.na(d_col)] <- 1.2 * max_d_col
cluster_rows <- if (is.infinite(max_d_row)) FALSE else hclust(d_row)
cluster_cols <- if (is.infinite(max_d_col)) FALSE else hclust(d_col)
mypalette <- if (is.null(dots$color))
colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100)
else
eval(dots$color, envir = rlang::caller_env())
annot_cols <- if (is.null(dots$annot_cols))
NULL
else
eval(dots$annot_cols, envir = rlang::caller_env())
annot_colnames <- if (is.null(dots$annot_colnames))
NULL
else
eval(dots$annot_colnames, envir = rlang::caller_env())
annot_rows <- if (is.null(dots$annot_rows))
NULL
else
eval(dots$annot_rows, envir = rlang::caller_env())
annotation_col <- if (is.null(annot_cols))
NA
else
gspa_colAnnot(annot_cols = annot_cols,
df = all_by_greedy,
sample_ids = all_by_greedy$term)
if (!is.null(annot_colnames) && length(annot_colnames) == length(annot_cols))
colnames(annotation_col) <- annot_colnames
if (is.null(annot_rows))
annotation_row <- NA
else {
ess_meta <- tryCatch(readr::read_tsv(file.path(filepath, fml_nm, meta_in),
col_types = cols(term = col_factor())),
error = function(e) NA)
annot_rows <- annot_rows %>% .[. %in% names(ess_meta)]
if (!length(annot_rows))
annotation_row <- NA
else {
annotation_row <- ess_meta %>%
dplyr::rename(ess_term = term) %>%
dplyr::select("ess_term", annot_rows) %>%
dplyr::mutate(ess_term =
factor(ess_term, levels = levels(all_by_greedy$ess_term))) %>%
`rownames<-`(NULL) %>%
tibble::column_to_rownames("ess_term")
}
}
annotation_colors <- if (is.null(dots$annotation_colors))
setHMColor(annotation_col)
else if (is.na(dots$annotation_colors))
NA
else
eval(dots$annotation_colors, envir = rlang::caller_env())
nrow <- nrow(ess_vs_all)
ncol <- ncol(ess_vs_all)
max_width <- 44
max_height <- 44
if (is.null(dots$width)) {
if (ncol <= 150L) {
fontsize <- 12
fontsize_col <- cellwidth <- 12
show_colnames <- TRUE
width <- pmax(ncol/1.2, 8)
}
else {
fontsize <- fontsize_col <- 1
cellwidth <- NA
show_colnames <- FALSE
width <- max_width
}
}
else {
width <- dots$width
fontsize <- if (is.null(dots$fontsize)) 1 else dots$fontsize
fontsize_col <- if (is.null(dots$fontsize_col)) 1 else dots$fontsize_col
cellwidth <- if (is.null(dots$cellwidth)) NA else dots$cellwidth
show_colnames <- if (is.null(dots$show_colnames)) FALSE else dots$show_colnames
}
if (is.null(dots$height)) {
if (nrow <= 150) {
fontsize <- 12
fontsize_row <- cellheight <- 12
show_rownames <- TRUE
height <- pmax(nrow/1.2, 8)
}
else {
fontsize <- fontsize_row <- 1
cellheight <- NA
show_rownames <- FALSE
height <- max_height
}
}
else {
height <- dots$height
fontsize <- if (is.null(dots$fontsize)) 1 else dots$fontsize
fontsize_row <- if (is.null(dots$fontsize_row)) 1 else dots$fontsize_row
cellheight <- if (is.null(dots$cellheight)) NA else dots$cellheight
show_rownames <- if (is.null(dots$show_rownames)) FALSE else dots$show_rownames
}
if ((!is.na(width)) && (width > max_width)) {
warning("The plot width is set to ", max_width, call. = FALSE)
width <- max_width
height <- min(max_height, width * nrow / ncol)
}
if ((!is.na(height)) && (height > max_height)) {
warning("The plot height is set to ", max_height, call. = FALSE)
height <- max_height
width <- min(max_width, height * ncol / nrow)
}
dots <- dots %>%
.[!names(.) %in% c("annot_cols", "annot_colnames", "annot_rows",
"mat", "filename", "annotation_col", "annotation_row",
"color", "annotation_colors", "breaks",
"cluster_rows", "cluster_cols",
"width", "height", "fontsize_row", "fontsize_col",
"cellheight", "cellwidth")]
ph <- my_pheatmap(
mat = ess_vs_all,
filename = file.path(filepath, fml_nm, filename),
annotation_col = annotation_col,
annotation_row = annotation_row,
color = mypalette,
annotation_colors = annotation_colors,
breaks = NA, # not used
cluster_rows = cluster_rows,
cluster_cols = cluster_cols,
fontsize_row = fontsize_row,
fontsize_col = fontsize_col,
cellheight = cellheight,
cellwidth = cellwidth,
show_rownames = show_rownames,
show_colnames = show_colnames,
width = width,
height = height,
!!!dots,
)
# networks
if (!requireNamespace("networkD3", quietly = TRUE)) {
stop("\n====================================================================",
"\nPackage \"networkD3\" needed for this function to work.",
"\n====================================================================",
call. = FALSE)
}
cluster <- data.frame(cluster = cutree(ph$tree_col, h = max_d_col)) %>%
tibble::rownames_to_column("term")
all_by_greedy <- local({
essterms <- unique(all_by_greedy$ess_term) %>% as.character()
terms <- unique(all_by_greedy$term) %>% .[! . %in% essterms] %>% as.character()
universe <- union(essterms, terms)
all_by_greedy <- all_by_greedy %>%
dplyr::select(-idx, -ess_idx) %>%
dplyr::mutate(term = factor(term, levels = universe)) %>%
dplyr::mutate(ess_term = factor(ess_term, levels = universe)) %>%
dplyr::mutate(source = as.numeric(term), target = as.numeric(ess_term))
min_target <- min(all_by_greedy$target, na.rm = TRUE)
all_by_greedy %>%
dplyr::mutate(source = source - min_target, target = target - min_target) %>%
dplyr::mutate(term = as.character(term)) %>%
dplyr::left_join(cluster, by = "term") %>%
dplyr::mutate(term = factor(term, levels(ess_term)))
})
my_nodes <- local({
my_nodes <- all_by_greedy %>%
dplyr::arrange(-size) %>%
dplyr::select(c("term", "cluster", "size")) %>%
dplyr::filter(!duplicated(term)) %>%
dplyr::arrange(cluster) %>%
dplyr::arrange(term)
# an `ess_term` may be missing from `term` after data row filtration
temp_nodes <- all_by_greedy %>%
dplyr::select(c("ess_term", "cluster", "size")) %>%
dplyr::filter(!duplicated(ess_term)) %>%
dplyr::filter(! ess_term %in% my_nodes$term) %>%
dplyr::arrange(cluster) %>%
dplyr::arrange(ess_term) %>%
dplyr::rename(term = ess_term)
my_nodes <- bind_rows(my_nodes, temp_nodes) %>%
dplyr::arrange(cluster) %>%
dplyr::arrange(term)
}) %>%
data.frame(check.names = FALSE)
my_links <- all_by_greedy %>%
dplyr::arrange(term) %>%
dplyr::select(source, target, fraction) %>%
dplyr::mutate(fraction = fraction * 10) %>%
dplyr::distinct() %>%
data.frame(check.names = FALSE)
fn_prefix <- gsub("\\.[^.]*$", "", filename)
networkD3::forceNetwork(Links = my_links, Nodes = my_nodes, Source = "source",
Target = "target", Value = "fraction", NodeID = "term",
Nodesize = "size",
Group = "cluster", opacity = 0.8, zoom = TRUE) %>%
networkD3::saveNetwork(file = file.path(filepath, fml_nm, paste0(fn_prefix, ".html")))
}
#' Sets up the column annotation in heat maps
#'
#' @param sample_ids A character vector containing the sample IDs for an ascribing analysis.
#' @inheritParams prnEucDist
#' @import dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
gspa_colAnnot <- function (annot_cols = NULL, df, sample_ids, annot_colnames = NULL)
{
if (is.null(annot_cols))
return(NA)
exists <- annot_cols %in% names(df)
if (sum(!exists) > 0) {
warning(paste0("Column '", annot_cols[!exists], "'",
" not found in GSPA inputs and will be skipped."))
annot_cols <- annot_cols[exists]
}
if (!length(annot_cols))
return(NA)
x <- df %>%
dplyr::filter(term %in% sample_ids, !duplicated(term)) %>%
dplyr::select(annot_cols, term) %>%
dplyr::select(which(not_all_NA(.))) %>%
data.frame(check.names = FALSE) %>%
`rownames<-`(.[["term"]])
if (any(duplicated(x[["term"]])))
stop("Duplicated terms found\n")
if (!"term" %in% annot_cols)
x <- dplyr::select(x, -term)
if (!ncol(x))
return(NA)
invisible(x)
}
##########################################
# From mzion
##########################################
#' Greedy set cover.
#'
#' @param df A two-column data frame.
greedysetcover <- function (df)
{
## assume: (1) two columns
# stopifnot(ncol(df) == 2L)
len <- length(unique(df[[1]]))
if (len == 1L)
return(df)
nms <- colnames(df)
colnames(df) <- c("s", "a")
# (2) no duplicated entries (for speeds)
# df <- df %>%
# tidyr::unite(sa, s, a, sep = "@", remove = FALSE) %>%
# dplyr::filter(!duplicated(sa)) %>%
# dplyr::select(-sa)
# ---
cts <- df %>%
dplyr::group_by(s) %>%
dplyr::summarise(n = n())
df <- dplyr::left_join(cts, df, by = "s") %>%
dplyr::arrange(-n)
sets <- NULL
while(nrow(df)) {
s <- df[[1, "s"]]
sa <- df[df$s == s, c("s", "a")]
sets <- rbind(sets, sa)
# may consider partial sorting
df <- df %>%
dplyr::filter(! a %in% sa[["a"]]) %>%
dplyr::group_by(s) %>%
dplyr::mutate(n = n()) %>%
dplyr::arrange(-n)
}
colnames(sets) <- nms
invisible(sets)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.