#' Prepares data for analysis
#'
#' \code{prepDM} prepares a minimal adequate data frame for subsequent analysis.
#'
#' @inheritParams prnEucDist
#' @inheritParams info_anal
#' @inheritParams normPSM
#' @param sub_grp Numeric. A list of sample IDs that will be used in subsequent
#' analysis.
#' @param type The type of data, for example ratio or intensity.
#' @return A data frame tailored for subsequent analysis.
#'
#' @examples
#' \donttest{tempData <- prepDM(df, entrez, scale_log2r, label_scheme_sub$Sample_ID)}
#' @import dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
prepDM <- function(df = NULL, id = "pep_seq", scale_log2r = TRUE, sub_grp = NULL,
type = "ratio", anal_type = NULL, rm_allna = FALSE)
{
dat_dir <- get_gl_dat_dir()
label_scheme <- load_ls_group(dat_dir, label_scheme)
if (!nrow(df))
stop("Zero row of data after subsetting.")
id <- rlang::as_string(rlang::enexpr(id))
if (anal_type %in% c("ESGAGE", "GSVA"))
id <- "entrez"
if ((anal_type %in% c("GSEA")) && (id != "gene"))
stop("Primary ID is not `gene`.", call. = FALSE)
NorZ_ratios <- find_NorZ(scale_log2r)
pattern <-
"I[0-9]{3}\\(|log2_R[0-9]{3}\\(|pVal\\s+\\(|adjP\\s+\\(|log2Ratio\\s+\\(|\\.FC\\s+\\("
df <- local({
df <- df %>%
dplyr::ungroup() %>%
dplyr::filter(!duplicated(!!rlang::sym(id)),
!is.na(!!rlang::sym(id)),)
if (!nrow(df))
stop("Zero row of data after the removals of duplicated or NA `", id, "`.")
df <- df %>%
dplyr::ungroup() %>%
{ if (rm_allna) dplyr::filter(., rowSums(!is.na(.[grep(NorZ_ratios, names(.))])) > 0)
else . } %>%
reorderCols(endColIndex = grep(pattern, names(.)), col_to_rn = id)
if (!nrow(df))
stop("All intensity are NA or 0 in the input data.")
df
})
Levels <- sub_grp %>%
as.character(.) %>%
.[!grepl("^Empty\\.[0-9]+", .)]
dfR <- local({
dfR <- df %>%
dplyr::select(grep(NorZ_ratios, names(.))) %>%
`colnames<-`(label_scheme$Sample_ID) %>%
dplyr::select(which(names(.) %in% sub_grp))
# reference will drop with single reference
non_trivials <- dfR %>%
dplyr::select(which(not_all_zero(.))) %>%
names()
trivials <- names(dfR) %>% .[! . %in% non_trivials]
if (length(trivials)) {
warning("Samples with all NA entries being dropped: ",
paste(trivials, collapse = ", "))
}
dfR <- dfR %>%
dplyr::select(which(not_all_zero(.))) %>%
dplyr::select(Levels[Levels %in% names(.)]) # ensure the same order
})
# dominated by log2R, no need to filter all-NA intensity rows
dfI <- df %>%
dplyr::select(grep("^N_I[0-9]{3}", names(.))) %>%
`colnames<-`(label_scheme$Sample_ID) %>%
dplyr::select(which(names(.) %in% sub_grp)) %>%
dplyr::select(which(not_all_zero(.))) %>%
dplyr::select(which(names(.) %in% names(dfR))) %>%
dplyr::select(Levels[Levels %in% names(.)])
tempI <- dfI
rownames(tempI) <- paste(rownames(tempI), "Intensity", sep = "@")
tempR <- dfR
rownames(tempR) <- paste(rownames(tempR), "log2R", sep = "@")
invisible(list(log2R = dfR,
Intensity = dfI,
IR = rbind(tempR, tempI)))
}
#' Finds the patterns of log2FC columns.
#'
#' @inheritParams prnHist
find_NorZ <- function (scale_log2r = TRUE)
{
if (is.na(scale_log2r))
"^log2_R"
else if (scale_log2r)
"^Z_log2_R"
else
"^N_log2_R"
}
#' Replaces column names to get Sample_ID(s).
#'
#' @param NorZ_ratios A string.
#' @param nms A vector of names.
replace_NorZ_names <- function (NorZ_ratios, nms)
{
gsub(paste0(NorZ_ratios, "[0-9]{3}[NC]{0,1}\\s+\\((.*)\\)$"), "\\1", nms)
}
#' Prefix form of colnames(x)[c(2, 5, ...)] for use in pipes
#'
#' \code{names_pos<-} rename the columns at the indexes of \code{pos}.
#'
#' @param x A data frame.
#' @param pos Numeric. The index of columns for name change.
#' @param value Characters. The new column names.
#' @return The data frame with new names.
#'
#' @import dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
`names_pos<-` <- function(x, pos, value)
{
names(x)[pos] <- value
x
}
#' Re-order file names
#'
#' \code{reorder_files} re-orders file names by TMT set numbers then by LCMS
#' injection numbers.
#'
#' @param filelist A list of file names.
#' @import dplyr
#' @importFrom stringr str_split
#' @importFrom magrittr %>% %T>% %$% %<>%
reorder_files <- function(filelist)
{
newlist <- NULL
tmt_sets <- gsub("^TMTset(\\d+).*", "\\1", filelist) %>%
unique() %>%
as.integer() %>%
sort()
lcms_injs <- gsub("^.*_LCMSinj(\\d+).*", "\\1", filelist) %>%
unique() %>%
as.integer() %>%
sort()
if (anyNA(tmt_sets)) {
stop("Values under `expt_smry.xlsx::TMT_Set` need to be integers.",
call. = FALSE)
}
if (anyNA(lcms_injs)) {
stop("Values under `expt_smry.xlsx::LCMS_Injection` need to be integers.",
call. = FALSE)
}
for (i in tmt_sets) {
newlist <- c(newlist, filelist[grep(paste0("TMTset", i, "_"), filelist,
ignore.case = TRUE)])
}
return(newlist)
}
#' Re-order columns in a data frame
#'
#' \code{reorderCols} re-orders columns in a data frame.
#'
#' @param df A data frame.
#' @param endColIndex the indexes of columns to be moved to the end of
#' \code{df}.
#' @param col_to_rn the column identifier where the values under that column
#' will be used as row names.
#' @import dplyr
#' @importFrom stringr str_split
#' @importFrom magrittr %>% %T>% %$% %<>%
reorderCols <- function(df, endColIndex, col_to_rn)
{
if (!length(endColIndex)) {
endColIndex <- grep("I[0-9]{3}|log2_R[0-9]{3}", names(df))
}
df <- cbind(df[, -endColIndex], df[, endColIndex])
if (sum(duplicated(df[[col_to_rn]])) > 0) {
df <- df %>%
dplyr::group_by(!!ensym(col_to_rn)) %>%
dplyr::mutate(Index = row_number()) %>%
dplyr::mutate(indexed_names = paste(get(col_to_rn), Index, sep=".")) %>%
data.frame(check.names = FALSE) %>%
`rownames<-`(.[, "indexed_names"]) %>%
dplyr::select(-c("Index", "indexed_names"))
} else {
rownames(df) <- df[[col_to_rn]]
}
return(df)
}
#' Re-order columns in a data frame
#'
#' \code{reorderCols2} re-orders columns in a data frame.
#'
#' @param df A data frame.
#' @param pattern columns matched the pattern will be moved to the end of
#' \code{df}.
#' @import dplyr
#' @importFrom stringr str_split
#' @importFrom magrittr %>% %T>% %$% %<>%
#' @export
reorderCols2 <- function(df = NULL, pattern = NULL)
{
if (is.null(df))
stop("`df` cannot be `NULL`.", call. = FALSE)
if (is.null(pattern)) {
pattern <-
"I[0-9]{3}\\(|log2_R[0-9]{3}\\(|pVal\\s+\\(|adjP\\s+\\(|log2Ratio\\s+\\(|\\.FC\\s+\\("
}
endColIndex <- grep(pattern, names(df))
if (length(endColIndex))
df <- dplyr::bind_cols(df[, -endColIndex], df[, endColIndex])
return(df)
}
#' Re-order columns in a data frame
#'
#' \code{ins_cols_after} re-orders columns in a data frame.
#'
#' @param df A data frame.
#' @param idx_bf A column index for the insertion of columns after.
#' @param idx_ins Column index(es) for the columns to be inserted after
#' \code{idx_bf}.
#' @import dplyr
#' @importFrom stringr str_split
#' @importFrom magrittr %>% %T>% %$% %<>%
ins_cols_after <- function(df = NULL, idx_bf = ncol(df), idx_ins = NULL)
{
if (is.null(df))
stop("`df` cannot be `NULL`.", call. = FALSE)
if (is.null(idx_ins))
return(df)
if (idx_bf >= ncol(df))
return(df)
col_nms <- names(df)[idx_ins]
dplyr::bind_cols(
df[, 1:idx_bf, drop = FALSE],
df[, idx_ins, drop = FALSE],
df[, (idx_bf + 1):ncol(df), drop = FALSE] %>% dplyr::select(-col_nms),
)
}
#' Sorts by the indexes of TMT_Set and LCMS_Injection
#'
#' "8.3" "8.5" "11.1" "11.3" "99.2" "99.10"
#' Not "8.3" "8.5" "11.1" "11.3" "99.10" "99.2"
#' Nor "11.1" "11.3" "8.3" "8.5" "99.10" "99.2"
#' @param nms A list of names of "TMT_Set" and "LCMS_Injection" separated by dot.
sort_tmt_lcms <- function (nms)
{
nms %>%
purrr::map(~ strsplit(.x, "[.]") %>% unlist()) %>%
do.call(rbind, .) %>%
data.frame() %>%
`colnames<-`(c("TMT_Set", "LCMS_Injection")) %>%
dplyr::mutate(TMT_Set = as.numeric(TMT_Set),
LCMS_Injection = as.numeric(LCMS_Injection)) %>%
dplyr::arrange(TMT_Set, LCMS_Injection) %>%
dplyr::mutate(tmt_inj = paste(TMT_Set, LCMS_Injection, sep = ".")) %>%
dplyr::select(tmt_inj) %>%
unlist()
}
#' Replaces zero intensity with NA
#'
#' \code{na_zeroIntensity} replaces zero intensity with NA to avoid -Inf in
#' log10 transformation.
#'
#' @param df A list of file names.
#' @import dplyr
#' @importFrom stringr str_split
#' @importFrom magrittr %>% %T>% %$% %<>%
na_zeroIntensity <- function(df)
{
ind <- grep("I[0-9]{3}|^LFQ\\sintensity\\s|^Intensity\\s", names(df))
temp <- df[, ind]
temp[temp == 0] <- NA
df[, ind] <- temp
invisible(df)
}
#' Finds all zero rows
#'
#' @param x A data frame.
not_allzero_rows <- function(x) (rowSums(x != 0, na.rm = TRUE) > 0)
#' Summarizes numeric values
#'
#' \code{aggrNums} summarizes \code{log2FC} and \code{intensity} by the
#' descriptive statistics of \code{c("mean", "median", "weighted.mean",
#' "top.3")}
#'
#' @param f A function for data summarization.
#' @examples \donttest{df_num <- aggrNums(median)(df, prot_acc, na.rm = TRUE)}
#' @import dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
aggrNums <- function(f)
{
function (df, id, ...) {
id <- rlang::as_string(rlang::enexpr(id))
dots <- rlang::enexprs(...)
df %>%
dplyr::select(id, grep("log2_R[0-9]{3}|I[0-9]{3}", names(.))) %>%
dplyr::group_by(!!rlang::sym(id)) %>%
dplyr::summarise_all(~ f(.x, !!!dots))
}
}
#' Sums top_n
#'
#' \code{aggrTopn} summarizes \code{log2FC} and \code{intensity} by the
#' descriptive statistics of \code{c("mean", "median", "sum")}. Note the
#' difference to \link{TMT_top_n}, which uses mean statistics.
#'
#' @param f A function for data summarization.
#' @examples \donttest{df_num <- aggrTopn(sum)(df, prot_acc, 3, na.rm = TRUE)}
aggrTopn <- function(f)
{
function (df, id, n, ...) {
id <- rlang::as_string(rlang::enexpr(id))
dots <- rlang::enexprs(...)
df %>%
dplyr::select(id, grep("log2_R[0-9]{3}|I[0-9]{3}", names(.))) %>%
dplyr::mutate(sum_Intensity = rowSums(.[grep("^I[0-9]{3}", names(.))],
na.rm = TRUE)) %>%
dplyr::group_by(!!rlang::sym(id)) %>%
dplyr::top_n(n = n, wt = sum_Intensity) %>%
dplyr::select(-sum_Intensity) %>%
dplyr::summarise_all(~ f(.x, !!!dots))
}
}
#' Aggregation of LFQ intensity.
#'
#' Summarizes \code{log2FC} and \code{intensity} by the descriptive statistics
#' of \code{c("mean", "median", "sum")}. Note that data are already subset by
#' top_n.
#'
#' \code{ids} are a list of column keys.
#'
#' @param f A function for data summary.
#' @examples \donttest{df_num <- aggrLFQ(sum)(df, prot_acc, na.rm = TRUE)}
aggrLFQs <- function(f)
{
function (df, ids, ...) {
nms <- names(df)
bads <- ids[!ids %in% nms]
if (length(bads))
stop("Columns not found `", paste(bads, collapse = ", "), "` not found.")
cols <- grep("log2_R[0-9]{3}|I[0-9]{3}", nms)
if (!length(cols))
stop("Columns of log2 ratios or intensity not found.")
df %>%
dplyr::select(ids, cols) |>
dplyr::group_by_at(ids) |>
dplyr::summarise_all(function (x) f(x, ...))
}
}
#' Calculates weighted mean
#'
#' \code{tmt_wtmean} calculates the weighted mean of \code{log2FC} based on
#' \code{intensity}.
#'
#' @param x A data frame of \code{log2FC} and \code{intensity}.
#' @param id The variable to summarize \code{log2FC}.
#' @param na.rm Logical; if TRUE, removes NA values.
#' @param ... Additional arguments for \code{weighted.mean}.
#' @import dplyr
#' @importFrom stringr str_length
#' @importFrom tidyr gather
#' @importFrom magrittr %>% %T>% %$% %<>%
tmt_wtmean <- function (x, id, na.rm = TRUE, ...)
{
my_trim <- function (x, range_int = c(.05, .95), na.rm = TRUE) {
if (range_int[2] > 1) range_int <- range_int/100
min_x <- min(x, na.rm = na.rm)
qs <- quantile(x, probs = range_int, na.rm = na.rm)
x[x < qs[1] | x > qs[2]] <- min_x
return(x)
}
my_wtmean <- function (x) {
x %>%
dplyr::mutate_at(.vars = "Intensity", my_trim, range_int = range_int,
na.rm = TRUE) %>%
dplyr::mutate(n = stringr::str_length(.[[id]])) %>%
dplyr::filter(n != 0) %>%
dplyr::group_by(!!rlang::sym(id), variable) %>%
dplyr::summarise(log2_R = weighted.mean(log2_R, log10(Intensity),
!!!dots)) %>%
tidyr::spread(variable, log2_R)
}
range_int <- c(.05, .95)
id <- rlang::as_string(rlang::enexpr(id))
dots <- rlang::enexprs(...)
# intensity
xi <- x %>%
dplyr::select(id, grep("^I[0-9]{3}[NC]{0,1}", names(.))) %>%
tidyr::gather(key = variable, value = Intensity, -id)
# mean: `I` and `N_I`
xi_wt <- x %>%
dplyr::select(id, grep("^[N]{0,1}[_]{0,1}I[0-9]{3}[NC]{0,1}", names(.))) %>%
dplyr::group_by(!!rlang::sym(id)) %>%
dplyr::summarise_all(mean, na.rm = TRUE, !!!dots)
# weighted.mean `log2_R`
xr <- x %>%
dplyr::select(id, grep("^log2_R[0-9]{3}[NC]{0,1}", names(.))) %>%
tidyr::gather(key = variable, value = log2_R, -id)
if ("log2_R" %in% names(xr)) {
xr_wt <- cbind.data.frame(xi, log2_R = xr[, c("log2_R")]) %>%
dplyr::mutate(variable = gsub("^I([0-9]{3}[NC]{0,1})", "log2_R\\1", variable),
variable = factor(variable, levels = unique(variable))) %>%
my_wtmean()
} else {
xr_wt <- xi_wt[, 1, drop = FALSE]
}
# weighted.mean `N_log2_R`
xnr <- x %>%
dplyr::select(id, grep("^N_log2_R[0-9]{3}[NC]{0,1}", names(.))) %>%
tidyr::gather(key = variable, value = log2_R, -id)
if ("log2_R" %in% names(xnr)) {
xnr_wt <- cbind.data.frame(xi, log2_R = xnr[, c("log2_R")]) %>%
dplyr::mutate(variable = gsub("^I([0-9]{3}[NC]{0,1})", "N_log2_R\\1", variable),
variable = factor(variable, levels = unique(variable))) %>%
my_wtmean()
} else {
xnr_wt <- xi_wt[, 1, drop = FALSE]
}
# weighted.mean `Z_log2_R` may not be available for PSM data
xzr <- x %>%
dplyr::select(id, grep("^Z_log2_R[0-9]{3}[NC]{0,1}", names(.))) %>%
tidyr::gather(key = variable, value = log2_R, -id)
if ("log2_R" %in% names(xzr)) {
xzr_wt <- cbind.data.frame(xi, log2_R = xzr[, c("log2_R")]) %>%
dplyr::mutate(variable = gsub("^I([0-9]{3}[NC]{0,1})", "Z_log2_R\\1", variable),
variable = factor(variable, levels = unique(variable))) %>%
my_wtmean()
} else {
xzr_wt <- xi_wt[, 1, drop = FALSE]
}
x <- list(xi_wt, xr_wt, xnr_wt, xzr_wt) %>%
purrr::reduce(dplyr::left_join, by = id)
dplyr::bind_cols(
x[, names(x) == id],
x[, grepl("^I[0-9]{3}[NC]{0,1}", names(x))],
x[, grepl("^N_I[0-9]{3}[NC]{0,1}", names(x))],
x[, grepl("^log2_R[0-9]{3}[NC]{0,1}", names(x))],
x[, grepl("^N_log2_R[0-9]{3}[NC]{0,1}", names(x))],
x[, grepl("^Z_log2_R[0-9]{3}[NC]{0,1}", names(x))],
)
}
#' Average top_n
#'
#' \code{TMT_wt_mean} calculates the weighted mean of \code{log2FC} and
#' \code{intensity}. Note the difference to \link{aggrTopn}, which uses sum
#' statistics.
#'
#' @param x A data frame of \code{log2FC} and \code{intensity}.
#' @param id The variable to summarize \code{log2FC}.
#' @param ... Additional arguments for \code{mean}.
#' @examples \donttest{df_num <- TMT_top_n(df, prot_acc, na.rm = TRUE)}
#' @import dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
TMT_top_n <- function (x, id, ...)
{
id <- rlang::as_string(rlang::enexpr(id))
dots <- rlang::enexprs(...)
x %>%
dplyr::select(id, grep("log2_R[0-9]{3}|I[0-9]{3}", names(.))) %>%
dplyr::mutate(sum_Intensity = rowSums(.[grep("^I[0-9]{3}", names(.))],
na.rm = TRUE)) %>%
dplyr::group_by(!!rlang::sym(id)) %>%
dplyr::top_n(n = 3, wt = sum_Intensity) %>%
dplyr::select(-sum_Intensity) %>%
dplyr::summarise_all(~ mean(.x, !!!dots))
}
#' Finds not-all-zero column(s)
#'
#' \code{not_all_zero} identifies the column indexes with all NA values.
#'
#' @param df A data frame.
#' @examples \donttest{not_all_zero(mtcars)}
#' @return A logical vector
not_all_zero <- function (df)
{
stopifnot(is.data.frame(df))
colSums(df != 0, na.rm = TRUE) > 0
}
#' Finds not-all-NA column(s)
#'
#' \code{not_all_NA} identifies the column indexes with all NA values.
#'
#' @param df A data frame.
#' @import dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
#' @examples \donttest{not_all_NA(mtcars)}
#' @return A logical vector
not_all_NA <- function (df)
{
stopifnot(is.data.frame(df))
colSums(!is.na(df), na.rm = TRUE) > 0
}
#' Finds non-all-NaN column(s)
#'
#' @param x A vector.
#' @param ... Additional arguments for \code{sum}.
#' @examples \donttest{purrr::map_lgl(mtcars, not_all_nan, na.rm = TRUE)}
not_all_nan <- function(x, ...)
{
stopifnot(is.vector(x))
sum(is.nan(x), ...) != length(x)
}
#' Finds all-NaN column(s)
#'
#' @param x A data frame of \code{log2FC} and \code{intensity}.
#' @param ... The same in \code{sum}.
#' @examples \donttest{purrr::map_lgl(mtcars, is_all_nan, na.rm = TRUE)}
is_all_nan <- function(x, ...)
{
stopifnot(is.vector(x))
sum(is.nan(x), ...) == length(x)
}
#' Finds a trivial column
#'
#' @param df A data frame of \code{log2FC} and \code{intensity}.
#' @param col The index of column.
#' @examples \donttest{is_trivial_col(mtcars, 1)}
is_trivial_col <- function (df, col)
{
stopifnot(length(col) == 1L)
x <- df[, col]
x[x == 0] <- NA
x[is.nan(x)] <- NA
all(is.na(x))
}
#' Finds a trivial vector
#'
#' @param x A numeric vector
is_trivial_dbl <- function (x)
{
stopifnot(is.vector(x))
if (!is.numeric(x))
return(FALSE)
x[x == 0] <- NA
x[is.nan(x)] <- NA
all(is.na(x))
}
#' Replaces a trivial vector with NA values.
#'
#' @param x A numeric vector
replace_trivial_with_na <- function (x)
{
if (is_trivial_dbl(x)) {
x <- rep(NA, length(x))
}
invisible(x)
}
#' Sets up the column annotation in heat maps
#'
#' @inheritParams prnEucDist
#' @inheritParams gspa_colAnnot
#' @import dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
colAnnot <- function (annot_cols = NULL, sample_ids = NULL, annot_colnames = NULL)
{
if (is.null(annot_cols))
return(NA)
dat_dir <- get_gl_dat_dir()
label_scheme <- load_ls_group(dat_dir, label_scheme)
exists <- annot_cols %in% names(label_scheme)
if (sum(!exists) > 0L) {
warning(paste0("Column '", annot_cols[!exists], "'",
" not found in \"label_scheme\" and will be skipped."),
call. = FALSE)
annot_cols <- annot_cols[exists]
}
if (!length(annot_cols))
return(NA)
x <- label_scheme %>%
dplyr::filter(Sample_ID %in% sample_ids) %>%
dplyr::select(annot_cols, Sample_ID)
idx <- !not_all_NA(x)
if (sum(idx) > 0L)
stop("No aesthetics defined under column(s) `",
paste(names(x)[idx], collapse = ", "), "`.")
x <- x %>%
dplyr::filter(!grepl("^Empty\\.", .[["Sample_ID"]]),
!is.na(.[["Sample_ID"]])) %>%
data.frame(check.names = FALSE) %>%
`rownames<-`(.[["Sample_ID"]])
if (any(duplicated(x[["Sample_ID"]])))
stop("Duplicated sample IDs found.\n")
if (!"Sample_ID" %in% annot_cols)
x <- x %>% dplyr::select(-Sample_ID)
if (!ncol(x))
return(NA)
if ("TMT_Set" %in% names(x)) {
x <- x %>%
tibble::rownames_to_column() %>%
dplyr::mutate(TMT_Set = as.factor(TMT_Set)) %>%
`rownames<-`(NULL) %>%
tibble::column_to_rownames(var = "rowname")
}
x <- x %>%
mutate_if(is.logical, factor)
invisible(x)
}
#' Customizes the colors in column annotation
#'
#' @param annotation_col The same as in \link[pheatmap]{pheatmap}.
#' @import dplyr
#' @importFrom magrittr %>% %T>% %$% %<>%
setHMColor <- function (annotation_col)
{
ncol <- ncol(annotation_col)
if (is.null(ncol))
return (NA)
if (!ncol)
return (NA)
suppressWarnings(
annotation_col <- annotation_col %>%
dplyr::mutate_at(vars(one_of("TMT_Set")), ~ factor(TMT_Set)) %>%
dplyr::mutate_if(is.character, as.factor)
)
palette <- lapply(names(annotation_col), function(x) {
n <- nlevels(annotation_col[[x]])
palette <- if(n < 9L && n >= 3L)
brewer.pal(n, name = "Set2") # Set2: maximum 8
else if(n >= 9L)
colorRampPalette(brewer.pal(n = 7L, "Set1"))(n)
else if(n == 2L)
c("#66C2A5", "#FC8D62")
else if(n == 1L)
c("#66C2A5")
else if(n == 0L)
colorRampPalette(brewer.pal(n = 9L, "YlOrBr"))(100)
names(palette) <- levels(annotation_col[[x]])
palette
})
names(palette) <- names(annotation_col)
palette
}
#' Sets the upper and the lower limits in the range of heat maps
#'
#' \code{setHMlims} imputes values beyond the limits to the corresponding
#' limits.
#'
#' @param x A data frame of \code{log2FC}.
#' @param xmin the lower limit.
#' @param xmax the upper limit.
#' @import dplyr
#' @importFrom stringr str_split
#' @importFrom magrittr %>% %T>% %$% %<>%
setHMlims <- function (x, xmin, xmax)
{
x[x < xmin] <- xmin
x[x > xmax] <- xmax
invisible(x)
}
#' Calculate the log2-ratio to the control group of samples
#'
#' @inheritParams info_anal
#' @inheritParams gspaTest
#' @param nm_ctrl The names of samples that belong to the control group.
#' @examples \donttest{ratio_toCtrl(df, "gene", label_scheme_sub, Heatmap_Group)}
ratio_toCtrl <- function(df, id, label_scheme_sub, nm_ctrl)
{
id <- rlang::as_string(rlang::enexpr(id))
nm_ctrl <- rlang::as_string(rlang::ensym(nm_ctrl))
x <- df %>%
dplyr::select(which(names(.) %in% label_scheme_sub$Sample_ID)) %>%
`rownames<-`(df[[id]])
col_ratio <- which(names(x) %in% label_scheme_sub$Sample_ID)
col_ctrl <- which(names(x) %in%
label_scheme_sub[!is.na(label_scheme_sub[[nm_ctrl]]), "Sample_ID"])
x[, col_ratio] <- sweep(x[, col_ratio], 1,
rowMeans(x[, col_ctrl, drop = FALSE], na.rm = TRUE), "-")
x <- x %>% tibble::rownames_to_column(id)
df %>%
dplyr::select(which(!names(df) %in% label_scheme_sub$Sample_ID)) %>%
dplyr::left_join(x, by = id) %>%
`rownames<-`(.[[id]])
}
#'Imputation of NA values
#'
#'
#'\code{imputeNA} imputes NA \code{log2FC} values in protein or peptide data.
#'Users should avoid call the method directly, but instead use the following
#'wrappers.
#'
#'@param overwrite Logical. If true, overwrite the previous results. The default
#' is FALSE.
#'@inheritParams prnHist
#'@inheritParams info_anal
#'@param ... Parameters for \code{\link[mice]{mice}}
#'@return \code{Peptide_impNA.txt} for peptide data and \code{Protein_impNA.txt}
#' for protein data.
#'
#'@import dplyr purrr
#'@importFrom magrittr %>% %T>% %$% %<>%
#'@export
imputeNA <- function (id, overwrite = FALSE, ...)
{
my_mice <- function (data, ...) {
data <- rlang::enexpr(data)
dots <- rlang::enexprs(...)
rlang::eval_bare(rlang::expr(mice::mice(data = !!data, !!!dots)),
env = rlang::caller_env())
}
handleNA <- function (x, ...) {
ind <- purrr::map(x, is.numeric) %>% unlist()
if (sum(ind) < ncol(x)) cat(names(ind)[!ind], "skipped.")
# handle special column names
nm_orig <- names(x[, ind])
x[, ind] <- x[, ind] %>%
data.frame(check.names = TRUE) %>%
my_mice(...) %>%
mice::complete(1) %>%
`colnames<-`(nm_orig)
x
}
dat_dir <- get_gl_dat_dir()
if (!requireNamespace("mice", quietly = TRUE)) {
stop("\n============================================================",
"\nNeed package \"mice\" for this function to work.",
"\n============================================================",
call. = FALSE)
}
id <- rlang::as_string(rlang::enexpr(id))
if (id %in% c("pep_seq", "pep_seq_mod")) {
src_path <- file.path(dat_dir, "Peptide", "Peptide.txt")
filename <- file.path(dat_dir, "Peptide", "Peptide_impNA.txt")
} else if (id %in% c("prot_acc", "gene")) {
src_path <- file.path(dat_dir, "Protein", "Protein.txt")
filename <- file.path(dat_dir, "Protein", "Protein_impNA.txt")
}
if ((!file.exists(filename)) || overwrite) {
df <- tryCatch(read.csv(src_path, check.names = FALSE,
header = TRUE, sep = "\t",
comment.char = "#"),
error = function(e) NA)
if (!is.null(dim(df))) {
message(paste("File loaded:", src_path))
} else {
stop("File or directory not found: ", src_path,
call. = FALSE)
}
df[, grep("N_log2_R", names(df))] <-
handleNA(df[, grep("N_log2_R", names(df))], ...)
fn_params <- file.path(dat_dir, "Protein/Histogram", "MGKernel_params_N.txt")
if (file.exists(fn_params)) {
cf_SD <- read.csv(fn_params, check.names = FALSE,
header = TRUE, sep = "\t", comment.char = "#") %>%
dplyr::filter(!duplicated(Sample_ID)) %>%
dplyr::select(Sample_ID, fct)
df[, grep("Z_log2_R", names(df))] <-
mapply(normSD,
df[, grepl("^N_log2_R[0-9]{3}", names(df))],
center = 0, SD = cf_SD$fct, SIMPLIFY = FALSE) %>%
data.frame(check.names = FALSE) %>%
`colnames<-`(gsub("N_log2", "Z_log2", names(.)))
} else {
df[, grep("Z_log2_R", names(df))] <- df[, grep("Z_log2_R", names(df))] %>%
handleNA(...)
}
if (any(duplicated(df[[id]]))) {
if (id == "pep_seq") {
warning("\`pep_seq\` is not uqique for rownames; ",
"uses \`pep_seq_mod\` instead.\n",
call. = FALSE)
rownames(df) <- df[["pep_seq_mod"]]
}
if (id == "gene") {
warning("\`gene\` is not uqique for rownames; ",
"uses \`prot_acc\` instead.\n",
call. = FALSE)
rownames(df) <- df[["prot_acc"]]
}
} else {
rownames(df) <- df[[id]]
}
write.table(df, filename, sep = "\t", col.names = TRUE, row.names = FALSE)
} else {
warning("NA imputation has been previously performed!\n", call. = FALSE)
}
}
#'Imputes NA values for peptide data
#'
#'\code{pepImp} is a wrapper of \code{\link{imputeNA}} for peptide data with
#'auto-determination of \code{id}.
#'
#'@rdname imputeNA
#'
#' @examples
#' \donttest{
#' # ===================================
#' # Peptide NA imputation
#' # ===================================
#' pepImp(
#' m = 3,
#' maxit = 3,
#' )
#' }
#'
#'@export
pepImp <- function (...)
{
check_dots(c("id"), ...)
id <- match_call_arg(normPSM, group_psm_by)
imputeNA(id = !!id, ...)
}
#'Impute NA values for protein data
#'
#'\code{prnImp} is a wrapper of \code{\link{imputeNA}} for protein data with
#'auto-determination of \code{id}.
#'
#'@rdname imputeNA
#'
#' @examples
#' \donttest{
#' # ===================================
#' # Protein NA imputation
#' # ===================================
#' prnImp(
#' m = 5,
#' maxit = 5,
#' )
#' }
#'
#'@export
prnImp <- function (...)
{
check_dots(c("id"), ...)
id <- match_call_arg(normPSM, group_pep_by)
imputeNA(id = !!id, ...)
}
#' Species lookup
#'
#' @inheritParams load_dbs
sp_lookup <- function(species)
{
switch(species,
human = "hs",
mouse = "mm",
rat = "rn",
unknown = "unknown",
species)
}
#' Taxonomy lookup
#' @inheritParams load_dbs
taxid_lookup <- function(species)
{
switch (species,
human = 9606,
mouse = 10090,
rat = 10116,
unknown = 999999,
999999)
}
#' Reversed taxonomy lookup
#' @inheritParams load_dbs
taxid_lookup_rev <- function(species)
{
switch (species,
"9606" = "human",
"10090" = "mouse",
"10116" = "rat",
"unknown")
}
#' Species lookup UpperLower (Ul)
#'
#' @inheritParams load_dbs
sp_lookup_Ul <- function(species)
{
switch(species,
human = "Hs",
mouse = "Mm",
rat = "Rn",
unknown = "Unknown",
"unknown")
}
#' Adds gene IDs for RefSeq
#'
#' @inheritParams add_entrez
add_refseq_gene <- function (acc_lookup, acc_type)
{
sp_map <- c(
human = "hs",
mouse = "mm",
rat = "rn"
)
acc_map <- c(
uniprot_acc = "uniprot_",
uniprot_id = "uniprot_",
refseq_acc = "refseq_"
)
species <- unique(acc_lookup$species) %>% .[!is.na(.)]
abbr_sp <- sp_map[species]
abbr_acc <- acc_map[acc_type]
stopifnot(acc_type == "refseq_acc")
entrez_key <- switch(acc_type,
uniprot_acc = "uniprot_acc",
uniprot_id = "uniprot_acc",
refseq_acc = "refseq_acc",
other_acc = "other_acc",
stop("Unknown `accession type`.",
Call. = FALSE))
# (`species` can be empty)
if (length(species) && all(species %in% c("human", "mouse", "rat"))) {
filelist <- paste0(abbr_acc, "entrez_", abbr_sp)
data(package = "proteoQ", list = filelist, envir = environment())
entrez_db <- purrr::map(filelist, ~ get(.x)) %>%
dplyr::bind_rows() %>%
dplyr::filter(!duplicated(.[[entrez_key]])) %>%
dplyr::mutate(entrez = as.numeric(entrez),
!!entrez_key := as.character(!!rlang::sym(entrez_key))) %>%
dplyr::select(entrez_key, "gene")
acc_lookup <- dplyr::left_join(acc_lookup, entrez_db,
by = c("prot_acc" = entrez_key))
} else {
acc_lookup <- acc_lookup %>%
dplyr::mutate(gene = NA_character_)
}
invisible(acc_lookup)
}
#' Adds Entrez IDs
#'
#' @param acc_lookup A data frame of protein accession lookups
#' @param acc_type The type of protein accessions
#' @inheritParams parse_fasta
add_entrez <- function (acc_lookup, acc_type, warns = TRUE)
{
sp_map <- c(
human = "hs",
mouse = "mm",
rat = "rn"
)
acc_map <- c(
uniprot_acc = "uniprot_",
uniprot_id = "uniprot_",
refseq_acc = "refseq_"
)
def_sps <- c("human", "mouse", "rat")
species <- unique(acc_lookup$species) %>% .[!is.na(.)]
all_ok_species <- all(species %in% def_sps)
if (length(species) && (!all_ok_species)) {
cat("\n")
if (warns) {
warning("No default `entrez` lookups for species other than: ",
paste(def_sps, collapse = ", "), ".\n",
"To annotate, provide the file path and name(s) via argument `entrez`.\n",
"See also `?Uni2Entrez` and `?Ref2Entrez` for custom `entrez` lookups.",
call. = FALSE)
if (!all_ok_species) {
cat("\n")
warning("At `entrez = NULL`, no entrez annoation of ",
"non-default species: \n",
paste(species[!species %in% def_sps], collapse = ", "),
call. = FALSE)
}
}
species <- species %>% .[. %in% def_sps]
}
if (!length(species)) {
warning("No default species found for entrez annotation at ",
acc_type, ".",
call. = FALSE)
acc_lookup <- acc_lookup %>% dplyr::mutate(entrez = NA_integer_)
return(acc_lookup)
}
abbr_sp <- sp_map[species] %>% .[!is.na(.)]
abbr_acc <- acc_map[acc_type]
entrez_key <- switch(acc_type,
uniprot_acc = "uniprot_acc",
uniprot_id = "uniprot_acc",
refseq_acc = "refseq_acc",
other_acc = "other_acc",
stop("Unknown `accession type`.", Call. = FALSE))
# multiple uniprot_acc(s) can share the same entrez id
filelist <- paste0(abbr_acc, "entrez_", abbr_sp)
# four columns:
# 1. uniprot_acc/refseq_acc/other_acc 2. gene 3. entrez 4. species
data(package = "proteoQ", list = filelist, envir = environment())
db <- purrr::map(filelist, ~ get(.x)) %>%
dplyr::bind_rows() %>%
dplyr::filter(!duplicated(.[[entrez_key]])) %>%
dplyr::mutate(entrez = as.integer(entrez),
!!entrez_key := as.character(!!rlang::sym(entrez_key))) %>%
dplyr::select(entrez_key, "entrez")
acc_lookup <- dplyr::left_join(acc_lookup, db, by = entrez_key)
}
#' Adds custom Entrez IDs and optional overwriting species
#'
#' @inheritParams add_entrez
#' @inheritParams normPSM
#' @inheritParams annotKin
add_custom_entrez <- function(acc_lookup, entrez, acc_type)
{
if (!all(file.exists(entrez)))
stop("Wrong `entrez` file path(s) or name(s).",
"\nMake sure that the file type is `.rds`, not `.rda`",
call. = FALSE)
entrez_key <- switch(acc_type,
uniprot_acc = "uniprot_acc",
uniprot_id = "uniprot_acc",
refseq_acc = "refseq_acc",
other_acc = "other_acc",
stop("Unknown `accession type`.",
Call. = FALSE))
cols_ess <- c(entrez_key, "entrez")
cols_out <- c(entrez_key, "entrez", "species")
db <- purrr::map(entrez, ~ {
db <- readRDS(.x)
if (!all(cols_ess %in% names(db)))
stop("Need columns '",
purrr::reduce(cols_ess, paste, sep = ", "),
"' in ", .x, ".",
call. = FALSE)
if (!length(db))
stop("Empty file: ", .x, call. = FALSE)
db %>%
dplyr::select(which(names(.) %in% cols_out)) %>%
{ if ("species" %in% names(.)) . else .[["species"]] <- NA_character_ } %>%
dplyr::mutate(entrez = as.integer(entrez)) %>%
dplyr::select(cols_out)
}) %>%
dplyr::bind_rows()
# higher priority for custom "species" in `db`
if (all(is.na(db$species)))
db[["species"]] <- NULL
else
acc_lookup[["species"]] <- NULL
acc_lookup %>% dplyr::left_join(db, by = entrez_key)
}
#' Determines the protein accession type from PSM tables
#'
#' Find the protein accession from a non-cRAP entry and parse it.
#'
#'@param df An input data frame.
parse_acc <- function(df)
{
stopifnot("prot_acc" %in% names(df))
prot_accs <- unique(df$prot_acc)
pat_uni_acc <-
"^[OPQ][0-9][A-Z0-9]{3}[0-9]|^[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pat_uni_id <-
"^[[:alnum:]]+_[A-Z]{1,10}$"
pat_ref_acc <-
"^[XNY]{1}[MRP]{1}_"
pat_uni_both <- "^..\\|[[:alnum:]]+\\|[^_]+_[A-Z]{1,10}$"
acc_types <- purrr::map_chr(prot_accs, ~ {
if (grepl(pat_uni_id, .x))
"uniprot_id"
else if (grepl(pat_ref_acc, .x))
"refseq_acc"
else if (grepl(pat_uni_acc, .x))
"uniprot_acc"
else
"other_acc"
}, prot_accs)
df %>%
dplyr::left_join(data.frame(prot_acc = prot_accs, acc_type = acc_types),
by = "prot_acc")
}
#' Parses FASTA for accession lookups
#'
#' NA genes are replaced with prot_acc
#'
#' @param df An input data frame.
#' @param warns Logical; if TRUE, show warning message(s).
#' @inheritParams add_entrez
#' @inheritParams normPSM
#' @seealso \code{\link{read_fasta}} for the definition of fasta_name(s).
#' @return A lookup table, \code{acc_lookup}.
parse_fasta <- function (df, fasta, entrez, warns = TRUE)
{
my_lookup <- c(
"Homo sapiens" = "human",
"Mus musculus" = "mouse",
"Rattus norvegicus" = "rat")
pat_uni_acc <- "^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pat_uni_id <- "^[[:alnum:]]+_[A-Z]{1,10}$"
pat_ref_acc <- "^[XNY]{1}[MRP]{1}_"
dat_dir <- get_gl_dat_dir()
# =======================================================================================
# Two assumptions:
# (a) `acc_type` parsed from `prot_acc` with rules of UniProt accession etc.,
# and must be in one of c("uniprot_acc", "uniprot_id", "refseq_acc", "other_acc")
# with NA `acc_type` being coerced to "other_acc" (see `na_acc_type_to_other()`)
#
# (b) In acc_lookup, `fasta_name` <- `names(fasta)`
# (see also `read_fasta()`)
# =======================================================================================
# Normally handled `acc_type`s:
# (1) at `acc_type` %in% c("uniprot_acc", "uniprot_id", "refseq_acc")
# read_fasta(">([^ ]+?) .*", ...)
# -> fasta_name = "sp|Q9H553|ALG2_HUMAN"
# -> "Q9H553" at acc_type == uniprot_acc
# -> "ALG2_HUMAN" at acc_type == uniprot_id
# -> NP_001254479 titin isoform...
# -> "NP_001254479" at acc_type == refseq_acc
#
# Two special cases:
# (a) if the proteoQ fasta does not include all the entries in the database-search fasta,
# such `df$prot_acc` will not be in `acc_lookup` and
# annotated as NA under columns fasta_name, species etc. after data joining;
# (possible coercion: NA fasta_name <- prot_acc)
#
# (b) only at "acc_type == other_acc",
# if different parsing in making `df$prot_acc` and `names(fasta)`,
# such `df$prot_acc` will be annotated as NA in species etc.
#
# when making `acc_lookup`, organism etc. set to `NA_character_`, not "other_acc";
# this make consistent NA organism after after data joining for `df$prot_acc`s not
# found in `acc_lookup`.
# =======================================================================================
# Protein attributes:
# `fasta_name` kept in `df` for matching with `fasta_db`
# =======================================================================================
# add column `acc_type`; note that `df` not saved
if (!nrow(df))
warning("Zero row of input data.")
df <- df %>%
parse_acc() %>%
dplyr::filter(nchar(prot_acc) > 0L)
# load fasta_db
fasta_db <- load_fasta(fasta) %>%
`names<-`(gsub("\\.[0-9]*$", "", names(.)))
if (!length(fasta_db))
warning("FASTA databases are empty.")
# accession lookups by acc_type's
df_splits <- df %>%
split(.$acc_type, drop = TRUE)
acc_lookup <- lapply(df_splits, hparse_fasta,
fasta_db, pat_uni_acc, pat_uni_id, pat_ref_acc,
fasta, warns) %>%
dplyr::bind_rows()
# --- subset and annotate fasta_db with entries in acc_lookup ---
fasta_db <- local({
fasta_db <- fasta_db %>%
.[names(.) %in% unique(acc_lookup$fasta_name)] %>%
.[! duplicated(names(.))]
if (!length(fasta_db))
stop("No fasta entries match protein accessions; probably wrong fasta file.",
call. = FALSE)
fasta_db <- lapply(unique(df$acc_type), add_prot_desc,
acc_lookup = acc_lookup,
fasta_db = fasta_db) %>%
do.call(`c`, .) %>%
.[!duplicated(names(.))]
save(fasta_db, file = file.path(dat_dir, "fasta_db.rda"))
fasta_db
})
# -- add columns prot_desc, prot_mass, prot_len ---
acc_lookup <- local({
more <- dplyr::bind_cols(
prot_acc = unname(purrr::map_chr(fasta_db, attr, "prot_acc")),
prot_desc = unname(purrr::map_chr(fasta_db, attr, "prot_desc")),
prot_mass = unname(purrr::map_dbl(fasta_db, ~ calc_avgpep(.x))),
prot_len = unname(nchar(fasta_db)),
) %>%
dplyr::filter(!duplicated(prot_acc)) %>%
dplyr::mutate(prot_mass = round(prot_mass, digits = 0))
dplyr::left_join(acc_lookup, more, by = "prot_acc")
})
# -- add columns gene, organism, species, entrez ---
df_splits <- df %>%
split(.$acc_type, drop = TRUE)
acc_lookup <- purrr::map(df_splits, ~ {
acc_type <- .x[["acc_type"]][1]
if (acc_type %in% c("uniprot_acc", "uniprot_id")) {
genes <- local({
genes <- acc_lookup %>% dplyr::select(prot_acc, prot_desc)
na_genes <- genes %>%
dplyr::filter(!grepl("GN=", .$prot_desc)) %>%
dplyr::mutate(gene = NA_character_)
genes <- genes %>%
dplyr::filter(grepl("GN=", .$prot_desc)) %>%
dplyr::mutate(gene = gsub("^.*GN=(\\S+)\\s*.*", "\\1", prot_desc)) %>%
dplyr::bind_rows(., na_genes) %>%
na_genes_by_acc()
})
acc_lookup <- local({
na_org <- genes %>%
dplyr::filter(!grepl("OS=", .$prot_desc)) %>%
dplyr::mutate(organism = NA_character_)
# adds gene and organism
acc_lookup <- genes %>%
dplyr::filter(grepl("OS=", .$prot_desc)) %>%
dplyr::mutate(organism = gsub("^.*OS=(.*?)=.*$", "\\1", prot_desc)) %>%
dplyr::mutate(organism = gsub("\\s\\S*$", "", organism)) %>%
dplyr::bind_rows(., na_org) %>%
dplyr::select(-prot_desc) %>%
dplyr::right_join(acc_lookup, by = "prot_acc")
# adds species
acc_lookup <- acc_lookup %>%
dplyr::mutate(species = my_lookup[.$organism]) %>%
na_species_by_org()
})
if (!nrow(acc_lookup)) {
warning("\nNo matches in protein accessions at `acc_type = ", acc_type, "`.\n",
"Fix the FASTA before running into errors",
call. = FALSE)
}
# adds entrez
acc_lookup <- if (is.null(entrez))
add_entrez(acc_lookup, acc_type, warns)
else
add_custom_entrez(acc_lookup, entrez, acc_type)
acc_lookup <- acc_lookup %>%
dplyr::filter(!is.na(.[["uniprot_acc"]]))
}
else if (acc_type == "refseq_acc") {
# adds organism and species
acc_lookup <- acc_lookup %>%
dplyr::filter(grepl("^[XNY]{1}[MRP]{1}_", prot_acc)) %>%
dplyr::mutate(organism = gsub("^.*\\[(.*)\\]\\.*.*", "\\1", prot_desc)) %>%
dplyr::mutate(species = my_lookup[.$organism]) %>%
na_species_by_org()
if (!nrow(acc_lookup)) {
warning("\nNo matches in protein accessions at `acc_type = ", acc_type, "`.\n",
"Fix the FASTA before running into an error such as ",
"no species in gene name annoation.",
call. = FALSE)
}
# adds entrez
acc_lookup <- if (is.null(entrez))
add_entrez(acc_lookup, acc_type, warns)
else
add_custom_entrez(acc_lookup, entrez, acc_type)
# separately column gene
acc_lookup <- add_refseq_gene(acc_lookup, acc_type)
acc_lookup <- acc_lookup %>%
dplyr::filter(!is.na(.[["refseq_acc"]]))
}
else {
acc_lookup <- acc_lookup %>%
dplyr::filter(acc_type == "other_acc") %>%
dplyr::mutate(gene = NA_character_,
organism = NA_character_,
species = NA_character_,
entrez = NA_integer_)
}
acc_lookup <- acc_lookup %>%
dplyr::select(c("prot_acc", "gene", "organism", "prot_desc",
"prot_mass", "prot_len", "fasta_name", "uniprot_acc",
"uniprot_id", "refseq_acc", "other_acc", "acc_type",
"entrez", "species")) %>%
na_genes_by_acc()
}) %>%
dplyr::bind_rows() %>%
reloc_col_after("acc_type", "species")
save(acc_lookup, file = file.path(dat_dir, "acc_lookup.rda"))
invisible(acc_lookup)
}
#' Helper of \link{parse_fasta} by accession type.
#'
#' NA genes are replaced with prot_acc.
#'
#' @param df_sub An input data frame with a single type of protein accession.
#' @param fasta A fasta database.
#' @inheritParams parse_fasta
hparse_fasta <- function (df_sub, fasta_db, pat_uni_acc, pat_uni_id, pat_ref_acc,
fasta, warns)
{
acc_type <- df_sub[["acc_type"]][1]
if (acc_type %in% c("uniprot_acc", "uniprot_id")) {
acc_lookup <- names(fasta_db) %>%
gsub("^..\\|", "", .) %>%
stringr::str_split("\\|", simplify = TRUE) %>%
data.frame(check.names = FALSE) %>%
dplyr::select(1:2) %>%
`names<-`(c("uniprot_acc", "uniprot_id")) %>%
dplyr::bind_cols(data.frame(fasta_name = names(fasta_db)), .) %>%
dplyr::filter(grepl(pat_uni_acc, uniprot_acc),
grepl(pat_uni_id, uniprot_id)) %>%
dplyr::filter(.[[acc_type]] %in% unique(df_sub$prot_acc)) %>%
dplyr::filter(!duplicated(.[[acc_type]])) %>%
dplyr::mutate(refseq_acc = NA_character_,
other_acc = NA_character_,
acc_type = acc_type,
prot_acc = !!rlang::sym(acc_type))
}
else if (acc_type == "refseq_acc") {
unique_accs <- unique(df_sub$prot_acc)
unique_accs <- gsub("\\.[0-9]+$", "", unique_accs)
acc_lookup <- tibble::tibble(fasta_name = names(fasta_db),
refseq_acc = names(fasta_db)) %>%
dplyr::filter(grepl(pat_ref_acc, refseq_acc)) %>%
dplyr::filter(.[[acc_type]] %in% unique_accs) %>%
dplyr::filter(!duplicated(.[[acc_type]])) %>%
dplyr::mutate(uniprot_acc = NA_character_,
uniprot_id = NA_character_,
other_acc = NA_character_,
acc_type = acc_type,
prot_acc = !!rlang::sym(acc_type)) %>%
dplyr::select(c("fasta_name", "uniprot_acc", "uniprot_id", "refseq_acc",
"other_acc", "acc_type", "prot_acc"))
}
else if (acc_type == "other_acc") {
acc_lookup <- tibble::tibble(fasta_name = names(fasta_db),
other_acc = names(fasta_db)) %>%
dplyr::filter(other_acc %in% unique(df_sub$prot_acc)) %>%
dplyr::filter(!duplicated(other_acc)) %>%
dplyr::mutate(uniprot_acc = NA_character_,
uniprot_id = NA_character_,
refseq_acc = NA_character_,
acc_type = acc_type,
prot_acc = other_acc) %>%
dplyr::select(c("fasta_name", "uniprot_acc", "uniprot_id", "refseq_acc",
"other_acc", "acc_type", "prot_acc"))
}
n_fasta <- length(fasta_db)
n_lookup <- nrow(acc_lookup)
perc <- n_lookup/n_fasta
message("At acc_type = \"", acc_type, "\":\n",
" the number of entries in FASTA: ", n_fasta, "\n",
" the number of entries matched: ", n_lookup, "\n",
" percent annotated: ", format(round(perc, 3), nsmall = 3))
if (acc_type != "other_acc" && perc < 1e-5 && warns) {
warning("\n\n============================================================\n",
"PLEASE READ:\n\n",
"Almost NO protein accessions being annotated.\n",
# paste(fasta, collapse = "\n"), " is `", perc, "`.\n",
"The fasta database(s) are probably incorrect.",
"\n============================================================\n",
call. = FALSE)
}
else if (acc_type != "other_acc" && perc < .1 && warns) {
warning("\n------------------------------------------------------------",
"\nDouble check the choice of fasta(s) for probable mismatches.\n",
"\nA common source of mistakes: ",
"\n choose Uniprot with proteoQ",
"\n but used Refseq in database searches, or vice versa.",
"\n------------------------------------------------------------",
call. = FALSE)
}
invisible(acc_lookup)
}
#' Helper of \link{parse_fasta}.
#'
#' Coerces NA values in genes by the value of accessions.
#'
#' @param acc_type The type of protein accession.
na_genes_by_acc <- function(acc_lookup)
{
if (nrow(acc_lookup) &&
"prot_acc" %in% names(acc_lookup) &&
"gene" %in% names(acc_lookup))
{
na_genes <- (is.na(acc_lookup$gene)) | (stringr::str_length(acc_lookup$gene) == 0)
acc_lookup$gene <- as.character(acc_lookup$gene)
acc_lookup$gene[na_genes] <- as.character(acc_lookup$prot_acc[na_genes])
}
acc_lookup
}
#' Helper of \link{parse_fasta}.
#'
#' Coerces NA values in species by the value of organism.
#'
#' @param acc_type The type of protein accession.
na_species_by_org <- function(acc_lookup)
{
if (nrow(acc_lookup) &&
"organism" %in% names(acc_lookup) &&
"species" %in% names(acc_lookup))
{
na_species <-
(is.na(acc_lookup$species)) | (stringr::str_length(acc_lookup$species) == 0)
acc_lookup$species <- as.character(acc_lookup$species)
acc_lookup$species[na_species] <- as.character(acc_lookup$organism[na_species])
}
acc_lookup
}
#' Helper of \link{parse_fasta}.
#'
#' Adds attributes to fasta entries.
#'
#' @param pat_acc A regular expression of protein accession.
#' @inheritParams add_prot_desc
add_fasta_attrs <- function (pat_acc, acc_type, acc_lookup, fasta_db)
{
fasta_names_sub <- acc_lookup %>%
dplyr::filter(!is.na(!!rlang::sym(acc_type))) %>%
.[["fasta_name"]] %>%
unique()
fasta_db_sub <- fasta_db %>%
.[names(.) %in% fasta_names_sub]
# add attributes
prot_acc <- if(is.null(pat_acc))
names(fasta_db_sub)
else
gsub(pat_acc, "\\1", names(fasta_db_sub))
fasta_db_sub <- purrr::map2(fasta_db_sub, prot_acc, function (fa, pr) {
attr(fa, "prot_acc") <- pr
fa
})
prot_desc <- fasta_db_sub %>%
purrr::map(attr, "header") %>%
gsub("^.*\\|.*\\s+?", "", .)
fasta_db_sub <- purrr::map2(fasta_db_sub, prot_desc, function (fa, pr) {
attr(fa, "prot_desc") <- pr
fa
})
invisible(fasta_db_sub)
}
#' Helper of \link{parse_fasta}.
#'
#' Adds \code{prot_desc} by \code{acc_type}.
#'
#' @param acc_type The type of protein accession.
#' @param acc_lookup A look-up table of protein accessions.
#' @param fasta_db A database of fasta.
add_prot_desc <- function (acc_type, acc_lookup, fasta_db)
{
pat_acc <- if (acc_type == "uniprot_acc")
"^..\\|(.*)\\|.*$"
else if (acc_type == "uniprot_id")
"^.*\\|(.*)$"
else if (acc_type == "refseq_acc")
"^(.*)\\.[0-9]*$"
else
NULL
add_fasta_attrs(pat_acc, acc_type, acc_lookup, fasta_db)
}
#' Adds protein annotation
#'
#' \code{annotPrn} cross-referencing proteins among \code{uniprot_acc},
#' \code{uniprot_id}, \code{refseq} and \code{entrez}.
#'
#' @inheritParams info_anal
#' @inheritParams normPSM
#' @import dplyr purrr stringr
#' @importFrom magrittr %>% %$% %T>%
annotPrn <- function (df, fasta, entrez)
{
na_fasta_name_by_prot_acc <- function(df)
{
if (nrow(df) && ("fasta_name" %in% names(df))) {
na_fasta_name <-
(is.na(df$fasta_name)) | (stringr::str_length(df$fasta_name) == 0)
df$fasta_name[na_fasta_name] <- df$prot_acc[na_fasta_name]
}
invisible(df)
}
na_acc_type_to_other <- function(df)
{
if (nrow(df) && "acc_type" %in% names(df)) {
na_acc_type <-
(is.na(df$acc_type)) | (stringr::str_length(df$acc_type) == 0)
df$acc_type[na_acc_type] <- "other_acc"
}
invisible(df)
}
# currently with MSGF+ outputs at UniProt DB
uniboth <- grepl("^..\\|[[:alnum:]]+\\|[^_]+_[A-Z]{1,10}$", df[["prot_acc"]])
if (sum(uniboth)) {
df[["prot_acc"]][uniboth] <-
gsub("^..\\|([[:alnum:]]+)\\|[^_]+_[A-Z]{1,10}$", "\\1",
df[["prot_acc"]][uniboth])
}
acc_lookup <- parse_fasta(df, fasta, entrez)
acc_lookup <- dplyr::bind_cols(
acc_lookup %>%
dplyr::select(prot_acc),
acc_lookup %>%
dplyr::select(-which(names(.) %in% names(df))) %>%
dplyr::select(-"organism"))
# (1) multiple uniprot_acc(s) can share the same entrez id
# prot_acc gene acc_type uniprot_acc species entrez
# A1AT1_MOUSE Serpina1a uniprot_id P07758 mouse 20703
# A1AT4_MOUSE Serpina1d uniprot_id Q00897 mouse 20703
# (2) same uniprot_acc can have different entrez ids
# From To
# P02088 100503605
# P02088 101488143
# P02088 15129
# (3) ok that some uniprot_accs(s) have no corresponding entrez id
# NA values after joining, for `prot_cc` without corresponding entries in fasta
# except for the coercion to "other_acc" for `acc_type`
# [1] "gene" "prot_len" "fasta_name" "uniprot_acc" "uniprot_id"
# [7] "refseq_acc" "other_acc" "entrez" "species" "acc_type"
df <- df %>%
dplyr::mutate(psm_index = row_number()) %>%
dplyr::left_join(acc_lookup, by = "prot_acc") %>%
dplyr::filter(!duplicated(psm_index)) %>%
na_acc_type_to_other() %>%
# na_fasta_name_by_prot_acc() %>%
dplyr::select(-psm_index) %>%
# dplyr::ungroup() %>%
reloc_col_after("prot_mass", "prot_acc")
invisible(df)
}
#' Adds kinase annotation
#'
#' @inheritParams info_anal
#' @param acc_type Character string; the type of protein accessions in one of
#' c("refseq_acc", "uniprot_acc", "uniprot_id")
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
annotKin <- function (df, acc_type)
{
if (is.null(df))
return(df)
stopifnot ("prot_acc" %in% names(df))
data(package = "proteoQ", kinase_lookup, envir = environment())
if (!acc_type %in% names(kinase_lookup)) {
df <- df %>%
dplyr::mutate(kin_attr = FALSE,
kin_class = NA_character_,
kin_order = NA_integer_)
return(df)
}
lookup <- kinase_lookup %>%
dplyr::select(acc_type, kin_attr, kin_class, kin_order) %>%
dplyr::filter(!duplicated(.)) %>%
dplyr::filter(!is.na(.[[acc_type]])) %>%
dplyr::mutate(!!acc_type := as.character(!!rlang::sym(acc_type)))
df <- df %>%
dplyr::left_join(lookup, by = c("prot_acc" = acc_type))
df$kin_attr[is.na(df$kin_attr)] <- FALSE
invisible(df)
}
#' Saves the arguments in a function call
#'
#' @param call_pars Language.
#' @param fn The name of function being saved.
#'
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
save_call <- function(call_pars, fn)
{
dat_dir <- get_gl_dat_dir()
dir.create(file.path(dat_dir, "Calls"), recursive = TRUE, showWarnings = FALSE)
call_pars[names(call_pars) == "..."] <- NULL
save(call_pars, file = file.path(dat_dir, "Calls", paste0(fn, ".rda")))
}
#' Matches formulas to those in calls to pepSig or prnSig
#'
#' @param formulas Language; the formulas in linear modeling.
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
match_fmls <- function(formulas)
{
dat_dir <- get_gl_dat_dir()
fml_file <- file.path(dat_dir, "Calls/pepSig_formulas.rda")
if (file.exists(fml_file))
load(file = fml_file)
else
stop("Run `pepSig()` first.")
fml_chr <- formulas %>%
as.character() %>%
gsub("\\s+", "", .)
pepSig_chr <- pepSig_formulas %>%
purrr::map(~ .[is_call(.)]) %>%
as.character() %>%
gsub("\\s+", "", .)
ok <- purrr::map_lgl(fml_chr, ~ . %in% pepSig_chr)
if (!all(ok))
stop("Formula match failed: ", formulas[[which(!ok)]],
" not found in the latest call to 'pepSig(...)'.",
call. = FALSE)
}
#' Matches the arg to anal_prnGSPA
#'
#' @param call_rda the name of a rda.
#' @param arg Argument to be matched.
#'
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
match_call_arg <- function (call_rda = "foo", arg = "scale_log2r")
{
dat_dir <- get_gl_dat_dir()
call_rda <- rlang::as_string(rlang::enexpr(call_rda))
arg <- rlang::as_string(rlang::enexpr(arg))
rda <- paste0(call_rda, ".rda")
file <- file.path(dat_dir, "Calls", rda)
if (!file.exists(file))
stop(rda, " not found.")
load(file = file)
if (!arg %in% names(call_pars))
stop(arg, " not found in the latest call to ", call_rda,
call. = FALSE)
call_pars[[arg]]
}
#' Matches gset_nms to prnGSPA.
#'
#' Not currently used.
#'
#' @inheritParams prnGSPA
match_gset_nms <- function (gset_nms = NULL)
{
dat_dir <- get_gl_dat_dir()
file <- file.path(dat_dir, "Calls/anal_prnGSPA.rda")
if (is.null(gset_nms)) {
if (file.exists(file)) {
gset_nms <- match_call_arg(anal_prnGSPA, gset_nms)
} else {
stop("`gset_nms` is NULL and ", file, " not found.",
call. = FALSE)
}
}
else {
if (file.exists(file)) {
# gset_nms <- gset_nms %>% .[. %in% match_call_arg(anal_prnGSPA, gset_nms)]
gset_nms <- gset_nms %>% .[. %in% match_call_arg(anal_prnGSPA, .)]
} else {
warning("File `", file, "`` not available for `gset_nms` matches.",
"\nUse the `gset_nms` value as it.",
call. = FALSE)
}
}
if (is.null(gset_nms)) {
stop ("The `gset_nms` is NULL after matching parameters to the latest `prnGSPA(...)`.",
"\n\tConsider providing explicitly the `gset_nms`: ",
"\n\t\t`gset_nms = \"go_sets\"` or ",
"\n\t\t`gset_nms = \"~/proteoQ/dbs/go_hs.rds\"` ...",
call. = FALSE)
}
if (!length(gset_nms)) {
stop ("The `gset_nms` is EMPTY after parameter matching.",
"\n\tCheck the values of `gset_nms` in the latest call to `prnGSPA(...)`.",
call. = FALSE)
}
return(gset_nms)
}
#' A match to a global logical variable
#'
#' @param var Character string representation of a variable.
#' @param val The value of \code{var} before matching.
#' @examples
#' \donttest{
#' foo <- function(scale_log2r = FALSE) {
#' match_logi_gv("scale_log2r", scale_log2r)
#' }
#'
#' scale_log2r <- TRUE
#' foo()
#' }
match_logi_gv <- function(var, val)
{
oval <- val
ok <- exists(var, envir = .GlobalEnv)
gvar <- if (ok) get(var, envir = .GlobalEnv) else NULL
if (is.null(gvar))
val
else {
if (!is.logical(gvar))
stop("Argument \"", var, "\" is not logical.")
if ((!is.na(gvar)) && (gvar != oval)) {
warning("Coerce ", var, " to ", gvar,
" after matching to the global setting.",
call. = FALSE)
}
gvar
}
}
#' Matches scale_log2r
#'
#' \code{match_prnSig_scale_log2r} matches the value of \code{scale_log2r} to the value
#' in the most recent prnSig at a given impute_na status
#'
#' @inheritParams prnHM
match_prnSig_scale_log2r <- function(scale_log2r = TRUE, impute_na = FALSE)
{
stopifnot(rlang::is_logical(scale_log2r), rlang::is_logical(impute_na))
mscale_log2r <- if (impute_na)
match_call_arg(prnSig_impTRUE, scale_log2r)
else
match_call_arg(prnSig_impFALSE, scale_log2r)
if ((!is.na(mscale_log2r)) && (scale_log2r != mscale_log2r)) {
warning("scale_log2r = ", mscale_log2r,
" after matching to prnSig(impute_na = ", impute_na, ", ...)",
call. = FALSE)
}
mscale_log2r
}
#' Matches scale_log2r
#'
#' \code{match_pepSig_scale_log2r} matches the value of \code{scale_log2r} to the value
#' in the most recent pepSig at a given impute_na status.
#' @inheritParams prnHM
match_pepSig_scale_log2r <- function(scale_log2r = TRUE, impute_na = FALSE)
{
stopifnot(rlang::is_logical(scale_log2r), rlang::is_logical(impute_na))
mscale_log2r <- if (impute_na)
match_call_arg(pepSig_impTRUE, scale_log2r)
else
match_call_arg(pepSig_impFALSE, scale_log2r)
if ((!is.na(mscale_log2r)) && (scale_log2r != mscale_log2r)) {
warning("scale_log2r = ", mscale_log2r,
" after matching to pepSig(impute_na = ", impute_na, ", ...)",
call. = FALSE)
}
mscale_log2r
}
#' Replaces NA genes (not currently used)
#'
#' @inheritParams info_anal
#' @inheritParams annotKin
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
replace_na_genes <- function(df, acc_type)
{
acc_type <- tolower(acc_type)
if (acc_type == "refseq_acc") {
na_gene <- (is.na(df[, c("gene")])) | (str_length(df$gene) == 0)
df$gene <- as.character(df$gene)
df$gene[na_gene] <- as.character(df$prot_acc[na_gene])
}
else if (acc_type == "uniprot_id") {
temp <- data.frame(do.call('rbind',
strsplit(as.character(df$prot_desc),
'GN=',
fixed = TRUE))) %>%
dplyr::select(2) %>%
`colnames<-`("gene") %>%
dplyr::mutate(gene = gsub("PE\\=.*", "", gene)) %>%
dplyr::mutate(gene = gsub("\\s+.*", "", gene))
na_gene <- is.na(df$gene)
df[na_gene, c("gene")] <- temp[na_gene, c("gene")]
rm(list = c("temp"))
df <- df %>%
dplyr::mutate(gene = gsub(".*\\|", "", gene))
}
invisible(df)
}
#' Replaces NA genes
#'
#' @param df A data frame.
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
na_genes_by_acc <- function(df)
{
stopifnot("prot_acc" %in% names(df), "gene" %in% names(df))
na_genes <- (is.na(df$gene)) | (stringr::str_length(df$gene) == 0)
df$gene <- as.character(df$gene)
df$gene[na_genes] <- as.character(df$prot_acc[na_genes])
invisible(df)
}
#' Finds peptide start and end positions
#'
#' \code{find_pep_pos} finds the start and the end positions of peptides in
#' ascribed proteins description based on the \code{fasta}.
#'
#' @param fasta_name The fasta name
#' @param pep_seq Peptide sequence
#' @param fasta The database of fasta
#' @import dplyr purrr stringr tidyr
#' @importFrom magrittr %>% %T>% %$% %<>%
#'
#' @examples
#' \donttest{
#' fasta_name <- "NP_612429"
#' pep_seq <- "ADVSLPSMQGDLK"
#' fasta <- read_fasta("~/proteoQ/dbs/fasta/refseq/refseq_hs_2013_07.fasta")
#' x <- find_pep_pos(fasta_name, pep_seq, fasta)
#'
#' pep_seq <- "ADVSLPSMQGDL"
#' x <- find_pep_pos(fasta_name, pep_seq, fasta)
#' }
find_pep_pos <- function (fasta_name, pep_seq, fasta)
{
if (is.na(fasta_name)) {
out <- cbind(pep_seq = pep_seq, pep_res_before = NA_character_,
start = NA_integer_, end = NA_integer_,
pep_res_after = NA_character_, fasta_name = fasta_name,
pep_istryptic = NA, pep_semitryptic = NA_character_)
return(out)
}
this_fasta <- fasta[names(fasta) == fasta_name]
len_fasta <- length(this_fasta)
pep_seq <- as.character(pep_seq)
if (len_fasta == 1L) {
pep_pos_all <- stringr::str_locate_all(this_fasta, pattern = pep_seq)
pep_pos_all <- pep_pos_all[[1]]
# can have no matches if the fasta is different to what are used in
# database searches; i.e., the fasta sequence may be altered.
n_rows <- nrow(pep_pos_all)
if (!n_rows) {
warning(pep_seq, " not found in ", fasta_name, ".\n",
"Probably different fastas between database searches and proteoQ.",
call. = FALSE)
out <- cbind(pep_seq = pep_seq, pep_res_before = NA_character_,
start = NA_integer_, end = NA_integer_,
pep_res_after = NA_character_, fasta_name = fasta_name,
pep_istryptic = NA, pep_semitryptic = NA_character_)
return(out)
}
cts <- nts <- logical(n_rows)
for (i in seq_len(n_rows)) {
pep_pos_i <- pep_pos_all[i, ]
pos_bf <- pep_pos_i[1] - 1L
pos_af <- pep_pos_i[2] + 1L
pep_res_before <- stringr::str_sub(this_fasta, pos_bf, pos_bf)
pep_res_after <- stringr::str_sub(this_fasta, pos_af, pos_af)
# Mascot specialty of "X" residues (see also aa_residues.rda)
# prot_acc: "XP_003960355", original "QERFCQXK" becomes "QERFCQVK"
if (is.na(pep_res_before) || is.na(pep_res_after)) {
out <- cbind(pep_seq = pep_seq, pep_res_before = NA_character_,
start = NA_integer_, end = NA_integer_,
pep_res_after = NA_character_, fasta_name = fasta_name,
pep_istryptic = NA, pep_semitryptic = NA_character_)
return(out)
}
if (nchar(pep_res_before) == 0L) pep_res_before <- "-"
if (nchar(pep_res_after) == 0L) pep_res_after <- "-"
# N-term M:
# pep_res_before == "M" && pep_pos_i[1] == 2L
# Not N-term M:
# pep_res_before != "M" || pep_pos_i[1] != 2L
ok_ct <- grepl("[KR]$", pep_seq) || pep_res_after == "-"
is_nt <- pep_res_before %in% c("K", "R", "-")
is_nm <- pep_res_before == "M" && pep_pos_i[1] == 2L
ok_nt <- is_nt || is_nm
if (ok_ct && ok_nt) {
out <- cbind(pep_seq = pep_seq,
pep_res_before = pep_res_before,
start = pep_pos_i[[1]], end = pep_pos_i[[2]],
pep_res_after = pep_res_after, fasta_name = fasta_name,
pep_istryptic = TRUE, pep_semitryptic = "both")
return(out)
}
else {
cts[[i]] <- ok_ct
nts[[i]] <- ok_nt
}
}
## no full-tryptic matches
semi_tryp <- "none"
for (k in seq_along(nts)) {
if (nts[[k]]) {
semi_tryp <- "nterm"
break
}
else if (cts[[k]]) {
semi_tryp <- "cterm"
break
}
}
pep_pos_i <- pep_pos_all[k, ]
out <- cbind(pep_seq = pep_seq,
pep_res_before = pep_res_before,
start = pep_pos_i[[1]], end = pep_pos_i[[2]],
pep_res_after = pep_res_after, fasta_name = fasta_name,
pep_istryptic = FALSE, pep_semitryptic = semi_tryp)
}
else if (len_fasta == 0L) {
out <- cbind(pep_seq = pep_seq,
pep_res_before = NA_character_,
start = NA_integer_,
end = NA_integer_,
pep_res_after = NA_character_,
fasta_name = fasta_name,
pep_istryptic = NA, pep_semitryptic = NA_character_)
}
else {
stop("More than one matched fasta_name.")
}
invisible(out)
}
#' Annotation of peptide positions and adjacent amino acid residues
#'
#' \code{annotPeppos} annotates the start and the end positions of peptides in
#' ascribed proteins description based on the \code{fasta}. It also annotates the
#' preceding and the following AA residues.
#'
#' @param df A data frame
#' @import dplyr purrr stringr tidyr
#' @importFrom magrittr %>% %T>% %$% %<>%
annotPeppos <- function (df)
{
stopifnot(all(c("fasta_name", "pep_seq") %in% names(df)))
file <- file.path(dat_dir, "fasta_db.rda")
if (!file.exists(file))
stop("File not found: ", file, call. = FALSE)
load(file = file)
# ok cases that same `pep_seq` but different `prot_acc`
# (K) MENGQSTAAK (L) NP_510965
# (-) MENGQSTAAK (L) NP_001129505
df <- df %>%
dplyr::mutate(pep_prn = paste(pep_seq, fasta_name, sep = "@"))
df_pep_prn <- df %>%
dplyr::filter(!duplicated(pep_prn)) %>%
dplyr::select(c("pep_seq", "fasta_name"))
fasta_db_sub <- fasta_db %>%
.[names(.) %in% unique(df_pep_prn$fasta_name)]
pep_pos_all <- suppressWarnings(
purrr::map2(as.list(df_pep_prn$fasta_name),
as.list(df_pep_prn$pep_seq),
find_pep_pos,
fasta_db_sub)
) %>%
do.call(rbind, .) %>%
`colnames<-`(c("pep_seq", "pep_res_before", "pep_start",
"pep_end", "pep_res_after",
"fasta_name", "pep_istryptic", "pep_semitryptic")) %>%
data.frame(check.names = FALSE) %>%
tidyr::unite(pep_prn, pep_seq, fasta_name, sep = "@", remove = TRUE) %>%
dplyr::mutate(pep_start = as.integer(pep_start),
pep_end = as.integer(pep_end))
df$pep_res_before <- NULL
df$pep_res_after <- NULL
df$pep_start <- NULL
df$pep_end <- NULL
nms <- names(df)
if ("pep_res_before" %in% nms) pep_pos_all$pep_res_before <- NULL
if ("pep_res_after" %in% nms) pep_pos_all$pep_res_after <- NULL
if ("pep_start" %in% nms) pep_pos_all$pep_start <- NULL
if ("pep_end" %in% nms) pep_pos_all$pep_end <- NULL
df %>%
dplyr::left_join(pep_pos_all, by = "pep_prn") %>%
dplyr::select(-pep_prn) %>%
reloc_col_before("pep_end", "pep_res_before") %>%
reloc_col_before("pep_start", "pep_end")
}
#' Subsets fasta by accession type
#'
#' Not yet used.
#'
#' @inheritParams info_anal
#' @inheritParams normPSM
#' @inheritParams annotKin
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
subset_fasta <- function (df, fasta, acc_type)
{
stopifnot("prot_acc" %in% names(df))
if (! acc_type %in% c("refseq_acc", "uniprot_id", "uniprot_acc"))
stop("The type of protein accesion needs to one of ",
"\'uniprot_id\', \'uniprot_acc\' or \'refseq_acc\'.",
call. = FALSE)
fasta <- purrr::map(fasta, ~ read_fasta(.x)) %>% do.call(`c`, .)
if (acc_type == "uniprot_id") {
fasta <- fasta %>%
`names<-`(gsub("^.*\\|.*\\|(.*)$", "\\1", names(.))) %>%
.[names(.) %in% unique(df$prot_acc)]
}
else if (acc_type == "uniprot_acc") {
fasta <- fasta %>%
`names<-`(gsub("^.*\\|(.*)\\|.*$", "\\1", names(.))) %>%
.[names(.) %in% unique(df$prot_acc)]
}
else if (acc_type == "refseq_acc") {
fasta <- fasta %>%
.[names(.) %in% unique(df$prot_acc)]
}
}
#' Finds peptide abundance indexes.
#'
#' In relative to tryptic peptides.
#'
#' @param max_len The maximum length of peptide sequence for consideration.
#' @inheritParams info_anal
#' @inheritParams find_shared_prots
add_prot_icover <- function (df, id = "gene", pep_id = "pep_seq",
max_len = 500L)
{
dat_dir <- get_gl_dat_dir()
ok <- tryCatch(load(file = file.path(dat_dir, "fasta_db.rda")),
error = function(e) "e")
if (ok != "fasta_db")
stop("`fasta_db.rda` not found under ", dat_dir, ".",
call. = FALSE)
fasta_db <- fasta_db %>% .[!duplicated(names(.))]
id <- rlang::as_string(rlang::enexpr(id))
if (id == "gene") {
gn_rollup <- TRUE
id <- "prot_acc"
}
else {
gn_rollup <- FALSE
}
min_len <- min(nchar(df[[pep_id]])) # - 1L
max_len2 <- max_len # - 1L
zero_npeps <- df %>%
dplyr::mutate(pep_istryptic = as.logical(pep_istryptic)) %>%
{ if ("pep_istryptic" %in% names(df)) dplyr::filter(., pep_istryptic) else . } %>%
dplyr::filter(pep_len <= max_len, pep_miss == 0L) %>%
dplyr::select(c(pep_id, "fasta_name")) %>%
dplyr::filter(!duplicated(!!rlang::sym(pep_id))) %>%
dplyr::group_by(fasta_name) %>%
dplyr::summarise(prot_n_pep0 = n())
max_npeps <- fasta_db %>%
purrr::map_int(~ {
.x <- gsub("([KR]{1})", paste0("\\1", "@"), .x)
x <- stringr::str_split(.x, "@")
x <- lapply(x, function (p) {
p1 <- p[[1]]
if (grepl("^M", p1)) c(gsub("^M", "", p1), p) else p
})
x %>% purrr::map_int(~ {
len <- stringr::str_length(.x)
sum(len >= min_len & len <= max_len2)
})
}) %>%
tibble::tibble(
fasta_name = names(.),
prot_n_pepi = .,
)
max_npeps <- max_npeps %>%
dplyr::left_join(zero_npeps, by = "fasta_name") %>%
dplyr::mutate(prot_n_pep0 = ifelse(is.na(prot_n_pep0), 1L, prot_n_pep0)) %>%
dplyr::mutate(prot_icover = prot_n_pep0/prot_n_pepi,
prot_icover = round(prot_icover, digits = 3L)) %>%
dplyr::select(-c("prot_n_pepi", "prot_n_pep0"))
if (gn_rollup) {
df <- local({
tempdata <- df %>%
dplyr::select(fasta_name, gene) %>%
dplyr::filter(!duplicated(fasta_name)) %>%
dplyr::left_join(max_npeps, by = "fasta_name") %>%
dplyr::select(-fasta_name) %>%
dplyr::filter(!is.na(gene)) %>%
dplyr::group_by(gene)
suppressWarnings(
tempdata %>%
dplyr::summarise_all(~ max(.x, na.rm = TRUE))
) %>%
dplyr::left_join(df, ., by = "gene")
})
}
else {
df <- df %>%
dplyr::left_join(max_npeps, by = "fasta_name")
}
invisible(df)
}
#' Calculates protein percent coverage
#'
#' @inheritParams info_anal
#' @inheritParams normPSM
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
calc_cover <- function(df, id)
{
dat_dir <- get_gl_dat_dir()
stopifnot(all(c("prot_acc", "gene", "pep_start", "pep_end") %in% names(df)))
df$prot_cover <- NULL
load(file = file.path(dat_dir, "fasta_db.rda"))
if (all(is.factor(df$pep_start))) {
df$pep_start <-as.numeric(as.character(df$pep_start))
}
if (all(is.factor(df$pep_end))) {
df$pep_end <- as.numeric(as.character(df$pep_end))
}
id <- rlang::as_string(rlang::enexpr(id))
if (id == "gene") {
gn_rollup <- TRUE
id <- "prot_acc"
}
else {
gn_rollup <- FALSE
}
load(file = file.path(dat_dir, "acc_lookup.rda"))
if (!length(fasta_db))
stop("No fasta entries match protein accessions.",
"Check the correctness of fasta file(s).",
call. = FALSE)
if (length(fasta_db) <= 200L)
warning("Less than 200 entries in fasta matched by protein accessions.\n",
"Make sure the fasta file is correct.",
call. = FALSE)
df_sels <- df %>%
dplyr::select(prot_acc, pep_start, pep_end) %>%
unique() %>%
dplyr::left_join(acc_lookup, by = "prot_acc") %>%
dplyr::filter(!is.na(prot_len)) %>%
dplyr::filter(pep_start <= prot_len,
pep_end <= prot_len)
if (!nrow(df_sels))
stop("Probably incorrect accession types in the fasta file(s).",
call. = FALSE)
df_sels <- df_sels %>%
split(.[["prot_acc"]], drop = TRUE) %>%
purrr::map(function (.x) {
len <- .x[1, "prot_len"]
aa_map <- rep(NA, len)
for (i in 1:nrow(.x)) {
aa_map[.x[i, ]$pep_start : .x[i, ]$pep_end] <- TRUE
}
sum(aa_map, na.rm = TRUE)/len
} ) %>%
do.call("rbind", .) %>%
data.frame(check.names = FALSE) %>%
`colnames<-`("prot_cover") %>%
tibble::rownames_to_column("prot_acc") %>%
dplyr::mutate(prot_cover = ifelse(prot_cover > 1, 1, prot_cover))
if (gn_rollup) {
df_sels <- local({
df_sub <- df %>%
dplyr::select(prot_acc, gene) %>%
dplyr::filter(!duplicated(prot_acc))
df_sels <- df_sub %>%
dplyr::left_join(df_sels, by = "prot_acc") %>%
dplyr::select(-prot_acc) %>%
dplyr::filter(!is.na(gene)) %>%
dplyr::group_by(gene)
df_sels <- suppressWarnings(
df_sels %>% dplyr::summarise_all(~ max(.x, na.rm = TRUE))
)
df_sels <- df_sub %>%
dplyr::left_join(df_sels, by = "gene") %>%
dplyr::select(-gene)
})
}
df <- df %>%
dplyr::mutate(index = row_number()) %>%
dplyr::left_join(df_sels, by = "prot_acc") %>%
dplyr::filter(!duplicated(index)) %>%
dplyr::select(-index)
if ("prot_icover" %in% names(df)) {
df <- df %>%
dplyr::mutate(prot_icover =
ifelse(!is.na(prot_icover), prot_icover, prot_cover),
prot_icover = round(prot_icover, digits = 3L))
}
df <- df %>%
dplyr::mutate(prot_cover = round(prot_cover, digits = 3L))
invisible(df)
}
#' Converts log2FC to linear fold changes
#'
#' @inheritParams info_anal
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
to_linfc <- function(df)
{
nms <- rownames(df)
df %>%
purrr::map(~ {ifelse(.x > 0, 2^.x, -1/(2^.x))}) %>%
data.frame(check.names = FALSE) %>%
`rownames<-`(nms)
}
#' Removes single-value columns
#'
#' @inheritParams setHMlims
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
rm_sglval_cols <- function (x)
{
sgl_val <- x %>%
dplyr::summarise_all(~ n_distinct(.x)) %>%
purrr::map(~ .x == 1) %>%
purrr::flatten_lgl()
cols <- names(x[sgl_val])
if (!purrr::is_empty(cols)) {
warning("Columns at a single-factor level for aesthetics excluded: \n",
purrr::reduce(cols, paste, sep = ", "), ".\n",
call. = FALSE)
}
x[, !sgl_val, drop = FALSE]
}
#' Combines data with metadata
#'
#' @param data A data frame
#' @param metadata Another data frame
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
cmbn_meta <- function(data, metadata)
{
data <- data %>%
tibble::rownames_to_column("Sample_ID") %>%
dplyr::left_join(metadata, by = "Sample_ID") %>%
dplyr::mutate_at(vars(one_of("Color", "Fill", "Shape", "Size", "Alpha")),
~ as.factor(.))
col_nms <- data %>%
.[!not_all_NA(.)] %>%
names()
if (!purrr::is_empty(col_nms))
warning("columns with all-NA values for aesthetics excluded: \n",
purrr::reduce(col_nms, paste, sep = ", "), ".\n",
call. = FALSE)
data %>%
dplyr::select(which(not_all_NA(.))) %>%
rm_sglval_cols()
}
#' Checks file names for ggsave
#'
#' @param filename Character string; An output file name.
gg_imgname <- function(filename)
{
fn_suffix <- gsub("^.*\\.([^.]*)$", "\\1", filename)
fn_prefix <- gsub("\\.[^.]*$", "", filename)
exts <- c("png", "eps", "ps", "tex", "pdf", "jpeg", "tiff", "png", "bmp", "svg")
if(! fn_suffix %in% exts) {
warning("Unrecognized file extenstion: '", fn_suffix,
"'. Image will be saved as a '.png'.",
call. = FALSE)
fn_suffix <- "png"
}
paste0(fn_prefix, ".", fn_suffix)
}
#' Checks file names for ggsave
#'
#' @inheritParams info_anal
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
rm_pval_whitespace <- function(df)
{
df <- df %>%
dplyr::mutate_at(vars(grep("pVal|adjP", names(.))), as.character) %>%
dplyr::mutate_at(vars(grep("pVal|adjP", names(.))), ~ gsub("\\s*", "", .x) ) %>%
dplyr::mutate_at(vars(grep("pVal|adjP", names(.))), ~ suppressWarnings(as.numeric(.x)))
}
#' Filters rows
#'
#' @param df a data frame.
#' @param ... Arguments for \link[dplyr]{filter}
#'
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
filters_in_call <- function (df, ...)
{
dots <- rlang::enexprs(...)
nms <- names(dots)
for (i in seq_along(dots)) {
row_exprs <- dots[[nms[i]]] %>%
rlang::eval_bare()
if (!rlang::is_list(row_exprs)) row_exprs <- list(row_exprs)
row_exprs <- purrr::map(row_exprs, ~ {
is_char <- is.character(.x)
if (is_char) .x <- rlang::sym(.x)
return(.x)
})
row_vals <- row_exprs %>%
purrr::map(eval_tidy, df) %>%
purrr::reduce(`&`, .init = 1)
row_vals <- row_vals %>% ifelse(is.na(.), FALSE, .)
stopifnot(is.logical(row_vals))
if (sum(row_vals) == 0)
stop("Zero row of data available after `filter_` vararg(s).\n",
"Examine both (a) the values under the selected column(s) and ",
"(b) the supplied logical condition(s).\n",
"For example, values are not all NAs or FALSE under the column(s).",
call. = FALSE)
df <- df[row_vals, , drop = FALSE]
}
return(df)
}
#' Arranges rows
#'
#' @param .df a data frame.
#' @param .na.last The same as \link[base]{order}
#' @param ... Arguments for \link[dplyr]{arrange}
#'
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
arrangers_in_call <- function(.df, ..., .na.last = TRUE)
{
dots <- rlang::enexprs(...)
nms <- names(dots)
for (i in seq_along(dots)) {
row_orders <- dots[[nms[i]]] %>%
rlang::eval_bare()
if (!rlang::is_list(row_orders))
row_orders <- list(row_orders)
row_orders <- purrr::map(row_orders, ~ {
is_char <- is.character(.x)
if (is_char) .x <- rlang::sym(.x)
.x
})
order_call <- rlang::expr(order(!!!row_orders, na.last = !!.na.last))
ord <- rlang::eval_tidy(order_call, .df)
stopifnot(length(ord) == nrow(.df))
.df <- .df[ord, , drop = FALSE]
}
return(.df)
}
#' Calculates PSM SDs
#'
#' The standard deviations are based on samples under each TMT set and LCMS.
#'
#' @inheritParams info_anal
#' @inheritParams standPep
#' @inheritParams channelInfo
#' @inheritParams calcPeptide
calc_sd_fcts_psm <- function (df, range_log2r = c(5, 95), range_int = c(5, 95),
set_idx, injn_idx)
{
dat_dir <- get_gl_dat_dir()
load(file = file.path(dat_dir, "label_scheme_full.rda"))
label_scheme <- label_scheme_full %>%
dplyr::filter(TMT_Set == set_idx, LCMS_Injection == injn_idx) %>%
dplyr::mutate(tmt_nm = gsub("TMT-", "N_log2_R", TMT_Channel))
label_scheme_sd <- label_scheme %>%
dplyr::filter(!Reference, !grepl("^Empty\\.", Sample_ID)) %>%
dplyr::mutate(Sample_ID = factor(Sample_ID, levels = (.$Sample_ID)))
SD <- df %>%
dplyr::select(grep("^N_log2_R|^N_I", names(.))) %>%
dblTrim(., range_log2r, range_int) %>%
`names<-`(gsub(".*\\s*\\((.*)\\)$", "\\1", names(.)))
cf_SD <- SD/mean(SD %>% .[names(.) %in% label_scheme_sd$tmt_nm], na.rm = TRUE)
cf_SD <- cbind.data.frame(fct = cf_SD, SD) %>%
tibble::rownames_to_column("tmt_nm") %>%
dplyr::mutate(tmt_nm = factor(tmt_nm, levels = label_scheme$tmt_nm)) %>%
dplyr::arrange(tmt_nm)
}
#' Calculates CV per TMT_Set and LCMS_injection
#'
#' @inheritParams info_anal
#' @param type Character string; the type of data for SD calculations.
calcSD_Splex <- function (df, id, type = "log2_R")
{
if (type == "log2_R") {
df <- df %>%
dplyr::select(!!rlang::sym(id), grep("^log2_R[0-9]{3}", names(.)))
}
else if (type == "N_log2_R") {
df <- df %>%
dplyr::select(!!rlang::sym(id), grep("^N_log2_R[0-9]{3}", names(.)))
}
else if (type == "Z_log2_R") {
df <- df %>%
dplyr::select(!!rlang::sym(id), grep("^Z_log2_R[0-9]{3}", names(.)))
}
df %>%
dplyr::mutate(!!id := as.character(!!rlang::sym(id))) %>%
dplyr::arrange(!!rlang::sym(id)) %>%
dplyr::group_by(!!rlang::sym(id)) %>%
dplyr::summarise_at(vars(starts_with(type)), ~ sd(.x, na.rm = TRUE)) %>%
dplyr::mutate_at(vars(starts_with(type)), ~ replace_trivial_with_na(.x))
}
#' Violin plots of CV per TMT_Set and LCMS_injection
#'
#' @param width The width of a plot.
#' @param height The height of a plot.
#' @param is_psm Logical; indicator if the data belong to a PSM table .
#'
#' @inheritParams info_anal
#' @inheritParams purgePSM
#' @inheritParams prnCorr_logFC
#' @inheritParams calcSD_Splex
#' @import dplyr ggplot2
sd_violin <- function(df = NULL, id = NULL, filepath = NULL,
width = NULL, height = NULL,
type = "log2_R", adjSD = FALSE,
is_psm = FALSE, col_select = NULL, col_order = NULL,
theme = NULL, ...)
{
fn_suffix <- gsub("^.*\\.([^.]*)$", "\\1", filepath)
fn_prefix <- gsub("\\.[^.]*$", "", filepath)
df <- df %>%
dplyr::select(which(not_all_NA(.)))
if (!ncol(df)) {
message("No SD columns available.")
return (NULL)
}
dat_dir <- get_gl_dat_dir()
err_msg1 <- paste0("\'Sample_ID\' is reserved. Choose a different column key.")
col_select <- rlang::enexpr(col_select)
col_order <- rlang::enexpr(col_order)
col_select <- if (is.null(col_select))
rlang::expr(Select)
else
rlang::sym(col_select)
col_order <- if (is.null(col_order))
rlang::expr(Order)
else
rlang::sym(col_order)
if (col_select == rlang::expr(Sample_ID))
stop(err_msg1, call. = FALSE)
if (col_order == rlang::expr(Sample_ID))
stop(err_msg1, call. = FALSE)
label_scheme_full <- load_ls_group(dat_dir, label_scheme_full)
label_scheme <- load_ls_group(dat_dir, label_scheme)
if (is.null(label_scheme_full[[col_select]]))
stop("Column \'", rlang::as_string(col_select), "\' does not exist.",
call. = FALSE)
else if (sum(!is.na(label_scheme_full[[col_select]])) == 0)
stop("No samples were selected under column \'", rlang::as_string(col_select), "\'.",
call. = FALSE)
if (is.null(label_scheme_full[[col_order]])) {
warning("Column \'", rlang::as_string(col_order), "\' does not exist.
Samples will be arranged by the alphebatic order.",
call. = FALSE)
}
else if (sum(!is.na(label_scheme_full[[col_order]])) == 0) {
# warning("No orders under column \'", rlang::as_string(col_order), "\'.", call. = FALSE)
}
if (is_psm) {
label_scheme_sub <- local({
set_idx <- as.integer(gsub("^.*TMTset(\\d+)_.*", "\\1", filepath))
injn_idx <- as.integer(gsub("^.*TMTset\\d+_LCMSinj(\\d+)_sd\\.png$", "\\1", filepath))
label_scheme_full %>%
dplyr::filter(TMT_Set == set_idx, LCMS_Injection == injn_idx)
})
}
else {
label_scheme_sub <- label_scheme
}
label_scheme_sub <- label_scheme_sub %>%
dplyr::mutate(new_id = paste0(TMT_Channel, " (", Sample_ID, ")")) %>%
dplyr::mutate(new_id = gsub("TMT-", "", new_id)) %>%
dplyr::select(Sample_ID, TMT_Set, new_id, !!col_select, !!col_order) %>%
dplyr::filter(!is.na(!!col_select))
TMT_plex <- TMT_plex(label_scheme_full)
if (!TMT_plex) {
label_scheme_sub$new_id <- label_scheme_sub$Sample_ID
}
id <- rlang::as_string(rlang::enexpr(id))
dots <- rlang::enexprs(...)
if (rlang::is_missing(width)) width <- 8
if (rlang::is_missing(height)) height <- 8
ymax <- eval(dots$ymax, envir = rlang::caller_env())
ybreaks <- eval(dots$ybreaks, envir = rlang::caller_env())
flip_coord <- eval(dots$flip_coord, envir = rlang::caller_env())
if (is.null(flip_coord)) flip_coord <- FALSE
df <- df %>%
dplyr::filter(!duplicated(.[[id]]))
if (("pep_scan_num" %in% names(df)) && (!all(is.na(df$pep_scan_num)))) {
df <- df %>%
tidyr::unite(uniq_by., raw_file, pep_scan_num, sep = "@") %>%
dplyr::group_by(uniq_by.) %>%
dplyr::filter(row_number() == 1L) %>%
dplyr::ungroup() %>%
dplyr::select(-uniq_by.)
}
pat0 <- "[0-9]{3}[NC]{0,1}"
df_sd <- df %>%
dplyr::select(id, grep(paste0("^sd_", type, pat0), names(.)))
# all-NA first removed for finding all-NaN columns
pat_log2_R <- paste0("^.*log2_R", pat0)
df_sd <- df_sd %>%
dplyr::filter(rowSums(!is.na(.[grep(pat_log2_R, names(.))])) > 0L)
if (adjSD) {
SD <- df %>%
dplyr::select(grep("^log2_R[0-9]{3}[NC]{0,1}|^I[0-9]{3}[NC]{0,1}", names(.))) %>%
dblTrim(range_log2r = c(0, 100), range_int = c(0, 100),
type_r = "log2_R", type_int = "I")
df_sd[, grep("^.*log2_R", names(df_sd))] <-
df_sd[, grep("^.*log2_R", names(df_sd)), drop = FALSE] %>%
sweep(., 2, sqrt(SD), "/")
df_z <- df_sd %>% dplyr::select(grep(pat_log2_R, names(.)))
nan_cols <- purrr::map_lgl(df_z, is_all_nan, na.rm = TRUE)
df_z[, nan_cols] <- 0
df_sd[, grep("^.*_log2_R[0-9]{3}", names(df_sd))] <- df_z
rm(list = c("df_z", {nan_cols}, "SD"))
}
if (TMT_plex) {
df_sd <- df_sd %>%
`names<-`(gsub("^.*log2_R", "", names(.)))
}
else {
df_sd <- df_sd %>%
`names<-`(gsub("sd_log2_R000 \\((.*)\\)$", "\\1", names(.)))
}
if (!is_psm) {
df_sd <- df_sd %>%
dplyr::select(id, which(names(.) %in% label_scheme_sub$new_id))
}
Levels <- names(df_sd) %>% .[! . %in% id]
if (length(Levels)) {
df_sd <- df_sd %>%
tidyr::gather(key = !!rlang::sym(id), value = "SD") %>%
dplyr::rename(Channel := !!rlang::sym(id)) %>%
dplyr::mutate(Channel = factor(Channel, levels = Levels)) %>%
dplyr::filter(!is.na(SD))
mean_sd <- df_sd %>%
dplyr::group_by(Channel) %>%
dplyr::summarise(SD = mean(SD, na.rm = TRUE)) %>%
dplyr::mutate(SD = round(SD, digits = 3L)) %T>%
readr::write_tsv(paste0(fn_prefix, ".txt"))
p <- ggplot() +
geom_violin(df_sd, mapping = aes(x = Channel, y = SD, fill = Channel),
alpha = .8, size = .25, linetype = 0,
draw_quantiles = c(.95, .99)) +
geom_boxplot(df_sd, mapping = aes(x = Channel, y = SD),
width = 0.1, lwd = .2, fill = "white") +
stat_summary(df_sd, mapping = aes(x = Channel, y = SD),
fun = "mean", geom = "point",
shape=23, size=2, fill="white", alpha=.5) +
labs(title = expression(""),
x = expression("Channel"),
y = expression("SD ("*log[2]*"FC)")) +
geom_text(data = mean_sd, aes(x = Channel, label = SD, y = SD + 0.2),
size = 5, colour = "red", alpha = .5)
if (!is.null(ymax)) {
if (is.null(ybreaks)) {
ybreaks <- ifelse(ymax > 1, 0.5, ifelse(ymax > 0.5, 0.2, 0.1))
}
p <- p +
scale_y_continuous(limits = c(0, ymax), breaks = seq(0, ymax, ybreaks))
}
if (is.null(theme) || purrr::is_function(theme))
theme <- theme_psm_violin
p <- p + theme
if (flip_coord) {
p <- p + coord_flip()
width_temp <- width
width <- height
height <- width_temp
rm(list = "width_temp")
}
ggsave_dots <- set_ggsave_dots(dots, c("filename", "plot", "width", "height",
"limitsize"))
my_call <- rlang::expr(ggplot2::ggsave(filename = !!filepath, plot = !!p,
width = !!width, height = !!height,
limitsize = FALSE,
!!!ggsave_dots))
ok <- tryCatch(eval(my_call, rlang::caller_env()),
error = function (e) NA_character_)
if (is.na(ok)) {
my_call$width <- 40
suppressWarnings(try(eval(my_call, rlang::caller_env())))
}
}
}
#' Violin plots of reporter-ion intensity per TMT_Set and LCMS_injection
#'
#' @inheritParams info_anal
#' @inheritParams sd_violin
rptr_violin <- function(df, filepath, width, height)
{
df <- df %>%
dplyr::select(which(not_all_NA(.)))
if (!ncol(df)) {
message("No intensity columns available.")
return (NULL)
}
df_int <- df %>%
`names<-`(gsub("^N_I|^I", "", names(.)))
Levels <- names(df_int)
df_int <- df_int %>%
tidyr::gather(key = "Channel", value = "Intensity") %>%
dplyr::mutate(Channel = factor(Channel, levels = Levels)) %>%
dplyr::filter(!is.na(Intensity))
mean_int <- df_int %>%
dplyr::group_by(Channel) %>%
dplyr::summarise(Intensity = mean(log10(Intensity), na.rm = TRUE)) %>%
dplyr::mutate(Intensity = round(Intensity, digits = 1))
p <- ggplot() +
geom_violin(df_int, mapping = aes(x = Channel, y = log10(Intensity), fill = Channel),
alpha = .8, size = .25, linetype = 0) +
geom_boxplot(df_int, mapping = aes(x = Channel, y = log10(Intensity)),
width = 0.2, lwd = .2, fill = "white") +
stat_summary(df_int, mapping = aes(x = Channel, y = log10(Intensity)),
fun = "mean", geom = "point",
shape = 23, size = 2, fill = "white", alpha = .5) +
labs(title = expression("Ions"),
x = expression("Channel"),
y = expression("Intensity ("*log[10]*")")) +
geom_text(data = mean_int, aes(x = Channel, label = Intensity, y = Intensity + 0.2),
size = 5, colour = "red", alpha = .5) +
theme_psm_violin
try(ggplot2::ggsave(filepath, p, width = width, height = height, units = "in"))
}
#' Geometric mean
#'
#' @param x A data frame.
#' @param ... Additional arguments for \code{mean}.
my_geomean <- function (x, ...)
{
x <- mean(log10(x), ...)
10^x
}
#' Counts phospho.
#'
#' Not currently used.
#'
#' @param filename The file name of peptide table.
#' @param collapses The modified residues to be collapsed with unmodified
#' counterparts.
#' @param pat_mod Pattern of modification
#' @inheritParams normPSM
count_phosphopeps <- function(filename = "Peptide.txt", collapses = c("mn"),
rm_allna = FALSE, pat_mod = "sty")
{
dat_dir <- get_gl_dat_dir()
df <- read.csv(file.path(dat_dir, "Peptide", filename), check.names = FALSE,
header = TRUE, sep = "\t", comment.char = "#")
not_single_sample <- (sum(grepl("^log2_R[0-9]{3}", names(df))) > 1)
if (not_single_sample && rm_allna) {
df <- df %>%
dplyr::filter(rowSums(!is.na( .[grep("^log2_R[0-9]{3}", names(.))] )) > 0)
}
id <- match_call_arg(normPSM, group_psm_by)
use_lowercase_aa <- match_call_arg(normPSM, use_lowercase_aa)
if (use_lowercase_aa) {
pat_mod2 <- paste0("[", pat_mod, "]")
df_phos <- df %>% dplyr::filter(grepl(pat_mod2, .[[id]]))
n_phos_peps <- nrow(df_phos)
pat <- paste(strsplit(collapses, "")[[1]], collapse = "|") %>%
paste0("(", ., ")")
df_phos <- df_phos %>%
dplyr::mutate(pep_seq_mod2 = gsub(pat, paste0("@", "\\1"), !!rlang::sym(id))) %>%
dplyr::mutate_at(vars("pep_seq_mod2"), ~ purrr::map_chr(.x, my_upper, "@")) %>%
dplyr::arrange(-pep_locprob) %>%
dplyr::filter(!duplicated(pep_seq_mod2)) %>%
dplyr::mutate(phos_class = dplyr::case_when(
pep_locprob >= .75 ~ 1L,
pep_locprob >= .50 ~ 2L,
pep_locprob >= .25 ~ 3L,
pep_locprob >= 0 ~ NA_integer_,
))
}
else {
hdr_files <- list.files(file.path(dat_dir, "PSM/cache"), "_header\\.txt$",
full.names = TRUE)
if (!length(hdr_files)) {
message("Mascot header file not found")
return(NULL)
}
hdr_files <- hdr_files[[1]]
hdr <- readLines(hdr_files)
lvs <- local({
l1 <- grep("Variable modifications,-----", hdr)[[1]]
l2 <- grep("Search Parameters,------", hdr)[[1]]
lvs <- hdr[(l1+1):(l2-1)]
lvs <- lvs[lvs != "\"\""]
lvs <- lvs[!grepl("-term", lvs)]
})
pat_mod <- lapply(strsplit(toupper(pat_mod), "")[[1]], function (x) {
l <- lvs[grepl(paste0(" \\(.*", x), lvs)]
l <- gsub("\"", "", l)
gsub("([^\\,])\\,.*", "\\1", l)
}) %>%
unlist() %>%
unique() %>%
paste(collapse = "")
pat_mod2 <- paste0("[", pat_mod, "]")
df_phos <- df %>% dplyr::filter(grepl(pat_mod2, .[[id]]))
n_phos_peps <- nrow(df_phos)
## convert collapses: "mn" -> "24"
collapses <- lapply(strsplit(toupper(collapses), "")[[1]], function (x) {
l <- lvs[grepl(paste0(" \\(.*", x), lvs)]
l <- gsub("\"", "", l)
gsub("([^\\,])\\,.*", "\\1", l)
}) %>%
unlist() %>%
unique() %>%
paste(collapse = "")
pat <- paste(strsplit(collapses, "")[[1]], collapse = "|") %>%
paste0("(", ., ")")
df_phos <- df_phos %>%
dplyr::mutate(pep_seq_mod2 = gsub(pat, "0", !!rlang::sym(id))) %>%
dplyr::arrange(-pep_locprob) %>%
dplyr::filter(!duplicated(pep_seq_mod2)) %>%
dplyr::mutate(phos_class = dplyr::case_when(
pep_locprob >= .75 ~ 1L,
pep_locprob >= .50 ~ 2L,
pep_locprob >= .25 ~ 3L,
pep_locprob >= 0 ~ NA_integer_,
))
}
n_phos_sites <- sum(stringr::str_count(df_phos[[id]], pat_mod2))
n_classes_12 <- df_phos %>%
dplyr::filter(phos_class <= 2L) %$%
stringr::str_count(.[[id]], pat_mod2) %>%
sum()
ans <- data.frame(n_peps = n_phos_peps,
n_sites = n_phos_sites,
n_12 = n_classes_12)
out_name <- paste0("phospep_", gsub("\\.[^.]*$", "", filename), ".tsv")
write.table(ans, file.path(dat_dir, "Peptide/cache", out_name),
sep = "\t", row.names = FALSE)
ans
}
#' Peptide mis-cleavage counts
#'
#' NOt currently used.
#'
count_pepmiss <- function()
{
dat_dir <- get_gl_dat_dir()
dir.create(file.path(dat_dir, "PSM/cache"), recursive = TRUE, showWarnings = FALSE)
rmPSMHeaders()
filelist <- list.files(path = file.path(dat_dir, "PSM/cache"),
pattern = "^F[0-9]{6}_hdr_rm.csv$")
if (!length(filelist))
stop(paste("No PSM files under", file.path(dat_dir, "PSM")))
df <- purrr::map(filelist, ~ {
data <- read.delim(file.path(dat_dir, "PSM/cache", .x), sep = ',',
check.names = FALSE, header = TRUE,
stringsAsFactors = FALSE, quote = "\"",
fill = TRUE , skip = 0)
data$dat_file <- gsub("_hdr_rm\\.csv", "", .x)
data <- data %>%
dplyr::filter(!duplicated(pep_seq))
tot <- nrow(data)
mis <- data %>% dplyr::filter(pep_miss > 0) %>% nrow()
tibble::tibble(total = tot, miscleavage = mis, percent = miscleavage/tot)
}) %>%
do.call(rbind, .)
write.csv(df, file.path(dat_dir, "PSM/cache/miscleavage_nums.csv"),
row.names = FALSE)
invisible(df)
}
#' Helper of row filtration
#'
#' \code{contain_str}: contain a literal string; "PEPTIDES" contain_str "TIDE".
#'
#' @param match A character string containing the pattern for matching.
#' @param vars A character string of the name of a variable. The default is
#' FALSE.
#' @param ignore.case Logical; if TRUE, ignores case when matching.
#' @examples
#' \donttest{
#' pepHist(
#' col_select = BI,
#' scale_log2r = TRUE,
#' filter_peps = exprs(contain_chars_in("sty", pep_seq_mod)),
#' scale_y = FALSE,
#' ncol = 4,
#' filename = "BI_pSTY_scaley_no.png",
#' )
#' }
#' @export
contain_str <- function (match, vars, ignore.case = FALSE)
{
stopifnot(rlang::is_string(match), nchar(match) > 0L)
grepl(match, vars, fixed = TRUE, ignore.case)
}
#' Helper of row filtration
#'
#' \code{contain_chars_in}: contain some of the characters in a literal string;
#' "PEPTIDES" contain_chars_in "XP".
#'
#' @rdname contain_str
#' @export
contain_chars_in <- function (match, vars, ignore.case = FALSE)
{
stopifnot(rlang::is_string(match), nchar(match) > 0L)
grepl(paste0("[", match, "]"), vars, fixed = FALSE, ignore.case)
}
#' Helper of row filtration
#'
#' \code{not_contain_str}" not contain a literal string; "PEPTIDES"
#' not_contain_str "TED".
#'
#' @rdname contain_str
#' @export
not_contain_str <- function (match, vars, ignore.case = FALSE)
{
stopifnot(rlang::is_string(match), nchar(match) > 0L)
!grepl(match, vars, fixed = TRUE, ignore.case)
}
#' Helper of row filtration
#'
#' \code{not_contain_chars_in}: not contain any of the characters in a literal
#' string; "PEPTIDES" not_contain_chars_in "CAB".
#'
#' @rdname contain_str
#' @export
not_contain_chars_in <- function (match, vars, ignore.case = FALSE)
{
stopifnot(rlang::is_string(match), nchar(match) > 0L)
!grepl(paste0("[", match, "]"), vars, fixed = FALSE, ignore.case = FALSE)
}
#' Helper of row filtration
#'
#' \code{start_with_str}: start with a literal string. "PEPTIDES" start_with_str
#' "PEP".
#'
#' @rdname contain_str
#' @export
start_with_str <- function (match, vars, ignore.case = FALSE)
{
stopifnot(is_string(match), nchar(match) > 0)
grepl(paste0("^", match), vars, fixed = FALSE, ignore.case)
}
#' Helper of row filtration
#'
#' \code{end_with_str}: end with a literal string. "PEPTIDES" end_with_str
#' "TIDES".
#'
#' @rdname contain_str
#' @export
end_with_str <- function (match, vars, ignore.case = FALSE)
{
stopifnot(rlang::is_string(match), nchar(match) > 0L)
grepl(paste0(match, "$"), vars, fixed = FALSE, ignore.case)
}
#' Helper of row filtration
#'
#' \code{start_with_chars_in}: start with one of the characters in a literal
#' string. "PEPTIDES" start_with_chars_in "XP".
#'
#' @rdname contain_str
#' @export
start_with_chars_in <- function (match, vars, ignore.case = FALSE)
{
stopifnot(rlang::is_string(match), nchar(match) > 0L)
grepl(paste0("^[", match, "]"), vars, fixed = FALSE, ignore.case)
}
#' Helper of row filtration
#'
#' \code{ends_with_chars_in}: end with one of the characters in a literal
#' string. "PEPTIDES" ends_with_chars_in "XS".
#'
#' @rdname contain_str
#' @export
ends_with_chars_in <- function (match, vars, ignore.case = FALSE)
{
stopifnot(is_string(match), nchar(match) > 0)
grepl(paste0("[", match, "]$"), vars, fixed = FALSE, ignore.case)
}
#' Helper of row filtration
#'
#' \code{rows_are_all}: rows are all
#' @rdname contain_str
rows_are_all <- function (match, vars, ignore.case = FALSE)
{
stopifnot(rlang::is_string(match), nchar(match) > 0L)
!grepl(paste0("[^", match, "]"), vars, fixed = FALSE, ignore.case = FALSE)
}
#' Helper of row filtration
#'
#' \code{rows_are_all}: rows are all
#' @rdname contain_str
rows_are_not_all <- function (match, vars, ignore.case = FALSE)
{
stopifnot(rlang::is_string(match), nchar(match) > 0L)
grepl(paste0("[^", match, "]"), vars, fixed = FALSE, ignore.case = FALSE)
}
#' Concatenates formula(s) to varargs of dots
#'
#' @param fmls A character vector of formula(s)
#' @param dots A character vector of formula(s) in \code{dots}
#' @param fml_nms A character vector containing the names of \code{fmls}.
#' @inheritParams info_anal
concat_fml_dots <- function(fmls = NULL, fml_nms = NULL,
dots = NULL, anal_type = "zzz")
{
dat_dir <- get_gl_dat_dir()
if ((!purrr::is_empty(fmls)) && (anal_type == "GSEA"))
return(c(dots, fmls))
if (!length(fmls)) {
fml_file <- file.path(dat_dir, "Calls/pepSig_formulas.rda")
if (file.exists(fml_file)) {
load(file = fml_file)
if (!is.null(fml_nms)) {
stopifnot(all(fml_nms %in% names(pepSig_formulas)))
pepSig_formulas <- pepSig_formulas %>%
.[names(.) %in% fml_nms]
}
dots <- c(dots, pepSig_formulas)
} else {
stop("Run both `pepSig()` and `prnSig()` first.",
call. = FALSE)
}
}
else {
match_fmls(fmls)
dots <- c(dots, fmls)
}
dots
}
#' Rolls up genes
#'
#' @param df A data frame
#' @param cols Column indexes
gn_rollup <- function (df, cols)
{
if (! "gene" %in% names(df))
return(df)
dfa <- df %>%
dplyr::select(gene, cols) %>%
dplyr::filter(!is.na(gene)) %>%
dplyr::group_by(gene) %>%
dplyr::summarise_all(~ median(.x, na.rm = TRUE))
dfb <- df %>%
dplyr::select(-cols) %>%
dplyr::select(-which(names(.) %in% c("prot_cover"))) %>%
dplyr::filter(!is.na(gene)) %>%
dplyr::filter(!duplicated(gene))
if ("prot_cover" %in% names(df)) {
dfc <- df %>%
dplyr::select(gene, prot_cover) %>%
dplyr::filter(!is.na(gene), !is.na(prot_cover)) %>%
dplyr::group_by(gene)
dfc <- suppressWarnings(
dfc %>% dplyr::summarise_all(~ max(.x, na.rm = TRUE))
)
}
else {
dfc <- df %>%
dplyr::select(gene) %>%
dplyr::mutate(prot_cover = NA)
}
df <- list(dfc, dfb, dfa) %>%
purrr::reduce(right_join, by = "gene") %>%
dplyr::filter(!is.na(gene), !duplicated(gene))
}
#' Compares dot-dot-dot between prior and current
#'
#' @param call_nm The name of a function call.
#' @param curr_dots The values of the current dots.
#' @param pattern The pattern for comparison.
identical_dots <- function(call_nm, curr_dots, pattern)
{
dat_dir <- get_gl_dat_dir()
file <- file.path(dat_dir, "Calls", paste0(call_nm, ".rda"))
if (!file.exists(file))
return(FALSE)
load(file = file)
identical(call_pars %>% .[grepl(pattern, names(.))], curr_dots)
}
#' Complete cases among sample IDs in label_scheme_sub, not label_scheme
#'
#' @inheritParams info_anal
#' @inheritParams gspaTest
my_complete_cases <- function (df, scale_log2r, label_scheme_sub)
{
dat_dir <- get_gl_dat_dir()
load(file = file.path(dat_dir, "label_scheme.rda"))
NorZ_ratios <- find_NorZ(scale_log2r)
# in case that reference(s) are included in label_scheme_sub,
# the values are zero for single ref or non-NA/NaN for multi ref
# and will not affect the complete.cases assessment
rows <- df %>%
dplyr::select(grep(NorZ_ratios, names(.))) %>%
`colnames<-`(label_scheme$Sample_ID) %>%
dplyr::select(which(names(.) %in% label_scheme_sub$Sample_ID)) %>%
complete.cases(.)
if (sum(rows) == 0)
stop("None of the cases are complete.", call. = FALSE)
df[rows, ]
}
#' The union of named list
#'
#' names will be kept after the unification
#'
#' @param x A list of values.
#' @param y Another list of values.
my_union <- function (x, y)
{
x %>%
.[! names(.) %in% names(y)] %>%
c(y)
}
#' Finds the base names of files
#'
#' (not currently used)
#'
#' @param filenames A character vector of filenames
find_fn_bases <- function (filenames)
{
gsub("\\.[^.]*$", "", filenames) # %>% .[1]
}
#' Finds the extensions of files
#'
#' @param filename A character string of filename
#' @param type the type of filename extension
find_fn_exts <- function (filename, type = "text")
{
purrr::map_chr(filename, ~ {
if (!grepl("\\.", .x)) {
fn_ext <- switch(
text = "txt",
graphics = "png",
stop("Invalid extension in file name(s).", Call. = FALSE)
)
}
else {
fn_ext <- gsub("^.*\\.([^.]*)$", "\\1", .x) # %>% .[1]
}
})
}
#' Checks duplicate argments in 'dots'
#'
#' @param blacklist A character vector of variable names.
#' @param ... A list of arguments for checking.
check_dots <- function (blacklist = NULL, ...)
{
dots <- rlang::enexprs(...)
dups <- purrr::map_lgl(names(dots), ~ .x %in% blacklist)
nms <- names(dots[dups])
if (!rlang::is_empty(nms))
stop("Do not use argument(s): ", purrr::reduce(nms, paste, sep = ", "),
call. = FALSE)
}
#' Checks depreciated arguments
#'
#' @param blacklist A list of paired character vectors. In each pair, the first
#' element is the old argument name if incurred and the second is the new name
#' for reminding.
#' @param ... A list of arguments for checking. The depreciated argument(s) are
#' no longer in the formalArgs of a function. If present, they will be in
#' \code{...}.
check_depreciated_args <- function (blacklist = NULL, ...)
{
dots <- rlang::enexprs(...)
old_args <- purrr::map_chr(blacklist, `[[`, 1)
new_args <- purrr::map_chr(blacklist, `[[`, 2)
depreciated <- purrr::map_lgl(names(dots), ~ .x %in% old_args)
nms <- names(dots[depreciated])
ind <- which(old_args %in% nms)
old_args <- old_args %>% .[ind]
new_args <- new_args %>% .[ind]
if (!rlang::is_empty(nms)) {
message("Depreciated argument(s): ",
purrr::reduce(old_args, paste, sep = ", "))
stop("Use replacement argument(s): ",
purrr::reduce(new_args, paste, sep = ", "),
call. = FALSE)
}
}
#' Forces 'complete_cases = TRUE' at 'impute_na = FALSE'
#'
#' @inheritParams prnHM
to_complete_cases <- function (complete_cases = FALSE, impute_na = FALSE)
{
warn_msg1 <- "Coerce `complete_cases = TRUE` at `impute_na = FALSE`."
if (!(impute_na || complete_cases)) {
complete_cases <- TRUE
rlang::warn(warn_msg1)
}
invisible(complete_cases)
}
#' Checks \code{gset_nms}
#'
#' It is OK that \code{gset_nms} not in "go_sets", "c2_msig", "kegg_sets",
#' "kinsub" as custom gene sets may be applied.
#'
#' @inheritParams prnGSPA
check_gset_nms <- function (gset_nms)
{
customs <- gset_nms %>%
.[! . %in% c("go_sets", "c2_msig", "kegg_sets", "kinsub")]
if (!purrr::is_empty(customs)) {
message("Apply custom database(s): ",
purrr::reduce(customs, paste, sep = ", ", .init = ""))
}
soft_dep <- gset_nms %>% .[. == "kegg_sets"]
if (!rlang::is_empty(soft_dep)) {
warning("Data set(s) softely depreciated: ",
purrr::reduce(soft_dep, paste, sep = ", ", .init = ""),
call. = FALSE)
}
invisible(gset_nms)
}
#' Checks the suitability of existing param files for MGKernel
#'
#' @param filepath A file path to \code{"MGKernel_params_N.txt"}
ok_existing_params <- function (filepath)
{
dat_dir <- get_gl_dat_dir()
# load(file = file.path(dat_dir, "label_scheme.rda"))
label_scheme <- load_ls_group(dat_dir, label_scheme)
if (!file.exists(filepath))
return(TRUE)
params <- readr::read_tsv(file.path(filepath),
col_types = cols(Component = col_double()))
missing_samples <- local({
par_samples <- unique(params$Sample_ID) %>% .[!is.na(.)]
expt_samples <- unique(label_scheme$Sample_ID)
setdiff(expt_samples, par_samples)
})
if (!length(missing_samples))
return(TRUE)
try(unlink(file.path(dat_dir, "Peptide/Histogram/MGKernel_params_[NZ].txt")))
try(unlink(file.path(dat_dir, "Protein/Histogram/MGKernel_params_[NZ].txt")))
warning(
"\nEntries(s) in `expt_smry.xlsx` missing from `MGKernel_params` files: \n ",
purrr::reduce(missing_samples, paste, sep = ", "),
"\nThis is probably due to the modification of values under the `Sample_ID` columns.",
"\n=========================================================",
"\n=== The previous `MGKernel_params` files are deleted. ===",
"\n=========================================================\n",
call. = FALSE)
invisible(return)
}
#' Finds the environment of a function
#'
#' @param name The name of a function.
#' @param env The environment.
#' @export
env_where <- function(name, env = rlang::caller_env())
{
if (identical(env, rlang::empty_env()))
stop("Can't find ", name, call. = FALSE)
else if (rlang::env_has(env, name))
env
else
env_where(name, rlang::env_parent(env))
}
#' Generates cut points.
#'
#' @param x A sequence of numeric values with probable NA value.
#' @inheritParams prnHist
set_cutpoints <- function (cut_points = NULL, x = NULL)
{
if (is.null(cut_points) && is.null(x))
stop("`cut_points` and `x` can not be both NULL.",
call. = FALSE)
if (!is.null(x)) {
oks <- x %>% .[!is.na(.)] %>% is.numeric() %>% all()
if (!oks)
stop("All numeric `x` are required for setting cut points.",
call. = FALSE)
}
if (!is.null(cut_points)) {
oks <- cut_points %>% is.numeric() %>% all()
if (!oks)
stop("All numeric values are required for `cut_points`.",
call. = FALSE)
}
if (is.null(x) && !is.null(cut_points)) {
cut_points <- cut_points %>%
c(-Inf, ., Inf) %>%
.[!duplicated(.)] %>%
.[order(.)]
return(cut_points)
}
if (is.null(cut_points) && !is.null(x)) {
cut_points <- x %>%
quantile() %>%
c(-Inf, ., Inf) %>%
round(digits = 1)
return(cut_points)
}
cut_points %>%
c(-Inf, ., Inf) %>%
.[!duplicated(.)] %>%
.[order(.)]
}
#' A wrapper of \code{set_cutpoint}.
#'
#' @param df A data frame.
#' @inheritParams prnHist
set_cutpoints2 <- function(cut_points, df)
{
if (is.null(cut_points) || is.infinite(cut_points))
return(c(mean_lint = -Inf, mean_lint = Inf))
if (any(is.na(cut_points))) {
cut_points <- cut_points[1]
if (is.null(names(cut_points)))
cut_nm <- "mean_lint"
else
cut_nm <- names(cut_points)
if (! cut_nm %in% names(df)) {
warning("Column `", cut_nm, "` not found in `df`;",
"instead use column `mean_lint`.",
call. = FALSE)
cut_nm <- "mean_lint"
}
if (!is.numeric(df[[cut_nm]]))
stop("Column `", cut_nm, "` is not numeric.", call. = FALSE)
cut_points <- quantile(df[[cut_nm]], na.rm = TRUE)
seqs <- seq_along(cut_points)
names(cut_points)[seqs] <- cut_nm
}
else {
cut_nm <- gsub("1$", "", names(cut_points[1]))
if (!length(cut_nm))
cut_nm <- "mean_lint"
if (!cut_nm %in% names(df)) {
warning("Column `", cut_nm, "` not found in `df`;",
"instead use column `mean_lint`.",
call. = FALSE)
cut_nm <- "mean_lint"
}
if (!is.numeric(df[[cut_nm]]))
stop("Column `", cut_nm, "` is not numeric.", call. = FALSE)
cut_points <- set_cutpoints(cut_points, df[[cut_nm]])
seqs <- seq_along(cut_points)
names(cut_points)[seqs] <- cut_nm
}
invisible(cut_points)
}
#' Loads cRAPs
#'
#' Load the cRAP proteins by the types of accessions
#' @param acc_types The types of protein accessions
load_craps <- function(acc_types)
{
data(package = "proteoQ", prn_annot_crap, envir = environment())
if (identical(acc_types, "other_acc")) {
craps <- prn_annot_crap %>%
dplyr::select("fasta_name") %>%
dplyr::filter(!is.na(fasta_name), !duplicated(fasta_name)) %>%
unlist()
}
else {
craps <- prn_annot_crap %>%
dplyr::select(which(names(.) %in% acc_types)) %>%
tidyr::gather("acc_type", "prot_acc") %>%
dplyr::filter(!is.na(prot_acc), !duplicated(prot_acc)) %>%
dplyr::select(prot_acc) %>%
unlist()
}
}
#' Finds the columns of reporter-ion intensity.
#'
#' @inheritParams TMT_levels
find_int_cols <- function (TMT_plex = 10L)
{
oks <- c(0L, 6L, 10L, 11L, 16L, 18L)
if (! TMT_plex %in% oks)
warning("`TMT_plex` is not one of ", paste(oks, collapse = ", "), ".",
call. = FALSE)
col_int <- if (TMT_plex == 18L) {
c("I126", "I127N", "I127C", "I128N", "I128C", "I129N", "I129C",
"I130N", "I130C", "I131N", "I131C",
"I132N", "I132C", "I133N", "I133C", "I134N",
"I134C", "I135N")
}
else if (TMT_plex == 16L) {
c("I126", "I127N", "I127C", "I128N", "I128C", "I129N", "I129C",
"I130N", "I130C", "I131N", "I131C",
"I132N", "I132C", "I133N", "I133C", "I134N")
}
else if (TMT_plex == 11L) {
c("I126", "I127N", "I127C", "I128N", "I128C", "I129N", "I129C",
"I130N", "I130C", "I131N", "I131C")
}
else if (TMT_plex == 10L) {
c("I126", "I127N", "I127C", "I128N", "I128C", "I129N", "I129C",
"I130N", "I130C", "I131")
}
else if(TMT_plex == 6L) {
c("I126", "I127", "I128", "I129", "I130", "I131")
}
else {
NULL
}
}
#' Finds the columns of reporter-ion ratios.
#'
#' @inheritParams TMT_levels
find_ratio_cols <- function (TMT_plex = 10L)
{
col_ratio <- if (TMT_plex == 18L) {
c("R127N", "R127C", "R128N", "R128C", "R129N", "R129C",
"R130N", "R130C", "R131N", "R131C",
"R132N", "R132C", "R133N", "R133C", "R134N",
"R134C", "R135N")
}
else if (TMT_plex == 16L) {
c("R127N", "R127C", "R128N", "R128C", "R129N", "R129C",
"R130N", "R130C", "R131N", "R131C",
"R132N", "R132C", "R133N", "R133C", "R134N")
}
else if (TMT_plex == 11L) {
c("R127N", "R127C", "R128N", "R128C", "R129N", "R129C",
"R130N", "R130C", "R131N", "R131C")
}
else if (TMT_plex == 10L) {
c("R127N", "R127C", "R128N", "R128C", "R129N", "R129C",
"R130N", "R130C", "R131")
}
else if(TMT_plex == 6L) {
c("R127", "R128", "R129", "R130", "R131")
}
else {
NULL
}
}
#' Processes MaxQuant mqpar.xml
#'
#' @inheritParams load_expts
#' @param mqpar The name of .xml file. The default is "mqpar.xml".
#' @param filename The name of metadata file.
#' @param out The name of output .xml file.
make_mq_meta <- function (dat_dir = proteoQ:::get_gl_dat_dir(), mqpar = "mqpar.xml",
filename = "mq_meta.txt", out = "new_mqpar.xml")
{
if (!requireNamespace("xml2", quietly = TRUE)) {
stop("\n====================================================================",
"\nNeed package \"xml2\" for this function to work.",
"\n====================================================================",
call. = FALSE)
}
lookup <- readr::read_tsv(file.path(dat_dir, filename)) %>%
dplyr::rename(RAW_File = Name, Sample_ID = Experiment) %>%
dplyr::filter(rowSums(!is.na(.)) > 0)
n_smpls <- nrow(lookup)
parent <- xml2::read_xml(file.path(dat_dir, mqpar))
children <- xml2::xml_children(parent)
contents <- parent %>% xml2::xml_contents()
filePaths <-
xml2::xml_children(children[[which(xml2::xml_name(contents) == "filePaths")]])
len <- length(filePaths)
if (len > n_smpls) {
xml2::xml_text(filePaths[(n_smpls + 1):len]) <- ""
filePaths <- filePaths[1:n_smpls]
}
pre <- filePaths %>% xml2::xml_text() %>% gsub("(.*\\\\|/).*", "\\1", .)
xml2::xml_text(filePaths) <- paste0(pre, lookup$RAW_File)
experiments <-
xml2::xml_children(children[[which(xml2::xml_name(contents) == "experiments")]])
len <- length(experiments)
if (len > n_smpls) {
xml2::xml_text(experiments[(n_smpls + 1):len]) <- ""
experiments <- experiments[1:n_smpls]
}
xml2::xml_text(experiments) <- lookup$Sample_ID
fractions <-
xml2::xml_children(children[[which(xml2::xml_name(contents) == "fractions")]])
len <- length(fractions)
if (len > n_smpls) {
xml2::xml_text(fractions[(n_smpls + 1):len]) <- ""
fractions <- fractions[1:n_smpls]
}
ptms <-
xml2::xml_children(children[[which(xml2::xml_name(contents) == "ptms")]])
len <- length(ptms)
if (len > n_smpls) {
xml2::xml_text(ptms[(n_smpls + 1):len]) <- ""
ptms <- ptms[1:n_smpls]
}
paramGroupIndices <-
xml2::xml_children(children[[which(xml2::xml_name(contents) == "paramGroupIndices")]])
referenceChannel <-
xml2::xml_children(children[[which(xml2::xml_name(contents) == "referenceChannel")]])
xml2::write_xml(parent, file.path(dat_dir, out), options = "format")
}
#' Processes MaxQuant mqpar.xml
#'
#' @inheritParams load_expts
#' @param mqpar The name of .xml file. The default is "mqpar.xml".
#' @param filename The name of metadata file.
#' @param out The name of output .xml file.
#' @examples
#' \dontrun{
#' dat_dir <- "Y:/qzhang/MaxQuant"
#' pre_mq_meta()
#' make_mq_meta2()
#' }
make_mq_meta2 <- function (dat_dir = proteoQ:::get_gl_dat_dir(),
mqpar = "mqpar.xml",
filename = "mq_meta.txt", out = "new_mqpar.xml")
{
fn_suffix <- gsub("^.*\\.([^\\.]*)$", "\\1", filename)
fn_prefix <- gsub("\\.[^\\.]*$", "", filename)
lookup <- if (fn_suffix %in% c("xls", "xlsx")) {
readxl::read_excel(file.path(dat_dir, filename))
} else if (fn_suffix == "csv") {
readr::read_csv(file.path(dat_dir, filename))
} else if (fn_suffix == "txt") {
readr::read_tsv(file.path(dat_dir, filename))
} else {
stop(filename, " needs to be '.xls' or '.xlsx'.")
}
lookup <- lookup %>%
dplyr::filter(rowSums(!is.na(.)) > 0)
if ((!"PTM" %in% names(lookup)) || all(is.na(lookup$PTM))) {
lookup$PTM <- "False"
}
if ((!"Fraction" %in% names(lookup)) || all(is.na(lookup$Fraction))) {
lookup$Fraction <- 1
}
if (!"Group" %in% names(lookup) || all(is.na(lookup$Group))) {
lookup$Group <- 0
}
if ((!"Reference" %in% names(lookup)) || all(is.na(lookup$Reference))) {
lookup$Reference <- ""
}
if (!grepl("\\.d$", lookup$Name[[1]])) {
lookup$Name <- paste0(lookup$Name, ".d")
}
mqxml <- readLines(file.path(dat_dir, mqpar)) %>%
update_nodes("filePaths", "string", lookup$Name) %>%
update_nodes("experiments", "string", lookup$Experiment) %>%
update_nodes("fractions", "short", lookup$Fraction) %>%
update_nodes("ptms", "boolean", lookup$PTM) %>%
update_nodes("paramGroupIndices", "int", lookup$Group) %>%
update_nodes("referenceChannel", "string", lookup$Reference) %T>%
writeLines(file.path(dat_dir, out))
}
#' Helper of \link{make_mq_meta2}
#'
#' @param mqxml The XML from mqpar.xml.
#' @param field The field to be updated.
#' @param type The type of data.
#' @param vals The Values of data.
update_nodes <- function(mqxml, field = "filePaths", type = "string", vals)
{
row_1 <- grep(paste0("<", field, ">"), mqxml)
row_2 <- grep(paste0("</", field, ">"), mqxml)
lines <- paste0(paste0("\t\t\t<", type, ">"),
vals,
paste0("</", type, ">"))
if (field == "filePaths") {
paths <- mqxml[(row_1+1):(row_2-1)] %>%
gsub("^.*<string>", "", .) %>%
gsub("</string>$", "", .) %>%
gsub("(.*\\\\).*", "\\1", .) %>%
gsub("(.*/).*", "\\1", .)
} else {
paths <- ""
}
lines <- paste0(paste0("\t\t\t<", type, ">"),
paths,
vals,
paste0("</", type, ">"))
bf <- mqxml[1:row_1]
af <- mqxml[row_2:length(mqxml)]
mqxml <- bf %>% append(lines) %>% append(af)
}
#' Updates MaxQuant experimentalDesignTemplate.txt
#'
#' @param dat_dir The working directory.
#' @param expt_smry The filename of proteoQ metadata.
#' @param filename The filename of MaxQuant metadata.
pre_mq_meta <- function (dat_dir = proteoQ:::get_gl_dat_dir(),
expt_smry = "expt_smry.xlsx",
filename = "combined/experimentalDesignTemplate.txt")
{
meta_proteoq <- file.path(dat_dir, expt_smry) %>%
readxl::read_excel() %>%
dplyr::select(c("Sample_ID", "RAW_File")) %>%
dplyr::mutate(RAW_File = gsub("\\.d$", "", RAW_File))
meta_mq <- file.path(dat_dir, filename) %>%
readr::read_tsv() %>%
dplyr::left_join(meta_proteoq, by = c("Name" = "RAW_File")) %>%
dplyr::mutate(Experiment = Sample_ID) %>%
dplyr::select(-Sample_ID) %T>%
readr::write_tsv(file.path(dat_dir, "mq_meta.txt"))
}
#' Adds numeric indices for peptides or proteins
#'
#' @param df A data frame.
#' @param col_in The column key to be indexed.
#' @param col_out The name of the output column.
add_entry_ids <- function (df, col_in = "pep_seq", col_out = "pep_index")
{
if (col_out %in% names(df)) {
return(reloc_col_after_last(df, col_out))
}
ids <- unique(df[[col_in]])
seqs <- seq_along(ids) %>% `names<-`(ids)
seqs <- tibble::tibble(!!col_in := names(seqs), !!col_out := seqs)
df %>% dplyr::left_join(seqs, by = col_in)
}
#' Checks conflicts in function arguments.
#'
#' Conflicts between wrapper and function to be wrapped. Note that \code{f}
#' contains function name but not package name.
#'
#' if `method` in the formalArgs and "m" in the "..." of `foo`,
#' foo(m = 2) without `method` interpreted as `method`.
#' Call foo(method = kmeans, m =2) to avoid such ambiguity.
#'
#' @param w Expression; the wrapper function.
#' @param f Expression; the function to be wrapped.
#' @param excludes Character string; arguments from \code{f} to be excluded for
#' checking.
check_formalArgs <- function(w = anal_prnTrend, f = cmeans, excludes = NULL)
{
w <- rlang::as_string(rlang::enexpr(w))
f <- rlang::as_string(rlang::enexpr(f))
args_w <- formalArgs(w) %>% .[! . == "..."]
args_f <- formalArgs(f) %>% .[! . == "..."]
if (!is.null(excludes)) {
args_f <- args_f %>% .[! . %in% excludes]
}
if (length(args_f)) {
purrr::walk(args_w, ~ {
ambi <- stringr::str_detect(.x, paste0("^", args_f)) %>%
which()
if (length(ambi)) {
warning("Ambiguous arguments between \"",
purrr::reduce(args_f[ambi], paste, sep = ", "),
"\" from `", f,
"` and \"", .x,
"\" from `", w,
"`.\n",
"### Include explicitly \"", .x, "\"",
" when calling `", w, "`. ###\n",
call. = FALSE)
}
})
}
}
#' Checks conflicts in function arguments.
#'
#' Conflicts between wrapper and function to be wrapped. Note that \code{f}
#' contains both function name and package name.
#'
#' @inheritParams check_formalArgs
check_formalArgs2 <- function(w = anal_prnTrend, f = NMF::nmf, excludes = NULL)
{
# for error messages
nm_f <- rlang::enexpr(f)
if (any(grepl("::", nm_f))) {
nm_f <- nm_f %>% as.character()
stopifnot(length(nm_f) ==3)
nm_f <- nm_f %>% `[`(3)
}
w <- rlang::as_string(rlang::enexpr(w))
args_w <- formalArgs(w) %>% .[! . == "..."]
args_f <- formalArgs(f) %>% .[! . == "..."]
if (!is.null(excludes)) {
args_f <- args_f %>% .[! . %in% excludes]
}
purrr::walk(args_w, ~ {
ambi <- stringr::str_detect(.x, paste0("^", args_f)) %>%
which()
if (!purrr::is_empty(ambi)) {
warning("Ambiguous arguments between \"",
purrr::reduce(args_f[ambi], paste, sep = ", "),
"\" from `", nm_f,
"` and \"", .x,
"\" from `", w,
"`.\n",
"### Include explicitly \"", .x, "\"",
" when calling `", w, "`. ###\n",
call. = FALSE)
}
})
}
#' Sets dot-dot-dot for ggsave.
#'
#' @param dots Arguments passed on.
#' @param dummies Arguments automated for ggsave.
set_ggsave_dots <- function(dots, dummies = c("filename", "plot", "device", "path"))
{
if (purrr::is_empty(dots)) return(dots)
args <- formalArgs(ggplot2::ggsave) %>% .[! . == "..."]
dots %>%
.[names(.) %in% args] %>%
.[! names(.) %in% dummies]
}
#' Finds the type of search engine
#'
#' @inheritParams load_expts
find_search_engine <- function(dat_dir = NULL)
{
if (is.null(dat_dir))
dat_dir <- get_gl_dat_dir()
pat_mascot <- "^F[0-9]+.*\\.csv$"
pat_mq <- "^msms.*\\.txt$"
pat_mf <- "^psm.*\\.tsv$"
pat_sm <- "^PSMexport.*\\.ssv$"
pat_pq <- "^psm[QC]{1}.*\\.txt$"
# pat_msgf <- "(^peptide)|(^protein)|(^psm)\\.tsv$"
pat_msgf <- "^psmMSGF.*\\.txt$$"
mascot <-list.files(path = file.path(dat_dir), pattern = pat_mascot) %>%
length() %>% `>`(0L)
mq <- list.files(path = file.path(dat_dir), pattern = pat_mq) %>%
length() %>% `>`(0L)
mf <- list.files(path = file.path(dat_dir), pattern = pat_mf) %>%
length() %>% `>`(0L)
sm <- list.files(path = file.path(dat_dir), pattern = pat_sm) %>%
length() %>% `>`(0L)
pq <- list.files(path = file.path(dat_dir), pattern = pat_pq) %>%
length() %>% `>`(0L)
msgf <- list.files(path = file.path(dat_dir), pattern = pat_msgf) %>%
length() %>% `>`(0L)
if (FALSE) {
msgf <- local({
files <- list.files(path = file.path(dat_dir), pattern = "\\.tsv$")
files <- files[!grepl("(^peptide)|(^protein)|(^psm)", files)]
length(files) > 0L
})
}
engines <- c(mascot = mascot, mq = mq, mf = mf, sm = sm, pq = pq, msgf = msgf)
oks <- names(engines[engines])
if (!length(oks)) {
stop("No PSM files found.\n",
"File name(s) need to be in the format of \n",
paste(c(paste(" Mascot", pat_mascot, sep = ": "),
paste(" MaxQuant", pat_mq, sep = ": "),
paste(" MSFragger", pat_mf, sep = ": "),
paste(" proteoQ", pat_pq, sep = ": "),
paste(" MSGF", pat_msgf, sep = ": "),
paste(" Spectrum Mill", pat_sm, sep = ": ")),
collapse = "\n"))
}
if (length(oks) > 1L)
stop("Multiple data types found: ", paste(oks, collapse = ", "))
invisible(oks)
}
#' Checks missing values in a ggplot2 object.
#'
#' @param p A ggplot2 object.
check_ggplot_aes <- function(p)
{
q <- p %>% ggplot_build() %>%
`[[`(1) %>%
`[[`(1) %>%
dplyr::select(which(not_all_NA(.)))
cols <- q %>%
purrr::map_lgl(anyNA)
if (any(cols)) {
warning("\nMismatches in the lengths of vectors ",
"between selected samples and the aesthetics of `",
purrr::reduce(names(which(cols)), paste, sep = ", "), "`.\n",
"Fix the missing values in the corresponding columns in `expt_smry.xlsx`.\n",
"Otherwise, may run into errors.\n",
call. = FALSE)
}
}
#' Standardizes range values.
#'
#' @param from_to A numeric vector of length 2.
prep_range <- function(from_to = c(0, 1))
{
stopifnot(length(from_to) == 2)
stopifnot(from_to[2] > from_to[1],
from_to[1] >= 0,
from_to[2] <= 100)
if (from_to[2] <= 1) from_to <- from_to * 100
from_to
}
#' Finds the deliminator of a file.
#'
#' @param filename A file name.
find_delim <- function (filename)
{
fn_suffix <- gsub("^.*\\.([^.]*)$", "\\1", filename)
if (fn_suffix %in% c("txt", "tsv")) {
delim <- "\t"
} else if (fn_suffix == "csv") {
delim <- ","
} else if (fn_suffix == "ssv") {
delim <- ";"
} else {
stop("File extension needs to be one of 'txt', 'tsv', 'csv' or 'ssv'.",
call. = FALSE)
}
invisible(delim)
}
#' Flattens lists recursively
#'
#' @param x Lists
recur_flatten <- function (x)
{
if (!inherits(x, "list")) {
list(x)
} else {
unlist(c(purrr::map(x, recur_flatten)), recursive = FALSE)
}
}
#' Subsets elements with attributes kept.
#'
#' @param data The data vector.
#' @param nm The names of elements in the vector for removals.
subset_keepattr <- function (data, nm = c("M", "N"))
{
attrs <- attributes(data)
for (i in seq_along(nm)) {
data <- data %>% .[! names(.) == nm[i]]
}
attrs$names <- names(data)
attributes(data) <- attrs
data
}
#' Lists to a data frame
#'
#' A simple replacement of plyr::ldply.
#'
#' @param x Lists of vectors.
list_to_dataframe <- function (x)
{
nms <- names(x)
lens <- unlist(lapply(x, length))
mlen <- max(lens)
for (i in seq_along(x)) {
d <- mlen - lens[i]
if (d) x[[i]] <- c(x[[i]], rep(NA, d))
}
ans <- do.call(rbind, x)
if (is.null(nms)) {
# rownames(ans) <- seq_len(nrow(ans))
cns <- seq_len(mlen)
} else {
# rownames(ans) <- nms
ans <- cbind(nms, ans)
cns <- c(".id", seq_len(mlen))
}
ans <- data.frame(ans)
colnames(ans) <- cns
rownames(ans) <- seq_len(nrow(ans))
invisible(ans)
}
#' Finds the list of psmQ files.
#'
#' @inheritParams load_expts
find_psmQ_files <- function (dat_dir)
{
filelistQ <- list.files(path = file.path(dat_dir), pattern = "^psmQ.*\\.txt$")
filelistC <- list.files(path = file.path(dat_dir), pattern = "^psmC.*\\.txt$")
if (length(filelistQ) && length(filelistC)) {
message("Both `psmQ[...].txt` and `psmC[...].txt` are present; ",
"\"psmQ[...].txt\" will be used in baseline analysis.\n",
"Need \"psmC[...].txt\" for optional matches between runs in LFQ.")
filelist <- filelistQ
}
else if (length(filelistQ)) {
filelist <- filelistQ
}
else if (length(filelistC)) {
filelist <- filelistC
stop(paste("No PSM files `psmQ[...].txt` under", file.path(dat_dir), ".",
"\nMake sure that the names of PSM files start with `psmQ`."),
call. = FALSE)
}
else {
stop(paste("No PSM files `psmQ[...].txt` under", file.path(dat_dir), ".",
"\nMake sure that the names of PSM files start with `psmQ`."),
call. = FALSE)
}
filelist
}
#' Gets column types.
#'
#' @import readr
get_col_types <- function ()
{
# just a remainder of mzion columns
col_types_pq <- cols(
prot_acc = col_character(),
prot_issig = col_logical(),
prot_isess = col_logical(),
prot_tier = col_integer(),
prot_hit_num = col_integer(),
prot_family_member = col_integer(),
prot_es = col_number(),
prot_es_co = col_number(),
pep_seq = col_character(),
pep_n_ms2 = col_integer(),
pep_scan_title = col_character(),
pep_exp_mz = col_number(),
pep_exp_mr = col_number(),
pep_exp_z = col_character(),
pep_calc_mr = col_number(),
pep_delta = col_number(),
pep_tot_int = col_number(),
pep_ret_range = col_number(),
pep_scan_num = col_character(), # timsTOF
pep_mod_group = col_integer(),
pep_frame = col_integer(),
pep_fmod = col_character(),
pep_vmod = col_character(),
pep_isdecoy = col_logical(),
pep_ivmod = col_character(),
pep_len = col_integer(),
pep_ms2_moverzs = col_character(),
pep_ms2_ints = col_character(),
pep_ms2_theos = col_character(),
pep_ms2_theos2 = col_character(),
pep_ms2_exptints = col_character(),
pep_ms2_exptints2 = col_character(),
pep_n_matches = col_integer(),
pep_n_matches2 = col_integer(),
pep_ms2_deltas = col_character(),
pep_ms2_ideltas = col_character(),
pep_ms2_deltas2 = col_character(),
pep_ms2_ideltas2 = col_character(),
pep_ms2_deltas_mean = col_double(),
pep_ms2_deltas_sd = col_double(),
pep_issig = col_logical(),
pep_score = col_double(),
pep_expect = col_double(),
pep_rank = col_integer(),
pep_locprob = col_double(),
pep_locdiff = col_double(),
pep_rank_nl = col_integer(),
pep_literal_unique = col_logical(),
pep_razor_unique = col_logical(),
raw_file = col_character(), )
col_types <- cols(
prot_hit_num = col_integer(),
prot_family_member = col_integer(),
prot_acc = col_character(),
prot_desc = col_character(),
prot_score = col_double(),
prot_mass = col_double(),
prot_matches = col_integer(),
prot_matches_sig = col_integer(),
prot_sequences = col_integer(),
prot_sequences_sig = col_integer(),
prot_n_psm = col_integer(),
prot_n_pep = col_integer(),
prot_len = col_integer(),
prot_pi = col_double(),
prot_tax_str = col_character(),
prot_tax_id = col_integer(),
prot_seq = col_character(),
prot_empai = col_double(),
prot_icover = col_double(),
prot_cover = col_double(),
prot_index = col_integer(),
prot_issig = col_logical(),
prot_isess = col_logical(),
prot_tier = col_integer(),
prot_es = col_double(),
prot_es_co = col_double(),
pep_query = col_integer(),
pep_rank = col_integer(),
pep_n_psm = col_integer(),
pep_isbold = col_logical(),
pep_isunique = col_logical(),
pep_literal_unique = col_logical(),
pep_razor_unique = col_logical(),
pep_tot_int = col_double(),
pep_unique_int = col_double(),
pep_razor_int = col_double(),
pep_exp_mz = col_double(),
pep_exp_mr = col_double(),
pep_exp_z = col_character(),
pep_calc_mr = col_double(),
pep_delta = col_double(),
pep_score = col_double(),
pep_homol = col_double(),
pep_ident = col_double(),
pep_expect = col_double(),
pep_res_before = col_character(),
pep_seq = col_character(),
pep_seq_mod = col_character(),
pep_res_after = col_character(),
pep_start = col_integer(),
pep_end = col_integer(),
pep_len = col_integer(),
pep_miss = col_integer(),
pep_frame = col_integer(),
pep_var_mod = col_character(),
pep_var_mod_pos = col_character(),
pep_summed_mod_pos = col_character(),
pep_local_mod_pos = col_character(),
pep_num_match = col_integer(), # Mascot
pep_scan_title = col_character(),
pep_index = col_integer(),
pep_scan_range = col_character(), # timsTOF
pep_n_exp_z = col_integer(),
pep_ret_sd = col_double(),
pep_n_nl = col_integer(),
pep_ret_range = col_double(),
pep_ms2_sumint = col_double(),
pep_n_ions = col_integer(),
pep_locprob = col_double(),
pep_locdiff = col_double(),
pep_phospho_locprob = col_double(),
pep_phospho_locdiff = col_double(),
pep_ions_first = col_character(),
pep_ions_second = col_character(),
pep_ions_third = col_character(),
pep_n_ms2 = col_integer(),
pep_scan_num = col_character(),
pep_mod_group = col_integer(),
pep_fmod = col_character(),
pep_vmod = col_character(),
pep_isdecoy = col_logical(),
pep_ivmod = col_character(),
pep_issig = col_logical(),
pep_rank_nl = col_integer(),
gene = col_character(),
fasta_name = col_character(),
uniprot_acc = col_character(),
uniprot_id = col_character(),
refseq_acc = col_character(),
other_acc = col_character(),
entrez = col_integer(),
species = col_character(),
acc_type = col_character(),
shared_prot_accs = col_character(),
shared_genes = col_character(),
kin_attr = col_logical(),
kin_class = col_character(),
kin_order = col_integer(),
pep_istryptic = col_logical(),
pep_semitryptic = col_character(),
dat_file = col_character(),
raw_file = col_character(),
mean_lint = col_double(),
count_nna = col_integer(),
TMT_Set = col_integer(),
)
nms <- names(col_types$cols)
stopifnot(length(nms) == length(unique(nms)))
col_types
}
#' Wrapper of writing Excel
#'
#' @param df A data frame.
#' @param sheetName An Excel sheet name.
#' @param filename An output file name.
#' @inheritParams load_expts
write_excel_wb <- function (df, sheetName = "Setup", dat_dir = NULL, filename)
{
if (is.null(dat_dir))
dat_dir <- get_gl_dat_dir()
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb, sheetName = sheetName)
openxlsx::writeData(wb, sheet = sheetName, df)
openxlsx::saveWorkbook(wb, file.path(dat_dir, filename), overwrite = TRUE)
}
#' Loads label_scheme or label_scheme_group.
#'
#' Prefer label_scheme_group over label_scheme.
#'
#' @param filename A file name of "label_scheme" or "label_scheme_full".
#' @param prefer_group Logical; if TRUE, prefer label_scheme_group over
#' label_scheme etc.
#' @inheritParams load_expts
load_ls_group <- function (dat_dir, filename = "label_scheme", prefer_group = TRUE)
{
filename <- rlang::as_string(rlang::enexpr(filename))
if (grepl("\\.rda$", filename)) {
fn_prefix <- gsub("\\.[^.]*$", "", filename)
}
else {
fn_prefix <- filename
filename <- paste0(filename, ".rda")
}
file <- file.path(dat_dir, filename)
file2 <- file.path(dat_dir, paste0(fn_prefix, "_group.rda"))
fn_prefix2 <- paste0(fn_prefix, "_group")
if (file.exists(file2) && prefer_group) {
fi <- file2
nm <- fn_prefix2
}
else if (file.exists(file)) {
fi <- file
nm <- fn_prefix
}
else {
stop("`label_schem` not found under ", dat_dir)
}
load(fi)
get(nm, envir = environment(), inherits = FALSE)
}
#' Parses \code{col_select}.
#'
#' @inheritParams standPep
#' @param label_scheme Experiment summary
parse_col_select <- function (col_select, label_scheme)
{
sids <- label_scheme[["Sample_ID"]]
if (is.null(sids))
stop("Column `Sample_ID` not found in metadata.", call. = FALSE)
if (col_select == "Sample_ID" && all(is.na(sids)))
stop("All values under column `Sample_ID` are NA.", call. = FALSE)
if (col_select == "Sample_ID")
return(col_select)
if (is.null(label_scheme[[col_select]])) {
col_select <- "Sample_ID"
warning("Column `", col_select, "` not existed. ",
"Used column `Sample_ID` instead.", call. = FALSE)
}
else if (sum(!is.na(label_scheme[[col_select]])) == 0) {
col_select <- "Sample_ID"
warning("No samples under column '", col_select, "`. ",
"Used column `Sample_ID` instead.", call. = FALSE)
}
col_select
}
#' Parses file name.
#'
#' @param filename A file name.
#' @param dat_dir A working directory.
#' @param must_exists Logical; if TRUE, the file must be present.
parse_filename <- function (filename, dat_dir, must_exists = FALSE)
{
file <- file.path(dat_dir, filename)
if (must_exists && !file.exists(file))
stop(filename, " not found under '", dat_dir, "'.", call. = FALSE)
if (!grepl("\\.", filename))
stop("File name has no extension: ", filename)
fn_suffix <- gsub("^.*\\.([^\\.]*)$", "\\1", filename)
fn_prefix <- gsub("\\.[^\\.]*$", "", filename)
if (nchar(fn_prefix) <= 0L)
stop("File name has no base: ", filename)
list(fn_prefix = fn_prefix, fn_suffix = fn_suffix)
}
#' Subsets Bruker's MGF data
#'
#' Applied after the fixes of TITLE lines.
#'
#' @param file A file name
#' @param begin_offset The number of lines before a BEGIN line.
#' @param charge_offset The number lines after a BEGIN line to a following
#' CHARGE line.
#' @param topn_ms2ions Top-n MS2 ions to be retained.
subsetBrukerMGF <- function (file, begin_offset = 5L, charge_offset = 5L,
topn_ms2ions = Inf)
{
message("Processing: ", file)
lines <- readLines(file)
begins <- .Internal(which(stringi::stri_startswith_fixed(lines, "BEGIN IONS")))
ends <- .Internal(which(stringi::stri_endswith_fixed(lines, "END IONS")))
hdrs <- 1:(begins[1]-begin_offset-1L)
z_lns <- lines[begins+charge_offset]
oks <- grepl("^CHARGE", z_lns) & (z_lns != "CHARGE=1+")
b_oks <- begins[oks] - begin_offset
e_oks <- ends[oks]
ranges <- mapply(function (x, y) x:y, b_oks, e_oks, SIMPLIFY = TRUE)
ranges <- do.call(`c`, ranges)
ranges <- c(hdrs, ranges)
lines <- lines[ranges]
rm(list = c("begins", "ends", "b_oks", "e_oks", "oks", "ranges", "z_lns"))
if (is.infinite(topn_ms2ions)) {
writeLines(lines, file)
return(NULL)
}
pat_mgf <- mzion:::find_mgf_type(file)
type_mgf <- pat_mgf$type
n_bf_begin <- pat_mgf$n_bf_begin
n_spacer <- pat_mgf$n_spacer
n_hdr <- pat_mgf$n_hdr
n_to_pepmass <- pat_mgf$n_to_pepmass
n_to_title <- pat_mgf$n_to_title
n_to_scan <- pat_mgf$n_to_scan
n_to_rt <- pat_mgf$n_to_rt
n_to_charge <- pat_mgf$n_to_charge
sep_ms2s <- pat_mgf$sep_ms2s
nfields_ms2s <- pat_mgf$nfields_ms2s
sep_pepmass <- pat_mgf$sep_pepmass
nfields_pepmass <- pat_mgf$nfields_pepmass
raw_file <- pat_mgf$raw_file
begins <- .Internal(which(stringi::stri_startswith_fixed(lines, "BEGIN IONS")))
ends <- .Internal(which(stringi::stri_endswith_fixed(lines, "END IONS")))
## MS2
# (-1L: one line above "END IONS")
ms2s <- mapply(function (x, y) lines[(x + n_hdr) : (y - 1L)],
begins, ends, SIMPLIFY = FALSE, USE.NAMES = FALSE)
ms2s <- lapply(ms2s, stringi::stri_split_fixed, pattern = sep_ms2s,
n = nfields_ms2s, simplify = TRUE)
ms2_moverzs <- lapply(ms2s, function (x) as.numeric(x[, 1]))
# not as.integer; intensity may be > .Machine$integer.max
ms2_ints <- lapply(ms2s, function (x) as.numeric(x[, 2]))
rm(list = c("ms2s"))
mz_n_int <- mzion:::sub_mgftopn(ms2_moverzs = ms2_moverzs,
ms2_ints = ms2_ints,
topn_ms2ions = topn_ms2ions,
min_ms2mass = 0L,
max_ms2mass = 5000L)
ms2_moverzs <- mz_n_int[["ms2_moverzs"]]
ms2_ints <- mz_n_int[["ms2_ints"]]
ms2_out <- mapply(function (x, y) paste0(x, "\t", y, "\t"),
ms2_moverzs, ms2_ints, SIMPLIFY = FALSE)
aboves <- begins - (n_bf_begin + 1L)
belows <- begins + (n_hdr - 1L)
ms1_out <- mapply(function (x, y) lines[x:y], aboves, belows, SIMPLIFY = FALSE)
ans <- mapply(function (m1, m2) c(m1, m2, "END IONS"), ms1_out, ms2_out)
ans <- unlist(ans)
writeLines(c(lines[hdrs], ans), file)
}
#' Parallel subsetBrukerMGF
#'
#' @param filepath A file path to MGF.
#' @param n_cores The number of CPU cores.
#' @inheritParams subsetBrukerMGF
#' @export
msubsetBrukerMGF <- function (filepath, begin_offset = 5L, charge_offset = 5L,
topn_ms2ions = Inf, n_cores = 1L)
{
files <- list.files(filepath, pattern = "\\.mgf$", full.names = TRUE,
recursive = TRUE)
len <- length(files)
if (!len)
stop("No MGF files found.")
if (n_cores > 1L)
n_cores <- min(parallel::detectCores(), n_cores, len)
if (n_cores > 1L) {
cl <- parallel::makeCluster(getOption("cl.cores", n_cores))
parallel::clusterApply(cl, files, subsetBrukerMGF,
begin_offset = begin_offset,
charge_offset = charge_offset,
topn_ms2ions = topn_ms2ions)
parallel::stopCluster(cl)
}
else
lapply(files, subsetBrukerMGF, begin_offset = begin_offset,
charge_offset = charge_offset, topn_ms2ions = topn_ms2ions)
}
#' Reprocesses of Bruker MGF files.
#'
#' To be deleted
#'
#' @param file A file name with prepending path.
#' @examples
#' \donttest{
#' filepath = "F:/timsTOF/mgf"
#' mprocBrukerMGF(filepath)
#' }
procBrukerMGF_v1 <- function (file)
{
message("Processing: ", file)
lines <- readLines(file)
fi <- local({
hdr <- lines[1:50L]
ln <- hdr[grepl("###.*\\.d$", hdr)]
nm <- gsub("###\t(.*\\.d$)", "\\1", ln)
paste0("File:", nm)
})
rows <- which(stringi::stri_startswith_fixed(lines, "TITLE="))
tits <- lapply(rows, function (i)
gsub("^([^,]*?), (.*)", paste("\\1", fi, "\\2", sep = ", "), lines[i]))
lines[rows] <- tits
writeLines(unlist(lines), file)
}
#' Reprocesses of Bruker MGF files.
#'
#' @param file A file name with prepending path.
#'
#' @examples
#' \donttest{
#' filepath = "F:/timsTOF/mgf"
#' mprocBrukerMGF(filepath)
#' }
procBrukerMGF <- function (file)
{
message("Processing: ", file)
lines <- readLines(file)
fi <- local({
hdr <- lines[1:50L]
ln <- hdr[grepl("###.*\\.d$", hdr)]
nm <- gsub("###\t(.*\\.d$)", "\\1", ln)
paste0("~File:", nm, "~")
})
rows <- which(stringi::stri_startswith_fixed(lines, "TITLE="))
tits <- lapply(rows, function (i)
gsub("^([^,]*?),[ ]{0,1}(.*)", paste("\\1", fi, "\\2", sep = ", "), lines[i]))
lines[rows] <- tits
writeLines(unlist(lines), file)
}
#' Batch-reprocessing of Bruker MGF files.
#'
#' @param filepath A file path to MGF.
#' @param n_cores The number of CPU cores.
#' @export
mprocBrukerMGF <- function (filepath, n_cores = 1L)
{
files <- list.files(filepath, pattern = "\\.mgf$", full.names = TRUE,
recursive = TRUE)
len <- length(files)
if (!len)
stop("No MGF files found.")
if (n_cores > 1L)
n_cores <- min(parallel::detectCores(), n_cores, len)
if (n_cores > 1L) {
cl <- parallel::makeCluster(getOption("cl.cores", n_cores))
parallel::clusterApply(cl, files, procBrukerMGF)
parallel::stopCluster(cl)
}
else
lapply(files, procBrukerMGF)
}
#' Fix the \code{File} field in \code{Title} lines.
#'
#' The first tilde in the \code{Title} lines needs to before \code{File}.
#'
#' @param file A file name with prepending path.
fixBrukerMGF <- function (file)
{
lines <- readLines(file)
rows <- which(stringi::stri_startswith_fixed(lines, "TITLE="))
tits <- lapply(rows, function (i) gsub("File:~", "~File:", lines[i]))
lines[rows] <- tits
writeLines(unlist(lines), file)
}
#' Batch-fixing of Bruker MGF files.
#'
#' The first tilde in the \code{Title} lines needs to before \code{File}.
#'
#' @param filepath A file path to MGF.
#' @param n_cores The number of CPU cores.
#' @export
mfixBrukerMGF <- function (filepath, n_cores = 1L)
{
files <- list.files(filepath, pattern = "\\.mgf$", full.names = TRUE,
recursive = TRUE)
len <- length(files)
if (!len)
stop("No MGF files found.")
if (n_cores > 1L)
n_cores <- min(parallel::detectCores(), n_cores, len)
if (n_cores > 1L) {
cl <- parallel::makeCluster(getOption("cl.cores", n_cores))
parallel::clusterApply(cl, files, fixBrukerMGF)
parallel::stopCluster(cl)
}
else
lapply(files, fixBrukerMGF)
}
#' Checks lengths of aesthetics.
#'
#' The length between metadata and manual aesthetics.
#'
#' @param label_scheme A metadata.
#' @param x A column name in metadata.
#' @param vals Values of manual aesthetics.
#' @param aes The name of an aesthetics.
check_aes_length <- function (label_scheme = NULL, x = "Size",
aes = "size_manual", vals = c(3, 5))
{
len_1 <- length(unique(label_scheme[[x]]))
len_2 <- length(vals)
if (len_1 != len_2)
stop("Unequal lengths: metadata ", x, " = ", len_1, ", ", aes, " = ", len_2)
}
#' Adds column \code{raw_file}.
#'
#' @param df A PSM table.
add_col_rawfile <- function (df)
{
if ("raw_file" %in% names(df)) {
df <- df %>%
dplyr::rename(RAW_File = raw_file) %>%
dplyr::mutate(RAW_File = gsub("\\.raw$", "", RAW_File)) %>%
dplyr::mutate(RAW_File = gsub("\\.d$", "", RAW_File))
df <- df %>%
dplyr::mutate(pep_scan_title = gsub("\\\\", "/", pep_scan_title))
}
else {
df <- df %>%
dplyr::mutate(pep_scan_title = gsub("\\\\", "/", pep_scan_title)) %>%
dplyr::mutate(RAW_File = pep_scan_title) %>%
dplyr::mutate(RAW_File = gsub("^.*/([^/]*)\\.raw[\\\"]{0,1}; .*", "\\1",
RAW_File)) %>%
dplyr::mutate(RAW_File = gsub("^.*/([^/]*)\\.d[\\\"]{0,1}; .*", "\\1",
RAW_File))
}
# (with some refseq_acc)
df <- df %>%
dplyr::mutate(prot_acc = gsub("\\.[0-9]*$", "", prot_acc))
}
#' Loads psmC.txt
#'
#' @param file The file name of \code{psmC[...].txt}.
load_psmC <- function(file = NULL, ...)
{
dat_dir <- get_gl_dat_dir()
base_name <- gsub("\\.txt$", "", file)
dfC <- suppressWarnings(
readr::read_tsv(file.path(dat_dir, file),
col_types = cols(
prot_acc = col_character(),
prot_issig = col_logical(),
# prot_isess = col_logical(),
# prot_tier = col_integer(),
# prot_hit_num = col_integer(),
# prot_family_member = col_integer(),
prot_es = col_number(),
prot_es_co = col_number(),
pep_seq = col_character(),
pep_n_ms2 = col_integer(),
pep_scan_title = col_character(),
pep_exp_mz = col_number(),
pep_exp_mr = col_number(),
pep_exp_z = col_character(),
pep_calc_mr = col_number(),
pep_delta = col_number(),
pep_tot_int = col_number(),
pep_ret_range = col_number(),
pep_scan_num = col_character(), # timsTOF
pep_mod_group = col_integer(),
pep_frame = col_integer(),
pep_fmod = col_character(),
pep_vmod = col_character(),
pep_isdecoy = col_logical(),
pep_ivmod = col_character(),
pep_len = col_integer(),
pep_issig = col_logical(),
pep_score = col_double(),
pep_expect = col_double(),
pep_rank = col_integer(),
pep_locprob = col_double(),
pep_locdiff = col_double(),
# pep_rank_nl = col_integer(),
# pep_literal_unique = col_logical(),
# pep_razor_unique = col_logical(),
raw_file = col_character(), ),
show_col_types = FALSE)
)
ans <- dfC %>%
add_col_rawfile() %>%
dplyr::select(c("pep_seq", "pep_tot_int", "pep_ret_range", "pep_scan_num",
# "pep_scan_title",
"pep_n_ms2", "pep_exp_mz", "pep_exp_mr", "pep_delta",
# "pep_calc_mr", "pep_mod_group", "pep_isdecoy", "pep_len",
"pep_issig", "pep_score", "pep_locprob", "pep_locdiff",
"pep_rank", "pep_expect",
"RAW_File", "pep_ivmod", "pep_fmod", "pep_vmod",
"pep_exp_z", )) %>%
dplyr::mutate(I000 = pep_tot_int) %>%
dplyr::mutate(R000 = I000/I000,
R000 = ifelse(is.infinite(R000), NA_real_, R000)) %>%
dplyr::mutate(dat_file = base_name)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.