Nothing
#' Identify All Biomolecules for Use in Normalization
#'
#' Selects biomolecules for normalization via choosing all biomolecules
#' currently in the data
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#'
#' @details This function returns the subset of all biomolecules. These will be
#' used for normalization.
#'
#' @return Character vector containing all biomolecules.
#'
#' @author Kelly Stratton
#'
all_subset <- function(e_data, edata_id) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# subset to all cases
peps <- as.character(e_data[, edata_id_ind])
if (length(peps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(peps)
}
#' Identify Biomolecules from the Top L Order Statistics for Use in
#' Normalization
#'
#' Select biomolecules for normalization via the method of the top L order
#' statistics (LOS)
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the column giving the
#' peptide, protein, lipid, or metabolite identifier. Usually obtained by
#' calling \code{attr(omicsData, "cnames")$edata_cname}.
#' @param L numeric value between 0 and 1, indicating the top proportion of
#' biomolecules to be retained (default value 0.05)
#'
#' @details The biomolecule abundances of the top \code{L} order statistics are
#' identified and returned. Specifically, for each sample, the biomolecules with
#' the top \code{L} proportion of highest absolute abundance are retained, and
#' the union of these biomolecules is taken as the subset identified.
#'
#' @return Character vector containing the biomolecules belonging to the subset.
#'
#' @author Kelly Stratton, Lisa Bramer
#'
los <- function(e_data, edata_id, L = .05) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
n_samps <- ncol(e_data[, -edata_id_ind])
n_peps <- nrow(e_data[, -edata_id_ind])
peps <- e_data[, edata_id_ind]
row.names(e_data) <- peps
mydata <- e_data[, -edata_id_ind]
# get number of peps to keep from L that was provided
num_kp <- round(L * n_peps)
kp <- c()
# for each sample, ... #
for (i in 1:n_samps) {
# rank the features from highest to lowest (NAs are low) #
cur_data <- mydata[, i]
names(cur_data) <- peps
# keep the top L percent of the features (excluding NAs, if NAs are included)
ordered_data <- sort(abs(cur_data), decreasing = TRUE) # absolute value!
# take the union of those features - that is the subset to return #
kp <- c(kp, names(ordered_data)[1:num_kp])
}
kp <- unique(kp)
if (length(kp) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(kp)
}
#' Identify Biomolecules from the Proportion Present (PPP) for Use in
#' Normalization
#'
#' Selects biomolecules for normalization via the method of percentage of the
#' peptides (or proteins, metabolites, etc.) present (PPP)
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#' @param proportion numeric value between 0 and 1, indicating the proportion at
#' or above which a biomolecule must be present across all samples in order to be
#' retained (default value 0.5)
#'
#' @details Biomolecules present across \code{proportion} samples are designated as
#' PPP.
#'
#' @return Character vector containing the biomolecules belonging to the PPP
#' subset.
#'
#' @author Kelly Stratton
#'
ppp <- function(e_data, edata_id, proportion = 0.5) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# get column of proportion present #
mydata_pct_present <- rowSums(!is.na(e_data[, -edata_id_ind])) /
ncol(e_data[, -edata_id_ind])
# which features have proportion present above the value of "proportion" #
inds <- which(mydata_pct_present >= proportion)
subset_peps <- as.character(e_data[inds, edata_id_ind])
if (length(subset_peps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(subset_peps)
}
#' Identify Proportion of Peptides Present (PPP) and Rank Invariant Peptides
#' (RIP) for Use in Normalization
#'
#' Selects biomolecules for normalization via the method of proportion of
#' biomolecules present and rank invariant biomolecules (ppp_rip)
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#' @param fdata_id character string indicating the name of the sample column
#' name in f_data.
#' @param groupDF data.frame created by \code{group_designation} with columns
#' for sample.id and group. If two main effects are provided the original main
#' effect levels for each sample are returned as the third and fourth columns
#' of the data.frame.
#' @param alpha numeric p-value threshold, above which the biomolecules are retained
#' as rank invariant (default value 0.25)
#' @param proportion numeric value between 0 and 1, indicating the percentage at
#' or above which a biomolecule must be present across all samples in order to be
#' retained (default value 0.5)
#'
#' @details Biomolecules present across \code{proportion} samples are subjected to a
#' Kruskal-Wallis test (non-parametric one-way ANOVA, where NAs are ignored)
#' on group membership, and those biomolecules with p-value greater than a
#' defined threshold \code{alpha} (common values include 0.1 or 0.25) are
#' retained as rank-invariant biomolecules.
#'
#' @return Character vector containing the biomolecules belonging to the ppp_rip subset.
#'
#' @author Kelly Stratton
#'
ppp_rip <- function(e_data, edata_id, fdata_id, groupDF, alpha = 0.2, proportion = 0.5) {
samp_id = fdata_id
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# subset to features present in at least proportion samples
peps <- e_data[, edata_id_ind]
# get column of proportion present #
mydata_pct_present <- rowSums(!is.na(e_data[, -edata_id_ind])) /
ncol(e_data[, -edata_id_ind])
# which features have proportion present above the value of "proportion" #
inds <- which(mydata_pct_present >= proportion)
mydata <- e_data[inds, -edata_id_ind]
peps <- peps[inds]
# make sure groups are aligned with the columns of e_data
reorder = match(colnames(mydata), as.character(groupDF[,fdata_id]))
group_dat = as.character(groupDF[reorder,]$Group)
# conduct K-W test using kw_rcpp function
pvals = kw_rcpp(as.matrix(mydata), group_dat)
pvals = data.frame(pvals)
RIPeps <- as.character(peps[as.numeric(pvals[, 1]) > alpha &
!is.na(as.numeric(pvals[, 1]))])
if (length(RIPeps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(RIPeps)
}
#' Identify Rank-Invariant Biomolcules for Use in Normalization
#'
#' Selects biomolecules for normalization via the method of rank-invariant
#' biomolcules (RIP)
#'
#' @param e_data a \eqn{p \times n} data.frame, where \eqn{p} is the number of
#' peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#' @param fdata_id character string indicating the name of the sample column
#' name in f_data.
#' @param groupDF data.frame created by \code{group_designation} with columns
#' for sample.id and group. If two main effects are provided the original main
#' effect levels for each sample are returned as the third and fourth columns
#' of the data.frame.
#' @param alpha numeric p-value threshold, above which the biomolecules are
#' retained as rank invariant (default value 0.25)
#'
#' @details Biomolecules with complete data are subjected to a Kruskal-Wallis
#' test (non-parametric one-way ANOVA) on group membership, and those
#' biomolecules with p-value greater than a defined threshold \code{alpha}
#' (common values include 0.1 or 0.25) are retained as rank-invariant
#' biomolecules.
#'
#' @return Character vector containing the biomolecules belonging to the RIP
#' subset.
#'
#' @author Kelly Stratton
#'
rip <- function(e_data, edata_id, fdata_id, groupDF, alpha = .2) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# subset to complete cases (features with complete data)
peps <- e_data[, edata_id_ind]
inds <- which(complete.cases(e_data[, -edata_id_ind]) == TRUE)
mydata <- e_data[inds, -edata_id_ind]
peps <- peps[inds]
# added 2/6/17 iobani
group_dat = as.character(groupDF$Group[order(groupDF$Group)])
mydata <- mydata[, order(groupDF$Group)]
# conduct K-W test on un-normalized data, used kw_rcpp function
pvals = kw_rcpp(as.matrix(mydata), group_dat)
pvals = data.frame(pvals)
RIPeps <- as.character(peps[as.numeric(pvals[, 1]) > alpha &
!is.na(as.numeric(pvals[, 1]))])
if (length(RIPeps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(RIPeps)
}
#' Identify biomolecules with no missing values across samples
#'
#' Selects biomolecules that have complete rows in e_data, equivalent to 'ppp'
#' with proportion = 1.
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#'
#' @return Character vector containing the biomolecules with no missing values
#' across all samples.
#'
complete_mols <- function(e_data, edata_id) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# rows with no missing values
complete_inds <- which(complete.cases(e_data[, -edata_id_ind]) == TRUE)
# retain only character peptide names for complete rows
peps <- as.character(e_data[complete_inds, edata_id_ind])
if (length(peps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(peps)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.