Nothing
#' Reduce Dimension of Data for Exploratory Data Analysis
#'
#' For data types other than seqData, this function calculates principal
#' components using projection pursuit estimation, which implements an
#' expectation-maximization (EM) estimation algorithm when data is missing. For
#' seqData counts, a generalized version of principal components analysis for
#' non-normally distributed data is calculated under the assumption of a
#' negative binomial distribution with global dispersion.
#'
#' @param omicsData an object of the class 'pepdata', 'prodata', 'metabData',
#' 'lipidData', 'nmrData', or 'seqData', created by \code{\link{as.pepData}},
#' \code{\link{as.proData}}, \code{\link{as.metabData}},
#' \code{\link{as.lipidData}}, \code{\link{as.nmrData}}, or
#' \code{\link{as.seqData}}, respectively.
#' @param k integer number of principal components to return. Defaults to 2.
#'
#' @return a data.frame with first \code{k} principal component scores, sample
#' identifiers, and group membership for each sample (if group designation was
#' previously run on the data). The object is of class dimRes (dimension
#' reduction Result).
#'
#' @references Redestig H, Stacklies W, Scholz M, Selbig J, & Walther
#' D (2007). \emph{pcaMethods - a bioconductor package providing PCA methods
#' for incomplete data}. Bioinformatics. 23(9): 1164-7.
#'
#' Townes FW, Hicks SC, Aryee MJ, Irizarry RA (2019). \emph{Feature selection
#' and dimension reduction for single-cell RNA-seq based on a multinomial
#' model.} Genome Biol. 20, 1–16.
#'
#' Huang H, Wang Y, Rudin C, Browne EP (2022). \emph{Towards a comprehensive
#' evaluation of dimension reduction methods for transcriptomic data
#' visualization.} Communications Biology 5, 719.
#'
#' @details Any biomolecules seen in only one sample or with a variance less
#' than 1E-6 across all samples are not included in the PCA calculations. This
#' function leverages code from \code{\link[pcaMethods]{pca}} and
#' \code{\link[glmpca]{glmpca}} .
#'
#' @examplesIf requireNamespace("pmartRdata", quietly = TRUE)
#' library(pmartRdata)
#'
#' mylipid <- edata_transform(omicsData = lipid_neg_object, data_scale = "log2")
#' mylipid <- group_designation(omicsData = mylipid, main_effects = "Virus")
#' pca_lipids <- dim_reduction(omicsData = mylipid)
#'
#' \donttest{
#' myseq <- group_designation(omicsData = rnaseq_object, main_effects = "Virus")
#' pca_seq <- dim_reduction(omicsData = myseq)
#' }
#'
#' @export
#'
dim_reduction <- function(omicsData, k = 2) {
# check that omicsData is of appropriate class #
if (!inherits(omicsData, c(
"pepData", "proData", "metabData",
"lipidData", "nmrData", "seqData"
)))
stop(paste("omicsData must be an object of class 'pepdata','prodata',",
"'metabData', 'lipidData', 'nrmData', or 'seqData'.",
sep = " "
))
# check that group designation has been run #
if (!("group_DF" %in% names(attributes(omicsData))))
warning(paste("group_designation has not been run on this data and may",
"limit plotting options",
sep = " "
))
# data should be log transformed #
if (get_data_scale(omicsData) == "abundance") {
stop("omicsData must be log transformed prior to calling dim_reduction.")
}
# Check the data scale. Data must be on one of the log scales.
if (inherits(omicsData, "seqData") &&
get_data_scale(omicsData) != "counts") {
# Welcome to the pit of despair!
stop("seqData must be untransformed prior to calling dim_reduction.")
}
samp_id = attr(omicsData, "cnames")$fdata_cname
pep_id = attr(omicsData, "cnames")$edata_cname
temp_data = omicsData$e_data[, -which(names(omicsData$e_data) == pep_id)]
## check for samples seen in only one sample or no samples and remove ##
minsamps = which(rowSums(!is.na(temp_data)) < 2)
if (length(minsamps) > 0) {
temp_data = temp_data[-minsamps, ]
}
## check for near zero variance features and remove ##
minvars = which(apply(temp_data, 1, var, na.rm = TRUE) < 0.000001)
if (length(minvars) > 0) {
temp_data = temp_data[-minvars, ]
}
if (inherits(omicsData, "seqData")) {
# GLM-PCA
# https://www.bioconductor.org/packages/devel/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html#pca-plot-using-generalized-pca
## Supposed to be performed on raw counts
## Temp data looks like an e_data w/o the identifier column
pca_res <- glmpca::glmpca(temp_data, L = k, fam = "nb")
pca_ests <- pca_res$factors ### Plot from here for PC1 and PC2
colnames(pca_ests) <- paste0("PC", 1:ncol(pca_ests))
} else {
pca_res = pcaMethods::pca(
object = as.matrix(t(temp_data)),
method = "ppca",
scale = "vector",
nPcs = k
)
pca_ests = pca_res@scores[, 1:k]
}
temp_res = data.frame(SampleID = names(temp_data), pca_ests)
class(temp_res) <- "dimRes"
attr(temp_res, "group_DF") <- get_group_DF(omicsData)
if (!inherits(pca_res, "glmpca")) {
attr(temp_res, "R2") <- pca_res@R2
}
return(temp_res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.