#' Convert Data to Appropriate pmartRseq Class
#'
#' Converts a list object or several data.frames of rRNA-level data (16s) to an
#' object of the class 'rRNAdata'. Objects of the class 'rRNAdata' are lists
#' with two obligatory components \code{e_data} and \code{f_data}. An optional
#' list component \code{e_meta} is used if analysis or visualization at other
#' levels (e.g. taxonomy) is also desired.
#'
#' @param e_data a \eqn{p \times n + 1} data.frame of expression data, where
#' \eqn{p} is the number of accession numbers observed and \eqn{n} is the
#' number of samples (an additional feature identifier/name column should also
#' be present anywhere in the data.frame). Each row corresponds to data for
#' each accession number.
#' @param f_data a data.frame with \eqn{n} rows. Each row corresponds to a
#' sample with one column giving the unique sample identifiers found in e_data
#' column names and other columns providing qualitative and/or quantitative
#' traits of each sample.
#' @param e_meta an optional data.frame with \eqn{p} rows. Each row corresponds
#' to an OTU with one column giving OTU identifiers (must be named the same as
#' the column in \code{e_data}) and other columns giving meta information
#' (e.g. mappings of OTU identification to taxonomy).
#' @param e_tree an optional NEXUS or Newick formatted phylogenetic tree file,
#' imported using ape::read.tree(tree_path). The OTU labels in the tree file
#' should match the OTU identifiers in the preceeding data fields.
#' @param e_seq an optional fasta formatted representation of biological
#' sequences imported using Biostrings::readDNAStringSet(fasta_path, ...).
#' Each OTU in the fasta maps to at least one sequence in the preceeding data
#' fields.
#' @param edata_cname character string specifying the name of the column
#' containing the identifiers in \code{e_data} and \code{e_meta} (if
#' applicable).
#' @param emeta_cname character string specifying the name of the column
#' containing the mapped identifiers in \code{e_meta} (if applicable).
#' Defaults to NULL. If \code{e_meta} is NULL, then specify \code{emeta_cname}
#' as NULL.
#' @param fdata_cname character string specifying the name of the column
#' containing the sample identifiers in \code{f_data}.
#' @param ... further arguments
#'
#' @details Objects of class 'rRNAdata' contain some attributes that are
#' referenced by downstream functions. These attributes can be changed from
#' their default value by manual specification. A list of these attributes as
#' well as their default values are as follows: \tabular{ll}{ data_scale \tab
#' Scale of the data provided in \code{e_data}. Acceptable values are 'log2',
#' 'log10', 'log', 'count', and 'abundance', which indicate data is log base
#' 2, base 10, natural log transformed, raw count data, and raw abundance,
#' respectively. Default values is 'count'. \cr \tab \cr data_norm \tab A
#' logical argument, specifying whether the data has been normalized or not.
#' Default value is 'FALSE'. \cr \tab \cr norm_method \tab Null if data_norm
#' is FALSE. If data_norm is TRUE, character string defining which
#' normalization method was used. Default value is 'NULL'. \cr \tab \cr
#' location_param \tab NULL if there are no location parameters from
#' normalization, otherwise a vector detailing the normalization location
#' parameters for each sample. \cr \tab \cr scale_param \tab NULL if there are
#' no scale parameters from normalization, otherwise a vector detailing the
#' normalization scale parameters for each sample. \cr \tab \cr data_types
#' \tab Character string describing the type of data (e.g. 'HiSeq' or
#' 'Positive ion'). Default value is 'NULL'. \cr \tab \cr db \tab Character
#' string describing which database was used to process the data (e.g.
#' "TIGR"). Default value is 'NULL'. \cr \tab \cr db_version \tab Character
#' string describing which version of the database was used. Default value is
#' 'NULL'. If db is NULL, then db_version will default to a NULL value. }
#'
#' Computed values included in the \code{data_info} attribute are as follows:
#' \tabular{ll}{ num_edata \tab The number of unique \code{edata_cname}
#' entries.\cr \tab \cr num_na \tab The number of NA observations in the
#' dataset.\cr \tab \cr frac_na \tab The prportion of \code{e_data} values
#' that are NA. \cr \tab \cr num_zero \tab The number of observations that
#' equal 0 in the dataset. \cr \tab \cr frac_zero \tab The proportion of
#' \code{e_data} values that are 0. \cr \tab \cr num_emeta \tab The number of
#' unique \code{emeta_cname} entries. \cr \tab \cr num_samps \tab The number
#' of samples that make up the columns of \code{e_data}.\cr \tab \cr meta_info
#' \tab A logical argument, specifying whether \code{e_meta} is provided.\cr
#' \tab \cr }
#'
#' @author Allison Thompson, Lisa Bramer
#' @seealso \code{\link{as.gDNAdata}}
#' @seealso \code{\link{as.cDNAdata}}
#'
#'
#' @export
as.rRNAdata <- function(e_data, f_data, e_meta=NULL, edata_cname, fdata_cname, emeta_cname=NULL, ...){
.as.rRNAdata(e_data, f_data, e_meta, e_tree, e_seq, edata_cname, fdata_cname, emeta_cname, ...)
}
## rRNA data ##
.as.rRNAdata <- function(e_data, f_data, e_meta=NULL, e_tree, e_seq,
edata_cname, fdata_cname, emeta_cname,
data_scale="count", data_norm=FALSE, norm_method=NULL,
location_param=NULL, scale_param=NULL,
data_types=NULL, db=NULL, db_version=NULL, ...){
# check class structures for paths and attempt read #
if (inherits(e_data, "character")) {
mess <- try({
biom_read <- biomformat::read_biom(biom_file = e_data)
otu_table <- as.matrix(biomformat::biom_data(biom_read))
# create e_data with OTU identifier
e_data = data.frame(OTU = rownames(otu_table), otu_table, stringsAsFactors = FALSE)
edata_cname <- "OTU"
# create e_meta with OTU identifier
otu_meta = biomformat::observation_metadata(biom_read)
e_meta = data.frame(OTU = row.names(otu_meta), otu_meta, stringsAsFactors = FALSE)
emeta_cname <- "OTU"
})
if (inherits(mess, "try-error")) {
stop("Biom import failed. Incorrect file path or unsupported biom format.")
}
}
if (inherits(f_data, "character")) {
mess <- try({
f = readLines(f_data)
skipLines = which.max(grepl("#", x = f[1:length(f)])) #find the last commented line and assume header info
f_data <- data.table::fread(input=paste0(f, collapse = "\n"), sep = "\t", header = TRUE, skip = skipLines - 1, data.table = FALSE)
fdata_cname = colnames(f_data)[1] # choose the sample identifier column (first for now)
}, silent = TRUE)
if (inherits(mess, "try-error")) {
stop("Sample metadata import failed. Incorrect file path or unsupported file format.")
}
}
# initial checks #
# check that the OTU column exists in e_data and e_meta (if applicable) #
if (!(edata_cname %in% names(e_data))) stop(paste("OTU column ", edata_cname," not found in e_data. See details of as.rRNAdata for specifying column names.", sep = ""))
if (!is.null(e_meta)) {
if (!(edata_cname %in% names(e_meta))) stop(paste("OTU column ", edata_cname," not found in e_meta. Column names for OTUs must match for e_data and e_meta. See details of as.rRNAdata for specifying column names.", sep = ""))
}
# if e_meta is NULL set emeta_cname to NULL #
if (is.null(e_meta)) emeta_cname = NULL
# if e_meta is not NULL check that the taxonomy column is found #
if (!is.null(e_meta)){
if (!is.null(emeta_cname)){
if (!(emeta_cname %in% names(e_meta))) stop(paste("Taxonomy column ", emeta_cname, " not found in e_meta. See details of as.rRNAdata for specifying column names.", sep = "") )
}
}
# check that the Sample column name is in f_data column names #
if (!(fdata_cname %in% names(f_data))) stop(paste("Sample column ", fdata_cname, " not found in f_data. See details of as.rRNAdata for specifying column names.", sep = ""))
# check that all samples in e_data are present in f_data #
edat_sampid = which(names(e_data) == edata_cname)
samps.miss = sum(!(names(e_data[,-edat_sampid]) %in% f_data[[fdata_cname]]))
if( samps.miss > 0) stop(paste( samps.miss, " samples from e_data not found in f_data", sep = ""))
# check for any extra samples in f_data than in e_data - necessary to remove before group_designation function #
if(any(!(f_data[[fdata_cname]] %in% names(e_data)))){
f_data <- f_data[-which(!(f_data[[fdata_cname]] %in% names(e_data))),]
}
# check that f_data has at least 2 columns #
if(ncol(f_data) < 2) stop("f_data must contain at least 2 columns")
# if e_meta is provided, check that all OTUs in e_data occur in e_meta #
if(!is.null(e_meta)){
if(sum(!(e_data[[edata_cname]] %in% e_meta[[edata_cname]])) > 0 ) stop("Not all OTUs in e_data are present in e_meta")
}
# if e_meta is provided, remove any extra features that were provided #
if(!is.null(e_meta)){
if(any(!(e_meta[[edata_cname]] %in% e_data[[edata_cname]]))){
e_meta <- e_meta[-which(!(e_meta[[edata_cname]] %in% e_data[[edata_cname]])),]
}
}
# check that data_scale is one of the acceptable options #
if(!(data_scale %in% c('log2', 'log10', 'log', 'count', 'abundance'))) stop(paste(data_scale, " is not a valid option for 'data_scale'. See details of as.rRNAdata for specifics.", sep=""))
# if e_meta is NULL, set emeta_cname to NULL #
if(is.null(e_meta)){
emeta_cname = NULL
}
# store results #
rRNAobj = list(e_data = e_data, f_data = f_data, e_meta = e_meta)
# set column name attributes #
attr(rRNAobj, "cnames") = list(edata_cname = edata_cname, emeta_cname = emeta_cname, fdata_cname = fdata_cname)
# count missing values in e_data #
#num_miss_obs = sum(is.na(e_data[,-which(names(e_data)==edata_cname)])) + length(which(e_data[,-which(names(e_data)==edata_cname)] == 0))
#frac_missing = mean(is.na(e_data[,-which(names(e_data)==edata_cname)]))
num_na = sum(is.na(e_data[,-which(names(e_data)==edata_cname)]))
num_zero = length(which(e_data[,-which(names(e_data)==edata_cname)] == 0))
frac_na = mean(is.na(e_data[,-which(names(e_data)==edata_cname)]))
frac_zero = length(which(e_data[,-which(names(e_data)==edata_cname)]==0)) / length(which(e_data[,-which(names(e_data)==edata_cname)]>=0))
# number of unique OTUs #
num_edata = length(unique(e_data[, edata_cname]))
# number of samples #
num_samps = ncol(e_data) - 1
if(!is.null(e_meta)){
# number of unique taxonomy that map to an OTU in e_data #
if(!is.null(emeta_cname)){
num_emeta = length(unique(e_meta[which(as.character(e_meta[, edata_cname]) %in% as.character(e_data[, edata_cname])), emeta_cname]))
}else{num_emeta = NULL}
}else{
num_emeta = NULL
}
# set data information attributes #
attr(rRNAobj, "data_info") = list(data_scale = data_scale, data_norm = data_norm, norm_method = norm_method,
location_param = location_param, scale_param = scale_param,
num_samps = num_samps, num_edata = num_edata, num_emeta = num_emeta,
num_na = num_na, frac_na = frac_na, num_zero = num_zero, frac_zero = frac_zero,
#num_miss_obs = num_miss_obs, frac_missing = frac_missing,
data_types = data_types)
# set meta data attributes #
if(!is.null(e_meta)){
attr(rRNAobj, "meta_info") = TRUE
}else{ attr(rRNAobj, "meta_info") = FALSE}
# set database attributes #
if(is.null(db)){
db_version = NULL
}else{
db_version = db_version
}
attr(rRNAobj, "database") = list(db = db, db_version = db_version)
# set group dataframe attribute to NULL, will be filled in after running group_designation function #
attr(rRNAobj, "group_DF") = NULL
# set class of list #
class(rRNAobj) = "rRNAdata"
return(rRNAobj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.