#' Convert Data to Appropriate pmartRseq Class
#'
#' Converts a list object or several data.frames of rRNA (16S/ITS/18S),
#' metatranscript, or metagenomic data to an object of the class 'seqData'.
#' Objects of the class 'seqData' are lists with two obligatory components
#' \code{e_data} and \code{f_data}. An optional list component \code{e_meta}
#' is used if analysis or visualization at other levels (e.g. taxonomy) is
#' also desired.
#'
#' @param e_data a \eqn{p \times n + 1} data.frame of expression data, where
#' \eqn{p} is the number of features observed and \eqn{n} is the
#' number of samples (an additional feature identifier/name column should also
#' be present anywhere in the data.frame). Each row corresponds to data for
#' each feature.
#' @param f_data a data.frame with \eqn{n} rows. Each row corresponds to a
#' sample with one column giving the unique sample identifiers found in e_data
#' column names and other columns providing qualitative and/or quantitative
#' traits of each sample.
#' @param e_meta an optional data.frame with \eqn{p} rows. Each row corresponds
#' to a feature with one column giving identifiers (must be named the same as
#' the column in \code{e_data}) and other columns giving meta information
#' (e.g. mappings of OTU identification to taxonomy).
#' @param e_tree an optional NEXUS or Newick formatted phylogenetic tree file,
#' imported using ape::read.tree(tree_path). The OTU labels in the tree file
#' should match the OTU identifiers in the preceeding data fields.
#' @param e_seq an optional fasta formatted representation of biological
#' sequences imported using Biostrings::readDNAStringSet(fasta_path, ...).
#' Each OTU in the fasta maps to at least one sequence in the preceeding data
#' fields.
#' @param data_type character string specifying if this is 'rRNA' (for
#' 16S/ITS/18S), 'metagenomic', or 'metatranscriptomic' data.
#' @param edata_cname character string specifying the name of the column
#' containing the identifiers in \code{e_data} and \code{e_meta} (if
#' applicable).
#' @param taxa_cname optional character string specifying the name of the column
#' containing the taxonomy in \code{e_meta} (if applicable).
#' Defaults to NULL. If \code{e_meta} is NULL, then specify \code{taxa_cname}
#' as NULL.
#' @param ec_cname optional character string specifying the name of the column
#' containing the EC numbers in \code{e_meta} (if applicable).
#' Defaults to NULL. If \code{e_meta} is NULL, then specify \code{ec_cname}
#' as NULL.
#' @param gene_cname optional character string specifying the name of the column
#' containing the gene names in \code{e_meta} (if applicable).
#' Defaults to NULL. If \code{e_meta} is NULL, then specify \code{gene_cname}
#' as NULL.
#' @param fdata_cname character string specifying the name of the column
#' containing the sample identifiers in \code{f_data}.
#' @param ... further arguments
#'
#' @details Objects of class 'seqData' contain some attributes that are
#' referenced by downstream functions. These attributes can be changed from
#' their default value by manual specification. A list of these attributes as
#' well as their default values are as follows: \tabular{ll}{ data_scale \tab
#' Scale of the data provided in \code{e_data}. Acceptable values are 'log2',
#' 'log10', 'log', 'count', and 'abundance', which indicate data is log base
#' 2, base 10, natural log transformed, raw count data, and raw abundance,
#' respectively. Default values is 'count'. \cr \tab \cr data_norm \tab A
#' logical argument, specifying whether the data has been normalized or not.
#' Default value is 'FALSE'. \cr \tab \cr norm_method \tab Null if data_norm
#' is FALSE. If data_norm is TRUE, character string defining which
#' normalization method was used. Default value is 'NULL'. \cr \tab \cr
#' location_param \tab NULL if there are no location parameters from
#' normalization, otherwise a vector detailing the normalization location
#' parameters for each sample. \cr \tab \cr scale_param \tab NULL if there are
#' no scale parameters from normalization, otherwise a vector detailing the
#' normalization scale parameters for each sample. \cr \tab \cr seq_type
#' \tab Character string describing the type of sequencer (e.g. 'HiSeq').
#' Default value is 'NULL'. \cr \tab \cr db \tab Character
#' string describing which database was used to process the data (e.g.
#' "TIGR"). Default value is 'NULL'. \cr \tab \cr db_version \tab Character
#' string describing which version of the database was used. Default value is
#' 'NULL'. If db is NULL, then db_version will default to a NULL value. }
#'
#' Computed values included in the \code{data_info} attribute are as follows:
#' \tabular{ll}{ num_edata \tab The number of unique \code{edata_cname}
#' entries.\cr \tab \cr num_na \tab The number of NA observations in the
#' dataset.\cr \tab \cr frac_na \tab The prportion of \code{e_data} values
#' that are NA. \cr \tab \cr num_zero \tab The number of observations that
#' equal 0 in the dataset. \cr \tab \cr frac_zero \tab The proportion of
#' \code{e_data} values that are 0. \cr \tab \cr num_taxa \tab The number of
#' unique \code{taxa_cname} entries. \cr \tab \cr num_ec \tab The number of
#' unique \code{ec_cname} entries. \cr \tab \cr num_gene \tab The number of
#' unique \code{gene_cname} entries. \cr \tab \cr num_samps \tab The number
#' of samples that make up the columns of \code{e_data}.\cr \tab \cr meta_info
#' \tab A logical argument, specifying whether \code{e_meta} is provided.\cr
#' \tab \cr }
#'
#' @author Allison Thompson, Lisa Bramer
#'
#'
#' @export
as.seqData <- function(e_data, f_data, e_meta=NULL, edata_cname, fdata_cname, data_type, taxa_cname=NULL, ...){
.as.seqData(e_data, f_data, e_meta, e_tree, e_seq, edata_cname, fdata_cname, data_type, taxa_cname, ...)
}
## seqData ##
.as.seqData <- function(e_data, f_data, e_meta=NULL, e_tree, e_seq,
edata_cname, fdata_cname, data_type,
taxa_cname, ec_cname=NULL, gene_cname=NULL,
data_scale="count", data_norm=FALSE, norm_method=NULL,
location_param=NULL, scale_param=NULL,
seq_type=NULL, db=NULL, db_version=NULL, ...){
# initial checks #
#browser()
# names checks #
colnames(e_data) <- gsub("[^A-Za-z0-9_]","\\.",colnames(e_data))
colnames(f_data) <- gsub("[^A-Za-z0-9_]","\\.",colnames(f_data))
if(!is.null(e_meta)) colnames(e_meta) <- gsub("[^A-Za-z0-9_]","\\.",colnames(e_meta))
colnames(f_data) <- sapply(colnames(f_data), function(x) ifelse(!is.na(suppressWarnings(as.numeric(substr(x,1,1)))), paste("X",x,sep=""),x))
colnames(e_data) <- sapply(colnames(e_data), function(x) ifelse(!is.na(suppressWarnings(as.numeric(substr(x,1,1)))), paste("X",x,sep=""),x))
colnames(e_meta) <- sapply(colnames(e_meta), function(x) ifelse(!is.na(suppressWarnings(as.numeric(substr(x,1,1)))), paste("X",x,sep=""),x))
if(!is.na(suppressWarnings(as.numeric(substr(fdata_cname, 1, 1))))){
fdata_cname <- paste("X",fdata_cname,sep="")
}
fdata_cname <- gsub("[^A-Za-z0-9_]","\\.",fdata_cname)
edata_cname <- gsub("[^A-Za-z0-9_]","\\.",edata_cname)
taxa_cname <- if(!is.null(taxa_cname)) gsub("[^A-Za-z0-9_]","\\.",taxa_cname)
ec_cname <- if(!is.null(ec_cname)) gsub("[^A-Za-z0-9_]","\\.",ec_cname)
gene_cname <- if(!is.null(gene_cname)) gsub("[^A-Za-z0-9_]","\\.",gene_cname)
f_data[,fdata_cname] <- gsub("[^A-Za-z0-9_]","\\.",f_data[,fdata_cname])
f_data[,fdata_cname] <- sapply(f_data[,fdata_cname], function(x) ifelse(!is.na(suppressWarnings(as.numeric(substr(x,1,1)))), paste("X",x,sep=""),x))
# check that the OTU column exists in e_data and e_meta (if applicable) #
if (!(edata_cname %in% names(e_data))) stop(paste("OTU column ", edata_cname," not found in e_data. See details of as.seqData for specifying column names.", sep = ""))
# check that data_type is correct type #
if(!(tolower(data_type) %in% c("rrna","metag","metagenome","metagenomic","metat","metatranscript","metatranscriptome","metatranscriptomic"))){
stop(paste("Data type ",data_type," is not supported - must be one of 'rRNA', 'metagenomic', or 'metatranscriptomic'.",sep=""))
}
# standardize data_type for metagenomic data #
if(tolower(data_type) %in% c("metag","metagenome","metagenomic")){
data_type = "metagenomic"
}
# standardize data_type for metatranscriptomic data #
if(tolower(data_type) %in% c("metat","metatranscript","metatranscriptome","metatranscriptomic")){
data_type = "metatranscriptomic"
}
# check that edata_cname is in e_meta, if e_meta is provided #
if (!is.null(e_meta)) {
if (!(edata_cname %in% names(e_meta))) stop(paste("OTU column ", edata_cname," not found in e_meta. Column names for OTUs must match for e_data and e_meta. See details of as.seqData for specifying column names.", sep = ""))
}
# if e_meta is NULL set emeta_cnames to NULL #
if (is.null(e_meta)){
taxa_cname = NULL
ec_cname = NULL
gene_cname = NULL
}
# if e_meta is not NULL check that the emeta_cnames (taxa_cname, ec_cname, gene_cname) columns are found, if given #
if (!is.null(e_meta)){
# check taxa_cname
if (!is.null(taxa_cname)){
if (!(taxa_cname %in% names(e_meta))) stop(paste("Taxonomy column ", taxa_cname, " not found in e_meta. See details of as.seqData for specifying column names.", sep = "") )
}
# check ec_cname
if (!is.null(ec_cname)){
if (!(ec_cname %in% names(e_meta))) stop(paste("Enzyme commission number column ", ec_cname, " not found in e_meta. See details of as.seqData for specifying column names.", sep = "") )
}
# check gene_cname
if (!is.null(gene_cname)){
if (!(gene_cname %in% names(e_meta))) stop(paste("Gene column ", gene_cname, " not found in e_meta. See details of as.seqData for specifying column names.", sep = "") )
}
}
# check that the Sample column name is in f_data column names #
if (!(fdata_cname %in% names(f_data))) stop(paste("Sample column ", fdata_cname, " not found in f_data. See details of as.seqData for specifying column names.", sep = ""))
# check that all samples in e_data are present in f_data #
edat_sampid = which(names(e_data) == edata_cname)
samps.miss = sum(!(names(e_data[,-edat_sampid]) %in% f_data[,fdata_cname]))
if( samps.miss > 0) stop(paste( samps.miss, " samples from e_data not found in f_data", sep = ""))
# check for any extra samples in f_data than in e_data - necessary to remove before group_designation function #
if(any(!(f_data[,fdata_cname] %in% names(e_data)))){
f_data <- f_data[-which(!(f_data[,fdata_cname] %in% names(e_data))),]
}
# check that f_data has at least 2 columns #
if(ncol(f_data) < 2) stop("f_data must contain at least 2 columns")
# if e_meta is provided, check that all OTUs in e_data occur in e_meta #
if(!is.null(e_meta)){
if(sum(!(e_data[,edata_cname] %in% e_meta[,edata_cname])) > 0 ) stop("Not all OTUs in e_data are present in e_meta")
}
# if e_meta is provided, remove any extra features that were provided #
if(!is.null(e_meta)){
if(any(!(e_meta[,edata_cname] %in% e_data[,edata_cname]))){
e_meta <- e_meta[-which(!(e_meta[,edata_cname] %in% e_data[,edata_cname])),]
}
}
# check that data_scale is one of the acceptable options #
if(!(data_scale %in% c('log2', 'log10', 'log', 'count', 'abundance'))) stop(paste(data_scale, " is not a valid option for 'data_scale'. See details of as.seqData for specifics.", sep=""))
# store results #
seqObj = list(e_data = e_data, f_data = f_data, e_meta = e_meta)
# #----check for unhallowed characters in column names----#
# names(seqObj$f_data) <- make.names(names(seqObj$f_data))
# names(seqObj$e_data) <- make.names(names(seqObj$e_data))
# set column name attributes #
attr(seqObj, "cnames") = list(edata_cname = edata_cname, fdata_cname = fdata_cname, taxa_cname = taxa_cname, ec_cname = ec_cname, gene_cname = gene_cname)
# count missing values in e_data #
#num_miss_obs = sum(is.na(e_data[,-which(names(e_data)==edata_cname)])) + length(which(e_data[,-which(names(e_data)==edata_cname)] == 0))
#frac_missing = mean(is.na(e_data[,-which(names(e_data)==edata_cname)]))
num_na = sum(is.na(e_data[,-which(names(e_data)==edata_cname)]))
num_zero = length(which(e_data[,-which(names(e_data)==edata_cname)] == 0))
frac_na = mean(is.na(e_data[,-which(names(e_data)==edata_cname)]))
frac_zero = length(which(e_data[,-which(names(e_data)==edata_cname)]==0)) / length(which(e_data[,-which(names(e_data)==edata_cname)]>=0))
# number of unique OTUs #
num_edata = length(unique(e_data[, edata_cname]))
# number of samples #
num_samps = ncol(e_data) - 1
if(!is.null(e_meta)){
# number of unique taxonomy that map to an OTU in e_data #
if(!is.null(taxa_cname)){
num_taxa = length(unique(e_meta[which(as.character(e_meta[, edata_cname]) %in% as.character(e_data[, edata_cname])), taxa_cname]))
}else{num_taxa = NULL}
}else{
num_taxa = NULL
}
if(!is.null(e_meta)){
# number of unique ec that map to an OTU in e_data #
if(!is.null(ec_cname)){
num_ec = length(unique(e_meta[which(as.character(e_meta[, ec_cname]) %in% as.character(e_data[, edata_cname])), ec_cname]))
}else{num_ec = NULL}
}else{
num_ec = NULL
}
if(!is.null(e_meta)){
# number of unique genes that map to an OTU in e_data #
if(!is.null(gene_cname)){
num_gene = length(unique(e_meta[which(as.character(e_meta[, edata_cname]) %in% as.character(e_data[, edata_cname])), gene_cname]))
}else{num_gene = NULL}
}else{
num_gene = NULL
}
# set data information attributes #
attr(seqObj, "data_info") = list(data_type = data_type, data_scale = data_scale, data_norm = data_norm,
norm_method = norm_method, location_param = location_param, scale_param = scale_param,
num_samps = num_samps, num_edata = num_edata, num_taxa = num_taxa, num_ec = num_ec, num_gene = num_gene,
num_na = num_na, frac_na = frac_na, num_zero = num_zero, frac_zero = frac_zero,
#num_miss_obs = num_miss_obs, frac_missing = frac_missing,
seq_type = seq_type)
# set meta data attributes #
if(!is.null(e_meta)){
attr(seqObj, "meta_info") = TRUE
}else{ attr(seqObj, "meta_info") = FALSE}
# set database attributes #
if(is.null(db)){
db_version = NULL
}else{
db_version = db_version
}
attr(seqObj, "database") = list(db = db, db_version = db_version)
# set group dataframe attribute to NULL, will be filled in after running group_designation function #
attr(seqObj, "group_DF") = NULL
# set class of list #
class(seqObj) = "seqData"
return(seqObj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.