bouquet: Filter and Augment Biological GC-MS Data

Documented in make_chemtable

#' Make chemical table
#'
#' Summarize information about each chemical, including retention time and frequency of occurence by groups. Prerequisite for the filter* functions.
#'
#' Generates chemtable, a data frame of the compounds, with the following columns:
#' \describe{
#' \item{compound}{compound name}
#' \item{RT}{mean RT}
#' \item{RT.var}{variance of RT}
#' \item{match}{mean match}
#' \item{match.var}{variance of match}
#' \item{max.floral}{maximum area in floral samples}
#' \item{mean.floral}{mean area in floral samples, including zeros}
#' \item{mean.nonzero.floral}{mean area in floral samples, excluding zeros}
#' \item{median.floral}{median area in floral samples, including zeros}
#' \item{count.X}{multiple columns, number of occurrences within types and treatment groups}
#' \item{freq.X}{multiple columns, proportion of occurrences within types and treatment groups}
#' }
#' @param gcdata peaks integrated from GC data, either in long format (peaks in rows with columns for area, match score, and retention time, as generated by `load_longdata()`) or if detailed peak information is not available, in wide format (table of peak areas with samples in rows and compounds in columns, as generated by `make_sampletable()`).
#' @param metadata the sample metadata, in the same row order as gcdata if datatype is 'wide'
#' @param datatype the format of gcdata, either 'long' (the default) or 'wide'
#' @return chemtable, a data frame of the compound
#' @examples
#' data(GCMSfloral)
#' \dontrun{chemtable <- make_chemtable(longdata, metadata)}
#' @export
make_chemtable <- function(gcdata, metadata, datatype = "long") {
  if(datatype == "long") {
    longdata <- gcdata
    sampletable <- bouquet::make_sampletable(longdata, metadata)
    chemtable <- data.frame(name=levels(longdata$name))
  } else if(datatype == "wide") {
    sampletable <- gcdata
    chemtable <- data.frame(name=colnames(sampletable))
  } else {
    stop("datatype must be /'long/' or /'wide/'")
  }
  chemtable <- within(chemtable, {
    if(datatype == "long") {
      RT <-     sapply(name, function(x) {median(longdata$RT[longdata$name==x])})
      RT.var <- sapply(name, function(x) {var(longdata$RT[longdata$name==x])})
      match <-  sapply(name, function(x) {median(longdata$match[longdata$name==x])})
      match.var <- sapply(name, function(x) {var(longdata$match[longdata$name==x])})
    }
    max.floral <-    sapply(sampletable[metadata$type=="floral",], max)
    max.ambient <-   sapply(sampletable[metadata$type=="ambient",],max)
    mean.floral <-   sapply(sampletable[metadata$type=="floral",], mean)
    mean.ambient <-  sapply(sampletable[metadata$type=="ambient",],mean)
    mean.nonzero.floral <- sapply(sampletable[metadata$type=="floral",],  function(x){mean(x[x>0])})
    mean.nonzero.ambient <-sapply(sampletable[metadata$type=="ambient",], function(x){mean(x[x>0])})
    median.floral <-   sapply(sampletable[metadata$type=="floral",], median)
    median.ambient <-  sapply(sampletable[metadata$type=="ambient",],median)
  })

  add_counts_freqs <- function(chemtable, sampletable, groups) {
    for(g in levels(groups)[levels(groups)!=""]) { #"" are probably the ambients and blanks
      chemtable[,paste0("count.",g)] <- sapply(na.omit(sampletable[groups==g,]), function(x) sum(x>0))
      chemtable[,paste0("freq.",g)] <- sapply(na.omit(sampletable[groups==g,]), function(x) sum(x>0) /  length(x))
    }
    return(chemtable)
  }

  chemtable <- add_counts_freqs(chemtable, sampletable, metadata$type)
  for(grouping in attr(metadata, "group")) {
    chemtable <- add_counts_freqs(chemtable, sampletable, metadata[,grouping])
  }

  return(chemtable)
}