R/create_mzdata.R

Defines functions create_mzdata

Documented in create_mzdata

#' Create mzdata.
#'
#' \code{create_mzdata()} pre-processes the LCMS data used for modelling in
#' gordon01.
#'
#' Initially, the function takes the raw output from xcms and removes unwanted
#' data (e.g. retention times, isotopes, peak counts etc.). Then, it creates
#' new categorical variables based on the sample information. Finally, it
#' replaces true non-detects with noise, removes poorly resolved mass features
#' and then replaces the small number of remining missing values using random
#' forest imputaion. The list below details the logic behind the missing values
#' imputation:
#' \itemize{
#' \item \strong{TRUE NON-DETECTS:}
#'  Replace values missing in one class, but not others, with a random number
#'  between zero and the minimum of the matrix (i.e. noise). To be considered a
#'  true non-detect, a class should be missing at least 60 precent of its
#'  values. Achieved with,
#'  \code{metabolomics::MissingValues(group.cutoff = 0.6)}
#'  \item \strong{POORLY RESOLVED MASS FEATURES:}
#'  Remove mass features with more than 90 percent missing values. Achieved
#'  with, \code{metabolomics::MissingValues(column.cutoff = 0.9)}
#'  \item \strong{FALSE NON DETECTS:}
#'  Remaining missing values will be computed using
#'  \code{missForest::missForest()}. Achieved with,
#'  \code{metabolomics::MissingValues(complete.matrix = FALSE)}
#' }
#'
#' @param parallel Logical indicating if missing values imputation should be run
#' in parallel. If \code{TRUE}, the default number of cores is equal to half the
#' available number of cores
#' @param seed An integer used for setting seeds of random number generation
#' @param savecsv Logical indicating if output should be saved as a \code{.csv}
#' file to the current working directory
#' @param saverda Logical indicating if a .rda file should be saved to /data
#'
#' @return Returns a dataframe of class tbl_df
#'
#' @note Using \code{parallel = TRUE} is not reproducible. Future versions of
#' this function may include support for reproducible RNG seeds when using
#' parallel processing. Although this function is exported,
#' \code{create_mzdata()} was not intended to be used outside of this package.
#'
#' @author Benjamin R. Gordon
#'
#' @seealso
#' \code{\link[phdhelpr]{missing_values}}
#' \code{\link[doMC]{registerDoMC}}
#' \code{\link[missForest]{missForest}}
#'
#' @export
#'
create_mzdata <- function(parallel = FALSE,
                          seed = 100,
                          savecsv = FALSE,
                          saverda = TRUE) {

  mzdata  <-  readr::read_csv("./data-raw/mzdata-raw.csv", na = "0")

  # remove unwanted data -------------------------------------------------------
  mzdata <- mzdata[-1]
  mzdata <- mzdata[-2:-24]
  mzdata <- mzdata[-grep("[M+1]", mzdata$isotopes, fixed = TRUE),]
  mzdata <- mzdata[-grep("[M+2]", mzdata$isotopes, fixed = TRUE),]
  mzdata <- mzdata[-grep("[M+3]", mzdata$isotopes, fixed = TRUE),]
  mzdata <- mzdata[-grep("[M+4]", mzdata$isotopes, fixed = TRUE),]
  mzdata <- mzdata[-103:-105]
  mz_names <- round(mzdata[1], 4)
  mzdata <- mzdata[-1]
  mz_names <- paste("mz", mz_names$mz, sep = "_")
  mz_names <- make.names(mz_names, unique = TRUE)
  mzdata <- tibble::as_tibble(t(mzdata), rownames = "sample_ids")
  colnames(mzdata)[2:ncol(mzdata)] <- mz_names
  sample_ids <- mzdata[1]

  # Create categorical variables -----------------------------------------------
  class <- c(rep("eCO2", 6),
             rep("eCO2eT", 6),
             rep("eCO2", 5),
             rep("eCO2eT", 5),
             rep("eCO2", 6),
             rep("eCO2eT", 6),
             rep("eCO2", 6),
             rep("eCO2eT", 6),
             rep("control", 6),
             rep("eT", 6),
             rep("control", 6),
             rep("eT", 6),
             rep("control", 6),
             rep("eT", 3),
             rep("control", 6),
             rep("eT", 6),
             rep("PBQC", 10)
             )
  class <- factor(class,
                  levels = c("control", "eT", "eCO2", "eCO2eT", "PBQC")
                  )
  day <- c(rep("day1", 12),
           rep("day14", 10),
           rep("day4", 12),
           rep("day6", 12),
           rep("day1", 12),
           rep("day14", 12),
           rep("day4", 9),
           rep("day6", 12),
           rep("PBQC", 10)
           )
  day <- factor(day,
                levels = c("day1", "day4", "day6", "day14", "PBQC")
                )
  tank <- c(rep(c("L","L","L", "R","R","R"), 2),
            rep(c("L","L","L","R","R","L","L"), 1),
            rep(c("R","R","R","L","L","L"), 10),
            rep(c("L","L","L", "R","R","R"), 2),
            rep("LR", 10)
            )
  tank <- factor(tank,
                 levels = c("L", "R", "LR"))
  rep <- c(rep(c(1, 2, 3), 5),
           rep(c(1, 3, 1, 2), 1),
           rep(c(1, 2, 3), 24),
           c(1, 10, 2:9))
  rep <- factor(rep,
                levels = (c(1:10)))
  class_day <- interaction(class,
                           day,
                           drop = TRUE,
                           sep = ".")

  # Impute noise and remove unreliable mass features ---------------------------
  mzdata <- tibble::as_tibble(cbind(class_day, mzdata[-1]))
  mzdata_filt <- phdhelpr::missing_values(mzdata,
                                          column.cutoff = 0.9,
                                          group.cutoff = 0.6,
                                          complete.matrix = FALSE,
                                          seed = seed
                                          )
  mzdata <- data.frame(sample_ids,
                     class,
                     day,
                     tank,
                     rep,
                     mzdata_filt$output
                     )
  percent_na <- round(sum(is.na(mzdata))/prod(dim(mzdata[-1:-6]))*100, 2)

  # Impute remaining missing values --------------------------------------------
  if(parallel) {
    doMC::registerDoMC()
    set.seed(seed)
    mzdata.imp <- missForest::missForest(mzdata[-1], parallelize = "variables")
  }

  else {
    set.seed(seed)
    mzdata.imp <- missForest::missForest(mzdata[-1], parallelize = "no")
  }

  mzdata <- tibble::as_tibble(cbind(sample_ids, mzdata.imp$ximp))
  mzdata <- dplyr::arrange(mzdata, class, day)

  # write data -----------------------------------------------------------------
  if(saverda) {
    save(mzdata, file = "./data/mzdata.rda", compress = "bzip2")
  }

  if(savecsv) {
    readr::write_csv(mzdata, "./data/mzdata.csv")
  }

  message("The data contained ", percent_na, "% NAs")########
  message("MissForest NRMSE: ", round(mzdata.imp$OOBerror, 4))
  mzdata
}

## Data documentation ----------------------------------------------------------

#' Preprocessed LCMS data
#'
#' A dataset of positive ESI-LCMS mz variables for coral samples subjected to
#' experimental ocean warming and acidification at Heron Island in Feb 2011.
#'
#' @format A tibble with 101 rows and 1647 variables:
#' \describe{
#'   \item{sample_ids}{ID's for each sample}
#'   \item{class}{Treatment group. One of, control, eT (eleveated temperature),
#'   eCO2 (elevated CO2), eCO2eT (elevated CO2 and temperature), PBQC}
#'   \item{day}{Days of exposure to treatment}
#'   \item{tank}{Tank membership. Either L (left) or R (right)}
#'   \item{rep}{Sample replicate number}
#'   \item{class_day}{interaction variable between class and day variables}
#' }
#' @source Benjamin R. Gordon
"mzdata"
brgordon17/coralclass documentation built on June 15, 2020, 9:21 p.m.