R/formatting-functions.R

Defines functions output_MetaData.default output_MetaData.mass_all output_MetaData.pktable output_MetaData format_MetabolomicData.default format_MetabolomicData.mass_all format_MetabolomicData.pktable format_MetabolomicData format_simca

Documented in format_MetabolomicData format_MetabolomicData.default format_MetabolomicData.mass_all format_MetabolomicData.pktable format_simca output_MetaData output_MetaData.default output_MetaData.mass_all output_MetaData.pktable

#' @title Formats Peak.list for SIMCA
#'
#' @export
#' @description Formats Peak.list for import into SIMCA
#' @param Peak.list data frame containing combined ion mode peaklist with ion mode duplicates removed.  Alternatively can be retrieved from databases.  Default is NULL
#' @param Sample.df data frame with class info as columns.  Must contain a separate row entry for each unique sex/class combination. Must contain the columns 'Sex','Class','n','Endogenous'.
#' @param Sample.data data frame with phenotype data as columns and a row for each study sample.  First column must be a unique sample identifier with the header 'CT-ID'.  Phenotype columns may vary, but must include two columns called 'Plate Number' and 'Plate Position' for determining run order.
#'    'Plate Number' must be numeric and is equivalent to batch number.  'Plate Position' must be alphanumeric and corresponds to row(alpha) and column(numeric) positions on, e.g. a 96-well plate
#' @param tbl.id character Table name to draw from database. Default is NULL
#' @param QC.id character vector specifying identifier in filename designating a Pooled QC sample.  Only the first value will be used.  Default is 'Pooled_QC_'
#' @param ... Arguments to pass parameters to database functions
#' @return NULL testing
#' @importFrom stats setNames
#' @importFrom gtools mixedorder
#' @importFrom plyr rbind.fill
#' @examples
#' library(LUMA)
#' if(require(lcmsfishdata, quietly = TRUE)) {
#'   # is case sensitive on Linux
#'   file <- system.file('extdata','Sample_Class.txt', package = "lcmsfishdata")
#'   Sample.df <- read.table(file, header = TRUE, sep = "\t")
#'  # is case sensitive on Linux
#'  file2 <- system.file('extdata','Sample_Data.csv', package = "lcmsfishdata")
#'   Sample.data <- read.table(file2, header = TRUE, sep = ",")
#'  Peak.list <- Peaklist_db$Peaklist_Normalized
#'   test <- format_simca(Peak.list = Peak.list, Sample.df = Sample.df, Sample.data = Sample.data)
#' }
format_simca = function(Peak.list = NULL, Sample.df, Sample.data, tbl.id = NULL, QC.id = "Pooled_QC_", ...) {



  ##-----------------------------------------------------------------------------------------
  ## Initialize Peaklist from database
  ##-----------------------------------------------------------------------------------------
  if (missing(Peak.list))
    Peak.list = NULL
  if (missing(tbl.id))
    tbl.id = NULL
  if (is.null(tbl.id) && is.null(Peak.list)) {
    stop("Need to specify tbl.id if using databases to retrieve Peak.list!", call. = FALSE)
  }
  if (is.null(Peak.list)) {
    Peak.list <- read_tbl(tbl.id, ...)
  }

  ##-----------------------------------------------------------------------------------------
  ## Format metabolite data.
  ##-----------------------------------------------------------------------------------------

  # Extract peak intensity data from Peak.list using the sexes as a search string.
  sexes <- unique(paste(Sample.df$Sex, "_", sep = ""))
  samples <- vector(mode = "character", length = length(colnames(Peak.list)))
  for (i in 1:length(sexes)) {
    rows_loop <- grep(sexes[i], colnames(Peak.list))
    samples[rows_loop] <- sexes[i]
  }
  res <- samples %in% sexes
  sample.peaks <- Peak.list[, res]


  ## Creates a new column for grouping by class based on user input
  groups <- paste(Sample.df$Sex, Sample.df$Class, sep = ";")  ## Generate search string for all classes
  groups <- strsplit(groups, split = ";")
  names(groups) <- paste(Sample.df$Sex, Sample.df$Class, sep = "_")
  group <- vector(mode = "character", length = length(colnames(sample.peaks)))
  for (i in 1:length(groups)) {
    rows_loop <- intersect(grep(groups[[i]][1], colnames(sample.peaks)),
                           grep(groups[[i]][2], colnames(sample.peaks))
    )
    group[rows_loop] <- names(groups)[i]
  }
  group <- unlist(group)

  # modify sample data to include the user defined exposure class
  Sample.data <- Sample.data[order(Sample.data[,1]), ]
  Sample.data <- setNames(cbind.data.frame(Sample.data[, 1], group, Sample.data[-1]), c(colnames(Sample.data[1]),
                                                                                        "Exposure Class", colnames(Sample.data[-1])))
  sample.ID <- suppressWarnings(as.numeric(sub("\\D*(\\d{6}).*", "\\1", colnames(sample.peaks))))  #pulls out the 6-digit numeric sample codes into a vector for matching against the Sample data spreadsheet
  if (all(is.na(sample.ID))) {
    ## Alternatively uses a unique alphanumeric ID provided by user
    sample.ID <- sub("\\D*(\\d{6}).*", "\\1", colnames(sample.peaks))
  }
  sample.peaks <- cbind.data.frame(Peak.list[, "MS.ID"], sample.peaks)
  # Create a new ID for each metabolite and record how many features were combined into that metabolite for
  # metadata
  temp <- as.character(sample.peaks[, 1])

  no.features <- str_count(temp, ";") + 1
  MetID <- gsub("^(.*?);.*", "\\1", temp)
  Mettag <- rep("Unidentified", length = length(MetID), mode = "character")
  Mettag[which(grepl("Annotated", temp, fixed = TRUE))] <- "Annotated"

  my_ion <- Peak.list$Ion.Mode
  my_mass <- round(Peak.list$mono_mass, digits = 5)
  my_rt <- round(Peak.list$meanRT, digits = 2)
  Metbase <- paste(my_ion,my_mass,my_rt,sep = "_")

  MetID <- paste(Metbase, Mettag, sep = "_")
  # End new ID generation

  tsample.peaks <- setNames(data.frame(t(sample.peaks[, -1])), MetID)
  # add columns for sex, sample class, and all phenotype data for each sample
  tsample.peaks <- cbind.data.frame(X = sample.ID, tsample.peaks)
  colnames(tsample.peaks)[1] = colnames(Sample.data)[1]
  tsample.peaks <- merge(Sample.data, tsample.peaks, by = colnames(tsample.peaks)[1])

  mylist <- split(tsample.peaks, f = tsample.peaks$Plate.Number)
  new.list <- lapply(mylist, function(X) X[mixedorder(X[, "Plate.Position"]), ])
  tsample.peaks <- rbind.fill(new.list)

  # make a dataframe of metabolite intensity values for pooled QC samples then transpose so the colnames are the
  # same as metabolite names: Pos_###_Annotated
  Pooled.QC <- QC.id  ## Generate search string for all sexes
  for (i in 1:length(Pooled.QC)) {
    rows_loop <- grep(Pooled.QC[i], colnames(Peak.list))
    samples[rows_loop] <- Pooled.QC[i]
  }
  QC <- samples %in% Pooled.QC
  QC.peaks <- Peak.list[, QC]
  QC.peaks <- cbind.data.frame(Peak.list[, "MS.ID"], QC.peaks)
  tQC.peaks <- setNames(data.frame(t(QC.peaks[, -1])), MetID)
  # add columns for sex, sample class, and all phenotype data for Pooled QCs, and fill with NA
  dummy.data <- setNames(data.frame(matrix(ncol = length(Sample.data), nrow = nrow(tQC.peaks))), colnames(Sample.data))
  tQC.peaks <- cbind.data.frame(dummy.data, tQC.peaks)


  # make a dataframe of all metadata columns then transpose so the colnames are the same as the metabolite
  # names: Pos_###_Annotated
  log.list <- list()
  log.list[[1]] <- res
  log.list[[2]] <- QC
  meta.peaks <- Peak.list[, !Reduce("|", log.list)]
  temp <- str_count(meta.peaks$Duplicate_EIC, ";")
  Ion.duplicate <- vector(mode = "logical", length = length(temp))
  Ion.duplicate[which(temp > 0)] = TRUE  #add flag for the metabolites detected in both ion modes
  endo <- meta.peaks[, "Endogenous_flag"]
  meta.peaks$Endogenous_flag <- as.logical(meta.peaks$Endogenous_flag)  #convert endogenous flag to a logical
  meta.peaks <- cbind.data.frame(meta.peaks[, 1:11], no.features, meta.peaks[, 12:ncol(meta.peaks)], Ion.duplicate)
  tmeta.peaks <- setNames(data.frame(t(meta.peaks[, -1])), MetID)
  dummy.data <- setNames(data.frame(matrix(ncol = length(Sample.data), nrow = nrow(tmeta.peaks))), colnames(Sample.data))
  tmeta.peaks <- cbind.data.frame(dummy.data, tmeta.peaks)
  # pheno.peaks <- Peak.list[,-grep('[MF]_',colnames(Peak.list))] colnames(pheno.peaks)

  # combine the three dataframes, first sample data, then QC data, lastly metadata
  SIMCA.data <- rbind.data.frame(tsample.peaks, tQC.peaks, tmeta.peaks)
  SIMCA.data <- cbind.data.frame(X = rownames(SIMCA.data), SIMCA.data)
  SIMCA.data[, 1] <- gsub("^X", "\\1", SIMCA.data[, 1])
  SIMCA.data <- setNames(cbind.data.frame(SIMCA.data, row.names = NULL), c("", colnames(SIMCA.data)[-1]))



}

#' @title Formatting of metabolomic data for MetaboAnalystR.
#'
#' @export
#' @description This function initializes objects that will hold the metabolite
#'   data, formats peak intensity data into one of the formats acceptable by
#'   MetaboAnalystR, and sets the metabolite data object.
#' @param mSetObj NULL
#' @param Peak.list data frame containing combined ion mode peaklist with ion
#'   mode duplicates removed.
#' @param Sample.df data frame with class info as columns.  Must contain a
#'   separate row entry for each unique sex/class combination. Must contain the
#'   columns 'Sex','Class','n','Endogenous'.
#' @param Sample.data data frame with phenotype data as columns and a row for
#'   each study sample.  First column must be a unique sample identifier with
#'   the header 'CT-ID'.  Phenotype columns may vary, but must include two
#'   columns called 'Plate Number' and 'Plate Position' for determining run
#'   order.
#' @param tbl.id character Table name to draw from database. Default is NULL
#' @param ... Arguments to pass parameters to database functions
#' @return mSetObj
#' @examples
#' library(LUMA)
#' if(require(lcmsfishdata, quietly = TRUE)) {
#' file <- system.file("extdata","Sample_Class.txt", package = "LUMA") # is case sensitive on Linux
#' Sample.df <- read.table(file, header = TRUE, sep = "\t")
#' file2 <- system.file("extdata","Sample_Data.csv", package = "LUMA") # is case sensitive on Linux
#' Sample.data <- read.table(file2, header = TRUE, sep = ",")
#' Peak.list <- Peaklist_db$Peaklist_Normalized
#' class(mSetObj) <- "pktable"
#' new_mSetObj <- format_MetabolomicData(mSetObj = mSetObj, Peak.list =
#' Peak.list, Sample.df = Sample.df, Sample.data = Sample.data)
#' new_mSetObj$dataSet$orig.cls
#'
#' class(mSetObj) <- "mass_all"
#' new_mSetObj <- format_MetabolomicData(mSetObj = mSetObj, Peak.list =
#'                                         Peak.list, Sample.df = Sample.df, Sample.data = Sample.data)
#' new_mSetObj$dataSet$orig.cls
#' }
format_MetabolomicData <- function(mSetObj, Peak.list, Sample.df, Sample.data, tbl.id, ...)
{
  UseMethod("format_MetabolomicData", mSetObj)
}

#' @rdname format_MetabolomicData
#' @export
format_MetabolomicData.pktable <- function(mSetObj, Peak.list, Sample.df, Sample.data, tbl.id, ...)
{

  ##-----------------------------------------------------------------------------------------
  ## Initialize Peaklist from database
  ##-----------------------------------------------------------------------------------------
  if (missing(Peak.list))
    Peak.list = NULL
  if (missing(tbl.id))
    tbl.id = NULL
  if (is.null(tbl.id) && is.null(Peak.list)) {
    stop("Need to specify tbl.id if using databases to retrieve Peak.list!", call. = FALSE)
  }
  if (is.null(Peak.list)) {
    Peak.list <- read_tbl(tbl.id, ...)
  }




  ##-----------------------------------------------------------------------------------------
  ## Format metabolite data.
  ##-----------------------------------------------------------------------------------------

  # Extract peak intensity data from Peak.list using the sexes as a search string.
  sexes  <-  unique(paste(Sample.df$Sex, "_",  sep = ""))    ## Generate search string for all sexes
  samples  <-  vector(mode = "character", length = length(colnames(Peak.list)))
  for (i  in  1:length(sexes))
  {
    rows_loop  <-  grep(sexes[i], colnames(Peak.list))
    samples[rows_loop]  <-  sexes[i]
  }
  res  <-  samples %in% sexes
  sample.peaks  <- Peak.list[, res]

  # Generate sample IDs
  sample.ID  <-  suppressWarnings(as.numeric(sub("\\D*(\\d{6}).*", "\\1", colnames(sample.peaks))))
  if (all(is.na(sample.ID)))
  {
    ## Alternatively uses a unique alphanumeric ID provided by user
    sample.ID <- sub("\\D*(\\d{6}).*", "\\1", colnames(sample.peaks))
  }

  ## Code to generate treatment levels (sometimes confusingly called sample classes)
  # Creates a new column for grouping by class based on user input
  groups <- paste(Sample.df$Sex, Sample.df$Class, sep = ";")  ## Generate search string for all classes
  groups <- strsplit(groups, split = ";")
  names(groups) <- paste(Sample.df$Sex, Sample.df$Class, sep = "_")
  group <- vector(mode = "character", length = length(colnames(sample.peaks)))
  for (i in 1:length(groups)) {
    rows_loop <- intersect(grep(groups[[i]][1], colnames(sample.peaks)), grep(groups[[i]][2], colnames(sample.peaks)))
    group[rows_loop] <- names(groups)[i]
  }
  group <- unlist(group)

  # Modify sample data to include the user defined exposure class
  Sample.data <- Sample.data[order(Sample.data[,1]), ]
  Sample.data <- setNames(cbind.data.frame(Sample.data[, 1], group, Sample.data[-1]), c(colnames(Sample.data[1]),
                                                                                        "Exposure Class", colnames(Sample.data[-1])))

  # Extract only the treatment level column.
  treatment.Level <- Sample.data[, 2]

  # Transpose the peak intensity data frame.
  tsample.peaks <- t(sample.peaks)

  # Build the peak intensity table for the MetaboAnalystR package.
  tsample.peaks <- cbind.data.frame(sample.ID, treatment.Level, tsample.peaks)

  ##-----------------------------------------------------------------------------------------
  ## Set the metabolomic data object.
  ##-----------------------------------------------------------------------------------------
  mSetObj$dataSet$cls.type <-  "disc"
  mSetObj$dataSet$format <-  "rowu"
  dat  <-  tsample.peaks

  #From MetaboAnalyst::Read.TextData code
  smpl.nms <-dat[,1];

  all.nms <- colnames(dat);

  facA.lbl <- all.nms[2];

  cls.lbl <- facA <- dat[,2]; # default assign facA to cls.lbl in order for one-factor analysis
  conc <- dat[,-c(1,2)];
  var.nms <- colnames(conc);

  #assign the dimension names
  rownames(conc) <- smpl.nms
  colnames(conc) <- var.nms

  mSetObj$dataSet$type.cls.lbl <- class(cls.lbl);

  mSetObj$dataSet$orig.cls  <-  mSetObj$dataSet$cls  <-  cls.lbl;
  mSetObj$dataSet$orig  <-  conc;

  return(mSetObj)
}


#' @rdname format_MetabolomicData
#' @export
format_MetabolomicData.mass_all <- function(mSetObj, Peak.list, Sample.df, Sample.data, tbl.id, ...)
{

}

#' @rdname format_MetabolomicData
#' @export
format_MetabolomicData.default <- function(mSetObj, Peak.list, Sample.df, Sample.data, tbl.id, ...)
{
  warning(paste("format_MetabolomicData does not know how to handle object of class ",
                class(mSetObj),
                "and can only be used on classes pktable and mass_all"))
}


#' @title Printing metadata as CSV files for MetaboAnalystR.
#'
#' @export
#' @description This function prints the metabolite and sample metadata to CSV files.
#' @param mSetObj NULL
#' @param Peak.list data frame containing combined ion mode peaklist with ion mode duplicates removed.
#' @param Sample.df data frame with class info as columns.  Must contain a separate row entry for each unique sex/class combination. Must contain the columns 'Sex','Class','n','Endogenous'.
#' @param Sample.data data frame with phenotype data as columns and a row for each study sample.  First column must be a unique sample identifier with the header 'CT-ID'.  Phenotype columns may vary, but must include two columns called 'Plate Number' and 'Plate Position' for determining run order.
#' @importFrom utils write.csv
#' @return mSetObj
#' @examples
#' \dontrun{
#' library(LUMA)
#' file <- system.file("extdata","Sample_Class.txt", package = "LUMA") # is case sensitive on Linux
#' Sample.df <- read.table(file, header = TRUE, sep = "\t")
#' file2 <- system.file("extdata","Sample_Data.csv", package = "LUMA") # is case sensitive on Linux
#' Sample.data <- read.table(file2, header = TRUE, sep = ",")
#' Peak.list <- Peaklist_db$Peaklist_Normalized
#' mSetObj <- NULL
#' output_MetaData(mSetObj = mSetObj, Peak.list =
#' Peak.list, Sample.df = Sample.df, Sample.data = Sample.data)
#' }
output_MetaData <- function(mSetObj, Peak.list, Sample.df, Sample.data)
{
  UseMethod("output_MetaData", mSetObj)
}

#' @rdname output_MetaData
#' @export
output_MetaData.pktable <- function(mSetObj, Peak.list, Sample.df, Sample.data)
{
  ##-----------------------------------------------------------------------------------------
  ## Output metabolite and sample metadata.
  ##-----------------------------------------------------------------------------------------

  # Determine location of peak intensity data from Peak.list using the sexes
  # as a search string.
  sexes  <-  unique(paste(Sample.df$Sex, "_",  sep = ""))    ## Generate search string for all sexes
  samples  <-  vector(mode = "character", length = length(colnames(Peak.list)))
  for (i  in  1:length(sexes))
  {
    rows_loop  <-  grep(sexes[i], colnames(Peak.list))
    samples[rows_loop]  <-  sexes[i]
  }
  res  <-  samples %in% sexes
  sample.peaks <- Peak.list[, res]

  # Apply the logical NOT operator "!" to "res" to identify the columns with the metadata.
  meta_res <- !res

  # Extract the metabolite metadata
  metabolite_metadata  <- Peak.list[, meta_res]

  # Write metabolite metadata to a CSV file
  write.csv(metabolite_metadata, "Metabolite_Metadata.csv")

  ## Code to generate the exposure class
  # Creates a new column for grouping by class based on user input
  groups <- paste(Sample.df$Sex, Sample.df$Class, sep = ";")  ## Generate search string for all classes
  groups <- strsplit(groups, split = ";")
  names(groups) <- paste(Sample.df$Sex, Sample.df$Class, sep = "_")
  group <- vector(mode = "character", length = length(colnames(sample.peaks)))
  for (i in 1:length(groups)) {
    rows_loop <- intersect(grep(groups[[i]][1], colnames(sample.peaks)), grep(groups[[i]][2], colnames(sample.peaks)))
    group[rows_loop] <- names(groups)[i]
  }
  group <- unlist(group)

  # Modify sample data to include the user defined exposure class
  Sample.data <- Sample.data[order(Sample.data[,1]), ]
  Sample.data <- setNames(cbind.data.frame(Sample.data[, 1], group, Sample.data[-1]), c(colnames(Sample.data[1]),
                                                                                        "Exposure Class", colnames(Sample.data[-1])))
  # Write the sample metadata to a CSV file.
  write.csv(Sample.data, "Sample_Metadata.csv")

}

#' @rdname output_MetaData
#' @export
output_MetaData.mass_all <- function(mSetObj, Peak.list, Sample.df, Sample.data)
  {

}

#' @rdname output_MetaData
#' @export
output_MetaData.default <- function(mSetObj, Peak.list, Sample.df, Sample.data)
{
  warning(paste("format_MetaData does not know how to handle object of class ",
                class(mSetObj),
                "and can only be used on classes pktable and mass_all"))
}
USEPA/LUMA documentation built on Aug. 29, 2020, 1:40 p.m.