LUMA: LCMS-Based Untargeted Metabolomics Assistant

Documented in calc_corrstat calc_cv calc_minfrac sum_features

#' @title Calculate correlation matrices for metabolite groups
#'
#' @export
#' @description Calculates the correlation matrices for metabolite groups based
#'   on the best feature within the group that belongs to the primary
#'   metabolite.
#' @param Sample.df a data frame with class info as columns.  Must contain a
#'   separate row entry for each unique sex/class combination. Must contain the
#'   columns \code{"Sex","Class","n","Endogenous"}.
#' @param Peak.list a data frame from CAMERA that has been parsed.  Should
#'   contain all output columns from XCMS and CAMERA, and additional columns
#'   from \code{match_Annotation()}, \code{calc_minfrac()} and either
#'   \code{parse_pos_results()} or \code{parse_neg_results()}.
#' @param get.mg numerical vector of metabolite groups that have more than one
#'   feature
#' @param BLANK a logical indicating whether blanks are being evaluated
#' @param IonMode a character string defining the ionization mode.
#'   Must be one of  \code{c("Positive","Negative")}.
#' @return Peak.list of class \code{'tbl_df','tbl' or 'data.frame'} with
#'   variables as columns.  Has all of the columns as the original data frame
#'   with one additional column \code{"Correlation.stat"}
#' @importFrom stats cor dist
#' @importFrom utils setTxtProgressBar
#' @importFrom flashClust flashClust
#' @examples
#' library(LUMA)
#' if(require(lcmsfishdata, quietly = TRUE)) {
#' file <- system.file('extdata','CAMERA_objects_Pos.Rdata', package =
#' "lcmsfishdata") # is case sensitive on Linux
#' load(file)
#' pspec.length <- sapply(anposGa@pspectra, function(x) length(x))
#' get.mg <- which(pspec.length > 1)
#' file2 <- system.file('extdata','Sample_Class.txt', package = "LUMA") # is
#' # case sensitive on Linux
#' Sample.df <- read.table(file2, sep = "\t", header = TRUE) #Ignore Warning message
#' Peak.list <-  lcmsfishdata::Peaklist_Pos$input_parsed
#' test <- calc_corrstat(Sample.df = Sample.df, Peak.list = Peak.list,
#' get.mg = get.mg, BLANK = FALSE, IonMode = "Positive")
#' test[["Correlation.stat"]][11:23]
#' }
calc_corrstat = function(Sample.df, Peak.list, get.mg, BLANK, IonMode) {

    ## Error check
    Peak.list.pspec <- Peak.list[which(Peak.list$metabolite_group %in% get.mg), ]
    if (sum(ifelse(colnames(Peak.list) == colnames(Peak.list.pspec), 0, 1)) != 0) {
        stop("Something is wrong with your metabolite groups. Check out from Matlab script!")
    }
    nrow(Peak.list)
    nrow(Peak.list.pspec)

    if (BLANK == FALSE) {
        sexes <- unique(paste(Sample.df$Sex, "_", sep = ""))
        sample.cols <- grep(paste("Pooled_QC_", paste(strsplit(sexes, "(?<=.[_])", perl = TRUE), collapse = "|"),
            sep = "|"), colnames(Peak.list))
    } else {
        if (IonMode == "Positive" && BLANK == TRUE) {
            sample.cols <- grep("_Pos", colnames(Peak.list), ignore.case = TRUE)
        } else {
            if (IonMode == "Negative" && BLANK == TRUE) {
                sample.cols <- grep("_Neg", colnames(Peak.list), ignore.case = TRUE)
            }
        }
    }

    corr.group <- as.matrix(unique(Peak.list.pspec[, "metabolite_group"]))
    corr.group <- sort(corr.group)

    ## Error Check
    if (sum(ifelse(corr.group != get.mg, 1, 0)) != 0) {
        stop("Something is wrong with your metabolite groups. Check out from Matlab script!")
    }
    corr.stat = vector(mode = "numeric", length = nrow(Peak.list.pspec))
    corr.df <- data.frame("Correlation.stat" = corr.stat, row.names = as.character(Peak.list.pspec$EIC_ID))
    rownames(corr.df)
    colnames(corr.df)

    total = length(get.mg)
    # i = 17 For debugging purposes
    cat("\n\nGenerating Correlation Matrices.\n\n\n")
    pb = txtProgressBar(min = 0, max = total, style = 3)
    for (i in 1:length(corr.group)) {
        my.df <- Peak.list[which(Peak.list$metabolite_group %in% get.mg[i]), ]
        colnames(my.df)
        if (nrow(my.df) == 1) {
        } else {
              my.mat <- as.matrix(my.df[, sample.cols])
              test.mat <- as.matrix(t(my.mat))
              dimnames(test.mat) <- list(colnames(my.df[, sample.cols]), my.df$EIC_ID)
              colnames(test.mat)
              res <- cor(test.mat)
              d <- dist(res)
              sim.by.hclust <- flashClust(d)
              attributes(d)
              labels(d)
              # plot(sim.by.hclust)
              attributes(sim.by.hclust)
              sim.by.hclust$merge
              sim.by.hclust$labels
              labels(d)[sim.by.hclust$order]
              corr.stat <- res[which(rowSums(res) %in% max(rowSums(res))), ]
              new.df <- as.data.frame(corr.stat, row.names = names(corr.stat))
              j <- rownames(new.df)
              corr.df[j, ] <- new.df[[1]]

              # ordered.hclust <- reorder(sim.by.hclust, wts = my.df$monoisotopic_flg, agglo.FUN = 'mean')
              # ordered.hclust$value ordered.hclust$labels

        }
        setTxtProgressBar(pb, i)
    }
    Peak.list.pspec <- cbind(Peak.list.pspec, corr.df)
    close(pb)
    return(Peak.list.pspec)
}

#' @title Calculate minimum fraction of features across sample classes
#'
#' @export
#' @description Calculates the fraction of samples within each class that
#'   contains a given feature. Only the minimum fraction across all sample
#'   classes is returned.
#' @param Sample.df a data frame with class info as columns.  Must contain a
#'   separate row entry for each unique sex and class combination. Must contain
#'   the columns \code{"Sex","Class","n","Endogenous"}.
#' @param xset4 an xcms object that has had peak picking, retention time
#'   alignment, peak grouping, and imputing missing values performed
#' @param BLANK a logical indicating whether blanks are being evaluated
#' @param Peak.list a table of class 'tbl_df',tbl' or 'data.frame' with
#'   variables as columns.  Should contain all output columns from \code{XCMS}
#'   and \code{CAMERA}, and additional columns from \code{match_Annotation()}.
#' @return data frame containing the original table with one additional column
#'   'Minfrac' at the end, followed by the CAMERA columns
#'   'isotopes','adduct','pcgroup'
#' @importFrom utils read.table write.table str head
#' @importFrom stats variable.names
#' @examples
#'   library(LUMA)
#'   if(require(lcmsfishdata, quietly = TRUE)) {
#'   file <- system.file('extdata','XCMS_objects_Pos.Rdata', package =
#'   "lcmsfishdata") # is case sensitive on Linux
#'   load(file)
#'   # case sensitive on Linux
#'   file2 <- system.file('extdata','Sample_Class.txt', package = "LUMA") # is
#'   Sample.df <- read.table(file2, sep = "\t", header = TRUE)
#'   Peak.list <- lcmsfishdata::Peaklist_Pos$From_CAMERA
#'   test <- calc_minfrac(Sample.df = Sample.df, xset4 = xset4, BLANK = FALSE,
#'   Peak.list = Peak.list)
#'   test[["MinFrac"]][11:23]
#'   }
calc_minfrac = function(Sample.df, xset4, BLANK, Peak.list) {

  if(requireNamespace("xcms", quietly = TRUE)) {

    peakSN <- xcms::peakTable(xset4, filebase=NULL, value="sn") #writes the SN peak table to file
    SN.list <- data.frame(X = rownames(peakSN),peakSN)

    if (BLANK == TRUE) {
      MinFrac.table <- NA
    } else {
      sexes <- unique(paste(Sample.df$Sex, "_", sep = ""))
      samples <- vector(mode = "character", length = length(colnames(SN.list)))
      for (i in 1:length(sexes)) {
        rows_loop <- grep(sexes[i], colnames(SN.list))
        samples[rows_loop] <- sexes[i]
      }
      sum.range.list <- SN.list[, samples %in% sexes]
      t.list<-as.data.frame(t(sum.range.list)) #transposes the SN list

      ## Grabs the unique sample ID number for each sample, if present in the filename
      bin <- variable.names(t.list, full = TRUE)
      colnames(sum.range.list)
      sample.ID <- sub("\\D*(\\d{6}).*", "\\1", colnames(sum.range.list))
      colnames(sum.range.list)

      ## Creates a new column for grouping by class based on user input
      groups <- paste(Sample.df$Sex, Sample.df$Class, sep = ";")
      groups <- strsplit(groups, split = ";")
      names(groups) <- paste(Sample.df$Sex, Sample.df$Class, sep = "_")
      group <- vector(mode = "character", length = length(colnames(sum.range.list)))
      for (i in 1:length(groups)) {
        rows_loop <- intersect(grep(groups[[i]][1],colnames(sum.range.list)),
                               grep(groups[[i]][2],colnames(sum.range.list))
        )
        group[rows_loop] <- names(groups)[i]
      }
      group <- unlist(group)
      class.n <- unique(Sample.df$n)

      if (length(class.n) > 1) {
        t.list <- cbind(group, t.list)  #combines the two dataframes together
        sn.list <- split(t.list, as.factor(group))  #Splits the data frame by a factor
        sn.list.split <- split(sn.list, sapply(sn.list, function(x) nrow(x)))

        # Create matrix to contain all of the minfrac values for each group from each list in sn.list
        min.frac.fulltable <- matrix(nrow = length(bin), ncol = sum(sapply(sn.list, function(x) length(x))))
        # j = 1 This and following line used for debugging purposes
        k = 1
        for (j in 1:length(sn.list)) {
          na_count <- sapply(as.data.frame(sn.list[j]), function(x, y) sum(length(which(is.na(x)))), y = group)  #counts the number of na values per feature across each group
          na_count <- data.frame(na_count)  #puts the list in a data frame
          for (i in 1:length(groups)) {
            rows_loop <- intersect(grep(groups[[i]][1], rownames(na_count)),
                                   grep(groups[[i]][2], rownames(na_count))
            )
            group[rows_loop] <- names(groups)[i]
          }
          group <- unlist(group)
          na_count <- cbind(group, na_count)  #combines the grouping variable with the na count values
          na.count.list <- split(na_count, as.factor(group))  #Splits the data frame into lists of data frames by the grouping variable
          str(na.count.list)

          all_count <- sapply(as.data.frame(sn.list[j]), function(x, y) length(x), y = group)  #counts the number of na values per feature across each group
          all_count <- data.frame(all_count)  #puts the list in a data frame
          str(all_count)
          for (i in 1:length(groups)) {
            rows_loop <- intersect(grep(groups[[i]][1], rownames(all_count)),
                                   grep(groups[[i]][2], rownames(all_count))
            )
            group[rows_loop] <- names(groups)[i]
          }
          group <- unlist(group)
          all_count <- cbind(group, all_count)  #combines the grouping variable with the na count values
          all.count.list <- split(all_count, as.factor(group))  #Splits the data frame into lists of data frames by the grouping variable
          str(all.count.list)

          ## The next section creates a new data matrix with the number of rows and columns equal to the number of
          ## features and groups, respectively.  The new table is called 'minfrac.fulltable_loop'
          minfrac.fulltable_loop <- matrix(nrow = length(bin), ncol = length(all.count.list))
          colnames(minfrac.fulltable_loop) <- c(names(na.count.list))  ###Names the columns for the new table as indicated
          rownames(minfrac.fulltable_loop) <- (bin)  ###Pulls out and names the table rows based on the metabolite label (I do this as a QA to make sure the loop is correctly linking the ANOVA output and metbaolite IDs

          ## this section calculates the minfrac values for each exposure class in a for loop and populates the
          ## minfrac.fulltable_loop with these values
          group.names <- names(na.count.list)
          for (i in names(na.count.list)) {
            na_loop <- na.count.list[i]
            na_loop <- do.call("rbind", na_loop)
            all_loop <- all.count.list[i]
            all_loop <- do.call("rbind", all_loop)
            min_frac_loop <- rapply(as.data.frame(na_loop$na_count), function(x, y) x/y, y = all_loop$all_count)  ##calculates the min frac values for each feature by exposure class
            min_frac_loop <- data.frame(min_frac_loop)  ##changes the format of the min frac results into a data frame
            colnames(min_frac_loop) <- paste(i, "", "min_frac")
            min_frac_loop <- min_frac_loop[-1, ]  ##Removes the value corresponding to the grouping variable
            minfrac.fulltable_loop[, i] = min_frac_loop  ##Pastes the min frac results from an exposure class into the next col of the min frac full table.
          }
          ## Code to calculate min_frac for one of the exposure classes; used to QA the loop output
          i = names(na.count.list[1])
          group.names <- names(na.count.list)
          test.na <- na.count.list[i]
          test.na <- do.call("rbind", test.na)
          test.all <- all.count.list[i]
          test.all <- do.call("rbind", test.all)
          min_frac <- rapply(as.data.frame(test.na$na_count), function(x, y) x/y, y = as.data.frame(test.all$all_count))  #Need to find a way to get rid of the subset operator $
          min_frac <- data.frame(min_frac)
          str(min_frac)
          colnames(min_frac) <- paste(i, "", "min_frac")
          min_frac <- min_frac[-1, ]
          head(min_frac)

          # Bring loop-based minfrac values out of the loop by placing them in the minfrac master table
          min.frac.fulltable[, k:sum(k - 1, ncol(minfrac.fulltable_loop))] <- minfrac.fulltable_loop

          # Update the counter to keep track of where to copy the results in the master table
          k <- k + ncol(minfrac.fulltable_loop)
        }
      } else {
        if (length(class.n) == 1) {
          t.list <- cbind(group, t.list)  #combines the two dataframes together
          sn.list <- split(t.list, as.factor(group))  #Splits the data frame by a factor

          na_count <- sapply(as.data.frame(sn.list), function(x, y) sum(length(which(is.na(x)))), y = group)  #counts the number of na values per feature across each group
          na_count <- data.frame(na_count)  #puts the list in a data frame
          # group<-unlist(strsplit(gsub('(([MF])_([RN])S_T([0482]))|.', '\\1', rownames(na_count)), '\\s+'))
          # #generates a character vector of grouping variables for each na count value
          for (i in 1:length(groups)) {
            rows_loop <- intersect(grep(groups[[i]][1], rownames(na_count)),
                                   grep(groups[[i]][2], rownames(na_count))
            )
            group[rows_loop] <- names(groups)[i]
          }
          group <- unlist(group)
          na_count <- cbind(group, na_count)  #combines the grouping variable with the na count values
          na.count.list <- split(na_count, as.factor(group))  #Splits the data frame into lists of data frames by the grouping variable
          all_count <- sapply(as.data.frame(sn.list), function(x, y) length(x), y = group)  #counts the number of na values per feature across each group
          all_count <- data.frame(all_count)  #puts the list in a data frame
          for (i in 1:length(groups)) {
            rows_loop <- intersect(grep(groups[[i]][1], rownames(all_count)),
                                   grep(groups[[i]][2], rownames(all_count))
            )
            group[rows_loop] <- names(groups)[i]
          }
          group <- unlist(group)
          all_count <- cbind(group, all_count)  #combines the grouping variable with the na count values
          all.count.list <- split(all_count, as.factor(group))  #Splits the data frame into lists of data frames by the grouping variable

          ## The next section creates a new data matrix with the number of rows and columns equal to the number of
          ## features and groups, respectively.  The new table is called 'min.frac.fulltable'
          min.frac.fulltable <- matrix(nrow = length(bin), ncol = length(all.count.list))
          colnames(min.frac.fulltable) <- c(names(na.count.list))  ###Names the columns for the new table as indicated
          rownames(min.frac.fulltable) <- (bin)  ###Pulls out and names the table rows based on the metabolite label (I do this as a QA to make sure the loop is correctly linking the ANOVA output and metbaolite IDs
          head(min.frac.fulltable)  ##A double-check to make sure the new table was created properly

          # this section calculates the minfrac values for each exposure class in a for loop and populates the
          # min.frac.fulltable with these values
          group.names <- names(na.count.list)
          for (i in names(na.count.list)) {
            na_loop <- na.count.list[i]
            na_loop <- do.call("rbind", na_loop)
            all_loop <- all.count.list[i]
            all_loop <- do.call("rbind", all_loop)
            min_frac_loop <- rapply(as.data.frame(na_loop$na_count), function(x, y) x/y, y = all_loop$all_count)  ##calculates the min frac values for each feature by exposure class
            min_frac_loop <- data.frame(min_frac_loop)  ##changes the format of the min frac results into a data frame
            colnames(min_frac_loop) <- paste(i, "", "min_frac")
            min_frac_loop <- min_frac_loop[-1, ]  ##Removes the value corresponding to the grouping variable
            min.frac.fulltable[, i] = min_frac_loop  ##Pastes the min frac results from an exposure class into the next row of the min frac full table.
          }
          # Code to calculate min_frac for one of the exposure classes; used to QA the loop output
          i = names(na.count.list[2])
          group.names <- names(na.count.list)
          test.na <- na.count.list[i]
          test.na <- do.call("rbind", test.na)
          test.all <- all.count.list[i]
          test.all <- do.call("rbind", test.all)
          min_frac <- rapply(as.data.frame(test.na$na_count), function(x, y) x/y, y = as.data.frame(test.all$all_count))  #Need to find a way to get rid of the subset operator $
          min_frac <- data.frame(min_frac)
          colnames(min_frac) <- paste(i, "", "min_frac")
          min_frac <- min_frac[-1, ]
          head(min_frac)

        } else print("Please include the number of samples in each class in the Sample Class file!")
      }
      ## Code to calculate MinFrac as the minimum fraction of samples with a feature across all exposure classes and
      ## sexes
      MinFrac.table <- matrix(nrow = length(bin), ncol = 1)  #This creates a new table to fill with one MinFrac value for each feature
      for (i in 1:length(bin)) {
        minfrac_loop <- 1 - min(min.frac.fulltable[i, ], na.rm = TRUE)
        MinFrac.table[i, ] <- minfrac_loop
      }
    }

    CAMERA.list <- Peak.list[, c("isotopes", "adduct", "pcgroup")]  #Extracts all of the CAMERA columns to a separate dataframe
    drops <- c("isotopes", "adduct", "pcgroup")
    stripped.list <- Peak.list[, !(names(Peak.list) %in% drops)]  #Removes the CAMERA columns
    stripped.list[, "MinFrac"] <- MinFrac.table  #Attaches the MInFrac column to the stripped down peak.list
    new.peak.list <- cbind(stripped.list, CAMERA.list)
    raw <- new.peak.list
    return(raw)

  } else {

    stop("You must install xcms to use calc_minfrac! See installation instructions at:
         \n\nhttps://www.bioconductor.org/packages/release/bioc/html/xcms.html\n\n\n")

  }
}

#' @title Sum features by metabolite group
#'
#' @export
#' @description Sums all features belonging to the same metabolite into a single
#'   intensity value per metabolite group per sample
#' @param Peak.list data frame. Must have column \code{"metabolite_group"}.
#'   Should contain output columns from XCMS and CAMERA. Can contain columns
#'   from functions \code{match_Annotation(), calc_minfrac(), ParseCAMERA(),
#'   plot_metgroup()}.
#' @param Sample.df a data frame with class info as columns.  Must contain a
#'   separate row entry for each unique sex/class combination. Must contain the
#'   columns \code{"Sex","Class","n","Endogenous"}.
#' @param search.par a single-row data frame with 11 variables containing
#'   user-defined search parameters. Must contain the columns
#'   \code{"ppm"},\code{"rt"},\code{"Voidrt"},\code{"Corr.stat.pos"},\code{"Corr.stat.neg"},
#'    \code{"CV"},\code{"Minfrac"}, \code{"Endogenous"},
#'   \code{"Solvent"},\code{"gen.plots"}, \code{"keep.singletons"}.
#' @param QC.id character vector specifying identifier in filename designating a
#'   Pooled QC sample.  Only the first value will be used.  Default is
#'   \code{"Pooled_QC_"}
#' @param BLANK a logical indicating whether blanks are being evaluated
#' @param IonMode a character string defining the ionization mode.  Must be one
#'   of \code{"Positive","Negative"}.
#' @return sum.range.list with the first column containing metabolite group and
#'   the rest containing sample and QC columns
#' @importFrom data.table as.data.table
#' @examples
#' library(LUMA)
#' # is case sensitive on Linux
#' file <- system.file('extdata','Search_Parameters.txt', package = "LUMA")
#' search.par <- read.table(file, sep = "\t", header = TRUE)
#' # is case sensitive on Linux
#' file2 <- system.file('extdata','Sample_Class.txt', package = "LUMA")
#' Sample.df <- read.table(file2, sep = "\t", header = TRUE)
#' Peak.list <- LUMA::Peaklist_Pos$output_parsed
#' if("metabolite_group" %in% colnames(Peak.list)) {
#'   test <- sum_features(Peak.list = Peak.list, Sample.df = Sample.df ,
#'                        search.par = search.par, BLANK = FALSE, IonMode = "Positive")
#' } else (stop("Peak.list must have a column called \"metabolite_group\""))
#' \dontrun{
#' Peak.list <- LUMA::Peaklist_Pos$Annotated
#' if("metabolite_group" %in% colnames(Peak.list)) {
#'   test <- sum_features(Peak.list = Peak.list, Sample.df = Sample.df ,
#'                        search.par = search.par, BLANK = FALSE, IonMode = "Positive")
#' } else (stop("Peak.list must have a column called \"metabolite_group\""))
#' }
sum_features = function(Peak.list, Sample.df, search.par, QC.id, BLANK, IonMode) {

    #Set Default Values
    if(missing(QC.id))
      QC.id <- "Pooled_QC_"

    mylist <- .gen_res(IonMode,search.par,Peak.list,Sample.df,BLANK,QC.id)
    Peaklist_corstat <- mylist[[1]]
    res <- mylist[[2]]
    sum.range.list <- Peaklist_corstat[sapply(res, function(x) length(x) > 0)]  #Extracts all of the sample columns for summing by metabolite group
    DT <- as.data.table(sum.range.list)  #Puts the summing columns in data table format
    sum.range.list <- DT[, lapply(.SD, sum), by = metabolite_group]  #sums features within each sample by metabolite group
    return(sum.range.list)
}

#' @title Calculate coeffient of variation (CV) by pooled QCs
#'
#' @export
#' @description Calculates the CV value across all pooled QC samples
#' @param Peak.list a table of class 'tbl_df',tbl' or 'data.frame' with
#'   variables as columns.  Should contain all output columns from \code{XCMS} and
#'   \code{CAMERA}.
#' @param QC.id character vector specifying identifier in filename designating a
#'   Pooled QC sample.  Only the first value will be used.  Default is
#'   \code{"Pooled_QC_"}
#' @examples
#' library(LUMA)
#' test <- calc_cv(Peak.list = Peaklist_Pos$From_CAMERA)
#' test[["%CV"]][11:23]
calc_cv = function(Peak.list, QC.id) {

  #Set Default Values
  if(missing(QC.id))
    QC.id <- "Pooled_QC_"

  #Sanity Check
  if(length(grep(QC.id,colnames(Peak.list))) <= 2) {
    stop("Peak.list must contain columns for at least 3 pooled QC samples!")
  }

  res <- lapply(colnames(Peak.list), function(ch) grep(QC.id, ch)) #Generates list equal in length to number of columns in Peak.list

  QC.list <- Peak.list[sapply(res, function(x) length(x) > 0)] #Subsets a tibble containing only the pooled QC sample columns

  QCsd <- apply((as.matrix(QC.list)), 1, sd) #Creates a vector of standard deviations for each metabolite/feature across all pooled QC samples

  QCmean <- rowMeans(QC.list) #Creates a vector of mean intensity values for each metabolite/feature across all pooled QC samples

  RSD <- QCsd/QCmean #Calculates the relative standard deviation vector from the mean and sd vectors

  Peak.list[, "X.CV"] <- RSD #Appends RSd's as a new column to the original Peak.list

  return(Peak.list)

}