amp: Smoothing Metabolomics Analyses

Documented in read_from_excel

# Helper function for checking integrity of pheno data
check_pheno_data <- function(x, id_prefix) {

  # Check that Injection order is included
  if (!"Injection_order" %in% colnames(x)) {
    stop('"Injection_order" not found for the samples')
  }
  # Injection order should be unique
  if (length(unique(x$Injection_order)) != nrow(x)) {
    stop("Injection_order is not unique")
  }
  # If Sample_ID is not provided explicitly, it will be created
  if (!"Sample_ID" %in% colnames(x)) {
    x$Sample_ID <- paste0(id_prefix, x$Injection_order)
    log_text("Sample ID autogenerated")
  } else {
    # Add a running index to all "QC" identifiers
    x$Sample_ID <- as.character(x$Sample_ID)
    x$Sample_ID[x$Sample_ID == "QC"] <- paste0("QC_", seq_len(sum(x$Sample_ID == "QC")))
    # After this, the Sample IDs should be unique
    if (length(unique(x$Sample_ID)) != nrow(x)) {
      stop("Sample_ID is not unique")
    }
  }
  # If QC column is not provided explicitly, attempt to create it
  if (!"QC" %in% colnames(x)) {
    qc_found <- which(sapply(x, function(y) {any(y == "QC")}))
    if (length(qc_found)) {
      x$QC <- ifelse(x[, qc_found[1]] == "QC", "QC", "Sample")
      log_text(paste("QC column generated from", colnames(x)[qc_found[1]]))
    } else {
      warning("QC column not found and can not be generated. Please create one before constructing a MetaboSet object.")
    }
  }

  x <- best_classes(x)
  rownames(x) <- x$Sample_ID
  x <- as.data.frame(dplyr::select(x, Sample_ID, dplyr::everything()))
  x
}

# Check that the position of the corner row and column is OK
check_position <- function(x, cc, cr) {
  condition <- (is.na(x[cr - 1, cc - 1])) &
    (is.numeric(type.convert(x[cr + 1, cc + 1]))) &
    (!is.na(x[cr, cc]))
  if (!condition) {
    stop("The corner row and column coordinates seem to be incorrect!")
  }

}

# Check if a vector can be safely converted to numeric
looks_numeric <- function(x) {
  stopifnot(is.atomic(x) || is.list(x)) # check if x is a vector
  num_nas <- sum(is.na(x))
  num_nas_new <- suppressWarnings(sum(is.na(as.numeric(x))))
  return(num_nas_new == num_nas)
}


# Check that all abundances look OK
check_exprs <- function(exprs_) {
  # Check that all rows are full of numbers
  non_numerics <- exprs_ %>%
    apply(1, function(x){!looks_numeric(x)})
  if (sum(non_numerics)) {
    stop(paste("Non-numeric values found in the abundances on rows", paste(which(non_numerics), collapse = ", ")))
  }
  # Convert to numeric
  exprs_ <- exprs_ %>%
    apply(2, as.numeric)
  exprs_
}

#' Read formatted Excel files
#'
#'
#' Reads data from an Excel file of the following format:
#' \itemize{
#'   \item Left side of the sheet contains information about the features, size features x feature info columns
#'   \item Top part contains sample information, size sample info variables x samples
#'   \item The middle contains the actual abundances, size features x samples
#' } See the vignette for more information.
#' This function separates the three parts from the file, and returns them in a list
#'
#' @param file path to the Excel file
#' @param sheet the sheet number or name
#' @param corner_row integer, the bottom row of sample information, usually contains data file names and feature info column names. If set to NULL, will be detected automatically.
#' @param corner_column integer or character, the corresponding column number or the column name (letter) in Excel. If set to NULL, will be detected automatically.
#' @param id_prefix character, prefix for autogenerated sample IDs, see Details
#' @param split_by character vector, in the case where all the modes are in the same Excel file, the column names of feature data used to separate the modes (usually Mode and Column)
#' @param name in the case where the Excel file only contains one mode, the name of the mode, such as "Hilic_neg"
#'
#' @return list of three data frames:
#' \itemize{
#'   \item exprs: the actual abundances, size features x samples
#'   \item pheno_data: sample information, size sample info variables x samples
#'   \item feature_data: information about the features, size features x feature info columns
#' }
#'
#' @details Only specify one of \code{split_by} and \code{name}. The feature data returned will contain a column
#' named "Split", which is used to separate features from different modes. Unless a column named "Feature_ID"
#' is found in the file, a feature ID will be generated based on the value of "Split", mass and retention time.
#' The function will try to find columns for mass and retention time by looking at a few common alternatives,
#' and throw an error if no matching column is found. Sample information needs to contain a row called "Injection_order",
#' and the values need to be unique. In addition, a possible sample identifier row needs to be named "Sample_ID",
#' and the values need to be unique, with an exception of QC samples: if there are any "QC" identifiers, they will
#' be replaced with "QC_1", "QC_2" and so on. If a "Sample_ID" row is not found, it will be created using the \code{id_prefix}
#' and injection order.
#'
#'
#' @importFrom magrittr "%>%"
#'
#' @export
read_from_excel <- function(file, sheet = 1, corner_row = NULL, corner_column = NULL, id_prefix = "ID_", split_by = NULL, name = NULL) {

  if (!requireNamespace("openxlsx", quietly = TRUE)) {
      stop("Package \"openxlsx\" needed for this function to work. Please install it.",
           call. = FALSE)
  }

  if (is.null(split_by) & is.null(name)) {
    stop("Etiher name or split_by needs to be defined, see documentation")
  } else if ((!is.null(split_by)) & (!is.null(name))) {
    stop("Only define split_by OR name, see documentation")
  }

  dada <- openxlsx::read.xlsx(file, sheet, colNames = FALSE)

  # Define excel column order A-Z, AA - ZZ
  combinations <- expand.grid(LETTERS, LETTERS)
  excel_columns <- c(LETTERS, paste0(combinations$Var2, combinations$Var1))

  # If corner coordinates are omitted, try to find them automatically
  corner_row <- corner_row %||% which(!is.na(dada[, 1]))[1]
  corner_column <- corner_column %||% which(!is.na(dada[1, ]))[1]
  # Column can be given as a character
  cc <- ifelse(is.character(corner_column),
               which(excel_columns == corner_column),
               corner_column)
  cr <- corner_row
  # Check that corner is in the right spot
  check_position(dada, cc, cr)

  # Extract sample information
  pheno_data <- as.data.frame(t(dada[1:cr, (cc+1):ncol(dada)]), stringsAsFactors = FALSE)
  colnames(pheno_data) <- gsub(" ", "_", c(dada[1:(cr-1), cc], "Datafile"))

  # If a single mode is given, datafile will indicate the mode
  if (!is.null(name)) {
    colnames(pheno_data)[ncol(pheno_data)] <- paste0(name, "_Datafile")
  }

  pheno_data <- check_pheno_data(x = pheno_data, id_prefix = id_prefix)

  # Exctract feature information
  feature_data <- dada[(cr+1):nrow(dada), 1:cc]
  colnames(feature_data) <- dada[cr, 1:cc]

  # If the file only contains one mode, add the mode name as Split column
  if (!is.null(name)) {
    feature_data$Split <- name
    split_by <- "Split"
  } else { # Multiple modes in the file, create Split column to separate modes
    feature_data <- feature_data %>%
      tidyr::unite("Split", split_by, remove = FALSE)
  }

  # Create feature ID if necessary
  if (!"Feature_ID" %in% colnames(feature_data)){
    feature_data <- name_features(feature_data = feature_data)
  }
  # Reorganise columns and change classes
  feature_data <- feature_data %>%
    dplyr::select(Feature_ID, Split, dplyr::everything()) %>%
    best_classes() %>%
    dplyr::mutate_if(is.factor, as.character)
  rownames(feature_data) <- feature_data$Feature_ID

  # Extract LC-MS measurements as matrix
  exprs_ <- dada[(cr+1):nrow(dada), (cc+1):ncol(dada)] %>%
    check_exprs()

  rownames(exprs_) <- rownames(feature_data)
  colnames(exprs_) <- rownames(pheno_data)

  return(list(exprs = exprs_, pheno_data = pheno_data, feature_data = feature_data))
}

# Helper function to search for mass and retention time column names
find_mz_rt_cols <- function(feature_data) {
  # Find mass and retention time columns
  mz_tags <- c("mass", "average mz", "average.mz", "molecularweight", "molecular weight")
  rt_tags <-  c("retention time", "retentiontime", "average rt[(]min[)]",
                "^rt$")

  mz_col <- NULL
  for (tag in mz_tags) {
    hits <- grepl(tag, tolower(colnames(feature_data)))
    if (any(hits)) {
      mz_col <- colnames(feature_data)[which(hits)[1]]
      break
    }
  }
  rt_col <- NULL
  for (tag in rt_tags) {
    hits <- grepl(tag, tolower(colnames(feature_data)))
    if (any(hits)) {
      rt_col <- colnames(feature_data)[which(hits)[1]]
      break
    }
  }
  # If not found, throw error
  if (is.null(mz_col)){
    stop(paste0("No mass to charge ratio column found - should match one of:\n",
                paste(mz_tags, collapse = ", "), " (not case-sensitive)"))
  }
  if (is.null(rt_col)){
    stop(paste0("No retention time column found - should match one of:\n",
                paste(rt_tags, collapse = ", "), " (not case-sensitive)"))
  }

  return(list(mz_col = mz_col, rt_col = rt_col))
}

# Combines mode name, mass and retention time to create a Feature ID
#' @importFrom magrittr "%>%"
name_features <- function(feature_data) {

  cols <- find_mz_rt_cols(feature_data)
  mz_col = cols$mz_col
  rt_col = cols$rt_col

  # Concatenate rounded mass and retention time
  round_mz <- as.numeric(feature_data[, mz_col]) %>% round(digits = 4) %>%
    as.character() %>% gsub("[.]", "_", .)
  round_rt <- as.numeric(feature_data[, rt_col]) %>% round(digits = 4) %>%
    as.character() %>% gsub("[.]", "_", .)
  feature_data$Feature_ID <- paste0(round_mz, "a", round_rt)

  # Add the split columns (usually column, mode and possibly tissue)

  feature_data <- feature_data %>%
    tidyr::unite("Feature_ID", c("Split", "Feature_ID"), remove = FALSE)

  feature_data
}

#' An S4 class used to represent LC-MS datasets
#'
#' MetaboSet is the main class used to represent data in the amp package.
#' It is built upon the \code{\link[Biobase]{ExpressionSet}} class from the Biobase
#' package. For more information, read the MetaboSet utility vignette.
#' In addition to the slots inherited from \code{\link[Biobase]{ExpressionSet}},
#' \code{MetaboSet} has four slots of its own. The first three slots hold special
#' column names that are stored purely for convenience, as many functions use these as
#' defaults. The fourth slot is a data frame with one row per feature that holds all
#' relevant results from the analyses.
#'
#' @slot group_col character, name of the column holding group information
#' @slot time_col character, name of the column holding time points
#' @slot subject_col character, name of the column holding subject identifiers
#' @slot results data frame, holds results of analyses
#'
#'
#' @import methods
#' @importClassesFrom Biobase ExpressionSet
MetaboSet <- setClass("MetaboSet",
                      slots = c(group_col = "character",
                                time_col = "character",
                                subject_col = "character",
                                results = "data.frame"),
                      contains = "ExpressionSet")

setValidity("MetaboSet",
            function(object) {
              if (!is.na(object@group_col) & !object@group_col %in% colnames(object@phenoData@data)) {
                paste0("Column '", object@group_col, "' not found in pheno data")
              } else if (!is.na(object@time_col) & !object@time_col %in% colnames(object@phenoData@data)) {
                paste("Column", object@time_col, "not found in pheno data")
              } else if (!is.na(object@subject_col) & !object@subject_col %in% colnames(object@phenoData@data)) {
                paste("Column", object@subject_col, "not found in pheno data")
              } else if (!identical(rownames(object@results), featureNames(object))){
                "Rownames of results do not match featureNames"
              } else if (!all(c("Injection_order", "Sample_ID", "QC") %in% colnames(pData(object)))) {
                "Pheno data should contain columns Sample_ID, QC and Injection_order"
              } else {
                x <- check_pheno_data(pData(object), id_prefix = "")
                x <- check_exprs(exprs(object))
                TRUE
              }
            })

#' Construct MetaboSet objects
#'
#' Construct MetaboSet objects from input read by read_from_excel.
#' Returns a list of MetaboSet objects, one per mode. The modes are separated by the "Split" column
#' in feature_data.
#'
#' @param exprs matrix, the feature abundances, size features x samples
#' @param pheno_data data frame, sample information, size sample info variables x samples
#' @param feature_data data frame, information about the features, size features x feature info columns
#' @param group_col character, the name of the column in pheno_data to use as the grouping variable
#' @param time_col character, the name of the column in pheno_data to use as the time variable
#' @param subject_col character, the name of the column in pheno_data to use as the subject ID variable
#'
#' @return list of MetaboSet objects
#'
#' @seealso \code{\link{read_from_excel}}
#'
#' @export
construct_MetaboSet <- function(exprs, pheno_data, feature_data,
                                group_col = NA_character_, time_col = NA_character_,
                                subject_col = NA_character_) {

  pheno_data <- Biobase::AnnotatedDataFrame(data=pheno_data)

  # Split the data by the Split column of feature data
  parts <- unique(feature_data$Split)
  obj_list <- list()
  for (part in parts) {
    fd_tmp <- Biobase::AnnotatedDataFrame(data= feature_data[feature_data$Split == part, ])
    ad_tmp <- exprs[fd_tmp$Feature_ID,]
    obj_list[[part]] <- MetaboSet(exprs = ad_tmp,
                        phenoData = pheno_data,
                        featureData = fd_tmp,
                        group_col = group_col,
                        time_col = time_col,
                        subject_col = subject_col,
                        results = data.frame(Feature_ID = fd_tmp$Feature_ID,
                                             Flag = NA_character_,
                                             row.names = rownames(fd_tmp),
                                             stringsAsFactors = FALSE))
  }

  obj_list
}


#' Write results to Excel file
#'
#' Writes all the data in a MetaboSet object to an Excel spreadsheet.
#' The format is similar to the one used to read data in, except that
#' the results from statistics are added to the right.
#'
#' @param object a MetaboSet object
#' @param file path to the file to write
#' @param ... Additional parameters passed to \code{openxlsx::write.xlsx}
#'
#' @export
write_to_excel <- function(object, file, ...) {

  if (!requireNamespace("openxlsx", quietly = TRUE)) {
      stop("Package \"openxlsx\" needed for this function to work. Please install it.",
           call. = FALSE)
  }

  # Bottom part consists of (from left to right):
  # - feature data
  # - abundance values
  # - results from statistics
  bottom <- cbind(fData(object),
                  exprs(object),
                  dplyr::select(results(object), -Feature_ID),
                  results(object)["Feature_ID"])

  # Feature ID column is duplicated on the right for convenience
  colnames(bottom)[ncol(bottom)] <- "Feature_ID2"

  # All columns must be characters to allow combination with the top block
  bottom <- bottom %>%
    dplyr::mutate_all(as.character) %>%
    rbind(colnames(.), .)
  # Top block holds the sample information
  top <- cbind(matrix(colnames(pData(object)), ncol = 1), t(pData(object)))

  # NA blocks to fill the empty space
  empty1 <- matrix(NA_character_, nrow = nrow(top),
                   ncol = ncol(fData(object)) - 1)
  empty2 <- matrix(NA_character_, nrow = nrow(top),
                   ncol = ncol(results(object)))
  top <- cbind(empty1, top, empty2)
  colnames(top) <- colnames(bottom)


  replace_idx <- (ncol(fData(object))+1):(ncol(fData(object)) + ncol(exprs(object)))
  bottom[1, replace_idx] <- top[nrow(top), replace_idx]

  # All combined
  big <- rbind(top[seq_len(nrow(top) - 1), ], bottom)

  openxlsx::write.xlsx(big, file = file, colNames = FALSE, ...)
}


# ------------ Accessors and Replacers -----------------

#' Retrieve both sample information and features
#'
#' Returns a data frame with sample information plus all features as columns.
#' The data frame thus has one row per sample.
#'
#' @param object a MetaboSet object
#'
#' @examples
#' combined_data(example_set)
#'
#'
#' @importFrom Biobase exprs pData
#' @export
setGeneric("combined_data", signature = "object",
           function(object) standardGeneric("combined_data"))

#' @describeIn MetaboSet sample information and features combined to a single data frame, one row per sample
#' @export
setMethod("combined_data", c(object = "MetaboSet"),
          function(object) {
            cbind(pData(object), t(exprs(object)))
          })


# group

#' @export
setGeneric("group_col", signature = "object",
           function(object) standardGeneric("group_col"))
#' @describeIn MetaboSet access and set group_col
#'
#' @export
setMethod("group_col", "MetaboSet",
          function(object) object@group_col)

#' @export
setGeneric("group_col<-", signature = "object",
           function(object, value) standardGeneric("group_col<-"))

#' @export
setMethod("group_col<-", "MetaboSet",
          function(object, value) {
            object@group_col <- value
            if (validObject(object)) {
              return(object)
            }
          })

# time
#' @export
setGeneric("time_col", signature = "object",
           function(object) standardGeneric("time_col"))

#' @describeIn MetaboSet access and set time_col
#' @export
setMethod("time_col", "MetaboSet",
          function(object) object@time_col)

#' @export
setGeneric("time_col<-", signature = "object",
           function(object, value) standardGeneric("time_col<-"))

#' @export
setMethod("time_col<-", "MetaboSet",
          function(object, value) {
            object@time_col <- value
            if (validObject(object)) {
              return(object)
            }
          })

# subject ID
#' @export
setGeneric("subject_col", signature = "object",
           function(object) standardGeneric("subject_col"))

#' @describeIn MetaboSet access and set subject_col
#' @export
setMethod("subject_col", "MetaboSet",
          function(object) object@subject_col)

#' @export
setGeneric("subject_col<-", signature = "object",
           function(object, value) standardGeneric("subject_col<-"))

#' @export
setMethod("subject_col<-", "MetaboSet",
          function(object, value) {
            object@subject_col <- value
            if (validObject(object)) {
              return(object)
            }
          })


# results from statistical tests
#' @export
setGeneric("results", signature = "object",
           function(object) standardGeneric("results"))

#' @describeIn MetaboSet access and set results
#' @export
setMethod("results", "MetaboSet",
          function(object) object@results)

#' @export
setGeneric("results<-", signature = "object",
           function(object, value) standardGeneric("results<-"))

#' @export
setMethod("results<-", "MetaboSet",
          function(object, value) {
            object@results <- value
            if (validObject(object)) {
              return(object)
            }
          })

#' Extract flags of features
#'
#' @export
setGeneric("flag", signature = "object",
           function(object) standardGeneric("flag"))

#' @describeIn MetaboSet access and set results
#' @export
setMethod("flag", "MetaboSet",
          function(object) object@results$Flag)

#' @export
setGeneric("flag<-", signature = "object",
           function(object, value) standardGeneric("flag<-"))

#' @export
setMethod("flag<-", "MetaboSet",
          function(object, value) {
            object@results$Flag <- value
            if (validObject(object)) {
              return(object)
            }
          })

#' Join results to a MetaboSet object
#'
#' Join a new data frame of results to a MetaboSet object. The data frame needs to have a column "Feature_ID".
#' This function is usually called on the results of one of the statistics functions.
#'
#' @param object a MetaboSet object
#' @param dframe a data frame with the results
#'
#' @examples
#' lm_results <- perform_lm(example_set, formula_char = "Feature ~ Group")
#' with_results <- join_results(example_set, lm_results)
#' colnames(results(with_results))
#'
#' @return a MetaboSet object with the new information added to results(object)
#'
#' @export
setGeneric("join_results", signature = c("object", "dframe"),
           function(object, dframe) standardGeneric("join_results"))

#' @describeIn MetaboSet join new information to results
#' @export
setMethod("join_results", c("MetaboSet", "data.frame"),
          function(object, dframe) {
            cols <- c("Feature_ID", setdiff(colnames(results(object)), colnames(dframe)))
            res <- dplyr::left_join(results(object)[cols],
                                                dframe,
                                                by = "Feature_ID")
            rownames(res) <- res$Feature_ID
            results(object) <- res
            object
          })


#' Join new columns to feature data
#'
#' Join a new data frame of information to feature data of a MetaboSet object. The data frame needs to have a column "Feature_ID".
#' This function is usually used internally by some of the functions in the package, but can be useful.
#'
#' @param object a MetaboSet object
#' @param dframe a data frame with the new information
#'
#' @examples
#' new_info <- data.frame(Feature_ID = featureNames(example_set), Feature_number = seq_len(nrow(example_set)))
#' with_new_info <- join_fData(example_set, new_info)
#' colnames(fData(with_new_info))
#'
#' @return a MetaboSet object with the new information added to fData(object)
#'
#' @export
setGeneric("join_fData", signature = c("object", "dframe"),
           function(object, dframe) standardGeneric("join_fData"))

#' @describeIn MetaboSet join new information to fData
#' @export
setMethod("join_fData", c("MetaboSet", "data.frame"),
          function(object, dframe) {
            fData(object) <- dplyr::left_join(fData(object),
                                              dframe,
                                              by = "Feature_ID")
            rownames(fData(object)) <- fData(object)$Feature_ID
            if (validObject(object)) {
              return(object)
            }
          })


# Subsetting that also subsets results
#' @export
setMethod("[", "MetaboSet", function(x, i, j, ..., drop = FALSE) {

  x <- callNextMethod()
  results(x) <- results(x)[i,]
  x
})

# FeatureNames also changing Feature_ID columns
#' @export
setMethod("featureNames<-", signature=signature(object="MetaboSet", value="ANY"),
                 function(object, value) {
                   fd <- featureData(object)
                   featureNames(fd) <- value
                   ad <- assayData(object)
                   featureNames(ad) <- value
                   object@featureData <- fd
                   object@assayData <- ad
                   fData(object)$Feature_ID <- value
                   results(object)$Feature_ID <- value
                   if (validObject(object)) {
                     return(object)
                   }
                 })