R/transformations.R

Defines functions mark_nas mark_qcs drop_qcs drop_flagged merge_exprs impute_rf impute_simple inverse_normalize flag_report

Documented in drop_flagged drop_qcs flag_report impute_rf impute_simple inverse_normalize mark_nas mark_qcs merge_exprs

#' Mark specified values as missing
#'
#' Replaces all values in the exprs part that equal the specified value with NA.
#' For example, vendor software might use 0 or 1 to signal a missing value,
#' which is not understood by R.
#'
#' @param object a MetaboSet object
#' @param value the value to be converted to NA
#'
#' @return MetaboSet object as the one supplied, with missing values correctly set to NA
#'
#' @examples
#' nas_marked <- mark_nas(merged_sample, value = 0)
#'
#' @export
mark_nas <- function(object, value) {
  ex <- exprs(object)
  ex[ex == value] <- NA
  exprs(object) <- ex
  object
}



#' Replace NA values in pheno data columns with "QC"
#'
#' Loops through specified columns in pheno data and replaces all NA values with "QC". Also transforms
#' the column to factor and sets "QC" as the last level.
#'
#' @param data a data frame such as pheno_data returned by \code{\link{read_from_excel}}
#' @param cols the columns to fix
#'
#' @return a data frame with the column modified
#'
#' @examples
#' # Remove QC labels first
#' pheno_data <- pData(merged_sample)
#' pheno_data$Group[pheno_data$Group == "QC"] <- NA
#' head(pheno_data$Group, 20)
#' pheno_data <- mark_qcs(pheno_data, cols = "Group")
#' head(pheno_data$Group, 20)
#'
#' @export
mark_qcs <- function(data, cols) {

  for (col in cols) {
    data[col] <- as.character(data[, col])
    data[is.na(data[, col]), col] <- "QC"
    data[col] <- factor(data[, col])
    good_levels <- levels(data[, col])
    good_levels <- c(good_levels[good_levels != "QC"], "QC")
    data[col] <- factor(as.character(data[, col]), levels = good_levels)
  }
  data
}


#' Drop QC samples
#'
#' @param object a MetaboSet object
#'
#' @return MetaboSet object as the one supplied, without QC samples
#'
#' @examples
#' dim(merged_sample)
#' noqc <- drop_qcs(merged_sample)
#' dim(noqc)
#'
#' @importFrom Biobase pData "pData<-"
#'
#' @export
drop_qcs <- function(object) {
  object <- object[, object$QC != "QC"]
  pData(object) <- droplevels(pData(object))
  object
}


#' Drop flagged features
#'
#' Removes all features that have been flagged by quality control functions.
#' Only features that do not have a flag (Flag == NA) are retained.
#'
#' @param object a MetaboSet object
#' @param all_features logical, should all features be retained? Mainly used by internal functions
#'
#' @examples
#' dim(merged_sample)
#' flagged <- flag_quality(merged_sample)
#' noflags <- drop_flagged(flagged)
#' dim(noflags)
#'
#' @export
drop_flagged <- function(object, all_features = FALSE) {
  if (!all_features) {
    object <- object[is.na(flag(object)), ]
  }
  object
}

#' Partially replace exprs field with new values
#'
#' Replaces a subset of data in exprs of an object by new values.
#' Used after an operation such as imputation is computed only for a subset of features or samples
#' such as only good-quality features that have not been flagged.
#' This function is mainly used internally, but can be useful.
#'
#' @param object a MetaboSet object
#' @param y matrix containing new values to be merged into exprs
#'
#' @return a MetaboSet object with the new exprs values
#'
#' @examples
#' ex_set <- merged_sample[1:5, 1:5]
#' exprs(ex_set)
#' # Create a matrix of replacment values for rows 1, 3, 5 and columns 1, 3, 4
#' replacement <- matrix(1:9, ncol = 3,
#'                       dimnames = list(featureNames(ex_set)[c(1, 3, 5)],
#'                                       sampleNames(ex_set)[c(1, 3, 4)]))
#' replacement
#' merged <- merge_exprs(ex_set, replacement)
#' exprs(merged)
#'
#' @export
merge_exprs <- function(object, y) {
  # Colnames and rownames should be found in the object
  if (!all(colnames(y) %in% colnames(exprs(object))) | is.null(colnames(y))) {
    stop("Column names of y do not match column names of exprs(object)")
  }
  if(!all(rownames(y) %in% rownames(exprs(object))) | is.null(rownames(y))) {
    stop("Row names of y do not match row names of exprs(object)")
  }

  exprs_tmp <- exprs(object)
  exprs_tmp[rownames(y), colnames(y)] <- y
  exprs(object) <- exprs_tmp
  object
}

#' Impute missing values using random forest
#'
#' Impute the missing values in the exprs part of the object using a
#' random forest. The estimated error in the imputation is logged.
#' It is recommended to set the seed number for reproducibility
#' (it is called random forest for a reason).
#' This a wrapper around \code{missForest::missForest}.
#' Use parallelize = "variables" to run in parallel for faster testing.
#' NOTE: running in parallel prevents user from setting a seed number.
#' \strong{CITATION:} When using this function, cite the \code{missForest} package
#'
#' @param object a MetaboSet object
#' @param all_features logical, should all features be used? If FALSE (the default), flagged features are removed before imputation.
#' @param ... passed to MissForest function
#'
#' @return MetaboSet object as the one supplied, with missing values imputed.
#'
#' @examples
#' missing <- mark_nas(example_set, 0)
#' set.seed(38)
#' imputed <- impute_rf(missing)
#'
#' @seealso \code{\link[missForest]{missForest}} for detail about the algorithm and the parameters
#'
#' @export
impute_rf <- function(object, all_features = FALSE, ...) {
  if (!requireNamespace("missForest", quietly = TRUE)) {
      stop("Package \"missForest\" needed for this function to work. Please install it.",
           call. = FALSE)
  }
  # Start log
  log_text(paste("\nStarting random forest imputation at", Sys.time()))
  # Drop flagged features
  dropped <- drop_flagged(object, all_features)

  if (!requireNamespace("missForest", quietly = TRUE)) {
    stop("missForest package not found")
  }

  # Impute missing values
  mf <- missForest::missForest(xmis = t(exprs(dropped)), ...)
  imputed <- t(mf$ximp)
  # Log imputation error
  log_text(paste0("Out-of-bag error in random forest imputation: ",
                 round(mf$OOBerror, digits = 3), "\n"))
  # Assign imputed data to the droppped
  rownames(imputed) <- rownames(exprs(dropped))
  colnames(imputed) <- colnames(exprs(dropped))
  # Attach imputed abundances to object
  object <- merge_exprs(object, imputed)
  log_text(paste("Random forest imputation finished at", Sys.time()))
  object
}

#' Simple imputation
#'
#' Impute missing values using a simple imputation strategy. All missing values
#' of a feature are imputed with the same value. It is possible
#' to only impute features with a large number of missing values this way. This can be useful
#' for using this function before random forest imputation to speed things up.
#' The imputation strategies available are:
#'
#' \itemize{
#' \item a numeric value: impute all missing values in all features with the same value, e.g. 1
#' \item "min": impute missing values of a feature with the minimum observed value of that feature
#' \item "half_min": impute missing values of a feature with half the minimum observed value of that feature
#' \item "small_random": impute missing values of a feature with random numbers between 0 and the
#' minimum of that feature (uniform distribution, remember to set the seed number!).
#' }
#'
#' @param object a MetaboSet object
#' @param value the value used for imputation, either a numeric or one of "min", "half_min", "small_random",
#' see above
#' @param na_limit only impute features with the proportion of NAs over this limit. For example, if
#' \code{na_limit = 0.5}, only features with at least half of the values missing are imputed.
#'
#' @examples
#' missing <- mark_nas(merged_sample, 0)
#' imputed <- impute_simple(missing, value = "min")
#'
#' @export
impute_simple <- function(object, value, na_limit = 0) {

  imp <- exprs(object)
  nas <- apply(imp, 1, prop_na)
  imp <- imp[nas > na_limit, ]
  if (nrow(imp) == 0) {
    warning("none of the features satisfy the NA limit, returning the original object")
    return(object)
  }

  # Replace all missing values with the given constant
  if (is.numeric(value)) {
    imp[is.na(imp)] <- value
  } else if (value == "min") {
    imp <- t(apply(imp, 1, function(x){
      x[is.na(x)] <- finite_min(x)
      x
    }))
  } else if (value == "half_min") {
    imp <- t(apply(imp, 1, function(x){
      x[is.na(x)] <- finite_min(x) / 2
      x
    }))
  } else if (value == "small_random") {
    imp <- t(apply(imp, 1, function(x){
      x[is.na(x)] <- runif(n = sum(is.na(x)), min = 0, max = finite_min(x))
      x
    }))
  } else {
    stop('value should be a numeric value or one of "min", "half_min", "small_random"')
  }

  obj <- merge_exprs(object, imp)
  obj
}

#' Inverse-rank normalization
#'
#' Applies inverse rank normalization to all features to approximate
#' a normal distribution.
#'
#' @param object a MetaboSet object
#'
#' @return MetaboSet object as the one supplied, with normalized features
#'
#' @examples
#' normalized <- inverse_normalize(merged_sample)
#'
#' @export
inverse_normalize <- function(object) {

  exprs(object) <- exprs(object) %>%
    apply(1, function(x) {
      qnorm((rank(x, na.last="keep")-0.5) / sum(!is.na(x)))}) %>%
    t()
  object
}


#' A report of flagged features
#'
#' Computes the number of features at each stage of flagging for each mode.
#'
#' @param object a MetaboSet object
#'
#' @return data frame with the number of features at each stage of flagging
#'
#' @examples
#' flagged <- merged_sample %>%
#'  mark_nas(0) %>%
#'  flag_detection() %>%
#'  flag_quality()
#' flag_report(flagged)
#'
#' @export
flag_report <- function(object) {

  splits <- sort(unique(fData(object)$Split))
  report <- data.frame()
  flag(object)[is.na(flag(object))] <- "Kept"
  for (split in splits) {
    tmp <- object[fData(object)$Split == split, ]
    report_row <- flag(tmp) %>% table %>% as.matrix() %>% t()
    report_row <- data.frame(Split = split, report_row)
    if (is.null(report_row$Kept)) {
      report_row$Kept <- 0
    }
    report_row$Total <- nrow(tmp)
    report_row$Flagged <- report_row$Total - report_row$Kept
    report <- suppressWarnings(dplyr::bind_rows(report, report_row))
  }
  report
}


# ---------- Logarithms ----------

#' Logarithm
#'
#' Log-transforms the exprs part of a MetaboSet object. Shortcust log2 and log10 also implemented.
#' For more information, see \code{\link{log}}
#'
#' @export
setMethod("log", "MetaboSet", function(x, base = exp(1)) {
  exprs(x) <- log(exprs(x), base = base)
  x
})

#' @rdname log-MetaboSet-method
#' @export
setMethod("log2", "MetaboSet", function(x) {
  exprs(x) <- log2(exprs(x))
  x
})

#' @rdname log-MetaboSet-method
#' @export
setMethod("log10", "MetaboSet", function(x) {
  exprs(x) <- log10(exprs(x))
  x
})

# scale
setGeneric("scale")

#' Scale exprs data
#'
#' Applies the base R function scale to transposed exprs matrix. See ?scale for details
#' @export
setMethod("scale", "MetaboSet", function(x, center = TRUE, scale = TRUE) {
  exprs(x) <- t(scale(t(exprs(x)), center = center, scale = scale))
  x
})

#' Exponential function
#'
#' Apply the exponential function to feature abundances (exprs)
#'
#' @param object a MetaboSet object
#' @param base base of the exponential
#'
#' @return a MetaboSet object with altered feature abundances
#'
#' @export
setGeneric("exponential", signature = "object",
           function(object, base = exp(1)) standardGeneric("exponential"))


#' @export
setMethod("exponential", c(object = "MetaboSet"),
          function(object, base = exp(1)) {
            exprs(object) <- base^exprs(object)
            object
          })
antonvsdata/amp documentation built on Jan. 8, 2020, 3:15 a.m.