R/correction.R

Defines functions met_smooth_gam met_smooth_loess correct_rel_bias

Documented in correct_rel_bias met_smooth_gam met_smooth_loess

# Functions for correcting systematic bias

#========================================================================>
# Smoothing functions used for correction

#------------------------------------------------------------------------
#' GAM smoothing
#'
#' Wraps GAM smoothing function for use in bias correction algorithm.
#'
#' @param time Time or sample number for metabolite time-course.
#' @param concentration Metabolite concentration.
#' @param new.time Optional vector for new independent variables.
#' @param warn The \code{gam} function uses internal warning suppression that
#'             doesn't appear to work when used in conjunction with dplyr.
#'             All warning messages are suppressed by default here. Set to
#'             TRUE to unsuppress.
#' @param ... Arguments to be passed into the s() smoothing function, such as
#'            k -- the dimension of the basis used to represent the smooth term.
#'            If no additional arguments are provided, a reasonable default
#'            for k (5) is assumed.
#
#' @return A vector of smoothed concentrations.
#'
#' @examples
#' # Simulating concave curve with 10 points
#' par <- c(3.5, 4.5, 2.5, 3.5, 0.0, 0.2, 0.8, 0.9)
#' concentration <- as.numeric(simulate_concave(1, 10, par))
#' 
#' # Adding noise and changing scale
#' concentration <- concentration * 18 + 2
#' abs.sd <- mean(concentration) * 0.1
#' concentration <- concentration + rnorm(10, 0, abs.sd)
#' concentration[concentration < 0] <- 0 
#' 
#' # Original sampling time
#' time <- seq(0, 9, length.out=10) * 24
#'
#' # New time for smoothing
#' new.time <- seq(0, 9, length.out=100) * 24
#' 
#' # Smoothing
#' corrected <- met_smooth_gam(time, concentration, new.time = new.time, k = 5)
#' 
#' # Plotting
#' plot(time, concentration, type='p', 
#'      xlab='Time post inoculation (hours)', ylab='Concentration (mM)')
#' lines(new.time, corrected, type='l')
#' @export
met_smooth_gam <- function(time, concentration, new.time = NULL,
                           warn = FALSE, ...) {
  
  d <- data.frame(time = time, concentration = concentration)

  if (warn) {
    f_gam <- function(...) gam(...)
  } else {
    f_gam <- function(...) suppressWarnings(gam(...))
  }

  if (length(list(...)) == 0) {
    model <- f_gam(concentration ~ s(time, bs='cr', k = 5), data = d)
  } else {
    model <- f_gam(concentration ~ s(time, bs='cr', ...), data = d)
  }

  if (! is.null(new.time)) {
    d <- data.frame(time = new.time)
  }

  fit <- as.vector(predict(model, d))

  return(fit)
}

#------------------------------------------------------------------------
#' LOESS smoothing
#'
#' Wraps LOESS smoothing function for use in bias correction algorithm. 
#' WARNING: LOESS is not a particularly good choice for bias correction. 
#' This function is included for completeness only.
#'
#' @param time Time or sample number for metabolite time-course.
#' @param concentration Metabolite concentration.
#' @param new.time Optional vector for new independent variables.
#' @param ... Arguments to be passed into the loess() smoothing function, 
#'            such as span -- "the parameter alpha which controls the degree of
#'            smoothing". If no additional arguments are provided, a reasonable 
#'            default for span (0.75) is assumed.
#
#' @return A vector of smoothed concentrations.
#'
#' @examples
#' # Simulating concave curve with 10 points
#' par <- c(3.5, 4.5, 2.5, 3.5, 0.0, 0.2, 0.8, 0.9)
#' concentration <- as.numeric(simulate_concave(1, 10, par))
#' 
#' # Adding noise and changing scale
#' concentration <- concentration * 18 + 2
#' abs.sd <- mean(concentration) * 0.1
#' concentration <- concentration + rnorm(10, 0, abs.sd)
#' concentration[concentration < 0] <- 0 
#' 
#' # Original sample time
#' time <- seq(0, 9, length.out=10) * 24
#'
#' # New time for smoothing
#' new.time <- seq(0, 9, length.out=100) * 24
#' 
#' # Smoothing
#' corrected <- met_smooth_loess(time, concentration, 
#'                               new.time = new.time, span = 0.75)
#' 
#' # Plotting
#' plot(time, concentration, type='p', 
#'      xlab='Time post inoculation (hours)', ylab='Concentration (mM)')
#' lines(new.time, corrected, type='l')
#' @export
met_smooth_loess <- function(time, concentration, new.time = NULL, ...) {
  
  d <- data.frame(time = time, concentration = concentration)

  if (length(list(...)) == 0) {
    model <- loess(concentration ~ time, data = d, span = 0.75)
  } else {
    model <- loess(concentration ~ time, data = d, ...)
  }

  if (! is.null(new.time)) {
    d <- data.frame(time = new.time)
  }

  fit <- as.vector(predict(model, d))

  return(fit)
}

#========================================================================>
# Correction function

#------------------------------------------------------------------------
#' Correct systematic bias in metabolomic time-course data
#'
#' Identifies systematic deviations in metabolic data using a smoothing fit.
#' Timepoints that have a median relative deviation above a threshold value
#' are assumed to be influenced by a measurement or methodological bias as
#' compared to the overall trends metabolite concentrations.
#'
#' @param time A vector of times or sample numbers for metabolite time-courses.
#'             Note, there should be a time value for every \code{concentration} 
#'             for every \code{metabolite} e.g. time = c(1, 2, 3, 1, 2, 3), 
#'             concentration = c(20, 15, 10, 3, 6, 9), 
#'             metabolite = c('glc', 'glc', 'glc', 'lac', 'lac', 'lac'). 
#' @param concentration A vector of metabolite concentrations.
#' @param metabolite A vector of metabolite names that correspond to \code{time}
#'                   and \code{concentration}.
#' @param f_smooth Smoothing function (set to met_smooth_gam by default).
#' @param max.iter The algorithm correct the single most deviating point at one
#'                 time (as it may influence the identification of other
#'                 deviations). \code{max.iter} controls the maximum number
#'                 of correction passes.
#' @param min.deviation Smallest median relative deviation to identify as a 
#'                      bias. 0.02 has been found to be a good default for
#'                      a diverse set of metabolic time-courses from
#'                      higher eukaryote cell culture.
#' @param ... Arguments to be passed into \code{f_smooth}, (such as k or
#'            span parameters for met_smooth_gam and met_smooth_loess
#'            respectively). 
#
#' @return A vector of corrected concentrations.
#'
#' @examples
#' 
#' # Using previously simulated data 40 metabolic trends with 10 time points
#' data(timecourse)
#'
#' # Adding an error of 5% at sample 4
#' logic <- timecourse$sample == 4
#' timecourse$concentration[logic] <- timecourse$concentration[logic] * 1.05 
#'
#' # Correcting
#' timecourse$corrected <- correct_rel_bias(timecourse$time, 
#'                                          timecourse$concentration,
#'                                          timecourse$metabolite)
#'
#'
#' # Plotting -- the original value of the corrected point is marked in red
#' par(mfrow = c(8, 5), oma = c(5, 4, 1, 1) + 0.1, mar = c(1, 1, 1, 1) + 0.1)
#' new.time <- seq(min(timecourse$time), max(timecourse$time), length.out=100)
#' 
#' for (metabolite in unique(timecourse$metabolite)) {
#'
#'   logic <- timecourse$metabolite == metabolite
#'   d <- timecourse[logic, ]
#'   
#'   logic2 <- d$concentration != d$corrected
#'
#'   plot(d$time, d$corrected, pch = 16, xlab = '', ylab = '',  
#'        ylim = c(min(d$concentration), max(d$concentration)))
#'
#'   smoothed <- met_smooth_gam(d$time, d$corrected,
#'                              new.time = new.time, k = 5)
#'   lines(new.time, smoothed)
#'
#'   points(d$time[logic2], d$concentration[logic2], pch = 16, col = 'red')
#'
#' }
#'
#' title(xlab = 'Time post inoculation (hours)', 
#'       ylab = 'Concentration (mM)', outer = TRUE, line = 3)
#' @export
correct_rel_bias <- function(time, concentration, metabolite,
                             f_smooth = met_smooth_gam, 
                             max.iter = 20, min.deviation = 0.02, ...) {

  # Generating data frame, with a new "corrected" column
  d <- data.frame(time, concentration, corrected = concentration, 
                  metabolite, original = 1:length(time))
  
  extra_args <- list(...)

  # Passing in extra parameters into smoothing
  f_smooth_dply <- function(time, corrected) {
    args <- list(time = time, corrected = corrected)
    do.call(f_smooth, c(args, extra_args))
  }
  
  # Iteratively correcting systematic deviation
  for (i in 1:max.iter) {

    # Generating fit and calculating deviations
    d <- d %>%
           group_by(metabolite) %>%
           mutate(fit = f_smooth(time, corrected),
                  deviation = (corrected - fit) / corrected)

    # Identifying the timepoint with largest deviation 
    deviations <- d %>%
                    group_by(time) %>%
                    summarize(med = abs(median(deviation, na.rm = TRUE))) %>%
                    arrange(desc(med))

    if (deviations$med[1] < min.deviation) break

    max_dev_time <- deviations$time[1]
   
    logic <- d$time == max_dev_time
    d[logic, ] <- within(d[logic, ], {
      corrected <- corrected - median(deviation) * corrected
    })
  }

  # Correcting any changes in order
  out <- d$corrected[order(d$original)]
  
  return(out)
}
ssokolen/metcourse documentation built on May 30, 2019, 8:43 a.m.