assayr2: Tools for analyzing assay data

Documented in ll4_robust_parameter_overlap

# qc.R

#' Overlapping confidence intervals for LL4 parameters (normal vs. robust)
#'
#' @param fit A `drc` object (4-parameter fit) (or list of `drc` objects)
#' @param robust_fit A `drc` object that has been transformed by robustifyDrc (4-parameter fit) (or list of `drc` objects)
#' @return TRUE if at least one pair of parameter estimates (robust vs. non robust) are non-overlapping.
#'      If return_frame is TRUE, then a dataframe with columns indicating which parameters are non-overlappi
#' @examples
#' library(drc)
#' library(tibble)
#' library(magrittr)
#' library(ggplot2)
#' data("curves")
#' model <- curves$standard_model[[1]]
#' robust_model <- curves$standard_model[[1]] %>%
#'    assayr2::robustify_drc(form = as.formula(log10_data ~ standard_conc))

#' ll4_robust_parameter_overlap(model, robust_model)
#' ll4_robust_parameter_overlap(list(model, model),
#'                      list(robust_model, robust_model))
#' @export
ll4_robust_parameter_overlap <- function(fit, robust_fit) {
  stopifnot(class(fit) == "drc" | class(fit) == "list")
  if (class(fit) == "list" & !(missing(robust_fit))) {
    stopifnot(length(fit) == length(robust_fit))
  }
  # handle missing robust fit
  if (missing(robust_fit)) {
    if (class(fit) == "drc") {
      robust_fit <- robustify_drc(fit, formula(fit))
    } else {
      robust_fit <- purrr::map(fit, ~robustify_drc(., formula(.)))
    }
  }

  .ll4_parameter_overlap <- function(fit, robust_fit) {
    ci_fit <- confint(fit)
    ci_rfit <- confint(robust_fit)
    res <- data.frame(
      parameter = tolower(LETTERS[2:5]),
      gt_robust = ci_fit[, 1] > ci_rfit[, 2],
      lt_robust = ci_fit[, 2] < ci_rfit[, 1]
    ) %>%
      dplyr::mutate(overlapping = !(gt_robust | lt_robust))
    res %<>% dplyr::filter(!(is.nan(overlapping)) & !(is.na(overlapping)))
    !(all(res$overlapping))
  }
  # one pair of models
  if (class(fit) == "drc" & class(robust_fit) == "drc") {
    .ll4_parameter_overlap(fit, robust_fit)
  } else { # > 1 pair of models
    purrr::map2_lgl(fit, robust_fit, ~.ll4_parameter_overlap(.x, .y))
  }
}

#' Check for outliers using weights from Tukey's Biweight mean
#'
#' @param robust_fit A model object of class "drc" that has been processed for
#' robust estimation.
#' @param threshold A numeric vector of length 1 indicating the maximum weight
#'   that a sample is labeled as an outlier
#' @return boolean TRUE if outliers are present
#' @export
detect_outliers <- function(robust_fit, threshold = 0.1) {
  any(robust_fit$weights < threshold)
}

#' Check if an asymptote estimate for a 4-parameter log-logistic function
#'     is considered to be noisy
#'
#' @param object drc object 4-parameter log-logistic
#' @param bound `lower` or `upper` for the lower or upper bound of the CI
#' @return boolean TRUE if the ic50 estimate is poor with respect to the bound specified
#' @examples
#' data("isobutyryl_curves")
#' poor_ic50_estimate(isobutyryl_curves[[5]], bound = "upper")
#' @export
poor_ic50_estimate <- function(object, bound) {
  stopifnot(bound %in% c("lower", "upper"))
  ci <- confint(object)
  cutoffs <- list(
    upper = coef(object)[4] * 3,
    lower = coef(object)[4] / 3
  )
  flag <- switch(bound,
    upper = ci[4, 2] > cutoffs[[bound]], # ic50 upper,
    lower = ci[4, 1] < cutoffs[[bound]] # ic50 lower
  )
  if (is.nan(flag) | is.na(flag)) { # must be poor, because CI is unbounded
    flag <- TRUE
  }
  flag
}


#' Check if a parameter estimate is unbounded
#'
#' @param object A model object of class "drc", fit with 4-param log-logistic function.
#' @param parameter A character string specifying the paramter to check.
#' Available option are upper asymtote ("upper"), lower asymtote ("lower),
#' IC50 ("ic50"), or slope coefficient ("slope").
#' @return boolean TRUE if the estimate for the parameter is unbounded
#' @export
unbounded_parameter_ci <- function(object, parameter) {
  stopifnot(parameter %in% c("upper", "lower", "ic50", "slope"))
  coefs <- coef(object)
  if (coefs[3] > coefs[2]) {
    row_number <- switch(parameter,
      upper = 3, lower = 2, slope = 1, ic50 = 4
    )
  } else {
    row_number <- switch(parameter,
      upper = 2, lower = 3, slope = 1, ic50 = 4
    )
  }
  ci <- suppressWarnings(confint(object))
  any(is.nan(ci[row_number, ]))
}

#' Curve span
#'
#' Calculate the range of the curve relative to the asymptotes
#'
#' @param object drc object, 4-parameter log-logistic function
#' @importFrom stats coef fitted
#' @export
curve_span <- function(object) {
  curve_range <- range(
    fitted(object)[which.min(object$dataList$dose)],
    fitted(object)[which.max(object$dataList$dose)]
  ) %>%
    diff() %>%
    abs()

  model_width <- abs(coef(object)[3] - coef(object)[2])
  unname(curve_range / model_width)
}


#' Check if curve is saturated
#'
#' @param object drc object, 4-parameter log-logistic function
#' @param threshold Ask Justin.
#' @return TRUE if there is evidence to believe the curve
#'   is not saturated by the data
#' @export
unsaturated_curve <- function(object, threshold = 0.9) {
  curve_span(object) < threshold
}

#' Calculate Normalized-Root-Mean-Square-Deviation (NRMSD) for drc model.
#'
#' @param object A model object of class "drc"
#' @param cutoff_hep A cutoff value for hepatocytes in PAH experiments.
#' @param cutoff_huh A cutoff value for HUH7 in PAU experiments.
#' @examples
#' data("nrmsd_qc_positive_cases")
#' nrmsd_qc(nrmsd_qc_positive_cases[[1]])
#' @export
nrmsd_qc <- function(object, cutoff_hep = .1433786, cutoff_huh = .1647703) {
  # why is this an anoymous function?
  .nrmsd <- function(object) {
    sum_sq_resids <- object$predres[, 2] %>% sum(.^2)
    rmsd <- sqrt(sum_sq_resids / nrow(object$predres))
    y_range <- range(object$predres[, 1])[2] - range(object$predres[, 1])[1]
    return(rmsd / y_range)
  }
  # determine which cutoff value to use
  if (grepl("PAH", unique(object$origData$run))) {
    cutoff <- cutoff_hep
  }
  else {
    cutoff <- cutoff_huh
  }
  # return logical
  return(.nrmsd(object) > cutoff)
}
#' Encode results of QC check in an integer warning code and human-readable message
#'
#' @param object drc object fit as 4 -parameter log-logistic function
#' @return character vector of error codes separated by a comma
#' @examples
#' data("isobutyryl_curves")
#' ll4_qc_warnings(isobutyryl_curves[[5]])
#' @export
ll4_qc_warnings <- function(object) {
  robust_fit <- robustify_drc(object, formula(object))
  in_binary <- c(
    unbounded_parameter_ci(object, parameter = "slope"),
    unbounded_parameter_ci(object, parameter = "lower"),
    unbounded_parameter_ci(object, parameter = "upper"),
    unbounded_parameter_ci(object, parameter = "ic50"),
    ll4_robust_parameter_overlap(object, robust_fit),
    detect_outliers(robust_fit, threshold = 0.1),
    unsaturated_curve(object),
    poor_ic50_estimate(object, bound = "upper"),
    poor_ic50_estimate(object, bound = "lower"),
    nrmsd_qc(object)
  )

  which(in_binary == TRUE) %>% paste(collapse = ",")
}


#' Calculate the weighted Shannon entropy score (WES)
#'
#' Corresponds to the WES calculation outlined in Shockley K.R., 2014.
#'
#' @param resp A numeric vector of values representing the dose response measurements.
#' @param pos_ctl_fit An object of class drc fit with \code{drm()} and the argument \code{fct = LL.4()}.
#' @param curve_type String. Either "negative", "positive", or "auto" indicating the (expected) sign of the slope of the curve.
#' @param LLoD Numeric. The lower limit of detection.
#' @importFrom stats fitted
#' @export
wes <- function(resp, pos_ctl_fit, curve_type = "auto", LLoD = NULL) {
  curve_types <- c("positive", "negative", "auto")
  ind <- pmatch(curve_type, curve_types)
  if (is.na(ind)) {
    stop(
      "invalid curve type:", paste0(' "', curve_type, '"'),
      ". Must be one of ", paste(paste0('"', curve_types, '"'), collapse = " ")
    )
  }
  curve_type <- curve_types[ind]

  baseline <- fitted(pos_ctl_fit)[which.min(pos_ctl_fit$dataList$dose)]
  termline <- fitted(pos_ctl_fit)[which.max(pos_ctl_fit$dataList$dose)]

  if (curve_type == "auto") {
    if (termline > baseline) {
      curve_type <- "positive"
    } else {
      curve_type <- "negative"
    }
  }

  if (!is.null(LLoD)) {
    if (curve_type == "positive") {
      LLoD <- baseline * 1.25
    } else {
      LLoD <- termline * 1.25
    }
  }

  r <- abs(resp - baseline) / abs(termline - baseline)
  p <- r / sum(r)
  w <- rep(1, length(r))
  w2 <- rep(r / LLoD)
  ind <- r < LLoD
  w[ind] <- w2[ind]

  wes <- w * p * log2(p)
  wes[is.nan(wes)] <- 0
  wes <- -1 * sum(wes)

  return(wes)
}


.calc_mb <- function(object, s, i) {
  baseline <- fitted(object)[which.min(object$dataList$dose)]
  hs <- fitted(object)[which.max(object$dataList$dose)] - baseline
  b <- object$coefficients["b:(Intercept)"]
  c <- object$coefficients["c:(Intercept)"]
  d <- object$coefficients["d:(Intercept)"]
  e <- object$coefficients["e:(Intercept)"]
  e <- log(e)
  if (hs > 0) {
    p <- ((d - c) * (1 - s)) / (baseline + i - c + c * s) - 1
  } else {
    p <- ((d - c) * (1 + s)) / (baseline - i - c - c * s) - 1
  }
  if (p <= 0) {
    return(NaN)
  }
  mb <- (1 / b) * log(p) + e
  return(unname(mb))
}


.calc_auc <- function(object, lb, ub) {
  b <- object$coefficients["b:(Intercept)"]
  c <- object$coefficients["c:(Intercept)"]
  d <- object$coefficients["d:(Intercept)"]
  e <- object$coefficients["e:(Intercept)"]
  e <- log(e)

  upper <- (c * ub) - ((d - c) / b) * log(1 + exp(b * (e - ub)))
  lower <- (c * lb) - ((d - c) / b) * log(1 + exp(b * (e - lb)))
  area <- unname(upper - lower)
  return(area)
}


.auc_low_error <- function(object, lb, ub, s, i) {
  b <- object$coefficients["b:(Intercept)"]
  c <- object$coefficients["c:(Intercept)"]
  d <- object$coefficients["d:(Intercept)"]
  e <- object$coefficients["e:(Intercept)"]
  e <- log(e)

  upper <- (1 - s) * ((c * ub) - ((d - c) / b) * log(1 + exp(b * (e - ub)))) - (i * ub)
  lower <- (1 - s) * ((c * lb) - ((d - c) / b) * log(1 + exp(b * (e - lb)))) - (i * lb)
  area <- unname(upper - lower)
  return(area)
}


.auc_up_error <- function(object, lb, ub, s, i) {
  b <- object$coefficients["b:(Intercept)"]
  c <- object$coefficients["c:(Intercept)"]
  d <- object$coefficients["d:(Intercept)"]
  e <- object$coefficients["e:(Intercept)"]
  e <- log(e)

  upper <- (1 + s) * ((c * ub) - ((d - c) / b) * log(1 + exp(b * (e - ub)))) + (i * ub)
  lower <- (1 + s) * ((c * lb) - ((d - c) / b) * log(1 + exp(b * (e - lb)))) + (i * lb)
  area <- unname(upper - lower)
  return(area)
}

#' Calculate the error-ajusted activity area of a dose response curve
#'
#' Adjusted activity area corresponds to the area between the curve and the
#' line representing the baseline concentration, minus the overlapping error region.
#'
#' @param object An object of class drc fit with \code{drm()} and the argument \code{fct = LL.4()}.
#' @examples
#' fit <- drc::drm(disp~wt, data=mtcars, fct=drc::LL.4())
#' adj_activity_area(fit)
#' @importFrom stats anova fitted lm coef
#' @importFrom dplyr mutate
#' @export
adj_activity_area <- function(object) {
  resp <- data.frame(object$predres)
  resp <- dplyr::mutate(resp, abs_dev = abs(Residuals))

  m0 <- lm(abs_dev ~ 1, data = resp)
  m1 <- lm(abs_dev ~ Predicted.values, data = resp)
  p <- anova(m0, m1)$`Pr(>F)`[2]

  min_pred <- min(resp$Predicted.values) * coef(m1)[2] + coef(m1)[1]

  if (p <= 0.05 & coef(m1)[2] > 0 & min_pred > 0) {
    slope <- coef(m1)[2]
    intercept <- coef(m1)[1]
  } else {
    slope <- 0
    intercept <- coef(m0)[1]
    if (intercept == 0) {
      return(activity_area(object))
    }
  }

  dose_range <- range(object$dataList$dose)
  ub <- log(dose_range[2])
  lb <- log(dose_range[1])

  mb <- .calc_mb(object, slope, intercept)
  if (is.nan(mb)) {
    return(0)
  }
  if ((mb > ub) | (mb < lb)) {
    return(0)
  }

  baseline <- fitted(object)[which.min(object$dataList$dose)]
  hs <- fitted(object)[which.max(object$dataList$dose)] - baseline
  if (hs > 0) {
    res <- .auc_low_error(object, mb, ub, slope, intercept) - (baseline * (ub - mb))
  } else {
    res <- (baseline * (ub - mb)) - .auc_up_error(object, mb, ub, slope, intercept)
  }

  return(res)
}

#' Calculate the activity area of a dose response curve
#'
#' Activity area corresponds to the area between the curve and the
#' line representing the baseline concentration.
#'
#' @param object An object of class drc fit with \code{drm()} and the argument \code{fct = LL.4()}.
#' @examples
#' \dontrun{
#' fit <- drc::drm(disp~wt, data=mtcars, fct=drc::LL.4())
#' activity_area(fit)
#' }
#' @importFrom stats fitted
#' @export
activity_area <- function(object) {
  dose_range <- range(object$dataList$dose)
  ub <- log(dose_range[2])
  lb <- log(dose_range[1])

  area <- .calc_auc(object, lb, ub)

  baseline <- fitted(object)[which.min(object$dataList$dose)]
  resp_area <- abs((baseline * (ub - lb)) - area)

  return(resp_area)
}


#' Calculate the maximum response of a DRC
#'
#' Returns the absolute value of the range of the curve
#'
#' @param object An object of class drc fit with \code{drm()} and the argument \code{fct = LL.4()}.
#' fit <- drc::drm(disp~wt, data=mtcars, fct=drc::LL.4())
#' RMAX(fit)
#' @importFrom stats fitted
#' @export
RMAX <- function(object) {
  res <- fitted(object) %>%
    range() %>%
    diff() %>%
    abs()
  return(res)
}


# Utilities for running a full QC check for a set of dose-response curves ----

#' Get a table of dose-response curve data that should
#'   be checked for quality
#' @param db_path (character) File path to a SQLite database with QC messages that correspond to integer identifiers
#' @param rds_path (character) File path to RDS file with all dose-response that exists for HST-explorer
#' @param analytes (character) Identifiers for analytes that should be checked see assayr2::see_whitelist()$targ
#'   for valid identifiers
#' @param debug (bool) set to TRUE if qc_columns should be added for debugging
#' @return list where first element is a table of curves that need QC, second
#'   element is a table of curves that do not need QC
#' @export
get_curves_needing_qc <- function(db_path = "/data/shiny/HST_explorer/qc-warnings.sqlite",
                                  rds_path = "/data/shiny/HST_explorer/nate_db.RDS",
                                  analytes = c("13C-Isobutyryl"),
                                  debug = FALSE) {
  curves <- readRDS(rds_path)
  all_curve_ids <- curves %>% dplyr::count(
    tx_run,
    donor,
    tx_challenge_abstract,
    tx_cmpd,
    isotope, targ,
    media
  )
  # assume nate_db.RDS has QC column
  if (debug) {
    curves$qc <- FALSE
    curves$qc_warnings <- ""
  }
  if (!missing(analytes) | is.null(analytes)) {
    # Force qc flag to TRUE to avoid a QC check for analytes which we
    #  have not investigated
    curves$qc <- ifelse(!(curves$targ %in% analytes),
      TRUE, curves$qc
    )
    curves_needing_qc <- dplyr::filter(
      curves,
      targ %in% analytes,
      !qc
    )
    curves %<>% dplyr::filter(
      !(targ %in% analytes),
      qc
    )
  } else {
    curves_needing_qc <- dplyr::filter(curves, !qc)
    curves %<>% dplyr::filter(qc)
  }
  list(curves_needing_qc, curves)
}


#' Run QC check for a set of dose-response curves
#'
#' @param curves_needing_qc (data.frame) Table of data
#'  for curves which need QC (first element of get_curves_needing_qc output)
#' @param curves (data.frame) Table of data for curves
#'  which do not need QC (second element of get_curves_needing_qc output)
#' @param analytes (character) Identifiers for analytes that should be checked see assayr2::see_whitelist()$targ
#'   for valid identifiers
#' @param db_path A valid path to SQLite DB.
#' @return Data.frame with original data combined with data that have
#'   been checked for dose-response curve quality
#' @export
drc_qc <- function(curves_needing_qc, curves, analytes, db_path = "/data/shiny/HST_explorer/qc-warnings.sqlite") {
  # update QC table if new files have been added
  if (nrow(curves_needing_qc) > 0) {

    # Get table of unique curves with relevant identifiers
    labeled <- tibble::as.tibble(curves_needing_qc) %>%
      tidyr::nest(-tx_run, -targ, -donor, -media)
    # Convert concentration to numeric ======================
    labeled$data %<>% purrr::map(~dplyr::mutate(., tx_conc_numeric = as.numeric(tx_conc %>% as.character())))
    labeled$data %<>% purrr::map(~dplyr::mutate(.,
      tx_conc_numeric = ifelse(tx_conc_numeric == 0,
        assayr2::new_zeros(tx_conc_numeric), tx_conc_numeric
      )
    ))
    # keep only 4-point curves
    # labeled <- labeled[which(purrr::map_lgl(labeled$data, ~ nrow(.) >= 30)),]
    # Fit dose-response curves for each data set =============
    prop_drc <- purrr::map(labeled$data, ~
    tryCatch({
      drc::drm(conc_corrected ~ tx_conc,
        data = .,
        fct = drc::LL.4(),
        control = drc::drmc(errorm = F, useD = FALSE)
      )
    },
    error = function(e) {
      NA
    }
    ))
    print("Finished fitting models...")
    # Run suite of QC-tests on each model ====================
    warning_codes <- purrr::map(prop_drc, ~tryCatch({
      assayr2::ll4_qc_warnings(.)
    }, error = function(e) {
      NA
    }))
    labeled$qc_code <- unlist(warning_codes)
    rm(warning_codes)
    # Map warning codes to human-readable messages ===========
    db_conn <- RSQLite::dbConnect(RSQLite::SQLite(), db_path)
    warning_messages <- RSQLite::dbGetQuery(db_conn, "SELECT * FROM warning_messages")
    RSQLite::dbDisconnect(db_conn)

    warning_code2message <- function(x) {
      codes <- strsplit(x, split = ",")[[1]] %>% unlist()
      messages <- dplyr::filter(warning_messages, warning_code %in% codes)
      paste(messages$warning_message, sep = "\n", collapse = "\n")
    }
    labeled$qc_warning <- purrr::map_chr(
      labeled$qc_code,
      ~warning_code2message(.)
    )
    # Restore identifier variables
    cols <- Filter(
      function(x) !(x %in% c("data", "tx_conc_numeric")),
      colnames(labeled)
    )
    for (i in 1:nrow(labeled)) {
      for (col in cols) {
        labeled$data[[i]][col] <- labeled[[col]][i]
      }
    }
    # Flatten data that has just been checked
    qcd_curves <- do.call("rbind", labeled$data)
    qcd_curves$tx_conc_numeric <- NULL

    # Order of columns may have changed, so
    # make them match original data
    qcd_curves <- qcd_curves[colnames(curves_needing_qc)]
    # Check if we still have the same number of unique curves as
    # we started with
    validate_n_curves <- function(qcd_curves, curves_needing_qc) {
      a <- n_new_qcd_curves <- qcd_curves %>% dplyr::count(
        tx_run,
        donor,
        tx_challenge_abstract,
        tx_cmpd,
        isotope, targ,
        media
      ) %>% nrow()
      
      b <- curves_needing_qc %>% dplyr::count(
        tx_run,
        donor,
        tx_challenge_abstract,
        tx_cmpd,
        isotope, targ,
        media
      ) %>% nrow()
      
      assertthat::are_equal(a, b, msg = "Could not QC one or more dose-response curves.")
    }
    valid_curves <- validate_n_curves(qcd_curves, curves_needing_qc)

    # Combine with original data (13C-Isobutyryl curves that have already been checked
    #  and curves for all analytes except for 13C-Isobutyryl)
    do.call("rbind", list(curves, qcd_curves))
  } else {
    stop("Empty data.frame passed as argument for curves_needing_qc")
  }
}

#' Wrapper to run entire QC check for a given machine
#' @param db_path (character) File path to a SQLite database with QC messages that correspond to integer identifiers
#' @param rds_path (character) File path to RDS file with all dose-response that exists for HST-explorer
#' @param analytes (character) Identifiers for analytes that should be checked see assayr2::see_whitelist()$targ
#'   for valid identifiers
#' @param backup_rds_dir Ask Justin.
#' @param debug (bool) set to TRUE if qc_columns should be added for debugging
#' @export run_qc_check
run_qc_check <- function(db_path, rds_path, analytes, backup_rds_dir = ".", debug = F) {
  # Backup original =================================
  d <- readRDS(rds_path)
  human_time <- Sys.time() %>% gsub(" ", "_", .) %>% gsub("\\:", "-", .)
  outfile <- file.path(backup_rds_dir, paste0("nate_db", "_", human_time, ".RDS"))
  saveRDS(d, outfile)
  # Run QC check ====================================
  res <- get_curves_needing_qc(db_path, rds_path, analytes, debug = debug)
  res <- drc_qc(res[[1]], res[[2]], analytes)
  # print(head(res))
  # Save data with new QC ==========================
  # saveRDS(res, file = rds_path)
}