R/calculate_pipeline.R

Defines functions ParCalculateTemplate multiResultClass CalculateTemplate CalculateMat

Documented in CalculateMat CalculateTemplate ParCalculateTemplate

################################################################################
# Copyright Shuyu Zheng and Jing Tang - All Rights Reserved
# Unauthorized copying of this file, via any medium is strictly prohibited
# Proprietary and confidential
# Written by Shuyu Zheng<shuyu.zheng@helsinki.fi>, November 2020
################################################################################

# TidyComb
# Functions for pipe all calculation togather
#
# Functions in this page:
#
# CalculateMat: Calculate scores from one dose-response matrix.
# CalculateTemplate: Calculate scores from template file which might contain
#                    several drug combination blocks.
# CalculateTemplateDebug: Debug version of CalculateTemplate.
# ParCalculateTemplate: Parallel calculate scores from template file. It using
#                    multi-core method to do parallel. (It might not work under
#                    Windows system)

#' Chain all calculation about one dose-response matrix
#'
#' \code{CalculateMat} chains all calculations about one dose-response matrix (
#' one drug-drug interaction block) together. The calculations includes:
#' dose-response curve fitting, synergy scores (ZIP, Bliss, Loewe,
#' HSA, S), drug sensitivity (RI, CSS), generate drug-drug response surface and
#' generating summary scores for each block.
#'
#' The steps for calculation:
#' \enumerate{
#'   \item Pre-process Matrix
#'     \enumerate{
#'       \item Impute for missing values (with the average of values from the
#'           nearest four cells) in original matrix by using function
#'           \code{\link[synergyfinder]{ImputeNA}}.
#'       \item Add noise(A small random number ranging from 0 to 0.001) to
#'           original matrix by using function \code{\link[synergyfinder]{AddNoise}}.
#'       )
#'       \item Correct baseline using function
#'       \code{\link[synergyfinder]{CorrectBaseLine}} with the method
#'       selected by parameter \code{correction}.
#'     }
#'   \item Single drug process
#'     \enumerate{
#'       \item Extract and fitting single drugs.
#'       \item Extract coeficients from fitted model. (b, c, d, e, IC50)
#'       \item Calculate RI(Relative inhibition for single drug) with function
#'       \code{CalculateSens}
#'     }
#'   \item Whole response matrix process
#'     \enumerate{
#'       \item Calculate Synergy Scores with function
#'       \code{\link[synergyfinder]{ZIP}},
#'       \code{\link[synergyfinder]{Bliss}}, \code{\link[synergyfinder]{HSA}},
#'       \code{\link[synergyfinder]{Loewe}} in \code{synergyfinder} package.
#'       \item Calculate Surface(The landscape of response, synergy scores)
#'       \item Calculate CSS(drug combination sensitivity score), S(synergy
#'       score calculated from CSS and IR)
#'     }
#'   \item Summarize and generate surface
#' }
#' @param response.mat A matrix which contains the drug combination reaponse
#' value. Its column names are doses of drug added along columns. Its row name
#' are doses of drug added along rows. \cr
#' \strong{Note}: the matrix should be sorted by: 1. The concentrations along
#' the column increase \emph{from left to right}; 2. The concentrations along
#' the row increase \emph{from top to bottom}.
#'
#' @param noise a logical value. It indicates whether or not adding noise to
#' to the "inhibition" values in the matrix. Default is \code{TRUE}.
#'
#' @param correction a string. It indicates which method used by function
#' \code{\link[synergyfinder]{CorrectBaseLine}} for base line correction.
#'   \itemize{
#'     \item \code{non}: no baseline correction;
#'     \item \code{par}: only correct base line on negative values in the matrix;
#'     \item \code{all}: correct base line on all the values in the matrix.
#'   }
#'
#' @param summary.only a logical value. If it is \code{TRUE} then only summary
#' table is calculated and returned, otherwise, for tables will be return.
#' Default setting is \code{FALSE}.
#'
#' @param seed a integer or NULL. It is used to set the random seed to
#' \code{AddNoise} function to make sure the results are reproducible.
#' By default, it is set as NULL which means no seed was set.
#'
#' @return A list contains 4 tables:
#'   \itemize{
#'     \item \strong{response} It contains the modified inhibition value and 4
#'     type of synergy scores of each drug dose response pair.
#'     \item \strong{summary} It contains summarized information of each
#'     blocks: synergy scores, css, ri, S
#'     \item \strong{curve} It contains the coefficients from single drug dose
#'     response curve.
#'     \item \strong{surface} It contains the smoothed inhibition values and
#'     synergy scores for plotting the scores' landscape.
#'  }
#'  If \code{summary.only} is \code{TRUE}, it will return only the "summary"
#'  data frame.
#'
#' @author
#' Jing Tang \email{jing.tang@helsinki.fi}
#' Shuyu Zheng \email{shuyu.zheng@helsinki.fi}
#'
#' @importFrom magrittr %>%
#'
#' @export
#'
#' @examples
#' data <- read.csv(system.file("template.csv", package = "TidyComb"),
#'                  stringsAsFactors = FALSE)
#' response.mat <- reshape2::acast(conc_r~conc_c, value.var = "inhibition",
#'                                 data = data[data$block_id == 1, ])
#' res <- CalculateMat(response.mat)
CalculateMat <- function(response.mat, noise = TRUE, correction = "non",
                         summary.only = FALSE, seed = NULL) {

  options(scipen = 999)

  # 0. Check input data
  if (!is.matrix(response.mat)) {
    warning("Input data is not 'matrix'. Trying to transform it.")
    response.mat <- as.matrix(response.mat)
  }

  # 1. Pre-processing
  # 1.1 Impute data until there is no missing value in the response matrix
  while (sum(is.na(response.mat))) {
    response.mat <- synergyfinder::ImputeNA(response.mat)
  }

  # 1.2. Add random noise to original matrix
  if (!is.null(seed)){
    set.seed(seed)
  }
  if (noise) {
    response.mat <- synergyfinder::AddNoise(response.mat)
  }

  # 1.3. Correct baseline with corresponding "method". Available methods are
  #      "non", "part", "all".
    response.mat <- synergyfinder::CorrectBaseLine(response.mat,
                                                   method = correction)
  # 2. Single drug process
  # 2.1. Fit single drug dose-response curve
  # drug_col
  drug.col <- synergyfinder::ExtractSingleDrug(response.mat, dim = "col")
  col.model <- synergyfinder::FitDoseResponse(drug.col)
  col.type <- as.character(synergyfinder::FindModelType(col.model))

  # drug_row
  drug.row <- synergyfinder::ExtractSingleDrug(response.mat, dim = "row")
  row.model <- synergyfinder::FitDoseResponse(drug.row)
  row.type <- as.character(synergyfinder::FindModelType(row.model))

  # 2.2 Extract coeficients
  # drug_col
  col.coe <- stats::coef(col.model)
  # drug_rowr
  row.coe <- stats::coef(row.model)

  if (!summary.only) {
    curves <- as.data.frame(rbind(col.coe, row.coe))
    colnames(curves) <- sub(":(Intercept)", "", colnames(curves), fixed = TRUE)
    curves$model <- c(col.type, row.type)
    curves$dim <- c("col", "row")
  }


  # 2.3 Calculate RI (using single drug response but without that at 0
  # concentration)
  col.ri <- CalculateSens(drug.col)
  row.ri <- CalculateSens(drug.row)

  # 3. whole matrix process
  # 3.1 Calculate synergyscores
  zip <- synergyfinder::ZIP(response.mat, drug.row.model = row.model,
                                     drug.col.model = col.model)
  loewe <- synergyfinder::Loewe(response.mat, drug.row.model = row.model,
                                drug.col.model = col.model)
  hsa <- synergyfinder::HSA(response.mat)
  bliss <- synergyfinder::Bliss(response.mat)

  response <- lapply(list(response.mat, zip, loewe, hsa, bliss), reshape2::melt)
  response <- Reduce(function(x, y) {
    merge(x = x, y = y, by = c("Var1", "Var2"))}, response)

  colnames(response) <- c("conc_r", "conc_c", "inhibition", "synergy_zip",
                         "synergy_loewe", "synergy_hsa", "synergy_bliss")

  if (!summary.only) {
    # 3.2 calculate surface
    smooth.res <- smoothing(response.mat)
    smooth.zip <- smoothing(zip)
    smooth.hsa <- smoothing(hsa)
    smooth.bliss <- smoothing(bliss)
    smooth.loewe <- smoothing(loewe)

    surface <- lapply(list(smooth.res, smooth.zip, smooth.loewe, smooth.hsa,
                           smooth.bliss), reshape2::melt)
    surface <- Reduce(function(x, y) {
      merge(x = x, y = y, by = c("Var1", "Var2"))}, surface)

    colnames(surface) <- c("conc_r", "conc_c", "inhibition", "synergy_zip",
                           "synergy_loewe", "synergy_hsa", "synergy_bliss")
  }


  # 3.3 Calculate CSS
  col.ic50 <- CalculateIC50(col.coe, col.type, max(drug.col$dose))
  row.ic50 <- CalculateIC50(row.coe, row.type, max(drug.row$dose))

  imputed.ic50 <- ImputeIC50(response.mat, col.ic50 = col.ic50,
                             row.ic50 = row.ic50)
  # a particular row selected according to ic50_row
  col.css <- CalculateSens(imputed.ic50$tempcf_c)
  # a particular column selected according to ic50_col
  row.css <- CalculateSens(imputed.ic50$tempcf_r)

  css <- mean(c(col.css, row.css), na.rm = TRUE)

  # Synergy scores from CSS and RI
  S_sum <- css - sum(col.ri, row.ri)
  S_max <- css - max(c(col.ri, row.ri))
  S_mean <- css - mean(c(col.ri, row.ri))

  sum <- response %>%
    dplyr::filter(conc_r != 0 & conc_c != 0) %>%
    dplyr::select(-conc_r, -conc_c, -inhibition) %>%
    apply(2, mean, na.rm = TRUE)

  sum <- data.frame(t(sum), ic50_row = row.ic50 , ic50_col = col.ic50,
                    ri_row = row.ri, ri_col = col.ri, css_row = row.css,
                    css_col = col.css, css = css, S_sum = S_sum, S_max = S_max,
                    S_mean = S_mean)
  if (summary.only) {
    return(sum)
  } else{
    res <- list(response = response, surface = surface,
                summary = sum, curve = curves)
    return(res)
  }

  # clean up
  gc()
  rm(.Random.seed)
}

#' Calculate Drug Combination data in template format
#'
#' @param template a dataframe which must contains following columns:
#'  \itemize{
#'   \item \emph{block_id}: (integer) the identifier for a drug combination.
#'    If mul-tiple drug combinations are present, e.g. in the standard 384-well
#'    platewhere 6 drug combinations are fitted, then the identifiers for each
#'    of themmust be unique.
#'    \item \emph{drug_col}: (character) the name of the drug on the columns in
#'    adose-response matrix.
#'    \item \emph{drug_row}: (character) the name of the drug on the rows in
#'    adose-response matrix.
#'    \item \emph{conc_c}: (numeric) the concentrations of the column drugs
#'    in a combination.
#'    \item \emph{conc_r}: (numeric) the concentrations of the row drugs
#'    in a combination.
#'    \item \emph{conc_c_unit}: (character) the unit of concentrations of the
#'    column drugs. It is typically nM or \eqn{\mu}M.
#'    \item \emph{conc_r_unit}: (character) the unit of concentrations of the
#'    row drugs. It is typically nM or \eqn{\mu}M.
#'    \item \emph{response}: (numeric) the effect of drug combinations at the
#'    concentra-tions specified by conc_r and conc_c. The effect must be
#'    normalized to \%inhibition based on the positive and
#'    negative controls. For a well-controlled experiment, the range of the
#'    response values is expected from 0 to 100. However, missing values or
#'    extreme values are allowed.
#'    \item \emph{cell_line_name}: (character) the name of cell lines on which
#'    the drug combination was tested.
#'  }
#'
#' @param summary.only a logical value. If it is \code{TRUE} then only summary
#' table is calculated and returned, otherwise, for tables will be return.
#' Default setting is \code{FALSE}.
#'
#' @param seed a integer or NULL. It is used to set the random seed to
#' \code{AddNoise} function to make sure the results are reproducible.
#' By default, it is set as NULL which means no seed was set.
#'
#' @return A list. It contains 4 tables:
#'   \itemize{
#'     \item \strong{response} It contains the modified response value and 4
#'     type of synergy scores of each drug dose response pair.
#'     \item \strong{summary} It contains summarized information of each
#'     blocks: synergy scores, css, ri, S
#'     \item \strong{curve} It contains the coefficients from single drug dose
#'     response curve.
#'     \item \strong{surface} It contains the smoothed response value and
#'     synergy scores of each drug dose response pair.
#'  }
#'
#' @author
#' Jing Tang \email{jing.tang@helsinki.fi}
#' Shuyu Zheng \email{shuyu.zheng@helsinki.fi}
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#' @examples
#' data <- read.csv(system.file("template.csv", package = "TidyComb"),
#'                  stringsAsFactors = FALSE)
#' res <- CalculateTemplate(data)
CalculateTemplate <- function(template, summary.only=FALSE, seed = NULL) {
  template <- CheckTemplate(template)

  blocks <- unique(template$block_id)

  # generate container
  summary <- data.frame(block_id = integer(), synergy_zip = numeric(),
                        synergy_bliss = numeric(), synergy_hsa = numeric(),
                        synergy_loewe = numeric(), ic50_row = numeric() ,
                        ic50_col = numeric(), ri_row = numeric(),
                        ri_col = numeric(), css_row = numeric(),
                        css_col = numeric(), css = numeric(), S_sum = numeric(),
                        S_max = numeric(), S_mean = numeric(),
                        stringsAsFactors = FALSE)

  if (!summary.only) {
    response <- data.frame(block_id = integer(), conc_r = numeric(),
                          conc_c = numeric(), inhibition = numeric(),
                          synergy_zip = numeric(), synergy_bliss = numeric(),
                          synergy_loewe = numeric(), synergy_hsa = numeric(),
                          stringsAsFactors = FALSE)
    surface <- response
    curve <- data.frame(block_id = integer(), b = numeric(),
                        c = numeric(), d = numeric(), e = numeric(),
                        model = numeric(), drug.row = numeric(),
                        drug.col = numeric(), stringsAsFactors = FALSE)
  }

  for (block in blocks) {
    # 1. Generate response matrix for each block
    response.df <- template %>%
      dplyr::filter(block_id == block)

    response.mat <- response.df %>%
      dplyr::select(conc_r, conc_c, inhibition) %>%
      reshape2::acast(conc_r ~ conc_c, value.var = "inhibition")

    # 2. Do calculation on matrix (with error control)
    tmp <- tryCatch({
      CalculateMat(response.mat = response.mat, summary.only = summary.only,
                   seed = seed)
    }, error = function(e) {
      print(block)
      traceback()
    })

    if (summary.only) {
      tmp$block_id <- rep(block, nrow(tmp))
    } else {
      tmp <- lapply(tmp, function(x){
        x$block_id = rep(block, nrow(x))
        return(x)
      })
    }


    # 3. Add information to summary table
    info <- response.df %>%
      dplyr::select(drug_row, drug_col, cell_line_name,
                    conc_r_unit, conc_c_unit) %>%
      unique()

    if (nrow(info) > 1) {
      warning("The summary data of block ", block, " contains ", nrow(info),
              " different versions.")
    } else if (nrow(info) < 1) {
      warning("The summary data of block ", block, " is missing.")
    }

    if (!summary.only) {
      tmp$summary <- cbind.data.frame(info, tmp$summary)

      # 4. fill drug names to curve table

      tmp$curve$drug_col <- rep(NA, 2)
      tmp$curve$drug_col[which(tmp$curve$dim ==
                                 "col")] <- as.character(info$drug_col)

      tmp$curve$drug_row <- rep(NA, 2)
      tmp$curve$drug_row[which(tmp$curve$dim ==
                                 "row")] <- as.character(info$drug_row)

      # 4. Append new tables to containers
      response <- rbind.data.frame(response, tmp$response)
      surface <- rbind.data.frame(surface, tmp$surface)
      curve <- rbind.data.frame(curve, tmp$curve)
      summary <- rbind.data.frame(summary, tmp$summary)
    } else {
      tmp <- cbind.data.frame(info, tmp)
      summary <- rbind.data.frame(summary, tmp)
    }



    # Clean temporary file
    tmp <- NULL
    info <- data.frame()
    response.df <- data.frame()
    response.mat <- matrix()
  }

  if (summary.only) {
    return(summary)
  } else {
    curve <- dplyr::select(curve, block_id, drug_row, drug_col, b, c, d, e, model)

    return(list(response = response, surface = surface,
                curve = curve, summary = summary))
  }
}


multiResultClass <- function(response=NULL, summary=NULL, surface = NULL,
                             curve = NULL) {
  me <- list(
    response = response,
    summary = summary,
    surface = surface,
    curve = curve
  )

  ## Set the name for the class
  class(me) <- append(class(me), "multiResultClass")
  return(me)
}

#' Parallel Calculate Drug Combination data in template format
#'
#' @param template a dataframe in the format as template. Columns "block_id",
#' "drug_row", "drug_col", "inhibition", "conc_r", "conc_c", "conc_r_unit",
#' "conc_c_unit","cell_line_name", "drug_row", "drug_col" are reqired.
#'
#' @param cores A integer. It indicates number of cores would be allocated to
#' the parallel processed
#'
#' @param summary.only a logical value. If it is \code{TRUE} then only summary
#' table is calculated and returned, otherwise, for tables will be return.
#' Default setting is \code{FALSE}.
#'
#' @param seed a integer or NULL. It is used to set the random seed to
#' \code{AddNoise} function to make sure the results are reproducible.
#' By default, it is set as NULL which means no seed was set.
#'
#' @param ... Other arguments required by nested functions. Some important
#' arguments are:
#'  \itemize{
#'    \item \code{impute} and \code{noise} inherited from function
#'          \code{CalculateMat};
#'    \item \code{method} inherited from function \code{CorrectBaseLine};
#'    \item \code{Emin} and \code{Emax} inherited from function
#'          \code{FitDoseResponse}.
#' }
#'
#' @return A list. It contains 4 tables:
#'   \itemize{
#'     \item \strong{response} It contains the modified inhibition value and 4
#'     type of synergy scores of each drug dose response pair.
#'     \item \strong{summary} It contains summarized information of each
#'     blocks: synergy scores, css, ri, S
#'     \item \strong{curve} It contains the coefficients from single drug dose
#'     response curve.
#'     \item \strong{surface} It contains the smoothed inhibition value and
#'     synergy scores of each drug dose response pair.
#'  }
#'
#' @author
#' Jing Tang \email{jing.tang@helsinki.fi}
#' Shuyu Zheng \email{shuyu.zheng@helsinki.fi}
#'
#' @importFrom magrittr %>%
#' @importFrom foreach %dopar%
#'
#' @export
#' @examples
#' data <- read.csv(system.file("template.csv", package = "TidyComb"),
#'                  stringsAsFactors = FALSE)
#' res <- ParCalculateTemplate(data)
ParCalculateTemplate <- function(template, cores = 1, summary.only = FALSE,
                                 seed = NULL, ...) {
  template <- CheckTemplate(template)
  blocks <- unique(template$block_id)

  cl <- parallel::makeForkCluster(cores)
  doParallel::registerDoParallel(cl)

  res <- foreach::foreach (i = 1:length(blocks)) %dopar% {

    # 1. Generate response matrix for each block
    result <- multiResultClass()
    response <- template %>%
      dplyr::filter(block_id == blocks[i])

    response.mat <- response %>%
      dplyr::select(conc_r, conc_c, inhibition) %>%
      reshape2::acast(conc_r ~ conc_c, value.var = "inhibition")

    # 2. Do calculation on matrix
    tmp <- CalculateMat(response.mat = response.mat,
                        summary.only = summary.only, seed = seed)
    if (summary.only) {
      tmp$block_id = rep(blocks[i], nrow(tmp))
    } else {
      tmp <- lapply(tmp, function(x){
        x$block_id = rep(blocks[i], nrow(x))
        return(x)
      })
    }

    # 3. Add information to summary table
    info <- response %>%
      dplyr::select(drug_row, drug_col, cell_line_name,
                    conc_r_unit, conc_c_unit) %>%
      unique()

    if (nrow(info) > 1) {
      warning("The summary data of block ", blocks[i], " contains ", nrow(info),
              " different versions.")
    } else if (nrow(info) < 1) {
      warning("The summary data of block ", blocks[i], " is missing.")
    }

    if (summary.only) {
      result <- cbind.data.frame(info, tmp)
    } else {
      tmp$summary <- cbind.data.frame(info, tmp$summary)

      # 4. fill drug names to curve table

      tmp$curve$drug_col <- rep(NA, 2)
      tmp$curve$drug_col[which(tmp$curve$dim ==
                                 "col")] <- as.character(info$drug_col)

      tmp$curve$drug_row <- rep(NA, 2)

      tmp$curve$drug_row[which(tmp$curve$dim ==
                                 "row")] <- as.character(info$drug_row)

      tmp$curve <- dplyr::select(tmp$curve, block_id, drug_row, drug_col,
                                 b, c, d, e, model)

      # 5. collect tables
      result$response <- tmp$response
      result$surface <- tmp$surface
      result$summary <- tmp$summary
      result$curve <- tmp$curve
    }

    # Clean temporary file
    tmp <- NULL
    info <- data.frame()
    response <- data.frame()
    response.mat <- matrix()
    rm(.Random.seed)
    return(result)
  }

  # Stop the clusters
  parallel::stopCluster(cl)

  # Collecting the results
  if (summary.only) {
    res2 <- Reduce(function(x, y) {rbind.data.frame(x, y)}, res)
  } else {
    res2 <- list()
    res2$response <- Reduce(function(x, y) {rbind.data.frame(x, y)},
                           lapply(res, "[[" , "response"))
    res2$surface <- Reduce(function(x, y) {rbind.data.frame(x, y)},
                           lapply(res, "[[" , "surface"))
    res2$summary <- Reduce(function(x, y) {rbind.data.frame(x, y)},
                           lapply(res, "[[" , "summary"))
    res2$curve <- Reduce(function(x, y) {rbind.data.frame(x, y)},
                         lapply(res, "[[" , "curve"))
  }

  return(res2)
}
DrugComb/TidyComb documentation built on June 22, 2022, 2:49 a.m.