R/tcplfit_onerun.R

Defines functions tibble_fit_para make_act_na_0_in make_act_na_0 summarize_fit_output get_hillfit_direction create_hillsimu_resp create_hillsimu_dataset merge_fit_output clean_fit_output_in clean_fit_output cal_fit_dataset nest_fit_dataset run_fit

Documented in run_fit summarize_fit_output

#' Run parametric fits using types of models on concentration-response datasets
#'
#' Confidence intervals of activity metrics can be obtained through bootstrap approach.
#' The bootstrap samples are generated by adding the residuals (the difference between the original responses and the Hill fit) to
#' the fitted response (strictly to Hill equation).
#'
#' @param d Datasets with concentration-response data.
#'   An example is [zfishbeh].
#'   mask column is optional.
#' @inheritParams fit_modls
#' @param keep_sets Output datasets. Multiple values are allowed.
#'   Default values are fit_set and resp_set.
#'   fit_set is a must.
#' \itemize{
#'   \item fit_set: a tibble with output from model fits
#'   \item resp_set: a tibble with fitted response data from the winning model
#' }
#' @param n_samples NULL (default) for no bootstrap samples are generated or number of samples to be generated from bootstrapping.
#'   When n_samples is not NULL, fit_modls = "hill" will be set automatically.
#' @param ... The named input configurations for replacing the default configurations.
#'   The input configuration needs to add model type as the prefix.
#'   For example, hill_pdir = -1 will set the Hill fit only to the decreasing direction.
#'
#' @return A list of named components: result and result_nested.
#'   The result component is also a list of output sets depending on the parameter, *keep_sets*.
#'   The result_nested component is a tibble with input data nested in a column, input,
#'   and output data nested in a column, output.
#'
#'   The prefix of the column names in the *fit_set* are the used models.
#'   The *win_modl* is the winning model.
#'
#' @export
#'
#' @seealso [fit_modls()] for model fit information and the following analyses using [summarize_fit_output()].
#'  for dichotomous response (see [zfishdev]), use `create_dataset()` first.
#'
#' @examples
#'
#' # default
#' fitd <- run_fit(zfishbeh)
#'
#' # use only hill model and fit only to the decreasing direction, keep only the fit_set output
#' fitd <- run_fit(zfishbeh, modls = "hill", keep_sets = "fit_set", hill_pdir = -1)
#'
#' # fit to the bootstrap samples
#' fitd <- run_fit(zfishbeh, n_samples = 2)
#'
#'
run_fit <- function(d, modls = c("hill", "cnst"), keep_sets = c('fit_set', 'resp_set'), n_samples = NULL, ...) {

  # argument checks
  args <- list(...)
  d <- .check_dat_base(d, check_one_concresp = FALSE)
  d <- create_resp_mask(d, mask = 0) #only user put the mask column, disable the
  keep_sets <- .check_keep_sets(keep_sets, c('fit_set', 'resp_set'), must_set = "fit_set")
  modls <- match.arg(modls, c("hill", "cnst"), several.ok = TRUE)
  n_samples <- .check_n_samples(n_samples)

  # currently only hill can be used to generate simulated samples
  if (!is.null(n_samples)) modls <- 'hill'

  # nest the data and calculate fits
  nestd <- nest_fit_dataset(d, nest_cols = c("endpoint", "chemical"))
  fitd <- cal_fit_dataset(nestd = nestd, modls = modls, args = args, fit_type = "original")

  # run simulated datasets
  if(!is.null(n_samples)) {
    simud <- create_hillsimu_dataset(fitd = fitd, n_samples = n_samples, pdir = args$hill_pdir)
    nestd <- nest_fit_dataset(simud, nest_cols = c("endpoint", "chemical", "sample_id"))
    fitd <- cal_fit_dataset(nestd = nestd, modls = modls, args = args, fit_type = "hill_simu")
  }

  # clean the output and unnest the results (sets)
  fitd_clean <- clean_fit_output(fitd = fitd, keep_sets = keep_sets)
  sets <- merge_fit_output(fitd_clean, keep_sets = keep_sets)

  # return the list
  result <- list(result = sets, result_nested = fitd)
  class(result) <- c("rcurvep", class(result))

  return(result)
}



#' Nest the dataset for preparing the ensuing calculations.
#'
#' @inheritParams run_fit
#' @param nest_cols Column names of the dataset.
#'
#' @return A nested tibble with a new column, input.
#' @keywords internal
#' @noRd
#'
nest_fit_dataset <- function(d, nest_cols) {

  nest_colsq <- rlang::syms(c(nest_cols, 'conc'))

  result <- d %>%
    dplyr::arrange(!!!nest_colsq) %>%
    tidyr::nest(input = -tidyselect::one_of(nest_cols))

  return(result)
}


#' Perform parametric fitting using types of models on the nested dataset.
#'
#' The calculation can be performed either on the original dataset
#' or on the simulated dataset (Hill only).
#'
#' @param nestd The nested data from \code{nest_fit_dataset}.
#' @inheritParams fit_modls
#' @param args The additional arguments for running the \code{run_fit}.
#' @param fit_type The fitting type. Either original or hill_simu. Only value is allowed.
#'
#' @return A tibble.
#' @keywords internal
#' @noRd
#'
#'
cal_fit_dataset <- function(nestd, modls, args, fit_type = c("original", "hill_simu")) {
  if (fit_type == "original") {
    result <- nestd %>%
      dplyr::mutate(
        output = purrr::map(
          .data$input,
          ~ do.call(
            fit_modls,
            c(list(Conc = .x$conc, Resp = .x$resp, Mask = .x$mask, modls = modls), args)
            )
        )
      )
  } else if (fit_type == "hill_simu") {

    args[['hill_pdir']] <- NULL

    result <- nestd %>%
      dplyr::mutate(
        output = purrr::map(
          .data$input,
          ~ do.call(
            fit_modls,
            c(list(Conc = .x$conc, Resp = .x$resp, hill_pdir = unique(.x$direction), Mask = NULL, modls = modls), args))
        )
      )
  }

  return(result)
}


#' Clean the dataset results after fitting based on multiple values of keep_sets.
#'
#' @param fitd The output from \code{cal_fit_dataset()}.
#' @inheritParams run_fit
#' @param thr_resp The response cutoff to calculate the potency. Default = NULL.
#' @param perc_resp The percentage cutoff to calculate the potency. Default = NULL.
#'
#' @return A tibble with results nested in new columns.
#'   fit_set to out_paras.
#'   resp_set to out_resps.
#'   act_set to in_summary and activity.
#'
#' @keywords internal
#' @noRd
#'

clean_fit_output <- function(fitd, keep_sets, thr_resp = NULL, perc_resp = NULL) {
  result <- suppressMessages(
    purrr::reduce(
      purrr::map(
        keep_sets, clean_fit_output_in,
        fitd = fitd, thr_resp = thr_resp, perc_resp = perc_resp),
      dplyr::left_join)
  )
  return(result)
}

#' Clean the dataset results after fitting based on one keep_set.
#'
#' @inheritParams summarize_fit_output
#' @param keep_set Only value is allowed, fit_set, resp_set, act_set.
#'
#' @return A tibble with nested results in new columns.
#'   fit_set to out_paras.
#'   resp_set to out_resps.
#'   act_set to in_summary and activity.
#'
#' @keywords internal
#' @noRd
#'

clean_fit_output_in <- function(fitd, keep_set, thr_resp = NULL, perc_resp = NULL) {

  if (keep_set == "fit_set") {
    result <- fitd %>%
      dplyr::mutate(
        out_paras = purrr::map(.data$output, extract_fit_para)
      )
  }
  if (keep_set == 'resp_set') {
    result <- fitd %>%
      dplyr::mutate(
        out_resps = purrr::map2(.data$input, .data$output, extract_fit_resp)
      )
  }
  if (keep_set == "act_set") {
    result <- fitd %>%
      dplyr::mutate(
        in_summary = purrr::map(.data$input, extract_input_summary),
        activity = purrr::map2(.data$input, .data$output, extract_fit_activity, thr_resp = thr_resp, perc_resp = perc_resp)
      )
  }
  result <- result %>% dplyr::select(-.data$input, -.data$output)

  return(result)
}


#' Merge the nested results from fitting into tibble.
#'
#' @param fitd The output from \code{clean_fit_output()}.
#' @inheritParams run_fit
#'
#' @return A list of tibbles (unnested) named by the keep_sets.
#' @keywords internal
#' @noRd

merge_fit_output <- function(fitd, keep_sets) {

  tbl_names <- list(
    fit_set = c("out_paras"),
    resp_set = c("out_resps"),
    act_set = c("in_summary", "activity")
  )

  result <- purrr::map(
    keep_sets, function(x, tbl_names, obj)
      suppressWarnings(tidyr::unnest(
        obj %>% dplyr::select(tidyselect::one_of(c("endpoint", "chemical", "sample_id", tbl_names[[x]])))
      )),
    tbl_names = tbl_names,
    obj = fitd
  ) %>% rlang::set_names(keep_sets)


  return(result)

}



#' Create a simulated dataset based on Hill fit results.
#'
#' @param fitd The output from the \code{cal_fit_dataset()}.
#' @inheritParams run_fit
#' @param pdir The preferred direction of the fit.
#'
#' @return A tibble with new columns, including sample_id and direction.
#' @keywords internal
#' @noRd
#'

create_hillsimu_dataset <- function(fitd, n_samples, pdir) {

  # a new column simud is added, which has a nested tibble
  # The nested tibble has a new sample_id column
  result <- fitd %>%
    dplyr::mutate(
      simud = purrr::map2(
        .data$input, .data$output,
        ~ dplyr::bind_rows(
          replicate(n_samples, create_hillsimu_resp(.x, .y), simplify = FALSE),
          .id = "sample_id"))
    )

  # create direction column and remove the original fit input and output
  result <- result %>%
    dplyr::mutate(
      direction = purrr::map_dbl(
        .data$output, get_hillfit_direction, pdir = pdir)
    ) %>%
    dplyr::select(-.data$input, -.data$output) %>%
    tidyr::unnest(cols = "simud") %>%
    dplyr::select(-.data$mask) # do not need the mask anymore

  return(result)
}

#' Create simulated responses by the hill fit results.
#'
#' The new responses is the fitted responses + sampled residuals.
#'
#' @param inp A list (tibble) of with conc and resp columns.
#' @param out A list of named models.
#'
#' @return The vector of responses.
#' @keywords internal
#' @noRd

create_hillsimu_resp <- function(inp, out) {
  result <- inp
  yhat <- tcplHillVal(inp$conc, out[['hill']]$tp, out[['hill']]$ga, out[['hill']]$gw)
  resid <- inp$resp - yhat
  sampleresids <- sample(resid,length(resid),replace = TRUE)
  this_y <- yhat + sampleresids
  result$resp <- this_y
  return(result)
}


#' Get the direction to the next round of Hill fitting.
#'
#' If pdir is null, then use the fit results to get the direction.
#' Otherwise, use the user input.
#'
#' @param out The list of models. The component of hill is necessary.
#' @param pdir The preferred direction.
#'
#' @return An integer 1 or -1.
#' @keywords internal
#' @noRd
#'
get_hillfit_direction <- function(out, pdir) {

  tp <- out[['hill']]$tp

  # if not null and
  if (!is.null(pdir)) {
    if (length(pdir) == 1) {
      return(pdir)
    }
  }

  result <- 1
  if(!is.na(tp)) {
    if (tp < 0) result <- -1
  }
  return(result)

}


#' Summarize the results from the parametric fitting using types of models
#'
#' The function first extracts the activity data based on the fit the supplied input parameters.
#' In addition, summary of activity data (e.g., confidence interval, hit confidence) can be produced.
#'
#' A tibble, act_set is generated. When (extract_only = FALSE), a tibble, act_summary is generated with confidence intervals of the activity metrics.
#' The quantile approach is used to calculate the confidence interval.
#' For potency activity metrics, if value is NA, highest tested concentration is used in the summary.
#' For other activity metrics, if value is NA, 0 is used in the summary.
#'
#' @param d The output from the [run_fit()].
#' @param thr_resp The response cutoff to calculate the potency. Default = NULL.
#' @param perc_resp The percentage cutoff to calculate the potency. Default = NULL.
#' @param ci_level The confidence level for the activity metrics. Default is = 0.95.
#' @param extract_only Whether act_summary data should be produced. Default = FALSE.
#'
#' @return A list of named components: result and result_nested (and act_summary).
#'   The result and result_nested are the copy from the output of [run_fit()].
#'   An act_set is added under the result component.
#'   If (extract_only = FALSE), an act_summary is added.
#' @export
#' @seealso [run_fit()]
#' @examples
#'
#' # generate some fit outputs
#'
#' ## fit only
#' fitd1 <- run_fit(zfishbeh)
#'
#' ## fit + bootstrap samples
#' fitd2 <- run_fit(zfishbeh, n_samples = 3)
#'
#'\donttest{
#' # only to extract the activity data
#' sumd1 <- summarize_fit_output(fitd1, extract_only = TRUE)
#'
#' # calculate EC20 instead of default EC10
#' sumd1 <- summarize_fit_output(fitd1, extract_only = TRUE, perc_resp  = 20)
#'
#' # calculate POD using a higher noise level (e.g., 40)
#' ## this number depends on the response unit
#' sumd1 <- summarize_fit_output(fitd1, extract_only = TRUE, thr_resp  = 40)
#'
#' # calculate confidence intervals based on the bootstrap samples
#' sumd2 <- summarize_fit_output(fitd2)
#'
#'}
#'
summarize_fit_output <- function(d, thr_resp = 20, perc_resp = 10, ci_level = 0.95, extract_only = FALSE) {

  fitd <- d$result_nested
  fitd_clean <- clean_fit_output(fitd, keep_sets = "act_set", thr_resp = thr_resp, perc_resp = perc_resp)
  act_set <- merge_fit_output(fitd_clean, keep_sets = c("act_set"))
  d[['result']] <- c(d$result, act_set)

  if (!extract_only) {
    lsets <- d$result
    base_cols <- get_base_cols(lsets)
    lsets_n <- get_nested_joined_sets(lsets, base_cols)
    lsets_n <- make_act_na_highconc(lsets_n)
    lsets_n <- make_act_na_0(lsets_n)
    lsets_n <- summarize_actsets(lsets_n, ci_level = ci_level)
    d[['act_summary']] <- unnest_joined_sets(lsets_n, base_cols, "act_summary")
  }

  class(d) <- c("rcurvep", class(d))

  return(d)
}

#' Make activity NA as 0 in a dataset.
#'
#' @param nestd The output from \code{get_nested_joined_sets()}
#'
#' @return The original tibble with a added column, act_set.
#' @keywords internal
#' @noRd

make_act_na_0 <- function(nestd) {
  result <- nestd %>%
    dplyr::mutate(
      act_set = purrr::map(.data$act_set, make_act_na_0_in)
    )
  return(result)
}

#' Make activity NA as 0 for some selected columns.
#'
#' @param act_set A tibble.
#'
#' @return A tibble with values in the selected columns modified.
#' @keywords internal
#' @noRd
#'
#'
make_act_na_0_in <- function(act_set) {

  potency_cols <- c("slope", "Emax", "max_resp", "min_resp")
  suppressWarnings(result <- act_set %>%
                     dplyr::mutate_at(
                       dplyr::vars(tidyselect::one_of(potency_cols)),
                       ~ replace(.x, is.na(.x), 0)
                     ))

  return(result)
}


#' Convert the output data of a model to the structure of tibble.
#'
#' The model type information is added as the prefix of the column names.
#'
#' @param modl A list from one model.
#'
#' @return A tibble with parameters.
#' @keywords internal
#' @noRd
#'
tibble_fit_para <- function(modl) {
  pars <- tibble::as_tibble(modl)
  modl_type <- pars$modl
  result <- pars %>%
    dplyr::select(-.data$modl) %>%
    dplyr::rename_all(.funs = list(~ stringr::str_c(modl_type, "_", .)))
  return(result)
}

#' Convert the output data from a list of models to the structure of tibble.
#'
#' A column, win_modl, indicating the winning model, is also added.
#'
#' @param lmodl A list of models.
#'
#' @return A tibble with output parameters from types of model fitting + a column, win_modl.
#' @keywords internal
#' @noRd

extract_fit_para <- function(lmodl) {
  result <- purrr::map_dfc(lmodl, tibble_fit_para)
  result[['win_modl']] <- lmodl[[1]]$modl
  return(result)
}

#' Extract fit responses based on the winning model
#'
#' If the winning model is cnst, the median of responses is reported per concentration.
#'
#' @inheritParams create_hillsimu_resp
#'
#' @return The tibble of the input with an added column, fitted_resp.
#' @keywords internal
#' @noRd

extract_fit_resp <- function(inp, out) {
  result <- inp
  win_modl <- out[[1]]

  fit_resps <- rep(as.numeric(NA), length(inp$conc))

  if (win_modl$modl == "cnst") {

    fit_resps <- rep(median(inp$resp), length(inp$conc))

  } else if (win_modl$modl == "hill") {

    fit_resps <- tcplHillVal(inp$conc, win_modl$tp, win_modl$ga, win_modl$gw)
  }

  # add the results to a new column
  result[['fitted_resp']] <- fit_resps
  return(result)
}



#' Extract activity from the model fits.
#'
#'
#' If the cnst is the winning model and the median of responses larger than the thr_resp,
#' it is considered as an hit.
#' The median of responses is reported as Emax and
#' the lowest concentration is the other potenty metrics.
#' The hit is considered having POD < max tested concentration.
#'
#' @inheritParams create_hillsimu_resp
#' @inheritParams summarize_fit_output
#'
#' @return A list of activity metrics + hit.
#' @keywords internal
#' @noRd
#'
extract_fit_activity <- function(inp, out, thr_resp, perc_resp) {

  win_modl <- out[[1]]
  med_resp <- median(inp$resp)
  EC50 <- NA_real_
  Emax <- NA_real_
  POD <- NA_real_
  ECxx <- NA_real_
  slope <- NA_real_
  hit <- 0

  if (win_modl$modl == "cnst") {
    if (abs(med_resp) > thr_resp) {
      hit <- 1
      min_conc <- min(inp$conc)
      Emax <- med_resp
      EC50 <- min_conc
      POD <- min_conc
      ECxx <- min_conc

    }

  } else if (win_modl$modl == "hill") {
    Emax <- win_modl$tp
    EC50 <- win_modl$ga
    slope <- win_modl$gw

    if (!is.na(Emax)) {
      # perc_resp
      ECxx <- tcplHillACXX(perc_resp, win_modl$tp, win_modl$ga, win_modl$gw)

      # thr_resp
      if (Emax < 0) thr_resp <- thr_resp*-1
      if (abs(Emax) > abs(thr_resp)) POD <- tcplHillConc(thr_resp, win_modl$tp, win_modl$ga, win_modl$gw)

      # hit?
      if (!is.na(POD) && POD < max(inp$conc)) hit <- 1
    }
  }

  result <- tibble::tibble(
    EC50 = EC50,
    slope = slope,
    Emax = Emax,
    ECxx = ECxx,
    POD = POD,
    hit = hit
    )
  return(result)

}


extract_hill_activity <- function(out, bmr) {
  if (out$tp < 0) bmr <- bmr*-1

  result <- list(
    EC50 = out[['ga']],
    Emax = out[['tp']],
    POD = suppressWarnings(tcplHillConc(bmr, out$tp, out$ga, out$gw))
  ) %>% tibble::as_tibble()

  return(result)
}

Try the Rcurvep package in your browser

Any scripts or data that you put into this service are public.

Rcurvep documentation built on Aug. 25, 2022, 5:09 p.m.