R/tabular_input.R

#' Run Analyses From Files
#' 
#' This function runs a model from tabular input.
#' 
#' The reference file should have two columns, `data` 
#' and `file`. An optional column `absolute_path` can be 
#' added, with value `TRUE` where an absolute file path 
#' is provided. `data` values must include 
#' `state`, `tm`, and `parameters`, and can 
#' also include `options`, `demographics` and 
#' `data`.  The corresponding values in the `file`
#' column give the names of the files (located in 
#' `base_dir`) that contain the corresponding 
#' information - or, in the case of `data`, the 
#' directory containing the tables to be loaded.
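#' 
#' As an illustration (file names are hypothetical), a minimal
#' reference file could be read by [read_file()] as:
#' 
#' ```
#' data.frame(
#'   data = c("state", "tm", "parameters", "options"),
#'   file = c("states.csv", "transitions.csv", "parameters.csv", "options.csv")
#' )
#' # run_model_tabular(location = "path/to/model_files")
#' ```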
#' 
#' @param location Directory where the files are located.
#' @param reference Name of the reference file.
#' @param run_dsa Run DSA?
#' @param run_psa Run PSA?
#' @param run_demo Run demographic analysis?
#' @param save Should the outputs be saved?
#' @param overwrite Should the outputs be overwritten?
#'   
#' @return A list of evaluated models (always), and, if 
#'   appropriate input is provided, dsa (deterministic 
#'   sensitivity analysis), psa (probabilistic sensitivity 
#'   analysis) and demographics (results across different 
#'   demographic groups).
#'   
#' @export
run_model_tabular <- function(location, reference = "REFERENCE.csv",
                              run_dsa = TRUE, 
                              run_psa = TRUE, run_demo = TRUE,
                              save = FALSE, overwrite = FALSE) {
  
  inputs <- gather_model_info(location, reference)
  outputs <- eval_models_from_tabular(inputs,
                                      run_dsa = run_dsa,
                                      run_psa = run_psa,
                                      run_demo = run_demo)
  
  output_dir <- inputs$output_dir
  
  if (save) {
    save_outputs(outputs, output_dir, overwrite)
  }
  
  outputs
}

#' Gather Information for Running a Model From Tabular Data
#' 
#' @param base_dir Directory where the files are located.
#' @param ref_file Name of the reference file.
#'   
#' @return A list with elements: \itemize{ \item models (of 
#'   type `uneval_model`, created by 
#'   [create_model_list_from_tabular()]) \item 
#'   param_info parameter definitions (and any DSA/PSA specifications) \item output_dir where to store output 
#'   files, if specified \item demographic_file a table for 
#'   demographic analysis \item model_options a list of 
#'   model options.}
#'   
#' @keywords internal
gather_model_info <- function(base_dir, ref_file) {
  
  if (options()$heemod.verbose) message("* Reading files...")
  if (options()$heemod.verbose) message("** Reading reference file...")
  ref <- read_file(file.path(base_dir, ref_file))
  
  if (any(pb <- duplicated(ref$data))) {
    stop(sprintf(
      "Duplicated values in reference file 'data' column: %s.",
      paste(ref$data[pb], collapse = ", ")
    ))
  }
  
  if (is.null(ref$absolute_path)) {
    ref$full_file <- file.path(base_dir, ref$file)
  } else {
    if (options()$heemod.verbose) message(sprintf(
      "** Using absolute path for %s.",
      paste(ref$data[ref$absolute_path & ! is.na(ref$absolute_path)],
            collapse = ", ")
    ))
    
    ref$full_file <- ifelse(
      ref$absolute_path & ! is.na(ref$absolute_path),
      ref$file,
      file.path(base_dir, ref$file)
    )
  }
  
  df_env <- new.env()
  
  if (options()$heemod.verbose) message("** Reading model list...")
  models <- create_model_list_from_tabular(
    ref = ref,
    df_env = df_env
  )
  
  model_options <- NULL
  if ("options" %in% ref$data) {
    if (options()$heemod.verbose) message("** Reading options...")
    model_options <- create_options_from_tabular(
      read_file(ref$full_file[ref$data == "options"])
    )
  }
  
  if ("data" %in% ref$data) {
    if (options()$heemod.verbose) message("** Reading external data...")
    create_df_from_tabular(
      ref$full_file[ref$data == "data"],
      df_env
    )
  }
  
  if ("source" %in% ref$data) {
    if(options()$heemod.verbose) message("** Reading R source files...")
    source_dir <- ref$full_file[ref$data == "source"]
    short_source_dir <- ref$file[ref$data == "source"]
    if (! dir.exists(source_dir)) {
      stop("'source' directory missing: ", short_source_dir)
    }
    source_list <- list.files(source_dir, full.names = TRUE)
    if (length(source_list) == 0) {
      stop("No source files in 'source' directory: ", 
           short_source_dir)                      
    }                                    
    for (this_file in source_list) { 
      source(this_file, echo = FALSE, local = df_env)
    }
  }
  
  ## note - the environment df_env gets included directly
  ##   into param_info, so anything that will load anything
  ##   into that environment needs to come before this statement
  if (options()$heemod.verbose) message("** Reading parameters..")
  param_info <- create_parameters_from_tabular(
    read_file(ref$full_file[ref$data == "parameters"]),
    df_env
  )
  
  output_dir <- NULL
  if("output" %in% ref$data) {
    if (options()$heemod.verbose) message("** Reading path to output directory...")
    output_dir <- ref$full_file[ref$data == "output"]
  }
  
  demographic_file <- NULL
  if("demographics" %in% ref$data) {
    if (options()$heemod.verbose) message("** Reading demographic data...")
    demographic_file <- create_demographic_table(
      read_file(ref$full_file[ref$data == "demographics"]),
      params = param_info$params
    )
  }
  
  
  list(
    models = models,
    param_info = param_info,
    output_dir = output_dir,
    demographic_file = demographic_file,
    model_options = model_options
  )
}

#' Evaluate Models From a Tabular Source
#' 
#' Execute a full set of analyses, possibly including 
#' discrete sensitivity analysis, probabilistic sensitivity 
#' analysis, and analyses across demographics.
#' 
#' @param inputs Result from 
#'   [gather_model_info()].
#' @param run_dsa Run DSA?
#' @param run_psa Run PSA?
#' @param run_demo Run demographic analysis?
#'   
#' @return A list: \itemize{ \item `models` (always) 
#'   unevaluated models \item `model_runs` (always) 
#'   evaluated models \item `dsa` (deterministic 
#'   sensitivity analysis) - if appropriate parameters 
#'   provided \item `psa` (probabilistic sensitivity 
#'   analysis) - if appropriate parameters provided \item 
#'   `demographics` results across different 
#'   demographic groups - if appropriate parameters 
#'   provided}
#'   
#' @keywords internal
eval_models_from_tabular <- function(inputs,
                                     run_dsa = TRUE,
                                     run_psa = TRUE,
                                     run_demo = TRUE) {
  
  if (options()$heemod.verbose) message("* Running files...")
  
  list_args <- c(
    inputs$models,
    list(
      parameters = inputs$param_info$params,
      init = inputs$model_options$init,
      cost = inputs$model_options$cost,
      effect = inputs$model_options$effect,
      base_model = inputs$model_options$base_model,
      method = inputs$model_options$method,
      cycles = inputs$model_options$cycles
    )
  )
  
  list_args <- Filter(
    function(x) ! is.null(x),
    list_args
  )
  
  
  if (! is.null(inputs$model_options$num_cores)) {
    use_cluster(inputs$model_options$num_cores)
    on.exit(
      close_cluster()
    )
  }
  
  if (options()$heemod.verbose) message("** Running models...")
  model_runs <- do.call(
    run_model,
    list_args
  )
  
  model_dsa <- NULL
  if (run_dsa & ! is.null(inputs$param_info$dsa_params)) {
    if (options()$heemod.verbose) message("** Running DSA...")
    model_dsa <- run_dsa(
      model_runs,
      inputs$param_info$dsa_params
    )
  }
  
  model_psa <- NULL
  if (! is.null(inputs$param_info$psa_params) & run_psa) {
    if (options()$heemod.verbose) message("** Running PSA...")
    model_psa <- run_psa(
      model_runs,
      psa = inputs$param_info$psa_params,
      N = inputs$model_options$n
    )
  }
  
  demo_res <- NULL
  if (! is.null(inputs$demographic_file) & run_demo) {
    if (options()$heemod.verbose) message("** Running demographic analysis...")
    demo_res <- stats::update(model_runs, inputs$demographic_file)
  }
  
  ##  if(! is.null(inputs$model_options$num_cores)) {
  ##    close_cluster()
  ##  }
  
  list(
    models = inputs$models,
    model_runs = model_runs,
    dsa = model_dsa,
    psa = model_psa,
    demographics = demo_res
  )
}

#' Read Models Specified by Files
#' 
#' @param ref Imported reference file.
#' @param df_env An environment containing external data.
#'   
#' @return A list of unevaluated models.
#'   
#' @keywords internal
create_model_list_from_tabular <- function(ref, df_env = globalenv()) {
  if(! inherits(ref, "data.frame")) stop("'ref' must be a data frame.")
  
  if (options()$heemod.verbose) message("*** Reading states...")
  state_file_info <- 
    read_file(ref$full_file[ref$data == "state"])
  
  state_info <- parse_multi_spec(
    state_file_info,
    group_vars = ".state"
  )
  state_names <- state_info[[1]]$.state
  ## to accommodate partitioned survival models, we will allow for
  ##   the possibility that there is no transition matrix ...
  if (options()$heemod.verbose) message("*** Reading TM...")
  
  tm_info <- read_file(ref$full_file[ref$data == "tm"])
  trans_type <- transition_type(tm_info)
  
  if (trans_type == "matrix") {
    tm_info <- parse_multi_spec(
      tm_info,
      group_vars = c("from", "to"))
    tab_undefined <- 
      dplyr::bind_rows(tm_info) %>%
      dplyr::filter(is.na(.data$prob))
    
    if (nrow(tab_undefined) > 0) {
      rownames(tab_undefined) <- NULL
      print(tab_undefined)
      stop("Undefined probabilities in the transition matrix (see above).")
    }
    one_way <- setdiff(names(state_info), names(tm_info))
    other_way <- setdiff(names(tm_info), names(state_info))
  }
  
  
  if(trans_type == "part_surv"){
    use_fits_file <- ref[ref$data == "use_fits", "full_file"]
    use_fits <- read_file(use_fits_file)
    tm_info <- construct_part_surv_tib(use_fits, ref, env = df_env,
                                    state_names = state_names)
    one_way <- setdiff(names(state_info), unique(tm_info$.strategy))
    other_way <- setdiff(unique(tm_info$.strategy), names(state_info))
  }


  if (length(c(one_way, other_way))){
    err_string <- "Mismatching model names between transition (TM) file and state file.\n"
    if(length(one_way))
      err_string <-
        paste(err_string,
              "In state file but not TM file:", 
              paste(one_way, collapse = ", "),
              "\n")
    if(length(other_way))
      err_string <-
        paste(err_string,
              "In TM but not state file:", 
              paste(other_way, collapse = ", "),
              "\n")
    stop(err_string)
  }
  
  
  if(trans_type == "part_surv")
    tm_info <- 
      dplyr::filter(tm_info, .data$.strategy %in% names(state_info))
  else
    tm_info <- tm_info[names(state_info)]


  if (options()$heemod.verbose) message("*** Defining models...")
  models <- lapply(
    seq_along(state_info),
    function(i) {
      if(inherits(tm_info, "tbl_df"))
        this_tm <- dplyr::filter(
          tm_info,
          .data$.strategy == names(state_info)[i])$part_surv[[1]]
      else
        this_tm <- tm_info[[i]]
      create_model_from_tabular(state_info[[i]], 
                                this_tm,
                                df_env = df_env)
    })  
  
  names(models) <- names(state_info)
  
  models
}

#' Create State Definitions From Tabular Input
#' 
#' Transforms tabular input defining states into an 
#' `heemod` object.
#' 
#' Columns of `state_info` besides `.model` and `.state` include 
#' costs and utilities we want to keep track of, with 
#' appropriate values (these may include parameters). For 
#' any cost or utility that should be discounted, an 
#' additional column with the name ".discount.\<cost\>" or 
#' ".discount.\<effect\>", for the appropriate cost or effect,
#' can be included. If no discounting is desired for a 
#' particular cost or effect, the corresponding column can 
#' be omitted.
#' 
#' A discount column can contain only a single value - a 
#' cost or benefit must be discounted by the same amount in 
#' each state. Discounts can be numbers or parameters (which
#' will then need to be defined like any other).
#' 
#' The input data frame is expected to contain state 
#' information for all the models you will use in an 
#' analysis. For more information see the vignette: 
#' `vignette("file-input", package = "heemod")`.
#' 
#' @param state_info Result for one model of 
#'   [parse_multi_spec()].
#' @param df_env An environment containing external data.
#'   
#' @return A state list.
#'   
#' @keywords internal
create_states_from_tabular <- function(state_info,
                                       df_env = globalenv()) {
  
  if(! inherits(state_info, "data.frame")) {
    stop("'state_info' must be a data frame.")
  }
  if(!(".state" %in% names(state_info))) {
    stop("'.state' should be a column name of the state file.")
  }
  if (any(duplicated(state_info$.state))) {
    stop(sprintf(
      "Duplicated state names: %s.",
      paste(unique(state_info$.state[duplicated(state_info$.state)]),
            sep = ", ")
    ))
  }
  
  
  
  state_names <- state_info$.state
  values <- setdiff(names(state_info), c(".model", ".state"))
  discounts <- values[grep("^\\.discount", values)]
  values <- setdiff(values, discounts)
  discounts_clean <- gsub("^\\.discount\\.(.+)", "\\1", discounts)

  num_missing_per_column <- colSums(sapply(state_info, is.na))
  missing_col_names <- names(num_missing_per_column)[num_missing_per_column > 0]
  ## missing names are allowed for discount columns
  missing_col_names <- setdiff(missing_col_names, discounts)
  if(length(missing_col_names)){
    stop("value",
         plur(length(missing_col_names)),
         " ",
         paste(missing_col_names, collapse = ", "),
         " for strategy '",
         unique(state_info$.model),
         "'",
         ifelse(length(missing_col_names) == 1, " has ", " have "),
         "missing values in the state file.\n",
         "Please make sure all values are defined for all states ",
         "(even when the value is 0)."
    )
  }
  
  if (! all(discounts_clean %in% values)) {
    stop(sprintf(
      "Discounting rates defined for non-existing values: %s.",
      paste(discounts[! discounts_clean %in% values], collapse = ", ")
    ))
  }
  
  for (n in discounts) {
    if (all(is.na(state_info[[n]]))) {
      stop(sprintf(
        "No discount values found for '%s'.", n
      ))
      
    } else if (length(unique(stats::na.omit(state_info[[n]]))) > 1) {
      stop(sprintf(
        "Multiple discount values for '%s'.", n
      ))
      
    } else {
      state_info[[n]] <- stats::na.omit(state_info[[n]])[1]
    }
  }
  
  for (n in discounts_clean) {
    state_info[[n]] <- sprintf(
      "discount(%s, %s)",
      state_info[[n]],
      state_info[[paste0(".discount.", n)]]
    )
  }
  
  res <- define_state_list_(
    stats::setNames(lapply(
      state_info$.state,
      function(state) {
        define_state_(
          list(
            .dots = lazyeval::as.lazy_dots(
              stats::setNames(as.character(lapply(
                values,
                function(value) {
                  state_info[[value]][state_info$.state == state]
                }
              )), values),
              env = df_env
            ),
            starting_values = define_starting_values()
          )
        )
      }
    ), state_info$.state)
  )
  if (options()$heemod.verbose) print(res)
  res
}

#' Create a Transition Matrix From Tabular Input
#' 
#' Transforms tabular input defining a transition matrix 
#' into an `heemod` object.
#' 
#' The data frame `trans_probs` should have columns 
#' `from`, `to`, and `prob`, where 
#' `prob` is the probability of a transition from the 
#' `from` state to the `to` state. `prob` can be 
#' defined in terms of parameters, just as when using 
#' `define_transition()` directly. Probabilities of 0 
#' need not be specified - they will be automatically 
#' inserted.
#' 
#' All state names must be used in the `from` column of
#' the transition matrix (otherwise you can just get rid of 
#' the state). Absorbing states should have a transition 
#' from and to themselves with probability 1.
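#' 
#' An illustrative sketch (state and parameter names are
#' hypothetical) for a two-state model with an absorbing
#' `Dead` state:
#' 
#' ```
#' data.frame(
#'   from = c("Healthy", "Healthy", "Dead"),
#'   to = c("Healthy", "Dead", "Dead"),
#'   prob = c("1 - p_death", "p_death", "1"),
#'   stringsAsFactors = FALSE
#' )
#' ```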
#' 
#' @param trans_probs  Result for one model of 
#'   [parse_multi_spec()].
#' @param state_names The names of the states used in the 
#'   transition matrix.
#' @param df_env An environment containing external data.
#'   
#' @return A transition matrix.
#'   
#' @keywords internal
create_matrix_from_tabular <- function(trans_probs, state_names,
                                       df_env = globalenv()) {
  if(! inherits(trans_probs, "data.frame")) {
    stop("'trans_probs' must be a data frame.")
  }
  stopifnot(
    all(c("from", "to", "prob") %in% names(trans_probs)),
    length(state_names) > 0
  )
  
  unique_states <- unique(c(trans_probs$from, trans_probs$to))
  
  ## we can have an initial state where people start but 
  ## can't get to, so we don't check trans_probs$to states 
  ## the way we check trans_probs$from states
  
  if (! all(trans_probs$to %in% trans_probs$from)) {
    stop(sprintf(
      "Some states do not have an exit probability: %s.",
      paste(
        unique(trans_probs$to)[! unique(trans_probs$to) %in% trans_probs$from],
        collapse = ", "
      )
    ))
  }
  
  if(! all(unique_states %in% state_names)) {
    stop(sprintf(
      "Some states specified in the transition matrix differ from 'state_names': %s.",
      paste(unique_states[! unique_states %in% state_names], collapse = ", ")
    ))
  }
  
  ## set up matrix of 0's, and then add the listed
  ## probabilities
  num_states <- length(state_names)
  prob_mat <- matrix(0, nrow = num_states, ncol = num_states,
                     dimnames = list(state_names, state_names))
  prob_mat[as.matrix(trans_probs[, c("to", "from")])] <- trans_probs$prob
  
  res <- define_transition_(
    lazyeval::as.lazy_dots(as.character(prob_mat), env = df_env),
    state_names = state_names
  )
  if (options()$heemod.verbose) print(res)
  res
}

#' Create a Parameter Definition From Tabular Input
#' 
#' If specified in the tabular file, DSA and PSA can also be
#' created.
#' 
#' The tabular parameter definition file can have the 
#' following columns: `parameter` (the parameter name, 
#' required), `value` (required), `low` and 
#' `high` (if both are present, a deterministic 
#' sensitivity analysis will be performed), and `psa` 
#' (a definition of a distribution to use in a probabilistic
#' sensitivity analysis). Other columns will be ignored.
#' 
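#' An illustrative parameter file (names, values and the
#' distribution are hypothetical), read as a data frame:
#' 
#' ```
#' data.frame(
#'   parameter = c("p_death", "cost_std"),
#'   value = c("0.05", "2000"),
#'   low = c("0.03", NA),
#'   high = c("0.08", NA),
#'   psa = c("normal(0.05, 0.01)", NA),
#'   stringsAsFactors = FALSE
#' )
#' ```
#' 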
#' @param param_defs A parameter definition file.
#' @param df_env An environment containing external data.
#' 
#' @return The parameter definition.
#'   
#' @keywords internal
create_parameters_from_tabular <- function(param_defs,
                                           df_env = globalenv()) {
  if(! inherits(param_defs, "data.frame"))
    stop("'param_defs' must be a data frame.")
  if(! "parameter" %in% names(param_defs))
    stop("parameter file must include the column 'parameter'")
  
  if (xor("low" %in% names(param_defs),
          "high" %in% names(param_defs))) {
    stop("Both 'low' and 'high' columns must be present in parameter file to define DSA.")
  }
  
  parameters <- define_parameters_(
    lazyeval::as.lazy_dots(
      stats::setNames(
        as.character(lapply(param_defs$value, function(x) x)),
        param_defs$parameter
      ),
      env = df_env
    )
  )
  
  dsa <- psa <- NULL
  
  if ("low" %in% names(param_defs) &&
      "high" %in% names(param_defs)) {
    
    if (! all(is.na(param_defs$low) ==
              is.na(param_defs$high))) {
      stop("'low' and 'high' must be both non missing in DSA tabular definition.")
    }
    
    if (all(is.na(param_defs$low))) {
      stop("No non-missing values in columns 'low' and 'high'.")
    }
    
    param_sens <- param_defs$parameter[! is.na(param_defs$low)]
    low <- stats::na.omit(param_defs$low)
    high <- stats::na.omit(param_defs$high)
    
    dsa <- define_dsa_(
      par_names = param_sens,
      low_dots = lazyeval::as.lazy_dots(
        stats::setNames(
          as.character(lapply(low, function(x) x)),
          param_sens
        ),
        env = df_env
      ),
      high_dots = lazyeval::as.lazy_dots(
        stats::setNames(
          as.character(lapply(high, function(x) x)),
          param_sens
        ),
        env = df_env
      )
    )
  }
  
  if ("psa" %in% names(param_defs)) {
    
    if (all(is.na(param_defs$psa))) {
      stop("No non-missing values in column 'psa'.")
    }
    
    param_psa <- param_defs$parameter[! is.na(param_defs$psa)]
    distrib_psa <- stats::na.omit(param_defs$psa)
    
    psa <- 
      do.call(define_psa,    
              lapply(
                seq_along(param_psa),
                function(i) {
                  substitute(
                    lhs ~ rhs,
                    list(
                      lhs = parse(text = param_psa[i])[[1]],
                      rhs = parse(text = distrib_psa[i])[[1]]))
                }
              )
      )
    
  }
  
  ## if there are multinomials specified, 
  ##  fix them up in the parameters and redefine
  param_defs_new <- 
    modify_param_defs_for_multinomials(param_defs, psa)
  
  parameters <- define_parameters_(
    lazyeval::as.lazy_dots(
      stats::setNames(
        as.character(lapply(param_defs_new$value, function(x) x)),
        param_defs_new$parameter
      ),
      env = df_env
    )
  )
  
  
  list(
    params = parameters,
    dsa_params = dsa,
    psa_params = psa
  )
}


#' Create Model Options From a Tabular Input
#'
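#' The table has two columns, `option` and `value`; allowed options
#' are `cost`, `effect`, `init`, `method`, `base`, `cycles`, `n` and
#' `num_cores`. An illustrative sketch (the cost and effect names
#' are hypothetical):
#' 
#' ```
#' data.frame(
#'   option = c("cost", "effect", "cycles", "method"),
#'   value = c("cost_total", "qaly", "20", "life-table"),
#'   stringsAsFactors = FALSE
#' )
#' ```
#'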
#' @param opt An option data frame.
#'
#' @return A list of model options.
#'   
#' @keywords internal
create_options_from_tabular <- function(opt) {
  
  allowed_opt <- c("cost", "effect", "init",
                   "method", "base", "cycles", "n",
                   "num_cores")
  if(! inherits(opt, "data.frame"))
    stop("'opt' must be a data frame.")
  
  if (any(ukn_opt <- ! opt$option %in% allowed_opt)) {
    stop(sprintf(
      "Unknown options: %s.",
      paste(opt$option[ukn_opt], collapse = ", ")
    ))
  }
  
  if (any(duplicated(opt$option))) {
    stop("Some option names are duplicated: ",
         paste(unique(opt$option[duplicated(opt$option)]),
               collapse = ", "))
  }
  
  res <- list()
  for (n in opt$option) {
    res <- c(res, list(opt$value[opt$option == n]))
  }
  names(res) <- opt$option
  
  if (! is.null(res$init)) {
    if(substr(res$init, 1, 2) == "c(" & 
       substr(res$init, nchar(res$init), nchar(res$init)) == ")"){
      res$init <- substr(res$init, 3, nchar(res$init))
      res$init <- substr(res$init, 1, nchar(res$init) - 1)
      warning("initial values enclosed in c(); removing")
    }
    res$init <- as_numeric_safe(
      strsplit(res$init, ",")[[1]]
    )
  }
  
  if (! is.null(res$cycles)) {
    res$cycles <- as_integer_safe(res$cycles)
  }
  
  if (! is.null(res$n)) {
    res$n <- as_integer_safe(res$n)
  }
  
  if (! is.null(res$cost)) {
    res$cost <- parse(text = res$cost)[[1]]
  }
  
  if (! is.null(res$effect)) {
    res$effect <- parse(text = res$effect)[[1]]
  }
  if (! is.null(res$num_cores)){
    res$num_cores <- parse(text = res$num_cores)[[1]]
  }
  if (options()$heemod.verbose) message(paste(
    names(res), unlist(res), sep = " = ", collapse = "\n"
  ))
  res
}

#' Create a `heemod` Model From Tabular Files Info
#' 
#' Calls [create_states_from_tabular()] and
#' [create_matrix_from_tabular()].
#' 
#' @param state_info A state tabular file (file path or 
#'   parsed file).
#' @param tm_info A transition matrix tabular file (file 
#'   path or parsed file).
#' @param df_env An environment containing external data.
#' 
#' @return A `heemod` model as returned by 
#'   [define_strategy()].
#'   
#' @keywords internal
create_model_from_tabular <- function(state_info,
                                      tm_info,
                                      df_env = globalenv()) {
  if (length(tm_info) == 0) {
    stop("A transition object must be defined.")
  }
  
  if (! inherits(state_info, "data.frame")) {
    stop("'state_info' must be a data frame.")
  }
  
  if (!is.null(tm_info) &&
      ! inherits(tm_info, c("data.frame", "part_surv"))) {
    stop("'tm_info' must be either a data frame ",
         "defining a transition matrix or a part_surv object ",
         "defining a partitioned survival model.")
  }
  
  if (options()$heemod.verbose) message("**** Defining state list...")
  states <- create_states_from_tabular(state_info,
                                       df_env = df_env)
  if (options()$heemod.verbose) message("**** Defining TM...")
  
  if (inherits(tm_info, "data.frame")) {
    TM <- create_matrix_from_tabular(
      tm_info, get_state_names(states),
      df_env = df_env)
  }
  
  if (inherits(tm_info, "part_surv")) {
    TM <- tm_info
  }
  
  define_strategy_(transition = TM, states = states,
                   starting_values = check_starting_values(
                     define_starting_values(),
                     get_state_value_names(states)))
}

#' Load Data From a Folder Into an Environment
#' 
#' Reads files containing data frames (in tabular format) 
#' from a directory, and loads them in an environment to be 
#' available during an analysis.
#' 
#' The files must be in .csv, .xls, or .xlsx format. A file 
#' my_df.csv (or my_df.xls, or my_df.xlsx) will be loaded as
#' a data frame my_df.
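#' 
#' For instance (file and column names are hypothetical), a file
#' `mortality.csv` placed in the data directory is loaded as a data
#' frame `mortality`, which the `value` column of the parameter file
#' can then reference:
#' 
#' ```
#' # value entry of a parameter in the parameter file
#' look_up(mortality, age = age, value = "rate")
#' ```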
#' 
#' @param df_dir A directory containing the files.
#' @param df_envir An environment.
#' 
#' @return The environment with the data frames.
#'   
#' @keywords internal
create_df_from_tabular <- function(df_dir, df_envir) {
  if(! file.exists(df_dir)) {
    stop(paste(df_dir, "does not exist."))
  }
  
  all_files <- list.files(df_dir, full.names = TRUE)
  
  ## drop Excel's temporary files (names starting with "~") that
  ## appear on Windows when a workbook is open
  if(.Platform$OS.type == "windows") {
    part_files <- list.files(df_dir, full.names = FALSE)
    accidental_files <- grep("^~.*\\.xlsx?", part_files)
    if(length(accidental_files) > 0) {
      all_files <- all_files[- accidental_files]
    }
  }
  
  obj_names <- all_files %>% 
    basename %>% 
    tools::file_path_sans_ext()
  
  if(any(duplicate_names <- duplicated(obj_names)))
    stop(paste("Duplicate data names:",
               paste(obj_names[duplicate_names], collapse = ", ")))
  
  ## do the assignments
  for(i in seq(along = all_files)){
    this_val <- read_file(all_files[i])
    
    ## check for accidental commas in numbers
    comma_cols <- which(
      sapply(this_val, function(x) any(grepl(",", x)))
    )
    
    for (this_comma_col in comma_cols) {
      try_numeric <- try(as.numeric(gsub(",", "", this_val[, this_comma_col])),
                         silent = TRUE)
      if (!inherits(try_numeric, "try-error")) {
        this_val[, this_comma_col] <- try_numeric
        message(paste("converting column",
                      names(this_val)[this_comma_col],
                      "from file",
                      basename(all_files[i]),
                      "to numeric despite it having commas"))
      }
    }
    assign(obj_names[i], this_val, envir = df_envir)
  }
  df_envir
}

#' Specify Inputs for Multiple Models From a Single File
#' 
#' Parse a `data.frame` containing specifications for 
#' multiple models into a list of inputs required for each 
#' model.
#' 
#' Each combination of values of the columns specified by 
#' `group_vars` should either be unique in the file (in 
#' which case it will be replicated for all values of 
#' `split_on`), or be repeated once for each unique value 
#' of `split_on`.
#' 
#' `split_on` is usually the model name.
#' 
#' `group_vars` can be the state names, or the from and to 
#' columns of a matrix definition.
#'
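#' An illustrative sketch (names and values are hypothetical): a row
#' given only once is replicated for every split, while rows given
#' once per split are kept as is.
#' 
#' ```
#' multi_spec <- data.frame(
#'   .model = c("standard", "new", "standard"),
#'   .state = c("Healthy", "Healthy", "Dead"),
#'   cost = c("cost_std", "cost_new", "0"),
#'   stringsAsFactors = FALSE
#' )
#' # parse_multi_spec(multi_spec, split_on = ".model", group_vars = ".state")
#' # returns a list with one data frame per strategy; the "Dead" row,
#' # present only once, is duplicated for both strategies.
#' ```
#' 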
#' @param multi_spec A `data.frame`.
#' @param split_on `character` of length 1, with the 
#'   name of the variable in `multi_spec` to be split 
#'   on.
#' @param group_vars `character`, one or more variable 
#'   names from `multi_spec` that identify a line of 
#'   information.
#'   
#' @return A list of data frames, one for each value of 
#'   `split_on`.
#'   
#' @keywords internal
parse_multi_spec <- function(multi_spec,
                             split_on = ".model",
                             group_vars) {
  
  if(! inherits(multi_spec, "data.frame"))
    stop("'multi_spec' must be a data frame.")
  
  if(length(split_on) != 1) stop("'split_on' must have a length of exactly 1.")
  if(any(names(multi_spec) == "")) stop("'multi_spec' can't have empty names.")
  
  if(! all(c(split_on, group_vars) %in% names(multi_spec)))
    stop("'split_on' and 'group_vars' must be column names of the input 'multi_spec'.")
  
  remaining <- setdiff(names(multi_spec), c(split_on, group_vars))
  
  ## any line that exists for one split but not others
  ##   needs to be duplicated for all splits
  ## throw an error if a parameter exists for more than one,
  ##   but not all
  unique_splits <- unique(multi_spec[, split_on])
  num_splits <- length(unique_splits)
  
  occurences <- multi_spec %>% 
    dplyr::group_by(!!!syms(as.list(group_vars))) %>% 
    dplyr::summarize(count = n())
  
  orig_order <- unique(multi_spec[, group_vars, drop = FALSE])
  
  wrong_number_times <- 
    occurences$count > 1 & occurences$count < num_splits
  
  if(any(wrong_number_times)) {
    stop("'group_var' combinations must be specified either once for all splits or once for each split.")
  }
  
  just_once <- multi_spec %>% 
    dplyr::group_by(!!!syms(as.list(group_vars))) %>% 
    dplyr::filter( n() == 1) %>%
    dplyr::select(-dplyr::one_of(split_on))
  
  just_once <- data.frame(
    temp = rep(unique_splits, nrow(just_once)),
    just_once[rep(seq_len(nrow(just_once)), each = num_splits), ],
    stringsAsFactors = FALSE
  )
  
  names(just_once)[1] <- split_on
  
  more_than_once <- multi_spec %>% 
    dplyr::group_by(!!! syms(as.list(group_vars))) %>%
    dplyr::filter(n() > 1)
  
  multi_spec <- 
    dplyr::bind_rows(just_once, as.data.frame(more_than_once))
  rownames(multi_spec) <- NULL
  list_spec <- split(multi_spec, multi_spec[, split_on])
  ## sort by order of appearance of split variables in multi_spec
  ##   (making first model mentioned the base model, similar to earlier system)
  
  orig_split_order <- unique(multi_spec[, split_on])
  list_spec <- list_spec[orig_split_order]
  ## want to make sure everything comes out sorted
  ##   by order of appearance of values of group_vars
  lapply(list_spec, function(x) {
    match_to_orig <-
      sapply(group_vars, function(this_var){
        match(x[, this_var], orig_order[, this_var])})
    x[do.call("order", as.data.frame(match_to_orig)),]
  })
}

#' Read a Demographic Table
#' 
#' This function mainly checks that the columns of the
#' demographic table correspond to model parameters.
#' 
#' An optional `.weights` column can exist in the file.
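#' 
#' For example (parameter names are hypothetical), with model
#' parameters `age` and `sex` the table could be:
#' 
#' ```
#' data.frame(
#'   age = c(40, 60, 40, 60),
#'   sex = c(0, 0, 1, 1),
#'   .weights = c(10, 20, 15, 25)
#' )
#' ```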
#' 
#' @param newdata A data frame.
#' @param params Parameters of a model, to check that all
#'   the columns in the demographic table (other than the
#'   weight column) are in the model.
#'   
#' @return  A data frame.
#'   
#' @keywords internal
create_demographic_table <- function(newdata,
                                     params) {
  weight_col <- which(names(newdata) == ".weights")
  if (length(weight_col)) {
    var_names <- names(newdata)[- weight_col]
  } else {
    var_names <- names(newdata)
  }
  valid_names <- var_names %in% names(params)
  
  if(! all(valid_names)) {
    invalid_names <- paste(var_names[!valid_names], collapse = ", ")
    stop(sprintf(
      "The following columns in the demographic table are not parameters of the model: %s.",
      invalid_names
    ))
  }
  
  newdata
}

#' Read the Accepted File Formats for Tabular Input
#'
#' Columns starting with '.comment' are ignored.
#' 
#' @param file_name File name.
#' 
#' @return A `data.frame`.
#'   
#' @keywords internal
read_file <- function(file_name) {
  
  have_xls <- is_xls(file_name)
  have_xlsx <- is_xlsx(file_name)
  have_csv <- is_csv(file_name)
  
  if(! have_csv & ! have_xls & ! have_xlsx) {
    stop("file names must be for csv, xls, or xlsx")
  }
  
  if(have_csv) {
    tab <- utils::read.csv(
      file_name, header = TRUE,
      stringsAsFactors = FALSE,
      strip.white = TRUE,
      na.strings = c(".", "NA", "")
    ) 
  } else if(have_xls | have_xlsx) {
    if (! requireNamespace("readxl", quietly = TRUE)) {
      stop("readxl packaged needed to read Excel files")
      
    } else {
      tab <- as.data.frame(
        readxl::read_excel(file_name)
      )
    }
  }
  
  ## get rid of "comment" columns, if any
  tab <- tab[! grepl("^\\.comment", names(tab))]
  
  ## get rid of NA rows that may have come from blank rows
  ## in file
  tab <- filter_blanks(tab)
  tab
}

#' Remove Blank Rows From Table
#' 
#' Remove rows where all values are `NA`.
#' 
#' Some rows can be left blank in the input table for 
#' readability; this function ensures those rows are 
#' removed.
#' 
#' @param x A `data.frame`.
#'   
#' @return A `data.frame` without blank rows.
#'   
#' @keywords internal
filter_blanks <- function(x) {
  x[! apply(is.na(x), 1, all), , drop = FALSE]
}

#' Check File Type
#' 
#' @param x A file name.
#' @return Whether the file is (respectively)
#'  csv, xlsx, or xls.
#' @rdname file-checkers
#'   
#' @keywords internal
is_csv <- function(x) {
  tolower(tools::file_ext(x)) == "csv"
}

#' @rdname file-checkers
is_xlsx <- function(x) {
  tolower(tools::file_ext(x)) == "xlsx"
}

#' @rdname file-checkers
is_xls <- function(x) {
  tolower(tools::file_ext(x)) == "xls"
}

#' Save Model Outputs
#' 
#' @param outputs Result from
#'   [run_model_tabular()].
#' @param output_dir Subdirectory in which to write output.
#' @param overwrite Should the outputs be overwritten?
#'   
#' @return `NULL`. Used for its side effect of creating
#'   files in the output directory.
#'   
#' @keywords internal
save_outputs <- function(outputs, output_dir, overwrite) {
  if (options()$heemod.verbose) message("* Saving outputs...")
  if(is.null(output_dir)) {
    warning("Output directory not defined in the specification file - the outputs will not be saved.")
    return(NULL)
    
  } else if(dir.exists(output_dir) & ! overwrite) {
    warning("Output directory exists and overwrite is FALSE - the outputs will not be saved.")
    return(NULL)
    
  } else {
    delete_success <- 0
    if(dir.exists(output_dir)) {
      delete_success <- unlink(output_dir, recursive = TRUE)
    }
    
    if (delete_success == 1) {
      warning("Failed to delete output directory.")
      return(NULL)
    }
    
    dir.create(output_dir)
  }
  
  ## some csv files
  if (options()$heemod.verbose) message("** Writing tabular outputs to files ...")
  
  if (! is.null(outputs$demographics)) {
    utils::write.csv(
      summary(outputs$demographics)$scaled_results,
      file = file.path(output_dir, "icer_by_group.csv"),
      row.names = FALSE
    )
  }
  
  utils::write.csv(
    get_counts(outputs$model_runs),
    file = file.path(output_dir, "state_counts.csv"),
    row.names = FALSE
  )
  
  utils::write.csv(
    get_values(outputs$model_runs),
    file = file.path(output_dir, "cycle_values.csv"),
    row.names = FALSE
  )
  
  if(!is.null(outputs$dsa)){
    utils::write.csv(
      summary(outputs$dsa)$res_comp,
      file = file.path(output_dir, "dsa.csv"),
      row.names = FALSE
    )
  }
  
  if (!is.null(outputs$psa)) {
    utils::write.csv(
      outputs$psa$psa,
      file = file.path(output_dir, "psa_values.csv"),
      row.names = FALSE
    )
  }
  
  
  
  ## plots about individual models
  
  if (options()$heemod.verbose) message("** Generating plots for individual models...")
  this_plot <- plot(outputs$model_runs)
  this_file <- "state_count_plot"
  save_graph(this_plot, output_dir, this_file)
  
  if(!is.null(outputs$dsa)){
    this_plot <- plot(outputs$dsa)
    this_file <- "dsa"
    save_graph(this_plot, output_dir, this_file)
  }
  
  ## plots about differences between models
  if (options()$heemod.verbose) message("** Generating plots with model differences...")
  
  if(!is.null(outputs$dsa)){
    this_plot <- plot(outputs$dsa, type = "difference")
    this_file <- "dsa_diff"
    save_graph(this_plot, output_dir, this_file)
  }
  if(!is.null(outputs$psa)){
    this_plot <- plot(outputs$psa)
    this_file <- paste("psa")
    save_graph(this_plot, output_dir, this_file)
    ## acceptability curve
    if (options()$heemod.verbose) message("** Generating acceptability curve...")
    this_plot <- plot(outputs$psa, type = "ac")
    save_graph(this_plot,
               output_dir, "acceptability")
  }
  invisible(NULL)
}

save_graph <- function(plot, path, file_name) {
  full_file <- file.path(path, file_name)
  grDevices::png(filename = paste(full_file, "png", sep = "."))
  print(plot)
  grDevices::dev.off()
  filename <- paste(full_file, "pdf", sep = ".")
  if(capabilities("cairo")){
    grDevices::cairo_pdf(filename = filename)
  } else {
   grDevices::pdf(file = filename)
  }
  print(plot)
  grDevices::dev.off()
}


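## Decide whether the tabular transition input defines a plain
## transition matrix (columns .model, from, to, prob) or a
## partitioned survival specification (type, treatment, data and
## fit locations, and the relevant column names).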
transition_type <- function(tm_info){
  which_defines <- NULL
  if (all(names(tm_info)[1:4] == c(".model", "from", "to", "prob"))) {
    which_defines <- "matrix"
  } else {
    if (all(sort(names(tm_info)[1:10]) == 
            sort(c("type", "treatment", "data_directory",
                   "data_file", "fit_directory", "fit_name",
                   "fit_file", "time_col", "treatment_col",
                   "censor_col"))))
      which_defines <- "part_surv"
  }
  if (is.null(which_defines))
    stop("the data frame tm_info must define ",
         "either a transition matrix or a partitioned survival object")
  which_defines
}

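## Parameters whose name is written as a sum (e.g. "a + b + c") are
## treated as multinomials: the counts (n) attached to the
## corresponding PSA multinomial distributions are turned into
## proportions, and one parameter row per component is substituted
## back into the parameter definitions.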
modify_param_defs_for_multinomials <- function(param_defs, psa) {
  param_names <- param_defs[, 1]
  multinom_pos <- grep("\\+", param_names)
  
  multinom_vars <- lapply(
    multinom_pos,
    function(i) {
      strsplit(param_names[[i]], "+", fixed = TRUE)[[1]]
    })
  
  multinom_vars <- lapply(
    multinom_vars,
    function(x) {
      gsub("[[:space:]]", "", x)
    })
  
  duplicates <- sapply(
    param_defs$parameter,
    function(x){
      x %in% unlist(multinom_vars)
    })
  
  if (any(duplicates)) {
    stop("Some variables appear as individual parameters ",
         "and in a multinomial: ",
         paste(param_defs$parameter[duplicates], collapse = ", ")
    )
  }
  
  multinom_counts <- lapply(
    multinom_vars,
    function(x) {
      sapply(
        psa$list_qdist[x],
        function(y) {
          eval(expression(n),
               environment(y))
        })
    })
  
  multinom_probs <- lapply(
    multinom_counts,
    function(x) {
      x / sum(x)
    })
  
  replacements <- lapply(
    multinom_probs,
    function(x) {
      zz <- data.frame(
        parameter = names(x),
        value = x,
        stringsAsFactors = FALSE)
      rownames(zz) <- NULL
      zz
    })
  
  ## now we need to put these where they initially appeared
  ##   in the parameter file
  
  param_defs <- param_defs[, c("parameter", "value")]
  for (i in rev(seq_along(multinom_pos))) {
    this_pos <- multinom_pos[i]
    start_index <- seq_len(this_pos - 1)
    end_index <- 
      if (this_pos == nrow(param_defs)) {
        numeric(0)
      } else {
        (this_pos + 1):nrow(param_defs)
      }
    
    param_defs <- dplyr::bind_rows(
      param_defs[start_index,],
      replacements[[i]],
      param_defs[end_index,])
  }
  
  param_defs
}


## make sure that survival fit specifications are 
##   properly formatted and have reasonable data
check_survival_specs <- 
  function(surv_specs){
    ## if there are troubles with absolute file paths, 
    ##   might want to add an "absolute" column to surv_specs
    ##   to be explicit (if, possibly, a little redundant)
    if(!is.data.frame(surv_specs) || nrow(surv_specs) == 0)
      stop("surv_specs must be a data frame with at least one row")
    if(!("event_code" %in% names(surv_specs))){
      warning("event_code not defined in surv_specs; setting to 1 for all rows")
      surv_specs$event_code <- 1
    }
    if(!("censor_code" %in% names(surv_specs))){
      warning("censor_code not defined in surv_specs; setting to 0 for all rows")
      surv_specs$censor_code <- 0
    }
      
    surv_spec_col_names <- c("type", "treatment", "data_directory", "data_file",
                             "fit_directory", "fit_name", "fit_file",
                             "time_col", "treatment_col", "censor_col",
                             "event_code", "censor_code"
                             )
    if(! identical(sort(names(surv_specs)), sort(surv_spec_col_names))){
      extra_names <- setdiff(names(surv_specs), surv_spec_col_names)
      missing_names <- setdiff(surv_spec_col_names, names(surv_specs))
      names_message <- paste("surv_ref must have column names:\n",
                             paste(surv_spec_col_names, collapse = ", "), 
                             sep = "")
      if(length(extra_names) > 0)
        names_message <- paste(names_message, "\n", "extra names: ",
                               paste(extra_names, collapse = ", "),
                               sep = "")
      if(length(missing_names) > 0)
        names_message <- paste(names_message, "\n", "missing names: ",
                               paste(missing_names, collapse = ", "),
                               sep = "")
      stop(names_message)
    }
    
    if(any(is.na(surv_specs) | surv_specs == ""))
      stop("all elements of surv_specs must be filled in")
    
    ## sort survival specs by treatment, then PFS and OS
    ## our checks will make sure that we have the right entries,
    ##   and that they are in the right order
    surv_specs <- 
      surv_specs %>% dplyr::arrange(.data$treatment, desc(.data$type))
    
    os_ind <- grep("os", surv_specs$type, ignore.case = TRUE)
    pfs_ind <- grep("pfs", surv_specs$type, ignore.case = TRUE)
    all_even_os <- identical(as.numeric(os_ind), 
                             seq(from = 2, 
                                 to = nrow(surv_specs),
                                 by = 2))
    all_odd_pfs <- identical(as.numeric(pfs_ind), 
                             seq(from = 1, 
                                 to = nrow(surv_specs),
                                 by = 2))
    same_treatments <- 
      identical(surv_specs$treatment[os_ind],
                surv_specs$treatment[pfs_ind])
    if(!all_even_os | !all_odd_pfs | !same_treatments)
      stop("each treatment must have exactly one PFS and one OS entry")
    
    if(any(dups <- duplicated(surv_specs[, c("type", "treatment")]))){
      print(surv_specs[dups,])
      stop("survival fit specification can only have one row for fitting ",
           "OS or PFS for a given treatment\n")
    }
    if(any(dups <- duplicated(surv_specs[, c("fit_directory", "fit_file", 
                                             "time_col", "censor_col")]))){
      print(surv_specs[dups,])
      stop("can only specify a given data file in a given directory with ",
           "the same time column and censoring column for one fit")
    }
    surv_specs
  }

#' Load a Set of Survival Fits
#'
#' @param location Base directory.
#' @param survival_specs Information about the fits.
#' @param use_envir An environment.
#'
#' @return A list with two elements: the fits for the data
#'   files passed in (combined with `rbind`), and `env`, an
#'   environment containing the models so they can be
#'   referenced to get probabilities.
#' @export
#'
load_surv_models <- function(location, survival_specs, use_envir){
  fit_files <- file.path(location,
                         survival_specs$fit_directory,
                         survival_specs$fit_file)
  for (this_fit_file in fit_files) {
    load(paste(this_fit_file, ".RData", sep = ""))
  }
  surv_models <- mget(survival_specs$fit_name)
  list(do.call("rbind", surv_models),
       env = use_envir)
}