R/pls-modeling.R

## Perform calibration sampling based on spectral PCA
## or random split -------------------------------------------------------------
split_data_q <- function(
    spec_chem,
    split_method,
    evaluation_method = "test_set",
    ratio_val,
    ken_sto_pc = 2,
    print = TRUE,
    invert = FALSE, env = parent.frame()) {
  MIR <- model <- type <- PC1 <- PC2 <- NULL

  # Evaluate the invert argument in the parent function (fit_pls)
  invert <- eval(invert, envir = parent.frame())
  # Evaluate the validation argument in the parent function (fit_pls)
  evaluation_method <- eval(evaluation_method, envir = parent.frame())

  # Slice based on sample_id if spectral data is in tibble class
  if (tibble::is_tibble(spec_chem)) {
    spec_chem <- spec_chem %>%
      dplyr::group_by(!!rlang::sym("sample_id")) %>%
      dplyr::slice(1L)
  }

  if (evaluation_method == "test_set") {
    # pc = 0.99 before !!!
    ken_sto_pc <- eval(ken_sto_pc, envir = parent.frame())

    if (invert == FALSE) {
      ## Select calibration set by the Kennard-Stone algorithm
      # Check if tibble; if yes, bind the list of data.tables with the
      # preprocessed spectra into one data.table
      if (tibble::is_tibble(spec_chem)) {
        spc_pre <- as.matrix(data.table::rbindlist(spec_chem$spc_pre))
        # k = number of samples to select
        # ken_sto_pc = if provided, the number of principal components
        # (see ?kenStone)
        sel <- prospectr::kenStone(
          X = spc_pre,
          k = round((1 - ratio_val) * nrow(spec_chem)),
          pc = substitute(ken_sto_pc)
        )
      } else {
        sel <- prospectr::kenStone(
          X = spec_chem$MIR,
          k = round((1 - ratio_val) * nrow(spec_chem)),
          pc = substitute(ken_sto_pc)
        )
      }
      # Split MIR data into calibration and validation sets using
      # the results of Kennard-Stone calibration sampling
      # Select by row index of calibration samples
      val_set <- spec_chem[-sel$model, ]
      cal_set <- spec_chem[sel$model, ]

      # Optionally split up calibration (train) and validation (test) sets
      # randomly; use function from modelr package
      # !!! Important note: The option to split up the calibration and
      # validation sets randomly is still experimental, and a modification for
      # the PC space projection is not yet implemented for graphical output.
      # p_pc ggplot2 output needs to be updated for split_method = "random"
      if (split_method == "random") {
        # Split data into test and training sets using the modelr package
        df_split <- modelr::crossv_mc(spec_chem, n = 1, test = ratio_val)
        # Select train of df_split and convert back into tibble,
        # assign to calibration set
        cal_set <- tibble::as_tibble(df_split[1, ][["train"]][[1]])
        # Select test of df_split and convert back into tibble,
        # assign to validation set
        val_set <- tibble::as_tibble(df_split[1, ][["test"]][[1]])
      }
      sel_df_cal <- data.frame(sel$pc[sel$model, 1:2])
      sel_df_val <- data.frame(sel$pc[-sel$model, 1:2])
    } else {
      if (tibble::is_tibble(spec_chem)) {
        spc_pre <- as.matrix(data.table::rbindlist(spec_chem$spc_pre))
        sel <- prospectr::kenStone(
          X = spc_pre,
          k = round(ratio_val * nrow(spec_chem)),
          pc = substitute(ken_sto_pc)
        )
      } else {
        ## Select validation set by the Kennard-Stone algorithm
        sel <- prospectr::kenStone(
          X = spec_chem$MIR,
          k = round(ratio_val * nrow(spec_chem)),
          pc = substitute(ken_sto_pc)
        )
      }
      sel_df_cal <- data.frame(sel$pc[-sel$model, 1:2])
      sel_df_val <- data.frame(sel$pc[sel$model, 1:2])
      # Split MIR data into calibration and validation sets using
      # the results of Kennard-Stone calibration sampling
      # Select by row index of validation samples
      val_set <- spec_chem[sel$model, ]
      cal_set <- spec_chem[-sel$model, ]
    }

    # Add a type column to the calibration and validation PC score data frames
    # for plotting
    sel_df_cal$type <- as.factor(
      rep("calibration", nrow(sel_df_cal))
    )
    sel_df_val$type <- as.factor(
      rep("validation", nrow(sel_df_val))
    )
    sel_df <- rbind(sel_df_cal, sel_df_val)
    # Compute ratio needed to make the figure square
    ratio <- with(sel_df, diff(range(PC1)) / diff(range(PC2)))
    # Save graph showing the selected calibration and validation samples
    # for the first two principal components (pc)
    p_pc <- ggplot2::ggplot(data = sel_df) +
      ggplot2::geom_point(
        ggplot2::aes(x = PC1, y = PC2, shape = type),
        size = 4
      ) +
      ggplot2::coord_fixed(ratio = 1) +
      ggplot2::scale_shape_manual(values = c(1, 19)) +
      ggplot2::scale_colour_manual(values = c("black", "red")) +
      ggplot2::theme_bw() +
      ggplot2::theme(legend.title = ggplot2::element_blank())

    # Return outputs as a list
    list_out <- list(
      calibration = cal_set,
      validation = val_set,
      p_pc = p_pc
    )
    list_out
    # Check number of observations (rows) for calibration set
    # nrow(cal_set)
  } else {
    cal_set <- spec_chem
    list(
      calibration = cal_set,
      validation = NULL,
      p_pc = NULL
    )
  }
}
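
## Hedged usage sketch (added for illustration; not part of the original
## source): shows the Kennard-Stone selection that split_data_q() performs
## internally, on a small simulated spectral matrix. All object names and sizes
## are made up; the block is wrapped in `if (FALSE)` so it never runs on
## package load.
if (FALSE) {
  set.seed(1)
  # 50 "spectra" with 100 spectral variables each
  spc_toy <- matrix(rnorm(50 * 100), nrow = 50)
  # Select 2/3 of the samples for calibration from the first 2 PCA components,
  # as split_data_q() does with ratio_val = 1/3 and ken_sto_pc = 2
  sel <- prospectr::kenStone(
    X = spc_toy, k = round((1 - 1 / 3) * nrow(spc_toy)), pc = 2
  )
  cal_idx <- sel$model # row indices of the selected calibration samples
  val_idx <- setdiff(seq_len(nrow(spc_toy)), sel$model) # remaining samples
  head(sel$pc) # PCA scores used for the p_pc plot above
}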

# trainControl generating helper function
control_train_q <- function(x, response, resampling_seed,
                            env = parent.frame()) {
  calibration <- NULL
  # From the list of calibration and validation samples, set up a
  # cross-validation scheme: randomly break the calibration data into 10 folds
  # that are kept identical across the different modeling approaches to allow
  # comparison (note that k equals the total number of samples for
  # leave-one-out cross-validation).
  # The response argument is captured unevaluated (non-standard evaluation:
  # substitute() sees the code used to compute the value, not the value
  # itself); see chapter 13.1 "Capturing expressions" in Advanced R
  # (Hadley Wickham), p. 270.
  response <- eval(response, x$calibration, env)
  # Set seed for creating resampling indices
  set.seed(eval(resampling_seed, env))
  idx <- caret::createFolds(y = response, k = 10, returnTrain = TRUE)
  # inject the index in the trainControl object
  caret::trainControl(
    method = "cv", index = idx,
    savePredictions = TRUE, selectionFunction = "oneSE"
  )
}
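
## Hedged sketch (illustration only, toy response vector assumed): how the fold
## indices created above are injected into caret::trainControl(); wrapped in
## `if (FALSE)` so it is not executed on package load.
if (FALSE) {
  set.seed(123)
  y_toy <- rnorm(60) # hypothetical numeric response
  idx <- caret::createFolds(y = y_toy, k = 10, returnTrain = TRUE)
  length(idx) # 10 sets of training-row indices, one per fold
  tr_ctrl <- caret::trainControl(
    method = "cv", index = idx,
    savePredictions = TRUE, selectionFunction = "oneSE"
  )
}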

## Adapt model tuning to leave-one-out cross-validation ========================

## trainControl generating helper function
control_train_loocv_q <- function(x, response, env = parent.frame()) {
  calibration <- NULL
  # r: response
  response <- eval(response, x$calibration, env)
  # Set up leave-one-out cross-validation
  caret::trainControl(
    method = "LOOCV", savePredictions = TRUE,
    selectionFunction = "oneSE"
  )
}

## Adapt model tuning to repeated k-fold cross-validation ======================

## trainControl generating helper function
control_train_rcv_q <- function(x, response, resampling_seed,
                                env = parent.frame()) {
  calibration <- NULL
  # r: response
  response <- eval(response, x$calibration, env)
  # Set seed for creating resampling indices
  set.seed(eval(resampling_seed, env))
  # Set up 5 times repeated 10-fold cross-validation
  idx <- caret::createMultiFolds(y = response, k = 10, times = 5) # update ***
  # Inject the index in the trainControl object
  caret::trainControl(
    method = "repeatedcv", index = idx,
    savePredictions = TRUE, selectionFunction = "oneSE"
  )
}
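
## Hedged sketch (toy data): caret::createMultiFolds() returns k * times index
## sets, so every sample obtains `times` hold-out predictions; these are later
## averaged per sample_id in transform_cvpredictions() below.
if (FALSE) {
  set.seed(123)
  y_toy <- rnorm(60)
  idx_rep <- caret::createMultiFolds(y = y_toy, k = 10, times = 5)
  length(idx_rep) # 50 resampling index sets (10 folds x 5 repeats)
}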

## Fitting models without parameter tuning =====================================
# 5.9; https://topepo.github.io/caret/model-training-and-tuning.html

## trainControl generating helper function
control_train_none_q <- function(x, response, resampling_seed,
                                 env = parent.frame()) {
  calibration <- NULL
  # r: response
  response <- eval(response, x$calibration, env)
  # Set seed for creating resampling indices
  set.seed(eval(resampling_seed, env))
  # Set trainControl argument to "none" so that caret::train will only fit
  # one model to the entire training set;
  # use a fixed number of PLS components instead
  idx <- caret::createFolds(y = response, k = 10, returnTrain = TRUE) # update ***
  # inject the index in the trainControl object
  caret::trainControl(
    method = "none", index = idx, savePredictions = TRUE,
    selectionFunction = "oneSE"
  )
}

## Standard evaluation version of the trainControl helper function
control_train <- function(x, response, resampling_seed = 123,
                          env = parent.frame()) {
  control_train_q(x, substitute(response), resampling_seed, env)
}
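
## Hedged sketch of the non-standard evaluation used by the *_q helpers (the
## tibble and column names are invented for illustration): substitute()
## captures the unevaluated response symbol, which eval() then looks up inside
## the calibration data.
if (FALSE) {
  x_toy <- list(
    calibration = tibble::tibble(sample_id = 1:6, total_C = rnorm(6))
  )
  capture_response <- function(response) substitute(response)
  expr <- capture_response(total_C) # the symbol `total_C`, not its value
  eval(expr, x_toy$calibration) # numeric vector from the calibration tibble
}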

# Fit a PLS regression model using the caret package ---------------------------

## Train a PLS regression model
train_pls_q <- function(
    x,
    evaluation_method = "test_resampling",
    response, tr_control, env = parent.frame(),
    pls_ncomp_max = 20, ncomp_fixed = 5,
    center, scale, tuning_method = "resampling") {
  # Fit a partial least squares regression (PLS) model;
  # center and scale MIR (you can try without)
  calibration <- MIR <- NULL
  r <- eval(response, x$calibration, env)
  # ? Is it really necessary to evaluate this in the parent frame?
  pls_ncomp_max <- eval(pls_ncomp_max, envir = parent.frame())
  # Evaluate fixed number of PLS regression components
  # from ncomp_fixed object in parent frame (fit_pls function)
  ncomp_fixed <- eval(ncomp_fixed, envir = parent.frame())

  # Test whether the spectral object has the class "tibble"
  if (!tibble::is_tibble(x$calibration)) {
    stop("spec_chem needs to be of class tibble")
  }

  spc_pre <- data.table::rbindlist(x$calibration$spc_pre)
  if (scale == TRUE && center == TRUE) {
    if (tuning_method == "resampling") {
      # Fit model with parameter tuning
      pls_model <- caret::train(
        x = spc_pre, y = r,
        method = "pls",
        tuneLength = pls_ncomp_max,
        trControl = tr_control,
        preProcess = c("center", "scale")
      )
    } else if (tuning_method == "none") {
      # Fit model without parameter tuning
      pls_model <- caret::train(
        x = spc_pre, y = r,
        method = "pls",
        trControl = tr_control,
        preProcess = c("center", "scale"),
        tuneGrid = data.frame(ncomp = ncomp_fixed)
      )
    }
  } else {
    # No centering and scaling!
    pls_model <- caret::train(
      x = spc_pre, y = r,
      method = "pls",
      tuneLength = pls_ncomp_max,
      trControl = tr_control
    )
  }
  pls_model
}
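
## Hedged sketch (toy data, not from the original source): the core
## caret::train() call that train_pls_q() assembles, with centering/scaling
## and a tuning grid of 1:tuneLength PLS components.
if (FALSE) {
  set.seed(123)
  spc_toy <- as.data.frame(matrix(rnorm(60 * 100), nrow = 60))
  y_toy <- rnorm(60)
  tr_ctrl <- caret::trainControl(
    method = "cv", number = 10,
    savePredictions = TRUE, selectionFunction = "oneSE"
  )
  pls_toy <- caret::train(
    x = spc_toy, y = y_toy,
    method = "pls",
    tuneLength = 10, # evaluate 1 to 10 PLS components
    trControl = tr_ctrl,
    preProcess = c("center", "scale")
  )
  pls_toy$bestTune # ncomp selected with the "oneSE" rule
}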


## Standard evaluation version for training a PLS regression model
train_pls <- function(
    x, response, evaluation_method = "resampling",
    tr_control, center = TRUE, scale = TRUE,
    env = parent.frame()) {
  train_pls_q(
    x = x, evaluation_method = substitute(evaluation_method),
    response = substitute(response), tr_control = tr_control,
    center = center, scale = scale, env = env
  )
}

# Fit a random forest model using the caret package ----------------------------

## Train a random forest model
train_rf_q <- function(
    x,
    validation = TRUE, evaluation_method = "resampling",
    response, tr_control, ntree_max = 500, env = parent.frame()) {
  # Fit a random forest (rf) model;
  # center and scale MIR (you can try without)
  calibration <- MIR <- NULL
  response <- eval(response, x$calibration, env)
  ntree_max <- eval(ntree_max, envir = parent.frame())
  if (tibble::is_tibble(x$calibration)) {
    spc_pre <- data.table::rbindlist(x$calibration$spc_pre)
    rf_model <- caret::train(
      x = spc_pre, y = response,
      method = "rf",
      ntree = ntree_max,
      trControl = tr_control,
      preProcess = c("center", "scale")
    )
  } else {
    rf_model <- caret::train(
      x = x$calibration$MIR, y = response,
      method = "rf",
      ntree = ntree_max,
      trControl = tr_control,
      preProcess = c("center", "scale")
    )
  }
  rf_model
}
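
## Hedged sketch (toy data): for method = "rf", caret tunes `mtry` over its
## default grid, while `ntree` is passed through unchanged to randomForest();
## this is why train_rf_q() only exposes ntree_max.
if (FALSE) {
  set.seed(123)
  spc_toy <- as.data.frame(matrix(rnorm(60 * 20), nrow = 60))
  y_toy <- rnorm(60)
  rf_toy <- caret::train(
    x = spc_toy, y = y_toy,
    method = "rf",
    ntree = 500,
    trControl = caret::trainControl(method = "cv", number = 5),
    preProcess = c("center", "scale")
  )
  rf_toy$bestTune # tuned mtry value
}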


# Evaluate model performance (validation and cross-validation) -----------------

## Helper function to transform repeated k-fold cross-validation hold-out
## predictions
transform_cvpredictions <- function(cal_index, predobs_cv) {
  predobs_cv <- dplyr::full_join(cal_index, predobs_cv, by = "rowIndex") %>%
    dplyr::group_by(!!rlang::sym("sample_id")) %>%
    # Average observed and predicted values
    dplyr::mutate(
      "obs" = mean(!!rlang::sym("obs")),
      "pred_sd" = sd(!!rlang::sym("pred"))
    ) %>%
    # Add 95% confidence interval for mean hold-out predictions from
    # repeated k-fold cross-validation
    dplyr::mutate_at(
      .vars = dplyr::vars(!!rlang::sym("pred")),
      .funs = dplyr::funs("pred_sem_ci" = sem_ci)
    ) %>%
    # Add mean hold-out predictions from repeated k-fold cross-validation
    dplyr::mutate("pred" = mean(!!rlang::sym("pred"))) %>%
    # Slice data set to only have one row per sample_id
    dplyr::slice(1L)
}
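
## Hedged sketch (toy data; the sem_ci() helper used above is defined elsewhere
## in the package and may differ from the ad-hoc 95% CI half-width computed
## here): averaging repeated-CV hold-out predictions per sample_id, written
## with summarise() instead of the mutate()/slice() idiom above.
if (FALSE) {
  predobs_toy <- data.frame(
    sample_id = rep(c("a", "b"), each = 5),
    obs = rep(c(1.2, 3.4), each = 5),
    pred = c(rnorm(5, 1.2, 0.1), rnorm(5, 3.4, 0.1))
  )
  predobs_toy %>%
    dplyr::group_by(sample_id) %>%
    dplyr::summarise(
      obs = mean(obs),
      pred_sd = sd(pred),
      pred_sem_ci = stats::qt(0.975, length(pred) - 1) *
        sd(pred) / sqrt(length(pred)),
      pred = mean(pred)
    )
}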

## Evaluate model performance (PLS or random forest)
evaluate_model_q <- function(
    x, model, response,
    evaluation_method, tuning_method, resampling_method,
    print = TRUE, env = parent.frame()) {
  # Set global variables to NULL to avoid R CMD check notes
  MIR <- object <- dataType <- obs <- pred_sem_ci <- pred <- NULL
  ncomp <- finalModel <- rmse <- r2 <- rpd <- n <- NULL
  calibration <- NULL
  # Collect fitted object into a list
  list_models <- list("final_model" = model)
  # Evaluate validation argument in parent.frame !!!
  evaluation_method <- eval(evaluation_method, envir = parent.frame())
  # Evaluate tuning_method argument in parent.frame
  tuning_method <- eval(tuning_method, envir = parent.frame())
  # Evaluate resampling_method argument in parent.frame
  resampling_method <- eval(resampling_method, envir = parent.frame())
  # Extract best tuning parameters and associated cv predictions
  if (evaluation_method == "test_set") {
    # Calculate training (calibration) and test (validation) data
    # predictions based on pls model with calibration data
    r <- eval(response, x$validation, env)

    if (!tibble::is_tibble(x$validation)) {
      stop("Spectra and reference data need to be provided as tibble
        (class `tbl_df`, `tbl`, `data.frame`")
    }
    spc_pre <- data.table::rbindlist(x$validation$spc_pre)
    predobs <- caret::extractPrediction(list_models,
      testX = spc_pre, testY = r
    ) # update ***
    # Append sample_id column to predobs data.frame
    # extract sample_id from validation set
    predobs$sample_id <- c(
      x$calibration$sample_id, x$validation$sample_id
    )

    # Create new data frame column <object>
    predobs$object <- predobs$model

    # Replace levels "Training" and "Test" in dataType column
    # by "Calibration" and "Validation" (rename levels of factor)
    predobs$dataType <- plyr::revalue(
      predobs$dataType,
      c("Test" = "Validation", "Training" = "Calibration")
    )
    # Change the order of rows in the data frame
    # Calibration as first level (show Calibration in ggplot graph
    # on left panel)
    predobs$dataType <- factor(predobs$dataType,
      levels = c("Calibration", "Validation")
    )
    # Calculate model performance indexes by model and dataType
    # uses package plyr and function summary.df of SPECmisc.R
    stats <- plyr::ddply(
      predobs, c("model", "dataType"),
      function(x) summary_df(x, "obs", "pred")
    )
    # Check whether the method = "none" argument was selected in train();
    # this is the case when the ncomp_fixed argument in fit_pls() is
    # evaluated.
    # Checking for the existence of a <pred> element in the train() output
    # list can be unreliable and doesn't work in all cases, e.g. when using
    # is.element('pred', x) or is.null(x$pred); problems can occur if a list
    # element itself contains NULL; see
    # http://stackoverflow.com/questions/7719741/how-to-test-if-list-element-exists
  } else if (evaluation_method == "resampling" && tuning_method == "resampling") {
    # Good discussion on which cross-validation results are returned from caret
    # Extract best tuning parameters and associated cv predictions
    # http://stats.stackexchange.com/questions/219154/how-does-cross-validation-in-train-caret-precisely-work
    # Alternative solution for one model: conformal::GetCVPreds(model) function
    # see https://github.com/cran/conformal/blob/master/R/misc.R
    predobs_cv <- plyr::ldply(list_models,
      function(x) dplyr::inner_join(x$pred, x$bestTune, by = "ncomp"),
      .id = "model"
    )
    # Extract auto-prediction
    predobs <- caret::extractPrediction(list_models)
    # !!! new ---
    # Replace levels "Training" dataType column
    # by "Calibration" (rename levels of factor)
    predobs$dataType <- plyr::revalue(
      predobs$dataType,
      c("Training" = "Calibration")
    )
    # Append sample_id column to predobs data.frame
    # extract sample_id from calibration set
    predobs$sample_id <- x$calibration$sample_id
    # Create rowIndex for calibration tibble
    x$calibration$rowIndex <- 1:nrow(x$calibration)
    # Generate sample_id column for rowIndex of pred list element of
    # train object; select only rowIndex and sample_id of calibration tibble
    vars_indexing <- c("rowIndex", "sample_id")
    cal_index <- dplyr::select(
      x$calibration,
      !!!rlang::syms(vars_indexing)
    )

    # Transform cross-validation hold-out predictions --------------------------
    predobs_cv <- transform_cvpredictions(
      cal_index = cal_index,
      predobs_cv = predobs_cv
    )

    predobs_cv$object <- predobs_cv$model
    predobs_cv$model <- factor(predobs_cv$model)
    predobs_cv$dataType <- factor("Cross-validation")
    vars_keep <- c(
      "obs", "pred", "pred_sd", "pred_sem_ci",
      "model", "dataType", "object"
    )
    predobs_cv <- dplyr::select(
      predobs_cv,
      # !!! sample_id newly added
      !!!rlang::syms(vars_keep)
    )
    # Add column pred_sd to the predobs data frame (assign NA) so that
    # column pred_sd would be retained after dplyr::bind_rows()
    predobs$pred_sd <- NA
    # Note: dplyr::bind_rows(predobs, predobs_cv) doesn't work because some
    # columns are coerced to numeric; the resulting data frame has only two rows
    # pb_2018-11-09: Model evaluation graph should only show cross-validation
    #   results when arguments `evaluation_method` == "resampling" &&
    #   `tuning_method` == "resampling"
    predobs <- predobs_cv
    # predobs <- dplyr::bind_rows(predobs, predobs_cv)
    # Calculate model performance indexes by model and dataType
    # uses package plyr and function summary.df of SPECmisc.R
    stats <- suppressWarnings(plyr::ddply(
      predobs, c("model", "dataType"),
      function(x) summary_df(x, "obs", "pred")
    ))
  }
  # Add number of components to stats; from finalModel list item
  # from train() function output (function from caret package)
  stats$ncomp <- rep(model$finalModel$ncomp, nrow(stats))
  # !!! Experimental: return stats
  # return(stats)
  # Add range of observed values for validation and calibration
  # get range from predicted vs. observed data frame
  # stored in object predobs
  obs_cal <- subset(predobs, dataType == "Calibration")$obs
  # Get name of predicted variable; see p. 261 of book
  # "Advanced R" (Hadley Wickham)
  response_name <- deparse(response)

  if (evaluation_method == "test_set") {
    # Assign validation set to separate data frame
    obs_val <- subset(predobs, dataType == "Validation")$obs
    # before: deparse(substitute(variable))
    df_range <- data.frame(
      response = rep(response_name, 2),
      dataType = c("Calibration", "Validation"),
      min_obs = c(range(obs_cal)[1], range(obs_val)[1]),
      median_obs = c(median(obs_cal), median(obs_val)),
      max_obs = c(range(obs_cal)[2], range(obs_val)[2]),
      mean_obs = c(mean(obs_cal), mean(obs_val)),
      CV = c(
        sd(obs_cal) / mean(obs_cal) * 100,
        sd(obs_val) / mean(obs_val) * 100
      )
    )
  } else if (evaluation_method == "resampling" && tuning_method == "resampling") {
    # Assign cross-validation set to separate data frame
    obs_val <- subset(predobs, dataType == "Cross-validation")$obs
    df_range <- data.frame(
      response = rep(response_name, 2),
      dataType = factor(c("Calibration", "Cross-validation")),
      min_obs = c(range(obs_cal)[1], range(obs_val)[1]),
      median_obs = c(median(obs_cal), median(obs_val)),
      max_obs = c(range(obs_cal)[2], range(obs_val)[2]),
      mean_obs = c(mean(obs_cal), mean(obs_val)),
      CV = c(
        sd(obs_cal) / mean(obs_cal) * 100,
        sd(obs_val) / mean(obs_val) * 100
      )
    )
  }

  # Join stats with range data frame (df_range)
  stats <- suppressWarnings(dplyr::inner_join(stats, df_range, by = "dataType"))
  annotation <- plyr::mutate(stats,
    rmse = as.character(as.expression(paste0(
      "RMSE == ",
      round(rmse, 2)
    ))),
    r2 = as.character(as.expression(paste0(
      "italic(R)^2 == ",
      round(r2, 2)
    ))),
    rpd = as.character(as.expression(paste(
      "RPD == ",
      round(rpd, 2)
    ))),
    n = as.character(as.expression(paste0("italic(n) == ", n))),
    ncomp = as.character(as.expression(paste0(
      "ncomp == ",
      ncomp
    )))
  )

  # Plot predicted vs. observed values and model indexes
  # update label, xlim, and ylim ***
  # Add label number of samples to facet_grid using a
  # labeling function
  # ! Update labeller API:
  # https://github.com/hadley/ggplot2/commit/ef33dc7
  # http://sahirbhatnagar.com/facet_wrap_labels

  # Prepare lookup character vector
  make_label <- function(x, evaluation_method = "test_set",
                         resampling_method = "kfold_cv") {
    dataType <- n <- NULL

    if (evaluation_method == "test_set") {
      c(
        `Calibration` = paste0(
          "Calibration", "~(",
          x[x$dataType == "Calibration", ]$n, ")"
        ),
        `Validation` = paste0(
          "Validation", "~(",
          x[x$dataType == "Validation", ]$n, ")"
        )
      )
    } else if (evaluation_method == "resampling" &&
      resampling_method == "rep_kfold_cv") {
      c(
        `Calibration` = paste0(
          "Calibration", "~(",
          x[x$dataType == "Calibration", ]$n, ")"
        ),
        `Cross-validation` = paste0(
          "5%*%repeated~10*-fold~CV", "~(",
          x[x$dataType == "Cross-validation", ]$n, ")"
        )
      )
    } else {
      c(
        `Calibration` = paste0(
          "Calibration", "~(",
          x[x$dataType == "Calibration", ]$n, ")"
        ),
        `Cross-validation` = paste0(
          "10*-fold~CV", "~(",
          x[x$dataType == "Cross-validation", ]$n, ")"
        )
      )
    }
  }
  if (evaluation_method == "test_set") {
    label_validation <- make_label(
      x = annotation,
      evaluation_method = "test_set"
    )
  } else if (evaluation_method == "resampling" &&
    resampling_method == "rep_kfold_cv") {
    label_validation <- make_label(
      x = annotation,
      evaluation_method = "resampling", resampling_method = "rep_kfold_cv"
    )
  } else {
    label_validation <- make_label(
      x = annotation,
      evaluation_method = "resampling"
    )
  }

  # Rename labels on the fly with a lookup character vector
  to_string <- ggplot2::as_labeller(
    x = label_validation, ggplot2::label_parsed
  )

  # Create model evaluation plot -----------------------------------------------

  ## ggplot graph for model comparison
  ## (arranged later in panels)
  x_label <- paste0(
    "Observed ",
    as.character(response_name)
  )
  y_label <- paste0(
    "Predicted ",
    as.character(response_name)
  )

  ## Create x and y minimum and maximum for the plotting range; use either
  ## observed or predicted data, depending on which contains the more extreme
  ## values
  xy_min <- if (min(predobs$obs) < min(predobs$pred)) {
    predobs$obs
  } else {
    predobs$pred
  }
  xy_max <- if (max(predobs$obs) > max(predobs$pred)) {
    predobs$obs
  } else {
    predobs$pred
  }
  xy_range <- ifelse(diff(range(xy_min)) > diff(range(xy_max)),
    diff(range(xy_min)), diff(range(xy_max))
  )

  if (model$method == "pls") {
    p_model <- ggplot2::ggplot(data = predobs) +
      ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
        shape = 1, size = 2, alpha = 1 / 2, data = predobs
      ) +
      ggplot2::geom_text(
        data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = ncomp), size = 5,
        hjust = 1.15, vjust = -4.5, parse = TRUE
      ) + # !!! additional label
      ggplot2::geom_text(
        data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = r2), size = 5,
        hjust = 1.15, vjust = -3, parse = TRUE
      ) +
      ggplot2::geom_text(
        data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = rmse), size = 5,
        hjust = 1.12, vjust = -2.5, parse = TRUE
      ) +
      ggplot2::geom_text(
        data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = rpd), size = 5,
        hjust = 1.15, vjust = -1.25, parse = TRUE
      ) +
      ggplot2::facet_grid(~dataType,
        labeller = ggplot2::as_labeller(to_string)
      ) +
      ggplot2::geom_abline(col = "red") +
      ggplot2::labs(x = x_label, y = y_label) +
      ggplot2::xlim(c(
        min(xy_min) - 0.05 * xy_range,
        max(xy_max) + 0.05 * xy_range
      )) +
      ggplot2::ylim(c(
        min(xy_min) - 0.05 * xy_range,
        max(xy_max) + 0.05 * xy_range
      )) +
      ggplot2::coord_fixed() +
      ggplot2::theme_bw() +
      ggplot2::theme(strip.background = ggplot2::element_rect(fill = "white"))

    if (evaluation_method == "resampling") {
      p_model <- p_model +
        ggplot2::geom_errorbar(
          ggplot2::aes(
            x = obs, ymin = pred - pred_sem_ci,
            ymax = pred + pred_sem_ci
          ),
          width = 0, data = predobs, inherit.aes = FALSE
        )
    }
  } else {
    p_model <- ggplot2::ggplot(data = predobs) +
      ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
        shape = 1, size = 2, alpha = 1 / 2
      ) + # without ncomp label
      ggplot2::geom_text(
        data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = r2), size = 5,
        hjust = 1.15, vjust = -3, parse = TRUE
      ) +
      ggplot2::geom_text(
        data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = rmse), size = 5,
        hjust = 1.12, vjust = -2.5, parse = TRUE
      ) +
      ggplot2::geom_text(
        data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = rpd), size = 5,
        hjust = 1.15, vjust = -1.25, parse = TRUE
      ) +
      ggplot2::facet_grid(~dataType,
        labeller = ggplot2::as_labeller(to_string)
      ) +
      ggplot2::geom_abline(col = "red") +
      ggplot2::labs(x = x_label, y = y_label) +
      ggplot2::xlim(c(
        min(xy_min) - 0.05 * xy_range,
        max(xy_max) + 0.05 * xy_range
      )) +
      ggplot2::ylim(c(min(xy_min) -
        0.05 * xy_range, max(xy_max) + 0.05 * xy_range)) +
      ggplot2::coord_fixed() +
      ggplot2::theme_bw()
  }

  if (print == TRUE) {
    print(p_model)
  }

  list(stats = stats, p_model = p_model, predobs = predobs)
}
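
## Hedged sketch (illustration only): the kind of summary statistics that
## summary_df() (defined in SPECmisc.R, see the comments above) contributes to
## `stats`; the exact definitions used by summary_df() may differ.
if (FALSE) {
  obs <- c(1.1, 2.0, 2.9, 4.2, 5.1)
  pred <- c(1.0, 2.2, 3.1, 4.0, 5.3)
  rmse <- sqrt(mean((obs - pred)^2))
  r2 <- cor(obs, pred)^2
  rpd <- sd(obs) / rmse # ratio of performance to deviation
  c(n = length(obs), rmse = rmse, r2 = r2, rpd = rpd)
}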


## PLS regression modeling in one function ======================

#' @name fit_pls
#' @title Calibration sampling, model tuning, and PLS regression
#' @description Perform calibration sampling and use selected
#' calibration set for model tuning
#' @param spec_chem Tibble that contains spectra, metadata and chemical
#' reference values as list-columns. The tibble supplied to \code{spec_chem}
#' can be generated by the \code{join_chem_spc()} function.
#' @param response Response variable as symbol or name
#' (without quotes, no character string). The provided response symbol needs to be
#' a column name in the \code{spec_chem} tibble.
#' @param variable Deprecated and replaced by \code{response}.
#' @param center Logical whether to perform mean centering of each spectrum column
#' (e.g. wavenumber or wavelength) after common spectrum preprocessing. Default is
#' \code{center = TRUE}
#' @param scale Logical whether to perform standard deviation scaling
#' of each spectrum column (e.g. wavenumber or wavelength) after common
#' spectrum preprocessing. Default is \code{scale = TRUE}
#' @param evaluation_method Character string stating evaluation method.
#' Either \code{"test_set"} (default) or \code{"resampling"}. \code{"test_set"}
#' will split the data into a calibration (training) and validation (test) set,
#' and evaluate the final model by predicting on the validation set.
#' If \code{"resampling"}, the finally selected model will be evaluated based
#' on the cross-validation hold-out predictions.
#' @param validation Deprecated and replaced by \code{evaluation_method}.
#' Default is \code{TRUE}.
#' @param split_method Method for splitting the data into an independent test
#' set. Default is \code{"ken_stone"}, which selects samples for calibration
#' based on the Kennard-Stone sampling algorithm applied to the preprocessed
#' spectra. The proportion of validation samples relative to the total number
#' of samples can be specified in the argument \code{ratio_val}.
#' \code{split_method = "random"} will create a single random split.
#' @param ratio_val Ratio of validation (test) samples to
#' total number of samples (calibration (training) and validation (test)).
#' @param ken_sto_pc Number of principal components used for calculating the
#' Mahalanobis distance on PCA scores in the Kennard-Stone algorithm.
#' Default is \code{ken_sto_pc = 2}, which will use the first two PCA
#' components.
#' @param pc Deprecated; renamed argument is \code{ken_sto_pc}.
#' @param invert Logical; if \code{TRUE}, the Kennard-Stone algorithm selects
#' the validation set instead of the calibration set.
#' @param tuning_method Character specifying the tuning method. The tuning
#' method affects how caret selects a final set of tuning values from a list
#' of candidate values. Possible values are \code{"resampling"}, which will use
#' a specified resampling method such as repeated k-fold cross-validation (see
#' argument \code{resampling_method}) and the resulting performance profile
#' based on the hold-out predictions to decide on the final tuning values
#' that lead to optimal model performance, and \code{"none"}, which will force
#' caret to compute a final model for a predefined candidate number of PLS
#' components. In this case, the value supplied by \code{ncomp_fixed} is used
#' to set model complexity at a fixed number of components.
#' @param resampling_method Character specifying resampling method. Currently,
#' \code{"kfold_cv"} (default, performs 10-fold cross-validation),
#' \code{"rep_kfold_cv"} (performs 5-times repeated 10-fold cross-validation),
#' \code{"loocv"} (performs leave-one-out cross-validation), and \code{"none"}
#' (if \code{resampling_method = "none"}) are supported.
#' @param resampling_seed Random seed (integer) that will be used for generating
#' resampling indices, which will be supplied to \code{caret::trainControl}.
#' This makes sure that modeling results are constant when re-fitting.
#' Default is \code{resampling_seed = 123}.
#' @param cv Deprecated. Use \code{resampling_method} instead.
#' @param pls_ncomp_max Maximum number of PLS components that are evaluated
#' by \code{caret::train}. Caret will aggregate a performance profile using
#' resampling for an integer sequence from 1 to \code{pls_ncomp_max}.
#' @param ncomp_fixed Integer giving a fixed number of PLS components. Only
#' used when \code{tuning_method = "none"} and
#' \code{resampling_method = "none"}.
#' @param print Logical whether model evaluation graphs should be printed.
#' @param env Environment where the function is evaluated. Default is
#' \code{parent.frame()}.
#' @export
# Note: check non standard evaluation, argument passing...
fit_pls <- function(
    spec_chem,
    response, variable = NULL, # variable deprecated, will not work
    center = TRUE, scale = TRUE, # center and scale all predictors (wavenumbers)
    evaluation_method = "test_set", validation = TRUE, # validation depreciated
    split_method = "ken_stone",
    ratio_val = 1 / 3, # is only used if evaluation_method = "test_set"
    ken_sto_pc = 2, pc, # only if split_method = "ken_stone"; number of principal
    # components used for calculating the Mahalanobis distance on PCA scores. pc is deprecated.
    invert = TRUE, # only if split_method = "ken_stone"
    tuning_method = "resampling",
    resampling_method = "kfold_cv", cv = NULL, # cv depreciated
    resampling_seed = 123, # Seed for creating resampling indices
    pls_ncomp_max = 20, # Maximal number of PLS components used by model tuning
    ncomp_fixed = 5, # only fit and evaluate one model, if tuning_method = "none"
    print = TRUE, # print model summary and evaluation graphs
    env = parent.frame()) {
  calibration <- 0

  # Warning messages and reassignment for deprecated arguments -----------------
  # Deprecate argument variable, use more specific term for the response
  # to be predicted by spectral modeling
  if (!is.null(variable)) {
    stop("argument variable has been replaced by response for simplerspec_0.1.0")
  }
  # 20170602: revise argument name and values of validation;
  # Replace validation = TRUE or FALSE with
  # new argument evaluation_method = "test_set" or "resampling"
  if (!missing(validation)) {
    warning("argument validation is depreciated; please use evaluation_method instead.",
      call. = FALSE
    )
    evaluation_method <- validation
  }
  # Deprecate argument pc, use more consistent and verbose argument ken_sto_pc
  if (!missing(pc)) {
    warning("argument pc is deprecated; please use ken_sto_pc instead.",
      call. = FALSE
    )
    ken_sto_pc <- pc
  }
  # Deprecate argument cv, use more consistent and verbose argument
  # resampling_method
  if (!missing(cv)) {
    warning("argument cv is deprecated; please use resampling_method instead.",
      call. = FALSE
    )
    resampling_method <- cv
  }
  # Change values for resampling_method argument
  if (resampling_method == "LOOCV") {
    warning("value 'LOOCV' (leave one out cross-validation) for argument resampling_method is depreciated; please use value 'loocv' instead.")
    resampling_method <- "loocv"
  }
  if (resampling_method == "repeatedcv") {
    warning("value 'repeatedcv' (repeated k-fold cross-validation) for argument resampling_method is depreciated; please use value 'rep_kfold_cv' instead.")
    resampling_method <- "rep_kfold_cv"
  }

  # Perform calibration sampling -----------------------------------------------
  list_sampled <- split_data_q(
    spec_chem, split_method,
    ratio_val = ratio_val,
    ken_sto_pc = substitute(ken_sto_pc),
    evaluation_method = substitute(evaluation_method),
    invert = substitute(invert)
  )

  # Check on method for cross-validation to be used in caret model tuning ------
  if (resampling_method == "loocv") {
    # leave-one-out cross-validation
    tr_control <- control_train_loocv_q(
      x = list_sampled,
      response = substitute(response), env = env
    )
  } else if (resampling_method == "rep_kfold_cv") {
    # repeated k-fold cross-validation
    tr_control <- control_train_rcv_q(
      x = list_sampled,
      response = substitute(response),
      resampling_seed = substitute(resampling_seed), env = env
    )
  } else if (resampling_method == "none") {
    # no resampling; calls caret::train(..., method = "none");
    # fixed number of PLS components; tuning_method argument has also
    # to be set to "none"
    tr_control <- control_train_none_q(
      x = list_sampled,
      response = substitute(response),
      resampling_seed = substitute(resampling_seed), env = env
    )
  } else if (resampling_method == "kfold_cv") {
    # k-fold cross validation
    tr_control <- control_train_q(
      x = list_sampled,
      response = substitute(response),
      resampling_seed = substitute(resampling_seed), env = env
    )
  }
  # Fit a pls calibration model; pls object is output from caret::train() ------
  if (tuning_method == "resampling") {
    pls <- train_pls_q(
      x = list_sampled,
      evaluation_method = "test_set",
      response = substitute(response), tr_control = tr_control,
      center = center, scale = scale,
      pls_ncomp_max = substitute(pls_ncomp_max), env
    )
  } else if (tuning_method == "none") {
    pls <- train_pls_q(
      x = list_sampled,
      evaluation_method = "test_set",
      response = substitute(response), tr_control = tr_control,
      center = center, scale = scale, tuning_method = "none",
      ncomp_fixed = substitute(ncomp_fixed), env
    )
  }
  # Evaluate model accuracy (predicted vs. observed) ---------------------------
  stats <- evaluate_model_q(
    x = list_sampled, model = pls,
    response = substitute(response),
    evaluation_method = substitute(evaluation_method),
    tuning_method = substitute(tuning_method),
    resampling_method = substitute(resampling_method),
    env = parent.frame()
  )
  list(
    data = list_sampled, p_pc = list_sampled$p_pc,
    model = pls, stats = stats$stats, p_model = stats$p_model,
    predobs = stats$predobs
  )
}
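
## Hedged usage sketch (the tibble `spc_tbl` and the response column `total_C`
## are assumptions for illustration; see the roxygen documentation above for
## the argument contract):
if (FALSE) {
  pls_total_C <- fit_pls(
    spec_chem = spc_tbl, # tibble with spc_pre list-column and reference data
    response = total_C, # response given as a bare column name
    evaluation_method = "test_set",
    split_method = "ken_stone",
    ratio_val = 1 / 3,
    resampling_method = "rep_kfold_cv",
    pls_ncomp_max = 10
  )
  pls_total_C$stats # performance summary
  pls_total_C$p_model # predicted vs. observed plot
}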

## Old function name of `fit_pls`: `pls_ken_stone`

#' @rdname fit_pls
#' @export
pls_ken_stone <- fit_pls

## Random forest modeling in one function =======================

#' @title Calibration sampling, random forest model tuning, and evaluation
#' @description Perform calibration sampling and use selected
#' calibration set for model tuning
#' @param spec_chem Tibble that contains spectra, metadata and chemical
#' reference values as list-columns. The tibble supplied to \code{spec_chem}
#' can be generated by the \code{join_chem_spc()} function.
#' @param response Response variable as symbol or name
#' (without quotes, no character string). The provided response symbol needs to be
#' a column name in the \code{spec_chem} tibble.
#' @param variable Deprecated and replaced by \code{response}.
#' @param evaluation_method Character string stating evaluation method.
#' Either \code{"test_set"} (default) or \code{"resampling"}. \code{"test_set"}
#' will split the data into a calibration (training) and validation (test) set,
#' and evaluate the final model by predicting on the validation set.
#' If \code{"resampling"}, the finally selected model will be evaluated based
#' on the cross-validation hold-out predictions.
#' @param validation Deprecated and replaced by \code{evaluation_method}.
#' Default is \code{NULL}.
#' @param split_method Method for splitting the data into an independent test
#' set. Default is \code{"ken_stone"}, which selects samples for calibration
#' based on the Kennard-Stone sampling algorithm applied to the preprocessed
#' spectra. The proportion of validation samples relative to the total number
#' of samples can be specified in the argument \code{ratio_val}.
#' \code{split_method = "random"} will create a single random split.
#' @param ratio_val Ratio of validation (test) samples to
#' total number of samples (calibration (training) and validation (test)).
#' @param ken_sto_pc Number of principal components used for calculating the
#' Mahalanobis distance on PCA scores in the Kennard-Stone algorithm.
#' Default is \code{ken_sto_pc = 2}, which will use the first two PCA
#' components.
#' @param pc Deprecated; renamed argument is \code{ken_sto_pc}.
#' @param invert Logical; if \code{TRUE}, the Kennard-Stone algorithm selects
#' the validation set instead of the calibration set.
#' @param tuning_method Character specifying the tuning method. The tuning
#' method affects how caret selects a final set of tuning values from a list
#' of candidate values. Possible values are \code{"resampling"}, which will use
#' a specified resampling method such as repeated k-fold cross-validation (see
#' argument \code{resampling_method}) and the resulting performance profile
#' based on the hold-out predictions to decide on the final tuning values
#' that lead to optimal model performance, and \code{"none"}, which will force
#' caret to compute a final model for a predefined candidate number of PLS
#' components. In this case, the value supplied by \code{ncomp_fixed} is used
#' to set model complexity at a fixed number of components.
#' @param resampling_seed Random seed (integer) that will be used for generating
#' resampling indices, which will be supplied to \code{caret::trainControl}.
#' This makes sure that modeling results are constant when re-fitting.
#' Default is \code{resampling_seed = 123}.
#' @param cv Deprecated. Use \code{resampling_method} instead.
#' @param ntree_max Maximum number of trees grown in the random forest; passed
#' as the \code{ntree} argument to \code{caret::train} and on to the underlying
#' \code{randomForest} fit.
#' @param print Logical whether model evaluation graphs should be printed.
#' @param env Environment where the function is evaluated. Default is
#' \code{parent.frame()}.
#' @export
# Note: check non standard evaluation, argument passing...
fit_rf <- function(
    spec_chem,
    response, variable = NULL, # variable deprecated; reassigned to response below
    evaluation_method = "test_set", validation = NULL, # validation deprecated
    split_method = "ken_stone",
    ratio_val,
    ken_sto_pc = 2, pc = NULL,
    invert = TRUE, # only if split_method = "ken_stone"
    tuning_method = "resampling",
    resampling_seed = 123,
    cv = NULL, # cv deprecated
    ntree_max = 500,
    print = TRUE,
    env = parent.frame()) {
  calibration <- NULL

  # Warning messages and reassignment for deprecated arguments -----------------
  # Replace validation = TRUE or FALSE with
  # new argument evaluation_method = "test_set" or "resampling"
  if (!missing(validation)) {
    warning("argument validation is deprecated; please use evaluation_method instead.",
      call. = FALSE
    )
    evaluation_method <- validation
  }
  if (!missing(variable)) {
    warning("argument variable is deprecated; please use response instead.",
      call. = FALSE
    )
    response <- variable
  }
  # Deprecate argument pc, use more consistent and verbose argument ken_sto_pc
  if (!missing(pc)) {
    warning("argument pc is deprecated; please use ken_sto_pc instead.",
      call. = FALSE
    )
    ken_sto_pc <- pc
  }

  # Calibration sampling -------------------------------------------------------
  list_sampled <- split_data_q(
    spec_chem, split_method,
    ratio_val = ratio_val,
    ken_sto_pc = substitute(ken_sto_pc),
    evaluation_method = substitute(evaluation_method),
    invert = substitute(invert)
  )
  # Control parameters for caret::train ----------------------------------------
  tr_control <- control_train_q(
    x = list_sampled,
    response = substitute(response),
    resampling_seed = substitute(resampling_seed), env = env
  )
  # Train random forest model (model tuning) -----------------------------------
  rf <- train_rf_q(
    x = list_sampled,
    evaluation_method = "test_set",
    response = substitute(response), tr_control = tr_control, env,
    ntree_max = substitute(ntree_max)
  )
  # Evaluate finally chosen random forest model --------------------------------
  stats <- evaluate_model_q(
    x = list_sampled, model = rf,
    response = substitute(response),
    evaluation_method = substitute(evaluation_method),
    tuning_method = substitute(tuning_method),
    resampling_method = "kfold_cv", # fit_rf resamples via 10-fold CV (control_train_q)
    env = parent.frame()
  )
  # Return list with results ---------------------------------------------------
  list(
    data = list_sampled, p_pc = list_sampled$p_pc,
    rf_model = rf, stats = stats$stats, p_model = stats$p_model
  )
}
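
## Hedged usage sketch (object and column names are assumptions, analogous to
## the fit_pls() example above):
if (FALSE) {
  rf_total_C <- fit_rf(
    spec_chem = spc_tbl,
    response = total_C,
    evaluation_method = "test_set",
    split_method = "ken_stone",
    ratio_val = 1 / 3,
    ntree_max = 500
  )
  rf_total_C$stats
  rf_total_C$p_model
}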