# R/growthcurve_features.R
#
# Defines functions: add_fit_plots, predict_save_tgrid, define_tgrid, define_features_drop
# (documentation-site header converted to a comment so this file parses as valid R)

#' Add a column to a data frame containing trajectory / velocity plots
#'
#' @param all_preds A data.frame nested by subject and containing fits from \code{\link{predict_all}}
#' @param ggplot Set to \code{TRUE} to use ggplot for plotting. NOTE: the ggplot backend is not
#' yet implemented; when \code{TRUE} a warning is issued and rbokeh plots are produced instead.
#' @return The input data with an added list-column 'panel' of rbokeh figures (one per row),
#' suitable for display with trelliscopejs.
#' @export
add_fit_plots <- function(all_preds, ggplot = FALSE) {
  ## BUG FIX: 'ggplot' was previously accepted but silently ignored; warn so callers are not misled.
  if (ggplot) {
    warning("ggplot = TRUE is not implemented yet; falling back to rbokeh plots.", call. = FALSE)
  }
  ## rbokeh / trelliscopejs emit messages and warnings while building panels; suppress them.
  suppressWarnings(suppressMessages({
    fits_all_plot <- all_preds %>%
      plyr::mutate(panel = trelliscopejs::map_plot(fit,
        ## Per subject: observed points (black), fitted values at observed times (small black
        ## glyphs), fitted curve over the prediction grid (grey line), holdout predictions (red).
        ~ rbokeh::figure(xlab = "age", ylab = "growth") %>%
          rbokeh::ly_points(x, y, data = .x[["fit"]], color = "black") %>%
          rbokeh::ly_points(x, preds, data = .x[["fit"]], color = "black", glyph = 19, size = 4) %>%
          rbokeh::ly_lines(x, preds, data = .x[["fitgrid"]], color = "grey") %>%
          rbokeh::ly_points(x, preds, data = .x[["holdout"]], color = "red")
      ))
  }))
  return(fits_all_plot)
}


#' Predict and save the entire growth curve (grid of time points).
#'
#' A thin wrapper for \code{define_features_drop}, \code{define_tgrid} and prediction functions.
#' Evaluates predictions from the existing SuperLearner fit using an entire grid of time points.
#' Optionally, when \code{file.name} is not \code{NULL}, the resulting dataset of predictions is saved as a csv file.
#' @param SLfit SuperLearner fit returned by \code{\link{fit_holdoutSL}} or \code{\link{fit_cvSL}}.
#' @param data Input data used for model training.
#' @param ID A character string name of the column that contains the unique subject identifiers.
#' @param t_name A character string name of the column with integer-valued measurement time-points (in days, weeks, months, etc).
#' @param y A character string name of the column that represent the response variable in the model.
#' @param tmin Min t value of the grid
#' @param tmax Max t value of the grid
#' @param incr Increment value for the grid of \code{t}'s
#' @param file.name A file name (without the .csv extension) in which the prediction dataset will be saved. Leave as NULL if no saving is necessary.
#' @param file.path A directory path in which the predictions file should be saved.
#' @return A \code{data.table} with subject IDs, grid of equally spaced time-points and the corresponding growth curve predictions.
#' The relevant subject summaries and predictors used for training will be also included in the output data.
#' In addition, the output dataset also contains an indicator column 'train_point', set to \code{TRUE} for all (ID,time-points) that also appear
#' in the input data. That is 'train_point' indicates if the row might have been previously used for model training.
#' @export
predict_save_tgrid <- function(SLfit,
                               data,
                               ID,
                               t_name,
                               y,
                               tmin = 1,
                               tmax = 500,
                               incr = 2,
                               file.name = NULL,
                               file.path = getOption('growthcurveSL.file.path')) {

  ## Re-define curve features / summaries treating the full input data as a training set (no holdouts).
  inputDT_all_train <- define_features_drop(data, ID = ID, t_name = t_name, y = y, train_set = TRUE)
  ## BUG FIX: define_tgrid() does not accept tmin/tmax/incr arguments (they would error as
  ## "unused arguments"); build the common time grid here and pass it via 'tgrid' instead.
  inputDT_tgrid <- define_tgrid(inputDT_all_train, ID = ID, t_name = t_name, y = y,
                                tgrid = seq(tmin, tmax, by = incr))
  preds_tgrid <- predict_growth(SLfit, newdata = inputDT_tgrid, grid = TRUE, add_subject_data = TRUE)

  ## Carry over the train_point indicator.
  ## NOTE(review): this assumes predict_growth() returns rows in the same order as inputDT_tgrid
  ## -- verify against predict_growth's implementation.
  preds_tgrid[, ("train_point") := inputDT_tgrid[["train_point"]]]

  ## Move the third-from-last column to the end.
  ## NOTE(review): presumably this places the 'preds' column last for readability of the saved
  ## csv -- confirm against the column layout produced by predict_growth().
  data.table::setcolorder(preds_tgrid, c(names(preds_tgrid)[-(ncol(preds_tgrid)-2)], names(preds_tgrid)[(ncol(preds_tgrid)-2)]))

  if (!is.null(file.name)) {
    ## The 'file.path' argument shadows base::file.path, but R's function-call lookup skips
    ## non-function objects, so the call below still resolves to the base function.
    full_path <- file.path(file.path, paste0(file.name, ".csv"))
    message(paste0("...writing csv file with predictions to: ", full_path))
    data.table::fwrite(preds_tgrid[, c(ID, t_name, "train_point", "preds"), with = FALSE],
                       file = full_path)
  }

  return(preds_tgrid)
}

#' Define data with grids of time-points for growth curve prediction
#'
#' @param dataDT Input \code{data.table} with all the relevant summaries that were used for modeling already defined.
#' @param ID A character string name of the column that contains the unique subject identifiers.
#' @param t_name A character string name of the column with integer-valued measurement time-points (in days, weeks, months, etc).
#' @param y A character string name of the column that represents the response variable in the model.
#' @param tgrid Specify the grid of time-points directly.
#' If missing a subject-specific grid is defined based on the subject's follow-up range.
#' @param grid_size How many time-points should be used in equally spaced grid?
#' This argument is only used when \code{tgrid} is missing.
#' @return A \code{data.table} of subject IDs and time-points, along with all the relevant subject specific curve summaries and predictors.
#' In addition, the output dataset also contains an indicator column 'train_point', set to \code{TRUE} for all (ID,time-points) that also appear
#' in the input data \code{dataDT}. That is 'train_point' indicates if the row might have been previously used for model training.
#' @note \code{dataDT} is modified by reference: its key is set to \code{c(ID, t_name)} on exit,
#' and a temporary 'train_point' column is added and then removed during evaluation.
#' @export
define_tgrid <- function(dataDT,
                         ID,
                         t_name,
                         y,
                         tgrid,
                         grid_size = 150) {
  if (!data.table::is.data.table(dataDT)) stop("critical error: dataDT must be a data.table, use 'data.table(dataDT)'")
  if ("train_point" %in% names(dataDT)) stop("critical error: columns named 'train_point' are not allowed in dataDT")

  ## 1. Define the grid of time-points for which to predict.
  if (missing(tgrid)) {
    ## Subject-specific grid: grid_size equally spaced points spanning each subject's observed t range.
    ## BUG FIX: the time column was previously hard-coded as 'agedays'; now the t_name argument is
    ## respected (consistent with how t_name is evaluated everywhere else in this function).
    gridDT <- dataDT[,
      {
       xrng <- range(eval(as.name(t_name)), na.rm = TRUE);
       as.integer(seq(xrng[1], xrng[2], length.out = grid_size))
     }, by = eval(ID)]
  } else {
    ## Common grid for all subjects: cross-join of subject IDs with the user-supplied time-points.
    gridDT <- CJ(unique(dataDT[[ID]]), as.integer(tgrid))
  }

  colnames(gridDT) <- c(ID, t_name)
  setkeyv(gridDT, cols = c(ID, t_name))

  ## 2. Remove person-time combos that already appear in input data (re-added via rbind in step 5)
  gridDT <- fsetdiff(gridDT, dataDT[, c(ID, t_name), with = FALSE])

  gridDT[, ("train_point") := FALSE] # indicator that this row hasn't been used for training

  ## Prep data for defining (Y.lt, lt) & (Y.rt, rt): the observed outcome and time nearest
  ## to each grid point on either side.
  sel_vars <- c(ID, t_name, y)
  dataDT_sel <- dataDT[, sel_vars, with = FALSE]
  dataDT_sel[, ("t.cp") := eval(as.name(t_name))] # copy of t, preserved through the rolling joins
  setkeyv(dataDT_sel, cols = c(ID, t_name))

  ## 3a. Define (Y.lt, lt): last observation at or before each grid time (rolling join, roll = Inf / LOCF)
  gridDT <- dataDT_sel[gridDT, roll = Inf]
  setnames(gridDT, old = c(y, "t.cp"), new = c("Y.lt", "lt"))

  ## 3b. Define (Y.rt, rt): first observation at or after each grid time (rolling join, roll = -Inf / NOCB)
  gridDT <- dataDT_sel[gridDT, roll = -Inf]
  setnames(gridDT, old = c(y, "t.cp"), new = c("Y.rt", "rt"))

  ## 4. Select first row of each subject and merge the remaining (baseline) covariates into the grid data
  sel_covars <- names(dataDT)[!(names(dataDT) %in% c(t_name, y, "lt", "rt", "Y.lt", "Y.rt"))]
  setkeyv(dataDT, c(ID)) # set key to allow binary search using `J()`
  dataDT_ID_bsl <- dataDT[J(unique(eval(as.name(ID)))), sel_covars, mult = 'first', with = FALSE]
  setkeyv(dataDT_ID_bsl, cols = c(ID))
  setkeyv(gridDT, cols = c(ID))
  gridDT <- merge(gridDT, dataDT_ID_bsl)

  ## 5. rbind the actual observations back into the grid data (flagged train_point = TRUE).
  ## NOTE: 'train_point' is temporarily added to dataDT by reference, then removed below.
  gridDT <- rbindlist(list(gridDT, dataDT[, ("train_point") := TRUE][, names(gridDT), with = FALSE]))
  dataDT[, ("train_point") := NULL]
  setkeyv(gridDT, cols = c(ID, t_name))
  setkeyv(dataDT, cols = c(ID, t_name))

  return(gridDT)
}

# ---------------------------------------------------------------------------------------
#' Define curve-based features and summaries for training or validation data
#'
#' Defines additional features and summaries of the growth curve person-time observations.
#' Used for modeling and defining the training and validation sets
#' (e.g., random holdout and cross-validation).
#' By setting \code{train_set} to \code{TRUE} this function will define features using all data points as a
#' full training set (no holdouts, summaries use all person-time rows).
#' In contrast, when \code{train_set = TRUE} and \code{hold_column} is not missing,
#' these features are defined only for non-holdout observations, excluding the holdout rows
#' (i.e., curve summaries will be defined based on training points only while dropping all holdout observations).
#' Finally, by setting \code{train_set} to \code{FALSE} one can create a validation dataset (e.g., for scoring with CV).
#' In this case the summaries and features will be defined for each row data point (X_i,Y_i)
#' by first dropping (X_i,Y_i) and then evaluating the summaries for (X_i,Y_i) based on the remaining observations.
#' This process is repeated in a loop for all person-time rows in the data.
#'
#' @param dataDT Input data.table
#' @param ID A character string name of the column that contains the unique subject identifiers.
#' @param t_name A character string name of the column with integer-valued measurement time-points
#' (in days, weeks, months, etc).
#' @param y A character string name of the column that represent the response variable in the model.
#' @param train_set Set to \code{TRUE} to define growth curve features and summaries for training data.
#' Set to \code{FALSE} to define the summaries for validation data. In the latter case the summaries
#' are defined for observation (X_i,Y_i) by first dropping that observation
#' and then evaluating the summaries for the remaining observations. This is repeated in a loop for
#' all person-time rows in the data.
#' @param hold_column A column with a logical flag for holdout rows / observations (\code{TRUE}
#' indicates that the row is a holdout).
#' When \code{train_set} is \code{TRUE} the resulting output data will contain all non-HOLDOUT
#' observations (training data points).
#' When \code{train_set} is \code{FALSE} the resulting output data will contain the HOLDOUT observations
#' only (validation data points).
#' To evaluate either training or validation data summaries FOR ALL observations this argument must be
#' missing
#' (in which case all observation from the input data are returned with their corresponding summaries).
#' @param noNAs Set to \code{TRUE} to fill missing neighbor values: when the left neighbor
#' (Y.lt, lt) does not exist it is replaced with the right neighbor (Y.rt, rt), and vice versa,
#' so that the neighbor columns contain no \code{NA}s.
#' @param includeRLMIDind Set to \code{TRUE} to add dummy indicator columns 'l.obs', 'mid.obs'
#' and 'r.obs' flagging each subject's left-most, middle and right-most observations.
#' @param verbose Set to \code{TRUE} to print messages on status and information to the console. Turn this on by default using \code{options(growthcurveSL.verbose=TRUE)}.
#' @export
#' @example tests/RUnit/RUnit_tests_01_features_cpp.R
#' @return A new \code{data.table} (the input is copied, not modified by reference), keyed by
#' \code{c(ID, t_name)}, with added neighbor columns ('lt', 'rt', 'Y.lt', 'Y.rt') and per-subject
#' outcome summary columns ('nY', 'meanY', 'sdY', 'medianY', 'minY', 'maxY').
define_features_drop <- function(dataDT,
                                 ID,
                                 t_name,
                                 y,
                                 train_set = TRUE,
                                 hold_column,
                                 noNAs = FALSE,
                                 includeRLMIDind = FALSE,
                                 # NOTE(review): 'verbose' is currently not referenced anywhere in this
                                 # function body -- confirm whether status messages were intended here.
                                 verbose = getOption("growthcurveSL.verbose")) {
  # dataDT <- data.table(dataDT)
  dataDT <- copy(dataDT) # Making sure nothing gets modified by reference:
  data.table::setkeyv(dataDT, cols = c(ID, t_name))

  if (train_set && !missing(hold_column)) {
    # Define which observations are in the hold-out set (and thus should be ignored when creating predictors for training set)
    non_hold_idx <- !dataDT[[hold_column]]
    dataDT <- dataDT[non_hold_idx, ]
  } else {
    # to define (Y.lt, Y.rt, lt, rt) for validation data points we will use the entire observed data
    # NOTE(review): non_hold_idx is not used after this point in this branch (see the commented-out
    # subset inside the train_set summary below); kept for reference.
    non_hold_idx <- rep.int(TRUE, nrow(dataDT))
  }

  # ---------------------------------------------------------------------------------------
  # When train_set is TRUE the features / summaries will be defined for the training data only (without holdouts if hold_column is specified),
  # where the holdout observations are ignored (but still kept in the data)
  # When train_set is FALSE then all observations will be used to define features / predictors.
  #   - The predictors below turn out to be equivalent when defining validation data points as well as when training on ALL data
  #   - This allows us to use every single observation as a test point for CV.MSE
  # ---------------------------------------------------------------------------------------
  # 1. Set left (Y[lt], lt) if exists (otherwise NA); 2. Set right (Y[rt], rt) if exists (otherwise NA)
  # (within-subject lag/lead of the time and outcome columns: previous / next measurement time and value)
  dataDT[, c("lt", "rt") := list(shift(eval(as.name(t_name)), type = "lag"), shift(eval(as.name(t_name)), type = "lead")), by = eval(ID)]
  dataDT[, c("Y.lt", "Y.rt") := list(shift(eval(as.name(y)), type = "lag", fill = NA), shift(eval(as.name(y)), type = "lead", fill = NA)), by = eval(ID)]

  if (includeRLMIDind) {
    # Add dummy indicator column(s) of being left-most (l.obs) / middle (mid.obs) / right-most (r.obs) observation:
    # (missing lag => first observation of the subject; missing lead => last observation)
    dataDT[, c("l.obs", "mid.obs", "r.obs"):= list(0L, 0L, 0L)]
    dataDT[is.na(lt), l.obs := 1L]
    dataDT[!is.na(lt) & !is.na(rt), mid.obs := 1L]
    dataDT[is.na(rt), r.obs := 1L]
  }

  if (noNAs) {
    ##  If left (Y[lt], lt) doesn't exist then (Y[lt], lt) = (Y[rt], rt);
    dataDT[is.na(Y.lt), Y.lt := Y.rt]
    dataDT[is.na(lt),  lt  := rt]
    ##  If right (Y[rt], rt) doesn't exist then (Y[rt], rt) = (Y[lt], lt);
    dataDT[is.na(Y.rt), Y.rt := Y.lt]
    dataDT[is.na(rt), rt := lt]
  }

  if (train_set) {
    # Evaluate the summary (predictors) on a training set (for holdout=TRUE, this will evaluate the summaries among training data points (excluding the holdouts)):
    # The per-subject summary table is merged back onto every person-time row via a keyed self-join.
    # dataDT <- dataDT[dataDT[non_hold_idx, {
    dataDT <- dataDT[dataDT[, {
        nY = length(eval(as.name(y)));
        meanY = mean(eval(as.name(y)));
        sdY = sd(eval(as.name(y)));
        medianY = median(eval(as.name(y)));
        minY = min(eval(as.name(y)));
        maxY = max(eval(as.name(y)));
        list(nY = nY, meanY = meanY, sdY = sdY, medianY = medianY, minY = minY, maxY = maxY)
      }, by = eval(ID)]]

  } else {
    # Evaluate the summary (predictors) for validation set. Treats each row of data as if it is a validation data point.
    # Loop through every data-point row i, remove it, then evaluate the summary for that subject with row i removed.
    # This will allow us to do validation set predictions when doing V-fold CV.
    # NOTE: such summaries will allow us to use ANY observed data point as a validation point and do prediction ALL at once (rather than having to loop over each point)
    dataDT[, c("Y_tmp") := list(eval(as.name(y)))]
    # Pre-create the summary columns (as doubles) so the grouped `:=` below can fill them in place.
    dataDT[, c("nY", "meanY", "sdY", "medianY", "minY", "maxY") := list(0.0,0.0,0.0,0.0,0.0,0.0)]
    dataDT[, c("nY", "meanY", "sdY", "medianY", "minY", "maxY") := {
            # Leave-one-out summaries: Y_tmp[-i] drops row i within the subject's group.
            for (i in seq_len(.N)) {
              nY[i] = length(Y_tmp[-i])
              meanY[i] = mean(Y_tmp[-i])
              sdY[i] = sd(Y_tmp[-i])
              medianY[i] = median(Y_tmp[-i])
              # min/max are guarded: for a subject with a single observation Y_tmp[-i] is empty,
              # and an unguarded min()/max() would warn and return +/-Inf.
              minY[i] = ifelse(length(Y_tmp[-i]) > 0, min(Y_tmp[-i]), NA)
              maxY[i] = ifelse(length(Y_tmp[-i]) > 0, max(Y_tmp[-i]), NA)
              # maxY[i] = max(Y_tmp[-i])
            };
          list(nY = nY, meanY = meanY, sdY = sdY, medianY = medianY, minY = minY, maxY = maxY)
      }, by = eval(ID)]
    dataDT[, ("Y_tmp") :=  NULL]
    # dataDT <- dataDT[dataDT[, {meanY = eval(as.name(y)); for (i in seq_len(.N)) {meanY[i] = mean(meanY[-i])};  list(meanY = meanY)}, by = eval(ID)]]
    # Keep only the holdout (validation) rows when a hold_column was supplied.
    if (!missing(hold_column)) dataDT <- dataDT[dataDT[[hold_column]], ]
  }

#   # Add total sum of observed Y's and other summaries?
#   # ...

  return(dataDT)
}
# osofr/growthcurveSL documentation built on May 24, 2019, 4:56 p.m.
# (documentation-site footer converted to a comment so this file parses as valid R)