R/preproc.R

Defines functions add_test_data from_unstructured_to_deeppam_tensor from_ped_to_tensor make_test_ped

Documented in add_test_data from_ped_to_tensor from_unstructured_to_deeppam_tensor make_test_ped

#' Creation of a generic test PED dataframe
#'
#' This function creates a list that contains (possibly) necessary PED frames
#' for prediction.
#' The list features three elements:
#' \code{[[1]]}: the PED frame using the original end times (original=.
#' (Could be used for validation loss)
#' \code{[[2]]}: the PED frame using the maximally observed end time for all
#' observations (transformed). (Could be used for testing)
#' \code{[[3]]}: the raw (no PED) data (raw_data).
#' The fucntions uses \code{pammtools::as_ped} in the backend.
#'
#' @param data a data.frame (or variant) for which the test set(s) should be
#' computed.
#' @param ped_train a PED data set (ee pammtools package) that has been used for
#' training
#' @return a list of length 3. (See description)
#' @export
#' @author Philipp Kopper
make_test_ped <- function(data, ped_train) {
  trafo_args <- attr(ped_train, "trafo_args")
  formula <- trafo_args[["formula"]]
  not <- which(names(trafo_args) == "formula")
  trafo_args <- trafo_args[(1:length(trafo_args))[-not]]
  timevar <- as.character(formula[[2]])[2]
  if (is.data.frame(data)) {
    # only static covars
    data2 <- data
    data2[[timevar]] <- max(data[[timevar]])
    list(original = do.call(as_ped, append(list(data, formula), trafo_args)),
         transformed = do.call(as_ped, append(list(data2, formula), trafo_args)),
         raw_data = data)
  } else {
    stop("Not yet implemented for longi. data.")
  }
}

#' Transform PED data to an array / tensor
#'
#' This function transforms PED data (from pammtools) to an array or tensor
#' for further usage in tensorflow.
#' The array is constructed from the PED so that the first dimension has same
#' length as there are unique IDs in the PED. The second dimension reflects all
#' distinct intervals per ID and the third dimension features all model features
#' effectively used by the PAM(M).
#' The third axis features the offset (\code{[, , 1]}) all linear covariates and
#' all spline covariates in a already transformed manner using mgcv::gam for
#' the transformation.
#' The transformation is based on the formula argument.
#' @param ped a PED (see pammtools) data frame.
#' @param formula a formula for a PAM(M) (see pammtools/mgcv)
#' @param training a boolean indicating if a training or test set is to be
#' constructed.
#' @param training_data only relevant if training == TRUE. The training PED
#' that has been used to fit the model is to be tested.
#' @param id a string indicating the name of the ID variable.
#' @return A list of class ped_tensor and length 2.
#' \code{[[1]]} is the associated input (X) and \code{[[2]]} the associated target (y).
#' @export
#' @author Philipp Kopper
#' @import pammtools
#' @import dplyr
#' @import stats
from_ped_to_tensor <-
  function(ped, formula = ped_status ~ s(tend, k = 9L, bs = "cr"),
           training = TRUE, training_data = NULL, id = "id") {
    if (training) {
      base_line_pamm <- pamm(as.formula(formula), data = ped)
      mm <- model.matrix(base_line_pamm)[, -1]
    } else {
      base_line_pamm <- pamm(as.formula(formula), data = training_data)
      mm <- predict(base_line_pamm, newdata = ped, type = "lpmatrix")[, -1]
    }
    dims_X <- c(length(unique(ped$id)), length(unique(ped$tend)), ncol(mm) + 1)
    dims_y <- c(length(unique(ped$id)), length(unique(ped$tend)), 1)
    X <- array(0, dims_X, dimnames = list(unique(ped[[id]]), unique(ped$tend),
                                          c("offset", colnames(mm))))
    y <- array(0, dims_y, dimnames = list(unique(ped[[id]]), unique(ped$tend)))
    wgs <- y
    for (i in 1:dim(X)[1]) {
      current_id <- as.integer(dimnames(X)[[1]][i])
      current_length <- sum(ped[[id]] == current_id)
      X[i, 1:current_length, 1] <- ped[current_id == ped[[id]], "offset"]
      X[i, 1:current_length, -1] <-
        mm[current_id == ped[[id]], dimnames(X)[[3]][-1]]
      y[i, 1:current_length, ] <- ped[current_id == ped[[id]], "ped_status"]
      wgs[i, 1:current_length, ] <- 1
    }
    indicator <- rep(0, dim(X)[3])
    names(indicator) <- dimnames(X)[[3]]
    indicator[dimnames(X)[[3]] %in% colnames(ped)] <- 1
    indicator[1] <- 0
    for (i in 1:length(base_line_pamm$smooth)) {
      ind_names <- paste(base_line_pamm$smooth[[i]]$label, ".",
                         1:base_line_pamm$smooth[[i]]$df, sep = "")
      indicator[ind_names] <- i + 1
    }

    class(X) <- c("ped_tensor_X", class(X))
    class(y) <- c("ped_tensor_y", class(y))
    attr(X, "trafo_args") <- attr(ped, "trafo_args")
    attr(X, "raw") <- ped
    res <- list(X = X, y = y)
    S <- sapply(base_line_pamm$smooth, "[[", "S")
    attr(res, "P") <- vector_list_prod(base_line_pamm$sp / nrow(ped), S)
    attr(res, "weights") <- wgs
    attr(res, "indicator") <- indicator
    attr(res, "trafo_args") <- attr(ped, "trafo_args")
    attr(res, "coeffs") <- base_line_pamm$coefficients
    class(res) <- "ped_tensor"
    res
  }

#' Transform unstructured data to an array / tensor that fits to the PED
#' tensor (BETA).
#'
#' This function transforms the unstrucutred data part.
#' For static covariates (format == "standard") basically nothing happens.
#' All other options are not yet implemented.
#' This current version, however, covers all previous use cases such as
#' Kopper et al. (2020).
#' @param data a list of tensors. In the current implementeation each tensor
#' must be processible by the respective network supplied to deeppam().
#' @param ped_tensor the associated tensor brought into PED format by
#' from_ped_to_tensor().
#' @param format a character indicating which transformation is used.
#' Currently only "standard has been implemented.
#' @return a list of tensors to be supplied to the deep part of the network
#' constructed by deeppam().
#' @export
#' @author Philipp Kopper
from_unstructured_to_deeppam_tensor <- function(data = list(), ped_tensor = NULL,
                                                format = "standard") {
  if (length(data) == 0) {
    stop("No data supplied.")
  }
  if (format == "standard") {
    data
  } else if (format == "ped") {
    stop("ped not implemented yet.")
    #to_ped_tensors(data, ped)
  } else {
    stop((c("Currently you have to supply more advanced longitudinal data",
            " formats manually. This function only covers two simple special",
            " cases: No transformation and a classical PED transformation.",
            " for static covaraites. This, however, will cover most primary",
            " use cases. There is no guarantee that custom. long. format",
            " can be modelled by DeepPAM.")))
  }
 # can vaguely stay the same for standard
}

#' Attach test data to DeepPAM model for testing via pec::pec.
#'
#' This function modifies the keras model constructed using deeppam() to feature
#' the data it should be tested on.
#' This step is necessary due to the rigorous preprocessing needed for DeepPAM.
#' newdata_str MUST be equivalent to the newdata argument of pec::pec.
#' Internally, the test data is attached via the attributes of the R object.
#' @param model a keras model of additional subclass deeppam which should be
#' evaluated.
#' @param newdata_str the strucured new data for evaluation. A regular data set.
#' newdata_str MUST be identical to the newdata argument used in pec:pec() later
#' on.
#' @param newdata_unstr the unstructured part of the data for evalution.
#' Must match newdata_str.
#' @param training_ped the PED used for training.
#' @param formula the formula used to fit the model. (can be solved via attribute)
#' @return the same model as inputted with a modified attribute set.
#' The model can be handed over to pec::pec() only after this step.
#' @export
#' @author Philipp Kopper
add_test_data <- function(model, newdata_str, newdata_unstr = list(),
                          training_ped, formula = ped_status ~ s(tend, k = 9L)) {
  test_ped <- make_test_ped(newdata_str, training_ped)
  ped_tensor_test <- from_ped_to_tensor(test_ped$transformed,
                                        formula, training = FALSE,
                                        training_data = training_ped)
  attachment <- vector(mode = "list", 1)
  attachment[[1]] <- ped_tensor_test$X
  attr(model, "test_data") <- append(attachment, list(newdata_unstr))
  model
}
pkopper/deeppam documentation built on Jan. 19, 2021, 12:39 a.m.