R/relative_poisson.R
In popEpi: Functions for Epidemiological Analysis using Population Data

Documented in relpois relpois_ag

#' @title Excess hazard Poisson model
#' @author Joonas Miettinen, Karri Seppa
#' @description Estimate a Poisson piecewise constant excess
#' hazards model
#' @param data a dataset split with e.g. `[lexpand]`;
#' must have expected hazard merged within
#' @param formula a formula which is passed on to `glm`; see Details
#' @param fot.breaks optional; a numeric vector of [a,b) breaks to specify
#' survival intervals over the follow-up time; if `NULL`, the
#' existing breaks along the mandatory `fot` time scale in `data`
#' are used (e.g. the breaks for `fot` supplied to `lexpand`)
#' @param subset a logical vector or condition; e.g. `subset = sex == 1`;
#' limits the data before estimation
#' @param check logical; if `TRUE`, tabulates excess cases by all
#' factor variables in the formula to check for negative / `NA`
#' excess cases before fitting the GLM
#' @param ... any argument passed on to `glm`
#' @import stats
#' @details
#'
#' **Basics**
#'
#' `relpois` employs a custom link function of the Poisson variety
#' to estimate piecewise constant parametric excess hazards. The pieces
#' are determined by `fot.breaks`. A `log(person-years)` offset
#' is passed automatically to the `glm` call.
#'
#' **Formula usage**
#'
#' The formula can be used like any ordinary `glm` formula. The user must
#' define the outcome in some manner, which is usually `lex.Xst` after splitting
#' with e.g. `lexpand`. The exception is the possibility of including
#' the baseline excess hazard terms by including the
#' reserved term `FOT` in the formula.
#'
#' For example, `lex.Xst != 0 ~ FOT + agegr` estimates a model with constant
#' excess hazards at the follow-up intervals as specified by
#' the pertinent breaks used in splitting `data`,
#' as well as for the different age groups.
#' `FOT` is created ad hoc if it is used in the formula.
#' If you leave out `FOT`, the hazard is effectively
#' assumed to be constant across the whole follow-up time.
#'
#' You can also simply use your own follow-up time interval variable that
#' you have created before calling `relpois`. However, when using
#' `FOT`, `relpois` automatically checks for e.g.
#' negative excess cases in follow-up intervals,
#' allowing for quickly finding splitting breaks
#' where model estimation is possible. It also drops any data outside the
#' follow-up time window.
#'
#' **Splitting and merging population hazard**
#'
#' The easiest way to both split and to include population hazard information is
#' by using `[lexpand]`. You may also fairly easily do it by hand
#' by splitting first and then merging in your population hazard information.
#'
#'
#' **Data requirements**
#'
#' The population hazard information must be available for each record and named
#' `pop.haz`. The follow-up time variable must be named `"fot"` e.g.
#' as a result of using `lexpand`. The `lex.dur` variable must also
#' be present, containing person-year information.
#'
#'
#' @return
#' A `glm` object created using a custom Poisson family construct. Some
#' `glm` methods are applicable.
#'
#' @seealso
#' `[lexpand]`, `[stats::poisson]`, `[stats::glm]`
#' @family main functions
#' @family relpois functions
#' @export relpois
#'
#' @examples
#' ## use the simulated rectal cancer cohort
#' data("sire", package = "popEpi")
#' sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE)
#'
#' ## usable straight away after splitting
#' fb <- c(0,3/12,6/12,1,2,3,4,5)
#' x <- lexpand(sire, birth = bi_date, entry = dg_date,
#'              exit = ex_date, status=status,
#'              breaks = list(fot=fb), pophaz=popmort)
#' rpm <- relpois(x, formula = lex.Xst %in% 1:2 ~ FOT + agegr)
#'
#' ## some methods for glm work. e.g. test for interaction
#' \donttest{
#' rpm2 <- relpois(x, formula = lex.Xst %in% 1:2 ~ FOT*agegr)
#' anova(rpm, rpm2, test="LRT")
#' AIC(rpm, rpm2)
#' ## update() won't work currently
#' }
#' @references
#' Paul W Dickman, Andy Sloggett, Michael Hills, and Timo Hakulinen.
#' Regression models for relative survival.
#' Stat Med. 2004 Jan 15;23(1):51-64.
#' \doi{10.1002/sim.1597}


relpois <- function(data,
                    formula,
                    fot.breaks = NULL, subset = NULL, check=TRUE, ...) {
  ## R CMD CHECK appeasement
  lex.dur <- NULL

  ## prep arguments ------------------------------------------------------------
  excess_cases <- fot <- pop.haz <-  NULL ## appease R CMD CHECK

  ## somehow the class of the data is being altered by this function
  oldClass <- class(data)

  if (missing(formula) || !inherits(formula, "formula")) stop("formula not defined")

  form_vars <- all.vars(formula)
  dataname <- as.name(deparse(substitute(data)))

  if (!inherits(data, "Lexis")) {
    stop("data is not a Lexis object; data must be a result of splitting or using Lexis")
  }

  if ("FOT" %in% names(data)) {
    stop("FOT is a reserved name but you have a variable with that name in data; rename/delete it first")
  }

  # wasDF <- FALSE
  if (!is.data.table(data)) {
    data <- copy(data)
    setDT(data)
    message("Took a copy of your data because it was a data.frame and not a data.table. This may take up a lot of memory.")
    message("It is recommended to convert your data to data.table before using this function using as.data.table or setDT")
  }

  req_vars <- unique(c("lex.id", "fot", "lex.dur", "pop.haz", setdiff(form_vars, "FOT")))
  all_names_present(data, req_vars)

  surv.breaks <- attr(data, "breaks")$fot
  if (is.null(surv.breaks)) {
    stop("did not find any breaks information in data attributes named 'fot';
            probable reason: split data was edited after splitting - ",
         "don't do that")
  } else {
    if (!is.null(fot.breaks)) {
      if (any(!fot.breaks %in% surv.breaks)) {
        stop("fot.breaks must be a subset of the breaks for 'fot' used in splitting;
           type attr(data, 'breaks')$fot to see the breaks you used in splitting")
      } else {
        surv.breaks <- fot.breaks
      }
    }

  }

  ## prep & subset data --------------------------------------------------------
  subset <- substitute(subset)
  subset <- evalLogicalSubset(data, subset)

  if (any(is.na(data[subset, ]$pop.haz))) {
    stop("some pop.haz are NA")
  }

  on.exit({
    setcolsnull(data, c("FOT", tmpdexp), soft = TRUE)
  }, add = TRUE)

  if ("FOT" %in% form_vars)  {
    data[, "FOT" := cut(fot, breaks = surv.breaks, right = FALSE)]
    # set(data, j = "FOT", value = cut(data$fot, breaks = surv.breaks, right=FALSE))
    subset <- subset & !is.na(data$FOT)
  }
  tmpdexp <- makeTempVarName(data, pre = "TEMP_d.exp_")
  data[, c(tmpdexp) := pop.haz*lex.dur]
  # set(data, j = tmpdexp, value = data$pop.haz * data$lex.dur)


  if (check) {
    ## test for negative excess cases in factor variable combinations ------------

    ## determine factor variables for cross-tabulating

    fac_vars <- colnames(attr(terms.formula(formula), "factors"))

    fac_list <- paste0(fac_vars, collapse=", ")
    fac_list <- paste0("list(", fac_list, ")")
    fac_list <- parse(text=fac_list)

    wh_fac <- as.data.table(data)[subset, unlist(lapply(eval(fac_list), is.factor))]
    fac_vars <- fac_vars[wh_fac]

    if (length(fac_vars) == 0) fac_vars <- NULL
    fac_list <- paste0(fac_vars, collapse=", ")
    fac_list <- paste0("list(", fac_list, ")")
    fac_list <- parse(text=fac_list)


    ## test negativity of excess cases
    LHS <- as.character(formula)
    LHS <- LHS[2]
    LHS <- parse(text = LHS)

    excas <- as.data.table(data)[subset, list(excess_cases = sum(eval(LHS)-get(tmpdexp))), keyby=eval(fac_list)]
    setnames(excas, 1:ncol(excas), c(fac_vars, "excess_cases"))


    if (any(is.na(excas$excess_cases))) {
      stop("some excess cases were NA; is pop.haz available for all records?")
    }
    excas <- excas[excess_cases <= 0]
    if (any(excas$excess_cases<=0)) {
      print(excas)
      warning("negative excess cases found in some combinations of factor variables;
           see printed table and try e.g. wider FOT intervals")
    }
  }


  ## custom poisson family -----------------------------------------------------
  RPL <- copy(poisson())
  RPL$link <- "glm relative survival model with Poisson error"
  RPL$linkfun <- function(mu, d.exp = data[[tmpdexp]][subset]) log(mu - d.exp)
  RPL$linkinv <- function(eta, d.exp = data[[tmpdexp]][subset]) d.exp + exp(eta)


  RPL$initialize <- substitute( {
    if (any(y < 0)) stop(paste("Negative values not allowed for",
                               "the Poisson family"))
    n <- rep.int(1, nobs)
    mustart <- pmax(y, d.exp) + 0.1
  }, list(d.exp = data[[tmpdexp]][subset]) )



  ## glm call ------------------------------------------------------------------
  ## update() won't work
  ## anova() works


  ml <- glm(formula = formula, data=data[subset,], offset=log(lex.dur),
            #             subset = subset, ## Error in xj[i] : invalid subscript type 'closure'
            family = RPL, ...)

  ## final touches -------------------------------------------------------------
  ml$d.exp <- data[subset, ][[tmpdexp]]
  ml$FOT <- data[subset, ]$FOT
  ml$fot.breaks <- surv.breaks
  ml$call$data <- dataname
  ml$call$formula <- formula
  setattr(ml, "class", c("relpois", "glm", "lm"))

  setattr(data, "class", oldClass) ## see beginning of function


  ml
}



#' @title Excess hazard Poisson model
#' @author Joonas Miettinen, Karri Seppa
#' @description Estimate a Poisson Piecewise Constant Excess
#' Hazards Model
#' @param formula a formula with the counts of events as the response.
#' Passed on to `glm`. May contain usage of the `offset()` function
#' instead of supplying the offset for the Poisson model via the argument
#' `offset`.
#' @param data an `aggre` object (an aggregated data set;
#' see `[as.aggre]` and `[aggre]`)
#' @param d.exp the counts of expected cases. Mandatory.
#' E.g. `d.exp = EXC_CASES`, where `EXC_CASES` is a column in data.
#' @param offset the offset for the Poisson model, supplied as e.g.
#' `offset = log(PTIME)`, where `PTIME` is a subject-time
#' variable in data. Not mandatory, but almost always should be supplied.
#' @param breaks optional; a numeric vector of [a,b) breaks to specify
#' survival intervals over the follow-up time; if `NULL`, the
#' existing breaks along the mandatory time scale mentioned in `formula`
#' are used
#' @param subset a logical vector or condition; e.g. `subset = sex == 1`;
#' limits the data before estimation
#' @param piecewise `logical`; if `TRUE`, and if any time scale
#' from data is used (mentioned) in the formula, the time scale is
#' transformed into a factor variable indicating intervals on the time scale.
#' Otherwise the time scale left as it is, usually a numeric variable.
#' E.g. if `formula = counts ~ TS1*VAR1`, `TS1` is transformed
#' into a factor before fitting model.
#' @param check `logical`; if `TRUE`, performs check on the
#' negativity excess cases by factor-like covariates in formula -
#' negative excess cases will very likely lead to non-converging model
#' @param ... any other argument passed on to `[stats::glm]` such as
#' `control` or `weights`
#' @import stats
#'
#' @return
#' A `relpois` object created using a custom Poisson family construct.
#'
#' @seealso
#' `[lexpand]`, `[poisson]`, `[glm]`
#' @family main functions
#' @family relpois functions
#' @examples
#' ## use the simulated rectal cancer cohort
#' data(sire, package = "popEpi")
#' sire$agegr <- cut(sire$dg_age, c(0,45,60,Inf), right=FALSE)
#'
#' ## create aggregated example data
#' fb <- c(0,3/12,6/12,1,2,3,4,5)
#' x <- lexpand(sire, birth = bi_date, entry = dg_date,
#'              exit = ex_date, status=status %in% 1:2,
#'              breaks = list(fot=fb),
#'              pophaz=popmort, pp = FALSE,
#'              aggre = list(agegr, fot))
#'
#' ## fit model using aggregated data
#' rpm <- relpois_ag(formula = from0to1 ~ fot + agegr,  data = x,
#'                   d.exp = d.exp, offset = log(pyrs))
#' summary(rpm)
#'
#' ## the usual functions for handling glm models work
#' rpm2 <- update(rpm, . ~ fot*agegr)
#' anova(rpm, rpm2, test="LRT")
#' AIC(rpm, rpm2)
#'
#' ## other features such as residuals or predicting are not guaranteed
#' ## to work as intended.
#' @export

relpois_ag <- function(formula, data, d.exp, offset = NULL, breaks = NULL, subset = NULL, piecewise = TRUE, check = TRUE, ...) {

  TF <- environment()
  PF <- parent.frame(1L)
  original_formula <- formula

  if (!inherits(data, "aggre")) {
    stop("data is not an aggre object. Please aggregate your data first using ",
         "e.g. lexpand(). If your data is pre-aggregated, use as.aggre() to ",
         "mark it as such.")
  }

  formula <- evalRecursive(formula, env = TF, enc = PF)$arg
  if (missing(formula) || !inherits(formula, "formula")) stop("formula not defined")




  ## detect survival time scale ------------------------------------------------
  oldBreaks <- copy(attr(data, "breaks"))
  allScales <- names(oldBreaks)
  if (is.null(oldBreaks)) {
    stop("data does not have breaks information. Is it a result of using ",
         "aggre() or as.aggre()?")
  }
  survScale <- intersect(all.vars(formula), allScales)
  if (length(survScale) > 1L) {
    stop("Found several used time scales in formula, which is not supported ",
         "(found ", paste0("'", survScale, "'", collapse = ", "), ")")
  }

  ## check supplied breaks -----------------------------------------------------
  if (is.numeric(breaks)) {
    breaks <- list(breaks)
    names(breaks) <- survScale
  }

  if (!is.null(breaks)) {
    if (!all_breaks_in(breaks, oldBreaks)) {
      stop("Supplied breaks must be subset of the breaks used in splitting/",
           "aggregating data. See the latter using e.g. ",
           "attributes(x)$aggre.meta$breaks where x is your aggregated data.")
    }
  }

  ## pre-find args -------------------------------------------------------------
  desub <- substitute(d.exp)
  sub_d.exp <- evalRecursive(desub, env = data[1L, ], enc = PF)$argSub
  offsub <- substitute(offset)
  sub_offset <- evalRecursive(offsub, env = data[1L, ], enc = PF)$argSub

  ## prep & subset data --------------------------------------------------------
  subset <- substitute(subset)
  subset <- evalLogicalSubset(data, subset)

  av <- c(all.vars(formula), all.vars(sub_d.exp), all.vars(sub_offset))
  av <- intersect(names(data), av)

  x <- subsetDTorDF(data, subset = subset, select = av)

  setDT(x)
  setattr(x, "class", c("aggre", "data.table", "data.frame"))

  ## handle breaks -------------------------------------------------------------
  if (!is.null(breaks)) {
    if (!piecewise) {
      stop("Supplied breaks but piecewise = FALSE. Please select piecewise = ",
           "TRUE if you want piecewise estimates defined by the breaks.")
    }

  }

  cutBreaks <- breaks
  othScales <- setdiff(names(oldBreaks), names(cutBreaks))
  cutBreaks[othScales] <- oldBreaks[othScales]
  cutBreaks[sapply(cutBreaks, length) < 2L] <- NULL

  if (piecewise && length(cutBreaks)) {

    for (sc in names(cutBreaks)) {
      set(x, j = sc, value = cut(x[[sc]], breaks = cutBreaks[[sc]],
                                 right = FALSE, labels = FALSE))

      pieces <- round(cutBreaks[[sc]], 2L)
      pieces <- paste0("[", pieces[-length(pieces)], ", ", pieces[-1L], ")")
      set(x, j = sc, value = pieces[x[[sc]]])
    }

  }

  ## eval value args -----------------------------------------------------------
  d.exp <- evalPopArg(x, sub_d.exp, enclos = PF,
                      DT = TRUE, recursive = TRUE)
  if (is.null(d.exp)) stop("argument d.exp was not supplied")
  d.exp <- rowSums(d.exp)
  if (length(d.exp) == nrow(data)) d.exp <- d.exp[subset]

  offset <- evalPopArg(x, sub_offset, enclos = PF,
                       DT = TRUE, recursive = TRUE)
  if (!is.null(offset)) offset <- rowSums(offset)
  if (length(offset) == nrow(data)) offset <- offset[subset]

  ## check excess cases --------------------------------------------------------
  d <- eval(formula[[2]], envir = x, enclos = PF)
  check_excess_cases(d = d, d.exp = d.exp, data = x,
                     formula = formula, enclos = PF)

  ## custom poisson family -----------------------------------------------------
  RPL <- copy(poisson())
  RPL$link <- "glm relative survival model with Poisson error"
  RPL$linkfun <- function(mu, d.exp = TF$d.exp) {
    log(mu - d.exp)
  }
  RPL$linkinv <- function(eta, d.exp = TF$d.exp) {
    d.exp + exp(eta)
  }

  RPL$initialize <- substitute( {
    if (any(y < 0)) stop(paste("Negative values not allowed for",
                               "the Poisson family"))
    n <- rep.int(1, nobs)
    mustart <- pmax(y, d.exp) + 0.1
  }, list(d.exp = TF$d.exp) )

  ## glm call ------------------------------------------------------------------

  ## NOTE: parent.frame(3L) to find this (this function's) environment
  ml <- glm(formula = formula, data=x, offset = parent.frame(3L)$offset, family = RPL, ...)

  ## final touches -------------------------------------------------------------

  ml$call <- match.call()
  setattr(ml, "class", c("relpois", "glm", "lm"))

  ml
}






check_excess_cases <- function(d, d.exp, formula, data, enclos = parent.frame(1)) {
  # @title Check Excess Counts for a Relative Poisson Model
  # @description Checks that the excess counts by strata all exceed 0.
  # @param d a vector of observed counts of cases
  # @param d.exp a vector of expected counts of cases
  # @param a formula, the right side of which is inspected for factor-like
  # stratifying variables (factors and character variables)
  # @param data a data set to eval formula in its context
  # @param enclos passed on to RHS2DT() to evaluate formula to columns;
  # enclosing environment of data
  PF <- parent.frame(1)
  tF <- environment()

  d.exc <- NULL

  by <- RHS2DT(formula, data = data, enclos = enclos)
  if (!length(by)) by <- list()
  facVars <- names(by)[sapply(by, function(col) is.factor(col) || is.character(col))]

  d <- substitute(d)
  d <- eval(d, envir = data, enclos = PF)
  d.exp <- substitute(d.exp)
  d.exp <- eval(d.exp, envir = data, enclos = PF)

  if (length(facVars)) {
    by <- setDT(mget(facVars, as.environment(by)))
  } else {
    by <- list()
  }

  dt <- data.table(d = d, d.exp = d.exp)
  dt[, d.exc := d - d.exp]

  for (k in seq_along(names(by))) {
    bycol <- names(by)[k]

    tab <- dt[, lapply(.SD, sum), keyby = .(by[[bycol]])][d.exc <= 0L, ]
    setnames(tab, 1, bycol)
    if (nrow(tab)) {
      on.exit(print(tab))
      stop("There are negative excess cases in the data calculated separately ",
           "by the factor-like variables ",
           paste0("'", facVars, "'", collapse = ", "), ". The model is not ",
           "estimable with negative excess cases in strata. ",
           "Infracting levels:")
    }

  }

  if (!length(by)) {
    tab <- dt[, lapply(.SD, sum)]
    if (tab$d.exc <= 0L) {
      stop("The marginal sum of excess cases is negative; the model cannot ",
           "be fitted. ")
    }
  }




  invisible(NULL)
}






relpois_lex <- function(formula,
                        data,
                        pophaz = NULL,
                        breaks = NULL,
                        subset = NULL,
                        check = TRUE,
                        ...) {
  PF <- parent.frame(1)
  TF <- environment()

  form <- agVars <- NULL


  ## checks --------------------------------------------------------------------

  checkLexisData(data)
  checkPophaz(lex = data, ph = pophaz)
  if (!is.null(breaks)) checkBreaksList(breaks)

  oldBreaks <- copy(attr(data, "breaks"))
  allScales <- copy(attr(data, "time.scales"))


  ## detect which time scale used ----------------------------------------------

  survScale <- intersect(all.vars(formula), allScales)
  if (length(survScale) > 1L) {
    stop("Found several used time scales in formula, which is not supported ",
         "(found ", paste0("'", survScale, "'", collapse = ", "), ")")
  }
  ## subset --------------------------------------------------------------------

  sb <- substitute(subset)
  subset <- evalLogicalSubset(data, sb, enclos = PF)
  x <- data[subset, ]

  ## essentially same steps as in survtab() here, maybe make that
  ## into a function / generalize lexpand.

  ## splitting -----------------------------------------------------------------
  if (is.numeric(breaks) && length(survScale)) {
    breaks <- list(breaks)
    names(breaks) <- survScale
  }
  if (!is.null(breaks)) x <- splitMulti(x, breaks = breaks, drop = TRUE)
  newBreaks <- copy(attr(x, "breaks"))

  ## merge in pophaz -----------------------------------------------------------
  haz <- makeTempVarName(x, pre = "haz_")
  ph <- data.table(pophaz)
  phVars <- setdiff(names(ph), "haz")
  setnames(ph, "haz", haz)
  x <- cutLowMerge(x, pophaz, by = phVars, all.x = TRUE, all.y = FALSE,
                   old.nums = TRUE, mid.scales = intersect(allScales, phVars))

  # expected cases
  d.exp <- makeTempVarName(x, pre = "d.exp_")
  set(x, j = d.exp, value = x$lex.dur * x[[haz]])

  ## aggregating ---------------------------------------------------------------
  ag <- model.frame(formula[-2], data = x) ## without response
  setDT(ag)
  set(ag, j = d.exp, value = x[[d.exp]])
  d <- makeTempVarName(x, pre = "d_")
  set(ag, j = d, value = eval(form))
  ag <- aggre(x, by = agVars, sum.values = d.exp)
  rm(x)

  ag_form <- formula
  ag_form[[2]] <- quote(from0to1)

  rp <- relpois_ag(ag_form, data = data, breaks = NULL)

  rp$call <- match.call()
  rp$formula <- formula

  rp
}