R/cpmgee.R

Defines functions cpmgee

Documented in cpmgee

#' CPMs for clustered continuous response data based on GEE methods for ordinal data
#'
#' This function function fits cumulative probability models (CPMs) clustered/longitudinal continuous response data
#' based on GEE methods for ordinal data.
#'
#' @param formula an R formula object
#' @param subjects a character string specifying the name of the subject variable
#' @param data a data frame including response data and covariates
#' @param corr.mod a character string specifying the working correlation structure
#'  ("independence", "exchangeable", and "ar1")
#' @param alpha an initial value for the association parameter in the range of 0.05 to 0.95.
#' @param fit.opt a vector of options to control the behavior of the fitting algorithm
#' @return A list containing the following components:
#' @return \item{max.id}{number of clusters}
#' @return \item{linear.predictors}{a vector of linear predictors}
#' @return \item{fitted.values}{a vector of the fitted values (linear prectors after inverse link function tranformation)}
#' @return \item{coefficients}{a vector of interecept and regression parameters}
#' @return \item{robust.var}{the robust (sandwich) variance matrix}
#' @return \item{alpha}{the estimate of the association parameter}
#' @export
#'
#' @details CPMs are useful for the analysis of continuous response data which may need to be transformed 
#' prior to fitting standard regression models. CPMs are semi-parametric linear transformation models; 
#' they nonparametrically estimate the appropriate transformation as part of the fitting procedure.
#'
#' We propose two feasible and computationally efficient approaches to
#' fit CPMs for clustered continuous response variables with different working correlation structures
#' (independence, exchangeable and AR1 working correlation structures).
#'
#' CPMs with independence working correlation can be efficiently fit to clustered continuous responses
#' with thousands of distinct values based on CPMs and sandwich estimator for variances.
#'
#' To improve efficiency, CPMs with more complex working correlation structures (exchangeable and AR1)
#' can be fit with a one-step GEE estimator for repolr (repeated measures proportional odds logistic regression
#' proposed by Parsons). The number of distinct response values can be further reduced by
#' equal-quantile binning or rounding.
#'
#' Estimates of the mean, quantiles, and exceedance probabilities conditional on covariates (new data)
#' can be derived from the model fit.
#'
#' @seealso \code{\link{cdf_cpmgee}, \link{quantile_cpmgee}, \link{mean_cpmgee}}
#'
#'
#' @references
#' Tian et al. "Analyzing clustered continuous response variables with ordinal
#'  regression models." (2022) (to be submitted)
#' @references
#' Parsons, N. (2017). repolr: an R package for fitting proportional-odds models to repeated
#' ordinal scores. R package version 3.4 https://CRAN.R-project.org/package=repolr
#' @references
#' Harrell, F. (2020). rms: Regression modeling strategies. R package version 6.1.0. https://CRAN.R-project.org/package=rms
#' @references
#' Liu, Q., Shepherd, B. E., Li, C., & Harrell Jr, F. E. (2017). Modeling continuous response variables using ordinal regression. 
#' Statistics in Medicine, 36(27), 4316-4335.
#'
#'
#' @import Matrix
#' @import rms
#' @import diagonals
#' @importFrom Rcpp evalCpp
#' @importFrom methods as
#' @importFrom stats as.formula coef dnorm model.matrix qnorm terms terms.formula
#' @useDynLib cpmgee, .registration=TRUE
#' @exportPattern "^[[:alpha:]]+"
#'
#' @examples
#' data(data)
#'
#' # independence working correlation structure
#' mod_cpmgee_ind <- cpmgee(formula = y ~ x + t, data = data, 
#' subjects = 'id', corr.mod = 'independence')
#'
#' # exchangeable working correlation structure
#' # (we use binned response data here for computational efficiency)
#' mod_cpmgee_ex <- cpmgee(formula = y_bin50 ~ x + t, data = data,
#' subjects = 'id', corr.mod = 'exchangeable', alpha = 0.5)
#'
#' # new data
#' new_data <- data.frame(x = c(0,1), t = 0.2)
#'
#' # conditional quantities for independence working correlation structure
#' mean_ind <- mean_cpmgee(mod_cpmgee_ind, data$y, new_data)
#' median_ind <- quantile_cpmgee(mod_cpmgee_ind, data$y, new_data, 0.5)
#' cdf_ind <- cdf_cpmgee(mod_cpmgee_ind, data$y, new_data, 3)
#'
#' # conditional quantities for exchangeable working correlation structure
#' mean_ex <- mean_cpmgee(mod_cpmgee_ex, data$y_bin50, new_data)
#' median_ex <- quantile_cpmgee(mod_cpmgee_ex, data$y_bin50, new_data, 0.5)
#' cdf_ex <- cdf_cpmgee(mod_cpmgee_ex, data$y_bin50, new_data, 3)

cpmgee <- function(formula, subjects, data, corr.mod = "independence",
                   alpha = 0.5, fit.opt = rep(NA, 5)){
  # start
  call <- match.call()

  # set-up
  corr.mods <- c("ar1", "exchangeable", "independence")
  icorr.mod <- as.integer(match(corr.mod, corr.mods, -1))
  if (icorr.mod < 1){stop("corr.mod: set to be independence, ar1 or exchangeable")}
  rcorr.mod <- corr.mod
  diffmeth <- 'analytic'
  initial <- 'orm'
  alpha <- as.double(alpha)
  set.fit.opt <- c(cmaxit = 10, omaxit = 5, ctol = 0.001, otol = 0.00001, h = 0.01)
  set.fit.opt[which(is.na(fit.opt) == FALSE)] <- fit.opt[which(is.na(fit.opt) == FALSE)]
  if(set.fit.opt[1] < 4){set.fit.opt[1] <- 4}
  if(alpha < 0.05 | alpha > 0.95){stop("alpha: invalid correlation parameter")}
  subjects <- as.character(subjects)
  isubject <- as.integer(match(subjects, names(as.data.frame(data)), -1))
  if (isubject < 1){stop("subjects: unknown subject name")}
  orig.formula <- as.formula(formula)
  
  # convert id to numeric
  data$subjects.ord <- as.numeric(factor(data[,subjects], levels = unique(data[,subjects])))
  data <- data[order(data$subjects.ord),]
  
  # max cluster size
  times <- seq(1, max(table(data$subjects.ord)))
  
  # convert response to ordinal variable
  y <- data[,all.vars(formula)[1]]
  if(is.factor(y)){
    ylevels <- levels(y)
    y.ord <- unclass(y)
  }else{
    ylevels <- sort(unique(y))
    y.ord <- match(y, ylevels)
  }
  data$y.ord <- y.ord
  formula.ord <- as.formula(paste('y.ord', '~', paste(all.vars(formula)[-1], collapse = ' + ')))

  # categories 
  categories <- max(y.ord)
  categories1 <- categories - 1
  
  # model matrix
  exdata <- ord.expand(formula = formula.ord, times = times,
                       data = data, subjects = 'subjects.ord', categories = categories)
  formula <- exdata$formula
  dat <- list(data = exdata$data)
  covariate_part <- model.matrix(formula.ord, data = exdata$data)[,-1]
  covariate_names <- colnames(covariate_part)
  intercept_names <- paste0('cuts', exdata$data$cuts[1:categories1])
  # intercept part for one observation
  i_intercept <- Matrix::Diagonal(categories1)
  intercept_names <- paste0('cuts', exdata$data$cuts[1:categories1])
  # number of replicates
  v <- rep(1, nrow(data))
  intercept_part <- kronecker(v, i_intercept)
  Xmat <- list(design = cbind(intercept_part, covariate_part))
  var.names <- c(intercept_names, covariate_names)

  mod <- rms::orm(formula = formula.ord, data = data, x = TRUE, y = TRUE) # default logit link
  inv.logit <- function(x) 1 / (1 + exp(-x))

  # independence working correlation
  if(corr.mod == 'independence' | length(times) == 1){
    mod_robust <- rms::robcov(fit = mod, cluster = data[,subjects])
    coeffs <- -mod_robust$coefficients
    coeffs[categories:length(coeffs)] <- -coeffs[categories:length(coeffs)] 
    robust.var <- mod_robust$var
    rownames(robust.var) <- colnames(robust.var) <- var.names
    names(coeffs) <- var.names

    return(list(
      title = "CPM GEE",
      call = call,
      data = call[["data"]],
      subjects = subjects,
      formula = mod_robust$sformula,
      orig.formula = mod_robust$sformula,
      corr.mod = 'independence',
      times = times,
      categories = categories,
      id = data[,subjects],
      max.id = length(unique(data[,subjects])),
      y = exdata$data[,all.vars(formula)[1]],
      linear.predictors = as.vector(Xmat$design %*% mod[['coefficients']]),
      fitted.values = inv.logit(mod$linear.predictors),
      coefficients = coeffs,
      robust.var = robust.var
    ))
  }

  mod[['coefficients']] <- -mod[['coefficients']]
  mod[['linear.predictors']] <- as.vector(Xmat$design %*% mod[['coefficients']])
  mod[['fitted.values']] <- inv.logit(mod$linear.predictors)
  mod[['data']] <- exdata$data
  mod[['y']] <- exdata$data[,all.vars(formula)[1]]
  xsmat <- smat(coeff = mod$coefficients[1:categories1])

  # 2. update alpha
  xhgmat <- hgmat(mod = mod, smat = xsmat, X = Xmat,
                  modtype = "glm", diffmeth = diffmeth, alpha = alpha,
                  corrmod = corr.mod, h = set.fit.opt[5])

  xupalpha <- upalpha(hgmat = xhgmat, alpha = alpha, diffmeth = diffmeth, h = set.fit.opt[5])
  alpha <- xupalpha$alpha

  # 3. update beta
  xicormat <- list(irmat = icormat(mod=mod, smat=xsmat, modtype='glm', diffmeth = diffmeth,
                                   alpha=alpha, corrmod = corr.mod, h = set.fit.opt[5]))
  mod.ordgee <- ordgee(mod = mod, icormat = xicormat, X = Xmat,
                          ctimes = times, categories = categories,
                          omaxit = as.integer(set.fit.opt[2]), otol = as.double(set.fit.opt[4]))
  coeffs <- mod.ordgee$coefficients

  polycuts <- NA
  # force increasing for intercepts
  decreasing_ind <- which(diff(coeffs[1:categories1]) <= 0)
  if(length(decreasing_ind) > 0){
    for(ind in decreasing_ind){
      # if the differences are extremely small -> use the mid value
      if(coeffs[ind+1] - coeffs[ind-1] < 1e-5){
        coeffs[ind] <- (coeffs[ind+1] + coeffs[ind-1]) / 2
      }else{
        coeffs[ind] <- coeffs[ind-1] + 1e-5
      }
    }
    ## putting the cofficients back to model (recalculate in fitted/linpred etc.)
    mod.ordgee <- fixmod(mod.ordgee, coeffs, Xmat)
    coeffs <- mod.ordgee$coefficients
  }

  xsmat <- smat(coeff = mod.ordgee$coefficients[1:categories1])

  # covariance matrices
  xhgmat <- hgmat_cov(mod = mod.ordgee, smat = xsmat,
                      X = Xmat, modtype = "gee", diffmeth = diffmeth,
                      alpha = alpha, corrmod = corr.mod, h = set.fit.opt[5])
  inv_hmat <- Matrix::solve(xhgmat$hmat)
  robust.var <- inv_hmat %*% xhgmat$gmat %*% inv_hmat

  inv_hmat <- Matrix::solve(xhgmat$hmat)
  robust.var <- inv_hmat %*% xhgmat$gmat %*% inv_hmat
  polycuts <- poly_robust.var <- NA

  rownames(robust.var) <- colnames(robust.var) <- var.names
  coeffs <- as.numeric(coeffs); names(coeffs) <- var.names
  coeffs[categories:length(coeffs)] <- -coeffs[categories:length(coeffs)]

  # output
  cpmgee.mod <- list(title = "CPM GEE",
                     call = call,
                     data = call[["data"]],
                     subjects = subjects,
                     formula = formula,
                     orig.formula = orig.formula,
                     corr.mod = rcorr.mod,
                     times = times,
                     categories = categories,
                     max.id = as.numeric(mod.ordgee$max.id),
                     id = as.numeric(mod.ordgee$id),
                     y = as.numeric(mod.ordgee$y),
                     linear.predictors = as.numeric(mod.ordgee$linear.predictors),
                     fitted.values = as.numeric(mod.ordgee$fitted.values),
                     coefficients = coeffs,
                     robust.var = as.matrix(robust.var),
                     alpha = as.numeric(alpha),
                     fit.opt = set.fit.opt)
}
YuqiTian35/cpmgee documentation built on Aug. 6, 2023, 4:30 a.m.