R/rm_uvsum2.R

Defines functions rm_uvsum

Documented in rm_uvsum

#'Output several univariate models nicely in a single table
#'
#' #'A table with the model parameters from running separate univariate models on
#'each covariate. For factors with more than two levels a Global p-value is
#'returned.
#'
#'Global p-values are likelihood ratio tests for lm, glm and polr models. For
#'lme models an attempt is made to re-fit the model using ML and if,successful
#'LRT is used to obtain a global p-value. For coxph models the model is re-run
#'without robust variances with and without each variable and a LRT is
#'presented. If unsuccessful a Wald p-value is returned. For GEE and CRR models
#'Wald global p-values are returned.
#'
#'As of version 0.1.1 if global p-values are requested they will be included in
#'the p-value column.
#'
#'The number of decimals places to display the statistics can be changed with
#'digits, but this will not change the display of p-values. If more significant
#'digits are required for p-values then use tableOnly=TRUE and format as
#'desired.
#'
#'tidyselect can only be used for response and covs variables. Additional
#'arguments must be passed in using characters
#'
#'@param response string vector with name of response
#'@param covs character vector with the names of columns to fit univariate
#'  models to
#'@param data dataframe containing data
#'@param digits number of digits to round estimates and CI to. Does not affect
#'  p-values.
#'@param covTitle character with the names of the covariate (predictor) column.
#'  The default is to leave this empty for output or, for table only output to
#'  use the column name 'Covariate'.
#'@param caption character containing table caption (default is no caption)
#'@param tableOnly boolean indicating if unformatted table should be returned
#'@param removeInf boolean indicating if infinite estimates should be removed
#'  from the table
#'@param p.adjust p-adjustments to be performed. Uses the [p.adjust] function
#'  from base R
#'@param unformattedp boolean indicating if you would like the p-value to be
#'  returned unformatted (ie not rounded or prefixed with '<'). Should be used
#'  in conjunction with the digits argument.
#'@param whichp string indicating whether you want to display p-values for
#'  levels within categorical data ("levels"), global p values ("global"), or
#'  both ("both"). Irrelevant for continuous predictors.
#'@param chunk_label only used if output is to Word to allow cross-referencing
#'@param  gee boolean indicating if gee models should be fit to account for
#'  correlated observations. If TRUE then the id argument must specify the
#'  column in the data which indicates the correlated clusters.
#'@param id character vector which identifies clusters. Only used for geeglm
#'@param corstr character string specifying the correlation structure. Only used
#'  for geeglm. The following are permitted: '"independence"', '"exchangeable"',
#'  '"ar1"', '"unstructured"' and '"userdefined"'
#'@param family description of the error distribution and link function to be
#'  used in the model. Only used for geeglm
#'@param type string indicating the type of univariate model to fit. The
#'  function will try and guess what type you want based on your response. If
#'  you want to override this you can manually specify the type. Options include
#'  "linear", "logistic", "poisson",coxph", "crr", "boxcox", "ordinal", "geeglm"
#'@param offset string specifying the offset term to be used for Poisson or
#'  negative binomial regression. Example: offset="log(follow_up)"
#'@param strata character vector of covariates to stratify by. Only used for
#'  coxph and crr
#'@param nicenames boolean indicating if you want to replace . and _ in strings
#'  with a space
#'@param showN boolean indicating if you want to show sample sizes
#'@param showEvent boolean indicating if you want to show number of events. Only
#'  available for logistic.
#'@param CIwidth width of confidence interval, default is 0.95
#'@param reflevel manual specification of the reference level. Only used for
#'  ordinal regression This will allow you to see which model is not fitting if
#'  the function throws an error
#'@param returnModels boolean indicating if a list of fitted models should be
#'  returned. If this is TRUE then the models will be returned, but the output
#'  will be suppressed. In addition to the model elements a data element will be
#'  appended to each model so that the fitted data can be examined, if
#'  necessary. See Details
#'@param fontsize PDF/HTML output only, manually set the table fontsize
#'@param forceWald `r lifecycle::badge("deprecated")` `forceWald = TRUE` is no
#'  longer supported; this function will always use profile likelihoods as per
#'  the inclusion of the MASS confidence intervals into base from from R 4.4.0
#'@seealso
#'\code{\link{lm}},\code{\link{glm}},
#'\code{\link[cmprsk:crr]{cmprsk::crr}},
#'\code{\link[survival:coxph]{survival::coxph}},
#'\code{\link[nlme:lme]{nlme::lme}},
#'\code{\link[geepack:geeglm]{geepack::geeglm}},
#'\code{\link[MASS:glm.nb]{MASS::glm.nb}}
#'@return A character vector of the table source code, unless tableOnly=TRUE in
#'  which case a data frame is returned
#'@export
#' @examples
#' # Examples are for demonstration and are not meaningful
#' # Coxph model with 90% CI
#' data("pembrolizumab")
#' rm_uvsum(response = c('os_time','os_status'),
#' covs=c('age','sex','baseline_ctdna','l_size','change_ctdna_group'),
#' data=pembrolizumab,CIwidth=.9)
#'
#' # Linear model with default 95% CI
#' rm_uvsum(response = 'baseline_ctdna',
#' covs=c('age','sex','l_size','pdl1','tmb'),
#' data=pembrolizumab)
#'
#' # Logistic model with default 95% CI
#' rm_uvsum(response = 'os_status',
#' covs=c('age','sex','l_size','pdl1','tmb'),
#' data=pembrolizumab,family = binomial)

#' # Poisson models returned as model list
#' mList <- rm_uvsum(response = 'baseline_ctdna',
#' covs=c('age','sex','l_size','pdl1','tmb'),
#' data=pembrolizumab, returnModels=TRUE)
#' #'
#' # GEE on correlated outcomes
#' data("ctDNA")
#' rm_uvsum(response = 'size_change',
#' covs=c('time','ctdna_status'),
#' gee=TRUE,
#' id='id', corstr="exchangeable",
#' family=gaussian("identity"),
#' data=ctDNA,showN=TRUE)
#'
#' # Using tidyselect
#' pembrolizumab |> rm_uvsum(response = sex,
#' covs = c(age, cohort))
rm_uvsum <- function(response, covs , data , digits=getOption("reportRmd.digits",2),
                     covTitle='',caption=NULL,
                     tableOnly=FALSE,removeInf=FALSE,p.adjust='none',unformattedp=FALSE,
                     whichp=c("levels","global","both"),
                     chunk_label,
                     gee=FALSE,id = NULL,corstr = NULL,family = NULL,type = NULL,
                     offset=NULL,
                     strata = 1,
                     nicenames = TRUE,showN=TRUE,showEvent=TRUE,CIwidth = 0.95,
                     reflevel=NULL,returnModels=FALSE,fontsize,
                     forceWald = FALSE){

  response_var <- tidyselect::eval_select(expr = enquo(response), data = data[unique(names(data))],
                                          allow_rename = FALSE)
  response <- names(response_var)
  x_vars <- tidyselect::eval_select(expr = enquo(covs), data = data[unique(names(data))],
                                    allow_rename = FALSE)
  x_vars <- names(x_vars)

  if (missing(data)) stop('data is a required argument')
  if (missing(covs)) stop('covs is a required argument') else covs <- unique(x_vars)
  if (missing(response)) stop('response is a required argument')

  if (!all(response %in% names(data))) stop("response is not a variable in data")
  if (!all(covs %in% names(data))) stop(paste("The following covs not found in data:",setdiff(covs,names(data))))

  argList <- as.list(match.call()[-1])

  whichp <- match.arg(whichp)
  # checks for id and type
  if (!is.null(id) & (length(id) != 1)) {
    stop('id should be specified as a variable name')
  }
  if (length(type) == 1) {
    if (type == "gee") stop('type == "gee" is not a valid argument; specify gee = TRUE instead')
  }
  empty <- NULL
  if ("" %in% c(strata, type, offset, id)) {
    args <- list(strata = strata, type = type, offset = offset, id = id)
    empty <- names(args)[which(args == "")]
    for (var in empty) {
      assign(var, formals()[[var]])
    }
    warning(paste0("empty string arguments "), paste(empty, collapse = ", "), " will be ignored")
  }

  if (length(response)>2) stop('The response must be a single outcome for linear, logistic and ordinal models or must specify the time and event status variables for survival models.')
  if (!inherits(data,'data.frame')) stop('data must be supplied as a data frame.')
  # if (!inherits(covs,'character')) stop('covs must be supplied as a character vector or string indicating variables in data')
  missing_vars = na.omit(setdiff(c(response, covs,id,ifelse(strata==1,NA,strata)), names(data)))
  if (length(missing_vars) > 0) stop(paste("These variables are not in the data:\n",
                                           paste0(missing_vars,collapse=csep())))
  if (is.null(strata)) {
    strata <- formals()[["strata"]]
    argList[["strata"]] <- formals()[["strata"]]
  }
  if (is.na(strata)) {
    strata <- formals()[["strata"]]
    argList[["strata"]] <- formals()[["strata"]]
  }
  if (strata==1) nm <- c(response,covs) else nm <- na.omit(c(strata,response,covs))
  if (!all(names(data[,nm])==names(data.frame(data[,nm])))) stop('Non-standard variable names detected.\n Try converting data with new_data <- data.frame(data) \n then use new variable names in rm_uvsum.' )

  argList$covs <- x_vars
  argList$response <- response
  if ("tableOnly" %in% names(argList)) {
    argList[["tableOnly"]] <- NULL
  }
  if ("caption" %in% names(argList)) argList[["caption"]] <- NULL
  if (!is.null(empty)) {
    for (var in empty) {
      if (is.null(eval(as.name(var)))) {
        argList[var] <- list(NULL)
      }
      else {
        argList[[var]] <- eval(as.name(var))
      }
    }
  }

  nomiss <- nrow(na.omit(data[,response,drop=FALSE]))
  if (nrow(data)!=nomiss) warning(paste("Cases with missing response data have been removed.\n",
                                        nrow(data)-nomiss,"case(s) removed."))
  for (v in covs) {
    if (inherits(data[[v]], c("character", "ordered"))) data[[v]] <- factor(data[[v]], ordered = FALSE)
    if (inherits(data[[v]],c('Date','POSIXt'))) {
      covs <- setdiff(covs,v)
      message(paste('Dates can not be used as predictors, try creating a time variable.\n The variable',v,'does not appear in the table.'))
    }

    df <- na.omit(data[,c(response,v)])
    if (v %in% response){
      warning(paste(v,'is the response and can not appear in the covariate list.\n',
                    'It is omitted from the output.'))
      covs <- setdiff(covs,v)
    }
    if (length(unique(df[[v]]))==1) {
      warning(paste(v,'has only one unique value for non-missing response.\n',
                    'It is omitted from the output.'))
      covs <- setdiff(covs,v)
    }
  }
  if (is.null(strata))
    if (is.na(strata)) assign(argList[["strata"]], formals()[["strata"]])

  # remove arguments not used by uvsum2
  valid_args <- names(formals(uvsum2))
  argList <- argList[names(argList) %in% valid_args]
  argList$data <- eval(data)
  # get the table
  tab <- do.call(uvsum2,argList)
  # If user specifies return models, don't format a table, just return a list of models
  if (returnModels) return (tab$models)
  to_indent <- attr(tab, "to_indent")
  bold_cells <- attr(tab, "bold_cells")
  att_tab <- attributes(tab)

  pv <- format_bold_pvalues(tab, bold_cells,
                            unformattedp = unformattedp, p.adjust = p.adjust)
  tab <- pv$tab; bold_cells <- pv$bold_cells

  names(tab)[1] <- covTitle
  lbl <- tab[, 1]
  if (nicenames) {
    tab[, 1] <- replaceLbl(data, lbl)
  }
  argL <- list(tab=tab, digits = digits,
               to_indent=to_indent,bold_cells=bold_cells,
               caption=caption)


  # Add attributes if returning a table
  for (a in setdiff(names(att_tab),names(attributes(tab)))) attr(tab,a) <- att_tab[[a]]
  if (tableOnly){
    if (names(tab)[1]=='') names(tab)[1]<- 'Covariate'
    attr(tab,"data call") <- deparse1(argList$data)
    attr(tab, 'to_indent') <- to_indent
    attr(tab,'bold_cells') <- bold_cells
    attr(tab,'dimchk') <- dim(tab)
    return(tab)
  }
  do.call(outTable, argL)

}

uvsum2 <- function (response, covs, data, digits=getOption("reportRmd.digits",2),id = NULL, corstr = NULL, family = NULL,
                    type = NULL, offset=NULL, gee=FALSE,strata = 1, nicenames = TRUE,
                    showN = TRUE, showEvent = TRUE, CIwidth = 0.95, reflevel=NULL,returnModels=FALSE, whichp="levels")
{
  argList <- as.list(match.call()[-1])
  if (inherits(data[[response[1]]],"character")) data[[response[1]]] <- factor(data[[response[1]]])
  if (!inherits(strata,"numeric")) {
    strataVar = strata
    strata <- sapply(strata, function(stra) {
      paste("strata(", stra, ")", sep = "")
    })
  }  else {
    strataVar <- ""
    strata <- ""
  }
  if (length(response)==1) {
    if (sum(is.na(data[[response]]))>0) message(paste(sum(is.na(data[[response]])),"observations with missing outcome removed."))
    data <- data[!is.na(data[[response]]), ]
  } else {
    if (sum(is.na(data[[response[1]]])|is.na(data[[response[2]]]))>0) message(paste(sum(is.na(data[[response[1]]])|is.na(data[[response[2]]])),"observations with missing outcome removed."))
    data <- data[!(is.na(data[[response[1]]]) | is.na(data[[response[2]]])), ]
  }
  # Set family if user specifies type
  if (!is.null(type)) {
    if (length(response)==2 & !(type %in% c('coxph','crr')))
      stop('Response can only be of length one for non-survival models.')
    if (type == "logistic") {
      if (is.null(family)) family='binomial'
    }
    else if (type == "poisson") {
      if (all(data[[response]]==as.integer(data[[response]]))){
        data[[response]]=as.integer(data[[response]])
      }
      else {
        stop('Poisson regression requires an integer response.')
      }
      if (is.null(family)) family='poisson'
    }
    else if (type == "negbin") {
      if (all(data[[response]]==as.integer(data[[response]]))){
        data[[response]]=as.integer(data[[response]])
      }
      else {
        stop('Negative binomial regression requires an integer response.')
      }
      if (!is.null(family)) message('For negative binomial regression currently only the log link is implemented.')
    }
    else if (type == "linear" | type == "boxcox") {
      if (is.null(family)) family='gaussian'
    }
    else if (type == "ordinal") {
      if (!inherits(data[[response[1]]],c("factor","ordered"))) {
        warning("Response variable is not a factor, will be converted to an ordered factor")
        data[[response]] <- factor(data[[response]],
                                   ordered = TRUE)
      }
      if (!is.null(reflevel)) {
        data[[response]] <- stats::relevel(data[[response]],
                                           ref = reflevel)
      }
    }
    else if (type %in% c("coxph", "crr")) {
      if (length(response)==1 )
        stop('Please specify two variables in the response for survival models. \nExample: response=c("time","status")')

    }
    else {
      stop("type must be either coxph, logistic, linear, poisson, negbin, boxcox, crr, ordinal (or NULL)")
    }
  }    else {
    if (length(response) == 2) {
      # Check that responses are numeric
      for (i in 1:2) if (!is.numeric(data[[response[i]]])) stop('Both response variables must be numeric')
      if (length(unique(na.omit(data[[response[2]]]))) < 3) {
        type <- "coxph"
      }
      else {
        type <- "crr"
      }
    } else if (length(unique(na.omit(data[[response]]))) == 2) {
      type <- "logistic"
      family="binomial"
    } else if (inherits(data[[response[1]]],"ordered")) {
      type <- "ordinal"
      if (!is.null(reflevel)) {
        data[[response]] <- stats::relevel(data[[response]],
                                           ref = reflevel)
      }
    } else if (inherits(data[[response[1]]],"integer")) {
      type <- "poisson"
      family="poisson"
    } else {
      if (!inherits(data[[response[1]]],"numeric")) stop('Response variable must be numeric')
      type <- "linear"
      beta <- "Estimate"
      family='gaussian'
    }
  }
  # Do some more model checking --------
  if (strata != "" & type != "coxph") {
    stop("strata can only be used with coxph")
  }
  if (!is.null(id)){
    if (!(gee | type =='coxph')) {
      warning('id argument will be ignored. This is used only for survival strata or clustering in GEE. To run a GEE model set gee=TRUE.')
    }
  }
  if (!is.null(offset) & !(type %in% c('poisson','negbin'))) {
    warning('Offset terms only used for Poisson and negative binomial regression.\nOffset term will be ignored.')
  }
  if (!is.null(corstr)){
    if (! (gee | type =='coxph')) {
      warning('id argument will be ignored. This is used only for survival strata or clustering in GEE. To run a GEE model set gee=TRUE.')
    }
  }
  if (!is.null(offset)){
    ovars <- unlist(strsplit(offset,"[^a-zA-Z_]"))
    if(length(intersect(names(data),ovars))==0){
      stop(paste('Variable names in the offset term contains special characters. \nPlease remove special characters, except "_" from the variable name and re-fit.\n',
                 'offset =',offset))
    } else ovars <- intersect(names(data),ovars)
  } else ovars <- NULL
  if (gee) {
    if (!type %in% c('linear','logistic','poisson')) {
      stop('GEE models currently only implemented for Poisson, logistic or linear regression.')
    }
    if (is.null(id)) {
      stop('The id argument must be set for gee models to indicate clusters.')
    }
    if (is.null(corstr)) {
      stop('You must provide correlation structure (i.e. corstr="independence") for GEE models.')
    }
  }

  # Assign the class to response using model registry --------
  model_info <- get_model_class(type = type, family = family, gee = gee)
  class(response) <- model_info$class
  modelList <- NULL
  for (cov in covs) {
    modelList[[cov]] <- autoreg(response, data, cov, id, strata, family, offset, corstr)
  }

  summaryList <- NULL
  summaryList <- lapply(modelList,m_summary,digits= digits, CIwidth=CIwidth, vif = FALSE,whichp=whichp, for_plot = FALSE)
  summaryList <- dplyr::bind_rows(summaryList)
  if (!showN) {
    summaryList[["N"]] <- NULL
  }
  if (!showEvent) {
    summaryList[["Event"]] <- NULL
  }
  to_indent <- which(!(summaryList[["Variable"]] %in% covs))
  bold_cells <- cbind(which(summaryList[["Variable"]] %in% covs), rep(1, length(which(summaryList[["Variable"]] %in% covs))))
  attr(summaryList, "to_indent") <- to_indent
  attr(summaryList, "bold_cells") <- bold_cells
  if (returnModels) return(list(summaryList,models=modelList)) else return(summaryList)
}

Try the reportRmd package in your browser

Any scripts or data that you put into this service are public.

reportRmd documentation built on March 20, 2026, 5:06 p.m.