# R/misc_functions.R

# Miscellaneous functions

#' Sum of squares for Jackknife.
#'
#' \code{ss_jk} is an internal function called by \code{\link{jack_knife}}. It calculates the squared
#' difference of a numerical variable around a given value (for example, the mean).
#'
#' @param obs A numerical vector with no missing values (NA's).
#' @param stat The value of the statistic that is used as a reference.
#' @return The squared difference between a variable and a given value.
#' @examples
#' x <- rnorm(10, 170, 8)
#' x
#' mean(x)
#' ss_jk(x, mean(x))
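#'
#' ## Illustrative check: the squared differences add up to (n - 1) * var(x).
#' sum(ss_jk(x, mean(x)))
#' (length(x) - 1) * var(x)
#'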
#' jack_knife(x)
#' @export
ss_jk <- function(obs, stat) {
  n <- length(obs)
  ss <- numeric(n)
  for (i in 1:n) ss[i] <- ((obs[i] - stat)^2)
  ss
}

#' Ranks leverage observations from Jackknife method.
#'
#' \code{jack_knife} ranks the squared differences between the means from a Jackknife analysis
#' (the arithmetic mean estimated by removing one observation at a time) and the original mean.
#'
#' @param x A numeric variable. Missing values are removed by default.
#' @return Data frame with the ranked squared differences.
#' @seealso \code{\link{rank_leverage}}.
#' @examples
#' x <- rnorm(10, 170, 8)
#' x
#' mean(x)
#' jack_knife(x)
#'
#' x <- rnorm(100, 170, 8)
#' mean(x)
#' head(jack_knife(x))
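#'
#' ## Illustrative check: the top-ranked observation should be the one farthest from the mean.
#' jack_knife(x)$Observation[1]
#' which.max(abs(x - mean(x)))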
#' @export
jack_knife <- function(x) {
  jk.means <- knife_mean(x)
  mu <- mean(x, na.rm = TRUE)
  jk.ss <- ss_jk(jk.means, mu)
  jk.ord <- order(jk.ss, decreasing = TRUE)
  Observation <- jk.ord
  SS <- jk.ss[jk.ord]
  Mean <- jk.means[jk.ord]
  res <- data.frame(Observation, Value = x[jk.ord], Mean, SS)
  rownames(res) <- NULL
  res
}

#' Leverage.
#'
#' \code{leverage} is an internal function called by \code{\link{rank_leverage}}.
#'
#' Estimates the leverage of each observation around the arithmetic mean.
#'
#' @param x A numeric variable. Missing values are removed by default.
#' @return A vector with the leverage of each observation.
#' @examples
#' x <- rnorm(10, 170, 8)
#' x
#' mean(x)
#' leverage(x)
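#'
#' ## Illustrative check: these are the hat values from a simple linear model with
#' ## x as the single predictor, so they should sum to 2 (the number of parameters).
#' sum(leverage(x))
#'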
#' rank_leverage(x)
#' @export
leverage <- function(x) {
  inv <- 1 / length(x)
  ssx <- (x - mean(x, na.rm = TRUE))^2
  ssx2 <- sum(ssx)
  lev <- inv + ssx / ssx2
  lev
}

#' Ranks observations by leverage.
#'
#' \code{rank_leverage} ranks observations by their leverage (influence) on the arithmetic mean.
#'
#' @param x A numeric variable. Missing values are removed by default.
#' @return A data frame ranking observations by their leverage around the mean.
#' @seealso \code{\link{jack_knife}}.
#' @examples
#' x <- rnorm(10, 170, 8)
#' x
#' mean(x)
#' rank_leverage(x)
#'
#' x <- rnorm(100, 170, 8)
#' mean(x)
#' head(rank_leverage(x))
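#'
#' ## As with jack_knife(), the top-ranked observation should be the one farthest from the mean.
#' which.max(abs(x - mean(x)))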
#' @export
rank_leverage <- function(x) {
  lev <- leverage(x)
  lev.ord <- order(lev, decreasing = TRUE)
  Observation <- lev.ord
  Leverage <- lev[lev.ord]
  res <- data.frame(Observation, Leverage)
  res
}


#' Given y solve for x in a simple linear model.
#'
#' \code{predict_inv} calculates the value of the predictor x that generates the value y under a simple linear model.
#'
#' @param model A simple linear model object (class lm).
#' @param y A numerical scalar, the value of the outcome for which we want to calculate the predictor x.
#' @return The estimated value of the predictor.
#' @examples
#' ## Spectrophotometry example. Titration curve for riboflavin (nmol/ml). The sample has an absorbance
#' ## of 1.15. The aim is to estimate the concentration of riboflavin in the sample.
#'
#' Riboflavin <- seq(0, 80, 10)
#' OD <- 0.0125 * Riboflavin + rnorm(9, 0.6, 0.03)
#' titration <- data.frame(Riboflavin, OD)
#'
#' require(sjlabelled, quietly = TRUE)
#' titration <- titration %>%
#'   var_labels(
#'     Riboflavin = "Riboflavin (nmol/ml)",
#'     OD = "Optical density"
#'   )
#'
#' titration %>%
#'   gf_point(OD ~ Riboflavin) %>%
#'   gf_smooth(col = "indianred3", se = TRUE, lwd = 0.5, method = "loess")
#'
#' ## Model with intercept different from zero:
#' model <- lm(OD ~ Riboflavin, data = titration)
#' glm_coef(model)
#'
#' predict_inv(model, 1.15)
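#'
#' ## Sanity check (illustrative): predicting at the estimated concentration
#' ## should give back an optical density close to 1.15.
#' predict(model, data.frame(Riboflavin = predict_inv(model, 1.15)))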
#' @export
predict_inv <- function(model, y) {
  param <- length(coef(model))
  m <- ifelse(param == 1, as.numeric(coef(model)), coef(model)[2])
  b <- ifelse(param == 1, 0, coef(model)[1])
  x <- (y - b) / m
  names(x) <- names(m)
  x
}

#' Jackknife for means.
#'
#' \code{knife_mean} is an internal function that calculates arithmetic means by removing one observation
#' at a time.
#'
#' @param x A numerical variable. Missing values are removed for the mean calculation.
#' @return A vector with the mean calculations.
#' @examples
#' x <- rnorm(10, 170, 8)
#' x
#' mean(x)
#' knife_mean(x)
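#'
#' ## Illustrative check: the leave-one-out means should average back to the overall mean.
#' mean(knife_mean(x))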
#' @export
knife_mean <- function(x) {
  n <- length(x)
  jk <- numeric(n)
  for (i in 1:n) jk[i] <- mean(x[-i], na.rm = TRUE)
  jk
}

#' Relative and Cumulative Frequency.
#'
#' \code{freq_cont} tabulates a continuous variable by given classes.
#'
#' @param x A numerical (continuous) variable. Ideally, relatively long (greater than 100 observations).
#' @param bks Breaks defining the classes (see example).
#' @param dg Number of digits for rounding (default = 2).
#' @return A data frame with the classes, the mid-point, the frequencies, the relative and cumulative frequencies.
#' @examples
#' data(IgM, package = "ISwR")
#' Ab <- data.frame(IgM)
#' estat(~IgM, data = Ab)
#' freq_cont(IgM, seq(0, 4.5, 0.5))
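#'
#' ## Illustrative check: the relative frequencies should add up to 1 (up to rounding).
#' sum(freq_cont(IgM, seq(0, 4.5, 0.5))$rel.freq)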
#' @export
freq_cont <- function(x, bks, dg = 2) {
  cx <- cut(x, breaks = bks)
  n <- length(x)
  x.tab <- as.data.frame(table(cx))
  rel <- x.tab$Freq / n
  n2 <- length(rel)
  mids <- numeric(n2)
  cum <- cumsum(rel)
  for (i in 1:n2) mids[i] <- mean(c(bks[i], bks[i + 1]))
  rel <- round(rel, digits = dg)
  cum <- round(cum, digits = dg)
  res <- data.frame(x.tab[, 1], mids, x.tab[, 2], rel, cum)
  names(res) <- c("Class", "Mids", "Counts", "rel.freq", "cum.freq")
  res
}

#' Goodness of fit for Logistic Regression.
#'
#' \code{logistic_gof} performs the Hosmer and Lemeshow test for the goodness of fit of a logistic
#' regression model. This function is adapted from \code{residuals.lrm} in package \code{rms}.
#'
#' @author Frank Harrell, Vanderbilt University <f.harrell@vanderbilt.edu>
#' @param model A logistic regression model object.
#' @references Hosmer DW, Hosmer T, Lemeshow S, le Cessie S, Lemeshow S. A comparison of goodness-of-fit
#' tests for the logistic regression model. Stat in Med 16:965–980, 1997.
#' @examples
#' data(diet, package = "Epi")
#' model <- glm(chd ~ fibre, data = diet, family = binomial)
#' glm_coef(model, labels = c("Constant", "Fibre intake (g/day)"))
#' logistic_gof(model)
#' @export
logistic_gof <- function(model) {
  L <- model$linear.predictors
  if (length(L) == 0) {
    stop("you did not use linear.predictors=TRUE for the fit")
  }
  cof <- model$coefficients
  P <- 1 / (1 + exp(-L))
  Y <- model$y
  rnam <- names(Y)
  cnam <- names(cof)
  if (!is.factor(Y)) {
    Y <- factor(Y)
  }
  lev <- levels(Y)
  Y <- unclass(Y) - 1
  X <- glm(model$formula, family = binomial, x = TRUE, data = model$data)$x
  y <- Y >= 1
  p <- P
  sse <- sum((y - p)^2)
  wt <- p * (1 - p)
  d <- 1 - 2 * p
  z <- lm.wfit(X, d, wt)
  res <- z$residuals * sqrt(z$weights)
  sd <- sqrt(sum(res^2))
  ev <- sum(wt)
  z <- (sse - ev) / sd
  P <- 2 * (1 - pnorm(abs(z)))
  stats <- data.frame(sse, ev, sd, z, P)
  names(stats) <- c(
    "Sum of squared errors", "Expected value|H0",
    "SD", "Z", "P"
  )
  return(drop(stats))
}

#' Coefficient of determination.
#'
#' \code{coef_det} estimates the coefficient of determination (r-squared) from fitted (predicted) and
#' observed values. Outcome from the model is assumed to be numerical.
#' @param obs Vector with observed values (numerical outcome).
#' @param fit Vector with fitted (predicted) values.
#' @return A scalar, the coefficient of determination (r-squared).
#' @examples
#' ## Linear regression:
#' Riboflavin <- seq(0, 80, 10)
#' OD <- 0.0125 * Riboflavin + rnorm(9, 0.6, 0.03)
#' titration <- data.frame(Riboflavin, OD)
#' model1 <- lm(OD ~ Riboflavin, data = titration)
#' summary(model1)
#' coef_det(titration$OD, fitted(model1))
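#'
#' ## For a linear model, this should agree with the R-squared reported by summary():
#' summary(model1)$r.squared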
#'
#' ## Non-linear regression:
#' library(nlme, quietly = TRUE)
#' data(Puromycin)
#' mm.tx <- gnls(rate ~ SSmicmen(conc, Vm, K),
#'   data = Puromycin,
#'   subset = state == "treated"
#' )
#' summary(mm.tx)
#' coef_det(Puromycin$rate[1:12], mm.tx$fitted)
#' @export
coef_det <- function(obs, fit) {
  obm <- mean(obs, na.rm = TRUE)
  SSres <- sum((obs - fit)^2, na.rm = TRUE)
  SStot <- sum((obs - obm)^2, na.rm = TRUE)
  res <- 1 - (SSres / SStot)
  res
}

#' Ranks observations based upon influence measures on models.
#'
#' \code{rank_influence} calculates influence measures of each data observation on models and then ranks them.
#'
#' @param model A generalised linear model object.
#' @details \code{rank_influence} is a wrapper function that calls \code{\link[stats]{influence.measures}}, ranks observations by
#' their significant influence on the model and displays the 10 most influential observations
#' (if they are significant).
#' @seealso \code{\link[stats]{influence.measures}}.
#' @examples
#' data(diet, package = "Epi")
#' model <- glm(chd ~ fibre, data = diet, family = binomial)
#' rank_influence(model)
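#'
#' ## For comparison (illustrative), the underlying matrix of influence diagnostics:
#' head(influence.measures(model)$infmat)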
#' @export
rank_influence <- function(model) {
  infl <- influence.measures(model)
  mat <- (infl$is.inf) * 1
  signifa <- sort(apply(mat, 1, sum), decreasing = TRUE)
  mat.sum <- signifa[signifa > 0]
  mat.names <- as.numeric(names(mat.sum))
  pos <- match(mat.names, row.names(infl$infmat), nomatch = 0)
  if (length(pos) >= 10) {
    pos2 <- pos[1:10]
    signifb <- as.vector(mat.sum)[1:10]
    res <- data.frame(infl$infmat[pos2, ], signif = signifb)
  }
  if (length(pos) < 10 & length(pos) > 0) {
    signifb <- as.vector(mat.sum)
    res <- data.frame(infl$infmat[pos, ], signif = signifb)
  }
  if (length(pos) == 0) {
    res <- "No significant influence measures"
  }
  res
}

#' Inverse of the logit
#'
#' \code{inv_logit} calculates the inverse of the logit (the probability in logistic regression).
#'
#' @param x Numerical value used to compute the inverse of the logit.
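#' @return The probability on the response scale, i.e. exp(x) / (1 + exp(x)).
#' @examples
#' ## Illustrative check: the inverse logit is the standard logistic function,
#' ## so it should agree with stats::plogis().
#' inv_logit(0)
#' all.equal(inv_logit(c(-2, 0, 2)), plogis(c(-2, 0, 2)))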
#' @export
inv_logit <- function(x) {
  if (any(omit <- is.na(x))) {
    lv <- x
    lv[omit] <- NA
    if (any(!omit)) {
      lv[!omit] <- Recall(x[!omit])
    }
    return(lv)
  }
  exp(x) / (1 + exp(x))
}

#' Pseudo R2 (logistic regression).
#'
#' \code{pseudo_r2} calculates R2 analogues (pseudo R2) for logistic regression models.
#'
#' @param model A logistic regression model.
#' @details \code{pseudo_r2} calculates three pseudo R2 measures for logistic regression models: 1) Nagelkerke, 2) Cox and Snell, 3) Hosmer and Lemeshow.
#' @return A data frame with the calculated pseudo R2 values.
#' @examples
#' data(Oncho)
#' model_oncho <- glm(mf ~ area, data = Oncho, binomial)
#' glm_coef(model_oncho, labels = c("Constant", "Area (rainforest/savannah)"))
#' pseudo_r2(model_oncho)
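#'
#' ## Illustrative check of the Cox and Snell value, computed directly from the deviances:
#' with(model_oncho, 1 - exp(-(null.deviance - deviance) / length(fitted.values)))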
#' @export
pseudo_r2 <- function(model) {
  if (!(model$family$family == "binomial" && (model$family$link ==
    "logit" || model$family$link == "probit"))) {
    stop("No logistic regression model, no pseudo R^2 computed.")
  }
  dev <- model$deviance
  null_dev <- model$null.deviance
  model_n <- length(model$fitted)
  r_hl <- 1 - dev / null_dev
  r_cs <- 1 - exp(-(null_dev - dev) / model_n)
  r_n <- r_cs / (1 - (exp(-(null_dev / model_n))))
  Index <- c("Nagelkerke", "Hosmer and Lemeshow", "Cox and Snell")
  Estimate <- round(c(r_n, r_hl, r_cs), 3)
  res <- data.frame(Index, Estimate)
  res
}

#' A theme for huxtables.
#'
#' \code{theme_pubh} quickly sets a default style for a huxtable.
#' @param ht A huxtable object.
#' @param rw A numeric vector with the rows on which a bottom border is desired.
#' @details \code{theme_pubh} is a variation of \code{theme_article} with the added flexibility
#' of adding a bottom border line at desired row numbers.
#' @examples
#' require(dplyr, quietly = TRUE)
#' data(Oncho)
#'
#' Oncho %>%
#'   select(area, mf) %>%
#'   cross_tbl(by = "area") %>%
#'   theme_pubh(2)
#'
#' data(Bernard)
#'
#' t1 <- estat(~ apache | fate, data = Bernard)
#' t2 <- estat(~ o2del | fate, data = Bernard)
#' rbind(t1, t2) %>%
#'   as_hux() %>%
#'   theme_pubh(c(1, 3))
#' @export
theme_pubh <- function(ht, rw = 1) {
  ht <- huxtable::set_all_borders(ht, 0)
  ht <- huxtable::set_top_border(ht, 1, huxtable::everywhere)
  ht <- huxtable::set_bottom_border(ht, huxtable::final(1), huxtable::everywhere)
  ht <- huxtable::set_bottom_border(ht, rw, huxtable::everywhere)
  ht <- huxtable::style_header_rows(ht, bold = TRUE)
  ht
}

#' Cosmetics for summary tables.
#'
#' \code{cosm_sum} adds some cosmetics to tables of descriptive statistics generated by \code{\link[gtsummary]{tbl_summary}}.
#' @param gt_tbl A table object generated by \code{\link[gtsummary]{tbl_summary}}.
#' @param pad Numerical, padding above and below rows.
#' @param bold Display labels in bold?
#' @param head_label Character, label to be used as head for the variable's column.
#' @details Function \code{cosm_sum} adds some cosmetics to tables generated by \code{\link[gtsummary]{tbl_summary}}, then converts the table as a \code{\link[huxtable]{huxtable}} and sets proper alignment.
#' @return A \code{\link[huxtable]{huxtable}}.
#' @examples
#' require(dplyr, quietly = TRUE)
#'
#' data(Oncho)
#' Oncho %>%
#'   select(-id) %>%
#'   tbl_summary() %>%
#'   cosm_sum(bold = TRUE) %>%
#'   theme_pubh(1)
#' @export
cosm_sum <- function(gt_tbl, pad = 3, bold = FALSE, head_label = "**Variable**") {
  tbl <- gt_tbl %>%
    gtsummary::modify_header(label ~ head_label) %>%
    gtsummary::modify_footnote(everything() ~ NA)

  if (bold) {
    tbl <- tbl %>% gtsummary::bold_labels()
  }

  tbl <- tbl %>%
    gtsummary::as_hux_table() %>%
    huxtable::set_align(everywhere, -1, "right") %>%
    huxtable::set_top_padding(pad) %>%
    huxtable::set_bottom_padding(pad)

  tbl
}

#' Cosmetics for tables of regression coefficients.
#'
#' \code{cosm_reg} converts tables generated by \code{\link[gtsummary]{tbl_regression}} to \code{\link[huxtable]{huxtable}} and adds some cosmetics.
#' @param gt_tbl A table object generated by \code{\link[gtsummary]{tbl_regression}}.
#' @param pad Numerical, padding above and below rows.
#' @param type Type of ANOVA used to calculate global p-values (passed to \code{\link[gtsummary]{add_global_p}}); set to \code{NULL} to skip global p-values.
#' @param bold Display labels in bold?
#' @param head_label Character, label to be used as head for the variable's column.
#' @return A \code{\link[huxtable]{huxtable}}.
#' @examples
#' require(sjlabelled, quietly = TRUE)
#'
#' data(diet, package = "Epi")
#' diet <- diet %>%
#'   var_labels(
#'     chd = "Coronary Heart Disease",
#'     fibre = "Fibre intake (g/day)"
#'   )
#'
#' model_binom <- glm(chd ~ fibre, data = diet, family = binomial)
#'
#' model_binom %>%
#'   tbl_regression(exponentiate = TRUE) %>%
#'   cosm_reg(bold = TRUE) %>%
#'   theme_pubh(1) %>%
#'   add_footnote(get_r2(model_binom), font_size = 9)
#'
#' data(birthwt, package = "MASS")
#' birthwt <- birthwt %>%
#'   mutate(
#'     smoke = factor(smoke, labels = c("Non-smoker", "Smoker")),
#'     race = factor(race, labels = c("White", "African American", "Other"))
#'   ) %>%
#'   var_labels(
#'     bwt = "Birth weight (g)",
#'     smoke = "Smoking status",
#'     race = "Race"
#'   )
#'
#' model_norm <- lm(bwt ~ smoke + race, data = birthwt)
#'
#' model_norm %>%
#'   tbl_regression() %>%
#'   cosm_reg(bold = TRUE) %>%
#'   theme_pubh(1) %>%
#'   add_footnote(get_r2(model_norm), font_size = 9)
#' @export
cosm_reg <- function(gt_tbl, pad = 3, type = 3, bold = TRUE, head_label = "**Variable**") {
  tbl0 <- gt_tbl %>%
    gtsummary::modify_header(label ~ head_label) %>%
    gtsummary::modify_footnote(everything() ~ NA, abbreviation = TRUE)

  if (is.null(type)) {
    tbl <- tbl0
  } else {
    tbl <- tbl0 %>% gtsummary::add_global_p(type = type, quiet = TRUE)
  }

  if (bold) {
    tbl <- tbl %>% gtsummary::bold_labels()
  }

  tbl <- tbl %>%
    gtsummary::as_hux_table() %>%
    huxtable::set_align(everywhere, -1, "right") %>%
    huxtable::set_top_padding(pad) %>%
    huxtable::set_bottom_padding(pad)

  tbl
}