##' Moving Estimates Using Overlapping Windows
##'
##' Function to compute moving averages and other statistics as a function
##' of a continuous variable, possibly stratified by other variables.
##' Estimates are made by creating overlapping moving windows and
##' computing the statistics defined in the stat function for each window.
##' The default method, `space='n'` creates varying-width intervals each having a sample size of `2*eps +1`, and the smooth estimates are made every `xinc` observations. Outer intervals are not symmetric in sample size (but the mean x in those intervals will reflect that) unless `eps=nignore`, as outer intervals are centered at observations `nignore` and `n - nignore + 1` where the default for `nignore` is 10. The mean x-variable within each windows is taken to represent that window. If `trans` and `itrans` are given, x means are computed on the `trans(x)` scale and then `itrans`'d. For `space='x'`, by default estimates are made on to the `nignore` smallest to the `nignore` largest
##' observed values of the x variable to avoid extrapolation and to
##' help getting the moving statistics off on an adequate start for
##' the left tail. Also by default the moving estimates are smoothed using `supsmu`.
##' When `melt=TRUE` you can feed the result into `ggplot` like this:
##' `ggplot(w, aes(x=age, y=crea, col=Type)) + geom_line() +`
##' `facet_wrap(~ Statistic)`
##'
##' See [here](https://hbiostat.org/rflow/analysis.html#sec-analysis-assoc) for several examples.
##'
##' @title movStats
##' @author Frank Harrell
##' @md
##' @param formula a formula with the analysis variable on the left and the x-variable on the right, following by optional stratification variables
##' @param stat function of one argument that returns a named list of computed values. Defaults to computing mean and quartiles + N except when y is binary in which case it computes moving proportions. If y has two columns the default statistics are Kaplan-Meier estimates of cumulative incidence at a vector of `times`.
##' @param discrete set to `TRUE` if x-axis variable is discrete and no intervals should be created for windows
##' @param space defines whether intervals used fixed width or fixed sample size
##' @param eps tolerance for window (half width of window). For `space='x'` is in data units, otherwise is the sample size for half the window, not counting the middle target point.
##' @param varyeps applies to `space='n'` and causes a smaller `eps` to be used in strata with fewer than `` observations so as to arrive at three x points
##' @param nignore see description, default is to exclude `nignore=10` points on the left and right tails from estimation and plotting
##' @param xinc increment in x to evaluate stats, default is xlim range/100 for `space='x'`. For `space='n'` `xinc` defaults to m observations, where m = max(n/200, 1).
##' @param xlim 2-vector of limits to evaluate if `space='x'` (default is `nignore` smallest to `nignore` largest)
##' @param times vector of times for evaluating one minus Kaplan-Meier estimates
##' @param tunits time units when `times` is given
##' @param msmooth set to `'smoothed'` or `'both'` to compute `lowess`-smooth moving estimates. `msmooth='both'` will display both. `'raw'` will display only the moving statistics. `msmooth='smoothed'` (the default) will display only he smoothed moving estimates.
##' @param tsmooth defaults to the super-smoother `'supsmu'` for after-moving smoothing. Use `tsmooth='lowess'` to instead use `lowess`.
##' @param bass the `supsmu` `bass` parameter used to smooth the moving statistics if `tsmooth='supsmu'`. The default of 8 represents quite heavy smoothing.
##' @param span the `lowess` `span` used to smooth the moving statistics
##' @param maxdim passed to `hare`, default is 6
##' @param penalty passed to `hare`, default is to use BIC. Specify 2 to use AIC.
##' @param trans transformation to apply to x
##' @param itrans inverse transformation
##' @param loess set to TRUE to also compute loess estimates
##' @param ols set to TRUE to include rcspline estimate of mean using ols
##' @param qreg set to TRUE to include quantile regression estimates w rcspline
##' @param lrm set to TRUE to include logistic regression estimates w rcspline
##' @param orm set to TRUE to include ordinal logistic regression estimates w rcspline (mean + quantiles in `tau`)
##' @param hare set to TRUE to include hazard regression estimtes of incidence at `times`, using the `polspline` package
##' @param lrm_args a `list` of optional arguments to pass to `lrm` when `lrm=TRUE`, e.g., `list(maxit=20)`
##' @param family link function for ordinal regression (see `rms::orm`)
##' @param k number of knots to use for ols and/or qreg rcspline
##' @param tau quantile numbers to estimate with quantile regression
##' @param melt set to TRUE to melt data table and derive Type and Statistic
##' @param data
##' @param pr defaults to no printing of window information. Use `pr='plain'` to print in the ordinary way, `pr='kable` to convert the object to `knitr::kable` and print, or `pr='margin'` to convert to `kable` and place in the `Quarto` right margin. For the latter two `results='asis'` must be in the chunk header.
##' @param data data.table or data.frame, default is calling frame
##' @return a data table, with attribute `infon` which is a data frame with rows corresponding to strata and columns `N`, `Wmean`, `Wmin`, `Wmax` if `stat` computed `N`. These summarize the number of observations used in the windows. If `varyeps=TRUE` there is an additional column `eps` with the computed per-stratum `eps`. When `space='n'` and `xinc` is not given, the computed `xinc` also appears as a column. An additional attribute `info` is a `kable` object ready for printing to describe the window characteristics.
##'
movStats <- function(formula, stat=NULL, discrete=FALSE,
space=c('n', 'x'),
eps =if(space=='n') 15, varyeps=FALSE, nignore=10,
xinc=NULL, xlim=NULL,
times=NULL, tunits='year',
msmooth=c('smoothed', 'raw', 'both'),
tsmooth=c('supsmu', 'lowess'),
bass=8, span=1/4, maxdim=6, penalty=NULL,
trans=function(x) x, itrans=function(x) x,
loess=FALSE,
ols=FALSE, qreg=FALSE, lrm=FALSE,
orm=FALSE, hare=FALSE,
lrm_args=NULL, family='logistic',
k=5, tau=(1:3)/4, melt=FALSE,
data=environment(formula),
pr=c('none', 'kable', 'plain', 'margin')) {
space <- match.arg(space)
msmooth <- match.arg(msmooth)
movlab <- 'Moving '
if(discrete) {
msmooth <- 'raw'
movlab <- ''
}
tsmooth <- match.arg(tsmooth)
pr <- match.arg(pr)
if(! requireNamespace('data.table', quietly=TRUE))
stop('data.table package must be installed')
if(ols || qreg || lrm)
if(! requireNamespace('rms', quietly=TRUE))
stop('rms package must be installed if using ols, qreg, or lrm')
if(pr %in% c('kable', 'margin'))
if(! requireNamespace('kableExtra', quietly=TRUE))
stop('must install kableExtra package if pr is kable or margin')
envi <- .GlobalEnv
if(! discrete) assign('.knots.', k, envir=envi)
mf <- model.frame(formula, data=data)
v <- names(mf)
Y <- mf[[1]]
sec <- NCOL(Y) == 2
if(sec && ! length(times))
stop('when dependent variable has two columns you must specify times')
if(sec && (loess || ols || qreg || lrm || orm))
stop('loess, ols, qreg, lrm, orm do not apply when dependent variable has two columns')
if(varyeps && space == 'x')
stop('varyeps applies only to space="n"')
if(sec) {
if(hare) if(! requireNamespace('polspline', quietly=TRUE))
stop('polspline package must be installed if hare=TRUE')
Y2 <- Y[, 2]
Y <- Y[, 1]
} else Y2 <- rep(1, nrow(mf))
X <- trans(mf[[2]])
bythere <- length(v) > 2
By <- if(bythere)
do.call(interaction, c(mf[3:length(mf)], list(sep='::')))
else
rep(1, length(X))
i <- is.na(X) | is.na(Y) | is.na(Y2) | is.na(By)
if(any(i)) {
i <- ! i
X <- X [i]
Y <- Y [i]
Y2 <- Y2[i]
By <- By[i]
}
ybin <- ! sec && all(Y %in% 0:1)
qformat <- function(x)
fcase(x == 0.05, 'P5', x == 0.1, 'P10',
x == 0.25, 'Q1', x == 0.5, 'Median', x == 0.75, 'Q3',
x == 0.9, 'P90', x == 0.95, 'P95',
default=as.character(x))
if(! length(stat))
stat <- if(ybin) function(y)
if(discrete) list(Proportion = mean(y), N= length(y)) else
list('Moving Proportion' = mean(y), N = length(y))
else if(sec)
function(y, y2) {
# km.quick is in Hmisc
z <- c(1. - km.quick(survival::Surv(y, y2), times), length(y))
names(z) <- c(paste0(movlab, times, '-', tunits), 'N')
as.list(z)
}
else
function(y) {
if(! length(y)) return(list(Mean=NA, Median=NA, Q1=NA, Q3=NA))
qu <- quantile(y, (1:3)/4)
z <- list('Mean' = mean(y),
'Median' = qu[2],
'Q1' = qu[1],
'Q3' = qu[3],
N=length(y))
names(z)[1:4] <- paste0(movlab, names(z)[1:4])
z
}
statx <- function(y, y2, x) {
s <- if(sec) stat(y, y2) else stat(y)
if(! discrete && space == 'n') s$.xmean. <- mean(x)
s
}
R <- NULL
Xinc <- xinc
uby <- if(is.factor(By)) levels(By) else sort(unique(By))
needxinc <- space == 'n' && ! length(Xinc)
info <- if(discrete) matrix(NA, nrow=length(uby), ncol=1,
dimnames=list(uby, 'N'))
else
matrix(NA,
nrow=length(uby),
ncol=5 + needxinc,
dimnames=list(uby, c('N', 'Wmean', 'Wmin', 'Wmax',
'eps', if(needxinc) 'xinc')))
if(discrete) xlev <- if(is.factor(X)) levels(X) else sort(unique(X))
for(by in uby) {
j <- By == by
if(sum(j) < nignore) {
warning(paste('Stratum', by, 'has <', nignore, 'observations and is ignored'))
next
}
x <- X [j]
y <- Y [j]
y2 <- Y2[j]
n <- length(x)
if(discrete) {
xv <- x
xseq <- xlev
}
else
switch(space,
x = {
xl <- xlim
if(! length(xl)) {
xs <- sort(x)
xl <- c(xs[nignore], xs[n - nignore + 1])
if(diff(xl) >= 25) xl <- round(xl)
}
xinc <- Xinc
if(! length(xinc)) xinc <- diff(xl) / 100.
xseq <- seq(xl[1], xl[2], by=xinc)
xv <- x
},
n = {
i <- order(x)
x <- x [i]
y <- y [i]
y2 <- y2[i]
xinc <- Xinc
if(! length(xinc)) {
xinc <- max(floor(n / 200.), 1)
info[by, 'xinc'] <- xinc
}
xseq <- seq(nignore, n - nignore + 1, by=xinc)
xv <- 1 : n
} )
ep <- eps
if(space == 'n' && varyeps) {
## Requirement to have >= 3 x-points; new eps=h
## First estimate at observation min(xseq), last at max(xseq)
## First window goes to min(xseq) + h on right
## Last window goes to max(xseq) - h on left
## Need difference in these two to be >= 2*xinc
lowesteps <- floor((max(xseq) - min(xseq) - 2 * xinc) / 2.)
if(lowesteps < eps) ep <- lowesteps
info[by, 'eps'] <- ep
}
if(discrete) m <- data.table::data.table(x, y, y2, tx=xv)
else {
s <- data.table::data.table(x, y, y2, xv)
a <- data.table::data.table(tx=xseq, key='tx') # target xs for estimation
a[, .q(lo, hi) := .(tx - ep, tx + ep)] # define all windows
m <- a[s, on=.(lo <= xv, hi >= xv)] # non-equi join
}
data.table::setkey(m, tx)
## Non-equi join adds observations tx=NA
m <- m[! is.na(tx), ]
w <- m[, statx(y, y2, x), by=tx]
info[by, 'N'] <- n
if('N' %in% names(w)) {
N <- w[, N]
if(! discrete) info[by, 2:4] <- c(round(mean(N), 1), min(N), max(N))
}
if(! discrete && space == 'n') {
w[, tx := .xmean.]
w[, .xmean. := NULL ]
}
if(msmooth != 'raw') {
computed <- setdiff(names(w), c('tx', 'N'))
for(vv in computed) {
smoothed <- switch(tsmooth,
lowess = lowess(w[, tx], w[[vv]], f =span),
supsmu = supsmu(w[, tx], w[[vv]], bass=bass))
if(length(smoothed$x) < 2)
stop(paste0('Only 1 x point for stratum ', by,
' with ', n,
' observations. Consider specifying varyeps=TRUE.'))
smfun <- function(x) approx(smoothed, xout=x)$y
switch(msmooth,
smoothed = {
w[, (vv) := smfun(tx)]
},
both = {
newname <- sub('Moving', 'Moving-smoothed', vv)
w[, (newname) := smfun(tx)]
}
)
}
}
## Also compute loess estimates
dat <- data.frame(x=w[, tx])
if(loess) {
np <- loess(y ~ x, data=s)
pc <- predict(np, dat)
if(ybin)
w[, 'Loess Proportion' := pc]
else
w[, `Loess Mean` := pc]
}
if(ols) {
f <- rms::ols(y ~ rms::rcs(x, .knots.), data=s)
pc <- predict(f, dat)
w[, `OLS Mean` := pc]
}
if(lrm) {
f <- if(length(lrm_args))
do.call(rms::lrm, c(list(y ~ rms::rcs(x, .knots.), data=s), lrm_args)) else
rms::lrm(y ~ rms::rcs(x, .knots.), data=s)
pc <- predict(f, dat, type='fitted')
w[, 'LR Proportion' := pc]
}
if(orm) {
f <- rms::orm(y ~ rms::rcs(x, .knots.), data=s, family=family)
pc <- predict(f, dat, type='mean')
w[, 'ORM Mean' := pc]
if(length(tau)) {
pc <- predict(f, dat)
qu <- Quantile(f)
for(ta in tau) {
w[, ormqest := qu(ta, pc)]
cta <- qformat(ta)
data.table::setnames(w, 'ormqest', paste('ORM', cta))
}
}
}
if(qreg)
for(ta in tau) {
f <- rms::Rq(y ~ rms::rcs(x, .knots.), tau=ta, data=s)
pc <- predict(f, dat)
w[, qrest := pc]
cta <- qformat(ta)
data.table::setnames(w, 'qrest', paste('QR', cta))
}
if(hare) {
f <- if(length(penalty))
polspline::hare(y, y2, x, maxdim=maxdim, penalty=penalty)
else
polspline::hare(y, y2, x, maxdim=maxdim)
for(ti in times) {
inc <- polspline::phare(ti, dat$x, f)
newname <- paste0('HARE ', ti, '-', tunits)
w[, (newname) := inc]
}
}
w[, by := by]
R <- rbind(R, w)
}
R[, tx := itrans(tx)]
if(bythere) {
if(length(v) == 3) data.table::setnames(R, c('tx', 'by'), v[2:3])
else {
splitby <- strsplit(R[, by], split='::')
data.table::setnames(R, 'tx', v[2])
data.table::set(R, j='by', value=NULL) # remove 'by'
for(iby in 3 : length(v))
data.table::set(R, j=v[iby],
value=sapply(splitby, function(u) u[[iby - 2]]))
}
}
else {
R[, by := NULL]
data.table::setnames(R, 'tx', v[2])
}
if(melt) {
if(sec) v[1] <- 'incidence'
# Exclude N if present or would mess up melt
if('N' %in% names(R)) R[, N := NULL]
R <- data.table::melt(R, id.vars=v[-1], variable.name='Statistic',
value.name=v[1])
if(sec) {
addlab <- function(x) {
label(x) <- 'Cumulative Incidence'
x
}
R[, incidence := addlab(incidence)]
}
R[, Type := sub (' .*', '', Statistic)]
R[, Statistic := sub ('.* ', '', Statistic)]
R[, Type := gsub('~', ' ', Type)]
R[, Statistic := gsub('~', ' ', Statistic)]
}
if('eps' %in% colnames(info) && all(is.na(info[, 'eps'])))
info <- info[, setdiff(colnames(info), 'eps'), drop=FALSE]
if(! bythere) row.names(info) <- NULL
infon <- info
if(pr %in% c('kable', 'margin')) {
n <- colnames(info)
nam <- .q(N=N, Wmean=Mean, Wmin=Min, Wmax=Max, eps=eps, xinc=xinc)
colnames(info) <- nam[n]
## rownames returns "1" "2" "3" ... even if row names not really there
ri <- rownames(info)
rnp <- length(ri) && ! all(ri == as.character(1 : length(ri)))
ghead <- c(if(rnp) c(' ' = 1),
c(' ' = 1, 'Window Sample Sizes' = 3),
if('eps' %in% n) c(' ' = 1),
if('xinc' %in% n) c(' ' = 1))
info <- if(discrete) knitr::kable(info)
else
info <- knitr::kable(info) |>
kableExtra::add_header_above(ghead) |>
kableExtra::kable_styling(font_size=9)
}
if(pr == 'margin' && ! requireNamespace('qreport', quietly=TRUE))
stop('must install qreport package when specifying pr=margin')
switch(pr,
plain = print(info),
kable = cat(info),
margin = qreport::makecolmarg(info) )
attr(R, 'infon') <- infon
attr(R, 'info') <- info
R
}
utils::globalVariables(c('lo', 'hi', 'tx', '.xmean.', 'Loess Mean', 'OLS Mean',
'ormqest', 'qrest', 'incidence', 'Type', 'Statistic', 'Min', 'Max'))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.