# Chunk options: cache and figure paths are vignette-specific so several
# vignettes can share one build directory without collisions.
knitr::opts_chunk$set(
  echo       = TRUE,
  fig.height = 4,
  fig.width  = 7,
  dpi        = 128,
  cache.path = "cache/overhead-cache/",
  fig.path   = "fig/overhead-fig/",
  error      = FALSE,
  cache.lazy = FALSE)

# Compact numeric printing and a narrow console for the rendered output.
options(digits = 4, scipen = 10, width = 70)
The objective of this file is to highlight the performance when multithreading is used. The conclusions likely depend on the hardware, operating system, and compiler.
We will simulate and estimate a first order vector auto-regression model in
this example using the particle filter and smoother. For details, see the
"Particle filtering" vignette, which can be found by calling
`vignette("Particle_filtering", package = "dynamichazard")`. The models
we are going to simulate from and estimate are of the form
$$
\begin{split}
y_{it} &\sim g(\cdot\vert\eta_{it}) &
  \vec{\eta}_t &= X_tR^+\vec{\alpha}_t + Z_t\vec{\beta} + \vec{o}_t \\
\vec{\alpha}_t &= F\vec{\alpha}_{t - 1} + R\vec{\epsilon}_t &
  \quad \vec{\epsilon}_t &\sim N(\vec{0}, Q) \\
& & \quad \vec{\alpha}_0 &\sim N(\vec{a}_0, Q_0)
\end{split}, \qquad
\begin{array}{l} i = 1, \dots, n_t \\ t = 1, \dots, d \end{array}
$$
where the $y_{it}$ is individual $i$'s indicator at time $t$ for whether he dies between time $(t - 1, t]$. The indicators, $y_{it}$, are binomial distributed with the complementary log-log link function conditional on knowing the log times at risk, $\vec{o}_1,\cdots,\vec{o}_d$, covariates, $X_t$ and $Z_t$, and latent states, $\vec{\alpha}_1,\dots,\vec{\alpha}_d$. The total survival time of individual $i$ is $T_i$ which is piecewise constant exponentially distributed conditional on knowing the latent states. Further, we set $Z_t = X_t$ so the states have a non-zero mean. The true values are
$$ F = \begin{pmatrix}0.9 & 0 \\ 0 & 0.9 \end{pmatrix}, \quad Q = \begin{pmatrix}0.33^2 & 0 \\ 0 & 0.33^2 \end{pmatrix}, \quad R = I_2, \quad \vec{\beta} = (-6.5, -2)^\top $$
# Record the git commit ID and branch for reproducibility of this run.
git_key <- system("git rev-parse --short HEAD", intern = TRUE)
git_bra <- system("git branch", intern = TRUE)
# The current branch is the line `git branch` prefixes with "* ".
# NOTE: the original pattern "^(\\*\ )(.+)$" contains "\ ", which is an
# invalid escape sequence in R strings and does not parse; "\\* " was the
# intended literal.
regexp  <- "^(\\* )(.+)$"
git_bra <- git_bra[grepl(regexp, git_bra)]
git_bra <- gsub(regexp, "\\2", git_bra)
where $I_2$ is the two-dimensional identity matrix and
$\vec{a}_0$ and $Q_0$ are given by the invariant distribution. The unknown
parameters to be estimated are everything but $Q_0$ and $R$ (since we fix $Q_0$ during the estimation and we set $\vec{a}_0 = (0, 0)^\top$). This
example is run on the git branch `r git_bra` with ID `r git_key`. The code
can be found on
the Github site for the package.
All functions whose definitions are not shown and which are not in the
`dynamichazard` package can be found on the GitHub site for the package.
We start by simulating the data. Feel free to skip this part as the specifications are given above. First we assign the parameters for the simulation
# Covariance matrix of the stationary distribution of the VAR(1) process
# alpha_t = F alpha_{t-1} + eps_t with eps_t ~ N(0, Q), i.e. the solution
# Q_0 of the discrete Lyapunov equation Q_0 = F Q_0 F^T + Q.
# Derivation: https://math.stackexchange.com/q/2854333/253239
get_Q_0 <- function(Qmat, Fmat){
  # work in the eigenbasis of the transition matrix
  decomp <- eigen(Fmat)
  lambda <- decomp$values

  # the infinite series sum_k F^k Q (F^T)^k only converges when the
  # spectral radius of F is strictly below one
  if(any(abs(lambda) >= 1))
    stop("Divergent series")

  vecs   <- decomp$vectors
  vecs_t <- t(vecs)

  # transform Q into the eigenbasis ...
  trans  <- crossprod(vecs, Qmat %*% vecs)
  # ... where the Lyapunov equation solves elementwise: the (i, j) entry
  # is divided by 1 - lambda_i * lambda_j
  scaled <- trans / (1 - tcrossprod(lambda))

  # map the solution back to the original basis
  solve(vecs_t, t(solve(vecs_t, t(scaled))))
}
# True parameter values used throughout the simulation study.
n_periods <- 300L                       # number of time intervals, d
Fmat      <- matrix(c(.9, 0, 0, .9), 2) # VAR(1) transition matrix F
Rmat      <- diag(1, 2)                 # state error map R (identity)
Qmat      <- diag(.33^2, 2)             # state error covariance Q
Q_0       <- get_Q_0(Qmat, Fmat)        # invariant-distribution covariance
beta      <- c(-6.5, -2)                # fixed effects (state means)
get_Q_0
is a function to get the covariance matrix for the invariant distribution.
Then we simulate and plot the latent states
set.seed(54432125)
# Simulate the latent states; row i holds the state at time i - 1, so the
# matrix has n_periods + 1 rows including the time-zero state.
betas <- matrix(nrow = n_periods + 1, ncol = 2)
betas[1, ] <- rnorm(2) %*% chol(Q_0)
# NOTE: the original loop header was `for(i in 1:n_periods + 1)`, which by
# operator precedence is `(1:n_periods) + 1` = 2:(n_periods + 1). That is
# the intended range but an easy misread; it is written explicitly here.
for(i in 2:(n_periods + 1))
  betas[i, ] <- Fmat %*% betas[i - 1, ] + drop(rnorm(2) %*% chol(Qmat))
# shift by the fixed effects so the states have a non-zero mean
betas <- t(t(betas) + beta)

# plot of latent variables with dashed lines at the long-run means
cols <- c("black", "darkblue")
matplot(betas, type = "l", lty = 1, col = cols)
for(i in 1:2)
  abline(h = beta[i], lty = 2, col = cols[i])
We simulate the observations as follows
# Simulate `n_obs` individuals from the model. Relies on the globals
# `n_periods` (number of intervals) and `betas` (latent states plus fixed
# effects; row t + 1 is the state at time t).
#
# Returns a data.frame with one row per individual and columns
#   tstart: entry time (possibly left-censored at a time after zero),
#   tstop : exit time -- the failure time, or n_periods if right-censored,
#   x     : the time-invariant covariate,
#   y     : event indicator (TRUE/1 if the individual dies in-sample).
get_obs <- function(n_obs){
  set.seed(37723679)  # fixed seed so the data set is reproducible

  sims <- replicate(n_obs, {
    # left-censoring: draws below zero are truncated to an entry at time 0
    tstart <- max(0L, sample.int((n_periods - 1L) * 2L, 1) - n_periods + 1L)

    # covariates (intercept and one uniform covariate)
    x <- runif(1, -1, 1)
    covars <- c(1, x)

    # outcome: within each interval the hazard is constant, so the
    # within-interval failure time is exponential with rate exp(eta)
    y <- FALSE
    for(tstop in (tstart + 1L):n_periods){
      fail_time <- rexp(1) / exp(covars %*% betas[tstop + 1L, ])
      if(fail_time <= 1){
        y <- TRUE
        tstop <- tstop - 1L + fail_time  # exact (fractional) failure time
        break
      }
    }

    c(tstart = tstart, tstop = tstop, x = x, y = y)
  })

  # return the data visibly (the original returned it invisibly through a
  # trailing assignment, which only worked by accident of `<-` semantics)
  data.frame(t(sims))
}

# use function
df <- get_obs(3000L)
head(df, 10)
We left-censor the observations since we otherwise may end up with a low number of observations towards the end.
We use the generalized two-filter smoother from @fearnhead10
library(dynamichazard)
# Fit the model once for each thread count in 1-6 and record the elapsed
# time. The particle counts are passed through to PF_control; the seed is
# reset before every fit so all runs see identical random numbers. Returns
# a list of lists with elements `ti` (timing), `fit`, and `n_threads`.
func <- function(N_fw_n_bw, N_smooth, N_first){
  lapply(seq_len(6), function(n_threads){
    set.seed(30520116)
    suppressWarnings(
      run_time <- system.time(
        fit_obj <- PF_EM(
          Surv(tstart, tstop, y) ~ x + ddFixed(x) + ddFixed_intercept(TRUE),
          df, Q_0 = diag(1, 2), Q = diag(1, 2),
          Fmat = matrix(c(.1, 0, 0, .1), 2), by = 1, type = "VAR",
          model = "exponential", max_T = n_periods,
          control = PF_control(
            N_fw_n_bw = N_fw_n_bw, N_smooth = N_smooth, N_first = N_first,
            method = "AUX_normal_approx_w_cloud_mean",
            n_max = 1, # Just take one EM-iteration
            smoother = "Fearnhead_O_N",
            Q_tilde = diag(.3^2, 2), n_threads = n_threads))))

    list(ti = run_time, fit = fit_obj, n_threads = n_threads)
  })
}
# time the fits with a modest number of particles
out <- func(200L, 500L, 2000L)
We assign a function to check the results
# Summarise the timing results in `out` (a list as returned by `func`):
# confirms the parameter estimates agree across thread counts, plots
# elapsed time against the number of threads on log-log axes, and prints
# the OLS coefficients from regressing log time on log thread count (the
# slope would be -1 under perfect scaling). Returns the coefficients
# invisibly via `print`.
show_res <- function(out){
  # estimates must not depend on the number of threads
  do_check <- c("fixed_effects", "Q", "F")
  if(all(sapply(
    out[-1L], function(target)
      isTRUE(all.equal(target$fit[do_check], out[[1]]$fit[do_check])))))
    cat("All estimates match\n")

  # plot log time versus log number of threads
  # (vapply rather than sapply so the result type is guaranteed)
  ti        <- vapply(out, function(x) x$ti[["elapsed"]], numeric(1))
  n_threads <- vapply(out, `[[`, numeric(1), "n_threads")
  plot(ti ~ n_threads, log = "xy")

  cat("OLS estimates are\n")  # typo fix: was "Ols"
  print(coef(lm(log(ti) ~ log(n_threads))))
}
show_res(out)
Increasing the number of particles we use changes the results
# repeat the experiment with many more particles
out <- func(5000L, 5000L, 10000L)
# check agreement across thread counts and show the timings
show_res(out)
Similar changes apply for larger data sets
# simulate a larger data set and re-run with the original particle counts
df <- get_obs(10000L)
out <- func(200L, 500L, 2000L)
# check agreement across thread counts and show the timings
show_res(out)
# record the session details and the number of physical cores on this machine
sessionInfo()
parallel::detectCores(logical = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.