knitr::opts_chunk$set(
  echo = TRUE, 
  fig.height = 4, fig.width = 7, dpi = 128,
  cache.path = "cache/overhead-cache/", fig.path = "fig/overhead-fig/", 
  error = FALSE, cache.lazy = FALSE)
options(digits = 4, scipen = 10, width = 70)

Introduction

The objective of this file is to highlight the performance when multithreading is used. The conclusions likely depend on both the hardware, operating system and compiler.

We will simulate and estimate a first order vector auto-regression model in this example using the particle filter and smoother. For details see this vignette which can also be found by calling vignette("Particle_filtering", package = "dynamichazard"). The models we are going to simulate from and estimate are of the form

$$ \begin{split} y_{it} &\sim g(\cdot\vert\eta_{it}) & \ \vec{\eta}t &= X_tR^+\vec{\alpha}_t + Z_t\vec{\beta} + \vec{o}_t \ \vec{\alpha}_t &= F\vec{\alpha}{t - 1} + R\vec{\epsilon}_t & \quad \vec{\epsilon}_t \sim N(\vec{0}, Q) \ & & \quad \vec{\alpha}_0 \sim N(\vec{a}_0, Q_0) \end{split}, \qquad \begin{array}{l} i = 1, \dots, n_t \ t = 1, \dots, d \end{array} $$

where the $y_{it}$ is individual $i$'s indicator at time $t$ for whether he dies between time $(t - 1, t]$. The indicators, $y_{it}$, are binomial distributed with the complementary log-log link function conditional on knowing the log times at risk, $\vec{o}_1,\cdots,\vec{o}_d$, covariates, $X_t$ and $Z_t$, and latent states, $\vec{\alpha}_1,\dots,\vec{\alpha}_d$. The total survival time of individual $i$ is $T_i$ which is piecewise constant exponentially distributed conditional on knowing the latent states. Further, we set $Z_t = X_t$ so the states have a non-zero mean. The true values are

$$ F = \begin{pmatrix}0.9 & 0 \ 0 & 0.9 \end{pmatrix}, \quad Q = \begin{pmatrix}0.33^2 & 0 \ 0 & 0.33^2 \end{pmatrix}, \quad R = I_2, \quad \vec{\beta} = (-6.5, -2)^\top $$

git_key <- system("git rev-parse --short HEAD", intern = TRUE)
git_bra <- system("git branch", intern = TRUE)
regexp <- "^(\\*\ )(.+)$"
git_bra <- git_bra[grepl(regexp, git_bra)]
git_bra <- gsub(regexp, "\\2", git_bra)

where $I_2$ is the two-dimensional identity matrix and $\vec{a}_0$ and $Q_0$ are given by the invariant distribution. The unknown parameters to be estimated is everything but $Q_0$ and $R$ (since we fix $Q_0$ doing the estimation and we set $\vec{a}_0 = (0, 0)^\top$). This example is run on the git branch "r git_bra" with ID "r git_key". The code can be found on the Github site for the package. All functions which assignments are not shown and are not in the dynamichazard package can be found on the Github site.

Simulation

We start by simulating the data. Feel free to skip this part as the specifications are given above. First we assign the parameters for the simulation

# function to find Q_0
get_Q_0 <- function(Qmat, Fmat){
  # see https://math.stackexchange.com/q/2854333/253239
  eg  <- eigen(Fmat)
  las <- eg$values
  if(any(abs(las) >= 1))
    stop("Divergent series")
  U   <- eg$vectors
  U_t <- t(U)
  T.  <- crossprod(U, Qmat %*% U)
  Z   <- T. / (1 - tcrossprod(las))
  solve(U_t, t(solve(U_t, t(Z))))
}
n_periods <- 300L

Fmat <- matrix(c(.9, 0, 0, .9), 2)
Rmat <- diag(1    , 2)
Qmat <- diag(.33^2, 2)
Q_0  <- get_Q_0(Qmat, Fmat)
beta <- c(-6.5, -2)

get_Q_0 is a function to get the covariance matrix for the invariant distribution. Then we simulate and plot the latent states

set.seed(54432125)
betas <- matrix(nrow = n_periods + 1, ncol = 2)
betas[1, ] <- rnorm(2) %*% chol(Q_0)
for(i in 1:n_periods + 1)
  betas[i, ] <- Fmat %*% betas[i - 1, ] + drop(rnorm(2) %*% chol(Qmat))

betas <- t(t(betas) + beta)

# plot of latent variables
cols <- c("black", "darkblue")
matplot(betas, type = "l", lty = 1, col = cols)
for(i in 1:2)
  abline(h = beta[i], lty = 2, col = cols[i])

We simulate the observations as follows

# assign function to simulate observations
get_obs <- function(n_obs){
  set.seed(37723679)
  df <- replicate(n_obs, {
    # left-censoring
    tstart <- max(0L, sample.int((n_periods - 1L) * 2L, 1) - n_periods + 1L)

    # covariates
    x <- runif(1, -1, 1)
    covars <- c(1, x)

    # outcome (stop time and event indicator)
    y <- FALSE
    for(tstop in (tstart + 1L):n_periods){
      fail_time <- rexp(1) / exp(covars %*% betas[tstop + 1L, ])
      if(fail_time <= 1){
        y <- TRUE
        tstop <- tstop - 1L + fail_time
        break
      }
    }

    c(tstart = tstart, tstop = tstop, x = x, y = y)
  })
  df <- data.frame(t(df))
}

# use function
df <- get_obs(3000L)
head(df, 10)

We left-censor the observations since we otherwise may end up with a low number of observations towards the end.

Particle filter and smoother

We use the generalized two-filter smoother from @fearnhead10

library(dynamichazard)
func <- function(N_fw_n_bw, N_smooth, N_first){
  lapply(1:6, function(n_threads){
    set.seed(30520116)
    suppressWarnings(ti <- system.time(pf_Fear <- PF_EM(
      Surv(tstart, tstop, y) ~ x + ddFixed(x) + ddFixed_intercept(TRUE), df,
      Q_0 = diag(1, 2), Q = diag(1, 2), Fmat = matrix(c(.1, 0, 0, .1), 2), 
      by = 1, type = "VAR", model = "exponential", max_T = n_periods,
      control = PF_control(
        N_fw_n_bw = N_fw_n_bw, N_smooth = N_smooth, N_first = N_first,
        method = "AUX_normal_approx_w_cloud_mean",
        n_max = 1, # Just take one EM-iteration
        smoother = "Fearnhead_O_N",
        Q_tilde = diag(.3^2, 2), n_threads = n_threads))))
    list(ti = ti, fit = pf_Fear, n_threads = n_threads)
  })
}
out <- func(200L, 500L, 2000L)

We assign a function to check the results

show_res <- function(out){
  do_check <- c("fixed_effects", "Q", "F")
  if(all(sapply(
    out[-1L], function(target)
      isTRUE(all.equal(target$fit[do_check], out[[1]]$fit[do_check])))))
    cat("All estimates match\n")

  # plot log time versus log number of threads
  ti        <- sapply(out, "[[", "ti")["elapsed", ]
  n_threads <- sapply(out, "[[", "n_threads")
  plot(ti ~ n_threads, log = "xy")

  cat("Ols estimates are\n")
  print(coef(lm(log(ti) ~ log(n_threads))))
}
show_res(out)

Increasing the number of particles we use changes the results

out <- func(5000L, 5000L, 10000L)
show_res(out)

Similar changes apply for for larger data sets

df <- get_obs(10000L)
out <- func(200L, 500L, 2000L)
show_res(out)

Session info

sessionInfo()
parallel::detectCores(logical = FALSE)

References



boennecd/dynamichazard documentation built on Oct. 11, 2022, 2:41 p.m.