# scripts/sgd-MRA.R

# TODO
# - add in a decaying learning rate (a sketch follows below)
# - add in penalty term Q_alpha_tau2 with fixed penalty tau2
#   (see the penalized-gradient sketch where Q_alpha_tau2 is built)
#     - add in adaptive penalty term tau2?
# - add in sum-to-zero constraint (see the re-centering note where alpha is drawn)
#
# Do this with BayesMRA
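
## A minimal sketch of the decaying learning rate from the TODO above,
## assuming a simple 1/t decay (this helper is illustrative only and is not
## part of sgMRA):
learn_rate_decay <- function(learn_rate0, decay_rate, iter) {
    learn_rate0 / (1 + decay_rate * iter)
}
## e.g., learn_rate_decay(1, 0.001, c(0, 1000, 10000)) gives 1, 0.5, ~0.09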
library(tidyverse)
library(patchwork)
library(sgMRA)
library(BayesMRA)
library(spam)
library(Matrix)
set.seed(11)

N <- 400^2

## set up the spatial process
locs <- as.matrix(
    expand.grid(
        seq(0, 1, length.out = sqrt(N)),
        seq(0, 1, length.out = sqrt(N))
    )
)
# D <- fields::rdist(locs)

## fixed effects include an intercept, a simulated covariate (standing in for elevation), and latitude
X <- cbind(1, rnorm(N), locs[, 2])
# X <- cbind(1, as.vector(mvnfast::rmvn(1, rep(0, N), 3 * exp(-D / 20))), locs[, 2])
p <- ncol(X)

beta <- rnorm(ncol(X))

## MRA spatio-temporal random effect
M <- 3
n_coarse <- 40

MRA <- mra_wendland_2d(locs, M = M, n_coarse = n_coarse, use_spam = TRUE)

# MRA    <- mra_wendland_2d(locs, M = M, n_max_fine_grid = 2^8, use_spam = TRUE)
W <- as.dgCMatrix.spam(MRA$W) # the Matrix package is faster here because we don't need the Cholesky
# W <- MRA$W
n_dims <- MRA$n_dims
dims_idx <- MRA$dims_idx
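
## quick sanity check on the multiresolution structure (not needed for the
## fit): dims_idx labels each basis function with its resolution, so these
## counts should match n_dims and sum to ncol(W)
table(dims_idx)
all.equal(sum(n_dims), ncol(W))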

Q_alpha      <- make_Q_alpha_2d(sqrt(n_dims), rep(0.999, length(n_dims)), prec_model = "CAR", use_spam = FALSE)

tau2         <- 10 * 2^(2 * (1:M - 1))
Q_alpha_tau2 <- make_Q_alpha_tau2(Q_alpha, tau2, use_spam = FALSE)
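
## Sketch for the penalty TODO above: with theta = c(beta, alpha), adding the
## penalty (1/2) * t(alpha) %*% Q_alpha_tau2 %*% alpha to the least-squares
## loss adds a Q_alpha_tau2 %*% alpha term to the gradient of the alpha block.
## This helper is a hypothetical illustration (not an sgMRA function) and
## assumes the unpenalized loss is the mean squared error:
penalized_gradient <- function(y, U, theta, Q, p) {
    resid <- as.numeric(y - U %*% theta)
    grad <- -2 * as.numeric(crossprod(U, resid)) / length(y)
    grad[-(1:p)] <- grad[-(1:p)] + as.numeric(Q %*% theta[-(1:p)])
    grad
}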

## initialize the random effect
## set up a linear constraint so that each resolution sums to zero
A_constraint <- sapply(1:M, function(i) {
    tmp <- rep(0, sum(n_dims))
    tmp[dims_idx == i] <- 1
    tmp
})

a_constraint <- rep(0, M)
alpha   <- as.vector(rmvnorm.prec.const(n = 1, mu = rep(0, sum(n_dims)), Q = Q_alpha_tau2, A = t(A_constraint), a = a_constraint))
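
## check that the simulated alpha satisfies the constraint: each
## per-resolution sum should be numerically zero
tapply(alpha, dims_idx, sum)
## for the sum-to-zero TODO above, one option during fitting would be to
## re-center the alpha block within each resolution after every step, e.g.
## alpha - ave(alpha, dims_idx) (a sketch, not current sgMRA behavior)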

sigma2 <- 0.25

epsilon <- rnorm(N, 0, sqrt(sigma2))

mu <- drop(W %*% alpha)
y <- as.numeric(mu + epsilon)
# y <- as.numeric(X %*% beta + W %*% alpha + epsilon)

dat <- data.frame(x=locs[, 1], y = locs[, 2], z = y, mu = mu)

p_sim <- ggplot(dat, aes(x, y, fill = z)) +
    geom_raster() +
    scale_fill_viridis_c() +
    ggtitle("Simulated data")
p_sim

p_latent <- ggplot(dat, aes(x, y, fill = mu)) +
    geom_raster() +
    scale_fill_viridis_c() +
    ggtitle("Simulated latent process")
p_latent

# gradient descent function for MRA using minibatch ----

U <- cbind(X, W)
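## reference loss at the true parameters; beta = 0 here because y was
## simulated from W %*% alpha alone, with no fixed-effect contribution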
message("loss = ", target_fun(y, U, c(rep(0, length(beta)), alpha)))
# message("loss = ", target_fun(y, U, c(beta, alpha)))

# profvis::profvis(
system.time(
    dat_mini <- regression_gradient_descent(
        c(y), X, W,
        inits = rnorm(ncol(U)) * 0.1,
        threshold = 1e-6,
        learn_rate = 1,
        num_iters = 20000,
        print_every = 50,
        minibatch_size = 2^8
    )
)

# Using the full gradient; this algorithm gets more competitive as the sample size increases
# profvis::profvis(
system.time(
    dat_full <- regression_gradient_descent(
        c(y), X, W,
        inits = rnorm(ncol(U)) * 0.1,
        threshold = 1e-6,
        learn_rate = 0.1,
        num_iters = 20000,
        print_every = 50,
        minibatch_size = NULL
    )
)
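
## For reference, a single minibatch update presumably looks like the sketch
## below (an assumption about regression_gradient_descent's internals, written
## out for exposition; sgd_step is not an sgMRA function):
sgd_step <- function(y, U, theta, learn_rate, minibatch_size) {
    idx <- sample.int(length(y), minibatch_size)
    U_b <- U[idx, , drop = FALSE]
    resid <- as.numeric(y[idx] - U_b %*% theta)
    grad <- -2 * as.numeric(crossprod(U_b, resid)) / minibatch_size
    theta - learn_rate * grad
}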



layout(matrix(1:2, 2, 1))
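# red reference lines mark the loss evaluated at the true parameters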
plot(dat_mini$loss, type='l')
abline(h = target_fun(y, U, c(rep(0, length(beta)), alpha)), col='red')
# lines(dat_full$loss, col='red', type='l')
plot(dat_full$loss, type='l')
abline(h = target_fun(y, U, c(rep(0, length(beta)), alpha)), col='red')

# predictions from the minibatch model
beta_idx <- grepl("beta", names(dat_mini))
beta_fit_mini <- dat_mini[nrow(dat_mini), beta_idx]
y_pred_mini <- U %*% as.numeric(beta_fit_mini)
# predictions from the full model
beta_idx <- grepl("beta", names(dat_full))
beta_fit_full <- dat_full[nrow(dat_full), beta_idx]
y_pred_full <- U %*% as.numeric(beta_fit_full)


dat <- data.frame(x = locs[, 1], y = locs[, 2], z = drop(W %*% alpha), y_sim = y,
                  y_pred_mini = drop(y_pred_mini),
                  y_pred_full = drop(y_pred_full))

p_mini <- ggplot(dat, aes(x, y, fill = y_pred_mini)) +
    geom_raster() +
    scale_fill_viridis_c() +
    ggtitle("Minibatch fit")
p_full <- ggplot(dat, aes(x, y, fill = y_pred_full)) +
    geom_raster() +
    scale_fill_viridis_c() +
    ggtitle("Full fit")

p_sim / p_mini / p_full
# p_sim / p_full

(p_sim + p_latent) / (p_mini + p_full)

p_latent / p_full

# residual SD and MSE for the minibatch and full-gradient fits
sd(y - y_pred_mini)
sd(y - y_pred_full)
mean((y - y_pred_mini)^2)
mean((y - y_pred_full)^2)


dat %>%
    # pivot_longer(cols= y_pred_mini:y_pred_full) %>%
    # ggplot(aes(x = y_sim, y=value)) +
    ggplot(aes(x = y_sim, y=y_pred_full)) +
    geom_hex() +
    geom_abline(slope=1, intercept=0, color="red") #+
    # facet_wrap(~ name, nrow=2)

tU <- t(U)
tUU <- t(U) %*% U
theta_test <- rnorm(ncol(U))

# bm <- microbenchmark::microbenchmark(
#     tUU %*% theta_test,
#     tU %*% (U %*% theta_test), times = 10
# )
# autoplot(bm)
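
## precomputing tUU trades memory for speed: a full-gradient step using tUU
## costs O(k^2) for k = ncol(U), versus O(N * k) when multiplying through U,
## which is what the benchmark above compares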


# tUy <- t(U) %*% y
# idx <- sample(1:length(y), 2^6)
#
# y_idx <- y[idx]
# tUy_idx <- tU[, idx] %*% y_idx
#
#
# bm <- microbenchmark::microbenchmark(
#     gradient_fun(y, tUy, tUU, theta_test),
#     gradient_fun(y[idx], tU[, idx] %*% y[idx], tUU, theta_test), times = 10)
# autoplot(bm)
# all.equal(    gradient_fun(y, U, theta_test),
#               gradient_fun2(y, U, tUy, tUU, theta_test))


ggplot(dat_full, aes(x = iteration, y = loss)) +
    geom_point() +
    geom_line() +
    ggtitle("loss as a function of iteration")


# Plot the regression parameters

# convert beta into a data.frame
# results from lm()

dat_full %>%
    pivot_longer(cols = 1:10, names_to = "parameter", values_to = "beta") %>%
    ggplot(aes(x = iteration, y = beta, color = parameter)) +
    geom_line() +
    ggtitle("Parameter estimates by iteration")


# layout(matrix(1:2, 2, 1))
# plot(y, cbind(X, W) %*% as.matrix(dat)[nrow(dat), 1:(ncol(dat) - 2)])
# plot(y, X %*% beta + W %*% alpha)

# fitted MSE and R^2
preds <- as.numeric(U %*% as.numeric(dat_full[nrow(dat_full), beta_idx])) # final-iteration parameters
mean((y - dat$y_pred_full)^2)
cor(y, dat$y_pred_full)^2
# oracle MSE and R^2
# mu <- as.numeric(X %*% beta + W %*% alpha)
mu <- as.numeric(W %*% alpha)
mean((y - mu)^2)
cor(y, mu)^2
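## the oracle MSE should be close to the noise variance sigma2 = 0.25, and the
## oracle R^2 is roughly an upper bound on what any fit of the latent process
## can achieve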

# plot spatial random field
dat <- data.frame(x = locs[, 1], y = locs[, 2], value = y, mu = mu, pred = preds)
plot_lims <- range(c(y, mu, preds))
p_sim <- ggplot(dat) +
    geom_raster(aes(x, y, fill = value)) +
    scale_fill_viridis_c(end = 0.8, limits = plot_lims) +
    ggtitle("simulated data")
p_latent <- ggplot(dat) +
    geom_raster(aes(x, y, fill = mu)) +
    scale_fill_viridis_c(end = 0.8, limits = plot_lims) +
    ggtitle("simulated latent process")
p_fit <- ggplot(dat) +
    geom_raster(aes(x, y, fill = preds)) +
    scale_fill_viridis_c(end = 0.8, limits = plot_lims) +
    ggtitle("predicted latent process")
p_sim + p_latent + p_fit