knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(ggplot2)
library(dplyr)
library(stringr)
library(cowplot)
theme_set(theme_minimal_grid())
library(mgcv)
library(dbarts)
library(possum)
library(lars)
library(reshape2)
library(tidyr)

Load and prepare US crime data

data(UScrime, package = "MASS")

glimpse(UScrime)

y <- UScrime %>%
  pull(y) %>%
  scale()

## Covariates
X <- UScrime[, -which(colnames(UScrime) == "y")] %>%
  as.matrix()

## log-transform and scale the data
X[, -2] <- log(X[, -2])

X <- scale(X)

varnames <- colnames(X)

varnamesDf <- data.frame(
  Var1 = seq_along(varnames),
  varname = varnames
)

Estimate Bayesian nonparametric regression model

set.seed(420)

mybayes<-horseshoe::horseshoe(y, X)
names(mybayes)
betaSamp<-mybayes$BetaSamples

cbart <- bart(X, y)

sigma2Samples <- cbart$sigma^2

rsq(y, cbart$yhat.train.mean)

ggplot() +
  geom_abline(intercept=0, slope=1, col="firebrick3") + 
  geom_point(aes(y, cbart$yhat.train.mean)) + 
  labs(x = "y", y = "yhat")

fhatmat <- t(cbart$yhat.train)

## devtools::load_all()


lin2 <- sparse_linear_summary(X, y, betaSamples = betaSamp, sigma2Samples=sigma2Samples, varnames = varnames)

lin <- sparse_linear_summary(X, fhatmat, varnames = varnames)
lin2 <- sparse_linear_summary(X, betaSamples = betaSamp, varnames = varnames)

names(lin)

lin$summaryDfCI %>% 
  filter(stat == "rsq_gamma") %>%
  ggplot(aes(modelSize, mid)) +
  geom_linerange(aes(ymin = lo,
                     ymax = hi)) + 
  geom_point() +
  # facet_wrap(~stat, ncol=1, scales = "free_y") +
  labs(x = "Summary size", y = "Summary R-sq")

The "elbow point" is about 7.

# Posterior mean and credible intervals for this summary
lin$betaProjSummary %>% filter(modelSize == 7)

# Poster plot for the summary
lin$betaProjDf %>%
  filter(modelSize == 7) %>% 
  filter(value!=0) %>%
  ggplot() +
  geom_hline(yintercept=0) + 
  geom_violin(aes(varname, value))


spencerwoody/possum documentation built on Aug. 5, 2022, 12:18 a.m.