Introduction

This vignette walks through the basic steps of the analysis from the Jian et al. paper, demonstrating some of the features of the urbankfs R package along the way.

Setup

Load packages.

library(dplyr)
library(forcats)
library(tidyr)
library(modelr)
library(purrr)
library(ggplot2)
theme_set(theme_bw())  # register a default theme for all subsequent plots
library(cowplot)
library(urbankfs)
# devtools::install(here::here())
requireNamespace("randomForest", quietly = TRUE)
requireNamespace("here", quietly = TRUE)

Load the full set of fitted models. Because it is large (>100 MB), this file is not distributed with the package. To obtain it, either download it from OSF (via urbankfs::download_jian_fits) or re-generate it with the scripts/fit_models.R script.

# download_jian_fits(here::here("extdata/fitted_models.rda"))
load(here::here("extdata", "fitted_models.rda"))

Test and training data predictions

Generate neural network and random forest predictions for the training and test data. randomForest provides a predict method for the randomForest S3 class, and urbankfs defines a custom predict method for its neural network fits: a thin wrapper around neuralnet::compute that also undoes the scaling applied during fitting.
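
For a single bootstrap sample, the same dispatch works directly. A minimal sketch (it assumes the first row of fitted_models holds one model fit and its matching test data):

one_fit <- fitted_models$model_fit[[1]]
one_data <- fitted_models$test_data[[1]]
# Dispatches to predict.randomForest or the urbankfs neural network method
one_pred <- predict(one_fit, one_data)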

pred <- fitted_models %>%
  select(sample, train_data, test_data, model_type, model_fit) %>%
  gather(data_type, data, train_data, test_data) %>%
  mutate(
    predicted = map2(model_fit, data, predict),
    model_type = fct_recode(model_type, !!!pretty_model_types())
  )

Fit a linear model, observed ~ predicted, for each bootstrapped sample, and extract the coefficients and adjusted R^2^.

pred_fits <- pred %>%
  filter(data_type == "test_data") %>%
  unnest(data, predicted) %>%
  select(sample, model_type, observed = Unsaturated_K2cm_cmhr, predicted) %>%
  group_by(sample, model_type) %>%
  summarize(fit = list(lm(observed ~ predicted))) %>%
  ungroup() %>%
  mutate(
    coefficients = map(fit, coefficients),
    slope = map_dbl(coefficients, 2),
    intercept = map_dbl(coefficients, 1),
    r2 = map_dbl(fit, ~summary(.)[["adj.r.squared"]])
  )
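
As a quick check (not part of the original analysis), the bootstrap distributions of these statistics can be averaged per model type; a perfect model would have a mean slope of 1 and a mean intercept of 0:

pred_fits %>%
  group_by(model_type) %>%
  summarize_at(vars(slope, intercept, r2), mean)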

Use the linear model fits from above to generate predictions over a regular grid of 20 points spanning the range of the training data (via modelr::seq_range). This is mostly for plotting the bootstrapped linear fits in subsequent steps.

obs <- tibble(
  predicted = fitted_models %>%
    unnest(train_data) %>%
    pull(Unsaturated_K2cm_cmhr) %>%
    seq_range(20)
)

pred_lm <- pred_fits %>%
  mutate(xpred = list(obs),
         lmpred = map2(fit, xpred, predict)) %>%
  unnest(xpred, lmpred) %>%
  group_by(model_type, predicted) %>%
  summarize_at(vars(lmpred), list(
    mean = mean,
    sd = sd,
    lo = ~quantile(., 0.1),
    hi = ~quantile(., 0.9)
  ))

Summarize the neural network and random forest model predictions at each point. These will be drawn as points with horizontal error bars.

pred_summary <- pred %>%
  filter(data_type == "test_data") %>%
  unnest(data, predicted) %>%
  group_by(model_type, observed = Unsaturated_K2cm_cmhr) %>%
  summarize_at(vars(predicted), list(
    mean = mean,
    sd = sd,
    lo = ~quantile(., 0.1),
    hi = ~quantile(., 0.9),
    n = length
  ))

Draw the observed vs. predicted scatter plot, with the true 1:1 line (dashed) and the bootstrapped linear regression (blue shaded region).

fig_regression_cap <- paste0(
  "Observed vs. predicted regression for neural network and random forest models. ",
  "Dashed line is the 1:1 line, and the blue shaded region is the observed ~ predicted regression."
)
ggplot() +
  aes(x = predicted, y = mean) +
  geom_point(aes(y = observed, x = mean),
             data = pred_summary,
             size = 0.5) +
  geom_errorbarh(aes(y = observed, xmin = lo, xmax = hi, x = NULL),
                 data = pred_summary,
                 color = "gray40",
                 size = 0.5) +
  geom_ribbon(
    data = pred_lm,
    aes(ymin = lo, ymax = hi, y = NULL),
    alpha = 0.5,
    fill = "lightblue"
  ) +
  geom_line(aes(y = mean), data = pred_lm) +
  geom_abline(linetype = "dashed") +
  facet_wrap(vars(model_type), nrow = 3) +
  labs(x = expression('Predicted K'[fs] ~ (cm ~ hr^{-1})),
       y = expression('Observed K'[fs] ~ (cm ~ hr^{-1}))) 

Distribution of the correlation coefficients for the training and testing data, drawn as density curves.

fig_correlation_cap <- paste0(
  "Distribution of correlation coefficients for the training and testing data."
)
pred %>%
  unnest(data, predicted) %>%
  mutate(data_type = fct_inorder(data_type) %>% fct_recode(
    # Relabel to match the underlying data split
    "Training" = "train_data",
    "Testing" = "test_data"
  )) %>%
  group_by(model_type, sample, data_type) %>%
  summarize(corr = cor(predicted, Unsaturated_K2cm_cmhr, method = "spearman")) %>%
  ggplot() +
  aes(x = corr) +
  geom_density() +
  facet_grid(vars(model_type), vars(data_type)) +
  labs(x = "Correlation between prediction and data") +
  theme_cowplot()

Urban data predictions

Generate model predictions for the urban data. First, load and normalize the data.

histdata <- read.csv(here::here("extdata/UrbanSoilK_V3.csv")) %>%
  as_tibble() %>%
  mutate(Top_Type = factor(Top_Type, soil_type_levels())) %>%
  normalize_soil_pct_data(add_rock = TRUE)
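
A quick inspection of the inputs used downstream (a simple check, not part of the original analysis; these column names are the ones referenced later in this vignette):

histdata %>%
  select(Percent_Sand, Percent_Silt, Percent_Clay, Top_Type, Ksat) %>%
  summary()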

Generate bootstrapped predictions for the data. urbankfs makes this easy with the predict_bootstrap function, which also has a summary method for quickly generating tidy outputs.

hist_predict <- predict_bootstrap(histdata, fitted_models)
hist_summary <- summary(hist_predict)
hist_summary

How well does it do?

fig_urbandata_cap <- paste0(
  "Predicted vs. observed plot for urban data. ",
  "Dashed line is the 1:1 line, and the blue line with grey shading is an `observed ~ predicted` linear fit."
)
histdata %>%
  left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay",
                                 "Top_Type")) %>%
  ggplot() +
  aes(x = mean, xmin = q050, xmax = q950, y = Ksat) +
  geom_errorbarh(color = "grey50", size = 0.5) +
  geom_point(size = 0.7) +
  geom_smooth(method = "lm", color = "blue") +
  geom_abline(linetype = "dashed") +
  facet_wrap(vars(model_type), ncol = 2) +
  labs(x = expression("Predicted Ksat" ~ (cm ~ hr^-1)),
       y = expression("Observed Ksat" ~ (cm ~ hr^-1))) +
  theme_cowplot() +
  coord_cartesian(xlim = c(0, 35))
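
To put a number on the figure above, one can compute a rank correlation per model type (a hedged sketch, not from the original analysis; it reuses the same join as the plotting code):

histdata %>%
  left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay",
                                 "Top_Type")) %>%
  group_by(model_type) %>%
  summarize(spearman = cor(mean, Ksat, method = "spearman", use = "complete.obs"))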

