Introduction

This vignette goes through the basic steps of the Jian et al. paper analysis, demonstrating some of the features of the urbankfs R package along the way.

Load packages

library(ggmap)
library(maps)
library(mapdata)
library(forcats)
library(tidyr)
library(modelr)
library(purrr)
library(ggplot2)
library(cowplot)
library(urbankfs)
library(readxl)
# install.packages("here")
# install.packages("randomForest")
library(devtools)
requireNamespace("here", quietly = TRUE)
# devtools::install(here::here())
requireNamespace("randomForest", quietly = TRUE)
library(dplyr)
library(ncdf4)

Setup

knitr::opts_chunk$set(results = 'hide', message = TRUE, include = TRUE,
                      echo = FALSE, warning = FALSE,
                      # fig.height = 4, fig.width = 8, 
                      cache = F)
histdata <- read.csv(here::here("extdata/UrbanSoilK_V3.csv")) %>%
  as_tibble() %>%
  mutate(Top_Type = factor(Top_Type, soil_type_levels())) %>%
  normalize_soil_pct_data(add_rock = FALSE)

histdata$Top_Type <- as.character(histdata$Top_Type)
histdata$Top_Type <- ifelse(is.na(histdata$Top_Type), "SG", histdata$Top_Type)

plot sites spatial distribution

# prepare data
site <- read.csv(here::here("vignettes", "All_SSURGO_and_Point.csv"))
kfsdata <- read.csv(here::here("vignettes", "All_SSURGO_and_Surface_HC.csv"))
left_join(site %>% select(SampleSite_ID,SamplePoint_ID, Longitude, Latitude),
          kfsdata %>% select(SampleSite_ID,SamplePoint_ID, Unsaturated_K2cm_cmhr) %>% 
            filter(!is.na(Unsaturated_K2cm_cmhr))) %>% 
  filter(!is.na(Unsaturated_K2cm_cmhr)) -> kfs_site

kfs_site %>% select(Latitude, Longitude) %>% 
  filter(Longitude < 0) %>% 
  unique() -> kfs_site_agg
# plot
theme_set(theme_bw())
map_data("world") %>% select(region) %>% unique()

USA <- map_data("state")
PuertoRico <- map_data("world", region = c("Puerto Rico"), exact = FALSE)

northAmer <- bind_rows(USA, PuertoRico)
northAmer <- northAmer %>% filter(long < 0)
northAmer %>% filter(!(subregion %in% c("Hawaii","Alaska"))) -> northAmer

ggplot(data = northAmer) + 
  geom_polygon(aes(x = long, y = lat, group=group), fill = "gray", color = "white") +
  geom_point(aes(x = Longitude, y = Latitude), shape = 3, color = "blue", size = 2
             , data = kfs_site_agg) +
  geom_point(aes(x = Longitude, y = Latitude), shape = 1, col = "red", data = histdata) +
  labs(x = "Longitude", y = "Latitude")

 # ggsave("Figure1-1 Sites.jpg", width = 8, height = 5, units = "in")

Load the full set of fitted models. Because it is large (>100MB), this file is not distributed with the package. To obtain it: 1. download from OSF (via urbankfs::download_jian_fits), 2. or re-generate it with the scripts/fit_models.R script, 3. or download it from https://github.com/jinshijian/UrbanKfs --> 'release'

# download_jian_fits(here::here("extdata/fitted_models.rda"))
load(here::here("extdata", "fitted_models.rda"))
# load("extdata/fitted_models.rda")

Urban data predictions

Model predictions for Urban data. First, load the data. Note that the urban soil data does not report rock percentage.

model_rock_type <- tribble(
  ~model_type, ~use_rock, ~top_type,
  "ann", FALSE, FALSE,
  "annr", TRUE, FALSE,
  "rf1", FALSE, FALSE,
  "rf1r", TRUE, FALSE,
  "rf2", FALSE, TRUE,
  "rf2r", TRUE, TRUE
)
pred <- fitted_models %>%
  dplyr::select(sample, train_data, test_data, model_type, model_fit) %>%
  gather(data_type, data, train_data, test_data) %>%
  left_join(model_rock_type, by = "model_type") %>%
  mutate(
    # Subset the data so that it first the needs of the model
    data_prep = pmap(list(data, use_rock, top_type), prepare_data), #1
    # Actually run the predictions
    predicted_log = map2(model_fit, data_prep, predict), #2
    # Un-log the predictions
    predicted = purrr::map(predicted_log, exp), #3
    # Make the model_type labels plot-ready
    model_type = fct_recode(model_type, !!!pretty_model_types()) #4
  )

# head(pred)

Generate bootstrapped predictions for the data. urbankfs makes this easy with the predict_bootstrap function, which also has summary method for quickly generating tidy outputs. Note that we can only use models without rock percentage as a predictor because it is not reported.

fitted_models_norock <- fitted_models %>%
  left_join(model_rock_type, "model_type") %>%
  filter(!use_rock)

# NOTE: The `predicted` column here is `log(Kfs)`!
hist_predict <- predict_bootstrap(histdata, fitted_models_norock)

hist_summary <- summary(hist_predict)

# hist_summary <- hist_predict %>%
#   unnest(data, predicted) %>%
#   group_by(model_type, Percent_Sand, Percent_Silt, Percent_Clay, Top_Type) %>%
#   summarize_at(vars(predicted), list(
#     mean = mean,
#     sd = sd,
#     lo = ~quantile(., 0.1),
#     hi = ~quantile(., 0.9),
#     n = length
#   ))

hist_summary

How well does it do?

facet_title = c("(a) ANN-no-rock" = "Neural network (no rock)",
                "(b) RF-no-rock" = "RandomForest (no rock, no type)",
                "(c) RF-with-structure-no-rock" = "RandomForest (no rock, with type)")

# bind_rows(
#   histdata %>%
#     transmute(
#       mean = log(Ksat_Rosseta),
#       log_Ksat = log(Ksat),
#       q050 = log_Ksat,
#       q950 = log_Ksat,
#       model_type = "ROSETTA") %>%
#     filter_if(is.numeric, all_vars(is.finite(.))),
#   histdata %>%
#     left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay", "Top_Type")) %>%
#     mutate(log_Ksat = log(Ksat)) %>%
#     select(mean, log_Ksat, q050, q950, model_type)) 

histdata %>%
  left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay", "Top_Type")) %>%
  mutate(log_Ksat = log(Ksat)) %>%
  select(mean, log_Ksat, q050, q950, model_type) %>%
  mutate(model_type = fct_recode(model_type, !!!facet_title)) %>%
  filter(model_type != "(c) RF-with-structure-no-rock") %>% 
  ggplot() +
  aes(x = mean, xmin = q050, xmax = q950, y = log_Ksat) +
  geom_errorbarh(color = "grey50", size = 0.5) +
  geom_point(size = 0.7) +
  geom_smooth(method = "lm", color = "blue", alpha = 0.5, lwd = 0.5, fill = "lightblue") +
  geom_abline(linetype = "dashed") +
  facet_wrap(model_type~., ncol = 1
             # labeller = as_labeller(facet_title) 
             ) +
  labs(x = expression(Predicted ~ log ~"["~K[n] ~ (cm ~ hr^{-1})~"]"),
       y = expression(Observed ~ log ~"["~K~ (cm ~ hr^{-1})~"]")) +
  theme_cowplot() -> p2

print(p2)
# ggsave("Figure4-Evaluation.jpg", width = 4, height = 6, dpi = 300, units = "in")
# ggsave("Figure3.pdf", width = 4, height = 6, dpi = 300, units = "in")
# summary for Table 1
# ANS: This object doesn't exist in this Rmd document
## min(data_structure$Ksat_cmhr)
histdata %>%
  left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay",
                                 "Top_Type")) %>% 
  select(Percent_Sand, Percent_Silt, Percent_Clay, Ksat, Ksat_Rosseta, mean, model_type) %>% 
  mutate(Ksat_predict = exp(mean))
# %>% spread(model_type,mean)

Test and training data predictions

Generate neural network and random forest predictions for the training and test data. randomForest provides a predict method for the randomForest S3 class. urbankfs defines a custom predict method for its neural network fits that is a thin wrapper around neuralnet::compute and which also undoes the scaling that happens during fitting.

Fit a linear model, observed ~ predicted, for each bootstrapped sample, and extract the coefficients and R^2^.

# pred_fits <- pred %>%
#   filter(data_type == "test_data") %>%
#   unnest(c(data_prep,  predicted_log)) %>%
#   mutate(observed_log = log(Unsaturated_K2cm_cmhr)) %>%
#   select(sample, model_type, observed_log, predicted_log) %>%
#   # filter(is.infinite(observed_log) & is.infinite(predicted_log)) %>% na.omit() %>% 
#   group_by(sample, model_type) %>%
#   summarize(fit = list(lm(observed_log ~ predicted_log))) %>%
#   ungroup() 
  # mutate(
  #   coefficients = map(fit, coefficients),
  #   slope = map_dbl(coefficients, 2),
  #   intercept = map_dbl(coefficients, 1),
  #   r2 = map_dbl(fit, ~summary(.)[["adj.r.squared"]])
  # )

Use the linear model fits from above to generate a range of predictions of the observed data. This is mostly for plotting the linear 1:1 fits in subsequent steps.

# obs <- tibble(
#   predicted = fitted_models %>%
#     unnest(train_data) %>%
#     pull(Unsaturated_K2cm_cmhr) %>%
#     seq_range(20)
# )
# 
# pred_lm <- pred_fits %>%
#   mutate(xpred = list(obs),
#          lmpred = map2(fit, xpred, predict)) %>%
#   unnest(xpred, lmpred) %>%
#   group_by(model_type, predicted) %>%
#   summarize_at(vars(lmpred), list(
#     mean = mean,
#     sd = sd,
#     lo = ~quantile(., 0.1),
#     hi = ~quantile(., 0.9)
#   ))

Summarize the neural network and random forest model predictions at each point. These will be drawn as points with horizontal error bars.

pred_summary <- pred %>%
  filter(data_type == "test_data") %>%
  unnest(c(data_prep, predicted_log)) %>%
  mutate(observed_log = log(Unsaturated_K2cm_cmhr)) %>%
  group_by(model_type, observed_log) %>%
  summarize_at(vars(predicted_log), list(
    mean = mean,
    sd = sd,
    lo = ~quantile(., 0.1),
    hi = ~quantile(., 0.9),
    n = length
  ))

Draw the observed vs. predicted scatter plot, with true 1:1 line (dashed) and the bootstrapped linear regression (blue shaded region).

fig_regression_cap <- paste0(
  "Observed vs. predicted regression for neural network and random forest models. ",
  "Dashed line is the 1:1 fit, and blue shaded region is the observed ~ predicted regression."
)

facet_title = c("Neural network (no rock)" = "(a) ANN-no-rock"
                   ,"RandomForest (no rock, no type)" = "(b) RF-no-rock"
                   # ,"RandomForest (no rock, with type)" = "(c) RF-with-structure-no-rock"
                   )

# plot results without rock
pred_summary %>% filter(model_type %in% c("Neural network (no rock)", "RandomForest (no rock, no type)")) %>% 
  ggplot(aes(x = mean, y = observed_log)) +  geom_point(size = 0.5) +
  facet_wrap(model_type~., ncol = 1
             , labeller = as_labeller(facet_title)
             ) +
  geom_errorbarh(aes(y = observed_log, xmin = lo, xmax = hi, x = NULL),
                 color = "gray40",
                 size = 0.5,
                 alpha = 0.2) +
  # geom_ribbon(
  #   data = pred_lm %>% filter(model_type %in% c("Neural network (no rock)", "RandomForest (no rock, no type)", "RandomForest (no rock, with type)")),
  #   aes(ymin = lo, ymax = hi, y = NULL),
  #   alpha = 0.5,
  #   fill = "lightblue"
  # ) +
  geom_abline(linetype = "dashed") +
  geom_smooth(method = "lm", color = "blue", se = T, fill="lightblue", alpha = 0.5, lwd = 0.5) +
  # geom_line(aes(y = observed), data = pred_lm %>% 
  #             filter(model_type %in% c("Neural network (no rock)", "RandomForest (no rock, no type)", "RandomForest (no rock, with type)")) ) +
  labs(x = expression(Predicted ~ log ~"["~K[n] ~ (cm ~ hr^{-1})~"]"),
       y = expression(Observed ~ log ~"["~K[n] ~ (cm ~ hr^{-1})~"]")) +
  xlim(-3, 3) +
  ylim(-4, 4) +
  theme_cowplot() -> p1


# evaluation dataset (n=20)
facet_title = c("Neural network (no rock)" = "(a) ANN-no-rock"
                   ,"RandomForest (no rock, no type)" = "(b) RF-no-rock"
                   # ,"RandomForest (no rock, with type)" = "(c) RF-with-structure-no-rock"
                   )
fig_urbandata_cap <- paste0(
  "Predicted vs. observed plot for urban data. ",
  "Dashed line is the 1:1 fit, and blue line with grey shading is a `observed ~ predicted` linear fit."
)


# plot with rock

# evaluation dataset (n=20)
facet_title_wr = c("Neural network (with rock)" = "(c) ANN-with-rock"
                   ,"RandomForest (with rock, no type)" = "(d) RF-with-rock"
                   # ,"RandomForest (with rock, with type)" = "(f) RF-with-structure-with-rock"
                   )

pred_summary %>% filter(model_type %in% c("Neural network (with rock)", "RandomForest (with rock, no type)")) %>% 
  ggplot(aes(x = mean, y = observed_log)) +  geom_point(size = 0.5) +
  facet_wrap(model_type~., ncol = 1
             , labeller = as_labeller(facet_title_wr)
             ) +
  geom_errorbarh(aes(y = observed_log, xmin = lo, xmax = hi, x = NULL),
                 color = "gray40",
                 size = 0.5,
                 alpha = 0.2) +
  # geom_ribbon(
  #   data = pred_lm %>% filter(model_type %in% c("Neural network (no rock)", "RandomForest (no rock, no type)", "RandomForest (no rock, with type)")),
  #   aes(ymin = lo, ymax = hi, y = NULL),
  #   alpha = 0.5,
  #   fill = "lightblue"
  # ) +
  geom_abline(linetype = "dashed") +
  geom_smooth(method = "lm", color = "blue", se = T, fill="lightblue", alpha = 0.5, lwd = 0.5) +
  # geom_line(aes(y = observed), data = pred_lm %>% 
  #             filter(model_type %in% c("Neural network (no rock)", "RandomForest (no rock, no type)", "RandomForest (no rock, with type)")) ) +
  labs(x = expression(Predicted ~ log ~"["~K[n] ~ (cm ~ hr^{-1})~"]")) +
  xlim(-3, 3) +
  ylim(-4, 4) +
  theme_cowplot() +
  theme(axis.title.y = element_blank()) -> p2

plot_grid(p1, p2,
          rel_widths = c(1.05, 1),
          ncol = 2)

ggsave("Figure3_Kn.jpg", width = 8, height = 6, dpi = 300, units = "in")
# ggsave("Figure2.pdf", width = 8, height = 6, dpi = 300, units = "in")
# summary for Table 2
# pred_summary$mean <- ifelse(pred_summary$mean < 0, 0.001, pred_summary$mean)
pred_summary %>% mutate(S_M = observed_log - mean) -> pred_summary
pred_summary %>% filter(is.finite(S_M)) -> pred_summary
pred_summary %>% select(model_type) %>% unique
var_modelType <- c("Neural network (no rock)", "RandomForest (no rock, no type)", "RandomForest (no rock, with type)",
                   "Neural network (with rock)", "RandomForest (with rock, no type)", "RandomForest (with rock, with type)")

evaluation <- function () {
  results <- data.frame()
  for (i in 1:length(var_modelType)) {
    pred_summary %>% filter(model_type == var_modelType[i]) -> subdata
    SLR <- lm(observed_log ~ mean, data = subdata)
    inter <- summary(SLR)$coefficients[1,1] %>% round(3)
    slope <- summary(SLR)$coefficients[2,1] %>% round(3)
    r2 <- summary(SLR)$adj.r.squared %>% round(3)
    # other model evaluation matric
    E <- (sum(subdata$S_M) / length(subdata$S_M)) %>% round(3)
    d <-  (1- sum(subdata$S_M^2)/sum((abs(subdata$mean-mean(subdata$observed_log))+abs(subdata$observed_log-mean(subdata$observed_log)))^2)) %>% round(3)
    # EF <- 1- sum(subdata$S_M^2)/sum((subdata$observed_log-mean(subdata$observed_log))^2)
    RMSE <- (sum(subdata$S_M^2)/length(subdata$S_M))^0.5 %>% round(3)
    p <- t.test(subdata$S_M)$p.value %>% round(3)

    results <- rbind(results, data.frame(var_modelType[i], inter, slope, r2, E, RMSE, d, p) )
  }
  return(results)
}

# getwd()
results <- evaluation()
write.csv(results, "results.csv", row.names = F)
results

c(exp(0.35),exp(0.29),exp(0.28),exp(0.31),exp(0.19),exp(0.19))
i = 3
pred_summary %>% filter(model_type == var_modelType[i]) -> subdata
SLR <- lm(observed_log ~ mean, data = subdata)
subdata %>% ggplot(aes(x = mean, y = observed_log)) + geom_point() + geom_smooth(method = "lm") +
  geom_abline(h = 0, v = 0)

histdata %>% ggplot(aes(x = Ksat_Rosseta, y = Ksat)) + geom_point()

pred_summary %>% filter(model_type %in% c("Neural network (no rock)")) %>% 
  ggplot() +
  aes(x = predicted_log, y = mean) +
  geom_point(aes(y = observed_log, x = mean),
             size = 0.5)

subdata <- pred_summary %>% filter(model_type %in% c("Neural network (no rock)")) 

Histogram of correlation coefficients for the training and testing data.

# fig_correlation_cap <- paste0(
#   "Histogram of correlation coefficients for the training and testing data."
# )
# pred %>%
#   unnest(data_prep, predicted) %>%
#   mutate(data_type = fct_inorder(data_type) %>% fct_recode(
#     "Training" = "train_data",
#     "Testing" = "test_data"
#   )) %>%
#   group_by(model_type, sample, data_type) %>%
#   summarize(corr = cor(predicted, Unsaturated_K2cm_cmhr, method = "spearman")) %>%
#   ggplot() +
#   aes(x = corr) +
#   geom_density() +
#   facet_grid(vars(model_type), vars(data_type)) +
#   labs(x = "Correlation between prediction and data") +
#   theme_cowplot()
# summary for Table 3

bind_rows(
  histdata %>%
    transmute(
      mean = log(Ksat_Rosseta),
      log_Ksat = log(Ksat),
      q050 = log_Ksat,
      q950 = log_Ksat,
      model_type = "ROSSETA"
    ) %>%
    filter_if(is.numeric, all_vars(is.finite(.))),
  histdata %>%
    left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay",
                                   "Top_Type")) %>%
    mutate(log_Ksat = log(Ksat)) %>%
    select(mean, log_Ksat, q050, q950, model_type)
) -> hist_sum

# histdata %>%
#   left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay",
#                                  "Top_Type")) %>% select(Percent_Sand, Percent_Silt, Percent_Clay, Ksat, Ksat_Rosseta, mean, model_type) -> hist_sum

hist_sum %>% mutate(S_M = log_Ksat - mean) -> hist_sum
var_modelType <- unique(hist_sum$model_type)

lm(hist_sum[hist_sum$model_type == "ROSSETA",]$log_Ksat ~ hist_sum[hist_sum$model_type == "ROSSETA",]$mean) 

evaluation2 <- function () {
  results <- data.frame()
  for (i in 1:length(var_modelType)) {
    hist_sum %>% filter(model_type == var_modelType[i]) -> subdata
    SLR <- lm(log_Ksat ~ mean, data = subdata)
    inter <- summary(SLR)$coefficients[1,1] %>% round(3)
    slope <- summary(SLR)$coefficients[2,1] %>% round(3)
    r2 <- summary(SLR)$adj.r.squared %>% round(3)
    # other model evaluation matric
    E <- (sum(subdata$S_M) / length(subdata$S_M)) %>% round(3)
    # EF <- 1- sum(subdata$S_M^2)/sum((subdata$log_Ksat-mean(subdata$log_Ksat))^2)
    RMSE <- (sum(subdata$S_M^2)/length(subdata$S_M))^0.5 %>% round(3)
    d <-  (1- sum(subdata$S_M^2)/sum((abs(subdata$mean-mean(subdata$log_Ksat))+abs(subdata$log_Ksat-mean(subdata$log_Ksat)))^2)) %>% round(3)
    p <- t.test(subdata$S_M)$p.value %>% round(3)

    results <- rbind(results, data.frame(var_modelType[i], inter, slope, r2, E, RMSE, d, p) )
  }
  return(results)
}

# getwd()
results2 <- evaluation2()
write.csv(results2, "results2.csv", row.names = F)
results2
# test the structure group
# data_orig <- read.csv(here::here("extdata/AllCities_Victoria_RDS.csv")) %>%
#   as_tibble()
# 
# type_df <- as_tibble(soil_types())
# 
# data_structure <- data_orig %>%
#   select(Percent_Sand, Percent_Silt, Percent_Clay, Percent_Rock_Fragment,
#          Unsaturated_K2cm_cmhr, Type) %>%
#   filter_at(vars(-Percent_Rock_Fragment), negate(is.na)) %>%
#   mutate(Type = as.character(Type)) %>%
#   left_join(type_df, by = "Type") %>%
#   mutate(Top_Type = factor(Top_Type, soil_type_levels()))

# data_structure %>% select(Type, Top_Type) %>% filter(is.na(Top_Type)) %>% unique()
# questions need double check
# 1. How many times repeaded for hist data?
# Set chunks defaults; these options will be applied to all subsequent chunks


jinshijian/UrbanKfs documentation built on Jan. 9, 2021, 9:54 a.m.