This vignette goes through the basic steps of the Jian et al. paper analysis, demonstrating some of the features of the urbankfs
R package along the way.
library(ggmap)
library(maps)
library(mapdata)
library(forcats)
library(tidyr)
library(modelr)
library(purrr)
library(ggplot2)
library(cowplot)
library(urbankfs)
library(readxl)
# install.packages("here")
# install.packages("randomForest")
library(devtools)
requireNamespace("here", quietly = TRUE)
# devtools::install(here::here())
requireNamespace("randomForest", quietly = TRUE)
library(dplyr)
library(ncdf4)
# Set chunk defaults; these options will be applied to all subsequent chunks
knitr::opts_chunk$set(
  results = "hide",
  message = TRUE,
  include = TRUE,
  echo = FALSE,
  warning = FALSE,
  # fig.height = 4, fig.width = 8,
  cache = FALSE
)
histdata <- read.csv(here::here("extdata/UrbanSoilK_V3.csv")) %>%
  as_tibble() %>%
  mutate(Top_Type = factor(Top_Type, soil_type_levels())) %>%
  normalize_soil_pct_data(add_rock = FALSE)
histdata$Top_Type <- as.character(histdata$Top_Type)
histdata$Top_Type <- ifelse(is.na(histdata$Top_Type), "SG", histdata$Top_Type)
# prepare data
site <- read.csv(here::here("vignettes", "All_SSURGO_and_Point.csv"))
kfsdata <- read.csv(here::here("vignettes", "All_SSURGO_and_Surface_HC.csv"))
kfs_site <- left_join(
  site %>% select(SampleSite_ID, SamplePoint_ID, Longitude, Latitude),
  kfsdata %>%
    select(SampleSite_ID, SamplePoint_ID, Unsaturated_K2cm_cmhr) %>%
    filter(!is.na(Unsaturated_K2cm_cmhr))
) %>%
  filter(!is.na(Unsaturated_K2cm_cmhr))
kfs_site_agg <- kfs_site %>%
  select(Latitude, Longitude) %>%
  filter(Longitude < 0) %>%
  unique()
# plot
theme_set(theme_bw())
map_data("world") %>% select(region) %>% unique()
USA <- map_data("state")
PuertoRico <- map_data("world", region = c("Puerto Rico"), exact = FALSE)
northAmer <- bind_rows(USA, PuertoRico) %>%
  filter(long < 0) %>%
  filter(!(subregion %in% c("Hawaii", "Alaska")))
ggplot(data = northAmer) +
  geom_polygon(aes(x = long, y = lat, group = group), fill = "gray", color = "white") +
  geom_point(aes(x = Longitude, y = Latitude), shape = 3, color = "blue", size = 2,
             data = kfs_site_agg) +
  geom_point(aes(x = Longitude, y = Latitude), shape = 1, col = "red", data = histdata) +
  labs(x = "Longitude", y = "Latitude")
# ggsave("Figure1-1 Sites.jpg", width = 8, height = 5, units = "in")
Load the full set of fitted models.
Because it is large (>100 MB), this file is not distributed with the package.
To obtain it, either:

1. download it from OSF (via `urbankfs::download_jian_fits`),
2. re-generate it with the `scripts/fit_models.R` script, or
3. download it from the 'Releases' page of https://github.com/jinshijian/UrbanKfs.
# download_jian_fits(here::here("extdata/fitted_models.rda"))
load(here::here("extdata", "fitted_models.rda"))
# load("extdata/fitted_models.rda")
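If you are not sure whether the file is already in place, a small guard like the following combines the download and load steps (a sketch; the path is an assumption and should point to wherever you keep the file):

```r
# Download the fitted models only if they are not already present, then load them.
# The path below is an assumption; adjust as needed.
fits_file <- here::here("extdata", "fitted_models.rda")
if (!file.exists(fits_file)) {
  urbankfs::download_jian_fits(fits_file)
}
load(fits_file)
```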
Model predictions for the urban data. First, load the data. Note that the urban soil data does not report rock percentage.
model_rock_type <- tribble(
  ~model_type, ~use_rock, ~top_type,
  "ann",  FALSE, FALSE,
  "annr", TRUE,  FALSE,
  "rf1",  FALSE, FALSE,
  "rf1r", TRUE,  FALSE,
  "rf2",  FALSE, TRUE,
  "rf2r", TRUE,  TRUE
)

pred <- fitted_models %>%
  dplyr::select(sample, train_data, test_data, model_type, model_fit) %>%
  gather(data_type, data, train_data, test_data) %>%
  left_join(model_rock_type, by = "model_type") %>%
  mutate(
    # Subset the data so that it fits the needs of the model
    data_prep = pmap(list(data, use_rock, top_type), prepare_data),
    # Actually run the predictions
    predicted_log = map2(model_fit, data_prep, predict),
    # Un-log the predictions
    predicted = purrr::map(predicted_log, exp),
    # Make the model_type labels plot-ready
    model_type = fct_recode(model_type, !!!pretty_model_types())
  )
# head(pred)
Generate bootstrapped predictions for the data.
`urbankfs` makes this easy with the `predict_bootstrap` function, which also has a `summary` method for quickly generating tidy outputs.
Note that we can only use models that do not take rock percentage as a predictor, because rock percentage is not reported in these data.
fitted_models_norock <- fitted_models %>%
  left_join(model_rock_type, by = "model_type") %>%
  filter(!use_rock)

# NOTE: The `predicted` column here is `log(Kfs)`!
hist_predict <- predict_bootstrap(histdata, fitted_models_norock)
hist_summary <- summary(hist_predict)
# hist_summary <- hist_predict %>%
#   unnest(data, predicted) %>%
#   group_by(model_type, Percent_Sand, Percent_Silt, Percent_Clay, Top_Type) %>%
#   summarize_at(vars(predicted), list(
#     mean = mean,
#     sd = sd,
#     lo = ~quantile(., 0.1),
#     hi = ~quantile(., 0.9),
#     n = length
#   ))
hist_summary
How well does it do?
facet_title <- c(
  "(a) ANN-no-rock" = "Neural network (no rock)",
  "(b) RF-no-rock" = "RandomForest (no rock, no type)",
  "(c) RF-with-structure-no-rock" = "RandomForest (no rock, with type)"
)

# bind_rows(
#   histdata %>%
#     transmute(
#       mean = log(Ksat_Rosseta),
#       log_Ksat = log(Ksat),
#       q050 = log_Ksat,
#       q950 = log_Ksat,
#       model_type = "ROSETTA") %>%
#     filter_if(is.numeric, all_vars(is.finite(.))),
#   histdata %>%
#     left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay", "Top_Type")) %>%
#     mutate(log_Ksat = log(Ksat)) %>%
#     select(mean, log_Ksat, q050, q950, model_type))

p2 <- histdata %>%
  left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay", "Top_Type")) %>%
  mutate(log_Ksat = log(Ksat)) %>%
  select(mean, log_Ksat, q050, q950, model_type) %>%
  mutate(model_type = fct_recode(model_type, !!!facet_title)) %>%
  filter(model_type != "(c) RF-with-structure-no-rock") %>%
  ggplot() +
  aes(x = mean, xmin = q050, xmax = q950, y = log_Ksat) +
  geom_errorbarh(color = "grey50", size = 0.5) +
  geom_point(size = 0.7) +
  geom_smooth(method = "lm", color = "blue", alpha = 0.5, lwd = 0.5, fill = "lightblue") +
  geom_abline(linetype = "dashed") +
  facet_wrap(model_type ~ ., ncol = 1
             # , labeller = as_labeller(facet_title)
  ) +
  labs(x = expression(Predicted ~ log ~ "[" ~ K[n] ~ (cm ~ hr^{-1}) ~ "]"),
       y = expression(Observed ~ log ~ "[" ~ K ~ (cm ~ hr^{-1}) ~ "]")) +
  theme_cowplot()
print(p2)
# ggsave("Figure4-Evaluation.jpg", width = 4, height = 6, dpi = 300, units = "in")
# ggsave("Figure3.pdf", width = 4, height = 6, dpi = 300, units = "in")
# summary for Table 1
# ANS: This object doesn't exist in this Rmd document
## min(data_structure$Ksat_cmhr)
histdata %>%
  left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay", "Top_Type")) %>%
  select(Percent_Sand, Percent_Silt, Percent_Clay, Ksat, Ksat_Rosseta, mean, model_type) %>%
  mutate(Ksat_predict = exp(mean))
# %>% spread(model_type, mean)
Generate neural network and random forest predictions for the training and test data.
`randomForest` provides a `predict` method for the `randomForest` S3 class.
`urbankfs` defines a custom `predict` method for its neural network fits that is a thin wrapper around `neuralnet::compute` and that also undoes the scaling applied during fitting.
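As an illustration only (this is not the actual `urbankfs` source, and the object structure and attribute names for the scaling parameters are assumptions), such a method might look roughly like this:

```r
# Illustrative sketch only -- not the urbankfs implementation.
# Assumes the fit object stores the neuralnet model in `object$net` and the
# response centering/scaling in the (hypothetical) attributes
# "scale_center" and "scale_scale".
predict_ann_sketch <- function(object, newdata, ...) {
  # neuralnet::compute() returns the network output on the scaled scale
  scaled <- neuralnet::compute(object$net, newdata)$net.result
  # Undo the scaling that was applied before fitting
  scaled * attr(object, "scale_scale") + attr(object, "scale_center")
}
```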
Fit a linear model, `observed ~ predicted`, for each bootstrapped sample, and extract the coefficients and R^2^.
# pred_fits <- pred %>%
#   filter(data_type == "test_data") %>%
#   unnest(c(data_prep, predicted_log)) %>%
#   mutate(observed_log = log(Unsaturated_K2cm_cmhr)) %>%
#   select(sample, model_type, observed_log, predicted_log) %>%
#   # filter(is.infinite(observed_log) & is.infinite(predicted_log)) %>%
#   na.omit() %>%
#   group_by(sample, model_type) %>%
#   summarize(fit = list(lm(observed_log ~ predicted_log))) %>%
#   ungroup() %>%
#   mutate(
#     coefficients = map(fit, coefficients),
#     slope = map_dbl(coefficients, 2),
#     intercept = map_dbl(coefficients, 1),
#     r2 = map_dbl(fit, ~summary(.)[["adj.r.squared"]])
#   )
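If you want to actually run that step, a version like the following should work (a sketch: it assumes `pred` from the prediction chunk above is in memory and drops non-finite log values before fitting):

```r
# Per-sample observed ~ predicted regressions on the test data (a sketch).
pred_fits <- pred %>%
  filter(data_type == "test_data") %>%
  unnest(c(data_prep, predicted_log)) %>%
  mutate(observed_log = log(Unsaturated_K2cm_cmhr)) %>%
  select(sample, model_type, observed_log, predicted_log) %>%
  filter(is.finite(observed_log), is.finite(predicted_log)) %>%
  group_by(sample, model_type) %>%
  summarize(fit = list(lm(observed_log ~ predicted_log))) %>%
  ungroup() %>%
  mutate(
    intercept = map_dbl(fit, ~coefficients(.)[[1]]),
    slope     = map_dbl(fit, ~coefficients(.)[[2]]),
    r2        = map_dbl(fit, ~summary(.)[["adj.r.squared"]])
  )
```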
Use the linear model fits from above to generate a range of predictions of the observed data. This is mostly for plotting the linear 1:1 fits in subsequent steps.
# obs <- tibble(
#   predicted = fitted_models %>%
#     unnest(train_data) %>%
#     pull(Unsaturated_K2cm_cmhr) %>%
#     seq_range(20)
# )
#
# pred_lm <- pred_fits %>%
#   mutate(xpred = list(obs),
#          lmpred = map2(fit, xpred, predict)) %>%
#   unnest(xpred, lmpred) %>%
#   group_by(model_type, predicted) %>%
#   summarize_at(vars(lmpred), list(
#     mean = mean,
#     sd = sd,
#     lo = ~quantile(., 0.1),
#     hi = ~quantile(., 0.9)
#   ))
Summarize the neural network and random forest model predictions at each point. These will be drawn as points with horizontal error bars.
pred_summary <- pred %>%
  filter(data_type == "test_data") %>%
  unnest(c(data_prep, predicted_log)) %>%
  mutate(observed_log = log(Unsaturated_K2cm_cmhr)) %>%
  group_by(model_type, observed_log) %>%
  summarize_at(vars(predicted_log), list(
    mean = mean,
    sd = sd,
    lo = ~quantile(., 0.1),
    hi = ~quantile(., 0.9),
    n = length
  ))
Draw the observed vs. predicted scatter plot, with true 1:1 line (dashed) and the bootstrapped linear regression (blue shaded region).
fig_regression_cap <- paste0(
  "Observed vs. predicted regression for neural network and random forest models. ",
  "Dashed line is the 1:1 fit, and blue shaded region is the observed ~ predicted regression."
)

facet_title <- c(
  "Neural network (no rock)" = "(a) ANN-no-rock",
  "RandomForest (no rock, no type)" = "(b) RF-no-rock"
  # , "RandomForest (no rock, with type)" = "(c) RF-with-structure-no-rock"
)

# plot results without rock
p1 <- pred_summary %>%
  filter(model_type %in% c("Neural network (no rock)", "RandomForest (no rock, no type)")) %>%
  ggplot(aes(x = mean, y = observed_log)) +
  geom_point(size = 0.5) +
  facet_wrap(model_type ~ ., ncol = 1, labeller = as_labeller(facet_title)) +
  geom_errorbarh(aes(y = observed_log, xmin = lo, xmax = hi, x = NULL),
                 color = "gray40", size = 0.5, alpha = 0.2) +
  # geom_ribbon(
  #   data = pred_lm %>% filter(model_type %in% c("Neural network (no rock)", "RandomForest (no rock, no type)", "RandomForest (no rock, with type)")),
  #   aes(ymin = lo, ymax = hi, y = NULL),
  #   alpha = 0.5,
  #   fill = "lightblue"
  # ) +
  geom_abline(linetype = "dashed") +
  geom_smooth(method = "lm", color = "blue", se = TRUE, fill = "lightblue", alpha = 0.5, lwd = 0.5) +
  # geom_line(aes(y = observed), data = pred_lm %>%
  #             filter(model_type %in% c("Neural network (no rock)", "RandomForest (no rock, no type)", "RandomForest (no rock, with type)"))) +
  labs(x = expression(Predicted ~ log ~ "[" ~ K[n] ~ (cm ~ hr^{-1}) ~ "]"),
       y = expression(Observed ~ log ~ "[" ~ K[n] ~ (cm ~ hr^{-1}) ~ "]")) +
  xlim(-3, 3) +
  ylim(-4, 4) +
  theme_cowplot()

# evaluation dataset (n = 20)
facet_title <- c(
  "Neural network (no rock)" = "(a) ANN-no-rock",
  "RandomForest (no rock, no type)" = "(b) RF-no-rock"
  # , "RandomForest (no rock, with type)" = "(c) RF-with-structure-no-rock"
)

fig_urbandata_cap <- paste0(
  "Predicted vs. observed plot for urban data. ",
  "Dashed line is the 1:1 fit, and blue line with grey shading is a `observed ~ predicted` linear fit."
)

# plot results with rock; evaluation dataset (n = 20)
facet_title_wr <- c(
  "Neural network (with rock)" = "(c) ANN-with-rock",
  "RandomForest (with rock, no type)" = "(d) RF-with-rock"
  # , "RandomForest (with rock, with type)" = "(f) RF-with-structure-with-rock"
)

p2 <- pred_summary %>%
  filter(model_type %in% c("Neural network (with rock)", "RandomForest (with rock, no type)")) %>%
  ggplot(aes(x = mean, y = observed_log)) +
  geom_point(size = 0.5) +
  facet_wrap(model_type ~ ., ncol = 1, labeller = as_labeller(facet_title_wr)) +
  geom_errorbarh(aes(y = observed_log, xmin = lo, xmax = hi, x = NULL),
                 color = "gray40", size = 0.5, alpha = 0.2) +
  geom_abline(linetype = "dashed") +
  geom_smooth(method = "lm", color = "blue", se = TRUE, fill = "lightblue", alpha = 0.5, lwd = 0.5) +
  labs(x = expression(Predicted ~ log ~ "[" ~ K[n] ~ (cm ~ hr^{-1}) ~ "]")) +
  xlim(-3, 3) +
  ylim(-4, 4) +
  theme_cowplot() +
  theme(axis.title.y = element_blank())

plot_grid(p1, p2, rel_widths = c(1.05, 1), ncol = 2)
ggsave("Figure3_Kn.jpg", width = 8, height = 6, dpi = 300, units = "in")
# ggsave("Figure2.pdf", width = 8, height = 6, dpi = 300, units = "in")
# summary for Table 2
# pred_summary$mean <- ifelse(pred_summary$mean < 0, 0.001, pred_summary$mean)
pred_summary <- pred_summary %>%
  mutate(S_M = observed_log - mean) %>%
  filter(is.finite(S_M))
pred_summary %>% select(model_type) %>% unique()

var_modelType <- c(
  "Neural network (no rock)", "RandomForest (no rock, no type)", "RandomForest (no rock, with type)",
  "Neural network (with rock)", "RandomForest (with rock, no type)", "RandomForest (with rock, with type)"
)

evaluation <- function() {
  results <- data.frame()
  for (i in seq_along(var_modelType)) {
    subdata <- pred_summary %>% filter(model_type == var_modelType[i])
    SLR <- lm(observed_log ~ mean, data = subdata)
    inter <- summary(SLR)$coefficients[1, 1] %>% round(3)
    slope <- summary(SLR)$coefficients[2, 1] %>% round(3)
    r2 <- summary(SLR)$adj.r.squared %>% round(3)
    # Other model evaluation metrics
    E <- (sum(subdata$S_M) / length(subdata$S_M)) %>% round(3)
    d <- (1 - sum(subdata$S_M^2) /
            sum((abs(subdata$mean - mean(subdata$observed_log)) +
                   abs(subdata$observed_log - mean(subdata$observed_log)))^2)) %>% round(3)
    # EF <- 1 - sum(subdata$S_M^2) / sum((subdata$observed_log - mean(subdata$observed_log))^2)
    RMSE <- (sum(subdata$S_M^2) / length(subdata$S_M))^0.5 %>% round(3)
    p <- t.test(subdata$S_M)$p.value %>% round(3)
    results <- rbind(results, data.frame(var_modelType[i], inter, slope, r2, E, RMSE, d, p))
  }
  return(results)
}

# getwd()
results <- evaluation()
write.csv(results, "results.csv", row.names = FALSE)
results
c(exp(0.35), exp(0.29), exp(0.28), exp(0.31), exp(0.19), exp(0.19))
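For reference, the statistics computed inside `evaluation()` are, writing $O_i$ for `observed_log`, $P_i$ for the bootstrap-mean prediction `mean`, $\bar{O}$ for the mean of the observations, and $n$ for the number of points:

$$
E = \frac{1}{n}\sum_{i=1}^{n}(O_i - P_i), \qquad
\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(O_i - P_i)^2}, \qquad
d = 1 - \frac{\sum_{i=1}^{n}(O_i - P_i)^2}{\sum_{i=1}^{n}\left(|P_i - \bar{O}| + |O_i - \bar{O}|\right)^2}
$$

Here $d$ is Willmott's index of agreement, $p$ is the p-value of a one-sample t-test of the residuals $O_i - P_i$ against zero, and `inter`, `slope`, and `r2` come from the `observed_log ~ mean` regression.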
# Exploratory checks
i <- 3
subdata <- pred_summary %>% filter(model_type == var_modelType[i])
SLR <- lm(observed_log ~ mean, data = subdata)
subdata %>%
  ggplot(aes(x = mean, y = observed_log)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0)

histdata %>%
  ggplot(aes(x = Ksat_Rosseta, y = Ksat)) +
  geom_point()

pred_summary %>%
  filter(model_type %in% c("Neural network (no rock)")) %>%
  ggplot() +
  geom_point(aes(x = mean, y = observed_log), size = 0.5)

subdata <- pred_summary %>% filter(model_type %in% c("Neural network (no rock)"))
Histogram of correlation coefficients for the training and testing data.
# fig_correlation_cap <- paste0(
#   "Histogram of correlation coefficients for the training and testing data."
# )
# pred %>%
#   unnest(data_prep, predicted) %>%
#   mutate(data_type = fct_inorder(data_type) %>% fct_recode(
#     "Training" = "train_data",
#     "Testing" = "test_data"
#   )) %>%
#   group_by(model_type, sample, data_type) %>%
#   summarize(corr = cor(predicted, Unsaturated_K2cm_cmhr, method = "spearman")) %>%
#   ggplot() +
#   aes(x = corr) +
#   geom_density() +
#   facet_grid(vars(model_type), vars(data_type)) +
#   labs(x = "Correlation between prediction and data") +
#   theme_cowplot()
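A runnable version of the commented chunk above, assuming `pred` from the prediction chunk is still in memory, would look like this (a sketch, updated to the `unnest(c(...))` syntax used elsewhere in this vignette):

```r
# Spearman correlation between predictions and observations, per bootstrap
# sample, model type, and data split.
corr_summary <- pred %>%
  unnest(c(data_prep, predicted)) %>%
  mutate(data_type = fct_recode(fct_inorder(data_type),
                                "Training" = "train_data",
                                "Testing" = "test_data")) %>%
  group_by(model_type, sample, data_type) %>%
  summarize(corr = cor(predicted, Unsaturated_K2cm_cmhr, method = "spearman"))

ggplot(corr_summary) +
  aes(x = corr) +
  geom_density() +
  facet_grid(vars(model_type), vars(data_type)) +
  labs(x = "Correlation between prediction and data") +
  theme_cowplot()
```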
# summary for Table 3
hist_sum <- bind_rows(
  histdata %>%
    transmute(
      mean = log(Ksat_Rosseta),
      log_Ksat = log(Ksat),
      q050 = log_Ksat,
      q950 = log_Ksat,
      model_type = "ROSSETA"
    ) %>%
    filter_if(is.numeric, all_vars(is.finite(.))),
  histdata %>%
    left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay", "Top_Type")) %>%
    mutate(log_Ksat = log(Ksat)) %>%
    select(mean, log_Ksat, q050, q950, model_type)
)
# histdata %>%
#   left_join(hist_summary, by = c("Percent_Sand", "Percent_Silt", "Percent_Clay", "Top_Type")) %>%
#   select(Percent_Sand, Percent_Silt, Percent_Clay, Ksat, Ksat_Rosseta, mean, model_type) -> hist_sum

hist_sum <- hist_sum %>% mutate(S_M = log_Ksat - mean)
var_modelType <- unique(hist_sum$model_type)

lm(hist_sum[hist_sum$model_type == "ROSSETA", ]$log_Ksat ~
     hist_sum[hist_sum$model_type == "ROSSETA", ]$mean)

evaluation2 <- function() {
  results <- data.frame()
  for (i in seq_along(var_modelType)) {
    subdata <- hist_sum %>% filter(model_type == var_modelType[i])
    SLR <- lm(log_Ksat ~ mean, data = subdata)
    inter <- summary(SLR)$coefficients[1, 1] %>% round(3)
    slope <- summary(SLR)$coefficients[2, 1] %>% round(3)
    r2 <- summary(SLR)$adj.r.squared %>% round(3)
    # Other model evaluation metrics
    E <- (sum(subdata$S_M) / length(subdata$S_M)) %>% round(3)
    # EF <- 1 - sum(subdata$S_M^2) / sum((subdata$log_Ksat - mean(subdata$log_Ksat))^2)
    RMSE <- (sum(subdata$S_M^2) / length(subdata$S_M))^0.5 %>% round(3)
    d <- (1 - sum(subdata$S_M^2) /
            sum((abs(subdata$mean - mean(subdata$log_Ksat)) +
                   abs(subdata$log_Ksat - mean(subdata$log_Ksat)))^2)) %>% round(3)
    p <- t.test(subdata$S_M)$p.value %>% round(3)
    results <- rbind(results, data.frame(var_modelType[i], inter, slope, r2, E, RMSE, d, p))
  }
  return(results)
}

# getwd()
results2 <- evaluation2()
write.csv(results2, "results2.csv", row.names = FALSE)
results2
# test the structure group
# data_orig <- read.csv(here::here("extdata/AllCities_Victoria_RDS.csv")) %>%
#   as_tibble()
#
# type_df <- as_tibble(soil_types())
#
# data_structure <- data_orig %>%
#   select(Percent_Sand, Percent_Silt, Percent_Clay, Percent_Rock_Fragment,
#          Unsaturated_K2cm_cmhr, Type) %>%
#   filter_at(vars(-Percent_Rock_Fragment), negate(is.na)) %>%
#   mutate(Type = as.character(Type)) %>%
#   left_join(type_df, by = "Type") %>%
#   mutate(Top_Type = factor(Top_Type, soil_type_levels()))
# data_structure %>% select(Type, Top_Type) %>% filter(is.na(Top_Type)) %>% unique()
# Questions to double-check:
# 1. How many times (bootstrap samples) were the predictions repeated for the hist data?