knitr::opts_chunk$set(dpi=100, out.width="700px", out.height="500px")
library(apsimx)
library(ggplot2)
library(lubridate)
library(dplyr)
library(mgcv)
library(tidyr)
library(caret)
library(ggplot2)
library(apsimo)
library(gganimate)

Computer model emulation

Statistical models can be used to emulate (approximate) complex computer models, such as APSIM.

What is online prediction?

Predict the $i^{th}$ observation by modeling the previous $i-1$ observations, then update the model and predict the $(i+1)^{th}$ observation.

What is online prediction?

#Load datasets from package
data("training")
data("testing")

ggplot(data = (testing %>% filter(year == 2005))[120:275,]) +
    geom_line(aes(x=Date, y=Maize.AboveGround.Wt)) + 
    geom_line(aes(x=Date, y=online_emulate_maize(training, testing %>% filter(year == 2005), pred_var = "Maize.AboveGround.Wt", method = "ar", pred_type = "online_total_local")[120:275]), col = "red", linetype = "dashed") +
  transition_reveal(yday)

Why study online prediction for APSIM?

It may seem silly to predict the next observation using a statistical model when we are only running a single APSIM simulation...

But, online prediction can be seen as a first step toward full emulation of APSIM.

  1. Accurate online prediction knowing the next day's weather
  2. Online prediction without knowing the next day's weather (imputed or naive)
  3. Predict next week or month of APSIM output
  4. Fully predict APSIM output using a statistical model (emulation)
  5. Use APSIM emulator to run large-scale simulations more efficiently than APSIM

How does this connect to my research?

I work for the National Resources Inventory, which conducts land-use / erosion monitoring through complex geographical survey techniques over all non-Federal lands in the United States.

  knitr::include_graphics("/Users/labuzzetta/Downloads/NRI.png")

APSIM Setup

I used a simple APSIM simulation, as learning the basics of APSIM was one of my goals for this project.

Simulation --> R

Covariates:

data("simulation")
print(colnames(simulation[c(5:11,22)]))

Simulation --> R

Outputs:

data("simulation")
print(colnames(simulation[c(12:21)]))

Training Data

data("simulation")
report <- simulation

#Training Data Yield Figure
ggplot(data = report %>%
         filter(year(Date) < 2005) %>%
         mutate(Year = as.factor(lubridate::year(Date)),
                `Above Ground Biomass (g/m^2)` = Maize.AboveGround.Wt,
                `Day of Year` = lubridate::yday(Date)),
       aes(x = `Day of Year`, y = `Above Ground Biomass (g/m^2)`, col = Year)) + 
  geom_line()

Testing Data

data("simulation")
report <- simulation

#Testing and Validation Data Yield Figure
ggplot(data = report %>%
         filter(year(Date) >= 2005) %>%
         mutate(Year = as.factor(lubridate::year(Date)),
                `Above Ground Biomass (g/m^2)` = Maize.AboveGround.Wt,
                `Day of Year` = lubridate::yday(Date)),
       aes(x = `Day of Year`, y = `Above Ground Biomass (g/m^2)`, col = Year)) + 
  geom_line()

Algorithm

  1. Has the crop been sowed? then continue (Else predict 0)
  2. Is the crop still growing? then continue (Else predict 0)
  3. Update model on $i-1$ observations
  4. Is the crop ripe? then harvest and predict 0
  5. If crop is not ripe, predict the $i^{th}$ observation using model
  6. Retrieve the true observation for $i$

Trial 1 - Linear Model (LM)

data("testing")
data("training")
trial1 <- apsimo::online_emulate_maize(training, testing, pred_var = "Maize.AboveGround.Wt", pred_type = "online_total_full", method = "lm")
`Maize AboveGround Wt` <- trial1
ggplot(data = testing) +
  geom_line(aes(x=Date, y=`Maize AboveGround Wt`), col = "red", linetype = "dashed") + geom_line(aes(x=Date, y=Maize.AboveGround.Wt))

Linear Model Assumptions

  1. Independence between observations
  2. Normally distributed residuals
  3. Constant variance
  4. Mean of responses are linear combination of covariates

Definitely violate the independence assumption... why?

The residuals here are also ugly.

Trial 2 - LM on daily change

data("testing")
data("training")
trial2 <- apsimo::online_emulate_maize(training, testing, pred_var = "Maize.AboveGround.Wt", pred_type = "online_change_full", method = "lm")
ggplot(data = testing) +
  geom_line(aes(x=Date, y=Maize.AboveGround.Wt)) +
  geom_line(aes(x=Date, y=trial2 %>% purrr::accumulate(`+`)), col = "red", linetype = "dashed")

Trial 3 - Autoregressive Model

data("testing")
data("training")
trial3 <- apsimo::online_emulate_maize(training, testing, pred_var = "Maize.AboveGround.Wt", pred_type = "online_total_full", method = "ar")
ggplot(data = testing) +
  geom_line(aes(x=Date, y=Maize.AboveGround.Wt)) +
  geom_line(aes(x=Date, y=trial3), col = "red", linetype = "dashed")

Trial 4 - Gen. Additive Model (GAM)

data("testing")
data("training")
trial4 <- apsimo::online_emulate_maize(training, testing, pred_var = "Maize.AboveGround.Wt", pred_type = "online_total_full", method = "gam")
ggplot(data = testing) +
  geom_line(aes(x=Date, y=Maize.AboveGround.Wt)) +
  geom_line(aes(x=Date, y=trial4), col = "red", linetype = "dashed")

Trial 5 - Local GAM

data("testing")
data("training")
trial5 <- apsimo::online_emulate_maize(training, testing, pred_var = "Maize.AboveGround.Wt", pred_type = "online_total_local", method = "gam")
ggplot(data = testing) +
  geom_line(aes(x=Date, y=Maize.AboveGround.Wt)) +
  geom_line(aes(x=Date, y=trial5), col = "red", linetype = "dashed")

Trial 6 - Random Forest

data("testing")
data("training")
trial6 <- apsimo::online_emulate_maize(training, testing, pred_var = "Maize.AboveGround.Wt", pred_type = "no_update", method = "rf")
ggplot(data = testing) +
  geom_line(aes(x=Date, y=Maize.AboveGround.Wt)) +
  geom_line(aes(x=Date, y=trial6), col = "red", linetype = "dashed")

Results

methods <- c("LM: total", "LM: daily change", 
             "AR", "GAM: full", "GAM: local", "RF")
rmse <- c(RMSE(trial1, testing$Maize.AboveGround.Wt), 
          RMSE(trial2 %>% purrr::accumulate(`+`), testing$Maize.AboveGround.Wt), 
          RMSE(trial3, testing$Maize.AboveGround.Wt), 
          RMSE(trial4, testing$Maize.AboveGround.Wt), 
          RMSE(trial5, testing$Maize.AboveGround.Wt), 
          RMSE(trial6, testing$Maize.AboveGround.Wt))
knitr::kable(cbind(methods, rmse=trunc(rmse)))

Others: Transpiration (Local GAM)

data("testing")
data("training")
trial7 <- apsimo::online_emulate_maize(training, testing, pred_var = "Maize.Leaf.Transpiration", pred_type = "online_total_local", method = "gam")
ggplot(data = testing[1:365,]) +
  geom_line(aes(x=Date, y=Maize.Leaf.Transpiration)) +
  geom_line(aes(x=Date, y=trial7[1:365]), col = "red", linetype = "dashed")

Others: Runoff (RF)

data("testing")
data("training")
trial7 <- apsimo::online_emulate_maize(training, testing, pred_var = "Soil.SoilWater.Runoff", pred_type = "no_update", method = "rf")
ggplot(data = testing) +
  geom_point(aes(x=Date, y=Soil.SoilWater.Runoff)) +
  geom_point(aes(x=Date, y=trial7), col = "red")

Conclusions

Online prediction can be used to approximate daily APSIM output.

Future Directions

Complex techniques might be needed to fully emulate APSIM



labuzzetta/apsimo documentation built on Jan. 13, 2020, 4 p.m.