In lvaudor/woody: Use wood occurrence data and discharge data for wood flux prediction

knitr::opts_chunk$set(warning=FALSE,message=FALSE)
library(woody)
library(tidyverse)
library(DiagrammeR)

Global process

For one site

Global process

grViz("digraph boxes_and_circles {
  graph [overlap = true, fontsize = 10]
  node [shape = box, style= filled]
  wpath [fillcolor = burlywood1]
  qpath  [fillcolor = cadetblue1]
  W [fillcolor = burlywood1]
  Q  [fillcolor = cadetblue1]
  Wc [fillcolor = darkkhaki]
  Qc  [fillcolor = cadetblue1]
  Q [fillcolor = cadetblue1]
  Wwt [fillcolor = darkkhaki]
  Wwt_test [fillcolor = darkkhaki]
  myrf [fillcolor = gold]
  Wp [fillcolor = darkolivegreen2]

  node [shape = ellipse,fixedsize = false,color = grey, style= filled, fillcolor=grey90]
  'import_Wdata()'
  'import_Qdata()'
  'complete_Qdata()' 
  'complete_Wdata_with_Qdata()'
  'Wdata_as_waiting_times()'
  'run_rf()'
  'predict_rf()'

  # several 'edge' statements
  wpath -> 'import_Wdata()' [arrowhead = none]
  'import_Wdata()'-> W 
  qpath -> 'import_Qdata()' [arrowhead = none]
  'import_Qdata()'-> Q 
  Q -> 'complete_Qdata()' [arrowhead=none]
  'complete_Qdata()' -> Qc 
  W -> 'complete_Wdata_with_Qdata()' [arrowhead=none]
  Qc-> 'complete_Wdata_with_Qdata()' [arrowhead=none]
  'complete_Wdata_with_Qdata()' -> Wc
  Wc -> 'Wdata_as_waiting_times()' [arrowhead=none]
  'Wdata_as_waiting_times()' -> Wwt
  Wwt -> 'run_rf()' [arrowhead = none]
  'run_rf()' -> myrf 
  myrf -> 'predict_rf()' [arrowhead = none]
  'Wwt_test' -> 'predict_rf()' [arrowhead = none]
  'predict_rf()' -> 'Wp' 
}")

For several sites

Here, we want to carry out this process for two sites, so that we use and enrich a nested table from step to step in the following manner:

library(DiagrammeR)
grViz("
digraph boxes_and_circles {
    graph [fontsize=10 compound=true overlap=true,  labeljust=r];
    node [shape=egg fontsize=10, style=filled]
    // all nodes related to Q
    node [fillcolor=cadetblue1, label='Qc[1,2]']
    'Q' [label='Q[1,2]']
    'Qc' 'Qc5'  'Qc6' 'Qc7'
    // all nodes related to W
    node [fillcolor=burlywood1, label='W[1,2]']
    'W' 'W3' 'W4'
    // all nodes related to W+Qnode 
    node [fillcolor = darkkhaki]
    'Wc' [label='Wc[1,2]']
    'Wwt' [label= 'Wwt[1,2]']
    'Wp' [label= 'Wp[1,2]', fillcolor = darkolivegreen2]


    node[shape=box]
    myrf [fillcolor = gold, label='myrf']
    'Wwt_out' [label='Wwt']

    // all nodes related to descriptors
    node [shape=egg, fillcolor=ivory, label='descriptors[1,2]']
    'descriptors'    'descriptors2'    'descriptors3' 'descriptors4'


    subgraph cluster_1 {'descriptors';label='tib_sites'}
    subgraph cluster_2 {'descriptors2' 'W'; label='tib_W'}
    subgraph cluster_3 {'descriptors3' 'W3' 'Q' ; label='tib_WQ'}
    subgraph cluster_4 {'descriptors4' 'W4' 'Qc'; label='tib_WQc'}
    subgraph cluster_5 {'descriptors5' 'Wc' 'Qc5'; label='tib_WcQc'}
    subgraph cluster_6 {'descriptors6' 'Wwt', 'Qc6'; label='tib_WwtQc'}
    subgraph cluster_7 {'descriptors7' 'Wp', 'Qc7'; label='tib_WpQc'}


    edge[color=grey]
    'descriptors' -> 'descriptors2' -> 'descriptors3' -> 'descriptors4' -> 'descriptors5' -> 'descriptors6' ->'descriptors7'
    'descriptors'  -> 'W' -> 'W3' -> 'W4' -> 'Wc' -> 'Wwt' 
    'descriptors2' -> 'Q' -> 'Qc' -> 'Qc5' -> 'Qc6' -> 'Qc7'
    'Qc' -> 'Wc'
    'Wwt' -> 'Wwt_out' -> 'myrf'
    'Wwt','myrf' -> 'Wp'
 }
")

We detail each step of this process afterwards.

Prepare data

tib_sites: info about wood and discharge data path, and hydrological stations

Prepare data describing the two stations:

tib_sites=tibble::tibble(site=c("Ain","Allier"),
                         q1.5=c(840,460),
                         station=c("V294201001 ","K340081001")) %>% 
  mutate(wpath=paste0("../data-raw/Wdata_",site),
         qpath=paste0("../data-raw/qtvar/q",site,".csv"))
tib_sites

`Wdata` : wood occurrence data

First step: import wood data , indicating path to files and site name:

We will obtain as a result tib_W= tib_sites completed with Wdata.

result_file="../data-raw/results/tib_W.RDS"
if(!file.exists(result_file)){
  tib_W=tib_sites %>% 
    group_by(vars=site,q1.5,station,wpath) %>% 
    mutate(Wdata=purrr::map2(.x=wpath,.y=site,
                             ~import_Wdata(path=.x,site=.y,
                                           min_length=1,
                                           sample_length=TRUE
                                           ))) %>% 
    mutate(Wdata=purrr::map(.x=Wdata,
                            ~mutate(.x,site=factor(site,levels=tib_sites$site))))
  saveRDS(tib_W,result_file)
}
tib_W=readRDS(result_file)

tib_W looks like this:

tib_W_look=tib_W %>%
  mutate(Wdata=purrr::map(Wdata,~paste0(nrow(.x)," rows x ",ncol(.x)," cols"))) %>%
  unnest(cols=c(Wdata))
knitr::kable(tib_W_look)

and inside tib_W, the first lines of Wdata (for both sites) look like this (here for the Ain site):

knitr::kable(head(tib_W$Wdata[[1]]))

`Qdata`: collect hydrological data & calculate all variables

Collect qtvar data from banquehydro, for the period covered by Wdata, and back in time so as to be able to calculate $T_Q$:

result_file="../data-raw/results/tib_WQ.RDS"
if(!file.exists(result_file)){
  tib_WQ=tib_W %>% 
    mutate(Qdata=purrr::map(.x=qpath,.y=site,
                            ~import_Qdata(path=.x, site=.y))) 
  saveRDS(tib_WQ,result_file)
}
tib_WQ=readRDS(result_file)

tib_WQ looks like this:

tib_WQ_look=tib_WQ %>%
  mutate(Wdata=purrr::map(Wdata,~paste0(nrow(.x)," rows x ",ncol(.x)," cols"))) %>%
  mutate(Qdata=purrr::map(Qdata,~paste0(nrow(.x)," rows x ",ncol(.x)," cols"))) %>% 
  unnest(cols=c(Wdata, Qdata))
knitr::kable(tib_WQ_look)

and inside tib_WQ, the first lines of Qdata (for both sites) look like this:

head(tib_WQ$Qdata[[1]])

`Qdata` completed with calculation of $T_Q$ and $S$

result_file="../data-raw/results/tib_WQc.RDS"
if(!file.exists(result_file)){
  tib_WQc=tib_WQ %>% 
    mutate(Qdata=purrr::map2(.x=Qdata,.y=q1.5,
                             ~complete_Qdata(qtvar=.x, qnorm=.y)))
  saveRDS(tib_WQc,result_file)
}
tib_WQc=readRDS(result_file)

tib_WQc looks like tib_WQ, except now Qdata inside has been completed with new variables:

head(tib_WQc$Qdata[[1]])

`Wdata` completed with discharge data

result_file="../data-raw/results/tib_WcQc.RDS"
if(!file.exists(result_file)){
  tib_WcQc=tib_WQc %>% 
    mutate(Wdata=purrr::map2(.x=Wdata,.y=Qdata,
                             ~complete_Wdata_with_Qdata(Wdata=.x,Qdata=.y)))
  saveRDS(tib_WcQc,result_file)
}
tib_WcQc=readRDS(result_file)

Inside tib_WcQc, Wdata has been updated into:

head(tib_WcQc$Wdata[[1]])

Calculate waiting times to apply random forest model

We then calculate tib_Wwt, updating Wdata so that 1 row= 1 waiting time between two wood occurrences.

result_file="../data-raw/results/tib_WwtQc.RDS"
if(!file.exists(result_file)){
  tib_WwtQc=tib_WcQc %>% 
    mutate(Wdata=purrr::map(.x=Wdata,~Wdata_as_waiting_times(.x)))
  saveRDS(tib_WwtQc,result_file)
}
tib_WwtQc=readRDS(result_file)

Inside tib_Wwt, Wdata now looks like this (for both sites):

head(tib_WwtQc$Wdata[[1]])

Prediction

Run random forest on Wdata (both sites)

We run one random forest for both sites. Site is one of the predictors anyway.

# get a single table for Wdata (waiting times)
Wwt=tib_WwtQc %>%
  ungroup() %>% 
  select(Wdata) %>% 
  tidyr::unnest(Wdata)

file_result="../data-raw/results/rf.RDS"
if(!file.exists(file_result)){
  myrf <- run_rf(Wwt,
                 pred_vars=c("Q","S","rT_Q","site"))
  saveRDS(myrf,file_result)
}
myrf=readRDS(file_result)

Predict Wdata based on random forest

tib_WpQc=tib_WwtQc %>% 
  mutate(Wdata=purrr::map(.x=Wdata,~predict_rf(newdata=.x,obj_rf=myrf)))

saveRDS(tib_WpQc,"../data-raw/results/tib_WpQc.RDS")

Model performance

Now let's assess the model's predictive performance

For both sites:

R2=tib_WpQc %>% select(site,Wdata) %>% 
  mutate(R2=purrr::map_df(Wdata,calc_rf_R2)) %>% 
  unnest(R2)
R2

Plot predicted/observed

Wdata_pred=tib_WpQc %>% 
  select(Wdata) %>% 
  tidyr::unnest(Wdata, .name_repair="minimal") %>% 
  ungroup() %>% 
  mutate(Ycat=cut(Y,quantile(Y,seq(0,1,by=0.1),include.lowest=TRUE)))

p=ggplot(Wdata_pred,aes(x=Y_pred,y=Y))+
  geom_point(alpha=0.05)+
  geom_abline(intercept=0,slope=1, col="red")+
  facet_grid(cols=vars(site),
             labeller = labeller(.rows = label_both,
                                 .cols = label_both))+
  geom_smooth(method="lm")
plot(p)

Correction of prediction bias linked to covariate shift

Due to covariate shift, there is a tendency to overestimate low values of Y and underestimate high values of Y (trend in blue, theoretical Y=X line in red).

$$Y=\alpha*Y_{pred}+\beta$$

We will apply a simple linear correction of predictions to account for this in the predict_rf function (arguments)

lm_mod=lm(Y~Y_pred, data=Wdata_pred)
alpha=lm_mod$coefficients[[2]]
beta=lm_mod$coefficients[[1]]
saveRDS(c(alpha,beta),"../data-raw/correction_covariate_shift.RDS")

So we correct $Y_{pred}$ values accordingly

tib_WpQc=tib_WwtQc %>% 
  mutate(Wdata=purrr::map(.x=Wdata,
                          ~predict_rf(newdata=.x,
                                      obj_rf=myrf,
                                      correction=c(alpha,beta))))

saveRDS(tib_WpQc,"../data-raw/results/tib_WpQc.RDS")

R2=tib_WpQc %>% select(site,Wdata) %>% 
  mutate(R2=purrr::map_df(Wdata,calc_rf_R2)) %>% 
  unnest(R2)
R2

Wdata_pred=tib_WpQc %>% 
  select(Wdata) %>% 
  tidyr::unnest(Wdata, .name_repair="minimal") %>% 
  ungroup() %>% 
  mutate(Ycat=cut(Y,quantile(Y,seq(0,1,by=0.1),include.lowest=TRUE)))

p=ggplot(Wdata_pred,aes(x=Y_pred,y=Y))+
  geom_point(alpha=0.05)+
  geom_abline(intercept=0,slope=1, col="red")+
  facet_grid(cols=vars(site),
             labeller = labeller(.rows = label_both,
                                 .cols = label_both))+
  geom_smooth(method="lm")
plot(p)

lvaudor/woody documentation built on March 22, 2022, 9:53 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

lvaudor/woody
Use wood occurrence data and discharge data for wood flux prediction

In lvaudor/woody: Use wood occurrence data and discharge data for wood flux prediction

Global process

For one site

Global process

For several sites

Prepare data

tib_sites: info about wood and discharge data path, and hydrological stations

`Wdata` : wood occurrence data

`Qdata`: collect hydrological data & calculate all variables

`Qdata` completed with calculation of $T_Q$ and $S$

`Wdata` completed with discharge data

Calculate waiting times to apply random forest model

Prediction

Run random forest on Wdata (both sites)

Predict Wdata based on random forest

Model performance

Plot predicted/observed

Correction of prediction bias linked to covariate shift

R Package Documentation

Browse R Packages

We want your feedback!

lvaudor/woody Use wood occurrence data and discharge data for wood flux prediction

In lvaudor/woody: Use wood occurrence data and discharge data for wood flux prediction

Global process

For one site

Global process

For several sites

Prepare data

tib_sites: info about wood and discharge data path, and hydrological stations

Wdata : wood occurrence data

Qdata: collect hydrological data & calculate all variables

Qdata completed with calculation of $T_Q$ and $S$

Wdata completed with discharge data

Calculate waiting times to apply random forest model

Prediction

Run random forest on Wdata (both sites)

Predict Wdata based on random forest

Model performance

Plot predicted/observed

Correction of prediction bias linked to covariate shift

R Package Documentation

Browse R Packages

We want your feedback!

lvaudor/woody
Use wood occurrence data and discharge data for wood flux prediction

`Wdata` : wood occurrence data

`Qdata`: collect hydrological data & calculate all variables

`Qdata` completed with calculation of $T_Q$ and $S$

`Wdata` completed with discharge data