knitr::opts_chunk$set(warning=FALSE,message=FALSE)
library(woody)
library(tidyverse)
library(DiagrammeR)

Global process

For one site

Global process

grViz("digraph boxes_and_circles {
  graph [overlap = true, fontsize = 10]
  node [shape = box, style= filled]
  wpath [fillcolor = burlywood1]
  qpath  [fillcolor = cadetblue1]
  W [fillcolor = burlywood1]
  Q  [fillcolor = cadetblue1]
  Wc [fillcolor = darkkhaki]
  Qc  [fillcolor = cadetblue1]
  Q [fillcolor = cadetblue1]
  Wwt [fillcolor = darkkhaki]
  Wwt_test [fillcolor = darkkhaki]
  myrf [fillcolor = gold]
  Wp [fillcolor = darkolivegreen2]

  node [shape = ellipse,fixedsize = false,color = grey, style= filled, fillcolor=grey90]
  'import_Wdata()'
  'import_Qdata()'
  'complete_Qdata()' 
  'complete_Wdata_with_Qdata()'
  'Wdata_as_waiting_times()'
  'run_rf()'
  'predict_rf()'

  # several 'edge' statements
  wpath -> 'import_Wdata()' [arrowhead = none]
  'import_Wdata()'-> W 
  qpath -> 'import_Qdata()' [arrowhead = none]
  'import_Qdata()'-> Q 
  Q -> 'complete_Qdata()' [arrowhead=none]
  'complete_Qdata()' -> Qc 
  W -> 'complete_Wdata_with_Qdata()' [arrowhead=none]
  Qc-> 'complete_Wdata_with_Qdata()' [arrowhead=none]
  'complete_Wdata_with_Qdata()' -> Wc
  Wc -> 'Wdata_as_waiting_times()' [arrowhead=none]
  'Wdata_as_waiting_times()' -> Wwt
  Wwt -> 'run_rf()' [arrowhead = none]
  'run_rf()' -> myrf 
  myrf -> 'predict_rf()' [arrowhead = none]
  'Wwt_test' -> 'predict_rf()' [arrowhead = none]
  'predict_rf()' -> 'Wp' 
}")

For several sites

Here, we want to carry out this process for two sites, so that we use and enrich a nested table from step to step in the following manner:

library(DiagrammeR)
grViz("
digraph boxes_and_circles {
    graph [fontsize=10 compound=true overlap=true,  labeljust=r];
    node [shape=egg fontsize=10, style=filled]
    // all nodes related to Q
    node [fillcolor=cadetblue1, label='Qc[1,2]']
    'Q' [label='Q[1,2]']
    'Qc' 'Qc5'  'Qc6' 'Qc7'
    // all nodes related to W
    node [fillcolor=burlywood1, label='W[1,2]']
    'W' 'W3' 'W4'
    // all nodes related to W+Qnode 
    node [fillcolor = darkkhaki]
    'Wc' [label='Wc[1,2]']
    'Wwt' [label= 'Wwt[1,2]']
    'Wp' [label= 'Wp[1,2]', fillcolor = darkolivegreen2]


    node[shape=box]
    myrf [fillcolor = gold, label='myrf']
    'Wwt_out' [label='Wwt']

    // all nodes related to descriptors
    node [shape=egg, fillcolor=ivory, label='descriptors[1,2]']
    'descriptors'    'descriptors2'    'descriptors3' 'descriptors4'


    subgraph cluster_1 {'descriptors';label='tib_sites'}
    subgraph cluster_2 {'descriptors2' 'W'; label='tib_W'}
    subgraph cluster_3 {'descriptors3' 'W3' 'Q' ; label='tib_WQ'}
    subgraph cluster_4 {'descriptors4' 'W4' 'Qc'; label='tib_WQc'}
    subgraph cluster_5 {'descriptors5' 'Wc' 'Qc5'; label='tib_WcQc'}
    subgraph cluster_6 {'descriptors6' 'Wwt', 'Qc6'; label='tib_WwtQc'}
    subgraph cluster_7 {'descriptors7' 'Wp', 'Qc7'; label='tib_WpQc'}


    edge[color=grey]
    'descriptors' -> 'descriptors2' -> 'descriptors3' -> 'descriptors4' -> 'descriptors5' -> 'descriptors6' ->'descriptors7'
    'descriptors'  -> 'W' -> 'W3' -> 'W4' -> 'Wc' -> 'Wwt' 
    'descriptors2' -> 'Q' -> 'Qc' -> 'Qc5' -> 'Qc6' -> 'Qc7'
    'Qc' -> 'Wc'
    'Wwt' -> 'Wwt_out' -> 'myrf'
    'Wwt','myrf' -> 'Wp'
 }
")

We detail each step of this process afterwards.

Prepare data

tib_sites: info about wood and discharge data path, and hydrological stations

Prepare data describing the two stations:

tib_sites=tibble::tibble(site=c("Ain","Allier"),
                         q1.5=c(840,460),
                         station=c("V294201001 ","K340081001")) %>% 
  mutate(wpath=paste0("../data-raw/Wdata_",site),
         qpath=paste0("../data-raw/qtvar/q",site,".csv"))
tib_sites

Wdata : wood occurrence data

First step: import wood data , indicating path to files and site name:

We will obtain as a result tib_W= tib_sites completed with Wdata.

result_file="../data-raw/results/tib_W.RDS"
if(!file.exists(result_file)){
  tib_W=tib_sites %>% 
    group_by(vars=site,q1.5,station,wpath) %>% 
    mutate(Wdata=purrr::map2(.x=wpath,.y=site,
                             ~import_Wdata(path=.x,site=.y,
                                           min_length=1,
                                           sample_length=TRUE
                                           ))) %>% 
    mutate(Wdata=purrr::map(.x=Wdata,
                            ~mutate(.x,site=factor(site,levels=tib_sites$site))))
  saveRDS(tib_W,result_file)
}
tib_W=readRDS(result_file)

tib_W looks like this:

tib_W_look=tib_W %>%
  mutate(Wdata=purrr::map(Wdata,~paste0(nrow(.x)," rows x ",ncol(.x)," cols"))) %>%
  unnest(cols=c(Wdata))
knitr::kable(tib_W_look)

and inside tib_W, the first lines of Wdata (for both sites) look like this (here for the Ain site):

knitr::kable(head(tib_W$Wdata[[1]]))

Qdata: collect hydrological data & calculate all variables

Collect qtvar data from banquehydro, for the period covered by Wdata, and back in time so as to be able to calculate $T_Q$:

result_file="../data-raw/results/tib_WQ.RDS"
if(!file.exists(result_file)){
  tib_WQ=tib_W %>% 
    mutate(Qdata=purrr::map(.x=qpath,.y=site,
                            ~import_Qdata(path=.x, site=.y))) 
  saveRDS(tib_WQ,result_file)
}
tib_WQ=readRDS(result_file)

tib_WQ looks like this:

tib_WQ_look=tib_WQ %>%
  mutate(Wdata=purrr::map(Wdata,~paste0(nrow(.x)," rows x ",ncol(.x)," cols"))) %>%
  mutate(Qdata=purrr::map(Qdata,~paste0(nrow(.x)," rows x ",ncol(.x)," cols"))) %>% 
  unnest(cols=c(Wdata, Qdata))
knitr::kable(tib_WQ_look)

and inside tib_WQ, the first lines of Qdata (for both sites) look like this:

head(tib_WQ$Qdata[[1]])

Qdata completed with calculation of $T_Q$ and $S$

result_file="../data-raw/results/tib_WQc.RDS"
if(!file.exists(result_file)){
  tib_WQc=tib_WQ %>% 
    mutate(Qdata=purrr::map2(.x=Qdata,.y=q1.5,
                             ~complete_Qdata(qtvar=.x, qnorm=.y)))
  saveRDS(tib_WQc,result_file)
}
tib_WQc=readRDS(result_file)

tib_WQc looks like tib_WQ, except now Qdata inside has been completed with new variables:

head(tib_WQc$Qdata[[1]])

Wdata completed with discharge data

result_file="../data-raw/results/tib_WcQc.RDS"
if(!file.exists(result_file)){
  tib_WcQc=tib_WQc %>% 
    mutate(Wdata=purrr::map2(.x=Wdata,.y=Qdata,
                             ~complete_Wdata_with_Qdata(Wdata=.x,Qdata=.y)))
  saveRDS(tib_WcQc,result_file)
}
tib_WcQc=readRDS(result_file)

Inside tib_WcQc, Wdata has been updated into:

head(tib_WcQc$Wdata[[1]])

Calculate waiting times to apply random forest model

We then calculate tib_Wwt, updating Wdata so that 1 row= 1 waiting time between two wood occurrences.

result_file="../data-raw/results/tib_WwtQc.RDS"
if(!file.exists(result_file)){
  tib_WwtQc=tib_WcQc %>% 
    mutate(Wdata=purrr::map(.x=Wdata,~Wdata_as_waiting_times(.x)))
  saveRDS(tib_WwtQc,result_file)
}
tib_WwtQc=readRDS(result_file)

Inside tib_Wwt, Wdata now looks like this (for both sites):

head(tib_WwtQc$Wdata[[1]])

Prediction

Run random forest on Wdata (both sites)

We run one random forest for both sites. Site is one of the predictors anyway.

# get a single table for Wdata (waiting times)
Wwt=tib_WwtQc %>%
  ungroup() %>% 
  select(Wdata) %>% 
  tidyr::unnest(Wdata)

file_result="../data-raw/results/rf.RDS"
if(!file.exists(file_result)){
  myrf <- run_rf(Wwt,
                 pred_vars=c("Q","S","rT_Q","site"))
  saveRDS(myrf,file_result)
}
myrf=readRDS(file_result)

Predict Wdata based on random forest

tib_WpQc=tib_WwtQc %>% 
  mutate(Wdata=purrr::map(.x=Wdata,~predict_rf(newdata=.x,obj_rf=myrf)))

saveRDS(tib_WpQc,"../data-raw/results/tib_WpQc.RDS")

Model performance

Now let's assess the model's predictive performance

For both sites:

R2=tib_WpQc %>% select(site,Wdata) %>% 
  mutate(R2=purrr::map_df(Wdata,calc_rf_R2)) %>% 
  unnest(R2)
R2

Plot predicted/observed

Wdata_pred=tib_WpQc %>% 
  select(Wdata) %>% 
  tidyr::unnest(Wdata, .name_repair="minimal") %>% 
  ungroup() %>% 
  mutate(Ycat=cut(Y,quantile(Y,seq(0,1,by=0.1),include.lowest=TRUE)))

p=ggplot(Wdata_pred,aes(x=Y_pred,y=Y))+
  geom_point(alpha=0.05)+
  geom_abline(intercept=0,slope=1, col="red")+
  facet_grid(cols=vars(site),
             labeller = labeller(.rows = label_both,
                                 .cols = label_both))+
  geom_smooth(method="lm")
plot(p)

Correction of prediction bias linked to covariate shift

Due to covariate shift, there is a tendency to overestimate low values of Y and underestimate high values of Y (trend in blue, theoretical Y=X line in red).

$$Y=\alpha*Y_{pred}+\beta$$

We will apply a simple linear correction of predictions to account for this in the predict_rf function (arguments)

lm_mod=lm(Y~Y_pred, data=Wdata_pred)
alpha=lm_mod$coefficients[[2]]
beta=lm_mod$coefficients[[1]]
saveRDS(c(alpha,beta),"../data-raw/correction_covariate_shift.RDS")

So we correct $Y_{pred}$ values accordingly

tib_WpQc=tib_WwtQc %>% 
  mutate(Wdata=purrr::map(.x=Wdata,
                          ~predict_rf(newdata=.x,
                                      obj_rf=myrf,
                                      correction=c(alpha,beta))))

saveRDS(tib_WpQc,"../data-raw/results/tib_WpQc.RDS")
R2=tib_WpQc %>% select(site,Wdata) %>% 
  mutate(R2=purrr::map_df(Wdata,calc_rf_R2)) %>% 
  unnest(R2)
R2
Wdata_pred=tib_WpQc %>% 
  select(Wdata) %>% 
  tidyr::unnest(Wdata, .name_repair="minimal") %>% 
  ungroup() %>% 
  mutate(Ycat=cut(Y,quantile(Y,seq(0,1,by=0.1),include.lowest=TRUE)))

p=ggplot(Wdata_pred,aes(x=Y_pred,y=Y))+
  geom_point(alpha=0.05)+
  geom_abline(intercept=0,slope=1, col="red")+
  facet_grid(cols=vars(site),
             labeller = labeller(.rows = label_both,
                                 .cols = label_both))+
  geom_smooth(method="lm")
plot(p)


lvaudor/woody documentation built on March 22, 2022, 9:53 a.m.