knitr::opts_chunk$set(warning=FALSE,message=FALSE) library(woody) library(tidyverse) library(DiagrammeR)
grViz("digraph boxes_and_circles { graph [overlap = true, fontsize = 10] node [shape = box, style= filled] wpath [fillcolor = burlywood1] qpath [fillcolor = cadetblue1] W [fillcolor = burlywood1] Q [fillcolor = cadetblue1] Wc [fillcolor = darkkhaki] Qc [fillcolor = cadetblue1] Q [fillcolor = cadetblue1] Wwt [fillcolor = darkkhaki] Wwt_test [fillcolor = darkkhaki] myrf [fillcolor = gold] Wp [fillcolor = darkolivegreen2] node [shape = ellipse,fixedsize = false,color = grey, style= filled, fillcolor=grey90] 'import_Wdata()' 'import_Qdata()' 'complete_Qdata()' 'complete_Wdata_with_Qdata()' 'Wdata_as_waiting_times()' 'run_rf()' 'predict_rf()' # several 'edge' statements wpath -> 'import_Wdata()' [arrowhead = none] 'import_Wdata()'-> W qpath -> 'import_Qdata()' [arrowhead = none] 'import_Qdata()'-> Q Q -> 'complete_Qdata()' [arrowhead=none] 'complete_Qdata()' -> Qc W -> 'complete_Wdata_with_Qdata()' [arrowhead=none] Qc-> 'complete_Wdata_with_Qdata()' [arrowhead=none] 'complete_Wdata_with_Qdata()' -> Wc Wc -> 'Wdata_as_waiting_times()' [arrowhead=none] 'Wdata_as_waiting_times()' -> Wwt Wwt -> 'run_rf()' [arrowhead = none] 'run_rf()' -> myrf myrf -> 'predict_rf()' [arrowhead = none] 'Wwt_test' -> 'predict_rf()' [arrowhead = none] 'predict_rf()' -> 'Wp' }")
Here, we want to carry out this process for two sites, so that we use and enrich a nested table from step to step in the following manner:
library(DiagrammeR) grViz(" digraph boxes_and_circles { graph [fontsize=10 compound=true overlap=true, labeljust=r]; node [shape=egg fontsize=10, style=filled] // all nodes related to Q node [fillcolor=cadetblue1, label='Qc[1,2]'] 'Q' [label='Q[1,2]'] 'Qc' 'Qc5' 'Qc6' 'Qc7' // all nodes related to W node [fillcolor=burlywood1, label='W[1,2]'] 'W' 'W3' 'W4' // all nodes related to W+Qnode node [fillcolor = darkkhaki] 'Wc' [label='Wc[1,2]'] 'Wwt' [label= 'Wwt[1,2]'] 'Wp' [label= 'Wp[1,2]', fillcolor = darkolivegreen2] node[shape=box] myrf [fillcolor = gold, label='myrf'] 'Wwt_out' [label='Wwt'] // all nodes related to descriptors node [shape=egg, fillcolor=ivory, label='descriptors[1,2]'] 'descriptors' 'descriptors2' 'descriptors3' 'descriptors4' subgraph cluster_1 {'descriptors';label='tib_sites'} subgraph cluster_2 {'descriptors2' 'W'; label='tib_W'} subgraph cluster_3 {'descriptors3' 'W3' 'Q' ; label='tib_WQ'} subgraph cluster_4 {'descriptors4' 'W4' 'Qc'; label='tib_WQc'} subgraph cluster_5 {'descriptors5' 'Wc' 'Qc5'; label='tib_WcQc'} subgraph cluster_6 {'descriptors6' 'Wwt', 'Qc6'; label='tib_WwtQc'} subgraph cluster_7 {'descriptors7' 'Wp', 'Qc7'; label='tib_WpQc'} edge[color=grey] 'descriptors' -> 'descriptors2' -> 'descriptors3' -> 'descriptors4' -> 'descriptors5' -> 'descriptors6' ->'descriptors7' 'descriptors' -> 'W' -> 'W3' -> 'W4' -> 'Wc' -> 'Wwt' 'descriptors2' -> 'Q' -> 'Qc' -> 'Qc5' -> 'Qc6' -> 'Qc7' 'Qc' -> 'Wc' 'Wwt' -> 'Wwt_out' -> 'myrf' 'Wwt','myrf' -> 'Wp' } ")
We detail each step of this process afterwards.
Prepare data describing the two stations:
tib_sites=tibble::tibble(site=c("Ain","Allier"), q1.5=c(840,460), station=c("V294201001 ","K340081001")) %>% mutate(wpath=paste0("../data-raw/Wdata_",site), qpath=paste0("../data-raw/qtvar/q",site,".csv")) tib_sites
Wdata
: wood occurrence dataFirst step: import wood data , indicating path to files and site name:
We will obtain as a result tib_W
= tib_sites completed with Wdata.
result_file="../data-raw/results/tib_W.RDS" if(!file.exists(result_file)){ tib_W=tib_sites %>% group_by(vars=site,q1.5,station,wpath) %>% mutate(Wdata=purrr::map2(.x=wpath,.y=site, ~import_Wdata(path=.x,site=.y, min_length=1, sample_length=TRUE ))) %>% mutate(Wdata=purrr::map(.x=Wdata, ~mutate(.x,site=factor(site,levels=tib_sites$site)))) saveRDS(tib_W,result_file) } tib_W=readRDS(result_file)
tib_W
looks like this:
tib_W_look=tib_W %>% mutate(Wdata=purrr::map(Wdata,~paste0(nrow(.x)," rows x ",ncol(.x)," cols"))) %>% unnest(cols=c(Wdata)) knitr::kable(tib_W_look)
and inside tib_W
, the first lines of Wdata
(for both sites) look like this (here for the Ain site):
knitr::kable(head(tib_W$Wdata[[1]]))
Qdata
: collect hydrological data & calculate all variablesCollect qtvar data from banquehydro, for the period covered by Wdata
, and back in time so as to be able to calculate $T_Q$:
result_file="../data-raw/results/tib_WQ.RDS" if(!file.exists(result_file)){ tib_WQ=tib_W %>% mutate(Qdata=purrr::map(.x=qpath,.y=site, ~import_Qdata(path=.x, site=.y))) saveRDS(tib_WQ,result_file) } tib_WQ=readRDS(result_file)
tib_WQ
looks like this:
tib_WQ_look=tib_WQ %>% mutate(Wdata=purrr::map(Wdata,~paste0(nrow(.x)," rows x ",ncol(.x)," cols"))) %>% mutate(Qdata=purrr::map(Qdata,~paste0(nrow(.x)," rows x ",ncol(.x)," cols"))) %>% unnest(cols=c(Wdata, Qdata)) knitr::kable(tib_WQ_look)
and inside tib_WQ
, the first lines of Qdata
(for both sites) look like this:
head(tib_WQ$Qdata[[1]])
Qdata
completed with calculation of $T_Q$ and $S$result_file="../data-raw/results/tib_WQc.RDS" if(!file.exists(result_file)){ tib_WQc=tib_WQ %>% mutate(Qdata=purrr::map2(.x=Qdata,.y=q1.5, ~complete_Qdata(qtvar=.x, qnorm=.y))) saveRDS(tib_WQc,result_file) } tib_WQc=readRDS(result_file)
tib_WQc
looks like tib_WQ
, except now Qdata
inside has been completed with new variables:
head(tib_WQc$Qdata[[1]])
Wdata
completed with discharge dataresult_file="../data-raw/results/tib_WcQc.RDS" if(!file.exists(result_file)){ tib_WcQc=tib_WQc %>% mutate(Wdata=purrr::map2(.x=Wdata,.y=Qdata, ~complete_Wdata_with_Qdata(Wdata=.x,Qdata=.y))) saveRDS(tib_WcQc,result_file) } tib_WcQc=readRDS(result_file)
Inside tib_WcQc
, Wdata
has been updated into:
head(tib_WcQc$Wdata[[1]])
We then calculate tib_Wwt
, updating Wdata
so that 1 row= 1 waiting time between two wood occurrences.
result_file="../data-raw/results/tib_WwtQc.RDS" if(!file.exists(result_file)){ tib_WwtQc=tib_WcQc %>% mutate(Wdata=purrr::map(.x=Wdata,~Wdata_as_waiting_times(.x))) saveRDS(tib_WwtQc,result_file) } tib_WwtQc=readRDS(result_file)
Inside tib_Wwt
, Wdata
now looks like this (for both sites):
head(tib_WwtQc$Wdata[[1]])
We run one random forest for both sites. Site is one of the predictors anyway.
# get a single table for Wdata (waiting times) Wwt=tib_WwtQc %>% ungroup() %>% select(Wdata) %>% tidyr::unnest(Wdata) file_result="../data-raw/results/rf.RDS" if(!file.exists(file_result)){ myrf <- run_rf(Wwt, pred_vars=c("Q","S","rT_Q","site")) saveRDS(myrf,file_result) } myrf=readRDS(file_result)
tib_WpQc=tib_WwtQc %>% mutate(Wdata=purrr::map(.x=Wdata,~predict_rf(newdata=.x,obj_rf=myrf))) saveRDS(tib_WpQc,"../data-raw/results/tib_WpQc.RDS")
Now let's assess the model's predictive performance
For both sites:
R2=tib_WpQc %>% select(site,Wdata) %>% mutate(R2=purrr::map_df(Wdata,calc_rf_R2)) %>% unnest(R2) R2
Wdata_pred=tib_WpQc %>% select(Wdata) %>% tidyr::unnest(Wdata, .name_repair="minimal") %>% ungroup() %>% mutate(Ycat=cut(Y,quantile(Y,seq(0,1,by=0.1),include.lowest=TRUE))) p=ggplot(Wdata_pred,aes(x=Y_pred,y=Y))+ geom_point(alpha=0.05)+ geom_abline(intercept=0,slope=1, col="red")+ facet_grid(cols=vars(site), labeller = labeller(.rows = label_both, .cols = label_both))+ geom_smooth(method="lm") plot(p)
Due to covariate shift, there is a tendency to overestimate low values of Y and underestimate high values of Y (trend in blue, theoretical Y=X line in red).
$$Y=\alpha*Y_{pred}+\beta$$
We will apply a simple linear correction of predictions to account for this in the predict_rf function (arguments)
lm_mod=lm(Y~Y_pred, data=Wdata_pred) alpha=lm_mod$coefficients[[2]] beta=lm_mod$coefficients[[1]] saveRDS(c(alpha,beta),"../data-raw/correction_covariate_shift.RDS")
So we correct $Y_{pred}$ values accordingly
tib_WpQc=tib_WwtQc %>% mutate(Wdata=purrr::map(.x=Wdata, ~predict_rf(newdata=.x, obj_rf=myrf, correction=c(alpha,beta)))) saveRDS(tib_WpQc,"../data-raw/results/tib_WpQc.RDS")
R2=tib_WpQc %>% select(site,Wdata) %>% mutate(R2=purrr::map_df(Wdata,calc_rf_R2)) %>% unnest(R2) R2
Wdata_pred=tib_WpQc %>% select(Wdata) %>% tidyr::unnest(Wdata, .name_repair="minimal") %>% ungroup() %>% mutate(Ycat=cut(Y,quantile(Y,seq(0,1,by=0.1),include.lowest=TRUE))) p=ggplot(Wdata_pred,aes(x=Y_pred,y=Y))+ geom_point(alpha=0.05)+ geom_abline(intercept=0,slope=1, col="red")+ facet_grid(cols=vars(site), labeller = labeller(.rows = label_both, .cols = label_both))+ geom_smooth(method="lm") plot(p)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.