knitr::opts_chunk$set(warning=FALSE,message=FALSE)
library(woody)
library(tidyverse)
library(DiagrammeR)

Global process

For one site

grViz("
digraph boxes_and_circles {
  graph [overlap = true, fontsize = 10]
  node [shape = box, style= filled]
  A [fillcolor=pink]
  Qc  [fillcolor = cadetblue1]
  W [fillcolor = burlywood1]
  Qh [fillcolor=cadetblue1]
  myrf[fillcolor=gold]
  Qp [fillcolor=cadetblue2]
  Qn [fillcolor=cadetblue3]
  Wn [fillcolor=burlywood2]
  N  [fillcolor=darkkhaki]

  node [shape = ellipse,fixedsize = false,color = grey, style= filled, fillcolor=grey90]
  'interp_Qdata()'
  'predict_rf()'
  'summarise_Qdata()'  
  'summarise_Wdata()'

  # several 'edge' statements
  Qc -> 'interp_Qdata()' [arrowhead = none]
  'interp_Qdata()'-> Qh [label= 'interpolate Qdata hourly']
  Qh, myrf-> 'predict_rf()'
  'predict_rf()' -> 'Qp' [label = ' add Y_pred values']
  Qp -> 'summarise_Qdata()' [arrowhead = none]
  'summarise_Qdata()' -> 'Qn' [label = ' add N_pred values']
  A,W -> 'summarise_Wdata()' [arrowhead = none]
  'summarise_Wdata()' -> 'Wn' [label = ' add N_obse values']
  Wn,Qn  -> 'N' 

}
")

For several sites

Here, we want to carry out this process for two sites, so that we use and enrich a nested table from step to step in the following manner:

grViz("
digraph boxes_and_circles {
  graph [overlap = true,  labeljust=l]

  graph [fontsize=10 compound=true overlap=true];
    node [shape=egg fontsize=10, style=filled]
    // all nodes related to Q
    node [fillcolor=cadetblue1]
    'Qc8'  [label='Qc[1,2]'] Qh
    'Qh' [label='Qh[1,2]']
    node [fillcolor=cadetblue2]
    'Qp'  [label='Qp[1,2]'] 
    node [fillcolor=cadetblue3]
    'Qn'  [label='Qn[1,2]']
    'Qn12'  [label='Qn[1,2]']
    'Qn13'  [label='Qn[1,2]']
    // all nodes related to W
    node [fillcolor=burlywood1, label='W[1,2]']
    'W8' 'W9' 'W10' 'W11'
    // all nodes related to W+Qnode 
     node [label= 'W[1,2]', fillcolor = burlywood]
    'W8' 'W9' 'W10' 'W11'
    node [label= 'Wn[1,2]', fillcolor = burlywood2]
    'Wn' 'Wn13'
    node [label= 'N[1,2]', fillcolor = darkkhaki]
    'N'

    node[shape=box]
    myrf [fillcolor = gold, label='myrf']

    // all nodes related to descriptors
    node [shape=egg, fillcolor=ivory, label='descriptors[1,2]']
    'descriptors8'    'descriptors9'    'descriptors10' 'descriptors11' 'descriptors12' 'descriptors13'

    subgraph cluster_8 {'descriptors8' 'W8' 'Qc8' label='tib_WpQc'}
    subgraph cluster_9 {'descriptors9' 'W9' 'Qh'; label='tib_WpQh'}
    subgraph cluster_10 {'descriptors10' 'W10' 'Qp' ; label='tib_WpQp'}
    subgraph cluster_11 {'descriptors11' 'W11' 'Qn'; label='tib_WpQn'}
    subgraph cluster_12 {'descriptors12' 'Wn' 'Qn12'; label='tib_WnQn'}
    subgraph cluster_13 {'descriptors13' 'Wn13', 'Qn13' 'N'; label='tib_WnQnN'}

    edge[color=grey]
    'descriptors8' -> 'descriptors9' -> 'descriptors10' -> 'descriptors11' -> 'descriptors12' -> 'descriptors13'
    'W8' -> 'W9' -> 'W10' -> 'W11' -> 'Wn' -> 'Wn13'
    'descriptors11' -> 'Wn' [label='An[1,2]', fontsize=6]
    'Qc8' -> 'Qh' 
    'Qp' -> 'Qn' -> 'Qn12' -> 'Qn13'  
    'Qh','myrf' -> 'Qp'
    'Wn','Qn12' -> 'N'
  }
")

From log-flux estimates $Y_{pred}$ to flux estimates $N_{pred}$

First, let's see how the predictions of log-flux $Y_{pred}$ (estimated through the Random Forest) might be used to calculate predictions of flux $N_{pred}$. It is indeed necessary to take into account the residuals error of the model to do so.

$$Y =log(N)= log(3600/W)$$

where :

Distribution of $Y_{pred}$

Let's have a look at the distribution of residuals:

tib_WpQc=readRDS("../data-raw/results/tib_WpQc.RDS")
Wdata_pred=tib_WpQc %>% 
  select(Wdata) %>% 
  tidyr::unnest(Wdata, .name_repair="minimal")

p=ggplot(Wdata_pred,aes(x=Y_pred,y=Y-Y_pred))+
  geom_point(alpha=0.05)+
  facet_grid(cols=vars(site),
             labeller = labeller(.rows = label_both,
                                 .cols = label_both))+
  geom_hline(aes(yintercept=0), col='red')
plot(p)

The observed value $Y$ is distributed around predicted value $\mu=Y_{pred}$, in an approximately Gaussian distribution:

$$Y=log(3600/W) \sim \mathcal{N}(\mu,\sigma) $$ Thus, we have $log(W)=log(3600)-Y$ hence:

$$log(W) \sim \mathcal{N}(log(3600)-\mu,\sigma) $$ The estimate of $\mu$ is given by

$$\mu=Y_{pred}$$ Let us now estimate $\sigma$.

Estimate of $\sigma$

$\sigma$, as the residual standard deviation, corresponds to a measure of error of the model predicting $Y$. It actually depends on $\mu$, as the error tends to be less important when predicted log-flux is important:

data_sigma=Wdata_pred %>% 
  ungroup() %>% 
  mutate(Ycat=cut(Y_pred,quantile(Y_pred,seq(0,1,by=0.1)))) %>% 
  group_by(Ycat) %>% 
  summarise(mu=mean(Y_pred),
            sigma=sd(Y_pred-Y)) 

ggplot(data_sigma, aes(x=mu,y=sigma))+
  geom_point()+
  geom_smooth(method="lm")

lm_sigma=lm(sigma~mu, data=data_sigma)
gamma=lm_sigma$coefficients[[2]]
delta=lm_sigma$coefficients[[1]]
sigma_coeffs=c(gamma,delta)

We will hence consider that

$$\sigma=\gamma\mu+\delta $$

Expected value of $N$

If $log(Z) \sim \mathcal{N}(\mu',\sigma)$ then $E(Z)=e^{\mu'+\sigma²/2)}$

So that

$$ E(W)=e^{log(3600)-\mu+\sigma²/2}=3600e^{-\mu+\sigma²/2} $$

and

$$ E(N)=e^{\mu-\sigma²/2} $$

Summarise Qdata hourly and calculate $N_{pred}$ and $N_{obs}$

Interpolate Qdata hourly

Let's interpolate Qdata at an hourly timescale and use these interpolated, hourly data to calculate $Y_{pred}$.

myrf=readRDS("../data-raw/results/rf.RDS")
tib_WQc=readRDS("../data-raw/results/tib_WQc.RDS")
tib_WQh=tib_WQc %>%
  # interpolate at an hourly timescale
  mutate(Qdata=purrr::map(.x=Qdata,
                          ~interp_Qdata(.x, timescale="hours")))

Calculate hourly $Y_{pred}$

correction_covariate_shift=readRDS("../data-raw/correction_covariate_shift.RDS")


tib_WQp=tib_WQh %>%
  mutate(Qdata=purrr::map(.x=Qdata %>% na.omit(),
                          ~predict_rf(newdata=.x,obj_rf=myrf,
                                      correction=correction_covariate_shift)))

Deduce hourly $N_{pred}$

tib_WQn=tib_WQp %>%
  mutate(apath=paste0("../data-raw/annot_times_",site,".csv")) %>%
  mutate(Qdata=purrr::map(.x=Qdata,
                          ~summarise_Qdata(Qdata=.x,sigma_coeffs=sigma_coeffs)))

Get hourly $N_{obs}$ and $N_{obse}$ from Wdata

We want to compare these values $N_{pred}$ with the observed number of wood pieces during an hour $N_{obse}$.

The observation of wood pieces is not continuous, so that we have to take into account, for each hour, the proportion of time the observation of wood pieces really occurred $P_{obs}$. This information is retrievable from Adata (A for "annotation").

$N_{obse}=N_{obs}/P_{obs}$

tib_WnQn=tib_WQn %>%
  mutate(Adata=purrr::map2(.x=apath,.y=site,
                           ~import_Adata(path=.x,site=.y))) %>%
  mutate(Wdata=purrr::map2(.x=Wdata,.y=Adata,
                           ~summarise_Wdata(Wdata=.x,Adata=.y)))

Once imported and cleaned, this how Adata (first lines, for the Ain site) looks like:

knitr::kable(head(tib_WnQn$Adata[[1]]))

So this is how Wdata now looks like, with a new extrapolated estimate for the observed flux $N_{obse}$ (first lines, for the Ain site)

knitr::kable(head(tib_WnQn$Wdata[[1]]))

Join values $N_{pred}$ (from Qdata) and $N_{obse}$ (from Wdata) into a single table Ndata

tib_WnQnN=tib_WnQn %>% 
  mutate(Ndata=purrr::map2(.x=Qdata,.y=Wdata,
                           ~left_join(.x,.y,by=c("site","Time"))))

Summary

To make the production of plots easier, we unnest Ndata from tib_WnQnN:

Ndata=tib_WnQnN %>% 
  ungroup() %>% 
  select(Ndata) %>% 
  tidyr::unnest(Ndata)

Model performance

R2=tib_WnQnN %>% select(site,Ndata) %>% 
  mutate(R2=purrr::map_df(Ndata,calc_rf_R2,type="N")) %>% 
  tidyr::unnest(R2)
R2

Time plot of $N_{pred}$ and $N_{obs}$

On a log-scale:

Ndataplot=Ndata %>% 
  tidyr::pivot_longer(cols=c(N_pred,N_obse), names_to="type") %>%
  filter(!is.na(event))
ggplot(Ndataplot, aes(x=Time,y=value))+
  geom_path(aes(col=type))+
  geom_point(aes(col=type))+
  facet_wrap(site~event, scales="free_x", ncol=1)+
  scale_y_log10()

On a natural scale:

Ndataplot=Ndata %>% 
  tidyr::pivot_longer(cols=c(N_pred,N_obse), names_to="type") %>%
  filter(!is.na(event))
ggplot(Ndataplot, aes(x=Time,y=value))+
  geom_path(aes(col=type))+
  geom_point(aes(col=type))+
  facet_wrap(site~event, scales="free_x", ncol=1)

Predicted vs observed (hourly scale)

ggplot(Ndata %>% filter(!is.na(event)), aes(x=N_obse,y=N_pred, col=event))+
  geom_point()+
  geom_abline(vintercept=0,slope=1,col="red")+
  facet_wrap(~site)+
  scale_x_log10()+scale_y_log10()


lvaudor/woody documentation built on March 22, 2022, 9:53 a.m.