# Vignette setup: silence chunk warnings/messages and attach required packages.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE
)
library(woody)
library(tidyverse)
library(DiagrammeR)
# Workflow diagram (Graphviz/DOT rendered by DiagrammeR::grViz).
# Boxes are data objects: Qc (raw Q data) -> Qh (hourly) -> Qp (with Y_pred)
# -> Qn (with N_pred), and A + W -> Wn (with N_obse); Wn and Qn feed N.
# Ellipses are the package functions that perform each transformation;
# `myrf` (gold box) is the fitted random forest used by predict_rf().
grViz(" digraph boxes_and_circles { graph [overlap = true, fontsize = 10] node [shape = box, style= filled] A [fillcolor=pink] Qc [fillcolor = cadetblue1] W [fillcolor = burlywood1] Qh [fillcolor=cadetblue1] myrf[fillcolor=gold] Qp [fillcolor=cadetblue2] Qn [fillcolor=cadetblue3] Wn [fillcolor=burlywood2] N [fillcolor=darkkhaki] node [shape = ellipse,fixedsize = false,color = grey, style= filled, fillcolor=grey90] 'interp_Qdata()' 'predict_rf()' 'summarise_Qdata()' 'summarise_Wdata()' # several 'edge' statements Qc -> 'interp_Qdata()' [arrowhead = none] 'interp_Qdata()'-> Qh [label= 'interpolate Qdata hourly'] Qh, myrf-> 'predict_rf()' 'predict_rf()' -> 'Qp' [label = ' add Y_pred values'] Qp -> 'summarise_Qdata()' [arrowhead = none] 'summarise_Qdata()' -> 'Qn' [label = ' add N_pred values'] A,W -> 'summarise_Wdata()' [arrowhead = none] 'summarise_Wdata()' -> 'Wn' [label = ' add N_obse values'] Wn,Qn -> 'N' } ")
Here, we want to carry out this process for two sites, so that we use and enrich a nested table from step to step in the following manner:
# Diagram of the nested-table version of the workflow (Graphviz/DOT).
# Each cluster is one nested tibble (tib_WpQc -> ... -> tib_WnQnN); inside each
# cluster, the list-columns it holds (descriptors, W/Wn, Qc/Qh/Qp/Qn, N) are
# shown for the two sites ("[1,2]"). Grey edges trace how each list-column is
# carried and enriched from one table to the next; `myrf` is the random forest.
grViz(" digraph boxes_and_circles { graph [overlap = true, labeljust=l] graph [fontsize=10 compound=true overlap=true]; node [shape=egg fontsize=10, style=filled] // all nodes related to Q node [fillcolor=cadetblue1] 'Qc8' [label='Qc[1,2]'] Qh 'Qh' [label='Qh[1,2]'] node [fillcolor=cadetblue2] 'Qp' [label='Qp[1,2]'] node [fillcolor=cadetblue3] 'Qn' [label='Qn[1,2]'] 'Qn12' [label='Qn[1,2]'] 'Qn13' [label='Qn[1,2]'] // all nodes related to W node [fillcolor=burlywood1, label='W[1,2]'] 'W8' 'W9' 'W10' 'W11' // all nodes related to W+Qnode node [label= 'W[1,2]', fillcolor = burlywood] 'W8' 'W9' 'W10' 'W11' node [label= 'Wn[1,2]', fillcolor = burlywood2] 'Wn' 'Wn13' node [label= 'N[1,2]', fillcolor = darkkhaki] 'N' node[shape=box] myrf [fillcolor = gold, label='myrf'] // all nodes related to descriptors node [shape=egg, fillcolor=ivory, label='descriptors[1,2]'] 'descriptors8' 'descriptors9' 'descriptors10' 'descriptors11' 'descriptors12' 'descriptors13' subgraph cluster_8 {'descriptors8' 'W8' 'Qc8' label='tib_WpQc'} subgraph cluster_9 {'descriptors9' 'W9' 'Qh'; label='tib_WpQh'} subgraph cluster_10 {'descriptors10' 'W10' 'Qp' ; label='tib_WpQp'} subgraph cluster_11 {'descriptors11' 'W11' 'Qn'; label='tib_WpQn'} subgraph cluster_12 {'descriptors12' 'Wn' 'Qn12'; label='tib_WnQn'} subgraph cluster_13 {'descriptors13' 'Wn13', 'Qn13' 'N'; label='tib_WnQnN'} edge[color=grey] 'descriptors8' -> 'descriptors9' -> 'descriptors10' -> 'descriptors11' -> 'descriptors12' -> 'descriptors13' 'W8' -> 'W9' -> 'W10' -> 'W11' -> 'Wn' -> 'Wn13' 'descriptors11' -> 'Wn' [label='An[1,2]', fontsize=6] 'Qc8' -> 'Qh' 'Qp' -> 'Qn' -> 'Qn12' -> 'Qn13' 'Qh','myrf' -> 'Qp' 'Wn','Qn12' -> 'N' } ")
First, let's see how the predictions of log-flux $Y_{pred}$ (estimated through the Random Forest) might be used to calculate predictions of flux $N_{pred}$. It is indeed necessary to take into account the residuals error of the model to do so.
$$Y =log(N)= log(3600/W)$$
where: $N$ is the wood flux (number of wood pieces per hour) and $W$ is the corresponding mean waiting time between two pieces, in seconds (so that $N = 3600/W$).
Let's have a look at the distribution of residuals:
# Load the nested table and inspect the RF residuals (Y - Y_pred) per site.
tib_WpQc <- readRDS("../data-raw/results/tib_WpQc.RDS")

# Un-nest the wood data so each row is one observation.
Wdata_pred <- tib_WpQc %>%
  select(Wdata) %>%
  tidyr::unnest(Wdata, .name_repair = "minimal")

# Residuals vs predictions, one panel per site; red line marks zero residual.
p <- ggplot(Wdata_pred, aes(x = Y_pred, y = Y - Y_pred)) +
  geom_point(alpha = 0.05) +
  facet_grid(
    cols = vars(site),
    labeller = labeller(.rows = label_both, .cols = label_both)
  ) +
  geom_hline(aes(yintercept = 0), col = "red")
plot(p)
The observed value $Y$ is distributed around predicted value $\mu=Y_{pred}$, in an approximately Gaussian distribution:
$$Y=log(3600/W) \sim \mathcal{N}(\mu,\sigma) $$ Thus, we have $log(W)=log(3600)-Y$ hence:
$$log(W) \sim \mathcal{N}(log(3600)-\mu,\sigma) $$ The estimate of $\mu$ is given by
$$\mu=Y_{pred}$$ Let us now estimate $\sigma$.
$\sigma$, as the residual standard deviation, corresponds to a measure of error of the model predicting $Y$. It actually depends on $\mu$, as the error tends to be less important when predicted log-flux is important:
# Estimate how the residual standard deviation (sigma) varies with the
# predicted log-flux (mu), by binning predictions into deciles.
# Fix: cut() builds left-open intervals, so without include.lowest = TRUE the
# minimum Y_pred falls outside every bin and ends up in a spurious NA group.
data_sigma <- Wdata_pred %>%
  ungroup() %>%
  mutate(
    Ycat = cut(
      Y_pred,
      quantile(Y_pred, seq(0, 1, by = 0.1)),
      include.lowest = TRUE
    )
  ) %>%
  group_by(Ycat) %>%
  summarise(
    mu = mean(Y_pred),      # mean predicted log-flux within the decile bin
    sigma = sd(Y_pred - Y)  # residual standard deviation within the bin
  )

# sigma decreases roughly linearly with mu: visualise and fit sigma ~ mu.
ggplot(data_sigma, aes(x = mu, y = sigma)) +
  geom_point() +
  geom_smooth(method = "lm")

lm_sigma <- lm(sigma ~ mu, data = data_sigma)
gamma <- lm_sigma$coefficients[[2]]  # slope (gamma in sigma = gamma*mu + delta)
delta <- lm_sigma$coefficients[[1]]  # intercept (delta)
sigma_coeffs <- c(gamma, delta)      # reused later by summarise_Qdata()
We will hence consider that
$$\sigma=\gamma\mu+\delta $$
If $log(Z) \sim \mathcal{N}(\mu',\sigma)$ then $E(Z)=e^{\mu'+\sigma^2/2}$
So that
$$ E(W)=e^{log(3600)-\mu+\sigma^2/2}=3600e^{-\mu+\sigma^2/2} $$
and
$$ E(N)=e^{\mu-\sigma^2/2} $$
## Interpolate `Qdata` hourly and calculate $N_{pred}$ and $N_{obse}$

Let's interpolate `Qdata` at an hourly timescale and use these interpolated, hourly data to calculate $Y_{pred}$.
# Load the fitted random forest and the nested site table, then interpolate
# each site's Qdata list-column to an hourly timescale.
myrf <- readRDS("../data-raw/results/rf.RDS")
tib_WQc <- readRDS("../data-raw/results/tib_WQc.RDS")

tib_WQh <- tib_WQc %>%
  mutate(
    Qdata = purrr::map(Qdata, ~ interp_Qdata(.x, timescale = "hours"))
  )
# Predict hourly log-flux (Y_pred) with the random forest, applying the
# covariate-shift correction; NA entries of the Qdata list-column are dropped
# before prediction.
correction_covariate_shift <- readRDS("../data-raw/correction_covariate_shift.RDS")

tib_WQp <- tib_WQh %>%
  mutate(
    Qdata = purrr::map(
      na.omit(Qdata),
      ~ predict_rf(
        newdata = .x,
        obj_rf = myrf,
        correction = correction_covariate_shift
      )
    )
  )
# Summarise the hourly predictions into N_pred, using the sigma ~ mu
# coefficients estimated above; also record each site's annotation-file path
# for the Adata import that follows.
tib_WQn <- tib_WQp %>%
  mutate(apath = paste0("../data-raw/annot_times_", site, ".csv")) %>%
  mutate(
    Qdata = purrr::map(
      Qdata,
      ~ summarise_Qdata(Qdata = .x, sigma_coeffs = sigma_coeffs)
    )
  )
## Summarise `Wdata` to calculate $N_{obse}$
We want to compare these values $N_{pred}$ with the observed number of wood pieces during an hour $N_{obse}$.
The observation of wood pieces is not continuous, so that we have to take into account, for each hour, the proportion of time the observation of wood pieces really occurred $P_{obs}$. This information is retrievable from Adata
(A for "annotation").
$N_{obse}=N_{obs}/P_{obs}$
# Import each site's annotation data (Adata), then summarise Wdata using it —
# per the text above, this adds the extrapolated observed flux N_obse.
tib_WnQn <- tib_WQn %>%
  mutate(
    Adata = purrr::map2(apath, site, ~ import_Adata(path = .x, site = .y))
  ) %>%
  mutate(
    Wdata = purrr::map2(Wdata, Adata, ~ summarise_Wdata(Wdata = .x, Adata = .y))
  )
Once imported and cleaned, this is how `Adata` (first lines, for the Ain site) looks:
# Show the first rows of the first site's annotation data as a table.
Adata_head <- head(tib_WnQn$Adata[[1]])
knitr::kable(Adata_head)
So this is how `Wdata` now looks, with a new extrapolated estimate for the observed flux $N_{obse}$ (first lines, for the Ain site):
# Show the first rows of the first site's wood data as a table.
Wdata_head <- head(tib_WnQn$Wdata[[1]])
knitr::kable(Wdata_head)
Let's now gather $N_{pred}$ (from `Qdata`) and $N_{obse}$ (from `Wdata`) into a single table `Ndata`.
# Join predicted (Qdata) and observed (Wdata) hourly fluxes into an Ndata
# list-column, matching rows on site and hourly time stamp.
tib_WnQnN <- tib_WnQn %>%
  mutate(
    Ndata = purrr::map2(
      Qdata, Wdata,
      ~ left_join(.x, .y, by = c("site", "Time"))
    )
  )
To make the production of plots easier, we unnest `Ndata` from `tib_WnQnN`:
# Flatten the Ndata list-column into a single plain table for plotting.
Ndata <- tib_WnQnN %>%
  ungroup() %>%
  select(Ndata) %>%
  tidyr::unnest(Ndata)
# Compute the per-site R2 of the flux predictions (type = "N").
R2 <- tib_WnQnN %>%
  select(site, Ndata) %>%
  mutate(R2 = purrr::map_df(Ndata, ~ calc_rf_R2(.x, type = "N"))) %>%
  tidyr::unnest(R2)
R2
# Time series of predicted vs observed flux, one panel per site/event,
# on a log10 y-scale.
Ndataplot <- Ndata %>%
  tidyr::pivot_longer(cols = c(N_pred, N_obse), names_to = "type") %>%
  filter(!is.na(event))

ggplot(Ndataplot, aes(x = Time, y = value)) +
  geom_path(aes(col = type)) +
  geom_point(aes(col = type)) +
  facet_wrap(site ~ event, scales = "free_x", ncol = 1) +
  scale_y_log10()
# Same time-series comparison as above, but on the natural (linear) y-scale.
Ndataplot <- Ndata %>%
  tidyr::pivot_longer(cols = c(N_pred, N_obse), names_to = "type") %>%
  filter(!is.na(event))

ggplot(Ndataplot, aes(x = Time, y = value)) +
  geom_path(aes(col = type)) +
  geom_point(aes(col = type)) +
  facet_wrap(site ~ event, scales = "free_x", ncol = 1)
# Predicted vs observed flux per site (log-log), with the 1:1 reference line.
# Fix: geom_abline() has no `vintercept` argument — the typo made ggplot2
# ignore it; the correct argument is `intercept`.
ggplot(Ndata %>% filter(!is.na(event)), aes(x = N_obse, y = N_pred, col = event)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  facet_wrap(~site) +
  scale_x_log10() +
  scale_y_log10()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.