#' Analyse modelled values versus observed data.
#'
#' Calculates a set of performance statistics and optionally creates plots of modelled
#' versus observed values.
#'
#' @param df A data frame containing columns with names corresponding to arguments
#' \code{mod} and \code{obs}
#' @param mod A character string specifying the variable name (column) of the
#' modelled (simulated) values in data frame \code{df}.
#' @param obs A character string specifying the variable name (column) of the
#' observed values in data frame \code{df}.
#' @param type If \code{"points"}, uses \code{geom_points()}, if \code{"hex"}
#' uses \code{ggplot2::geom_hex()}, if \code{"heat"} uses adjusted
#' \code{geom_points()} with color indicating density, if \code{"density"} uses
#' \code{stat_density_2d()} to draws polygos of equal density.
#' @param filnam A character string specifying the name of the file containing
#' the plot. Defaults to \code{NA} (no file is created).
#' @param relative A logical specifying whether the relative RMSE and bias (after
#' division by the mean) is to be showed in the subtitle labels.
#' @param shortsubtitle A boolean specifying whether to display a reduced set of metrics
#' in the subtitle.
#' @param rsquared A boolean specifying whether to display R-squared and the RMSE
#' (if \code{TRUE}) or the r (Pearson's correlation coefficient) and the p (p-value of
#' test of significance of correlation, if \code{TRUE}). Defaluts to \code{TRUE}.
#' @param plot_subtitle A boolean specifying whether to display any metrics. Defaults
#' to \code{TRUE}.
#' @param plot_linmod A boolean specifying whether to display the fitted linear
#' regression as a red line. Defaults to \code{TRUE}.
#' @param plot_legend A boolean specifying whether to display a legend for the colors.
#' Defaults to \code{TRUE} if \code{type} is one of \code{"heat"}, \code{"hex"}, or
#' \code{"density"}.
#' @param label A boolean specifying whether points should be labelled using ggrepel.
#' Defaults to \code{FALSE}. Only available for \code{type == "points"}. Use argument
#' \code{nlabels} to specify how many points should be labelled, starting with points
#' that have the largest residuals from the linear regression fit.
#' @param id A character string specifying the column name that identifies the points.
#' The column's values must be of type integer and is used to label points in case of
#' \code{label = TRUE}.
#' @param nlabels An integer specifying how many points to be labelled, starting with points
#' that have the largest residuals from the linear regression fit. Only available
#' for \code{type == "points"}. Defaults to one.
#'
#' @export
#'
#' @examples
#'
analyse_modobs2 <- function(
df,
mod,
obs,
type = "points",
filnam = NA,
relative = FALSE,
xlim = NULL,
ylim = NULL,
use_factor = NULL,
shortsubtitle = FALSE,
rsquared = TRUE,
plot_subtitle = TRUE,
plot_linmod = TRUE,
plot_legend = TRUE,
label = FALSE,
id = NULL,
nlabels = 1,
...
){
require(ggplot2)
require(dplyr)
require(LSD)
require(ggthemes)
require(RColorBrewer)
#if (identical(filnam, NA)) filnam <- "analyse_modobs.pdf"
## rename to 'mod' and 'obs' and remove rows with NA in mod or obs
if (label){
df <- df %>%
as_tibble() %>%
ungroup() %>%
dplyr::select(mod=mod, obs=obs, id=!!id) %>%
tidyr::drop_na(mod, obs)
} else {
df <- df %>%
as_tibble() %>%
ungroup() %>%
dplyr::select(mod=mod, obs=obs) %>%
tidyr::drop_na(mod, obs)
}
## get linear regression (coefficients)
linmod <- lm( obs ~ mod, data=df )
## construct metrics table using the 'yardstick' library
df_metrics <- df %>%
yardstick::metrics(obs, mod) %>%
dplyr::bind_rows( tibble( .metric = "n", .estimator = "standard", .estimate = summarise(df, numb=n()) %>% unlist() ) ) %>%
dplyr::bind_rows( tibble( .metric = "slope", .estimator = "standard", .estimate = coef(linmod)[2]) ) %>%
# dplyr::bind_rows( tibble( .metric = "nse", .estimator = "standard", .estimate = hydroGOF::NSE( obs, mod, na.rm=TRUE ) ) ) %>%
dplyr::bind_rows( tibble( .metric = "mean_obs", .estimator = "standard", .estimate = summarise(df, mean=mean(obs, na.rm=TRUE)) %>% unlist() ) ) %>%
dplyr::bind_rows( tibble( .metric = "prmse", .estimator = "standard",
.estimate = dplyr::filter(., .metric=="rmse") %>% dplyr::select(.estimate) %>% unlist() /
dplyr::filter(., .metric=="mean_obs") %>% dplyr::select(.estimate) %>% unlist() ) ) %>%
dplyr::bind_rows( tibble( .metric = "pmae", .estimator = "standard",
.estimate = dplyr::filter(., .metric=="mae") %>% dplyr::select(.estimate) %>% unlist() /
dplyr::filter(., .metric=="mean_obs") %>% dplyr::select(.estimate) %>% unlist() ) ) %>%
dplyr::bind_rows( tibble( .metric = "bias", .estimator = "standard", .estimate = dplyr::summarise(df, mean((mod-obs), na.rm=TRUE )) %>% unlist() ) ) %>%
dplyr::bind_rows( tibble( .metric = "pbias", .estimator = "standard", .estimate = dplyr::summarise(df, mean((mod-obs)/obs, na.rm=TRUE)) %>% unlist() ) ) %>%
dplyr::bind_rows( tibble( .metric = "cor", .estimator = "standard", .estimate = cor(df$mod, df$obs, method = "pearson") ) ) %>%
dplyr::bind_rows( tibble( .metric = "cor_p", .estimator = "standard", .estimate = cor.test(df$mod, df$obs, method = "pearson")$p.value ) )
rsq_val <- df_metrics %>% dplyr::filter(.metric=="rsq") %>% dplyr::select(.estimate) %>% unlist() %>% unname()
rmse_val <- df_metrics %>% dplyr::filter(.metric=="rmse") %>% dplyr::select(.estimate) %>% unlist() %>% unname()
mae_val <- df_metrics %>% dplyr::filter(.metric=="mae") %>% dplyr::select(.estimate) %>% unlist() %>% unname()
bias_val <- df_metrics %>% dplyr::filter(.metric=="bias") %>% dplyr::select(.estimate) %>% unlist() %>% unname()
slope_val <- df_metrics %>% dplyr::filter(.metric=="slope") %>% dplyr::select(.estimate) %>% unlist() %>% unname()
n_val <- df_metrics %>% dplyr::filter(.metric=="n") %>% dplyr::select(.estimate) %>% unlist() %>% unname()
cor_val <- df_metrics %>% dplyr::filter(.metric=="cor") %>% dplyr::select(.estimate) %>% unlist() %>% unname()
cor_p_val <- df_metrics %>% dplyr::filter(.metric=="cor_p") %>% dplyr::select(.estimate) %>% unlist() %>% unname()
if (relative){
rmse_val <- rmse_val / mean(df$obs, na.rm = TRUE)
bias_val <- bias_val / mean(df$obs, na.rm = TRUE)
}
rsq_lab <- format( rsq_val, digits = 2 )
rmse_lab <- format( rmse_val, digits = 3 )
mae_lab <- format( mae_val, digits = 3 )
bias_lab <- format( bias_val, digits = 3 )
slope_lab <- format( slope_val, digits = 3 )
n_lab <- format( n_val, digits = 3 )
cor_lab <- format( cor_val, digits = 3 )
cor_p_lab <- format( cor_p_val, digits = 3 )
results <- tibble( rsq = rsq_val, rmse = rmse_val, mae = mae_val, bias = bias_val, slope = slope_val, n = n_val )
if (shortsubtitle){
if (rsquared){
subtitle <- bquote(
italic(R)^2 == .(rsq_lab) ~~
RMSE == .(rmse_lab)
)
} else {
subtitle <- bquote(
italic(r) == .(cor_lab) ~~
italic(p) == .(cor_p_lab)
)
}
} else {
subtitle <- bquote( italic(R)^2 == .(rsq_lab) ~~
RMSE == .(rmse_lab) ~~
bias == .(bias_lab) ~~
slope == .(slope_lab) ~~
italic(N) == .(n_lab) )
}
if (type=="heat"){
# if (!identical(filnam, NA)) dev.off()
# source("~/LSD/R/LSD.heatscatter.R")
gg <- heatscatter(
df$mod,
df$obs,
xlim=xlim,
ylim=ylim,
main="",
ggplot=TRUE )
gg <- gg +
geom_abline(intercept=0, slope=1, linetype="dotted") +
theme_classic() +
labs(x = mod, y = obs)
if (plot_linmod) gg <- gg + geom_smooth(method='lm', color="red", size=0.5, se=FALSE)
if (plot_subtitle) gg <- gg + labs(subtitle = subtitle)
if (!plot_legend) gg <- gg + theme(legend.position = "none")
if (!identical(filnam, NA)) {
ggsave(filnam, width=5, height=5)
}
} else if (type=="hex"){
## ggplot hexbin
gg <- df %>%
ggplot2::ggplot(aes(x=mod, y=obs)) +
geom_hex(bins = 100) +
scale_fill_gradientn(
colours = colorRampPalette( c("gray65", "navy", "red", "yellow"))(5),
trans = "log") +
geom_abline(intercept=0, slope=1, linetype="dotted") +
# coord_fixed() +
# xlim(0,NA) +
# ylim(0,NA) +
theme_classic() +
labs(x = mod, y = obs)
if (plot_subtitle) gg <- gg + labs(subtitle = subtitle)
if (plot_linmod) gg <- gg + geom_smooth(method='lm', color="red", size=0.5, se=FALSE)
if (!plot_legend) gg <- gg + theme(legend.position = "none")
if (!identical(filnam, NA)) {
ggsave(filnam, width=5, height=5)
}
} else if (type=="points"){
if (label){
df <- df %>%
dplyr::mutate(.res = mod - obs) %>%
dplyr::mutate(.absres = abs(.res)) %>%
dplyr::arrange(desc(.absres)) %>%
dplyr::mutate(.rankres = 1:n()) %>%
dplyr::mutate(.dolab = ifelse(.rankres <= nlabels, TRUE, FALSE))
## points with labels
library(ggrepel)
gg <- df %>%
ggplot(aes(x=mod, y=obs, label = ifelse(.dolab, id, ""))) +
geom_point() +
geom_label_repel(min.segment.length = 0, seed = 42, box.padding = 0.5) +
geom_point(data = dplyr::filter(df, .dolab), color = "red") +
geom_abline(intercept=0, slope=1, linetype="dotted") +
theme_classic() +
labs(x = mod, y = obs)
} else {
## points
gg <- df %>%
ggplot(aes(x=mod, y=obs)) +
geom_point() +
geom_abline(intercept=0, slope=1, linetype="dotted") +
theme_classic() +
labs(x = mod, y = obs)
}
if (plot_subtitle) gg <- gg + labs(subtitle = subtitle)
if (plot_linmod) gg <- gg + geom_smooth(method='lm', color="red", size=0.5, se=FALSE)
if (!identical(filnam, NA)) {
ggsave(filnam, width=5, height=5)
}
} else if (type=="density"){
## density as raster
gg <- df %>%
ggplot(aes(x=mod, y=obs)) +
stat_density_2d(
geom = "raster", #the geometric object to display the data (in this case: rectangles)
aes(fill = after_stat(density)), #using `density`, a variable calculated by the stat
contour = FALSE
) +
scale_fill_gradientn(colours = colorRampPalette( c("white", "gray65", "navy", "red", "yellow"))(6),
guide = FALSE) +
geom_abline(intercept=0, slope=1, linetype="dotted") +
# coord_fixed() +
# xlim(0,NA) +
# ylim(0,NA) +
theme_classic() +
labs(x = mod, y = obs)
if (plot_subtitle) gg <- gg + labs(subtitle = subtitle)
if (plot_linmod) gg <- gg + geom_smooth(method='lm', color="red", size=0.5, se=FALSE)
if (!plot_legend) gg <- gg + theme(legend.position = "none")
if (!identical(filnam, NA)) {
ggsave(filnam, width=5, height=5)
}
}
return(list(df_metrics=df_metrics, gg=gg, linmod=linmod, results = results))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.