# R/scores.grid.notime.R


################################################################################
#' Scores for gridded reference data that do not have a varying time dimension
#' @description This function compares model output against remote-sensing
#' based reference data that do not vary in time. The performance of a model is
#' expressed through a score that ranges from zero to one, where increasing values
#' imply better performance.
#' In contrast to the function \link{scores.grid.time}, only two scores are computed
#' (the bias score \eqn{S_{bias}} and the spatial distribution score \eqn{S_{dist}}), since the reference data do
#' not vary in time. For the same reason, the bias is expressed relative to the absolute reference mean
#' value rather than the reference standard deviation:
#'
#' \eqn{(i) \ bias(\lambda, \phi)=\overline{v_{mod}}(\lambda, \phi)-\overline{v_{ref}}(\lambda, \phi)}
#'
#' \eqn{(ii) \ \varepsilon_{bias}=|bias(\lambda, \phi)|/|\overline{v_{ref}}(\lambda, \phi)|}
#'
#' \eqn{(iii) \ s_{bias}(\lambda, \phi)=e^{-\varepsilon_{bias}(\lambda, \phi)}}
#'
#' \eqn{(iv) \ S_{bias}=\overline{\overline{s_{bias}}}}
#'
#' @param long.name A string that gives the full name of the variable, e.g. 'Gross primary productivity'
#' @param nc.mod A string that gives the path and name of the netcdf file that contains the model output, e.g. '/home/model_gpp.nc'
#' @param nc.ref A string that gives the path and name of the netcdf file that contains the reference data output, e.g. '/home/reference_gpp.nc'
#' @param mod.id A string that identifies the model, e.g. 'CanESM2'
#' @param ref.id A string that identifies the source of the reference data set, e.g. 'MODIS'
#' @param unit.conv.mod A number that is used as a factor to convert the unit of the model data, e.g. 86400
#' @param unit.conv.ref A number that is used as a factor to convert the unit of the reference data, e.g. 86400
#' @param variable.unit A string that gives the final units using LaTeX notation, e.g. 'gC m$^{-2}$ day$^{-1}$'
#' @param score.weights R object that gives the weights of each score (\eqn{S_{bias}}, \eqn{S_{rmse}}, \eqn{S_{phase}}, \eqn{S_{iav}}, \eqn{S_{dist}})
#' that are used for computing the overall score, e.g. c(1,2,1,1,1)
#' @param outlier.factor A number that is used to define outliers, e.g. 10.
#'  Plotting raster objects that contain extreme outliers leads to figures where
#'  most grid cells are represented by a single color, since the color legend covers
#'  the entire range of values. To avoid this, the user may define outliers that
#'  will be masked out and marked with a red dot. Outliers are all values that
#'  exceed the interquartile range multiplied by the outlier factor defined here.
#' @param irregular Logical. If TRUE the data are converted from an irregular to a regular grid. Default is FALSE.
#' @param my.projection A string that defines the projection of the irregular grid
#' @param numCores An integer that defines the number of cores, e.g. 2
#' @param period An R object that gives the period over which to average the model data, e.g. c('1980-01', '2017-12')
#' @param timeInt A string that gives the time interval of the model data, e.g. 'month' or 'year'
#' @param outputDir A string that gives the output directory, e.g. '/home/project/study'. The output will only be written if the user specifies an output directory.
#' @param variable.name A string with the variable name, e.g. 'GPP'. If FALSE, the variable name stored in the NetCDF file will be used instead. Default is FALSE.
#'
#' @return (1) A list that contains three elements. The first element is a
#' raster stack with model data
#' (mean, \eqn{mod.mean}),
#' reference data
#' (mean, \eqn{ref.mean}),
#' and the corresponding bias
#' (bias, \eqn{bias}). The second and third element of the list are spatial
#' point data frames that give the model and reference outliers, respectively.
#' The content of the list can be plotted using \link{plotGrid}.
#'
#' (2) NetCDF files for each of the statistical variables listed above.
#'
#' (3) Three text files: (i) score values and (ii) score inputs averaged across
#' the entire grid, and (iii) score values for each individual grid cell.
#'
#' @examples
#' library(amber)
#' library(classInt)
#' library(doParallel)
#' library(foreach)
#' library(Hmisc)
#' library(latex2exp)
#' library(ncdf4)
#' library(parallel)
#' library(raster)
#' library(rgdal)
#' library(rgeos)
#' library(scico)
#' library(sp)
#' library(stats)
#' library(utils)
#' library(viridis)
#' library(xtable)
#'
#' # (1) Global plots on a regular grid
#' long.name <- 'Soil Carbon'
#' nc.mod <- system.file('extdata/modelRegular', 'cSoil_monthly.nc', package = 'amber')
#' nc.ref <- system.file('extdata/referenceRegular', 'soilc_HWSD_128x64.nc', package = 'amber')
#' mod.id <- 'CLASSIC' # define a model experiment ID
#' ref.id <- 'HWSD' # give reference dataset a name
#' unit.conv.mod <- 1 # optional unit conversion for model data
#' unit.conv.ref <- 1 # optional unit conversion for reference data
#' variable.unit <- 'kgC m$^{-2}$' # unit after conversion (LaTeX notation)
#'
#' # Short version using default settings:
#' plot.me <- scores.grid.notime(long.name, nc.mod, nc.ref, mod.id, ref.id,
#' unit.conv.mod, unit.conv.ref, variable.unit)
#' plotGrid(long.name, plot.me)
#'
#' \donttest{
#' # (2) Regional plots on a rotated grid
#' long.name <- 'Soil Carbon'
#' nc.mod <- system.file('extdata/modelRotated', 'cSoil_monthly.nc', package = 'amber')
#' nc.ref <- system.file('extdata/referenceRotated', 'soilc_HWSD_rotated.nc', package = 'amber')
#' mod.id <- 'CLASSIC' # define a model experiment ID
#' ref.id <- 'HWSD' # give reference dataset a name
#' unit.conv.mod <- 1 # optional unit conversion for model data
#' unit.conv.ref <- 1 # optional unit conversion for reference data
#' variable.unit <- 'kgC m$^{-2}$' # unit after conversion (LaTeX notation)
#' score.weights <- c(1,2,1,1,1) # score weights of S_bias, S_rmse, S_phase, S_iav, S_dist
#' outlier.factor <- 1000
#' irregular <- TRUE
#' my.projection <- '+proj=ob_tran +o_proj=longlat +o_lon_p=83. +o_lat_p=42.5 +lon_0=263.'
#' numCores <- 2
#' period <- c('1980-01', '2017-12') # period over which to average the model data
#'
#' plot.me <- scores.grid.notime(long.name, nc.mod, nc.ref, mod.id, ref.id, unit.conv.mod,
#' unit.conv.ref, variable.unit, score.weights, outlier.factor, irregular, my.projection,
#' numCores, period)
#'
#' # Plot results:
#' irregular <- TRUE # data is on an irregular grid
#' my.projection <- '+proj=ob_tran +o_proj=longlat +o_lon_p=83. +o_lat_p=42.5 +lon_0=263.'
#' # shp.filename <- system.file('extdata/ne_50m_admin_0_countries/ne_50m_admin_0_countries.shp',
#' #  package = 'amber')
#' shp.filename <- system.file("extdata/ne_110m_land/ne_110m_land.shp", package = "amber")
#' my.xlim <- c(-171, 0) # longitude range that you wish to plot
#' my.ylim <- c(32, 78) # latitude range that you wish to plot
#' plot.width <- 7 # plot width
#' plot.height <- 3.8 # plot height
#'
#' plotGrid(long.name, plot.me, irregular, my.projection,
#' shp.filename, my.xlim, my.ylim, plot.width, plot.height)
#' } #donttest
#' @export
scores.grid.notime <- function(long.name, nc.mod, nc.ref, mod.id, ref.id, unit.conv.mod, unit.conv.ref, variable.unit, score.weights = c(1,
    2, 1, 1, 1), outlier.factor = 1000, irregular = FALSE, my.projection = "+proj=ob_tran +o_proj=longlat +o_lon_p=83. +o_lat_p=42.5 +lon_0=263.",
    numCores = 2, period = c("1980-01", "2017-12"), timeInt = "month", outputDir = FALSE, variable.name = FALSE) {

    #---------------------------------------------------------------------------

    # (I) Data preparation

    #---------------------------------------------------------------------------

    # (1) Reproject data from an irregular to a regular grid if 'irregular=TRUE'

    if (irregular == TRUE) {
        regular <- "+proj=longlat +ellps=WGS84"
        rotated <- my.projection
        # get variable name
        if (variable.name == FALSE) {
            nc <- ncdf4::nc_open(nc.mod)
            variable.name <- base::names(nc[["var"]])
            variable.name <- variable.name[1]
            variable.name <- ifelse(variable.name == "burntFractionAll", "burnt", variable.name)  # rename burntFractionAll to shorter name
            variable.name <- toupper(variable.name)  # make variable name upper-case
            ncdf4::nc_close(nc)
        }
        # get data for both model and reference data
        modRef <- c(nc.mod, nc.ref)
        for (id in 1:length(modRef)) {
            my.nc <- modRef[id]
            nc <- ncdf4::nc_open(my.nc)
            data <- ncdf4::ncvar_get(nc)  # get values
            lon <- ncdf4::ncvar_get(nc, "lon")
            lat <- ncdf4::ncvar_get(nc, "lat")
            time <- ncdf4::ncvar_get(nc, "time")
            #
            nCol <- base::length(lon[, 1])
            nRow <- base::length(lon[1, ])
            nTime <- base::length(time)
            # compute dates
            origin <- ncdf4::ncatt_get(nc, "time", attname = "units")[2]
            origin <- base::strsplit(origin$value, " ")
            origin <- base::unlist(origin)[3]
            start.date <- base::as.Date(origin) + time[1] + 30  # the result is consistent with cdo sinfo
            dates <- base::seq(as.Date(start.date), by = timeInt, length = nTime)
            start.date <- min(dates)
            end.date <- max(dates)
            start.date <- format(as.Date(start.date), "%Y-%m")
            end.date <- format(as.Date(end.date), "%Y-%m")

            lon <- base::matrix(lon, ncol = 1)
            lat <- base::matrix(lat, ncol = 1)

            lonLat <- base::data.frame(lon, lat)
            sp::coordinates(lonLat) <- ~lon + lat
            raster::projection(lonLat) <- regular
            suppressWarnings(lonLat <- sp::spTransform(lonLat, sp::CRS(rotated)))
            myExtent <- raster::extent(lonLat)
            # create an empty raster
            r <- raster::raster(ncols = nCol, nrows = nRow)  # empty raster
            # create a raster by looping through all time steps
            cl <- parallel::makePSOCKcluster(numCores)
            doParallel::registerDoParallel(cl)
            myRaster <- foreach::foreach(i = 1:nTime) %dopar% {
                if (nTime > 1) {
                  myValues <- data[, , i]
                } else {
                  myValues <- data
                }  # first case: model (with time), second case: ref (no time)
                myValues <- base::apply(base::t(myValues), 2, rev)  # rotate values
                r <- raster::setValues(r, myValues)  # assign values
                raster::extent(r) <- myExtent  # extent using the rotated projection
                r <- r * 1  # this is necessary for the base::do.call function below
            }
            myStack <- base::do.call(raster::stack, myRaster)
            parallel::stopCluster(cl)
            myStack <- raster::setZ(myStack, dates, name = "time")
            names(myStack) <- dates
            assign(c("mod", "ref")[id], myStack)
        }
        # model data are averaged over a period since the reference data set has no time dimension
        start.date <- period[1]
        end.date <- period[2]

        mod <- mod[[which(format(as.Date(raster::getZ(mod)), "%Y-%m") >= start.date & format(as.Date(raster::getZ(mod)), "%Y-%m") <=
            end.date)]]
        mod <- raster::mean(mod, na.rm = TRUE)

        # unit conversion if appropriate
        mod <- mod * unit.conv.mod
        ref <- ref * unit.conv.ref

        # get layer names
        mod.names <- names(mod)
        ref.names <- names(ref)
    } else {

        #-----------------------------------------------------------------------

        # (2) Process data if 'irregular=FALSE'

        #-----------------------------------------------------------------------

        # model data are averaged over a period since the reference data set has no time dimension
        start.date <- period[1]
        end.date <- period[2]

        mod <- raster::brick(nc.mod)
        suppressWarnings(mod <- raster::rotate(mod))
        dates.mod <- raster::getZ(mod)

        mod <- raster::setZ(mod, dates.mod, name = "date")
        mod <- mod[[which(format(as.Date(raster::getZ(mod)), "%Y-%m") >= start.date & format(as.Date(raster::getZ(mod)), "%Y-%m") <=
            end.date)]]

        mod <- raster::mean(mod, na.rm = TRUE)

        # get variable name
        if (variable.name == FALSE) {
            nc <- ncdf4::nc_open(nc.mod)
            variable.name <- names(nc[["var"]])
            ncdf4::nc_close(nc)
            variable.name <- variable.name[length(variable.name)]  # take the last variable (relevant for CanESM5)
            variable.name <- ifelse(variable.name == "burntFractionAll", "burnt", variable.name)  # rename burntFractionAll to shorter name
            variable.name <- toupper(variable.name)  # make variable name upper-case
        }

        # reference data
        ref <- raster::raster(nc.ref)  # this reference data has no time dimension
        suppressWarnings(ref <- raster::rotate(ref))
        # unit conversion if appropriate
        mod <- mod * unit.conv.mod
        ref <- ref * unit.conv.ref

        # get layer names
        mod.names <- names(mod)
        ref.names <- names(ref)
    }

    #---------------------------------------------------------------------------

    # (3) Remaining part applies to both regular and irregular gridded data

    #---------------------------------------------------------------------------

    # Make a string that summarizes metadata. This will be added to each netcdf file (longname).

    # The string can then be accessed like this: names(raster(file.nc))

    meta.data.mod <- paste(variable.name, mod.id, "from", start.date, "to", end.date, sep = "_")
    meta.data.ref <- paste(variable.name, ref.id, "from", start.date, "to", end.date, sep = "_")
    meta.data.com <- paste(variable.name, mod.id, "vs", ref.id, "from", start.date, "to", end.date, sep = "_")
    # Outliers: all extreme outliers are set to NA in the grid
    mod.mean <- mod  # time mean (mod is already a time-mean)
    mod.outlier_range <- intFun.grid.define.outlier(mod.mean, outlier.factor)  # define outlier range
    outlier.neg <- mod.outlier_range[1]
    outlier.pos <- mod.outlier_range[2]
    mod.mask_outliers <- intFun.grid.outliers.na(mod.mean, outlier.neg, outlier.pos)
    mod.mask_outliers <- mod.mask_outliers - mod.mask_outliers + 1  # 1 for valid grid cells, NA for outliers
    mod <- mod * mod.mask_outliers
    names(mod) <- mod.names
    mod.outlier.points <- intFun.grid.outliers.points(mod.mean, outlier.neg, outlier.pos)
    ## reference data
    ref.mean <- ref  # time mean (ref is already a time-mean due to preprocessing)
    ref.outlier_range <- intFun.grid.define.outlier(ref.mean, outlier.factor)  # define outlier range
    outlier.neg <- ref.outlier_range[1]
    outlier.pos <- ref.outlier_range[2]
    ref.mask_outliers <- intFun.grid.outliers.na(ref.mean, outlier.neg, outlier.pos)
    ref.mask_outliers <- ref.mask_outliers - ref.mask_outliers + 1  # 1 for valid grid cells, NA for outliers
    ref <- ref * ref.mask_outliers
    names(ref) <- ref.names
    ref.outlier.points <- intFun.grid.outliers.points(ref.mean, outlier.neg, outlier.pos)

    #---------------------------------------------------------------------------

    # II Statistical analysis

    #---------------------------------------------------------------------------

    # (1) Bias

    #---------------------------------------------------------------------------
    # create a mask that excludes all grid cells that the model and reference data
    # do not have in common
    mask <- (mod * ref)
    mask <- mask - mask + 1
    mod <- mod * mask
    ref <- ref * mask
    # now mod and ref are based on the same grid cells

    #---------------------------------------------------------------------------

    mod.mean <- mod  # data is already time mean
    ref.mean <- ref  # data is already time mean
    weights <- ref  # weights used for spatial integral
    bias <- mod.mean - ref.mean  # time mean
    epsilon_bias <- abs(bias)/abs(ref)
    epsilon_bias[epsilon_bias == Inf] <- NA  # relative error wrt ref mean
    bias.score <- exp(-epsilon_bias)  # bias score as a function of space
    S_bias_not.weighted <- mean(raster::getValues(bias.score), na.rm = TRUE)  # scalar score (not weighted)
    # calculate the weighted scalar score
    a <- bias.score * weights  # this is a raster
    b <- raster::cellStats(a, "sum")  # this is a scalar, the sum of all values
    S_bias_weighted <- b/raster::cellStats(weights, "sum")  # scalar score (weighted)
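    # For intuition (hypothetical numbers): with cell scores s = (0.9, 0.5) and
    # reference weights w = (1, 3), the weighted score is
    # sum(s * w) / sum(w) = 2.4 / 4 = 0.6, so grid cells with larger reference
    # values contribute more to the scalar score.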
    # Compute global mean values of score input(s) The mask ensures that mod and ref are based on same grid cells.
    mask <- (mod.mean * ref.mean)
    mask <- mask - mask + 1
    mod.mean.scalar <- mean(raster::getValues(mask * mod.mean), na.rm = TRUE)  # global mean value
    ref.mean.scalar <- mean(raster::getValues(mask * ref.mean), na.rm = TRUE)  # global mean value
    bias.scalar <- mean(raster::getValues(bias), na.rm = TRUE)  # global mean value
    bias.scalar.rel <- (mod.mean.scalar - ref.mean.scalar)/abs(ref.mean.scalar) * 100
    ref.sd.scalar <- NA  # not applicable since the reference data do not vary in time
    epsilon_bias.scalar <- stats::median(raster::getValues(epsilon_bias), na.rm = TRUE)  # global mean value

    #---------------------------------------------------------------------------

    # (5) dist

    #---------------------------------------------------------------------------
    mod.sigma.scalar <- sd(raster::getValues(mod.mean), na.rm = TRUE)  # standard deviation of period mean data
    ref.sigma.scalar <- sd(raster::getValues(ref.mean), na.rm = TRUE)  # standard deviation of period mean data
    sigma <- mod.sigma.scalar/ref.sigma.scalar
    y <- raster::getValues(mod.mean)
    x <- raster::getValues(ref.mean)
    reg <- stats::lm(y ~ x)
    R <- sqrt(summary(reg)$r.squared)
    S_dist <- 2 * (1 + R)/(sigma + 1/sigma)^2  # weighting does not apply
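    # For intuition: sigma = 1 (equal spatial variability) and R = 1 (perfect
    # spatial correlation) give the maximum S_dist = 2 * (1 + 1) / (1 + 1)^2 = 1;
    # any mismatch in sigma or a correlation below one lowers the score.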

    #---------------------------------------------------------------------------

    # Scores

    #---------------------------------------------------------------------------

    S_bias <- S_bias_not.weighted
    S_rmse <- NA
    S_phase <- NA
    S_iav <- NA
    # weight importance of statistical metrics and compute overall score
    S_overall <- (S_bias + S_dist)/2
    scores <- data.frame(variable.name, ref.id, S_bias, S_rmse, S_phase, S_iav, S_dist, S_overall)
    scores_not.weighted <- scores
    # weighted (except for S_dist)
    S_bias <- S_bias_weighted
    S_rmse <- NA
    S_phase <- NA
    S_iav <- NA
    S_overall <- (S_bias + S_dist)/2

    scores <- data.frame(variable.name, ref.id, S_bias, S_rmse, S_phase, S_iav, S_dist, S_overall)
    scores_weighted <- scores
    #
    scores <- rbind(scores_not.weighted, scores_weighted)
    rownames(scores) <- c("not.weighted", "weighted")
    if (outputDir != FALSE) {
        utils::write.table(scores, paste(outputDir, "/", "scorevalues", "_", meta.data.com, sep = ""))
    }
    # Get all score values in case you want to compare this run against another run

    # Having all values will enable you to conduct a significance test

    bias.score.values <- raster::getValues(bias.score)
    dist.score.values <- raster::getValues(mask * S_dist)
    all.score.values <- data.frame(bias.score.values, NA, NA, NA, dist.score.values)
    colnames(all.score.values) <- c("bias.score", "rmse.score", "phase.score", "iav.score", "dist.score")
    all.score.values[is.na(all.score.values)] <- NA  # converts all NaN to NA
    if (outputDir != FALSE) {
        utils::write.table(all.score.values, paste(outputDir, "/", "allscorevalues", "-", variable.name, "-", ref.id, sep = ""))
    }
    # selected score inputs
    rmse.scalar <- NA
    crmse.scalar <- NA
    phase.scalar <- NA
    mod.iav.scalar <- NA
    ref.iav.scalar <- NA
    epsilon_rmse.scalar <- NA
    S_rmse_not.weighted <- NA
    mod.max.month.scalar <- NA
    ref.max.month.scalar <- NA
    S_phase_not.weighted <- NA
    epsilon_iav.scalar <- NA
    S_iav_not.weighted <- NA

    scoreinputs <- data.frame(long.name, variable.name, ref.id, variable.unit, mod.mean.scalar, ref.mean.scalar, bias.scalar,
        bias.scalar.rel, ref.sd.scalar, epsilon_bias.scalar, S_bias_not.weighted, rmse.scalar, crmse.scalar, ref.sd.scalar, epsilon_rmse.scalar,
        S_rmse_not.weighted, mod.max.month.scalar, ref.max.month.scalar, phase.scalar, S_phase_not.weighted, mod.iav.scalar,
        ref.iav.scalar, epsilon_iav.scalar, S_iav_not.weighted, mod.sigma.scalar, ref.sigma.scalar, sigma, R, S_dist)
    if (outputDir != FALSE) {
        utils::write.table(scoreinputs, paste(outputDir, "/", "scoreinputs", "_", meta.data.com, sep = ""))
    }
    # Apply the functions that return the min, max, and interval used in the legend:
    # intFun.min.max.int requires a single input, while intFun.min.max.int.mod.ref takes both model and reference data
    mmi.bias <- intFun.min.max.int.bias(bias)
    mmi.bias.score <- c(0, 1, 0.1)
    mmi.mean <- intFun.min.max.int.mod.ref(mod.mean, ref.mean)
    # add metadata: 1. filename (e.g. nee_mod.mean.nc), 2. figure title (e.g.  Mean_nee_ModID_123_from_1982-01_to_2008-12), 3.
    # min, max, interval used in legend (e.g. 0, 1, 0.1), 4.  legend bar text (e.g. 'score (-)')
    raster::metadata(mod.mean) <- list(paste(variable.name, ref.id, "mod_mean", sep = "_"), paste("Mean", meta.data.mod, sep = "_"),
        mmi.mean, variable.unit)
    raster::metadata(ref.mean) <- list(paste(variable.name, ref.id, "ref_mean", sep = "_"), paste("Mean", meta.data.ref, sep = "_"),
        mmi.mean, variable.unit)
    raster::metadata(bias) <- list(paste(variable.name, ref.id, "bias", sep = "_"), paste("Bias", meta.data.com, sep = "_"),
        mmi.bias, variable.unit)
    raster::metadata(bias.score) <- list(paste(variable.name, ref.id, "bias_score", sep = "_"), paste("Bias_score", meta.data.com,
        sep = "_"), mmi.bias.score, "score (-)")
    # create a netcdf file from stat.metric

    stat.metric <- raster::stack(mod.mean, ref.mean, bias, bias.score)
    for (i in 1:raster::nlayers(stat.metric)) {
        data <- raster::subset(stat.metric, i:i)
        my.filename <- unlist(raster::metadata(data)[1])
        my.filename <- gsub("_", "-", my.filename)
        my.filename <- gsub(".", "-", my.filename, fixed = TRUE)
        my.longname <- unlist(raster::metadata(data)[2])
        if (outputDir != FALSE) {
            raster::writeRaster(data, filename = paste(outputDir, my.filename, sep = "/"), format = "CDF", varname = variable.name,
                longname = my.longname, varunit = variable.unit, overwrite = TRUE)
        }
    }
    return(list(stat.metric, mod.outlier.points, ref.outlier.points))
}
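As a minimal, self-contained sketch (hypothetical values, base R only, outside the package), the bias score equations (i)-(iv) from the documentation above can be reproduced on a small vector of grid-cell means:

```r
# Hypothetical grid-cell means; base R only, independent of the function above.
mod.mean <- c(1, 2, 3, 4)  # model period-mean value per grid cell
ref.mean <- c(2, 2, 2, 2)  # time-invariant reference value per grid cell

bias <- mod.mean - ref.mean                # (i)   bias
epsilon_bias <- abs(bias) / abs(ref.mean)  # (ii)  relative error wrt reference mean
epsilon_bias[epsilon_bias == Inf] <- NA    # guard against division by zero
s_bias <- exp(-epsilon_bias)               # (iii) score per grid cell, in (0, 1]
S_bias <- mean(s_bias, na.rm = TRUE)       # (iv)  scalar score (not weighted)
```

A weighted variant, as in the function body, would replace step (iv) with `sum(s_bias * ref.mean) / sum(ref.mean)`.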

# amber documentation built on Aug. 28, 2020, 5:08 p.m.