
################################################################################
# Quantile Delta Mapping (QDM) for bias correction of ratio and
# interval data. Alex Cannon (acannon@uvic.ca)
################################################################################

QDM <- function(o.c, m.c, m.p, ratio=TRUE, trace=0.05, jitter.factor=0.01,
                 n.tau=NULL)
{
    # Quantile perturbation quantile mapping bias correction:
    # o = vector of observed values; m = vector of modelled values
    # c = current period;  p = projected period
    # ratio = TRUE --> preserve relative trends in a ratio variable
    # trace = 0.05 --> treat values below trace as left censored
    # jitter.factor = 0.01 --> jittering to accommodate ties
    # n.tau = NULL --> number of empirical quantiles (NULL=sample length)
    #
    # tau.m.p = F.m.p(x.m.p)
    # delta.m = x.m.p {/,-} F.m.c^-1(tau.m.p)
    # xhat.m.p = F.o.c^-1(tau.m.p) {*,+} delta.m
    #
    # Apply a small amount of jitter to accommodate ties due to limited
    # measurement precision
    if (all(is.na(m.p))) {
        return(list(mhat.c=NULL, mhat.p=rep(NA, length(m.p))))
    }
    o.c <- jitter(o.c, jitter.factor)
    m.c <- jitter(m.c, jitter.factor)
    m.p <- jitter(m.p, jitter.factor)
    # For ratio data, treat exact zeros as left censored values less than trace
    if(ratio){
        epsilon <- .Machine$double.eps
        o.c[o.c < trace & !is.na(o.c)] <- runif(sum(o.c < trace, na.rm=TRUE), min=epsilon, max=trace)
        m.c[m.c < trace & !is.na(m.c)] <- runif(sum(m.c < trace, na.rm=TRUE), min=epsilon, max=trace)
        m.p[m.p < trace & !is.na(m.p)] <- runif(sum(m.p < trace, na.rm=TRUE), min=epsilon, max=trace)
    }
    # Calculate empirical quantiles using Weibull plotting position
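    # (Weibull plotting positions tau_i = i/(n+1); quantile(..., type=6)
    # implements the same convention, so the tau grid and the quantile
    # estimates agree)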
    n <- max(length(o.c), length(m.c), length(m.p))
    if(is.null(n.tau)) n.tau <- n
    tau <- seq(1/(n+1), n/(n+1), length.out=n.tau)
    quant.o.c <- quantile(o.c, tau, type=6, na.rm=TRUE)
    quant.m.c <- quantile(m.c, tau, type=6, na.rm=TRUE)
    quant.m.p <- quantile(m.p, tau, type=6, na.rm=TRUE)
    # Apply QDM bias correction
    tau.m.p <- approx(quant.m.p, tau, m.p, rule=2)$y
    if(ratio){
        delta.m <- m.p/approx(tau, quant.m.c, tau.m.p, rule=2)$y
        mhat.p <- approx(tau, quant.o.c, tau.m.p, rule=2)$y*delta.m
    } else {
        delta.m <- m.p - approx(tau, quant.m.c, tau.m.p, rule=2)$y
        mhat.p <- approx(tau, quant.o.c, tau.m.p, rule=2)$y + delta.m
    }
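    # Ordinary quantile mapping for the calibration period: interpolate m.c
    # from the modelled quantiles onto the paired observed quantiles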
    mhat.c <- approx(quant.m.c, quant.o.c, m.c, rule=2)$y
    # For ratio data, set values less than trace to zero
    if(ratio){
        mhat.c[mhat.c < trace] <- 0
        mhat.p[mhat.p < trace] <- 0
    }
    list(mhat.c=mhat.c, mhat.p=mhat.p)
}
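
## A minimal sketch of calling QDM directly on synthetic, precipitation-like
## series (the gamma parameters and the +20% change below are illustrative
## assumptions, not package defaults). Guarded by if (FALSE) so nothing runs
## when the file is sourced.
if (FALSE) {
    set.seed(42)
    o.c <- rgamma(1000, shape=0.6, scale=4)        # "observed", calibration period
    m.c <- rgamma(1000, shape=0.6, scale=5)        # biased model, calibration period
    m.p <- rgamma(1000, shape=0.6, scale=5) * 1.2  # model, projected period (+20%)
    fit <- QDM(o.c, m.c, m.p, ratio=TRUE, trace=0.05)
    # QDM aims to preserve the model's relative change in quantiles, e.g.:
    quantile(fit$mhat.p, 0.9) / quantile(fit$mhat.c, 0.9)
    quantile(m.p, 0.9) / quantile(m.c, 0.9)
}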

mk.multiyear.factor <- function(dates, block.size, expand.multiyear=TRUE) {
    years <- as.numeric(format(dates, '%Y'))

    block.factor <- factor(years - years %% block.size)

    if(expand.multiyear && nlevels(block.factor) > 1) {
        # Fold incomplete data from the final block into the previous
        # complete multi-year block
        multiyear.lengths <- tapply(block.factor, block.factor, length)

        n.thresh <- max(multiyear.lengths[-length(multiyear.lengths)])/2
        # Is the final block less than half the length of other blocks?
        if (multiyear.lengths[nlevels(block.factor)] < n.thresh) {
            # Assign the final level to the value of the penultimate level
            levels(block.factor)[nlevels(block.factor)] <- levels(block.factor)[nlevels(block.factor)-1]
        }
    }

    block.factor
}
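
## A short illustration with hypothetical daily dates of how year blocks are
## formed and how a short trailing block is folded into its predecessor.
if (FALSE) {
    dates <- seq(as.Date('1981-01-01'), as.Date('2002-12-31'), by='day')
    f <- mk.multiyear.factor(dates, block.size=10)
    # 1981-1989 fall in block "1980" and 1990-1999 in "1990"; the 2000-2002
    # tail (3 of 10 years) is under half a full block, so it is folded
    # into "1990"
    table(f)
}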

mk.annual.factor <- function(dates) {
    factor(format(dates, '%Y'))
}

mk.monthly.factor <- function(dates) {
  factor(format(dates, '%m'))
}

mk.seasonal.factor <- function(dates) {
    cal <- attr(dates, 'cal')
    # PCICt may omit the "_day" suffix on the cal attribute; add it back
    cal <- sub('^(360|365)$', '\\1_day', cal)
    if (cal == "proleptic_gregorian") { cal <- NULL }
    jday <- as.integer(format(dates, '%j'))
    years <- as.integer(format(dates, '%Y'))
    mkseas(jday, 'DJF', calendar=cal, year=years)
}
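
## A quick sketch using PCICt dates on a 365_day calendar (ClimDown's time
## handling is PCICt-based); mkseas comes from the seas package.
if (FALSE) {
    library(PCICt)
    dates <- as.PCICt(c('1999-12-15', '2000-01-15', '2000-04-15', '2000-07-15'),
                      cal='365_day')
    mk.seasonal.factor(dates)  # DJF DJF MAM JJA
}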

mk.factor.set <- function(o.c.dates, m.c.dates, m.p.dates,
                          multiyear=FALSE, seasonal=TRUE,
                          n.multiyear=10, expand.multiyear=TRUE) {
    if (seasonal) {
        mk.factor <- mk.seasonal.factor
    } else {
        mk.factor <- mk.monthly.factor
    }

    if (multiyear) {
        list(
            oc = mk.factor(o.c.dates),
            mc = mk.factor(m.c.dates),
            mp = interaction(mk.multiyear.factor(m.p.dates, n.multiyear, expand.multiyear), mk.factor(m.p.dates), sep='-')
        )
    } else {
        list(
            oc = mk.factor(o.c.dates),
            mc = mk.factor(m.c.dates),
            mp = interaction(mk.annual.factor(m.p.dates), mk.factor(m.p.dates), sep='-')
        )
    }
}
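
## Sketch of the factor set for monthly grouping (seasonal=FALSE), where
## plain Date vectors suffice; the dates below are illustrative.
if (FALSE) {
    o.c.dates <- seq(as.Date('1971-01-01'), as.Date('2000-12-31'), by='day')
    m.p.dates <- seq(as.Date('2041-01-01'), as.Date('2070-12-31'), by='day')
    fs <- mk.factor.set(o.c.dates, o.c.dates, m.p.dates, seasonal=FALSE)
    levels(fs$oc)        # "01" ... "12": plain month factors
    head(levels(fs$mp))  # "2041-01", "2042-01", ...: year-month groups
}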

tQDM <- function(o.c, m.c, m.p,
                  o.c.factor, m.c.factor, m.p.factor,
                  n.window=30, ratio=TRUE, trace=0.05,
                  jitter.factor=0.01, n.tau=NULL)
{
    # Apply QDM bias correction within groups (e.g. 3-month seasons or
    # single months, optionally crossed with annual or multi-year blocks)
    # defined by the supplied date factors; see mk.factor.set for how the
    # seasonal/multiyear options translate into factors
    # o = vector of observed values; m = vector of modelled values
    # c = current period;  p = projected period
    # *.factor = date factors for grouping values together; the monthly or
    # seasonal levels should be the same in each of the three factors
    # n.window = 30 --> window length in years (not used directly here;
    # the grouping is controlled entirely by the supplied factors)
    # ratio = TRUE --> preserve relative trends in a ratio variable
    # trace = 0.05 --> treat values below trace as left censored
    # jitter.factor = 0.01 --> jittering to accommodate ties
    # n.tau = NULL --> number of empirical quantiles (NULL=sample length)

    # Run the computation on blocks of data from the same month/season,
    # possibly pooled across multi-year blocks (when multiyear grouping
    # was used to build the factors)
    sections <- split(m.p, m.p.factor)
    months <- sub('^[0-9]+-', '', names(sections))
    oc.by.month <- split(o.c, o.c.factor)
    mc.by.month <- split(m.c, m.c.factor)
    mhat.list <- mapply(
                   function(mp.subset, month) {
                     qdm <- QDM(o.c=oc.by.month[[month]], m.c=mc.by.month[[month]],
                                  m.p=mp.subset, ratio=ratio,
                                  trace=trace, jitter.factor=jitter.factor,
                                  n.tau=n.tau)
                     qdm$mhat.p
                   },
                   mp.subset=sections,
                   month=months,
                   # Keep the result as a list so that unsplit() can reassemble
                   # the series even if all groups happen to be the same length
                   SIMPLIFY=FALSE
                   )

    unsplit(mhat.list, m.p.factor)
}
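
## End-to-end sketch on synthetic daily data with monthly grouping; the
## series and dates are illustrative, not package data.
if (FALSE) {
    o.c.dates <- seq(as.Date('1971-01-01'), as.Date('2000-12-31'), by='day')
    m.p.dates <- seq(as.Date('1971-01-01'), as.Date('2070-12-31'), by='day')
    m.c.i <- m.p.dates <= max(o.c.dates)
    fs <- mk.factor.set(o.c.dates, m.p.dates[m.c.i], m.p.dates, seasonal=FALSE)
    set.seed(42)
    o.c <- rgamma(length(o.c.dates), shape=0.6, scale=4)
    m.p <- rgamma(length(m.p.dates), shape=0.6, scale=5)
    mhat.p <- tQDM(o.c, m.p[m.c.i], m.p, fs$oc, fs$mc, fs$mp, ratio=TRUE)
    length(mhat.p) == length(m.p)  # corrected series, same length and order
}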

## Package check tools can't detect foreach's use of non-standard evaluation
## Ensure that they skip these variable names
utils::globalVariables(c('o.c', 'm.p'))

#' @title High-level wrapper for Quantile Delta Mapping (QDM)
#'
#' @description This function performs the QDM algorithm cell by cell
#' over the spatial domain of the input high-resolution gridded
#' observations. It takes the gridded observations plus the GCM-based
#' output of the CI step as input, performs a quantile
#' perturbation/quantile mapping bias correction, and writes the output
#' to out.file.
#' 
#' @param obs.file Filename of high-res gridded historical observations
#' @param gcm.file Filename of GCM simulations interpolated to the obs.file grid
#' @param out.file Filename to create (or overwrite) with the bias corrected outputs
#' @param varname Name of the NetCDF variable to downscale (e.g. 'tasmax')
#' @return NULL
#'
#' @examples
#' \dontrun{
#' ci.file <<- tempfile(fileext='.nc')
#' ClimDown::ci.netcdf.wrapper('./tiny_gcm.nc', './tiny_obs.nc', ci.file, 'tasmax')
#' out.nc <- tempfile(fileext='.nc')
#' options(
#'     calibration.end=as.POSIXct('1972-12-31', tz='GMT'),
#'     cend=as.POSIXct('1972-12-31', tz='GMT')
#' )
#' ClimDown::qdm.netcdf.wrapper('./tiny_obs.nc', ci.file, out.nc, 'tasmax')
#' unlink(ci.file)
#' unlink(out.nc)
#' }
#'
#' @references Cannon, A. J., Sobie, S. R., & Murdock, T. Q. (2015). Bias Correction of GCM Precipitation by Quantile Mapping: How Well Do Methods Preserve Changes in Quantiles and Extremes?. Journal of Climate, 28(17), 6938-6959. doi: 10.1175/JCLI-D-14-00754.1
#' @export
qdm.netcdf.wrapper <- function(obs.file, gcm.file, out.file, varname='tasmax') {
    ptm <- proc.time()

    cstart <- getOption('calibration.start')
    cend <- getOption('calibration.end')

    print("Opening the input files and reading metadata")
    gcm <- nc_open(gcm.file)
    obs <- nc_open(obs.file)

    lat <- gcm$dim$lat$vals
    lon <- gcm$dim$lon$vals

    gcm.time <- compute.time.stats(gcm)
    obs.time <- compute.time.stats(obs, cstart, cend)
    # Indices of GCM times that fall within the observational time range
    gcm.obs.subset.i <- gcm.time$vals > as.PCICt(cstart, attr(gcm.time$vals, 'cal')) &
        gcm.time$vals < as.PCICt(cend, attr(gcm.time$vals, 'cal'))

    print("Calculating the time factors outside in order to subdivide the problem space")
    time.factors <- mk.factor.set(obs.time$vals,
                                  gcm.time$vals[gcm.obs.subset.i],
                                  gcm.time$vals,
                                  multiyear=getOption('multiyear'),
                                  seasonal=getOption('seasonal')[[varname]],
                                  n.multiyear=getOption('multiyear.window.length'),
                                  expand.multiyear=getOption('expand.multiyear')
                                  )

    cat('Creating output file', out.file, '\n')
    dims <- gcm$var[[varname]]$dim
    vars <- ncvar_def(varname, getOption('target.units')[varname], dims, NA)
    out <- nc_create(out.file, vars)

    na.gcm <- rep(NA, gcm.time$n)

    # Calculate how to split up the chunks for reading.
    # We have to read the full time dimension and as many longitudes as
    # will fit in memory; the spatial domains of the two files are the same
    chunk.size <- optimal.chunk.size((gcm.time$n + obs.time$n) * length(lat))
    chunk.lon.indices <- chunk.indices(length(lon), chunk.size)
    n.chunks <- length(chunk.lon.indices)

    # I/O loop (read, compute, write)
    for (chunk in chunk.lon.indices) {

        print(paste('Bias correcting', varname, 'longitudes', chunk['start'], '-', chunk['stop'], '/', length(lon)))
        print(paste("Reading longitudes", chunk['start'], '-', chunk['stop'], '/', length(lon), 'from file:', obs$filename))
        print(paste("... and reading latitudes 1 -", length(lat), '/', length(lat)))

        o.c.chunk <- CD_ncvar_get(obs, start=c(chunk['start'], 1, obs.time$t0),
                                  count=c(chunk['length'], -1, obs.time$n),
                                  varid=varname)

        print(paste("Reading longitudes", chunk['start'], '-', chunk['stop'], '/', length(lon), 'from file:', gcm$filename))
        print(paste("... and reading latitudes 1 -", length(lat), '/', length(lat)))

        m.p.chunk <- CD_ncvar_get(gcm, start=c(chunk['start'], 1, gcm.time$t0),
                                  count=c(chunk['length'], -1, gcm.time$n),
                                  varid=varname)

        xn <- dim(o.c.chunk)[1]
        yn <- dim(o.c.chunk)[2]
        ij <- expand.grid(i=seq(xn), j=seq(yn))

        ncells <- xn*yn

        # Compute loop. Cell major. Do something to a timeseries for each cell.
        print(paste("Computing QDM on", ncells, "cells"))
        m.p.chunk <- foreach(o.c=split(o.c.chunk, 1:ncells),
                             m.p=split(m.p.chunk, 1:ncells),
                             .combine=c, .multicombine=TRUE, .inorder=TRUE,
                             .export=c('na.gcm', 'tQDM', 'varname', 'gcm.obs.subset.i', 'time.factors', 'QDM')) %dopar% {

          if(all(is.na(o.c), is.na(m.p))) {
            na.gcm
          } else {
            # consider the modeled values during the observed period separately
            # m.c <- m.p[gcm.obs.subset.i]
            tQDM(o.c=o.c, m.c=m.p[gcm.obs.subset.i], m.p=m.p,
                  o.c.factor=time.factors$oc,
                  m.c.factor=time.factors$mc,
                  m.p.factor=time.factors$mp,
                  ratio=getOption('ratio')[[varname]],
                  trace=getOption('trace'),
                  jitter.factor=getOption('jitter.factor'),
                  n.tau=getOption('tau')[[varname]])
          }
        }

        dim(m.p.chunk) <- c(gcm.time$n, xn, yn)
        m.p.chunk <- aperm(m.p.chunk, c(2, 3, 1))

        print(paste("Writing chunk", chunk['start'], '-', chunk['stop'], '/', length(lon), 'to file:', out$filename))
        ncvar_put(nc=out, varid=varname, vals=m.p.chunk,
                  start=c(chunk['start'], 1, 1), count=dim(m.p.chunk))
        rm(o.c.chunk, m.p.chunk)
        gc()
    }

    print("Closing up the input and output files")
    nc_close(gcm)
    nc_close(obs)
    nc_close(out)

    print('Elapsed Time')
    print(proc.time() - ptm)
}
################################################################################
