pkern: Fast Kriging with Product Kernels

#' ---
#' title: "meuse_vignette"
#' author: "Dean Koch"
#' date: "`r format(Sys.Date())`"
#' output: github_document
#' ---
#'
#' **Mitacs UYRW project**
#'
#' **pkern**: Fast kriging on gridded datasets
#'
#' This vignette shows how to use `pkern` to interpolate the Meuse river dataset
#' (part of the `sp` package) onto a high resolution grid:
#'
#' * `pkern_grid` defines the grid
#' * `pkern_snap` snaps point data to the grid
#' * `pkern_fit` fits a covariance model to the data
#' * `pkern_cmean` computes the kriging predictor and variance
#'
#' All plots in this document except the first are generated by `pkern_plot`, which uses base graphics
#' to display rasters and matrices. Try setting `ij=TRUE` in any of the plot calls to see a different
#' styling used for matrices.
#'
#' ## Getting started
#'
#' `pkern` has no requirements outside of base packages (`stats`, `utils`, `grDevices`, and `graphics`),
#' but `terra` and `sf` are (strongly) suggested, and we use them in this vignette to load and manage
#' the example data.
#'

#+ dependencies_hide, include=FALSE

# load pkern
library(devtools)
load_all()

#+ dependencies

# load extra dependencies for the vignette
library(sp)
library(sf)
library(terra)

#'
#' ## Example data
#'
#' We will look at the "meuse" dataset (Pebesma, 2009) on heavy metal concentrations in the Meuse
#' river floodplain in the Netherlands. These data are included with the `sp` package that we just
#' loaded, and appear in this
#' [tutorial pdf](https://cran.r-project.org/web/packages/gstat/vignettes/gstat.pdf)
#' for gstat. Load the data with `data(meuse)` and `data(meuse.riv)`. I have a helper function
#' `get_meuse` to do this (see source code for this document).

#+ meuse_hide, include=FALSE

# load the Meuse data into a convenient format
get_meuse = function(dfMaxLength = units::set_units(50, m))
{
  # Note: dfMaxLength sets the interval used to sample line geometries of the river
  # using Voronoi tiles. This is a fussy and not well-tested algorithm for finding the
  # centre line of a river polygon, but it seems to work well enough for the example here

  # EPSG code for the coordinate system
  epsg_meuse = 28992

  # open river location data
  utils::data(meuse.riv)
  crs_meuse = sf::st_crs(epsg_meuse)[['wkt']]

  # reshape the river (edge) point data as a more densely segmented polygon
  colnames(meuse.riv) = c('x', 'y')
  meuse_river_points = sf::st_as_sf(as.data.frame(meuse.riv), coords=c('x', 'y'), crs=crs_meuse)
  meuse_river_seg = sf::st_cast(sf::st_combine(meuse_river_points), 'LINESTRING')
  meuse_river_poly = sf::st_cast(st_segmentize(meuse_river_seg, dfMaxLength), 'POLYGON')

  # skeletonization trick to get a single linestring at center of the river
  meuse_river_voronoi = sf::st_cast(sf::st_voronoi(meuse_river_poly, bOnlyEdges=TRUE), 'POINT')
  meuse_river_skele = sf::st_intersection(meuse_river_voronoi, meuse_river_poly)
  n_skele = length(meuse_river_skele)

  # compute distance matrix
  dmat_skele = units::drop_units(sf::st_distance(meuse_river_skele))

  # re-order to start from northernmost point
  idx_first = which.max(st_coordinates(meuse_river_skele)[,2])
  idx_reorder = c(idx_first, integer(n_skele-1L))
  for(idx_skele in seq(n_skele-1L))
  {
    # find least distance match
    idx_tocheck = seq(n_skele) != idx_first
    idx_first = which(idx_tocheck)[ which.min(dmat_skele[idx_tocheck, idx_first]) ]
    idx_reorder[1L+idx_skele] = idx_first

    # modify distance matrix so the matching point is not selected again
    dmat_skele[idx_first, ] = Inf
  }

  # connect the points to get the spine
  meuse_river = sf::st_cast(sf::st_combine(meuse_river_skele[idx_reorder]), 'LINESTRING')

  # load soil points data
  utils::data(meuse)
  meuse_soils = sf::st_as_sf(meuse, coords=c('x', 'y'), crs=epsg_meuse)

  # add 'distance' (to river) and 'logzinc' columns
  meuse_soils[['distance']] = units::drop_units( sf::st_distance(meuse_soils, meuse_river))
  meuse_soils[['log_zinc']] = log(meuse_soils[['zinc']])

  # crop the river objects to buffered bounding box of soils data
  bbox_padded = st_buffer(sf::st_as_sfc(sf::st_bbox(meuse_soils)), units::set_units(500, m))
  meuse_river_poly = sf::st_crop(meuse_river_poly, bbox_padded)
  meuse_river = sf::st_crop(meuse_river, bbox_padded)

  # return three geometry objects in a list
  return( list(soils=meuse_soils, river_poly=meuse_river_poly, river_line=meuse_river) )
}

#+ meuse_load

# load data
meuse = get_meuse()
str(meuse)

# plot using sf package
plot(meuse[['river_poly']], col='lightblue', reset=FALSE)
plot(meuse[['river_line']], lwd=2, add=TRUE)
plot(meuse[['soils']]['zinc'], pch=16, add=TRUE)
n_meuse = nrow(meuse[['soils']])

#'
#' We will interpolate the log-transformed values (as in the `gstat` vignette). Start by
#' defining a grid and snapping the points to it
#'

#+ snap_grid

# desired resolution in units of metres
gres = c(y=5, x=5)

# snap points, copying values of dependent variable
g_meuse = pkern_snap(meuse[['soils']]['log_zinc'], g=list(gres=gres))

# plot the grid only
g = pkern_grid(modifyList(g_meuse, list(gval=NULL)))
pkern_plot(modifyList(g_meuse, list(gval=NULL)), col_grid=NA)

# plot with source points indicated over their snapped grid location
pkern_plot(g_meuse, zlab='log(zinc)', reset=FALSE)
plot(sf::st_geometry(meuse[['soils']]), add=TRUE)

#'
#' Pass this data to pkern_fit to do ordinary kriging
#'

#+ ordinary_kriging

# ordinary kriging: fit isotropic gaussian model by default
fit_result_OK = pkern_fit(g_obs=g_meuse, quiet=TRUE)
#fit_result_OK = pkern_fit(g_obs=g_meuse, pars='mat', quiet=TRUE)
#vg_detrend = pkern_sample_vg(g_meuse)
#pkern_plot_semi(vg_detrend, fit_result_OK$pars)


#'
#' By default, a sample semi-variogram is plotted, with the fitted model
#' drawn in transparent blue. Notice that a range of semi-variances can be
#' realized for a given distance because the model is anisotropic.
#'
#' The fitted model overestimates the (partial sill) variance in this case because
#' there is an unaccounted-for trend. Ordinary kriging estimates spatially constant
#' trends, but here we should expect it to depend on how near we are to the presumed
#' source of the zinc deposits (the Meuse River).
#'
#' The code below demonstrates creating a linear predictor - the distance to the river.
#'

#+ make_distances

# useful variables
gdim = g_meuse[['gdim']]
is_obs = !is.na(g_meuse[['gval']])
n_obs = sum(is_obs)

# get distance values for entire grid
g_meuse_sf = pkern_coords(g_meuse, out='sf')
d2r_result = units::drop_units(st_distance(g_meuse_sf, meuse[['river_line']]))

# include both distance and its square root
meuse_predictors = scale(cbind(d2r_result, sqrt(d2r_result)))

# copy the subset of predictors at observed response locations
X = matrix(meuse_predictors[is_obs,], nrow=n_obs)


#'
#' Fit the model with the linear predictor included. This is called universal kriging
#'

#+ universal_kriging

# fit the model and plot semivariogram
fit_result_UK = pkern_fit(g_obs=g_meuse, X=X, quiet=TRUE)

# print the GLS estimates for coefficients of the linear predictor
pars_UK = fit_result_UK$pars
pkern_GLS(g_meuse, pars_UK, X=X, out='b')

#'
#' These fitted kernels can be visualized using `pkern_plot_pars`. This plots the
#' footprint of correlations around any given central point on the grid.
#'

#+ kernel_plot

# plot the kernel shape
pkern_plot_pars(pars_UK, g_meuse)

# print the actual contents of the parameters list
str(pars_UK)


#+ GLS_plot

# GLS to estimate the (spatially varying) trend
z_gls = pkern_GLS(g_meuse, pars_UK, X=meuse_predictors, out='z')
g_meuse_gls = modifyList(g_meuse, list(gval=z_gls))
pkern_plot(g_meuse_gls, main='estimated trend component')

# g2 = g_meuse
# g2[['gval']] = g2[['gval']] - z_gls
# vg_detrend = pkern_sample_vg(g2)
# pkern_plot_semi(vg_detrend, pars_UK, fun='classical')
# pkern_plot_semi(vg_detrend, pars_UK, fun='root_median')



#'
#' Once you have a set of covariance parameters, interpolation becomes very easy
#' with `pkern_cmean`. This computes the kriging predictor, an interpolator with nice
#' properties like unbiasedness and minimal variance, under suitable assumptions.
#'

#+ spatial_plot

# compute spatial mean
g_meuse_detrend = modifyList(g_meuse, list(gval=g_meuse[['gval']]-z_gls))
z_spat = pkern_cmean(g_meuse_detrend, pars_UK, X=0)
g_meuse_spat = modifyList(g_meuse, list(gval=z_spat))
pkern_plot(g_meuse_spat,
           main='estimated spatial component')

#+ predictor_plot

# compute UK predictions, masking outside observed range
z_pred = z_gls + z_spat
g_meuse_pred = modifyList(g_meuse, list(gval=z_pred))
pkern_plot(g_meuse_pred,
           zlim=log(range(meuse[['soils']][['zinc']])),
           zlab='log(zinc)',
           main='kriging predictor')

#'
#' Kriging variance is also computed with `pkern_cmean`. This is much slower
#' to compute than the kriging predictor
#'

#+ variance_plot


# prediction variance
z_var = pkern_cmean(g_meuse_detrend, pars_UK, X=0, out='v', quiet=TRUE)
g_meuse_var = modifyList(g_meuse, list(gval=z_var))
pkern_plot(g_meuse_var, main='kriging variance')


#'
#' The above plots are for the variable on the log-scale. The unbiasdness property
#' is lost when transforming back to the original scale, but now that we have the
#' kriging variance, we can make a correction:
#'

#+ predictor

# prediction bias adjustment from log scale
z_pred2 = exp(z_pred + z_var/2)
g_meuse_pred2 = modifyList(g_meuse, list(gval=z_pred2))
pkern_plot(g_meuse_pred2,
           zlab='zinc (ppm)',
           zlim=range(meuse[['soils']][['zinc']]),
           main='UK predictions (masked to input range)')


#'
#' ## Summary
#'
#' This vignette is intended to get users up and running with `pkern` by demonstrating
#' a very simple example on a familiar dataset. In other vignettes we will look in more
#' detail at how `pkern_cmean` actually works, and how users can modify the workflow to
#' incorporate a trend model.


#'
#' ## Markdown
#'
#' This chunk below is used to create the markdown document you're reading from the
#' R script file with same name, in this directory. It uses `rmarkdown` to create
#' the md file, then substitutes local image paths for github URLs so the document
#' will display the images properly on my repo page.

if(FALSE)
{
  # Restart session and run code chunk below to build the markdown file
  library(here)
  library(rmarkdown)

  # make the markdown document and delete the unwanted html
  path.input = here('vignettes/meuse_vignette.R')
  path.output = here('vignettes/meuse_vignette.md')
  path.garbage = here('vignettes/meuse_vignette.html')
  rmarkdown::render(path.input, clean=TRUE, output_file=path.output)
  unlink(path.garbage)

  # substitute local file paths for image files with URLs on github
  md.github = gsub('D:/pkern', 'https://github.com/deankoch/pkern/blob/main', readLines(path.output))
  writeLines(md.github, path.output)
}