inst/scripts/01_bathymetry_stmv_example.md

Bathymetry modelling and prediction using STMV

The example here uses FFT (Fast Fourier Transforms) to discretize and describe the local spatial autocorrelation process (Matern), with no global model (no external drift). It is the method of choice for its speed and its ability to capture local variability; the following should take just a few minutes. The spatial domain is kept small for purposes of exposition. When applied to larger domains, first- and second-order nonstationarity becomes very obvious.
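For intuition, the Matern autocorrelation that is estimated locally can be written directly with base R's besselK. This is only a sketch under the common geoR-style parameterization (phi as the range parameter, nu as the smoothness); the matern_cor name is ours, and stmv's internal helpers may scale things differently:

matern_cor = function( d, phi, nu ) {
  # Matern correlation; reduces to exp(-d/phi) when nu = 0.5
  u = d / phi
  ifelse( u > 0, ( 2^(1-nu) / gamma(nu) ) * u^nu * besselK( u, nu ), 1 )
}
# e.g., with nu = 0.5, the distance d at which correlation drops to 0.1 satisfies exp(-d/phi) = 0.1,
# so phi = d / log(10); this is the kind of conversion done by matern_distance2phi() further below
matern_cor( d = 0.5, phi = 0.5 / log(10), nu = 0.5 )  # ~ 0.1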

This example runs in serial mode. Parallel mode is faster, but the number of concurrent processes must be balanced against RAM demands.

# prep data

require(aegis)
require(aegis.survey)
require(aegis.bathymetry)
require(stmv)
require(sf)

p0 = aegis::spatial_parameters( spatial_domain="bathymetry_example",
  aegis_proj4string_planar_km="+proj=utm +ellps=WGS84 +zone=20 +units=km",
  dres=1/60/4, pres=0.5, lon0=-64, lon1=-62, lat0=44, lat1=45, psignif=2 )

# p0 = stmv_test_data( "aegis.test.parameters")  # same thing
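# optional sanity check (not part of the original workflow): the spatial parameter list
# carries the projection, domain bounds and grid resolution used throughout
str( p0 )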

# separate data components
input = stmv::stmv_test_data( datasource="aegis.space", p=p0)
input = sf::st_as_sf( input, coords=c("lon","lat"), crs=st_crs(projection_proj4string("lonlat_wgs84")) )
input = sf::st_transform( input, crs=st_crs(p0$aegis_proj4string_planar_km) )
input = as.data.frame( cbind( input$z, st_coordinates(input) ) )
names(input) = c("z", "plon", "plat")
input = input[ which(is.finite(input$z)), ]
output = list( LOCS = spatial_grid(p0) )
DATA = list( input = input,  output = output )

input = output = NULL
gc()
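# at this point DATA$input is a plain data frame of depth (z) and planar coordinates (plon, plat),
# and DATA$output$LOCS is the prediction grid; an optional sanity check:
str( DATA, max.level=2 )
range( DATA$input$z )  # depth range in m (in this convention, negative values are above sea level)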

# set up interpolation options
p = bathymetry_parameters(
  p=p0,  # start with spatial settings of input data
  project_class="stmv",
  stmv_model_label="fft_tapered_statsgrid10km",  # this label is used as directory for storage
  data_root = file.path(tempdir(), "bathymetry_example"),
  DATA = DATA,
  spatial_domain = p0$spatial_domain,
  spatial_domain_subareas =NULL,
  inputdata_spatial_discretization_planar_km = p0$pres,  # pres = 0.5
  dimensionality="space",
  stmv_variables = list(Y="z"),  # required as fft has no formulae
  stmv_global_modelengine = "none",  # too much data to use glm as an entry into link space ... use a direct transformation
  stmv_local_modelengine="fft",
  stmv_fft_filter = "matern tapered lowpass modelled fast_predictions", #  matern with taper, fast predictions are sufficient as data density is high
  stmv_lowpass_nu = 0.5,
  stmv_lowpass_phi = stmv::matern_distance2phi( distance=0.5, nu=0.5, cor=0.1 ),  # note: p$pres = 0.5
  stmv_variogram_method = "fft",
  stmv_autocorrelation_fft_taper = 0.9,  # benchmark from which to taper; with high data density, local predictions are truncated to capture heterogeneity
  stmv_autocorrelation_localrange = 0.1,
  stmv_autocorrelation_interpolation = c(0.3, 0.25, 0.1, 0.05, 0.01),  # start with smaller localrange estimates and expand
  stmv_filter_depth_m = FALSE,  # need data above sea level to get coastline
  stmv_Y_transform =list(
    transf = function(x) {log10(x + 2500)} ,
    invers = function(x) {10^(x) - 2500}
  ), # data range is from -1667 to 5467 m: the shift makes all values positive (see the round-trip check after this parameter list)
  stmv_rsquared_threshold = 0.01, # lower threshold  .. i.e., ignore ... there is no timeseries model, nor a fixed effect spatial "model"
  stmv_distance_statsgrid = 10, # resolution (km) of data aggregation (i.e. generation of the ** statistics ** )
  stmv_distance_scale = c( 2.5, 5, 10, 20, 40, 60, 80 ), # km ... approx guesses of 95% AC range
  stmv_distance_interpolation = c( 5, 10, 15, 20, 40, 60, 80  ) , # permissible interpolation distances (km), i.e., from 1/2 the stats grid to an upper limit
  stmv_distance_prediction_limits =c( 2.5, 10 ), # range of permissible prediction distances (km), i.e., from 1/2 the stats grid to an upper limit based upon data density
  stmv_nmin = 30,  # min number of data points req before attempting to model in a localized space
  stmv_nmax = 900, # no real upper bound.. just speed /RAM
  stmv_force_complete_method = "fft",
  stmv_runmode = list(
    scale =list( 
      distsc1 = rep("localhost", 1), 
      distsc2 = rep("localhost", 1), 
      distsc3 = rep("localhost", 1), 
      distsc4 = rep("localhost", 1), 
      distsc5 = rep("localhost", 1), 
      distsc6 = rep("localhost", 1), 
      distsc7 = rep("localhost", 1)
    ),
    interpolate_correlation_basis = list(
      c1 = rep("localhost", 1),
      c2 = rep("localhost", 1),
      c3 = rep("localhost", 1),
      c4 = rep("localhost", 1),
      c5 = rep("localhost", 1)
    ),
    interpolate_predictions = list(
      c1 = rep("localhost", 1),
      c2 = rep("localhost", 1),
      c3 = rep("localhost", 1),
      c4 = rep("localhost", 1),
      c5 = rep("localhost", 1),
      c6 = rep("localhost", 1),
      c7 = rep("localhost", 1) 
    ),
    globalmodel = FALSE,
    save_intermediate_results = TRUE,
    save_completed_data = TRUE
  ) 
)
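# the stmv_Y_transform above shifts depths by 2500 m so that all values are positive before log10;
# a quick round-trip check over the stated data range (the test values below are illustrative only):
transf = function(x) { log10(x + 2500) }
invers = function(x) { 10^(x) - 2500 }
z_test = c( -1667, 0, 5467 )
invers( transf( z_test ) )  # recovers -1667, 0, 5467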


if (0) {
  # to force parallel mode

    scale_ram_required_main_process = 1 # GB for the main process (twostep / fft)
    scale_ram_required_per_process  = 1 # GB per process (twostep / fft / fields vario; mostly 0.5 GB, but up to 5 GB)
    scale_ncpus = min( parallel::detectCores(), floor( (ram_local()- scale_ram_required_main_process) / scale_ram_required_per_process ) )

    # 5 mins
    interpolate_ram_required_main_process = 1 # GB twostep / fft
    interpolate_ram_required_per_process  = 1.5 # twostep / fft /fields vario ..
    interpolate_ncpus = min( parallel::detectCores(), floor( (ram_local()- interpolate_ram_required_main_process) / interpolate_ram_required_per_process ) )

    stmv_runmode = list(
      scale = list( 
        distsc1 = rep("localhost", scale_ncpus), 
        distsc2 = rep("localhost", scale_ncpus), 
        distsc3 = rep("localhost", scale_ncpus), 
        distsc4 = rep("localhost", scale_ncpus), 
        distsc5 = rep("localhost", scale_ncpus), 
        distsc6 = rep("localhost", scale_ncpus), 
        distsc7 = rep("localhost", scale_ncpus)
      ),
      interpolate_correlation_basis = list(
        c1 = rep("localhost", interpolate_ncpus),  # ncpus for each runmode
        c2 = rep("localhost", interpolate_ncpus),  # ncpus for each runmode
        c3 = rep("localhost", interpolate_ncpus),
        c4 = rep("localhost", interpolate_ncpus),
        c5 = rep("localhost", interpolate_ncpus)
      ),
      interpolate_predictions = list(
        c1 = rep("localhost", interpolate_ncpus),  # ncpus for each runmode
        c2 = rep("localhost", interpolate_ncpus),  # ncpus for each runmode
        c3 = rep("localhost", interpolate_ncpus),
        c4 = rep("localhost", interpolate_ncpus),
        c5 = rep("localhost", interpolate_ncpus),
        c6 = rep("localhost", interpolate_ncpus),
        c7 = rep("localhost", interpolate_ncpus)
      ),
      globalmodel = FALSE,
      # restart_load = "interpolate_correlation_basis_0.01" ,  # only needed if this is restarting from some saved instance
      save_intermediate_results = TRUE,
      save_completed_data = TRUE

    )

}
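# note: if the block above is run to force parallel mode, the new runmode must replace the
# serial one inside p before calling stmv(); assuming p behaves as a plain named list
# (as returned by bathymetry_parameters), that would be:
# p$stmv_runmode = stmv_runmode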


# quick look at the raw data
# 'surface' and 'as.image' are from the 'fields' package; if not already attached, require(fields)
dev.new(); surface( as.image( Z=DATA$input$z, x=DATA$input[, c("plon", "plat")], nx=p$nplons, ny=p$nplats, na.rm=TRUE) )

The raw data, quickly gridded and plotted with "surface", show that land, to the north-west, is sparsely sampled.

Now we run the model:


stmv( p=p )  # this will take a few minutes, depending upon the system
# stmv_db( p=p, DS="cleanup.all" )

And then some outputs:


predictions = stmv_db( p=p, DS="stmv.prediction", ret="mean" )
statistics  = stmv_db( p=p, DS="stmv.stats" )
locations =  spatial_grid( p )
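# the code below indexes predictions and statistics by the prediction grid in 'locations';
# a quick look at the dimensions (optional):
length( predictions )   # one value per prediction location
dim( locations )        # planar coordinates (plon, plat) of the prediction grid
dim( statistics )       # columns are the statistics named in statsvars (extracted below)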


# comparison .. remove NAs as they mess up the isolines
i = which( is.finite( predictions ) )
# interp() is assumed to come from the 'interp' (or 'akima') package; if not already attached, require(interp)
o = interp( x=locations[i,1], y=locations[i,2], z=predictions[i],  xo=p$plons, yo=p$plats, method="linear" )
dev.new(); surface( o )

We can see that it recovers most of the oceanographic features and, as a bonus, even the land features, which are sampled on very different scales.

What makes STMV unique is that the spatial process is not assumed to be stationary. As such, we can recover estimates of the spatial parameters as a function of location!

statsvars = dimnames(statistics)[[2]]

# statsvars = c("sdTotal", "ndata", "fixed_mean", "fixed_sd", "dic", "dic_p_eff",
#   "waic", "waic_p_eff", "mlik", "Expected_number_of_parameters",
#   "Stdev_of_the_number_of_parameters", "Number_of_equivalent_replicates",
#   "Precision_for_the_Gaussian_observations", "Precision_for_space",
#   "Phi_for_space", "Precision_for_the_Gaussian_observations_sd", "Precision_for_space_sd", "Phi_for_space_sd"
# )

# statsvars = c( "sdTotal", "rsquared", "ndata", "sdSpatial", "sdObs", "phi", "nu", "localrange" )
# levelplot is from the 'lattice' package; if not already attached, require(lattice)
dev.new(); levelplot( predictions[] ~ locations[,1] + locations[,2], aspect="iso" )  # predicted depth
dev.new(); levelplot( statistics[,match("nu", statsvars)]  ~ locations[,1] + locations[,2], aspect="iso" )  # nu
dev.new(); levelplot( statistics[,match("sdTotal", statsvars)]  ~ locations[,1] + locations[,2], aspect="iso" )  # sd total
dev.new(); levelplot( statistics[,match("localrange", statsvars)]  ~ locations[,1] + locations[,2], aspect="iso" )  # localrange

These are estimates of the spatial variability of depth (sdTotal) as a function of location. Clearly they are spatially patterned, and so non-stationary!

Further, the shape of the AC (autocorrelation) function varies with space. The Matern nu (the strength of curvature of the AC function) as a function of space is shown in the nu plot above.

And estimates of the local spatial range, the length scale at which the AC function declines to some very low level, are shown in the localrange plot!
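A quick numerical summary complements the map; here the local range is in km (the planar projection units), and the "very low level" is the value set by stmv_autocorrelation_localrange above:

summary( statistics[ , match("localrange", statsvars) ] )
dev.new(); hist( statistics[ , match("localrange", statsvars) ], main="local spatial range", xlab="km" )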


