# Helper function for running long benchmarking test loop
#
# This takes the input data files created in the main vignette and opens each one
# in a loop, running the ordinary kriging workflow (creating predictions, and if possible,
# prediction variance) for the specified output grid size.
#
# An inner loop repeats this for all the packages listed in `pkg_test`, which must start with 'pkern',
# and include any subset of 'fields', 'RandomFields', 'geoR', 'gstat'. For each of these packages there
# should be a corresponding pair of helper functions defined (in 'run_methods.R') with the names
# `{pkg}_bench_fit` and `{pkg}_bench_OK` (where '{pkg}' is the package name). These fit the model
# and run the kriging workflow, timing results.
#
run_benchmark = function(inputs_df, dir_storage, pkg_test, n_rep, timeout, n_max, make_plots=FALSE)
{
# inputs_df: data-frame containing info on each input test file (and its path)
# pkg_test: names of packages to test
# make_plots: logical, indicates to plot the results as the loop progresses
# define directory to write to and path to CSV with info on all files written
dir_outputs = file.path(dir_storage, 'outputs')
if(!dir.exists(dir_outputs)) dir.create(dir_outputs)
outputs_csv_path = file.path(dir_storage, 'outputs.csv')
# begin long loop over input files to fill results_df and write outputs
results_df = data.frame()
for(i in 20:80)
#for(i in seq(nrow(inputs_df)))
{
# index of all output grids for this example
name_in = inputs_df[i, 'name_in']
i_all = which(inputs_df[['name_in']] == name_in)
i_eg = paste(', output', which(i_all==i), 'of', length(i_all))
# number of output points
n_out = inputs_df[i, 'n_out']
# print info about the example
cat(paste0('\n\nEXAMPLE ', i, ' OF ', nrow(inputs_df), ' (', name_in, i_eg, ')'))
cat(paste('\nnumber of training points:', inputs_df[i, 'n_train']))
cat(paste('\nnumber of output points:', n_out))
cat(paste('\nloading input file:', inputs_df[i, 'path_in']))
# load the input data file and unpack contents
path_in = inputs_df[i, 'path_in']
data_in = readRDS(path_in)
testing_pts = data_in[['testing_pts']]
training_pts = data_in[['training_pts']]
output_grid = data_in[['output_grid']]
training_grid = data_in[['training_grid']]
# initial values for the problem (assigned later)
initial = NULL
# run workflow loop over packages
for(j in seq_along(pkg_test))
{
# copy functions to use in kriging
package_name = pkg_test[j]
cat(paste('\n\nrunning workflow with', package_name, 'package...'))
fit_fun = get(paste0(package_name, '_bench_fit'))
pred_fun = get(paste0(package_name, '_bench_OK'))
# define path to write output
file_prefix = substr(basename(path_in), 1, nchar(basename(path_in))-4)
file_out = paste0(file_prefix, paste0('_', package_name, '.Rdata'))
path_out = file.path(dir_outputs, file_out)
# only fit once to each dataset
is_first = (i == i_all[1])
if(is_first)
{
# copy pkern's fitted parameters to use as initial values in geoR and fields
if(package_name != 'pkern')
{
# load fitted parameters from pkern
is_pkern = ( results_df[['pkg']] == 'pkern' ) & ( results_df[['name_in']] == name_in )
if(!any(is_pkern)) stop('pkern model fitting results expected but not found')
prev_result = readRDS(results_df[ which(is_pkern)[1], 'path_out'])
pkern_pars = prev_result[['fit_result']][['result']][['pars']]
rm(prev_result)
# copy to pkern fitted values to initial values vector
range_initial = pkern_pars[['y']][['kp']]
psill_initial = pkern_pars[['psill']]
initial = c(psill=psill_initial, range_initial)
}
cat('fitting...')
# geoR and fields receive a copy of pkern's fitted values to use as initials
initial_pass = initial
if(package_name == 'fields')
{
# fields produced singular matrix errors for the supplied initial values in
# some cases, so we substitute values that will work manually here.
# NULL means to use fields' defaults
if(name_in == 'treed') initial_pass = NULL
if(name_in == 'treed_88') initial_pass = list(rho=1e4)
if(name_in == 'treed_352') initial_pass = NULL
if(name_in == 'treed_1376') initial_pass = NULL
}
# skip model fitting if above maximum set in n_max
if( n_out < n_max[[name_in]][['fit']][package_name] )
{
# sink suppresses messy console printouts
sink(nullfile())
# no timeout on fit calls - user interrupt fails to halt them in some cases anyway
fit_result = fit_fun(training_grid, training_pts, n_rep, initial_pass, timeout=Inf)
sink()
} else {
fit_result = list(error='skipped')
}
} else {
# load previously fitted parameters
is_prev = ( results_df[['pkg']] == package_name ) & ( results_df[['name_in']] == name_in )
if(!any(is_prev)) stop('previous model fitting results expected but not found')
prev_result = readRDS(results_df[ which(is_prev)[1], 'path_out'])
fit_result = prev_result[['fit_result']]
rm(prev_result)
}
# initialize ordinary kriging output
OK_pred_result = OK_var_result = list()
OK_err = NA
error_string_pred = NA
error_string_var = NA
if( !is.null(fit_result[['error']]) )
{
cat('failed. Skipping prediction, MSPE, and variance.')
} else {
# run kriging prediction with timeout
cat('prediction...')
if( n_out < n_max[[name_in]][['pred']][package_name] )
{
# sink suppresses messy console printouts
sink(nullfile())
OK_pred_result = pred_fun(output_grid, training_pts, fit_result[['result']],
n_rep, 'p', timeout)
sink()
} else {
OK_pred_result = list(error='skipped')
}
# convert output to terra object for MSPE computation
if( !is.null(OK_pred_result[['error']]) )
{
cat('failed. Skipping MSPE and variance.')
# copy class of error
error_string_pred = class(OK_pred_result[['error']])[1]
} else {
# compute MSPE when there is a test set
if(!is.null(testing_pts))
{
# extract predictions and calculate MSPE
OK_rast = pkern_export(OK_pred_result[['result']])
OK_diff = testing_pts[['gval']] - terra::extract(OK_rast, testing_pts)[,2]
OK_err = mean(OK_diff^2, na.rm=TRUE)
}
# run kriging variance with timeout
cat('variance...')
if( n_out < n_max[[name_in]][['var']][package_name] )
{
sink(nullfile())
OK_var_result = pred_fun(output_grid, training_pts, fit_result[['result']],
n_rep, 'v', timeout)
sink()
} else {
OK_var_result = list(error='skipped')
}
# copy class of error (if any)
if( is.null(OK_var_result[['error']]) ) { cat('done.') } else {
error_string_var = class(OK_var_result[['error']])[1]
}
}
}
# assign NAs when execution failed
if( is.null(fit_result[['teval']]) ) fit_result[['teval']] = NA
if( is.null(OK_pred_result[['teval']]) ) OK_pred_result[['teval']] = NA
if( is.null(OK_var_result[['teval']]) ) OK_var_result[['teval']] = NA
if( is.null(OK_err) ) OK_err = NA
# gather into data-frame
results_ij = cbind(inputs_df[i,], data.frame(pkg = package_name,
fit_s = fit_result[['teval']],
pred_s = OK_pred_result[['teval']],
var_s = OK_var_result[['teval']],
mspe = OK_err,
path_out = path_out,
pred_failure = error_string_pred,
var_failure = error_string_var))
# copy results to storage data-frame
results_df = rbind(results_df, results_ij)
# save important objects to disk
cat(paste('\nwriting results to', path_out))
saveRDS(list(fit_fun = fit_fun,
pred_fun = pred_fun,
path_in = path_in,
fit_result = fit_result,
OK_pred_result = OK_pred_result,
OK_var_result = OK_var_result,
results_ij = results_ij), file=path_out)
}
# optionally make a plot to show results so far for this example
if(make_plots)
{
# extract all data for this example so far
results_df_batch = results_df[results_df[['name_in']] == name_in,]
results_df_batch[['log_10_n_out']] = log(results_df_batch[['n_out']], base=10)
col_names = c('black', 'blue', 'violet', 'red')
plot_colors = col_names[ match(results_df_batch[['pkg']], pkg_test) ]
# set zero eval time for situations where variance was skipped or doesn't apply
results_df_batch[['var_s']][ results_df_batch[['pkg']] %in% c('gstat', 'geoR', 'RandomFields') ] = 0
# create variance + prediction eval time column
results_df_batch[['all_s']] = results_df_batch[['pred_s']] + results_df_batch[['var_s']]
results_df_batch[['with_var']] = 'yes'
results_df_batch[['with_var']][results_df_batch[['pkg']] == 'RandomFields'] = 'no'
# add prediction time rows for two packages that allowed separate variance calculation
idx_pred_only = results_df_batch[['pkg']] %in% c('fields', 'pkern')
results_df_batch_new = results_df_batch[idx_pred_only,]
results_df_batch_new[['all_s']] = results_df_batch_new[['pred_s']]
results_df_batch_new[['with_var']] = 'no'
results_df_batch_combo = rbind(results_df_batch, results_df_batch_new)
# create prediction + variance time plot
gg1 = ggplot(results_df_batch_combo) +
aes(x=n_out, y=all_s, color=pkg, lty=with_var) +
geom_point() +
geom_line() +
geom_line() +
xlab('prediction points') +
ylab('time (seconds)') +
labs(color='R package',
lty='with variance') +
scale_x_log10(
breaks = scales::trans_breaks('log10', function(x) 10^x),
labels = scales::trans_format('log10', scales::math_format(10^.x))
) +
scale_y_log10(
breaks = scales::trans_breaks('log10', function(x) 10^x),
labels = scales::trans_format('log10', scales::math_format(10^.x))
) +
theme_bw() +
theme(text=element_text(size=8),
strip.text.x=element_text(face='bold'),
strip.text.y=element_text(face='bold'))
print(gg1)
# # create MSPE plot
# gg2 = ggplot(results_df_batch) +
# aes(x=n_out, y=mspe, color=pkg) +
# geom_point() +
# geom_line() +
# geom_line() +
# xlab('prediction points') +
# ylab('MSPE') +
# labs(color='R package',
# lty='with variance') +
# scale_x_log10(
# breaks = scales::trans_breaks('log10', function(x) 10^x),
# labels = scales::trans_format('log10', scales::math_format(10^.x))
# ) +
# scale_y_log10(
# breaks = scales::trans_breaks('log10', function(x) 10^x),
# labels = scales::trans_format('log10', scales::math_format(10^.x))
# ) +
# theme_bw() +
# theme(text=element_text(size=8),
# strip.text.x=element_text(face='bold'),
# strip.text.y=element_text(face='bold'))
# # draw the plots
# suppressMessages(grid.arrange(gg1, gg2, nrow=2))
}
}
# save results_df to disk as a csv
write.csv(results_df, file=outputs_csv_path, quote=FALSE)
return(outputs_csv_path)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.