# ----------------
# Prep OSD, snow crab and groundfish temperature profiles
# this step has to be done manually .. it is no longer maintained by anyone ..
if (!exists("year.assessment")) {
year.assessment=lubridate::year(Sys.Date())
year.assessment=lubridate::year(Sys.Date()) - 1
}
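# to target a specific assessment year instead, set it before sourcing this
# script and the guard above will leave it untouched, e.g.:
# year.assessment = 2018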
p = aegis::aegis_parameters( DS="temperature", yrs=1950:year.assessment ) # these are default years
# ------------------------------
create.baseline.database=FALSE
if ( create.baseline.database ) {
# 0 data assimilation
historical.data.redo = FALSE # set TRUE to rebuild the full historical archives below
if (historical.data.redo) {
# data dumps from various sources
temperature.db( p=p, DS="USSurvey_NEFSC" ) # US data .. new additions have to be made at the rawdata level manually
temperature.db( p=p, DS="lobster.redo" ) # FSRS data ... new additions have to be made at the rawdata level manually
temperature.db( p=p, DS="misc.redo" ) # various survey data, mostly scallop ... new additions have to be made at the rawdata level manually until a few db approach is finalized
temperature.db( DS="osd.rawdata.1910_2008", p=p ) # redo whole data set (historical) from 1910 to 2010
temperature.db( DS="osd.rawdata.2008_2016", p=p ) # 2008:2016, 2008 is overlapping ... overwrite the older series
# temperature.db( DS="ODF_ARCHIVE", p=p, yr=1969:2015 ) # specify range or specific year .. not used .. here in case the data series gets reactivated .. will need to be brought into profiles.annual.redo
# NOTE:: if groundfish data profiles are maintained by groundfish databases again, then an easy way would be
# to update aegis::groundfish.db( DS="gshyd.georef" ) to assimilate the data .. this would also mean that 00.surveys.r would need to be run first ..
}
# Roger Petipas has been maintaining a database; the following loads these data.
# The archive needs to be unzipped and the Data* files moved into:
# project.directory("aegis", "temperature", "archive", "profiles")
temperature.db( DS="osd.rawdata.annual", p=p, yr=2017:year.assessment ) # specify range or specific year
# Merge depth profiles from all data streams: OSD, groundfish, snowcrab, USSurvey_NEFSC
temperature.db( DS="profiles.annual.redo", p=p, yr=2008:year.assessment )
# Extract bottom data from each profile and discretize space and time to a manageable resolution
temperature.db( DS="bottom.annual.redo", p=p, yr=2008:year.assessment )
print( p$yrs ) # check that the full period 1950:present is selected (bare parentheses do not auto-print when sourced)
temperature.db( DS="bottom.all.redo", p=p )
}
# ------------------------------
# Basic data uptake now complete .. move to interpolations
# ------------------------------
create.interpolated.results.stmv=TRUE
if (create.interpolated.results.stmv ) {
# 1. stmv interpolations assuming some seasonal pattern
# twostep: ~ 160+ hrs
scale_ram_required_main_process = 0.8 # GB twostep / fft
scale_ram_required_per_process = 1.25 # twostep / fft /fields vario .. (mostly 0.5 GB, but up to 5 GB) -- 20 hrs
scale_ncpus = min( parallel::detectCores(), floor( (ram_local()- scale_ram_required_main_process) / scale_ram_required_per_process ) )
interpolate_ram_required_main_process = 24 # GB twostep / fft
interpolate_ram_required_per_process = 1.25 # 1 GB seems enough for twostep / fft /fields vario .. but make 2 in case
interpolate_ncpus = min( parallel::detectCores(), floor( (ram_local()- interpolate_ram_required_main_process) / interpolate_ram_required_per_process ) )
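# quick sanity check of the cluster sizing before launching; ram_local() is
# assumed to return available RAM in GB, as the calculations above imply
message( "scale_ncpus = ", scale_ncpus, "; interpolate_ncpus = ", interpolate_ncpus )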
nyrs = year.assessment-1950
p = aegis::aegis_parameters(
DS="temperature",
data_root = project.datadirectory( "aegis", "temperature" ),
spatial.domain = "canada.east", # default
DATA = 'temperature.db( p=p, DS="stmv.inputs" )',
additional.data=c("groundfish", "snowcrab", "USSurvey_NEFSC", "lobster"),
pres_discretization_temperature = 1 / 100, # 1 == p$pres; controls resolution of data prior to modelling (km .. i.e. 100 linear units finer than the final discretization pres)
yrs = 1950:year.assessment,
stmv_dimensionality="space-year-season",
stmv_global_modelengine = "none",
stmv_global_modelformula = "none", # a global model is only marginally useful here, so it is disabled
stmv_global_family ="none",
stmv_local_modelengine = "twostep" ,
stmv_local_modelformula_time = formula( paste(
't',
'~ s( yr, k=20, bs="ts") + s(cos.w, k=3, bs="ts") + s(sin.w, k=3, bs="ts") ',
'+ s( yr, cos.w, sin.w, k=30, bs="ts") ',
'+ s( log(z), k=3, bs="ts") + s( plon, k=3, bs="ts") + s( plat, k=3, bs="ts") ',
'+ s( log(z), plon, plat, k=30, bs="ts") '
) ),
stmv_twostep_time = "gam",
stmv_twostep_space = "fft", # everything else is too slow ...
stmv_fft_filter="lowpass_matern_tapered", # matern, krige (very slow), lowpass, lowpass_matern
stmv_range_correlation_fft_taper = 0.5, # local smoothing convolutions occur at this correlation scale
stmv_local_model_distanceweighted = TRUE,
stmv_rsquared_threshold = 0, # lower threshold .. not used by the twostep method
stmv_distance_statsgrid = 5, # resolution (km) of data aggregation (i.e. generation of the ** statistics ** )
stmv_distance_scale = c( 20, 25, 30, 35, 40, 45, 50 ), # km ... approx guess of 95% AC range
stmv_distance_prediction_fraction = 4/5, # i.e. 4/5 * 5 = 4 km
stmv_clusters = list( scale=rep("localhost", scale_ncpus), interpolate=rep("localhost", interpolate_ncpus) ), # ncpus for each runmode
stmv_nmin = 16*nyrs, # ~ 1000; minimum number of data points required before attempting to model a timeseries in a localized space .. guards against errors in the local model
stmv_nmax = 25*nyrs # no real upper bound .. limited only by speed / RAM .. memory use can reach 10 GB / core if set too large
)
stmv( p=p, runmode=c("scale", "interpolate") ) #700 MB (main); 500 MB child .. 2 days for scaling and 2 days for interpolation
debugging = FALSE # set TRUE to force a complete solution and inspect intermediate results
if (debugging) {
stmv( p=p, runmode=c("debug", "scale", "interpolate"), force_complete_solution=TRUE )
stmv_db( p=p, DS="stmv.results" ) # save to disk for use outside stmv*, returning to user scale
really.finished = FALSE # set TRUE only when completely done: cleanup removes all intermediate stmv files
if (really.finished) {
stmv_db( p=p, DS="cleanup.all" )
}
}
# 2. collect predictions from stmv and warp/break into sub-areas defined by
# p$spatial.domain.subareas = c( "SSE", "SSE.mpa", "snowcrab" )
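# if not already set by aegis_parameters, the subareas named above can be
# pinned explicitly before warping:
# p$spatial.domain.subareas = c( "SSE", "SSE.mpa", "snowcrab" )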
temperature.db( p=p, DS="predictions.redo" ) # ~1hr
temperature.db( p=p, DS="stmv.stats.redo" ) # warp to sub grids
# 3. extract relevant statistics .. including climatologies all defined by p$yrs
temperature.db( p=p, DS="bottom.statistics.annual.redo" )
# 4. all time slices in array format
temperature.db( p=p, DS="spatial.annual.seasonal.redo" )
# 5. time slice at prediction time of year
temperature.db( p=p, DS="timeslice.redo" )
# 6. complete statistics and warp/regrid database ... ~ 2 min :: currently only for the default grid .. TODO: also run for each subregion/subgrid
temperature.db( p=p, DS="complete.redo")
# 7. maps
# run only on local cores ... file swapping seems to reduce efficiency
# p = aegis::aegis_parameters( DS="temperature", yrs=1950:year.assessment )
temperature.map( p=p, DS="all", yr=p$yrs ) # default is to do all
# just redo a couple maps for ResDoc in the SSE domain
#p$bstats = "tmean"
p = spatial_parameters( p=p, spatial.domain = "SSE.mpa" ) # default grid and resolution
p$corners = data.frame(plon=c(150, 1022), plat=c(4600, 5320) ) # alter corners ..
temperature.map( p=p, DS='climatology' )
temperature.map( p=p, DS='stmv.stats' )
temperature.map( p=p, DS='annual' )
temperature.map( p=p, DS='seasonal' )
}
# finished