emaf: Package that interacts with stmv.

  survey.db = function( p=NULL, DS=NULL, year.filter=TRUE, add_groundfish_strata=FALSE ) {
    #\\ assimilation of all survey data into a coherent form
    surveydir = project.datadirectory( "aegis", "survey" )

    dir.create( surveydir, showWarnings=FALSE, recursive=TRUE )

    if (DS %in% c("set.init", "set.init.redo") ) {
      # survet sets
      set = NULL # trip/set loc information
      fn = file.path( surveydir, "set.init.rdata"  )
      if (DS=="set.init") {
        if (file.exists( fn) ) load( fn)
        return ( set )
      }

			set.names =  c("data.source", "id", "timestamp", "yr", "lon", "lat",
                     "z", "t", "sal", "oxyml", "sa", "sa_towdistance", "gear", "setquality", "settype", "cf_tow" )

      if ( "groundfish" %in% p$data.sources ) {
        # settype:
        # 1=stratified random,
        # 2=regular survey,
        # 3=unrepresentative(net damage),
        # 4=representative sp recorded(but only part of total catch),
        # 5=comparative fishing experiment,
        # 6=tagging,
        # 7=mesh/gear studies,
        # 8=explorartory fishing,
        # 9=hydrography
        pg = aegis::aegis_parameters(DS="groundfish", yrs=1970:max(p$yrs) )
        y = aegis::groundfish.db(p=pg, "set.base" )
        y$data.source = "groundfish"
        y$sa = y$sweptarea  # sa is in km^2 .. best estimate given data
        # y$sa_towdistance_wing = y$wing.sa
        y$sa_towdistance = y$sakm2
        y$z = y$sdepth  # m
        y$gear = y$geardesc
        y$setquality = NA
        y$setquality[ which( y$settype %in% c(1,2,5) ) ] = "good"
        gsvn = c("data.source", "id", "timestamp", "yr", "lon", "lat",
                 "z", "temp", "sal", "oxyml", "sa", "sa_towdistance", "gear", "setquality", "settype", "cf_tow" )
        set = rbind( set, y[ ,gsvn ] )
        names(set) = set.names
        rm (y); gc()
      }

      if ( "snowcrab" %in% p$data.sources ) {
        ps = bio.snowcrab::snowcrab.parameters(DS="groundfish", yrs=1999:max(p$yrs))
        y =  bio.snowcrab::snowcrab.db( p=ps, DS ="set.clean" )
        y$data.source = "snowcrab"
        y$gear ="Nephrops trawl"
        y$id = paste( y$trip, y$set, sep="." )
        y$settype  = y$towquality  # copy
        y$setquality = NA
        y$setquality[ which( y$towquality == 1 ) ] = "good"  # 1=good
        y$sal = NA  # dummy
        y$oxyml = NA # dummy var
        y$cf_tow = 1/y$sa
        # y$sa = y$sweptarea  # sa is in km^2 .. best estimate given data
        y$sa_towdistance = NA  # TODO
        set = rbind( set, y[ , set.names ] )  # sa is in km^2
        rm (y); gc()
      }

      save( set, file=fn, compress=T )
      return (fn)
    }


    # --------------------


    if (DS %in% c("cat.init", "cat.init.redo") ) {

      # all species caught
      cat = NULL # trip/cat loc information
      fn = file.path( surveydir, "cat.init.rdata"  )
      if (DS=="cat.init") {
        if (file.exists( fn) ) load( fn)
        return ( cat )
      }

      ###  NOTE:: cf == correction factor is a reweighting required to make each totno and totwgt comparable for each set and species subsampling

      cat.names =  c("data.source", "id", "id2", "spec", "spec_bio", "totno", "totwgt", "cf_cat" )
      if ( "groundfish" %in% p$data.sources ) {

        pg = aegis_parameters( DS="groundfish" )
        x = aegis::groundfish.db(p=pg, "gscat" )  #kg/set, no/set
        x$data.source = "groundfish"
        x$spec_bio = taxonomy.recode( from="spec", to="parsimonious", tolookup=x$spec )
        x$id2 = paste(x$id, x$spec_bio, sep=".")
        x = x[x$spec_bio > 0, ]

        # meansize.crude
        # fix missing numbers and mass estimates:
        # zeros for one while nonzeros for correpsonding records
        x$meanwgt = x$totwgt / x$totno  # kg / individual

        mw = applyMean( x[, c("spec_bio", "meanwgt", "cf_cat")], newnames=c("spec_bio", "meanweight.crude") )

        # meansize directly:
        k = groundfish.db( DS="gsdet" )
        k$spec_bio = taxonomy.recode( from="spec", to="parsimonious", tolookup=k$spec )
        ml = applyMean( k[,c( "spec_bio", "len")], newnames=c("spec_bio", "meanlength.direct") )
        mw = merge( mw, ml, by="spec_bio", all=T, sort=T )

        mm = applyMean( k[,c( "spec_bio", "mass")], newnames=c("spec_bio", "meanweight.direct"))
        mw = merge(mw, mm, by="spec_bio", all=T, sort=T )

        # directly determined mean size has greater reliability --- replace
        mw$meanweight = mw$meanweight.crude
        ii = which( is.finite(mw$meanweight.direct))
        mw$meanweight[ii] = mw$meanweight.direct[ii]
        mw = mw[which(is.finite(mw$meanweight)) ,]

        print( "Estimating catches from mean weight information... ")

        ii = which( x$totwgt > 0  & !is.finite(x$totno) )
        if (length(ii)>0) {
          # replace each number estimate with a best guess based upon average body weight in the historical record
          uu = unique( x$spec_bio[ii] )
          for (u in uu ) {
            os =  which( mw$spec_bio==u )
            if (length( os)==0 ) next()
            toreplace = intersect( ii, which( x$spec_bio==u) )
            x$totno[toreplace] = ceiling(x$totwgt[toreplace] / mw$meanweight[os])
          }
        }

        jj = which( x$totno >  0 & !is.finite(x$totwgt) )
        if (length(jj)>0) {
          # replace each number estimate with a best guess based upon average body weight in the historical record
          uu = unique( x$spec_bio[jj] )
          for (u in uu ) {
            os =  which( mw$spec_bio==u )
            if (length( os)==0 ) next()
            toreplace = intersect( jj, which( x$spec_bio==u) )
            x$totwgt[toreplace] = x$totno[toreplace] * mw$meanweight[os]
          }
        }


        # as spec codes have been altered, look for duplicates and update totals
        d = which(duplicated(x$id2))
        s = NULL
        for (i in d) {
          q = which(x$id2 == x$id2[i])
          x$totno[q[1]]  = sum( x$totno[q], na.rm=T )
          x$totwgt[q[1]] = sum( x$totwgt[q], na.rm=T )
          s = c(s, q[2:length(q)])
        }
        if (length(s)>0) x = x[-s,]

        oo = which( duplicated( x$id2) )
        if ( length( oo )>0 ) {
          print( x[ oo , "id2"] )
          stop("Duplcated id2's in gscat"  )
        }

        ll = which( !is.finite(x$totno) & !is.finite(x$totwgt) )
        if (length(ll) > 0) x = x[-ll,]



        # if without sweptarea, then another gear, use SA based upon positional info: sakm2
        isana = which(! is.finite( x$cf_cat))
        if (length(isana) > 0) x$cf_cat[isana] = x$cf_vessel[isana] / x$sakm2[isana]

        # qn = quantile( x$cf_cat, 0.99, na.rm=TRUE )
        # x$cf_cat[ x$cf_cat > qn ] = qn

				x = x[, cat.names]
        cat = rbind( cat, x )
        rm (x); gc()
      }

      if ( "snowcrab" %in% p$data.sources ) {
        x = bio.snowcrab::snowcrab.db( DS ="cat.georeferenced" ) # sa corrected ; kg/km2; no./km2
        x$data.source = "snowcrab"
        x$spec_bio = taxonomy.recode( from="spec", to="parsimonious", tolookup=x$spec )
        x$id = paste( x$trip, x$set, sep="." )
        x$id2 = paste( x$trip, x$set, x$spec_bio, sep="." )

        x$totno[ x$totno > 1e7] = NA
        x$totno = ceiling(x$totno * x$sa) # return to total rather than density
        x$totwgt = x$totmass * x$sa  # return to total rather than density
        x$cf_cat = 1 / x$sa  # == cf_tow (ie., no vessel "corrections")
        x = x[, cat.names]

        # snow crab are assumed to be real zeros .. find them and force 0 value
        iissp = taxonomy.recode( from="spec", to="parsimonious", tolookup=2526 ) # snow crab using groundfish codes
        oo = which( !is.finite(x$totno) & x$spec_bio==iissp  )
        if (length(oo) > 0 ) x$totno[oo] = 0
        oo = which( !is.finite(x$totwgt) & x$spec_bio== iissp )  # snow crab are assumed to be real zeros
        if (length(oo) > 0 ) x$totwgt[oo] = 0

				cat = rbind( cat, x[,names(cat)]  )
        rm (x); gc()
      }

      lh = taxonomy.db( "life.history" )
      lh = lh[,c("spec", "name.common", "name.scientific", "itis.tsn" )]
      cat = merge(x=cat, y=lh, by=c("spec"), all.x=T, all.y=F, sort=F)
      cat = cat[ which( cat$itis.tsn > 0 ), ]

      save( cat, file=fn, compress=T )
      return (fn)
    }



    # --------------------


    if (DS %in% c("det.init","det.init.redo") ) {
      # all species caught
      det = NULL # biologicals
      fn = file.path( surveydir, "det.init.rdata"  )
      if (DS=="det.init") {
        if (file.exists( fn) ) load( fn)
        return ( det )
      }


        # sex codes
        #  male = 0
        #  female = 1
        #  sex.unknown = 2

        # maturity codes
        #  immature = 0
        #  mature = 1
        #  mat.unknown = 2


      det.names =  c("data.source", "id", "id2", "spec", "spec_bio", "sex", "mass", "len", "mat")
      if ( "groundfish" %in% p$data.sources ) {
        pg = aegis_parameters( DS="groundfish" )
        x = aegis::groundfish.db(p=pg, "gsdet" )
        x$data.source = "groundfish"

        x$spec_bio = taxonomy.recode( from="spec", to="parsimonious", tolookup=x$spec )
        x$id2 = paste(x$id, x$spec_bio, sep=".")
        x = x[x$spec_bio > 0, ]

        # mass in kg, len in cm

        # convert sex codes to snow crab standard
        # --------- codes ----------------
        # sex: 0=undetermined, 1=male, 2=female,  3=hermaphrodite, 9= not examined
        # mat: 0=observed but undetermined, 1=imm, 2=ripening(1), 3=ripening(2), 4=ripe(mature),
        #      5=spawning(running), 6=spent, 7=recovering, 8=resting
        # settype: 1=stratified random, 2=regular survey, 3=unrepresentative(net damage),
        #      4=representative sp recorded(but only part of total catch), 5=comparative fishing experiment,
        #      6=tagging, 7=mesh/gear studies, 8=explorartory fishing, 9=hydrography
        # --------- codes ----------------

        u = which(x$spec==2526)
        if ( mean(x$len[u], na.rm=TRUE) > 20 ) {
          # 200 mm or 20 cm is really the upper limit of what is possible for snow crab (in 2016, it was 50)
          # if the mean is above this then there is an issue, assume it is recorded as mm
          # and convert to cm as that is the expectation in groundfish.db p=p,and aegis_db
          message( "groundfish gsdet seems to have stored snowcrab lengths in mm .. fixing ? -- please check other species such as lobster, etc.")
          x$len[u] = x$len[u] / 10
          # mass looks ok .. in kg
        }

        sx = x$sex
        x$sex = NA
        oo = which( sx %in% c(0, 3, 9) ); if (length(oo)>0) x$sex[oo] = 2 # unknown
        oo = which( sx %in% c(1) ); if (length(oo)>0) x$sex[oo] = 0 # male
        oo = which( sx %in% c(2) ); if (length(oo)>0) x$sex[oo] = 1 # female

        # convert maturity to snow crab standard
        mt = x$mat
        x$mat = NA
        oo = which( mt %in% c(0) ); if (length(oo)>0) x$mat[oo] = 2 # unknown
        oo = which( mt %in% c(1) ); if (length(oo)>0) x$mat[oo] = 0  # immature
        oo = which( mt %in% c(2,3,4,5,6,7,8) ); if (length(oo)>0) x$mat[oo] = 1 # mature  -- investmvent into gonads has begun

        det = rbind( det, x[, det.names] )
        rm (x); gc()

      }

      if ( "snowcrab" %in% p$data.sources ) {
        # snow crab only ... add bycatch from survey :: TODO
        x = bio.snowcrab::snowcrab.db( DS ="det.georeferenced" )
        x$data.source = "snowcrab"
        x$spec = 2526
        x$spec_bio =  taxonomy.recode( from="spec", to="parsimonious", tolookup=x$spec ) # snow crab using groundfish codes
        x$id = paste( x$trip, x$set, sep="." )
        x$id2 = paste( x$id, x$spec_bio, sep=".")

        x$len = x$cw / 10  # convert mm to cm
        # x$cf_det = 1/x$sa  ########## <<<<<< ------ NOTE THIS accounts only for SA as there is no subsampling (so far)
        x$sex = as.numeric( as.character( x$sex) )
        x$mat = as.numeric( as.character( x$mat) )
        x$mass = x$mass /1000  # g to kg

        det = rbind( det, x[, det.names] )
        rm (x); gc()
      }


      save( det, file=fn, compress=T )
      return (fn)
    }


    # -------------


    if (DS %in% c("set.base", "set.base.redo") ) {
      # lookup missing information

      set = NULL # trip/set loc information
      fn = file.path( surveydir, "set.base.rdata"  )
      if (DS=="set.base") {
        if (file.exists( fn) ) load( fn)
        if( year.filter) if (exists("yrs", p) ) set = set[ set$yr %in% p$yrs, ]  # select to correct years
        return ( set )
      }

      set = survey.db( DS="set.init", p=p )
      set = set[ which(is.finite(set$lon + set$lat + set$yr ) ) , ]  #  fields are required
      oo =  which( !duplicated(set$id) )
      if (length(oo) > 0 ) set = set[ oo, ]
      set = lonlat2planar( set, proj.type=p$internal.crs )  # plon+plat required for lookups

      set$dyear = lubridate::decimal_date( set$timestamp ) - set$yr

      # merge depth
      iz = which( !is.finite(set$z) )
      if (length(iz) > 0) {
        set$z[iz] = bathymetry.lookup( p=p, locs=set[iz, c("plon","plat")], vnames="z" )
      }

      # merge temperature
      it = which( !is.finite(set$t) )
      if (length(it) > 0) {
        set$t[it] = temperature.lookup( p=p, locs=set[it, c("plon","plat")], timestamp=set$timestamp[it] )
      }

      set$oxysat = compute.oxygen.saturation( t.C=set$t, sal.ppt=set$sal, oxy.ml.l=set$oxyml)

      save( set, file=fn, compress=T )
      return (fn)
    }


    # ---------------------

    if (DS %in% c("lengthweight.redo", "lengthweight.parameters", "lengthweight.residuals") ) {

      ## TODO -- make parallel require(multicore)

      ddir = file.path( project.datadirectory("aegis"), "data" )
      dir.create( ddir, showWarnings=FALSE, recursive=TRUE )

      fn = file.path( ddir, "bio.length.weight.parameters.rdata" )
      fn2 = file.path( ddir, "bio.length.weight.residuals.rdata" )

      if (DS=="lengthweight.parameters") {
        res = NULL
        if (file.exists( fn ) ) load( fn )
        return( res )
      }

      if (DS=="lengthweight.residuals") {
        lwr = NULL
        if (file.exists( fn2 ) ) load( fn2 )
        return( lwr )
      }

      # this mirrors the relevent changes/recoding in aegis_db("det")
      x = survey.db( DS="det.init", p=p )

      x = x[ which( is.finite( x$spec_bio)), ]
      x$sex[ which( !is.finite(x$sex)) ] = 2 # set all uncertain sexes to one code sex code
      x$mat[ which( !is.finite(x$mat)) ] = 2 # set all uncertain sexes to one code sex code

      res = expand.grid(
        spec_bio = sort( unique( x$spec_bio )),
        sex = sort( unique( x$sex )),
        mat = sort( unique( x$mat ))
      )
      # sex codes (snowcrab standard)
      #  male = 0
      #  female = 1
      #  sex.unknown = 2

      # mat codes (snowcrab standard)
      #  imm = 0
      #  mat = 1
      #  mat.unknown = 2

      unknown = 2
      x$residual = NA

      # initialise new variables
      res$rsq = NA
      res$sigma = NA
      res$df = NA
      res$b0 = NA
      res$b1 = NA
      res$b0.se = NA
      res$b1.se = NA
      res$pvalue = NA
      for (i in 1:nrow(res)) {
        wsp = which( x$spec_bio == res$spec_bio[i] )
        if (length( wsp) < 3 ) next()
        # remove extremes for each species from the data to generate regressions
        ql = quantile( x$len[wsp], probs=c(0.005, 0.995), na.rm=T )
        qm = quantile( x$mass[wsp], probs=c(0.005, 0.995), na.rm=T )
        wqn =  which( x$len> ql[1] & x$len< ql[2] & x$mass> qm[1] & x$mass< qm[2]  )
        wsx = which( x$sex==res$sex[i] )
        if (res$sex[i]==unknown) wsx = wsp  # use all possible data (not just unknowns) for this class (hermaphrodite/unsexed/unknown)
        wmt = which( x$mat==res$mat[i] )
        if (res$sex[i]==unknown) wsx = wsp  # use all possible data (not just unknowns) for this class (mat unkown)
        w = intersect( intersect( intersect( wsp, wqn ), wsx ), wmt )
        nw = length(w)
        if ( nw > 5 ) {
          q = x[w ,]
          q.lm = try( lm( log10(mass) ~ log10(len), data=q ) )
          if (class( q.lm) %in% "error" ) next()
          s = summary( q.lm )
          res$rsq[i] = s$r.squared
          res$sigma[i] = s$sigma
          res$df[i] = s$df[2]
          res$b0[i] = s$coefficients[1]
          res$b1[i] = s$coefficients[2]
          res$b0.se[i] = s$coefficients[3]
          res$b1.se[i] = s$coefficients[4]
          res$pvalue[i] = pf(s$fstatistic[1],s$fstatistic[2],s$fstatistic[3],lower.tail=FALSE)
          x$residual[w] = rstandard(q.lm)
          print( res[i,] )
        }
      }
      ooo = which( abs( x$residual ) > 4 )
      if (length(ooo) > 0 ) x$residual [ooo] = NA
      lwr = x
      save( lwr, file=fn2, compress=TRUE )
      save( res, file=fn, compress=TRUE )
      return( fn )

    }


    # --------------------


    if (DS %in% c("det","det.redo") ) {

      # error checking, imputation, etc
      det = NULL
      fn = file.path( surveydir, "det.rdata"  )
      if (DS=="det") {
        if (file.exists( fn) ) load( fn)
        return ( det )
      }

        # sex codes
        #  male = 0
        #  female = 1
        #  sex.unknown = 2

        # maturity codes
        #  immature = 0
        #  mature = 1
        #  mat.unknown = 2

      det = survey.db( DS="lengthweight.residuals", p=p )

      # fix mass, length estimates where possible using model parameters
      # try finest match first: by spec_bio:mat, spec_bio:sex, spec_bio

      lwp = survey.db( DS="lengthweight.parameters", p=p )
      # note: lwp$spec is derived from spec_bio, as above

      ims = which( !is.finite( det$mass) )
      sps = sort( unique( det$spec_bio[ ims ] ) )
      mats = sort( unique( det$mat))
      sexes = sort( unique( det$sex))

      for (sp in sps) {
        isp = which( det$spec_bio == sp )

        # first try exact matches based upon {spec_bio, mat, sex}
        for ( mat in mats ) {
        for ( sex in sexes ) {
          u = which( det$mat==mat & det$sex==sex & !is.finite(det$mass ) )
          w = intersect( isp, u )
          if (length(w) > 0) {
            v = which( lwp$spec_bio==sp & lwp$mat==mat & lwp$sex==sex )
            if (length(v)==1) det$mass[w] = 10^( lwp$b0[v] + lwp$b1[v] * log10(det$len[w]) )
          }
        }}

        # next try exact matches based upon {spec_bio, mat}
        for ( mat in mats ) {
          u = which( det$mat==mat & !is.finite(det$mass )  )
          w = intersect( isp, u )
          if (length(w) > 0) {
            v = which( lwp$spec_bio==sp & lwp$mat==mat & is.na(lwp$sex   ) )
            if (length(v)==1) det$mass[w] = 10^( lwp$b0[v] + lwp$b1[v] * log10(det$len[w]) )
          }
        }

       # next try exact matches based upon {spec_bio, sex}
        for ( sex in sexes ) {
          u = which( det$sex==sex & !is.finite(det$mass )  )
          w = intersect( isp, u )
          if (length(w) > 0) {
            v = which( lwp$spec_bio==sp & lwp$sex==sex & is.na(lwp$mat  ) )
            if (length(v)==1) det$mass[w] = 10^( lwp$b0[v] + lwp$b1[v] * log10(det$len[w]) )
          }
        }

       # next try exact matches based upon {spec_bio} only
          u = which( is.na(det$mass ))
          w = intersect( isp , u )
          if (length(w) > 0) {
            v = which( lwp$spec_bio==sp & is.na(lwp$sex) & is.na(lwp$mat  ) )
            if (length(v)==1) det$mass[w] = 10^( lwp$b0[v] + lwp$b1[v] * log10(det$len[w]) )
          }
      }

      # estimate metabolic rates estimates (requires temperature estimate )
      set = survey.db( DS="set.base", p=p  ) # kg, no
      set = set[ , c("id", "t")]  # temperature is required to estimate MR ..

      det = merge( det, set, by="id", all.x=T, all.y=F, sort=F )
      detmr = metabolic.rates ( mass.g=det$mass * 1000, temperature.C=det$t )
      det = cbind( det, detmr )


      # determine weighting factor for individual-level measurements (len, weight, condition, etc)

      # at the set level, some species are not sampled even though sampwgt's are recorded
      # this makes the total biomass > than that estimated from "DET"
      # an additional correction factor is required to bring it back to the total biomass caught,
      # this must be aggregated across all species within each set :

      # correction factors for sampling etc after determination of mass and len
      # for missing data due to subsampling methodology
      # totals in the subsample that was taken should == sampwgt (in theory) but do not
      # ... this is a rescaling of the sum to make it a 'proper' subsample

      cat = survey.db( DS="cat.init", p=p )

      massTotCat = applySum( det[ ,c("id2", "mass")], newnames=c("id2","massTotdet" ) )
      cat = merge( cat, massTotCat, by="id2", all.x=T, all.y=F, sort=F )  # set-->kg/km^2, det-->km
      cat$massTotdet[ which( !is.finite (cat$massTotdet ))] = 0  ### when missing it means no determinations were made
      cat$cf_det_wgt =  cat$massTotdet * cat$cf_cat / cat$totwgt   # cf_det is the multiplier required to make each det measurement scale properly to totwgt in units of Alfred Needler  .. subsample

      det$no = 1
      noTotCat = applySum( det[ ,c("id2", "no")], newnames=c("id2","noTotdet" ) )
      cat = merge( cat, noTotCat, by="id2", all.x=T, all.y=F, sort=F )    # set-->no/km^2, det-->no
      cat$noTotdet[ which( !is.finite (cat$noTotdet ))] = 0  ### when missing it means no determinations were made
      cat$cf_det_no = cat$noTotdet * cat$cf_cat / cat$totno   # cf_det is the multiplier required to make each det measurement scale properly to totno in units of Alfred Needler  .. subsample
      det$no = NULL


    # assume no subsampling -- all weights determined from the subsample
    oo = which ( !is.finite( cat$cf_det_wgt ) |  cat$cf_det_wgt==0 )
    if (length(oo)>0) cat$cf_det_wgt[oo] = 1

    # assume no subsampling -- all weights determined from the subsample
    oo = which ( !is.finite( cat$cf_det_no ) |  cat$cf_det_no==0 )
    if (length(oo)>0) cat$cf_det_no[oo] = 1

    # oo = which ( cat$cf_det_wgt < 0.001 )
    # if (length(oo)>0) cat$cf_det_wgt[oo] = NA
    #
    # oo = which ( cat$cf_det_wgt > 500 )
    # if (length(oo)>0) cat$cf_det_wgt[oo] = NA

      cat = cat[, c("id2", "cf_det_wgt", "cf_det_no")]
      det = merge( det, cat, by="id2", all.x=T, all.y=F, sort=F)

      det$cf_det_wgt [!is.finite(det$cf_det_wgt)] = 1
      det$cf_det_no [!is.finite(det$cf_det_no)] = 1

      ## remaining NA's with cf_det are mostly due to bad hauls, broken nets etc.

      save (det, file=fn, compress=TRUE )
      return (fn)
    }


    # --------------------


    if (DS %in% c("cat", "cat.redo") ) {
      # all species caught
      cat = NULL # biologicals
      fn = file.path( surveydir, "cat.rdata"  )
      if (DS=="cat") {
        if (file.exists( fn) ) load( fn)
        return ( cat )
      }

      set = survey.db( DS="set.init", p=p  ) # kg/km^2, no/km^2

      det = survey.db( DS="det", p=p  ) # size information, no, cm, kg
      det = det[ which( det$id %in% unique( set$id) ), ]

      cat = survey.db( DS="cat.init", p=p )
      cat = cat[ which( cat$id %in% unique( set$id) ), ]
      cat = merge(cat, set[, c("id", "gear")])

      oo = which( duplicated( cat$id2) )
      if (length( oo) > 0 ) cat = cat[ -oo, ]

      cm = data.frame( id2=as.character( sort( unique( cat$id2 ) )), stringsAsFactors=FALSE )
      cm = merge( cm, applySum( det[ , c("id2", "mr", "cf_det_wgt")] ), by="id2", all.x=TRUE, all.y=FALSE, sort=FALSE )

      # averages of these variables
      newvars = c( "residual", "mass", "len", "Ea", "A", "Pr.Reaction", "smr"  )
      for ( nv in newvars ) {
      #browser()
        cm = merge( cm,
          applyMean( det[ , c("id2", nv, "cf_det_wgt")] ), by="id2", all.x=TRUE, all.y=FALSE, sort=FALSE )
      }

      cat = merge( cat, cm, by="id2", all.x=TRUE, all.y=FALSE, sort=FALSE )

      # where det measurements not available, estimate mean mass from total weights and numbers
      oo = which( !is.finite( cat$mass ))
      if (length(oo) > 0 ) {
        cat$mass[oo] = cat$totwgt[oo] / cat$totno[oo]
      }

			# in the following:	quantiles are computed,
      cat$uid = paste( cat$data.source, cat$gear, cat$spec_bio )

      cat$qn = NA  # default when no data
      oo = which( cat$totno == 0 )  # retain as zero values
      if (length(oo)>0 ) cat$qn[oo] = 0
      for ( iii in unique(cat$uid) ) {
        jjj = which( cat$uid==iii & cat$totno > 0 )
        if (length( jjj ) > 3 ) {
          cat$qn[jjj] = quantile_estimate( cat$totno[jjj] * cat$cf_cat[jjj]  )  # convert to quantiles, by species, geartype and survey
        }
      }
     
			cat$qm = NA   # default when no data
      oo = which( cat$totwgt == 0 )  # retain as zero values
      if (length(oo)>0 ) cat$qm[oo] = 0
      for ( iii in unique(cat$uid) ) {
        jjj = which( cat$uid==iii & cat$totwgt > 0 )
        if (length( jjj ) > 3 ) {
          cat$qm[jjj] = quantile_estimate( cat$totwgt[jjj] * cat$cf_cat[jjj]  )  # convert to quantiles, by species, geartype and survey
        }
      }

      cat$uid = NULL

     # convert from quantile to z-score

      cat$zm = quantile_to_normal( cat$qm )
      cat$zn = quantile_to_normal( cat$qn )


			over.write.missing.data = TRUE
			if (over.write.missing.data) {

				# over-write na's for n or mass from each other, where possible:
				kxm = which( !is.finite( cat$qm) )
				kxn = which( !is.finite( cat$qn) )

				kmn = setdiff( kxm, kxn )
				knm = setdiff( kxn, kxm )

				if ( length( knm) > 0 ) cat$qn[knm] =  cat$qm[knm]
				if ( length( kmn) > 0 ) cat$qm[kmn] =  cat$qn[kmn]

				# remaining missing values take the median value for each species == 0.5
				kxm = which( !is.finite( cat$qm ) )
				if ( length( kxm) > 0 ) cat$qm[kxm] = 0.5

				kxn = which( !is.finite( cat$qn ) )
				if ( length( kxn) > 0 ) cat$qn[kxn] = 0.5

			}

      save (cat, file=fn, compress=TRUE )
      return (fn)

    }



    # -------------



    if (DS %in% c("set","set.redo") ) {

      # survet sets
      set = NULL # trip/set loc information
      fn = file.path( surveydir, "set.rdata"  )
      if (DS=="set") {
        if (file.exists( fn) ) load( fn)
        return ( set )
      }

      set = survey.db( DS="set.base", p=p )
      if (exists("yrs", p)) set = set[ set$yr %in% p$yrs,  ]  # select to correct years

      det = survey.db( DS="det", p=p  ) # size information, no, cm, kg
      det = det[ which( det$id %in% unique( set$id) ), ]

      cat = survey.db( DS="cat", p=p )
      cat = cat[ which( cat$id %in% unique( set$id) ), ]

      # NOTE: cat$totno and cat$totwgt are not cf corrected

      sm = data.frame( id=as.character( sort( unique( set$id ) )), stringsAsFactors=FALSE )

      # summaries from cat .. weighted by cf to make per standard unit
      sm = merge( sm, applySum( cat[ , c("id", "totno") ], newnames=c("id", "totno") ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )
      sm = merge( sm, applySum( cat[ , c("id", "totwgt") ], newnames=c("id", "totwgt")  ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

      sm = merge( sm, applySum( cat[ , c("id", "totno", "cf_cat") ], newnames=c("id", "totno_adjusted") ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )
      sm = merge( sm, applySum( cat[ , c("id", "totwgt", "cf_cat") ], newnames=c("id", "totwgt_adjusted")  ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

      sm$cf_set_mass = sm$totwgt_adjusted / sm$totwgt
      sm$cf_set_no = sm$totno_adjusted / sm$totno
      # NOTE:: these should be == or ~= 1/set$sa ( done this way in case there has been other adjustmensts such as subampling, etc ..) .. these become offets required to express totwgt or totno as areal density per unit km^2 in Poisson models

      # summaries from det
      # --- NOTE det was not always determined and so totals from det mass != totals from cat nor set for all years
      sm = merge( sm, applySum( det[ , c("id", "mr", "cf_det_wgt")] ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

      # averages of these variables from det
      newvars = c( "residual", "mass", "len", "Ea", "A", "Pr.Reaction", "smr"  )
      for ( nv in newvars ) {
        sm = merge( sm,
          applyMean( det[ , c("id", nv, "cf_det_wgt")] ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )
      }

      set = merge( set, sm, by ="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

    	surveys = sort( unique( set$data.source ) )

			# in the following:	quantiles are computed,
      set$qn = NA  # default when no data
      oo = which( set$totno_adjusted == 0 )  # retain as zero values
      if (length(oo)>0 ) set$qn[oo] = 0

      for ( s in surveys ) {
        ii = which( set$data.source==s & set$totno_adjusted > 0  )
        if (length( ii) > 0 ) {
  				set$qn[ii] = quantile_estimate( set$totno_adjusted[ii] )  # convert to quantiles, by survey
				}
      }

			set$qm = NA   # default when no data
      oo = which( set$totwgt_adjusted == 0 )  # retain as zero values
      if (length(oo)>0 ) set$qm[oo] = 0

      for ( s in surveys ) {
        ii = which( set$data.source==s & set$totwgt_adjusted > 0 )
          if (length( ii) > 0 ) {
						set$qm[ii] = quantile_estimate( set$totwgt_adjusted[ii] )  # convert to quantiles, by survey
					}
      }

     # convert from quantile to z-score

      set$zm = quantile_to_normal( set$qm )
      set$zn = quantile_to_normal( set$qn )


			over.write.missing.data = TRUE
			if (over.write.missing.data) {

				# over-write na's for n or mass from each other, where possible:
				kxm = which( !is.finite( set$qm) )
				kxn = which( !is.finite( set$qn) )

				kmn = setdiff( kxm, kxn )
				knm = setdiff( kxn, kxm )

				if ( length( knm) > 0 ) set$qn[knm] =  set$qm[knm]
				if ( length( kmn) > 0 ) set$qm[kmn] =  set$qn[kmn]

				# remaining missing values take the median value == 0.5
				kxm = which( !is.finite( set$qm ) )
				if ( length( kxm) > 0 ) set$qm[kxm] = 0.5

				kxn = which( !is.finite( set$qn ) )
				if ( length( kxn) > 0 ) set$qn[kxn] = 0.5

			}

      save( set, file=fn, compress=T )
      return (fn)
    }


    # ---------------------

    if (DS == "det.filter" ) {
      # selected for a given set of species  and size and sex and maturity
      # wrapper around survey.db and groundfish.db to permit abundance data to be passed, including zero valued locations
      # selected for a given set of species  and size and sex and maturity
      # add zero valued locations too..

      # ft2m = 0.3048
      # m2km = 1/1000
      # nmi2mi = 1.1507794
      # mi2ft = 5280
      # k$sakm2 = (41 * ft2m * m2km ) * ( k$dist * nmi2mi * mi2ft * ft2m * m2km )  # surface area sampled in km^2
      # standardtow_sakm2 = (41 * ft2m * m2km ) * ( 1.75 * nmi2mi * mi2ft * ft2m * m2km )  # surface area sampled by a standard tow in km^2  1.75 nm

      # must not subset "set" until after all det/cat info has been resolved in order to reatin zerovalued sets
      set = det = NULL # trip/set loc information
      set = survey.db( DS="set.base", p=p )
      u = data.frame( matrix( unlist(strsplit( set$id, ".", fixed=TRUE)), ncol=2, byrow=TRUE), stringsAsFactors=FALSE )
      set$mission = as.character( u[,1] )
      set$setno = as.numeric( u[,2] )

      if (add_groundfish_strata) {
        polygon_source = "pre2014"  # "pre2014" for older
        sppoly = maritimes_groundfish_strata( timeperiod=polygon_source, returntype="polygons" )
        # sppoly = spTransform(sppoly, sp::CRS(p$internal.crs_planar) )
        # sppoly$sa_strata_km2 = gArea(sppoly, byid=TRUE) / 1000/1000 # (m^2 -> km^2)
        # sppoly = spTransform(sppoly, sp::CRS(p$internal.crs) )
        set = maritimes_groundfish_strata_identify( Y=set, sppoly=sppoly, xyvars=c("lon", "lat"), planar_crs_km=p$internal.crs, plotdata=TRUE )
      }

      # filter non-biologicals ... i.e, set characteristics
      if (exists("selection", p)) {
        if (exists("survey", p$selection)) {  # filter survey information
          if (exists("StrataID", set) ) {
            if (exists("polygon_enforce", p$selection$survey) ) {
              set = set[ which(!is.na(set$StrataID)), ] # remove unsetegorized sets
            }
            if (exists("strata_toremove", p$selection$survey) ) {
              todrop = which( set$StrataID %in% strata_definitions( p$selection$survey[["strata_toremove"]] ) )
              if (length(todrop) > 0) set = set[- todrop, ]
            }
            if (exists("strata_tokeep", p$selection$survey) ){
              set = set[which( set$StrataID %in% p$selection$survey[["strata_to_keep"]] ) , ]
            }
          }
          if (exists("months", p$selection$survey) ) set = set[ which(month(set$timestamp) %in% p$selection$survey[["months"]] ), ]
          isc = filter_data( set, p$selection$survey )
          if (length(isc) > 0) set = set[isc,]
          isc = NULL
        }
      }

      # indiviudal measurements filter
      det = survey.db( DS="det", p=p  ) # size information, no, cm, kg
      det = det[ which( det$id %in% unique( set$id) ), ]
        if (exists("selection", p)) {
          if (exists("biologicals", p$selection)) {  # filter biologicals
            isc = filter_data( det, p$selection$biologicals )
            if (length(isc) > 0) det = det[isc,]
            isc = NULL
          }
        }

      # summaries from det
      # --- NOTE det was not always determined and so totals from det mass != totals from cat nor set for all years
      # cf_det is the weight to make it sum up to the correct total catch (vs any subsamples) and tow length, etc
      det$no = 1

      set = merge( set, applySum( det[ , c("id", "no")],   newnames=c("id", "totno") ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )
      set = merge( set, applySum( det[ , c("id", "mass")], newnames=c("id", "totwgt") ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

      cat = survey.db( DS="cat", p=p  ) # size information, no, cm, kg
      cat = cat[ which( cat$id %in% unique( set$id) ), ]
        if (exists("selection", p)) {
          if (exists("biologicals", p$selection)) {  # filter biologicals
            isc = filter_data( cat, p$selection$biologicals )
            if (length(isc) > 0) cat = cat[isc,]
            isc = NULL
          }
        }

      set = merge( set, applySum( cat[ , c("id", "totno", "cf_cat")], newnames=c("id", "totno_adjusted") ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )
      set = merge( set, applySum( cat[ , c("id", "totwgt", "cf_cat")], newnames=c("id", "totwgt_adjusted") ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

      set$totno_adjusted[ which(!is.finite(set$totno_adjusted))] = 0
      set$totwgt_adjusted[ which(!is.finite(set$totwgt_adjusted))] = 0
      set$totno[ which(!is.finite(set$totno))] = 0
      set$totwgt[ which(!is.finite(set$totwgt))] = 0

      set$cf_set_mass = set$totwgt_adjusted / set$totwgt
      set$cf_set_no = set$totno_adjusted / set$totno

      ii = which(!is.finite(set$cf_set_mass ))
      if (length(ii) > 0) set$cf_set_mass[ii] = set$cf_tow[ii]

      ii = which(!is.finite(set$cf_set_no ))
      if (length(ii) > 0) set$cf_set_no[ii] = set$cf_tow[ii]


      # NOTE:: these should be == or ~= 1/set$sa ( done this way in case there has been other adjustmensts such as subampling, etc ..) .. these become offets required to express totwgt or totno at a common areal density per unit km^2 in Poisson models

      # --- NOTE det was not always determined and so totals from det mass != totals from cat nor set for all years
      set = merge( set, applySum( det[ , c("id", "mr", "cf_det_wgt")] ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )
      set$mr[ which(!is.finite(set$mr))] = 0

      # averages of these variables from det
      newvars = c( "residual", "mass", "len", "Ea", "A", "Pr.Reaction", "smr"  )
      for ( nv in newvars ) {
        set = merge( set,
          applyMean( det[ , c("id", nv, "cf_det_wgt")] ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )
      }
      set$Ea[ which(!is.finite(set$Ea))] = 0
      set$A[ which(!is.finite(set$A))] = 0
      set$Pr.Reaction[ which(!is.finite(set$Pr.Reaction))] = 0
      set$smr[ which(!is.finite(set$smr))] = 0


			# in the following:	quantiles are computed,
      set$uid = paste( set$data.source, set$gear )

      set$qn = NA  # default when no data
      oo = which( set$totno_adjusted == 0 )  # retain as zero values
      if (length(oo)>0 ) set$qn[oo] = 0
      for ( iii in unique(set$uid) ) {
        jjj = which( set$uid==iii & set$totno_adjusted > 0 )
        if (length( jjj ) > 3 ) {
          set$qn[jjj] = quantile_estimate( set$totno_adjusted[jjj] )  # convert to quantiles, by species, geartype and survey
        }
      }
     
			set$qm = NA   # default when no data
      oo = which( set$totwgt_adjusted == 0 )  # retain as zero values
      if (length(oo)>0 ) set$qm[oo] = 0
      for ( iii in unique(set$uid) ) {
        jjj = which( set$uid==iii & set$totwgt_adjusted > 0 )
        if (length( jjj ) > 3 ) {
          set$qm[jjj] = quantile_estimate( set$totwgt_adjusted[jjj] )  # convert to quantiles, by species, geartype and survey
        }
      }

      set$uid = NULL
      


     # convert from quantile to z-score
      set$zm = quantile_to_normal( set$qm )
      set$zn = quantile_to_normal( set$qn )

      # aggeragate measurement filter
      if (exists("selection", p)) {
        # last pass .. filter any set-level traits (mostly aggregate biological characgeristics, if any)
        if (exists("aggregate", p$selection)) {  # filter on survey characteristics
          if (exists("drop.unreliable.zeros.groundfish.data", p$selection$aggregate )) {
            if (p$selection$aggregate$drop.unreliable.zeros.groundfish.data) {
              # special flag .. unreliable zero's for snowcrab in the groundfish data
              todrop = which( set$data.source=="groundfish" & set$yr < 1999 & (set$totwgt ==0 | set$totno==0) )
              if (length(todrop)>0) set = set[ -todrop, ]
            }
          }
          isc = filter_data( set, p$selection$aggregate )
          if (length(isc) > 0) set = set[isc,]
          isc = NULL
        }
      }

      return (set)
    }


    # ---------------------

    if (DS == "cat.filter" ) {
      # selected for a given set of species  and size
      # wrapper around survey.db and groundfish.db to permit abundance data to be passed, including zero valued locations
      # selected for a given set of species  and size and sex and maturity
      # add zero valued locations too..

      # ft2m = 0.3048
      # m2km = 1/1000
      # nmi2mi = 1.1507794
      # mi2ft = 5280
      # k$sakm2 = (41 * ft2m * m2km ) * ( k$dist * nmi2mi * mi2ft * ft2m * m2km )  # surface area sampled in km^2
      # standardtow_sakm2 = (41 * ft2m * m2km ) * ( 1.75 * nmi2mi * mi2ft * ft2m * m2km )  # surface area sampled by a standard tow in km^2  1.75 nm

      # must not subset "set" until after all det/cat info has been resolved in order to reatin zerovalued sets
      set = cat = NULL # trip/set loc information

      set = survey.db( DS="set.base", p=p )
      u = data.frame( matrix( unlist(strsplit( set$id, ".", fixed=TRUE)), ncol=2, byrow=TRUE), stringsAsFactors=FALSE )
      set$mission = as.character( u[,1] )
      set$setno = as.numeric( u[,2] )


      if (add_groundfish_strata) {
        polygon_source = "pre2014"  # "pre2014" for older
        sppoly = maritimes_groundfish_strata( timeperiod=polygon_source, returntype="polygons" )
        set = maritimes_groundfish_strata_identify( Y=set, sppoly=sppoly, xyvars=c("lon", "lat"), planar_crs_km=p$internal.crs, plotdata=TRUE )
        # sppoly = spTransform(sppoly, sp::CRS(p$internal.crs_planar) )
        # sppoly$sa_strata_km2 = gArea(sppoly, byid=TRUE) / 1000/1000 # (m^2 -> km^2)
        # sppoly = spTransform(sppoly, sp::CRS(p$internal.crs) )
      }

      # filter non-biologicals ... i.e, set characteristics
      if (exists("selection", p)) {
        if (exists("survey", p$selection)) {  # filter survey information

          if (exists("StrataID", set) ) {
            if (exists("polygon_enforce", p$selection$survey) ) {
              set = set[ which(!is.na(set$StrataID)), ] # remove unsetegorized sets
            }
            if (exists("strata_toremove", p$selection$survey) ) {
              todrop = which( set$StrataID %in% strata_definitions( p$selection$survey[["strata_toremove"]] ) )
              if (length(todrop) > 0) set = set[- todrop, ]
            }
            if (exists("strata_tokeep", p$selection$survey) ){
              set = set[which( set$StrataID %in% p$selection$survey[["strata_to_keep"]] ) , ]
            }
          }

          if (exists("months", p$selection$survey) ) set = set[ which(month(set$timestamp) %in% p$selection$survey[["months"]] ), ]
          isc = filter_data( set, p$selection$survey )
          if (length(isc) > 0) set = set[isc,]
          isc = NULL
        }
      }

      # indiviudal measurements filter
      # catvars= c("id", "totno", "totwgt", "cf_cat")
      cat = aegis::survey.db(DS="cat", p=p) #export from grounfish survey database .. weight (kg) and num per unit area (km^2)
      cat = cat[ which( cat$id %in% unique( set$id) ), ]
        if (exists("selection", p)) {
          if (exists("biologicals", p$selection)) {  # filter biologicals
            isc = filter_data( cat, p$selection$biologicals )
            if (length(isc) > 0) cat = cat[isc, ]
            isc = NULL
          }
        }

      cat$data.source = NULL
      set = merge( set, cat, by="id", all.x=TRUE, all.y=FALSE)
      set$totno_adjusted = set$totno * set$cf_cat
      set$totwgt_adjusted = set$totwgt * set$cf_cat

      set$totno_adjusted[ which(!is.finite(set$totno_adjusted))] = 0
      set$totwgt_adjusted[ which(!is.finite(set$totwgt_adjusted))] = 0
      set$totno[ which(!is.finite(set$totno))] = 0
      set$totwgt[ which(!is.finite(set$totwgt))] = 0

      # NOTE:: these should be == or ~= 1/set$sa ( done this way in case there has been other adjustmensts such as subampling, etc ..) .. these become offets required to express totwgt_adjusted ot totno_adjusted per unit km^2 in Poisson models

			# in the following:	quantiles are computed,
      set$uid = paste( set$data.source, set$gear )

      set$qn = NA  # default when no data
      oo = which( set$totno_adjusted == 0 )  # retain as zero values
      if (length(oo)>0 ) set$qn[oo] = 0
      for ( iii in unique(set$uid) ) {
        jjj = which( set$uid==iii & set$totno_adjusted > 0 )
        if (length( jjj ) > 3 ) {
          set$qn[jjj] = quantile_estimate( set$totno_adjusted[jjj] )  # convert to quantiles, by species, geartype and survey
        }
      }
     
			set$qm = NA   # default when no data
      oo = which( set$totwgt_adjusted == 0 )  # retain as zero values
      if (length(oo)>0 ) set$qm[oo] = 0
      for ( iii in unique(set$uid) ) {
        jjj = which( set$uid==iii & set$totwgt_adjusted > 0 )
        if (length( jjj ) > 3 ) {
          set$qm[jjj] = quantile_estimate( set$totwgt_adjusted[jjj] )  # convert to quantiles, by species, geartype and survey
        }
      }

      set$uid = NULL
      

     # convert from quantile to z-score
      set$zm = quantile_to_normal( set$qm )
      set$zn = quantile_to_normal( set$qn )


      # aggeragate measurement filter
      if (exists("selection", p)) {
        # last pass .. filter any set-level traits (mostly aggregate biological characgeristics, if any)
        if (exists("aggregate", p$selection)) {  # filter on survey characteristics
          if (exists("drop.unreliable.zeros.groundfish.data", p$selection$aggregate )) {
            if (p$selection$aggregate$drop.unreliable.zeros.groundfish.data) {
              # special flag .. unreliable zero's for snowcrab in the groundfish data
              todrop = which( set$data.source=="groundfish" & set$yr < 1999 & (set$totwgt ==0 | set$totno==0) )
              if (length(todrop)>0) set = set[ -todrop, ]
            }
          }
          isc = filter_data( set, p$selection$aggregate )
          if (length(isc) > 0) set = set[isc,]
          isc = NULL
        }
      }

      return (set)
    }

# ----------------------------------

    if (DS == "filter" ) {
      # selected for a given set of species  and size and sex and maturity
      # wrapper around survey.db and groundfish.db to permit abundance data to be passed, including zero valued locations
      # selected for a given set of species  and size and sex and maturity
      # add zero valued locations too..

      # ft2m = 0.3048
      # m2km = 1/1000
      # nmi2mi = 1.1507794
      # mi2ft = 5280
      # k$sakm2 = (41 * ft2m * m2km ) * ( k$dist * nmi2mi * mi2ft * ft2m * m2km )  # surface area sampled in km^2
      # standardtow_sakm2 = (41 * ft2m * m2km ) * ( 1.75 * nmi2mi * mi2ft * ft2m * m2km )  # surface area sampled by a standard tow in km^2  1.75 nm

      # must not subset "set" until after all det/cat info has been resolved in order to reatin zerovalued sets

      set = cat = det = NULL # trip/set loc information

      set = survey.db( DS="set.base", p=p )
      u = data.frame( matrix( unlist(strsplit( set$id, ".", fixed=TRUE)), ncol=2, byrow=TRUE), stringsAsFactors=FALSE )
      set$mission = as.character( u[,1] )
      set$setno = as.numeric( u[,2] )

      if (add_groundfish_strata) {
        polygon_source = "pre2014"  # "pre2014" for older
        sppoly = maritimes_groundfish_strata( timeperiod=polygon_source, returntype="polygons" )
        # sppoly = spTransform(sppoly, sp::CRS(p$internal.crs_planar) )
        # sppoly$sa_strata_km2 = gArea(sppoly, byid=TRUE) / 1000/1000 # (m^2 -> km^2)
        # sppoly = spTransform(sppoly, sp::CRS(p$internal.crs) )
        set = maritimes_groundfish_strata_identify( Y=set, sppoly=sppoly, xyvars=c("lon", "lat"), planar_crs_km=p$internal.crs, plotdata=TRUE )
      }

      # filter non-biologicals ... i.e, set characteristics
      if (exists("selection", p)) {
        if (exists("survey", p$selection)) {  # filter survey information
          if (exists("StrataID", set) ) {
            if (exists("polygon_enforce", p$selection$survey) ) {
              set = set[ which(!is.na(set$StrataID)), ] # remove unsetegorized sets
            }
            if (exists("strata_toremove", p$selection$survey) ) {
              todrop = which( set$StrataID %in% strata_definitions( p$selection$survey[["strata_toremove"]] ) )
              if (length(todrop) > 0) set = set[- todrop, ]
            }
            if (exists("strata_tokeep", p$selection$survey) ){
              set = set[which( set$StrataID %in% p$selection$survey[["strata_to_keep"]] ) , ]
            }
          }
          if (exists("months", p$selection$survey) ) set = set[ which(month(set$timestamp) %in% p$selection$survey[["months"]] ), ]
          isc = filter_data( set, p$selection$survey )
          if (length(isc) > 0) set = set[isc,]
          isc = NULL
        }
      }


      data_source_base = "det"

      # indiviudal measurements filter
      if (exists("selection", p)) {
        if (exists("biologicals", p$selection)) {  # filter biologicals
          if ( length( p$selection$biologicals) == 1 ) {
            if ( exists( "spec_bio", p$selection$biologicals ) | exists( "spec", p$selection$biologicals ) ) {
              data_source_base = "cat"  #only if a single filter based upon spec or spec_bio
            }
          }
        }
      }

      if (data_source_base=="cat") {
        # merge from cat tables as there is no subselection by biologicals
        cat = survey.db( DS="cat", p=p  ) # size information, no, cm, weight (kg) and num per unit area (km^2)
        cat = cat[ which( cat$id %in% unique( set$id) ), ]
        isc = filter_data( cat, p$selection$biologicals )
        if (length(isc) > 0) cat = cat[isc,]
        isc = NULL
        # catvars= c("id", "totno", "totwgt", "cf_cat")
        cat$data.source = NULL
        set = merge( set, cat, by="id", all.x=TRUE, all.y=FALSE)
        set$totno_adjusted = set$totno * set$cf_cat
        set$totwgt_adjusted = set$totwgt * set$cf_cat
      }

      if (data_source_base=="det") {
        # weight and number summaries from det (overwrite those from cat as there is a subsampling)
        # --- NOTE det was not always determined and so totals from det mass != totals from cat nor set for all years
        # cf_det is the weight to make it sum up to the correct total catch (vs any subsamples) and tow length, etc
        det = survey.db( DS="det", p=p  ) # size information, no, cm, kg
        det = det[ which( det$id %in% unique( set$id) ), ]
        if (exists("selection", p)) {
          if (exists("biologicals", p$selection)) {  # filter biologicals
            isc = filter_data( det, p$selection$biologicals )
            if (length(isc) > 0) det = det[isc,]
            isc = NULL
          }
        }
        det$no = 1

        oo = applySum( det[ , c("id", "no")],   newnames=c("id", "totno") )
        set = merge( set, oo, by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

        oo = applySum( det[ , c("id", "mass")], newnames=c("id", "totwgt") )
        set = merge( set, oo, by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

        oo = applySum( det[ , c("id", "no", "cf_det_no")], newnames=c("id", "totno_adjusted") )
        set = merge( set, oo, by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

        oo = applySum( det[ , c("id", "mass", "cf_det_wgt")], newnames=c("id", "totwgt_adjusted") )
        set = merge( set, oo, by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )

        # --- NOTE det was not always determined and so totals from det mass != totals from cat nor set for all years
        set = merge( set, applySum( det[ , c("id", "mr", "cf_det_wgt")] ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )
        set$mr[ which(!is.finite(set$mr))] = 0
        # averages of these variables from det
        newvars = c( "residual", "mass", "len", "Ea", "A", "Pr.Reaction", "smr"  )
        for ( nv in newvars ) {
          set = merge( set,
            applyMean( det[ , c("id", nv, "cf_det_wgt")] ), by="id", all.x=TRUE, all.y=FALSE, sort=FALSE )
        }
        set$Ea[ which(!is.finite(set$Ea))] = 0
        set$A[ which(!is.finite(set$A))] = 0
        set$Pr.Reaction[ which(!is.finite(set$Pr.Reaction))] = 0
        set$smr[ which(!is.finite(set$smr))] = 0
      }

      set$totno_adjusted[ which(!is.finite(set$totno_adjusted))] = 0
      set$totwgt_adjusted[ which(!is.finite(set$totwgt_adjusted))] = 0
      set$totno[ which(!is.finite(set$totno))] = 0
      set$totwgt[ which(!is.finite(set$totwgt))] = 0

      # NOTE:: these should be == or ~= 1/set$sa ( done this way in case there has been other adjustmensts such as subampling, etc ..) .. these become offets required to express totwgt or totno at a common areal density per unit km^2 in Poisson models
      set$cf_set_mass = set$totwgt_adjusted / set$totwgt
      set$cf_set_no = set$totno_adjusted / set$totno

      ii = which(!is.finite(set$cf_set_mass ))
      if (length(ii) > 0) set$cf_set_mass[ii] = set$cf_tow[ii]

      ii = which(!is.finite(set$cf_set_no ))
      if (length(ii) > 0) set$cf_set_no[ii] = set$cf_tow[ii]

    
			# in the following:	quantiles are computed,
      set$uid = paste( set$data.source, set$gear )

      set$qn = NA  # default when no data
      oo = which( set$totno_adjusted == 0 )  # retain as zero values
      if (length(oo)>0 ) set$qn[oo] = 0
      for ( iii in unique(set$uid) ) {
        jjj = which( set$uid==iii & set$totno_adjusted > 0 )
        if (length( jjj ) > 3 ) {
          set$qn[jjj] = quantile_estimate( set$totno_adjusted[jjj] )  # convert to quantiles, by species, geartype and survey
        }
      }
     
			set$qm = NA   # default when no data
      oo = which( set$totwgt_adjusted == 0 )  # retain as zero values
      if (length(oo)>0 ) set$qm[oo] = 0
      for ( iii in unique(set$uid) ) {
        jjj = which( set$uid==iii & set$totwgt_adjusted > 0 )
        if (length( jjj ) > 3 ) {
          set$qm[jjj] = quantile_estimate( set$totwgt_adjusted[jjj] )  # convert to quantiles, by species, geartype and survey
        }
      }

      set$uid = NULL
      

     # convert from quantile to z-score
      set$zm = quantile_to_normal( set$qm )
      set$zn = quantile_to_normal( set$qn )

      # aggeragate measurement filter
      if (exists("selection", p)) {
        # last pass .. filter any set-level traits (mostly aggregate biological characgeristics, if any)
        if (exists("aggregate", p$selection)) {  # filter on survey characteristics
          if (exists("drop.unreliable.zeros.groundfish.data", p$selection$aggregate )) {
            if (p$selection$aggregate$drop.unreliable.zeros.groundfish.data) {
              # special flag .. unreliable zero's for snowcrab in the groundfish data
              todrop = which( set$data.source=="groundfish" & set$yr < 1999 & (set$totwgt ==0 | set$totno==0) )
              if (length(todrop)>0) set = set[ -todrop, ]
            }
          }
          isc = filter_data( set, p$selection$aggregate )
          if (length(isc) > 0) set = set[isc,]
          isc = NULL
        }
      }

      return (set)
    }

  }