R/gamDiff.R

# ####
#' Compute an estimate of difference based on GAM results
#'
#' @param gamRslt output from GAM model (e.g., the gamRslt element returned by
#'   gamTest)
#' @param iSpec data set specifications (see details for required content)
#' @param analySpec analytical specifications
#' @param base.yr.set vector of years used for baseline period
#' @param test.yr.set vector of years used for test period
#' @param doy.set vector of days used to establish sub-annual analyses 
#' (see details)
#' @param alpha alpha level for computing confidence intervals
#' @param flow.detrended data generated by detrended.flow. Default = NA.
#' @param salinity.detrended data generated by detrended.salinity. Default = NA.
#'
#' @details
#'   iSpec is a list containing information about the date range and 
#'   transformations. Specifically, iSpec must include iSpec$yearBegin, 
#'   iSpec$yearEnd, iSpec$centerYear corresponding to beginning year of record, 
#'   ending year of record and centering year. Also, iSpec must include 
#'   iSpec$transform and iSpec$logConst. (See online help for selectData for 
#'   more information on these values.)
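#'   For orientation, a minimal sketch of these fields (the values shown here
#'   are hypothetical) might look like: iSpec$yearBegin = 1999, iSpec$yearEnd =
#'   2014, iSpec$centerYear = 2006.5, iSpec$transform = TRUE, and
#'   iSpec$logConst = 0.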
#'
#'   base.yr.set and test.yr.set represent two time periods used to compare 
#'   differences. For example, base.yr.set=c(1999,2000) and test.yr.set=c(2013,
#'   2014) would compare GAM predictions from 1999-2000 versus 2013-2014. There 
#'   is no particular limit to the number of years included in the specification
#'   for base.yr.set and test.yr.set. For example, a user could specify 
#'   c(2001:2002,2004) to use the years 2001, 2002, and 2004, skipping 2003 
#'   because 2003 was an abnormal year (particularly wet, particularly dry,
#'   hurricanes, etc.).
#'
#'   base.yr.set and test.yr.set must be within the years specified by
#'   the range from iSpec$yearBegin to iSpec$yearEnd (inclusive). If not, this
#'   function defaults to using the first (or last) analySpec$gamDiffNumChgYrs
#'   years of record. Likewise, if base.yr.set and test.yr.set are left at
#'   their default values of NA, the first and last analySpec$gamDiffNumChgYrs
#'   years of record are used.
#'
#'   doy.set represents the days of year for which GAM predictions are made and 
#'   used to compute base.yr and test.yr means. For example doy.set= c(15, 46, 
#'   75, 106, 136, 167, 197, 228, 259, 289, 320, 350) would result in the 15th 
#'   of each month being used in the analysis; whereas doy.set= c(15, 46, 75) 
#'   would just use Jan-15, Feb-15, and Mar-15. (Keep in mind that this package 
#'   uses a 366 day calendar every year such that Mar-1 is always day 61, 
#'   regardless of leap year.) If doy.set is left to the default value of NA, 
#'   then c(15, 46, 75, 106, 136, 167, 197, 228, 259, 289, 320, 350) is used.
#'   
#'   The baseDay function has been added to this package from the smwrBase 
#'   package.
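#'   For example, baseDay(as.POSIXct("2007-03-01")) and
#'   baseDay(as.POSIXct("2008-03-01")) both return day 61, consistent with the
#'   fixed 366-day calendar described above.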
#'
#' @examples
#' # run analysisOrganizeData function to create the list analySpec
#' dfr <- analysisOrganizeData (dataCensored, report=NA)
#' df        <- dfr[["df"]]
#' analySpec <- dfr[["analySpec"]]
#'
#' # set GAM models to just one model
#' analySpec$gamModels <- list(
#'   list(option=2
#'        , name= "Non-linear trend with Seasonality (plus Interactions)"
#'        , model= "~ cyear + s(cyear) + s(doy,bs='cc') + 
#'        ti(cyear,doy,bs=c('tp','cc'))"
#'        , deriv=FALSE))
#'
#' # run GAM for a single water quality variable, station and layer
#' gamResult <- gamTest(df, 'tn', 'CB5.4', 'S', analySpec=analySpec)
#'
#' # use gamDiff to replicate estimates of change calculated in the above
#' gamDiff(gamRslt=gamResult[["gamOutput2"]]$gamRslt,
#'         iSpec=gamResult$iSpec, analySpec=analySpec,
#'         base.yr.set = NA, test.yr.set = NA,
#'         doy.set = NA, alpha = 0.05)
#'
#' # use gamDiff to calculate changes from 2004/05 to 2013/14
#' gamDiff(gamRslt=gamResult[["gamOutput2"]]$gamRslt,
#'         iSpec=gamResult$iSpec, analySpec=analySpec,
#'         base.yr.set = c(2004:2005), test.yr.set = c(2013:2014),
#'         doy.set = NA, alpha = 0.05)
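#'
#' # the returned (nested) list can be inspected directly; for example, the
#' # percent change and p-value from the regular computation are stored in
#' # the "gamDiff.regular" element as pct.chg and diff.pval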
#'
#' @return Returns a nested list that includes the base and test years, doys,
#'   period means in analyzed units, period means in observed units, percent
#'   change, difference estimate, difference estimate in observed units,
#'   standard error, confidence intervals, t statistic, p value, and alpha
#'   level. The alpha level corresponds to the confidence intervals. The first
#'   list (gamDiff.regular) uses the computed model to estimate differences and
#'   is applicable for GAM formulas that do not involve an intervention term.
#'   The second list (gamDiff.adjusted) performs computations by projecting the
#'   most recent intervention (e.g., the current lab method) to all time
#'   periods.
#' @export
#'
# ####
gamDiff <- function(gamRslt
                    , iSpec
                    , analySpec
                    , base.yr.set=NA
                    , test.yr.set=NA
                    , doy.set=NA
                    , alpha=0.05
                    , flow.detrended=NA
                    , salinity.detrended=NA) {
# -----< Change history >--------------------------------------------
# 24Mar2021: JBH: add parameter gamDiffNumChgYrs (set as 3 years)
# 11Jul2018: JBH: add additional independent variables to pdat excluding dependent
#                 variable and "gamK*"
# 30Sep2017: JBH: added salinity interpolation to merge up with pdat
# 11Aug2017: JBH: added analySpec to function input parameters; renamed
#                 pct.chg.dta to pdat;
#                 updated setting for flw_sal to be observed flow
# 29Jul2017: JBH: enhance for flw_sal
# 09Nov2016: JBH: update documentation
# 04Nov2016: JBH: update to accommodate models with intervention terms;
#                 specifically, update returned list to have gamDiff.regular
#                 (baseline mean computed as per the prediction model) and
#                 gamDiff.adjusted (baseline mean computed assuming the last
#                 intervention method is used throughout the record)
# 20Oct2016: JBH: add intervention term to prediction data set (pdat)
# 09Jun2016: JBH: migrated from .gamDiff to gamDiff; revised help file
# 04Jun2016: JBH: migrated name convention from gam1 to gamRslt; migrated percent
#                 change calculation from function gamDiffPORtbl to this function;
#                 other misc. code formatting
# 24May2016: ESP: modified gamDiffPOR to create gamDiffYrs, which allows the user
#                 to specify a base year period and a test year period for the
#                 difference computation; the user may also specify a set of days
#                 of year (doy.set) for computing this difference. If base and
#                 test years are not specified, the default is the first 2 and
#                 last 2 years of POR. If yr.set is not specified, the default is
#                 once a month on the 15th.
# 27Apr2016: JBH: deprecated porBegin, porEnd, porLength; switched to
#            dyear*, see por.rng assignment

  
# unpack ####
  
  # base.yr.set=NA; test.yr.set=NA; doy.set=NA; alpha=alpha
  # extract selected values from iSpec
  por.rng    <- c(iSpec$yearBegin, iSpec$yearEnd)
  centerYear <- iSpec$centerYear
  transform  <- iSpec$transform
  logConst   <- iSpec$logConst
  gamDiffNumChgYrs <- analySpec$gamDiffNumChgYrs

  # does model include intervention term 11Aug2017 [added from gamPlotCalc]
  intervention <- ifelse (length(grep('intervention',gamRslt$formula  )) == 0
                          , FALSE, TRUE)

  # does model include flw_sal term 11Aug2017 [added from gamPlotCalc]
  has.flw_sal <- ifelse (length(grep('flw_sal',gamRslt$formula  )) == 0
                         , FALSE, TRUE)
  if(has.flw_sal) {
    pdatWgt <- data.frame(normPct = analySpec$gamFlw_Sal.Wgt.Perc,
                          Z       = stats::qnorm(analySpec$gamFlw_Sal.Wgt.Perc),
            normD   = stats::dnorm(stats::qnorm(analySpec$gamFlw_Sal.Wgt.Perc)))
  } else {
    pdatWgt <- data.frame(normPct = 0.5,
                          Z       = stats::qnorm(0.5),
                          normD   = stats::dnorm(stats::qnorm(0.5)))
  }
  pdatWgt$flw.wgt <- pdatWgt$normD/sum(pdatWgt$normD)
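  # flw.wgt rescales the normal densities so the weights sum to 1; these
  # weights are later used so that predictions made at flw_sal = Z * flw_sal.sd
  # are combined as a weighted average over the detrended flow/salinity
  # distribution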

# matrix set up ####
  # diffType="regular"; diffType="adjusted"  #(04Nov2016)
  for(diffType in c("regular","adjusted")) {

# matrix: set up prediction table pdat for regular calculation ####
    if(diffType=="regular") {                           #(04Nov2016)

      # if doy.set is NA, then assume doy.set = 15th of each month
      if(is.na(doy.set[1])){
        doy.set <-  baseDay(as.POSIXct(paste(2000,1:12,15,sep='-')))
      }

      # set up base and test years to first "gamDiffNumChgYrs" and last 
      # "gamDiffNumChgYrs" years if base.yr.set
      # and/or test.yr.set not specified; otherwise concatenate the values
      if(is.na(base.yr.set[1])) {base.yr.set <-  c(por.rng[1]
                                                ,por.rng[1]+gamDiffNumChgYrs-1)}
      if(is.na(test.yr.set[1])) {test.yr.set <-  c(por.rng[2]-
                                                gamDiffNumChgYrs+1,por.rng[2])}
      Nbase.yr <- length(base.yr.set)       # count years in each period
      Ntest.yr <- length(test.yr.set)
      yr.set <- c(base.yr.set,test.yr.set)  # combine base years and test years

      Ndoy  <- length(doy.set) # count predictions per year
      Nbase <- Ndoy*Nbase.yr   # calculate total predictions in base period
      Ntest <- Ndoy*Ntest.yr   # calculate total predictions in test period

      # create a data frame with Nrow = Nbase + Ntest rows, i.e., Nbase.yr years
      # of Ndoy baseline dates and Ntest.yr years of Ndoy test dates. Include:
      # doy, year, a logical field (bl) indicating baseline vs. test, and the
      # centered decimal year (cyear) using the same centering value computed
      # from the data set (centerYear)
      pdat        <- expand.grid(doy.set,yr.set)        # all combinations of doy.set and yr.set
      names(pdat) <- c('doy','year')                    # rename variables
      pdat$bl     <- pdat$year <= base.yr.set[Nbase.yr] # logical field indicating baseline
      pdat$cyear  <- (pdat$year + (pdat$doy-1)/366) - centerYear # compute cyear

      # calculate date based on doy to set up adding intervention term 
      # (added 20Oct2016)
      pdat$month <- lubridate::month(as.Date(pdat$doy, origin = "1999-12-31"))
      pdat$day   <- lubridate::day  (as.Date(pdat$doy, origin = "1999-12-31"))
      pdat$date  <- as.POSIXct(paste(pdat$year,pdat$month,pdat$day,sep='-'))

      # add intervention term to prediction data set (added 20Oct2016)
      intervenList <- iSpec$intervenList
      intervenList$intervention <- as.character(intervenList$intervention)

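      # for each prediction date, look up the intervention (e.g., lab method)
      # whose beginDate/endDate window contains that date; endDate is shifted
      # by (24*3600)-1 seconds so dates falling on the end date are included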
      tmp <- lapply(1:nrow(pdat), function(x)
        list(intervenList[ intervenList$beginDate <= pdat$date[x] &
                             intervenList$endDate +(24*3600)-1 >= pdat$date[x]
                                                          , c("intervention")]))
      tmp[sapply(tmp, is.null)] <- NA
      pdat$intervention <- sapply(1:nrow(pdat)
                                  , function(x) unname(unlist(tmp[x])[1]))
      pdat$intervention <- factor(pdat$intervention
                                  , levels = intervenList$intervention)

      # add flw_sal and flw_sal.sd term for models with flw_sal  # 11Aug2017
      if(has.flw_sal) {
        # populate flw_sal
        pdat$flw_sal <- 0
        # populate flw_sal.sd
        if(iSpec$hydroTermSel=="flow") {
          # for "flow", iSpec$hydroTermSel.var will be d*, indicating the
          # averaging period associated with the best correlation (i.e.,
          # max(abs(cor))) between the dependent variable and the detrended log
          # flow, e.g., d120 refers to a 120-day averaging window
          tmp <- flow.detrended[[paste0("q"
                                        ,iSpec$usgsGageID
                                        ,".sum")]][["lowess.sd"]]
          tmp <- tmp[,c("doy",iSpec$hydroTermSel.var )]
          names(tmp)[names(tmp) == iSpec$hydroTermSel.var] <- 'flw_sal.sd'
        } else if (iSpec$hydroTermSel=="salinity") {
          # for "salinity", iSpec$hydroTermSel.var will be SAP or BBP,
          # indicating the average detrended salinity for "surface and above
          # pycnocline" [SAP] or "bottom and below pycnocline" [BBP]. Selection
          # is BBP for a dependent variable from B, BP, or BBP; and SAP
          # otherwise
          tmp <- salinity.detrended[[paste0(iSpec$stat,".sum")]][["lowess.sd"]]
          tmp <- tmp[,c("doy",iSpec$hydroTermSel.var )]
          names(tmp)[names(tmp) == iSpec$hydroTermSel.var] <- 'flw_sal.sd'
        } else {
          return('ERROR in lowess.sd pickup')
        }
        pdat <- merge( pdat,tmp, by="doy", all.x=TRUE)
        pdat <- pdat[with( pdat, order(date)), ]
      } else {
        pdat$flw_sal    <- NA_real_
        pdat$flw_sal.sd <- NA_real_
      }

      # add a row id number and sort columns # 11Aug2017
      pdat$rowID <- seq.int(nrow( pdat))
      tmp <- c("rowID","date","year","cyear","doy")
      pdat<-pdat[c(tmp, setdiff(names(pdat), tmp))]
      
      # identify additional independent variables #11Jul2018
      # excluding dependent variable and "gamK*" 
      indVar <- setdiff (all.vars(gamRslt$formula), c(iSpec$dep, names(pdat)))
      indVar <- indVar[-grep("^gamK", indVar)]
      pdat[,indVar] <- 0

# matrix: "adjust" prediction table pdat to use last intervention value for all calculations ####
      #(04Nov2016)
    } else if (diffType=="adjusted") {
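      # set every row's intervention to the most recent level so predictions
      # for all time periods are made as if the latest method applied throughout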
      pdat$intervention <- pdat$intervention[length(pdat$intervention)]
    }      
    
      # create pdatLong from pdat with a down-selected number of columns for
      # merging with pdatWgt; in the merge of pdatWgt and pdatLong, each row of
      # pdat is repeated for each record in pdatWgt before moving to the next
      # row of pdat; flw_sal is then based on the z stat and sd
      pdatLong <- pdat[,c("rowID", "cyear", "doy", "bl", "intervention"
                          , "flw_sal", "flw_sal.sd", indVar)]
      pdatLong <- merge( pdatWgt[,c("Z","flw.wgt")], pdat[,c("rowID", "cyear"
                                                  , "doy", "bl", "intervention"
                                          , "flw_sal", "flw_sal.sd", indVar  )])
      pdatLong$flw_sal <- pdatLong$Z * pdatLong$flw_sal.sd
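      # note: when the model has no flw_sal term, pdatWgt is a single row with
      # weight 1, flw_sal remains NA, and the unused columns are ignored by the
      # prediction step below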

      # JBH(24Nov2017): original construction of xa and avg.per.mat.
      # Construct a 2 x Nrow matrix (for averaging baseline and current
      # periods). Since pdat is constructed with Nrow rows (see above), the
      # weighting here is set to 1/Nbase for the base period and 1/Ntest for
      # the test period.
      # xa <- c(rep(1/Nbase,Nbase),rep(0,Ntest),rep(0,Nbase),rep(1/Ntest,Ntest)) 
      # avg.per.mat <- matrix(xa,nrow=2,ncol=Nbase+Ntest, byrow=TRUE)
      
      # JBH(24Nov2017): extension of the above xa and avg.per.mat, keeping the
      # weight the same -- just extending the number of values by "*nrow(pdatWgt)"
      xa <- c(rep(1/Nbase,Nbase*nrow(pdatWgt)),
              rep(0,Ntest*nrow(pdatWgt)),
              rep(0,Nbase*nrow(pdatWgt)),
              rep(1/Ntest,Ntest*nrow(pdatWgt))) # construct a matrix to average baseline and current periods
      avg.per.mat <- matrix(xa,nrow=2
                            ,ncol=(Nbase+Ntest)*nrow(pdatWgt), byrow=TRUE)
      
      # now apply flow weighting to the averaging matrix (this probably could
      # have been combined into the step above, but doing it here is fine:
      # each row of avg.per.mat sums to 1)
      avg.per.mat[1,] <- avg.per.mat[1,] * t(pdatLong$flw.wgt)
      avg.per.mat[2,] <- avg.per.mat[2,] * t(pdatLong$flw.wgt)

      # construct matrix to get difference of current minus baseline
      diff.mat <- c(-1,1)

      # Extract coefficients (number of terms depends on complexity of GAM formula)
      beta    <- gamRslt$coefficients        # coefficient vector
      VCmat   <- gamRslt$Vp                  # variance-covariance matrix of coefficients

# Begin calculations to compute differences ####
    # extract matrix of linear predicters
    Xpc     <- predict(gamRslt,newdata=pdatLong,type="lpmatrix")

    # Compute predictions based on linear predictors (Nrow x 1 matrix);
    # equivalent to "predict(gamRslt, newdata=pdatLong)"
    pdep    <- Xpc%*%beta

    # Calc. average baseline and average current; stored as a 2x1 matrix
    period.avg <- avg.per.mat %*% pdep

    # Calc. average current - average baseline; stored as a 1x1 matrix
    diff.avg  <- diff.mat %*% period.avg # pre-multiply by differencing matrix to check results

    # Calc standard errors and confidence intervals on difference predictions:
    # premultiply linear predictors by the averaging and differencing matrices
    xpd       <- diff.mat%*%avg.per.mat%*%Xpc
    diff.est  <- xpd%*%beta                   # compute estimate of difference
    diff.se   <- sqrt(xpd%*%VCmat%*%t(xpd))   # compute Std. Err. by usual rules
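    # (diff.est is the linear combination xpd %*% beta, so its variance follows
    # as xpd %*% VCmat %*% t(xpd), with VCmat the Bayesian posterior covariance
    # matrix Vp from the fitted gam object)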
    diff.t    <- diff.est / diff.se
    diff.pval <- 2*pt(abs(diff.t), gamRslt$df.null, 0, lower.tail = FALSE)

    # compute CI for difference
    halpha    <- alpha/2
    diff.ci   <- c(diff.est - qnorm(1-halpha) * diff.se,diff.est + 
                     qnorm(1-halpha) *diff.se)

    # back transform mean (03Nov)
    per.mn.obs <- if(transform) exp(period.avg) - logConst else period.avg

    # calculate percent change (03Nov)
    pct.chg <- 100*((per.mn.obs[2] - per.mn.obs[1])/per.mn.obs[1])

    # difference in obs units (03Nov)
    diff.est.obs <- per.mn.obs[2] - per.mn.obs[1]

# pack up and return results ####

    gamDiff.tmp <- list(base.yr    = base.yr.set,
                        test.yr    = test.yr.set,
                        doys       = doy.set,
                        per.mn     = as.vector(period.avg),
                        per.mn.obs = as.vector(per.mn.obs),
                        pct.chg    = pct.chg,
                        diff.est   = diff.est,
                        diff.est.obs = diff.est.obs,
                        diff.se    = diff.se,
                        diff.ci    = diff.ci,
                        diff.t     = diff.t,
                        diff.pval  = diff.pval,
                        alpha      = alpha)

    #(04Nov2016)
    if(diffType=="regular") {
      gamDiff.regular <- gamDiff.tmp
    } else if(diffType=="adjusted") {
      gamDiff.adjusted <- gamDiff.tmp
    }

  }

  gamDiff.return <- list(gamDiff.regular=gamDiff.regular
                         , gamDiff.adjusted=gamDiff.adjusted)
  return(gamDiff.return)

}