R/runSurvReg.R
In EGRET: Exploration and Graphics for RivEr Trends

Documented in jitterSam runSurvReg run_WRTDS

#' Run the weighted survival regression for a set of estimation points (defined by DecYear and Log(Q))
#'
#' This function runs the survival regression which is the concentration estimation method of WRTDS. 
#' It uses sample data from the data frame Sample. 
#' It does the estimation for a set of data points defined by two vectors: estPtYear and estPtLQ. 
#' It returns an array of results for the estimation points.  
#' The array returned contains yHat, SE and ConcHat (in that order). yHat is the expected value of log(concentration), SE is 
#' the standard error of log(concentration) and ConcHat is the expected value of concentration.
#'
#' @param Sample dataframe created for EGRET analysis
#' @param estPtYear numeric vector of Decimal Year values at the estimation points
#' @param estPtLQ numeric vector of ln(Q) values at the estimation points, must be the same length as estPtYear 
#' @param windowY numeric specifying the half-window width in the time dimension, in units of years, default is 7
#' @param windowQ numeric specifying the half-window width in the discharge dimension, units are natural log units, default is 2
#' @param windowS numeric specifying the half-window with in the seasonal dimension, in units of years, default is 0.5
#' @param minNumObs numeric specifying the miniumum number of observations required to run the weighted regression, default is 100
#' @param minNumUncen numeric specifying the minimum number of uncensored observations to run the weighted regression, default is 50
#' @param verbose logical specifying whether or not to display progress message
#' @param interactive logical deprecated. Use 'verbose' instead
#' @param edgeAdjust logical specifying whether to use the modified method for calculating the windows at the edge of the record.  The modified method tends to reduce curvature near the start and end of record.  Default is TRUE.
#' @param DecLow number specifying minimum decimal year (left edge of the estimated surfaces).
#' @param DecHigh number specifying maximum decimal year (right edge of the estimated surfaces).
#' @param run.parallel logical to run bootstrapping in parallel or not
#' @keywords water-quality statistics
#' @return resultSurvReg numeric array containing the yHat, SE, and ConcHat values array dimensions are (numEstPts,3)
#' @export
#' @rdname runSurvReg
#' @examples
#' eList <- Choptank_eList
#' estPtYear<-c(2001.0,2005.0,2009.0)
#' estPtLQ<-c(1,1,1)
#' Sample <- getSample(eList)
#' DecLow <- Sample$DecYear[1]
#' DecHigh <- Sample$DecYear[nrow(Sample)]
#' resultSurvReg <- runSurvReg(estPtYear,estPtLQ,
#'                             DecLow,DecHigh,Sample,
#'                             run.parallel = FALSE)
runSurvReg <- function(estPtYear, estPtLQ, DecLow, DecHigh, Sample, 
                     windowY=7, windowQ=2, windowS=0.5,
                     minNumObs=100, minNumUncen=50, verbose = TRUE,interactive=NULL,
                     edgeAdjust=TRUE, run.parallel = FALSE) {
  
  if(!is.null(interactive)) {
    warning("The argument 'interactive' is deprecated. Please use 'verbose' instead")
    verbose <- interactive
  }
  
  localSample <- Sample
  if(any(is.na(localSample$LogQ))){
    if (verbose) message("Removing Sample data that does not have corresponding flow data")
    localSample <- localSample[!is.na(localSample$LogQ),]
  }
  numSamples <- length(localSample$DecYear)
  
  numEstPt<-length(estPtYear)
  
  printUpdate <- floor(seq(1,numEstPt,numEstPt/100))
  endOfLine <- seq(10,100,10)
  
  if (minNumUncen >= sum(localSample$Uncen)) stop('minNumUncen is greater than total number of samples')
  if (minNumObs >= nrow(localSample)) stop('minNumObs is greater than total number of samples')
  
  warningFlag <- 0
  n <- NULL
  if(run.parallel){
    `%dopar%` <- foreach::`%dopar%`
    wrtds_return_list <- foreach::foreach(n = 1:numEstPt, .packages=c('EGRET')) %dopar% {
                      wrtds_returns <- run_WRTDS(estY = estPtYear[n], estLQ = estPtLQ[n],
                                                 localSample =localSample,DecLow = DecLow,DecHigh = DecHigh,
                                                 minNumObs = minNumObs,minNumUncen = minNumUncen,
                                                 windowY = windowY, windowQ = windowQ, windowS = windowS, 
                                                 edgeAdjust = edgeAdjust)
                    }
    
    warningFlag <- sum(sapply(wrtds_return_list, function(x) x[["warningFlag"]]))
    resultSurvReg <- t(sapply(wrtds_return_list, function(x) x[["survReg"]]))
    
  } else {
    resultSurvReg<-array(0,c(numEstPt,3))
    if (verbose) cat("Survival regression (% complete):\n")
    
    for (i in 1:numEstPt) {
      wrtds_return <- run_WRTDS(estY = estPtYear[i], estLQ = estPtLQ[i],
                                localSample =localSample,DecLow = DecLow,DecHigh = DecHigh,
                                minNumObs = minNumObs,minNumUncen = minNumUncen,
                                windowY = windowY, windowQ = windowQ, windowS = windowS, 
                                edgeAdjust = edgeAdjust)
  
      if (i %in% printUpdate & verbose) {
        cat(floor(i*100/numEstPt),"\t")
        if (floor(i*100/numEstPt) %in% endOfLine) cat("\n")
      }
      warningFlag <- warningFlag + wrtds_return$warningFlag
      resultSurvReg[i,] <- wrtds_return$survReg
    }
  }
  
  if (warningFlag > 0){
    message("\nIn model estimation, the survival regression function was run ", numEstPt, " times (for different combinations of discharge and time).  In ", warningFlag, " of these runs it did not properly converge. This does not mean that the model is unacceptable, but it is a suggestion that there may be something odd about the data set. You may want to check for outliers, repeated values on a single date, or something else unusual about the data.")
  }

  if (verbose) cat("\nSurvival regression: Done")

  return(resultSurvReg)
}

#' @export
#' @rdname runSurvReg
#' @param localSample "Sample" data frame from the eList. 
#' @param estY numeric decimal year values at the estimation point
#' @param estLQ numeric ln(Q) values at the estimation point 
run_WRTDS <- function(estY, estLQ,
                      localSample,DecLow,DecHigh,
                      minNumObs,minNumUncen,
                      windowY, windowQ, windowS,
                      edgeAdjust){
  # This loop takes us through all the estimation points
  # We always reset the window widths to their starting values, because they may
  #   have been widened in the process
  tempWindowY<-windowY
  tempWindowQ<-windowQ
  tempWindowS<-windowS
  
  distLow <- estY-DecLow
  distHigh <- DecHigh-estY
  
  survReg <- c(NA, NA, NA)
  warningFlag <- 0
  
  if(all(is.na(c(distLow,distHigh)))){
    return(list(survReg=survReg, warningFlag=warningFlag))
  }
  
  distTime <- min(distLow,distHigh)
  
  if (edgeAdjust & !is.na(distTime)) {
    tempWindowY <- if(distTime>tempWindowY) tempWindowY else ((2 * tempWindowY) - distTime)
  } 

  k <- 1
  
  repeat{
    #  We subset the sample frame by time, to narrow the set of data to run through in the following steps
    
    Sam <- localSample[abs(localSample$DecYear-estY) <= tempWindowY,]
    diffY<-abs(Sam$DecYear-estY)
    weightY<-triCube(diffY,tempWindowY)
    weightQ<-triCube(Sam$LogQ-estLQ,tempWindowQ)
    diffUpper<-ceiling(diffY)
    diffLower<-floor(diffY)
    diffSeason<-pmin(abs(diffUpper-diffY),abs(diffY-diffLower))
    weightS<-triCube(diffSeason,tempWindowS)
    Sam$weight<-weightY*weightQ*weightS
    Sam<-subset(Sam,weight>0)
    numPosWt<-length(Sam$weight)
    numUncen<-sum(Sam$Uncen)
    tempWindowY<-tempWindowY*1.1
    tempWindowQ<-tempWindowQ*1.1
    k <- k + 1
    if(k > 10000) message("Problems converging")
    # the next line is designed so that if the user sets windowS so it includes
    # data from all seasons, the widening process leaves it alone    	
    tempWindowS<-if(windowS<=0.5) min(tempWindowS*1.1,0.5) else windowS
    if(numPosWt>=minNumObs&numUncen>=minNumUncen | k > 10000) break
  }
  
  # now we are ready to run Survival Regression
  weight<-Sam$weight
  aveWeight<-sum(weight)/numPosWt
  weight<-weight/aveWeight
  Sam <- data.frame(Sam)
  
  x <- tryCatch({
    survModel <- survival::survreg(survival::Surv(log(ConcLow),log(ConcHigh),type="interval2") ~ 
                                     DecYear+LogQ+SinDY+CosDY,data=Sam,weights=weight,dist="gaus")
  }, warning=function(w) {
    
    if(w$message == "Ran out of iterations and did not converge"){
      Sam2 <- jitterSam(Sam)
    } else {
      Sam2 <- Sam
    }
    
    survModel <- survival::survreg(survival::Surv(log(ConcLow),log(ConcHigh),type="interval2") ~ 
                                     DecYear+LogQ+SinDY+CosDY,data=Sam2,weights=weight,dist="gaus")
    
    return(survModel)
  }, error=function(e) {
    message(e, "Error")
    return(NULL)
  })
  
  if(inherits(x, "survreg")) {
    newdf<-data.frame(DecYear=estY,LogQ=estLQ,SinDY=sin(2*pi*estY),CosDY=cos(2*pi*estY))
    #   extract results at estimation point
    yHat<- stats::predict(x,newdf)
    SE<-x$scale
    bias<-exp((SE^2)/2)
    survReg[1]<-yHat
    survReg[2]<-SE
    survReg[3]<-bias*exp(yHat)
  } 
  
  if(all(is.na(x))){
    warningFlag <- 1
  }
  return(list(survReg=survReg, warningFlag=warningFlag))
}

#' jitter Sample
#' 
#' This function is used in cases where there are numerical problems with 
#' the estimation of the WRTDS model.  This mostly happens during bootstrap 
#' estimation or when the data sets are very large.  In order to reduce the 
#' collinearity in the explanatory variables, some random noise is added to 
#' the time and log discharge variables in the Sample data frame. 
#' 
#' @export
#' @return SamR a data frame structured like the Sam data frame but with
#'  the time and discharge variables modified by adding random jitter
#' @param Sam data frame with at least columns DecYear and LogQ
#' @param V a multiplier for the sd of the LogQ jitter. for example V = 0.02,
#'  means that the sd of the LnQ jitter is 0.02*sdLQ
#' @examples 
#' eList <- Choptank_eList
#' Sample_jitter <- jitterSam(eList$Sample)
jitterSam <- function(Sam, V = 0.2) {
  SamR <- Sam
  n <- length(Sam$DecYear)
  SamR$DecYear <- Sam$DecYear + stats::rnorm(n,0,0.05)
  SamR$SinDY <- sin(SamR$DecYear * 2 * pi)
  SamR$CosDY <- cos(SamR$DecYear * 2 * pi)
  sdLQ <- stats::sd(Sam$LogQ)
  s <- sdLQ * V
  SamR$LogQ <- Sam$LogQ + stats::rnorm(n,0,s)
  return(SamR)
}