MESgenCov: Generates input for the MES problem

Documented in getCov

#'Creates covariance matrix from normalized NADP monitor data
#'gamma version with optional inputs
#'@import   MVN
#'@importFrom rmatio write.mat
#'@importFrom EnvStats rosnerTest
#'@importFrom utils data
#'@param df data frame of input arguments specified in vignette and default set of inputs are in data("defaultInput")

#'@return   List of model summaries at each site, covariance matrix, sites analyzed, predicted values by univariate models, data frame of residuals and more (see vignette for full list)
#'@export
#'@examples getCov(structure(list(startdateStr = "01/01/83 00:00", enddateStr   = "12/31/86 00:00", 
#'comp = "SO4", use36 = TRUE, siteAdd = NULL, outlierDatesbySite = NULL,
#'siteOutliers = NULL,  removeOutliers = NULL, plotMulti = FALSE,  sitePlot = NULL,
#'plotAll = FALSE, writeMat = FALSE, seas = 12, r = 1, k = 1),
#'.Names = c("startdateStr","enddateStr","comp","use36","siteAdd",
#'"outlierDatesbySite","siteOutliers","removeOutliers","plotMulti","sitePlot",
#'"plotAll","writeMat","seas","r","k")  ,row.names = c(NA, -1L) 
#',class = "data.frame"))


getCov <- function(df){
  start_time <- Sys.time()
  #get data if it's not in the working directory
  if(!exists("weeklyConc") || !exists("preDaily")){
    try({utils::data("weeklyConc",envir = environment()); utils::data("preDaily",envir = environment())})
  }
  #check, still doesn't exist?
  if(!exists("weeklyConc") || !exists("preDaily")){
    stop("Missing files, running code that downloads necessary files from the NADP site")
  }
  #check for multiple pollutants
  if(length(df$comp) > 1){
    stop("Package can only model data from one pollutant/observed variable")
    #later versions of package will be able to handle this
  }
  
  ##code for optional input functionality
  dfInp <- df
  weeklyB      <- FALSE#df$weeklyB
  startdateStr <- df$startdateStr
  enddateStr   <- df$enddateStr
  comp         <- df$comp
  use36        <- df$use36
  siteAdd      <- df$siteAdd
  outlierDatesbySite <- df$outlierDatesbySite[[1]]
  siteOutliers <- df$siteOutliers
  removeOutlier <-df$removeOutliers 
  plotMulti    <- df$plotMulti
  sitePlot     <- df$sitePlot
  plotAll      <- df$plotAll
  writeMat     <- df$writeMat
  seas         <- df$seas
  r            <- df$r
  k            <- df$k
  
  if (is.null(siteOutliers)){
    showOutliers = FALSE
  }else{showOutliers = TRUE}
  if (is.null(sitePlot)){
    plotB = FALSE
  }else{plotB = TRUE}
  
  ##error handling
  if (is.null(startdateStr) | is.null(enddateStr) ){
    stop("Required input is missing check start and end date")
  }
  
  if ((is.null(siteAdd) & !use36) ){
    stop("Required input is missing check added sites")
  }
  
  
  ##### store data
  
  conCSV <- MESgenCov::weeklyConc
  preCSVf <- stats::na.omit(MESgenCov::preDaily)
  conCSV$siteID  <- toupper(conCSV$siteID) #combine data
  preCSVf$siteID <- toupper(preCSVf$siteID)
  
  #filter out unnecessary columns
  conCSVf  <- conCSV[,-5:-14]
  conCSVf  <- stats::na.omit(conCSVf)
  
  startdate  <- as.POSIXct(startdateStr, format = "%m/%d/%y %H:%M")
  enddate    <- as.POSIXct(enddateStr  , format = "%m/%d/%y %H:%M")
  strtYrMo   <- format(startdate,"%Y%m")
  endYrMo    <- format(enddate,  "%Y%m")
  diffYrm    <- mondf(startdate,enddate)+1
  nonneg <- 0
  if(weeklyB){
    totT <- as.integer(difftime(enddate,startdate,units="weeks"))-1
    nonneg <- 1
    if(is.null(seas)){seas = 52}
  }else if(!(floor(mondf(startdate,enddate))==(mondf(startdate,enddate)))){
    stop("Number of months is not integer")
  }else if(diffYrm <= 1){
    stop("Number of months is less than or equal to one")
  }else{
    totT <- (mondf(startdate,enddate))+1
    if(is.null(seas)){seas = 12}
  }
  
  #add default sites
  if(use36){
    cat36 <- c("AL10","IL11","IL18","IL19","IL35","IL47","IL63","IN34",
               "IN41","MA01","MA13","MD13","MI09","MI26","MI53",
               "NC03","NC34","NC41","NJ99","NY08","NY10","NY20",
               "NY52","NY65","OH17","OH49","OH71","PA15","PA29",
               "PA42","TN00","TN11","VA13","VT01","WI28","WV18")#from Guttorp, Le 1992
  }else{cat36 <- NULL}
  cati   <- union(cat36,siteAdd[[1]])
  obs    <- comp
  
  #add back desired columns based on input in obs e.g add back SO4, pH, NO3 etc.
  for (i in 1:length(obs)){
    obsiCSV           <- match(obs[i],colnames(conCSV))
    cn                <- colnames(conCSVf)
    conCSVf[,4+i]     <- conCSV[,obsiCSV] #
    colnames(conCSVf) <- c(cn, obs[i])
  }
  rm(conCSV)
  
  #filter by date
  conCSVf   <- conCSVf[conCSVf$yrmonth>=strtYrMo,]
  conCSVf   <- conCSVf[conCSVf$yrmonth<=endYrMo,]
  
  #filter dates for precipitation data
  d1 <- startdate
  d2 <- enddate
  preCSVf <- preCSVf[preCSVf$starttime >=d1,]
  preCSVf <- preCSVf[preCSVf$endtime   <=d2,]
  
  preCSVf$amount    <- as.numeric(preCSVf$amount) #change data type of column
  preCSVf <- preCSVf[preCSVf$amount>-0.0001,]     #filter out -/ive values

  
  #initialize
  
  rv    <- integer(5); if(r != 0){rv[1:r] <- rep(1L,r)}
  kv    <- integer(5); if(k != 0){kv[1:k] <- rep(1L,k)}
  kk    <- k
  
  tNA     <- vector(mode="list", length=(length(cati)*length(obs))) #time stamp before na.omit
  tafNA   <- vector(mode="list", length=(length(cati)*length(obs))) #time stamp
  y0      <- vector(mode="list", length=(length(cati)*length(obs))) #monthly/weekly concentraition values
  ylogNA  <- vector(mode="list", length=(length(cati)*length(obs))) #log data w/ NA values
  ylog    <- vector(mode="list", length=(length(cati)*length(obs))) #log data
  e    <- vector(mode="list", length=(length(cati)*length(obs))) #error
  e2   <- vector(mode="list", length=(length(cati)*length(obs))) #error with NA
  mods <- vector(mode="list", length=(length(cati)*length(obs))) #model summaries
  vpredl <- vector(mode="list", length=(length(cati)*length(obs))) #list of vector of predictions
  
  si   <- 1
  obsi <- 1
  siObs<- length(obs)
  oc   <- 1

  if(plotAll){
    graphics::par(mfrow = c(2,2),
        oma = c(0,0,0,0) + 0.1,
        mar = c(0,0,0,0) + 0.1)
  }

  for(s in 1:(length(cati)*length(obs))){
    #change obs
    if (s%%(length(cati)) == 1 && s != 1){obsi = obsi + 1; si = 1}
    
    #filter site (weekly concentration data)
    conCSVk <- conCSVf[conCSVf$siteID == toString(cati[si]),]
    preCSVk <- preCSVf[preCSVf$siteID == toString(cati[si]),]

    if(nrow(conCSVk)==0 || nrow(preCSVk)==0){
      #store parameters
      tNA[[s]]     <- NA
      to[[s]]     <- NA
      e[[s]]      <- NA
      e2[[s]]     <- NA
      ylog[[s]]   <- NA
      y0[[s]]     <- NA
      ylogNA[[s]] <- NA
      mods[[s]]   <- NA
      vpredl[[s]] <- NA
      message(paste0("Missing data for ", cati[si]," ",obs[obsi],
                     " check if site has data for inputted dates in data file weeklyConc, preDaily"))
    }else{
      #filter out missing data, -/ive values, and order dataframes by date
      site <- conCSVk
      site <- site[site[,4 + obsi]>-0.0001,]
      site <- site[order(site$dateon),]
      preCSVk <- preCSVk[order(preCSVk$starttime,decreasing=F),]

      #aaggregates weekly precipitation
      if (obs[obsi] == "ph"){bi = 1}else{bi = 0}
      site <- appendPre(site,preCSVk,obs,obsi,siObs,bi)
      site <- stats::na.omit(site)
      
      #aggregate precipitation data monthly and get concentration values
      if(!weeklyB){sitem <- aggregateMonthly(bi,site,siObs,obs,obsi,totT,strtYrMo)
      }#else{       sitem <- weeklyConcT(bi,site,siObs,obs,obsi,startdate)}

      
      if (length(outlierDatesbySite) != 0){
        if(outlierDatesbySite[oc] == cati[si]){
          for(i in (oc+1):length(outlierDatesbySite)){
            ifInt  <- suppressWarnings(as.integer(outlierDatesbySite[i]))
            if (!is.na(ifInt)){
              j     <- as.integer(outlierDatesbySite[i])
              index <- match(j, sitem$t)
              if (!is.na(index)){
                sitem[index,] <- NA
              }
            }else{break}
          }
          oc = i #outlier site counter #skips t
        }
      }
      start_time <- Sys.time()
      p <- 1
      #deterministic univariate model for one site
      cn <- colnames(sitem)
      minsite3 <- min(stats::na.omit(sitem[,3]))
      if(!weeklyB || minsite3 >= 0.0001){
        minsite3 <- 0
        nonneg   <- 0
      }else if(minsite3 == 0){
        nonneg = 0.00001
      }
      sitem[,4] <- log(sitem[,3] + abs(minsite3) + nonneg)
      ylogNA <- sitem[,4]
      #moved y0, ylogNA from here to avoid NAs in plot call
      sitem     <- stats::na.omit(sitem)
      colnames(sitem) <- c(cn[1:2],"Conc" ,"log")
      y0[[s]]     <- sitem$Conc
      ylog[[s]]   <- sitem$log
      tafNA[[s]]  <- sitem$t #same as t for monthly different for weekly
      t <- sitem$t
      cyclicTrend <- (I(cos(t*(2*pi/seas))^p)   + I(sin(t*(2*pi/seas))^p))*kv[1]   +
                     (I(cos(t*(2*pi/seas)*2)^p) + I(sin(t*(2*pi/seas)*2)^p))*kv[2] +
                     (I(cos(t*(2*pi/seas)*3)^p) + I(sin(t*(2*pi/seas)*3)^p))*kv[3] +
                     (I(cos(t*(2*pi/seas)*4)^p) + I(sin(t*(2*pi/seas)*4)^p))*kv[4] +
                     (I(cos(t*(2*pi/seas)*5)^p) + I(sin(t*(2*pi/seas)*5)^p))*kv[5] +
                   t*rv[1] + (t^2)*rv[2] + (t^3)*rv[3] + (t^4)*rv[4] + (t^5)*rv[5]
      
      if(df$comp != "ph"){
      dfc  <- data.frame(cbind(sitem$log,cyclicTrend))
      dfc  <- data.frame(cbind(dfc,t))
      mod <- definelm(sitem$log,t,dfc,r,kk,seas)
      } else {
        dfc  <- data.frame(cbind(sitem$Conc,cyclicTrend))
        dfc  <- data.frame(cbind(dfc,t))
        mod <- definelm(sitem$Conc,t,dfc,r,kk,seas)
      }
      er  <- stats::residuals(mod)
      summary(mod)
      mods[[s]] <- summary(mod)
      tc       <- data.frame(1:totT)
      currResi <- data.frame(cbind(t,er))
      colnames(currResi) <- c("t","error")
      colnames(tc)      <- c("t")
      cR       <- merge(tc,currResi,by = "t", all = TRUE) #merge(dfRes,currRes, by = "t", all = F)
      e2[[s]]  <- cR[,2]
      end_time <- Sys.time()
      #print(paste0("model time :",end_time - start_time))
      #store predicted value vector
      options(warn = -1)
      if(length(tafNA[[s]]) < totT){
        vpred <- predict(mod,newdata = tc)
      }else{vpred <- predict(mod)}
      options(warn = 0)
      
      #fill in missing t ##there's a better way of doing this
      se1  <- sqrt(deviance(mod)/df.residual(mod))
      fi   <- 0 #to loop back
      maxfi<- totT - length(t)
      fe   <- 1:(maxfi) #fake end to keep loop going #ignore if t=48
      t    <- c(t,fe)
      y1   <- c(sitem$log,fe)
      er   <- c(er,fe)
      kl    <- 1

      for(i in 1:(totT+maxfi-1)){
        if(t[kl] != i-fi){ #if t is skipped
          cy1 <- vpred[i-fi]
          ry1 <- y1[kl:length(y1)]
          set.seed(i+s)
          e0 <- rnorm(1,0,se1)
          y1 <- c(y1[1:kl-1],cy1+e0,ry1)
          er <- c(er[1:kl-1],e0,er[kl:length(er)]) # change to stochastic #change to 0
          t  <- c(t[1:kl-1],i-fi,t[kl:length(t)])
          fi <- fi + 1
        }else{kl = kl + 1}
      }
      if(length(t)>totT){t<-t[1:totT]; y1 <- y1[1:totT];er<-er[1:totT]}
      
      #store parameters
      e[[s]]     <- unname(er)
      vpredl[[s]] <- vpred
      
      #plot all sites if indicated by user
      if(plotAll){
        if(s%%4 == 1 && s!=1){
          grDevices::dev.new(width = 6, height = 5.5, noRStudioGD = T, unit = "in")
          graphics::par(mfrow = c(2,2))}
        graphics::par(mar=c(4,4,2,2))
        tc <- 1:length(vpred)
        graphics::plot(sitem$t,sitem$log, ylab="Log concentration",main = paste(cati[si],obs[obsi]), xlab = "t (months)")
        graphics::par(new=TRUE)
        graphics::lines(x = tc, y = vpred, col ="blue")
      }
    }
    si <- si + 1
  }
  ############################END BIG LOOP##############################
  
  options(warn=-1)
  dfRes  <- NULL
  dfRes2 <- NULL
  si    <- 2
  obsi  <- 1
  tc <- 1:totT
  #loop to create dataframe of all residuals
  dfRes <- cbind(tc,e[[1]])
  dfRes2 <- cbind(tc,e2[[1]]) #with NA values
  colnames(dfRes) <- c("t",paste(cati[1],obs[1], sep=""))
  colnames(dfRes2) <- c("t",paste(cati[1],obs[1], sep=""))
  for( i in 2:(length(cati)*length(obs))){
    if (i%%(length(cati)) == 1){obsi = obsi + 1; si = 1}
      currRes <- cbind(tc,e[[i]])
      colnames(currRes) <- c("t",paste0(cati[si],obs[obsi]))
      dfRes   <- merge(dfRes,currRes, by = "t", all = F)
    currRes2 <- cbind(tc,e2[[i]])
    colnames(currRes2) <- c("t",paste0(cati[si],obs[obsi]))
    dfRes2   <- merge(dfRes2,currRes2, by = "t", all.x = TRUE,all.y = TRUE)
    si <- si+1
  }
  #create covariance matrix
  covxx <- data.frame(cov(dfRes[,-1]))
  options(warn=0)
  
  #produce multivariate analysis
  graphics::par(mar=c(1,1,1,1))
  MVDw <- mvn(dfRes[,-1], subset = NULL, mvnTest = "mardia", covariance = TRUE, tol = 1e-25, alpha = 0.5, scale = FALSE, desc = TRUE, transform = "none", univariateTest = "SW",  univariatePlot = "none", multivariatePlot = "none", multivariateOutlierMethod = "none", bc = FALSE, bcType = "rounded", showOutliers = FALSE,showNewData = FALSE)
  univariateTest <- MVDw$univariateNormality
  MVDw
  if(plotMulti){
    grDevices::dev.new(width = 4, height = 5, noRStudioGD = TRUE)
    #graphics::par(mfrow=c(1,2))
    MVDw <- mvn(dfRes[,-1], subset = NULL, mvnTest = "mardia", covariance = TRUE, tol = 1e-25,
                alpha = 0.5, scale = FALSE, desc = TRUE, transform = "none", univariateTest = "SW", 
                univariatePlot = "none", multivariatePlot = "qq", multivariateOutlierMethod = "none", 
                bc = FALSE, bcType = "rounded", showOutliers = FALSE, showNewData = FALSE)
    MVDw
  }
  #sitePlot
  if(plotB){
    tc <- 1:totT
    for (g in 1:length(sitePlot[[1]])){
      grDevices::dev.new(width = 6, height = 5.5, noRStudioGD = T, unit = "in")
      graphics::par(mar=c(4,4,2,2))
      i = match(sitePlot[[1]][g], cati)
      if(!is.na(i)){
        graphics::plot(tafNA[[i]],ylog[[i]], ylab="Log concentration",main = paste(cati[i],obs[1]),xlab = "t (months)")
        graphics::par(new=TRUE)
        graphics::lines(x = tc, y = vpredl[[i]], col ="blue")
      }else{warning("Site in sitePlot was not in the vector of sites that were analyzed. Make sure the site ID in sitePlot is in the column siteAdd of the input data frame.")}
    }
  }
  #outlier analysis
  rosnerT     <- vector(mode="list", length=(length(cati)*length(obs))) #time stamp
  siteRosner  <- NULL
  noutliers   <- "Outlier test not enabled, add sites to siteOutliers in the following format list(c(\"IN41\",\"OH71\"))"
  if(showOutliers == TRUE){
    noutliers <- 0
    if (length(siteOutliers[[1]])==1){
      #find outliers in each site
      #Store Site outliers
      i <- match(siteOutliers[[1]][1],cati)
      if(is.na(i)){
        message("siteOutliers string doesn't match any colnames, these are the options:")
        print(colnames(dfRes))
        siteRosner = NULL}
      else{siteRosner = rosnerTest(dfRes[,i+1])
          rosnerT[[i]] <- rosnerTest(dfRes[,i+1])}
    }else if (length(siteOutliers[[1]]) > 1){
      for (z in 1:length(siteOutliers[[1]])){
        i <- match(siteOutliers[[1]][z],cati)
        if(is.na(i)){
          message("siteOutliers string doesn't match any colnames, these are the options:")
          print(cati)
          siteRosner = NULL
        }else{siteRosner = c(siteRosner, rosnerTest(dfRes[,i+1]))
              rosnerT[[i]] <- rosnerTest(dfRes[,i+1])}
              noutliers    <- noutliers + rosnerTest(dfRes[,i+1])$n.outliers
      }
    }
  }
  if(!is.null(removeOutlier)){
    if(siteOutliers[[1]] %contain% removeOutlier[[1]]){
      if(length(removeOutlier[[1]])>=2){
        sitesOut <- removeOutlier[[1]]
        outlierDatesbySite <- takeOutAllOutliers(sitesOut,rosnerResult = rosnerT,cati)$outBySite
        outSites           <- takeOutAllOutliers(sitesOut,rosnerResult = rosnerT,cati)$sites
        if(length(outlierDatesbySite) >= 2){
          updatedPars        <- reEvaluateSites(dfInp, tafNA,startdate,enddate,totT,
                                                y0,ylog,e,e2,mods,vpredl, outlierDatesbySite,outSites,cati,strtYrMo,endYrMo)
          #update all outputs here
          dfRes   <- updatedPars$residualData
          dfRes2  <- updatedPars$residualDataNA
          covxx   <- updatedPars$cov
          MVDw    <- updatedPars$mvn
          vpredl  <- updatedPars$pred
          mods    <- updatedPars$newMods 
        }else{message("No univariate outliers")}
      }
    }else{
      missingSites <- setdiff(removeOutlier[[1]], siteOutliers[[1]])
      str2 <- paste(missingSites, collapse= ", ")
      warning(paste0("Outliers were not removed because there exists site(s) ",
                     str2," in removeOutliers that are not in siteOutliers."))}
  }
  if(writeMat){
    write.mat(covxx,filename = "covSites.mat")
  }
  my_list <- list("listMod" = mods, "cov" = covxx, "sites" = cati,
                  "mvn" = MVDw, "univariateTest" = univariateTest, "residualData" = dfRes[,-1],"residualDataNA" = dfRes2[,-1],
                  "rosnerTest" = rosnerT, "pred" = vpredl, "nOutliers"= noutliers)
  return(my_list)
}