R/noaa.r
In VulnToolkit: Analysis of Tidal Datasets

Documented in noaa

#' @title Downloads NOAA CO-OPS tide data
#'
#' @description
#' Scrapes water level data (and other measurements) from NOAA CO-OPS website. NOAA's site limits the time period for data downloads, but these constraints are avoided by `noaa()`. Requires 
#' internet connection and curl (check availablility using `Sys.which("curl")`). 
#'
#' @details 
#' Download water level and other data from NOAA CO-OPS website.
#' 
#' @usage noaa(begindate = "begindate", enddate = "enddate", station = "8467150",
#' met = FALSE, units = "meters", datum = "MHW", interval = "HL", time = "GMT", 
#' continuous = TRUE)
#' 
#' @param begindate first day of data to download. Format must be YYYYMMDD. If 
#' left unspecified, the first complete day of data will be used.
#' 
#' @param enddate final day of data to download. Format must be YYYYMMDD. If left 
#' unspecified, the last complete day of data will be used.
#' 
#' @param station station number, found on the NOAA Tides and Currents website 
#' (https://www.tidesandcurrents.noaa.gov/stations.html?type=Water+Levels).
#' Station numbers can be numeric or a character string (necessary if first 
#' character is a zero). Default station is Bridgeport, CT.
#' 
#' @param met whether meteorological data should be returned. This value can be 'TRUE' or 
#' 'FALSE'; if 'TRUE', all ancillary parameters are returned. At present, this only 
#' works with 6-minute and hourly data
#' 
#' @param units can be 'feet' or 'meters'. Default is 'meters'
#' 
#' @param datum vertical reference datum, set to 'MHW' by default. Can be 'station', 'NAVD', 
#' 'MLLW', 'MLW', 'MSL', 'MTL', 'MHW', 'MHHW', or 'IGLD' (some datums are not available 
#' at some sites)
#' 
#' @param interval sets measurement interval; can be 'HL' (default), '6 minute', 'hourly', or 'monthly'.
#' 
#' @param time can be 'LST', 'GMT', or 'LST/LDT'. Not all time zones are available for all data. 
#' GMT appears to have wider availability than LST, so it is the default.
#' 
#' @param continuous determines whether a continuous time series is produced, with lengthy gaps 
#' in data filled in with NAs. By default, this is \code{FALSE}. This option only applies 
#' to data at evenly spaced intervals (i.e., \code{6 minute} or \code{hourly})
#' 
#' @return dataset a dataframe with water levels, associated time stamps, a station ID column, 
#' and tide type (if interval is set to \code{HL}). The NOAA CO-OPS website has many odd data 
#' availabilty problems. Some data are not available in all time intervals or time zones.

#' 
#' @importFrom plyr join_all
#' @importFrom stats complete.cases
#' @importFrom utils read.csv
#' @importFrom utils download.file
#' 
#' 
#' @examples \dontrun{
#' # Example requires an internet connection
#'bport2013 <- noaa(begindate = 20130101, enddate = 20131231, 
#'   station = "8467150", interval = "6 minute")
#'
#'test2.1 <- noaa(begindate = "20100101", enddate = "20120101", interval = "hourly") 
#'test2.2 <- noaa(begindate = "20100101", enddate = "20120101", interval = "hourly", 
#'                continuous = TRUE) 
#'nrow(test2.1) # includes data on NOAA site (incomplete record)
#'nrow(test2.2) # fills gaps with NAs 
#'test2.3 <- noaa(begindate = "20100101", enddate = "20120101", interval = "hourly", 
#'                met = TRUE) 
#' }
#' @export



noaa <- function(begindate = "begindate", enddate = "enddate", station = "8467150",
                 met = FALSE, units = "meters", datum = "MHW", interval = "HL", 
                 time = "GMT", continuous = TRUE) {
  getDates <- function(startDate, endDate, dataType, 
                       first.record = startDate, last.record = endDate) {
    # function produces a vector of dates used to download data
    startDate <- as.Date(gsub("-", "", startDate), tz = posix.tz, format = "%Y%m%d")
    endDate   <- as.Date(gsub("-", "", endDate), tz = posix.tz, format = "%Y%m%d")
    
    dates <- startDate
    if (dataType == "HL") { 
      if (startDate < first.record | endDate > last.record)   {
        stop ("invalid time interval")
      } else if (as.numeric(endDate - startDate) > 364) {
        dates <- seq(startDate, endDate, 365)
      } else (dates <- c(startDate, endDate))
    }
    if (dataType == "6 minute") {
      if (startDate < first.record | endDate > last.record)   {
        stop ("invalid time interval")
      } else if (as.numeric(endDate - startDate) > 30)  {
        dates <- seq(startDate, endDate, 31)
      } else (dates <- c(startDate, endDate))
    }
    if (dataType == "hourly") {
      if (startDate < first.record | endDate > last.record)   {
        stop ("invalid time interval")
      } else if ( as.numeric(endDate - startDate) > 364) {
        dates <- seq(startDate, endDate, 365)
      } else (dates <- c(startDate, endDate))
    }
    
    if (!endDate %in% dates[length(dates)]) {
      dates <- c(dates, endDate)
    }
    
    returnedDates <- gsub("-", "", as.character(dates))     # re-format dates for the url
    returnedDates <- returnedDates[!is.na(returnedDates)]
  }
  
  
  # set acceptable true/flase values
  T.vals  <- c("TRUE", "T", "True", TRUE, T)
  F.vals  <- c("FALSE", "F", "False", FALSE, F)
  TF.vals <- c(T.vals, F.vals)
  
  if (!continuous %in% TF.vals) {
    stop ("'continuous' must be set to 'TRUE' or 'FALSE'")
  }
  
  if ((interval %in% c("HL", "monthly")) & (!met %in% F.vals)) {
    met <- "FALSE"
    print("`met = TRUE` is not consistent with monthly or HL water levels. If meteorological data are desired, request 6 minute or hourly data.")
  }
  
  # set units                                                       
  if (units ==  "meters")       {
    u.csv <- u <- "metric"
  } else if (units ==  "feet")  {
    u     <- "standard"
    u.csv <- "english"
  } else stop ("invalid units: must be 'feet' or 'meters' ")
  
  # set datum
  if (datum %in% c("STND", "MHHW", "MHW", "MTL", "MSL", "MLW", "MLLW", "NAVD", "IGLD")) {
    datum <- datum  
  } else stop ("invalid datum: must be 'STND', 'MHHW', 'MHW', 'MTL',
    'MSL', 'MLW', 'MLLW', 'IGLD', or 'NAVD'")
  
  # set measurement time interval        
  if (interval ==  "6 minute")                  {
    ti.csv  <- "water_level"
    ti.name <- "Verified 6-Minute Water Level"
    met.csv <- "6" # this variable is for calling meteorological data csv files
  } else if (interval ==  "hourly")             {
    ti.csv  <- "hourly_height"
    ti.name <- "Verified Hourly Height Water Level"
    met.csv <- "h"
  } else if (interval ==  "HL")                 {
    ti.csv  <- "high_low"
    ti.name <- "Verified High/Low Water Level"
  } else if (interval ==  "monthly")            {
    ti.csv  <- "monthly_mean"
    ti.name <- "Verified Monthly Mean Water Level"
  }  else stop ("invalid time interval: must be '6 minute', 'hourly', or 'HL'")
  
  # set time zone
  if (time %in%  c("LST/LDT", "GMT", "LST"))    {
    tz <- time
  } else stop ("invalid time zone: must be 'LST/LDT', 'GMT', or 'LST' ")
  
  # set time zone in more detail (for labeling data in conversion to POSIX)
  # LST/LDT is not ideal and, at present, probably produces mislabelled data
  if (time %in%  c("GMT"))    {
    posix.tz <- time
  } else if (time %in%  c("LST/LDT", "LST"))    {
    posix.tz <- ""  # if not specified, the local time zone used (relative to computer, 
    # not the data!). I don't think this will alternate between LST and LDT.
  } 
  
  # set site name/number indicator
  if (regexpr("[0-9]{7}", station)[1] == 1)         {
    site.ind  <- c(1)
  } else if (regexpr("[a-zA-Z]+", station)[1] == 1) {
    stop("Invalid station entry: must use station number. Check active stations 
   at: https://www.tidesandcurrents.noaa.gov/stations.html?type=Water+Levels")
  } else stop("Invalid station entry: must use station number. Check active stations 
   at: https://www.tidesandcurrents.noaa.gov/stations.html?type=Water+Levels")
  
  # nocov start
  
  
  # stns <- readLines("https://www.tidesandcurrents.noaa.gov/stations.html", warn = FALSE) # list of active stations
  
  ### commented out 20210305: NOAA web page now dynamically produced; can't scrape station names :(
  # ### get station name from station home page
  # url <- paste0("https://tidesandcurrents.noaa.gov/stationhome.html?id=", station)
  # stns <- readLines(url, warn = FALSE)
  # stn.query <- paste0(" - Station ID: ", station)
  # nameSection <- grep(stn.query, stns, value = TRUE)
  # if (length(nameSection) == 0) { # 
  #   stop ("Station number appears to be invalid. No match found at
  #           https://www.tidesandcurrents.noaa.gov/stations.html?type=Water+Levels. Debug note: retrieving station name")
  # }
  # site.name <- strsplit(x = nameSection, split = stn.query)[[1]]
  # site.name <- gsub(x = site.name, pattern = "\\t", replacement = "")
  site.name <- station
  
  siteParameters <- noaa.parameters(stn = station)
  
  ### find line with data product
  # if there are multiple rows per product, use start date from first line 
  data.line    <- grep(ti.name, siteParameters$params)[1] 
  first.record <- siteParameters$startDate[data.line]
  first.rec    <- as.Date(first.record, format = "%Y%m%d")
  
  data.line.end    <- grep(ti.name, siteParameters$params)[length(grep(ti.name, siteParameters$params))] 
  last.record <- siteParameters$endDate[data.line.end]
  last.rec    <- as.Date(last.record, format = "%Y%m%d")
  ### This should be more robust to data gaps
  
  # set start/end dates to full period of record, if left as default
  if (begindate ==  "begindate")        {
    sdate <- first.rec + 1
  } else if (begindate !=  "begindate") {
    sdate <- as.Date(as.character(begindate), "%Y%m%d")
  }
  
  if (enddate ==  "enddate")            {
    edate <- last.rec - 1
  } else if (enddate !=  "enddate")     {
    edate <- as.Date(as.character(enddate), "%Y%m%d")
  }
  
  
  
  # check if date range is within period of record, and check if time period
  # requires splitting into smaller units. Interval limit is 1 year for hourly
  # and HL data,  31 days for 6-min data, 10 years for monthly data.
  
  
  dates2 <- getDates(startDate = sdate, endDate = edate, dataType = interval, 
                     first.record = first.rec, last.record = last.rec)
  
  #####
  ##### Get water level data
  #####
  # create list of csv files for water level data
  for(i in 1:(length(dates2) - 1)) {
    url.temp <- c(paste0("https://tidesandcurrents.noaa.gov/api/datagetter?", 
                         "product=", ti.csv, 
                         "&application=NOS.COOPS.TAC.WL",
                         "&begin_date=", dates2[i], 
                         "&end_date=", dates2[i+1], 
                         "&datum=", datum, 
                         "&station=", station,
                         "&time_zone=", tz, 
                         "&units=", u.csv, 
                         "&format=csv"))
    if (!exists("url.list")){
      url.list <- url.temp
    }
    # if the dataset exists, add to it
    if (exists("url.list")){
      url.list[i] <- url.temp
      rm(url.temp)
    }
  }
  
  tempFileNames <- tempfile(pattern = as.character(c(1:length(url.list))), fileext = ".csv")
  
  # download.file(url.list[i], destfile = "test.csv")
  
  # RCurl dependency eliminated 20200903
  ### check for curl: 
  curl.msg <- Sys.which("curl")
  if (!grepl(x = curl.msg, pattern = "curl")) {
    stop("curl (https://curl.haxx.se/) is needed for downloading data. please install curl and ensure it is visible via the R console command: Sys.which('curl'). This may help: https://stackoverflow.com/a/16216825 ")
  }
  
  for (i in 1:length(url.list)) {
    download.file(url.list[i], destfile = tempFileNames[i], 
                  quiet = TRUE,
                  method="curl",
                  extra='-L') # requires curl. might be an issue for some users. also, unlink to delete files
  }

  
  
  for (i in 1:length(tempFileNames)) {
    if (i == 1){
      data.csv <- utils::read.csv(tempFileNames[[i]])
    } else {
      data.csv <- rbind(data.csv, utils::read.csv(tempFileNames[[i]]))
      unlink(tempFileNames[[i]])
    }
  }
  
  data.csv$station <- rep(site.name, times = nrow(data.csv))
  
  label   <- paste("verified water level at ", site.name, " (", units, " rel. to ", datum, ")", sep="")
  t.label <- paste("time_", time, sep = "")
  
  
  # clean up the data
  if(interval == "HL" ) {
    data.csv$datetime <- as.POSIXct(data.csv[, 1], format = "%Y-%m-%d %H:%M", tz = posix.tz)
    data.csv <- data.csv[, c(7, 2, 3, 6)]
    names(data.csv) <- c(t.label, label, "tide", "station")
    levels(data.csv$tide) <- c("H", "HH", "L", "LL")
  }
  
  if(interval == "6 minute" ) {
    data.csv$datetime <- as.POSIXct(data.csv[, 1], format = "%Y-%m-%d %H:%M", tz = posix.tz)
    data.csv <- data.csv[, c(10, 2, 9)]
    names(data.csv) <- c(t.label, label, "station")
  }
  
  if(interval == "hourly" ) {
    data.csv$datetime <- as.POSIXct(data.csv[, 1], format = "%Y-%m-%d %H:%M", tz = posix.tz)
    data.csv <- data.csv[, c(7, 2, 6)]
    names(data.csv) <- c(t.label, label, "station") 
  }
  
  if(interval == "monthly" ) {
    data.csv$datetime <- data.csv$Year + data.csv$Month / 12
  }
  
  
  if(interval == "hourly" & (continuous == "TRUE" | continuous == "T")) {
    data.csv <- data.csv[!duplicated(data.csv[, 1]), ]
    time.df <- data.frame(seq(from = data.csv[1, 1], to = data.csv[nrow(data.csv), 1], by = 60*60))
    names(time.df)[1] <- t.label
    data.csv <- plyr::join_all(list(time.df, data.csv[!duplicated(data.csv[, 1]), ]))
    
  } else if(interval == "6 minute" & (continuous == "TRUE" | continuous == "T")) {
    data.csv <- data.csv[!duplicated(data.csv[, 1]), ]
    time.df <- data.frame(seq(from = data.csv[1, 1], to = data.csv[nrow(data.csv), 1], by = 60*6))
    names(time.df)[1] <- t.label
    data.csv <- plyr::join_all(list(time.df, data.csv[!duplicated(data.csv[, 1]), ]))
    
  } else if(interval == "monthly" & (continuous == "TRUE" | continuous == "T" )) {
    time.df <- data.frame(seq(from = data.csv$datetime[1], to = data.csv$datetime[nrow(data.csv)], by = 1 / 12))
    names(time.df) <- "datetime"
    time.df$datetime <- round(time.df$datetime, 2)
    data.csv$datetime <- round(data.csv$datetime, 2)
    data.csv <- plyr::join_all(list(time.df, data.csv))
    data.csv$Year <- as.numeric(data.csv$Year)
    data.csv$Year[is.na(data.csv$station)] <- as.numeric(substr(data.csv$datetime[is.na(data.csv$station)], 1, 4))
    data.csv$Month[is.na(data.csv$station)] <- round((data.csv$datetime[is.na(data.csv$station)] - data.csv$Year[is.na(data.csv$station)]) * 12)
    data.csv$station[is.na(data.csv$station)] <- site.name
  } else data.csv <- data.csv[!duplicated(data.csv[, 1]), ]
  
  
  #####
  ##### Get meteorological data (if desired)
  #####
  # create list of csv files for met data
  # I may need to verify that data is available for requested range, 
  # or ignore csv files that return blank data
  
  # 150809: If no met data are available during the time period specified, an error is thrown. 
  # This shouldn't happen.
  
  if (!met %in% F.vals) {
    
    # get available products for the station, and corresponding dates
    param_list   <- data.frame(names = c("Conductivity", "Wind", "Barometric Pressure", "Air Temperature",
                                         "Water Temperature", "Relative Humidity", "Salinity"), 
                               codes = c("conductivity", "wind", "air_pressure", "air_temperature", "water_temperature",
                                         "humidity", "salinity")
    )
    
    #   siteParameters
    #   date.list    <- lapply(TempNodes, function(x)  xmlValue(getSibling(x))) # first entry is consistently skipped
    #   availableParams    <- data.frame(param = NA, start = NA, end = NA)
    availableParams    <- siteParameters[siteParameters$params %in% param_list$names, ]
    
    # check that 'met' is valid.
    # "TRUE" causes met to download all available data
    if (sum(met %in% availableParams$params) > 0) {
      availableParams <- availableParams[availableParams$params %in% met, ] # select only those sought params that are also available
    } else if (sum(met %in% availableParams$params) == 0) {
      message("`met` parameters were not recognized. All meteorological parameters will be downloaded.")
    }
    
    # add if clause: if availableParams has more than zero rows, run this
    if (!nrow(availableParams) == 0) {
      availableParams$actualEnd <- availableParams$actualStart <- as.numeric(NA) # start and end dates to use when calling data
      
      for (i in 1:nrow(availableParams)) {
        #       # if a param is found, check if date range is relevant
        if (availableParams$startDate[i] < enddate) { # if data starts before request ends, fill in data
          # if the date range is relevant, record the details
          # first check whether the date range is relevant, then if it is include the parameter 
          # old version converted to times before comparing: if (as.integer(as.POSIXct(as.character(availableParams$startDate[i]), format = "%Y%m%d", origin = posix.tz)) <= as.integer(as.POSIXct(as.character(enddate), format = "%Y%m%d", origin = posix.tz))) {
          if (availableParams$startDate[i] <= enddate) {
            temp.Params <- data.frame(param = as.character(param_list$codes[i]), start = NA, end = NA)
            
            if ((availableParams$startDate[i] <= begindate) & !(availableParams$endDate[i] < begindate)) {
              availableParams$actualStart[i]    <- begindate
            } else if ((availableParams$startDate[i] <= begindate) & (availableParams$endDate[i] < begindate) ) {
              availableParams$actualStart[i]    <- as.numeric(NA)
            } else if (availableParams$startDate[i] > begindate) {
              availableParams$actualStart[i]    <- as.numeric(availableParams$startDate[i])
            }
            
            if (availableParams$endDate[i] >= enddate) {
              availableParams$actualEnd[i]      <- enddate
            } else if (availableParams$endDate[i] < enddate) {
              availableParams$actualEnd[i]      <- as.numeric(availableParams$endDate[i])
            }
            if (is.na(availableParams$actualStart[i])) {
              availableParams$actualEnd[i]    <- as.numeric(NA)
            }
          }
        }
      }
      
      ### parameters relevant to our time period of interest
      availableParams <- availableParams[stats::complete.cases(availableParams), ]
      # availableParams has each available ancillary parameter, and their associated start and end dates
      
      
      for (i in 1:nrow(availableParams)) {
        dateRange <- getDates(startDate = availableParams$actualStart[i], endDate = availableParams$actualEnd[i], dataType = interval)
        
        for(j in 1:(length(dateRange) - 1)) {
          met.url.temp <- c(paste0("https://tidesandcurrents.noaa.gov/api/datagetter?", 
                                   "product=", param_list$codes[match(availableParams$param[i], param_list$names)],
                                   "&application=NOS.COOPS.TAC.PHYSOCEAN",
                                   "&begin_date=", dateRange[j], 
                                   "&end_date=", dateRange[j+1], 
                                   "&station=", station,
                                   "&time_zone=", tz, 
                                   "&units=", u.csv,
                                   "&interval=", met.csv,
                                   "&format=csv"))
          
          
          if (j == 1) {
            met.url.list    <- met.url.temp
          } else  {
            met.url.list[j] <- met.url.temp
          }
          rm(met.url.temp)
        }
        
        tempFileNames <- tempfile(pattern = as.character(c(1:length(met.url.list))), fileext = "_met.csv")
        # RCurl dependency eliminated 20200903
        ### check for curl: 
        for (h in 1:length(met.url.list)) {
          download.file(met.url.list[h], destfile = tempFileNames[h], 
                        quiet = TRUE,
                        method="curl",
                        extra='-L') # requires curl. might be an issue for some users. also, unlink to delete files
        }
        
        
        
        for (h in 1:length(tempFileNames)) {
          if (h == 1){
            met.data.csv <- utils::read.csv(tempFileNames[[h]])
          } else {
            met.data.csv <- rbind(met.data.csv, utils::read.csv(tempFileNames[[h]]))
            unlink(tempFileNames[[h]])
          }
        }
        #######
        
        # now, all data is compiled for availableParams$param[i] 
        # so, merge it in.
        
        # but first, remove bloated columns
        rem           <- match(c("X", "N", "R"), names(met.data.csv))
        met.data.csv  <- met.data.csv[, -c(rem[!is.na(rem)])]
        
        # get datetime variable
        # 1. convert to character
        met.data.csv[, 1]    <- as.character(met.data.csv[, 1])
        # 2. re-structure to remove backslashes
        t.temp <- gsub("-", "", met.data.csv[, 1])
        
        # join using seconds since 1970
        met.data.csv$datetime <- as.integer(as.POSIXct(t.temp, format = "%Y%m%d %H:%M", tz = posix.tz))
        data.csv$datetime     <- as.integer(data.csv[, 1])
        
        data.csv              <- plyr::join_all(list(data.csv, met.data.csv[, -1]), by = "datetime")
        
        # now, remove datetime integer column
        datetime.col          <- grep("datetime", names(data.csv))
        data.csv              <- data.csv[, -c(datetime.col)] 
        invisible(data.csv)
        
        rm(met.url.list)
        rm(met.data.csv)
      }
    } # closes section contingent on availableParams haveing >0 rows
    invisible(data.csv)
  }

  invisible(data.csv)
  } # nocov end