#' @title Downloads NOAA CO-OPS tide data
#'
#' @description
#' Scrapes water level data (and other measurements) from NOAA CO-OPS website. NOAA's site limits the time period for data downloads, but these constraints are avoided by `noaa()`. Requires
#' internet connection and curl (check availability using `Sys.which("curl")`).
#'
#' @details
#' Download water level and other data from NOAA CO-OPS website.
#'
#' @usage noaa(begindate = "begindate", enddate = "enddate", station = "8467150",
#' met = FALSE, units = "meters", datum = "MHW", interval = "HL", time = "GMT",
#' continuous = TRUE)
#'
#' @param begindate first day of data to download. Format must be YYYYMMDD. If
#' left unspecified, the first complete day of data will be used.
#'
#' @param enddate final day of data to download. Format must be YYYYMMDD. If left
#' unspecified, the last complete day of data will be used.
#'
#' @param station station number, found on the NOAA Tides and Currents website
#' (https://www.tidesandcurrents.noaa.gov/stations.html?type=Water+Levels).
#' Station numbers can be numeric or a character string (necessary if first
#' character is a zero). Default station is Bridgeport, CT.
#'
#' @param met whether meteorological data should be returned. This value can be 'TRUE' or
#' 'FALSE'; if 'TRUE', all ancillary parameters are returned. At present, this only
#' works with 6-minute and hourly data
#'
#' @param units can be 'feet' or 'meters'. Default is 'meters'
#'
#' @param datum vertical reference datum, set to 'MHW' by default. Can be 'station', 'NAVD',
#' 'MLLW', 'MLW', 'MSL', 'MTL', 'MHW', 'MHHW', or 'IGLD' (some datums are not available
#' at some sites)
#'
#' @param interval sets measurement interval; can be 'HL' (default), '6 minute', 'hourly', or 'monthly'.
#'
#' @param time can be 'LST', 'GMT', or 'LST/LDT'. Not all time zones are available for all data.
#' GMT appears to have wider availability than LST, so it is the default.
#'
#' @param continuous determines whether a continuous time series is produced, with lengthy gaps
#' in data filled in with NAs. By default, this is \code{TRUE}. This option only applies
#' to data at evenly spaced intervals (i.e., \code{6 minute} or \code{hourly})
#'
#' @return a dataframe with water levels, associated time stamps, a station ID column,
#' and tide type (if interval is set to \code{HL}). The NOAA CO-OPS website has many odd data
#' availability problems. Some data are not available in all time intervals or time zones.
#'
#' @importFrom plyr join_all
#' @importFrom stats complete.cases
#' @importFrom utils read.csv
#' @importFrom utils download.file
#'
#'
#' @examples \dontrun{
#' # Example requires an internet connection
#'bport2013 <- noaa(begindate = 20130101, enddate = 20131231,
#' station = "8467150", interval = "6 minute")
#'
#'test2.1 <- noaa(begindate = "20100101", enddate = "20120101", interval = "hourly")
#'test2.2 <- noaa(begindate = "20100101", enddate = "20120101", interval = "hourly",
#' continuous = TRUE)
#'nrow(test2.1) # includes data on NOAA site (incomplete record)
#'nrow(test2.2) # fills gaps with NAs
#'test2.3 <- noaa(begindate = "20100101", enddate = "20120101", interval = "hourly",
#' met = TRUE)
#' }
#' @export
noaa <- function(begindate = "begindate", enddate = "enddate", station = "8467150",
                 met = FALSE, units = "meters", datum = "MHW", interval = "HL",
                 time = "GMT", continuous = TRUE) {
  # Internal helper: splits the requested period into date breakpoints small
  # enough for NOAA's per-request download limits (chunks of up to 365 days for
  # 'HL' and 'hourly' data, 31 days for '6 minute' data; 'monthly' is not
  # chunked here and falls through as a single start/end pair).
  # Returns a character vector of "YYYYMMDD" dates for insertion into URLs.
  # NOTE(review): `posix.tz` is resolved by lexical scoping from the enclosing
  # noaa() frame at call time; it is assigned (L149-154 region) before the
  # first call to getDates(). Also, `as.Date()` on character input ignores
  # `tz` -- the argument is harmless but inert here.
  getDates <- function(startDate, endDate, dataType,
                       first.record = startDate, last.record = endDate) {
    # function produces a vector of dates used to download data
    startDate <- as.Date(gsub("-", "", startDate), tz = posix.tz, format = "%Y%m%d")
    endDate <- as.Date(gsub("-", "", endDate), tz = posix.tz, format = "%Y%m%d")
    dates <- startDate
    if (dataType == "HL") {
      # Reject requests outside the station's period of record; otherwise
      # chunk into 365-day segments (NOAA limit is one year for HL data).
      if (startDate < first.record | endDate > last.record) {
        stop ("invalid time interval")
      } else if (as.numeric(endDate - startDate) > 364) {
        dates <- seq(startDate, endDate, 365)
      } else (dates <- c(startDate, endDate))
    }
    if (dataType == "6 minute") {
      # 6-minute data is limited to 31 days per request.
      if (startDate < first.record | endDate > last.record) {
        stop ("invalid time interval")
      } else if (as.numeric(endDate - startDate) > 30) {
        dates <- seq(startDate, endDate, 31)
      } else (dates <- c(startDate, endDate))
    }
    if (dataType == "hourly") {
      # Hourly data uses the same one-year limit as HL data.
      if (startDate < first.record | endDate > last.record) {
        stop ("invalid time interval")
      } else if ( as.numeric(endDate - startDate) > 364) {
        dates <- seq(startDate, endDate, 365)
      } else (dates <- c(startDate, endDate))
    }
    # seq() may stop short of endDate; make sure the final breakpoint is the
    # requested end date so the last chunk is not truncated.
    if (!endDate %in% dates[length(dates)]) {
      dates <- c(dates, endDate)
    }
    returnedDates <- gsub("-", "", as.character(dates)) # re-format dates for the url
    # Last expression is the (invisible) return value of getDates().
    returnedDates <- returnedDates[!is.na(returnedDates)]
  }
  # set acceptable true/false values; comparison via %in% coerces everything
  # to character, so "TRUE", "T", "True", and logical TRUE are all accepted.
  T.vals <- c("TRUE", "T", "True", TRUE, T)
  F.vals <- c("FALSE", "F", "False", FALSE, F)
  TF.vals <- c(T.vals, F.vals)
  if (!continuous %in% TF.vals) {
    stop ("'continuous' must be set to 'TRUE' or 'FALSE'")
  }
  # Meteorological data is only available alongside 6-minute or hourly water
  # levels; silently downgrade `met` for HL/monthly requests (with a message).
  if ((interval %in% c("HL", "monthly")) & (!met %in% F.vals)) {
    met <- "FALSE"
    print("`met = TRUE` is not consistent with monthly or HL water levels. If meteorological data are desired, request 6 minute or hourly data.")
  }
  # set units: the NOAA API uses "metric"/"standard" in one context and
  # "metric"/"english" in the csv query string, hence the two variables.
  if (units == "meters") {
    u.csv <- u <- "metric"
  } else if (units == "feet") {
    u <- "standard"
    u.csv <- "english"
  } else stop ("invalid units: must be 'feet' or 'meters' ")
  # set datum (validated against the datums NOAA supports; availability
  # still varies by station and is not checked here)
  if (datum %in% c("STND", "MHHW", "MHW", "MTL", "MSL", "MLW", "MLLW", "NAVD", "IGLD")) {
    datum <- datum
  } else stop ("invalid datum: must be 'STND', 'MHHW', 'MHW', 'MTL',
               'MSL', 'MLW', 'MLLW', 'IGLD', or 'NAVD'")
  # set measurement time interval: ti.csv is the API product name, ti.name is
  # the human-readable product label used to look up the period of record.
  # met.csv (interval code for ancillary-data requests) is only defined for
  # 6-minute and hourly data; that is safe because `met` was forced FALSE
  # above for HL/monthly requests.
  if (interval == "6 minute") {
    ti.csv <- "water_level"
    ti.name <- "Verified 6-Minute Water Level"
    met.csv <- "6" # this variable is for calling meteorological data csv files
  } else if (interval == "hourly") {
    ti.csv <- "hourly_height"
    ti.name <- "Verified Hourly Height Water Level"
    met.csv <- "h"
  } else if (interval == "HL") {
    ti.csv <- "high_low"
    ti.name <- "Verified High/Low Water Level"
  } else if (interval == "monthly") {
    ti.csv <- "monthly_mean"
    ti.name <- "Verified Monthly Mean Water Level"
    # NOTE(review): the error message below omits 'monthly' even though it is
    # accepted -- message text should list all four options.
  } else stop ("invalid time interval: must be '6 minute', 'hourly', or 'HL'")
  # set time zone (as passed to the NOAA API query string)
  if (time %in% c("LST/LDT", "GMT", "LST")) {
    tz <- time
  } else stop ("invalid time zone: must be 'LST/LDT', 'GMT', or 'LST' ")
  # set time zone in more detail (for labeling data in conversion to POSIX)
  # LST/LDT is not ideal and, at present, probably produces mislabelled data
  if (time %in% c("GMT")) {
    posix.tz <- time
  } else if (time %in% c("LST/LDT", "LST")) {
    posix.tz <- "" # if not specified, the local time zone used (relative to computer,
    # not the data!). I don't think this will alternate between LST and LDT.
  }
  # set site name/number indicator: station must begin with a 7-digit number;
  # anything starting with letters (a station name) is rejected.
  if (regexpr("[0-9]{7}", station)[1] == 1) {
    site.ind <- c(1)
  } else if (regexpr("[a-zA-Z]+", station)[1] == 1) {
    stop("Invalid station entry: must use station number. Check active stations
         at: https://www.tidesandcurrents.noaa.gov/stations.html?type=Water+Levels")
  } else stop("Invalid station entry: must use station number. Check active stations
              at: https://www.tidesandcurrents.noaa.gov/stations.html?type=Water+Levels")
  # nocov start
  # stns <- readLines("https://www.tidesandcurrents.noaa.gov/stations.html", warn = FALSE) # list of active stations
  ### commented out 20210305: NOAA web page now dynamically produced; can't scrape station names :(
  # ### get station name from station home page
  # url <- paste0("https://tidesandcurrents.noaa.gov/stationhome.html?id=", station)
  # stns <- readLines(url, warn = FALSE)
  # stn.query <- paste0(" - Station ID: ", station)
  # nameSection <- grep(stn.query, stns, value = TRUE)
  # if (length(nameSection) == 0) { #
  # stop ("Station number appears to be invalid. No match found at
  # https://www.tidesandcurrents.noaa.gov/stations.html?type=Water+Levels. Debug note: retrieving station name")
  # }
  # site.name <- strsplit(x = nameSection, split = stn.query)[[1]]
  # site.name <- gsub(x = site.name, pattern = "\\t", replacement = "")
  # Station number doubles as the site label now that names can't be scraped.
  site.name <- station
  # noaa.parameters() is a package-internal helper; based on usage below it
  # returns a data frame with (at least) columns `params`, `startDate`, and
  # `endDate` describing the station's available data products -- TODO confirm.
  siteParameters <- noaa.parameters(stn = station)
  ### find line with data product
  # if there are multiple rows per product, use start date from first line
  data.line <- grep(ti.name, siteParameters$params)[1]
  first.record <- siteParameters$startDate[data.line]
  first.rec <- as.Date(first.record, format = "%Y%m%d")
  # ...and the end date from the last matching line.
  data.line.end <- grep(ti.name, siteParameters$params)[length(grep(ti.name, siteParameters$params))]
  last.record <- siteParameters$endDate[data.line.end]
  last.rec <- as.Date(last.record, format = "%Y%m%d")
  ### This should be more robust to data gaps
  # set start/end dates to full period of record, if left as default.
  # The string "begindate"/"enddate" acts as a sentinel for "unspecified";
  # defaults trim one day from each end to use only complete days of data.
  if (begindate == "begindate") {
    sdate <- first.rec + 1
  } else if (begindate != "begindate") {
    sdate <- as.Date(as.character(begindate), "%Y%m%d")
  }
  if (enddate == "enddate") {
    edate <- last.rec - 1
  } else if (enddate != "enddate") {
    edate <- as.Date(as.character(enddate), "%Y%m%d")
  }
  # check if date range is within period of record, and check if time period
  # requires splitting into smaller units. Interval limit is 1 year for hourly
  # and HL data, 31 days for 6-min data, 10 years for monthly data.
  dates2 <- getDates(startDate = sdate, endDate = edate, dataType = interval,
                     first.record = first.rec, last.record = last.rec)
  #####
  ##### Get water level data
  #####
  # create list of csv files for water level data; one API url per date chunk,
  # each chunk spanning dates2[i]..dates2[i+1].
  for(i in 1:(length(dates2) - 1)) {
    url.temp <- c(paste0("https://tidesandcurrents.noaa.gov/api/datagetter?",
                         "product=", ti.csv,
                         "&application=NOS.COOPS.TAC.WL",
                         "&begin_date=", dates2[i],
                         "&end_date=", dates2[i+1],
                         "&datum=", datum,
                         "&station=", station,
                         "&time_zone=", tz,
                         "&units=", u.csv,
                         "&format=csv"))
    # NOTE(review): the first `if` is redundant -- the second branch always
    # runs and performs the same assignment for i == 1.
    if (!exists("url.list")){
      url.list <- url.temp
    }
    # if the dataset exists, add to it
    if (exists("url.list")){
      url.list[i] <- url.temp
      rm(url.temp)
    }
  }
  # One temp file per chunk; files are read and removed below.
  tempFileNames <- tempfile(pattern = as.character(c(1:length(url.list))), fileext = ".csv")
  # download.file(url.list[i], destfile = "test.csv")
  # RCurl dependency eliminated 20200903
  ### check for curl: Sys.which returns "" when curl is not on the PATH.
  curl.msg <- Sys.which("curl")
  if (!grepl(x = curl.msg, pattern = "curl")) {
    stop("curl (https://curl.haxx.se/) is needed for downloading data. please install curl and ensure it is visible via the R console command: Sys.which('curl'). This may help: https://stackoverflow.com/a/16216825 ")
  }
  # Download each chunk; '-L' follows redirects.
  for (i in 1:length(url.list)) {
    download.file(url.list[i], destfile = tempFileNames[i],
                  quiet = TRUE,
                  method="curl",
                  extra='-L') # requires curl. might be an issue for some users. also, unlink to delete files
  }
  # Read and row-bind all chunks into one data frame.
  # NOTE(review): the first temp file (i == 1) is never unlink'ed; only files
  # 2..n are deleted here. tempdir cleanup will remove it eventually.
  for (i in 1:length(tempFileNames)) {
    if (i == 1){
      data.csv <- utils::read.csv(tempFileNames[[i]])
    } else {
      data.csv <- rbind(data.csv, utils::read.csv(tempFileNames[[i]]))
      unlink(tempFileNames[[i]])
    }
  }
  data.csv$station <- rep(site.name, times = nrow(data.csv))
  # Column labels describing the measurement and its time zone.
  label <- paste("verified water level at ", site.name, " (", units, " rel. to ", datum, ")", sep="")
  t.label <- paste("time_", time, sep = "")
  # clean up the data: parse timestamps and keep only the relevant columns.
  # NOTE(review): the numeric column indices below assume the exact column
  # layout of NOAA's csv response for each product -- verify against the API
  # if downloads start failing.
  if(interval == "HL" ) {
    data.csv$datetime <- as.POSIXct(data.csv[, 1], format = "%Y-%m-%d %H:%M", tz = posix.tz)
    data.csv <- data.csv[, c(7, 2, 3, 6)]
    names(data.csv) <- c(t.label, label, "tide", "station")
    # NOTE(review): this only has an effect if `tide` is a factor; under
    # R >= 4.0 read.csv returns character columns by default, making this a
    # no-op -- confirm intended behavior.
    levels(data.csv$tide) <- c("H", "HH", "L", "LL")
  }
  if(interval == "6 minute" ) {
    data.csv$datetime <- as.POSIXct(data.csv[, 1], format = "%Y-%m-%d %H:%M", tz = posix.tz)
    data.csv <- data.csv[, c(10, 2, 9)]
    names(data.csv) <- c(t.label, label, "station")
  }
  if(interval == "hourly" ) {
    data.csv$datetime <- as.POSIXct(data.csv[, 1], format = "%Y-%m-%d %H:%M", tz = posix.tz)
    data.csv <- data.csv[, c(7, 2, 6)]
    names(data.csv) <- c(t.label, label, "station")
  }
  if(interval == "monthly" ) {
    # Monthly data gets a decimal-year datetime (year + month/12).
    data.csv$datetime <- data.csv$Year + data.csv$Month / 12
  }
  # If `continuous` was requested, build a complete evenly spaced time vector
  # spanning the data and left-join the observations onto it, so gaps become
  # NA rows. Otherwise just drop duplicated timestamps.
  if(interval == "hourly" & (continuous == "TRUE" | continuous == "T")) {
    data.csv <- data.csv[!duplicated(data.csv[, 1]), ]
    time.df <- data.frame(seq(from = data.csv[1, 1], to = data.csv[nrow(data.csv), 1], by = 60*60))
    names(time.df)[1] <- t.label
    data.csv <- plyr::join_all(list(time.df, data.csv[!duplicated(data.csv[, 1]), ]))
  } else if(interval == "6 minute" & (continuous == "TRUE" | continuous == "T")) {
    data.csv <- data.csv[!duplicated(data.csv[, 1]), ]
    time.df <- data.frame(seq(from = data.csv[1, 1], to = data.csv[nrow(data.csv), 1], by = 60*6))
    names(time.df)[1] <- t.label
    data.csv <- plyr::join_all(list(time.df, data.csv[!duplicated(data.csv[, 1]), ]))
  } else if(interval == "monthly" & (continuous == "TRUE" | continuous == "T" )) {
    # Monthly: join on rounded decimal-year datetimes, then reconstruct
    # Year/Month/station for the gap rows introduced by the join.
    time.df <- data.frame(seq(from = data.csv$datetime[1], to = data.csv$datetime[nrow(data.csv)], by = 1 / 12))
    names(time.df) <- "datetime"
    time.df$datetime <- round(time.df$datetime, 2)
    data.csv$datetime <- round(data.csv$datetime, 2)
    data.csv <- plyr::join_all(list(time.df, data.csv))
    data.csv$Year <- as.numeric(data.csv$Year)
    data.csv$Year[is.na(data.csv$station)] <- as.numeric(substr(data.csv$datetime[is.na(data.csv$station)], 1, 4))
    data.csv$Month[is.na(data.csv$station)] <- round((data.csv$datetime[is.na(data.csv$station)] - data.csv$Year[is.na(data.csv$station)]) * 12)
    data.csv$station[is.na(data.csv$station)] <- site.name
  } else data.csv <- data.csv[!duplicated(data.csv[, 1]), ]
  #####
  ##### Get meteorological data (if desired)
  #####
  # create list of csv files for met data
  # I may need to verify that data is available for requested range,
  # or ignore csv files that return blank data
  # 150809: If no met data are available during the time period specified, an error is thrown.
  # This shouldn't happen.
  if (!met %in% F.vals) {
    # get available products for the station, and corresponding dates.
    # Maps human-readable product names (as they appear in siteParameters)
    # to the API product codes used in urls.
    param_list <- data.frame(names = c("Conductivity", "Wind", "Barometric Pressure", "Air Temperature",
                                       "Water Temperature", "Relative Humidity", "Salinity"),
                             codes = c("conductivity", "wind", "air_pressure", "air_temperature", "water_temperature",
                                       "humidity", "salinity")
    )
    # siteParameters
    # date.list <- lapply(TempNodes, function(x) xmlValue(getSibling(x))) # first entry is consistently skipped
    # availableParams <- data.frame(param = NA, start = NA, end = NA)
    availableParams <- siteParameters[siteParameters$params %in% param_list$names, ]
    # check that 'met' is valid.
    # "TRUE" causes met to download all available data
    if (sum(met %in% availableParams$params) > 0) {
      availableParams <- availableParams[availableParams$params %in% met, ] # select only those sought params that are also available
    } else if (sum(met %in% availableParams$params) == 0) {
      message("`met` parameters were not recognized. All meteorological parameters will be downloaded.")
    }
    # add if clause: if availableParams has more than zero rows, run this
    if (!nrow(availableParams) == 0) {
      availableParams$actualEnd <- availableParams$actualStart <- as.numeric(NA) # start and end dates to use when calling data
      # Clip each parameter's period of record to the requested date range.
      # NOTE(review): these comparisons use `begindate`/`enddate` directly;
      # when the caller left them as the sentinel strings
      # "begindate"/"enddate" the comparisons are string-vs-number -- confirm
      # met = TRUE works with default dates.
      for (i in 1:nrow(availableParams)) {
        # # if a param is found, check if date range is relevant
        if (availableParams$startDate[i] < enddate) { # if data starts before request ends, fill in data
          # if the date range is relevant, record the details
          # first check whether the date range is relevant, then if it is include the parameter
          # old version converted to times before comparing: if (as.integer(as.POSIXct(as.character(availableParams$startDate[i]), format = "%Y%m%d", origin = posix.tz)) <= as.integer(as.POSIXct(as.character(enddate), format = "%Y%m%d", origin = posix.tz))) {
          if (availableParams$startDate[i] <= enddate) {
            temp.Params <- data.frame(param = as.character(param_list$codes[i]), start = NA, end = NA)
            # actualStart: later of record start and requested start (NA if
            # the record ends before the request begins).
            if ((availableParams$startDate[i] <= begindate) & !(availableParams$endDate[i] < begindate)) {
              availableParams$actualStart[i] <- begindate
            } else if ((availableParams$startDate[i] <= begindate) & (availableParams$endDate[i] < begindate) ) {
              availableParams$actualStart[i] <- as.numeric(NA)
            } else if (availableParams$startDate[i] > begindate) {
              availableParams$actualStart[i] <- as.numeric(availableParams$startDate[i])
            }
            # actualEnd: earlier of record end and requested end.
            if (availableParams$endDate[i] >= enddate) {
              availableParams$actualEnd[i] <- enddate
            } else if (availableParams$endDate[i] < enddate) {
              availableParams$actualEnd[i] <- as.numeric(availableParams$endDate[i])
            }
            if (is.na(availableParams$actualStart[i])) {
              availableParams$actualEnd[i] <- as.numeric(NA)
            }
          }
        }
      }
      ### parameters relevant to our time period of interest
      availableParams <- availableParams[stats::complete.cases(availableParams), ]
      # availableParams has each available ancillary parameter, and their associated start and end dates
      for (i in 1:nrow(availableParams)) {
        # Chunk the parameter's date range the same way as the water level data.
        dateRange <- getDates(startDate = availableParams$actualStart[i], endDate = availableParams$actualEnd[i], dataType = interval)
        for(j in 1:(length(dateRange) - 1)) {
          # NOTE(review): `availableParams$param` relies on `$` partial
          # matching to the `params` column -- confirm, and consider using
          # the full column name.
          met.url.temp <- c(paste0("https://tidesandcurrents.noaa.gov/api/datagetter?",
                                   "product=", param_list$codes[match(availableParams$param[i], param_list$names)],
                                   "&application=NOS.COOPS.TAC.PHYSOCEAN",
                                   "&begin_date=", dateRange[j],
                                   "&end_date=", dateRange[j+1],
                                   "&station=", station,
                                   "&time_zone=", tz,
                                   "&units=", u.csv,
                                   "&interval=", met.csv,
                                   "&format=csv"))
          if (j == 1) {
            met.url.list <- met.url.temp
          } else {
            met.url.list[j] <- met.url.temp
          }
          rm(met.url.temp)
        }
        tempFileNames <- tempfile(pattern = as.character(c(1:length(met.url.list))), fileext = "_met.csv")
        # RCurl dependency eliminated 20200903
        ### check for curl:
        for (h in 1:length(met.url.list)) {
          download.file(met.url.list[h], destfile = tempFileNames[h],
                        quiet = TRUE,
                        method="curl",
                        extra='-L') # requires curl. might be an issue for some users. also, unlink to delete files
        }
        for (h in 1:length(tempFileNames)) {
          if (h == 1){
            met.data.csv <- utils::read.csv(tempFileNames[[h]])
          } else {
            met.data.csv <- rbind(met.data.csv, utils::read.csv(tempFileNames[[h]]))
            unlink(tempFileNames[[h]])
          }
        }
        #######
        # now, all data is compiled for availableParams$param[i]
        # so, merge it in.
        # but first, remove bloated columns (flag/count columns in the csv)
        rem <- match(c("X", "N", "R"), names(met.data.csv))
        met.data.csv <- met.data.csv[, -c(rem[!is.na(rem)])]
        # get datetime variable
        # 1. convert to character
        met.data.csv[, 1] <- as.character(met.data.csv[, 1])
        # 2. re-structure to remove backslashes
        t.temp <- gsub("-", "", met.data.csv[, 1])
        # join using seconds since 1970 (integer epoch avoids POSIXct
        # comparison issues during the join)
        met.data.csv$datetime <- as.integer(as.POSIXct(t.temp, format = "%Y%m%d %H:%M", tz = posix.tz))
        data.csv$datetime <- as.integer(data.csv[, 1])
        data.csv <- plyr::join_all(list(data.csv, met.data.csv[, -1]), by = "datetime")
        # now, remove datetime integer column
        datetime.col <- grep("datetime", names(data.csv))
        data.csv <- data.csv[, -c(datetime.col)]
        # NOTE(review): this invisible() call is not the function's return
        # value (we are mid-loop); it evaluates and discards -- a no-op.
        invisible(data.csv)
        rm(met.url.list)
        rm(met.data.csv)
      }
    } # closes section contingent on availableParams haveing >0 rows
    # NOTE(review): also a no-op; only the final expression's visibility
    # matters for the return value.
    invisible(data.csv)
  }
  # Return the assembled data frame invisibly.
  invisible(data.csv)
} # nocov end