#' Get a cleaned version of GHCND data from a single weather site
#'
#' This function uses ftp to access the Global Historical Climatology Network
#' daily weather data from NOAA's FTP server for a single weather monitor site.
#' It requires the site identification number for that site and will pull the
#' entire weather dataset for the site. It will then clean this data to convert
#' it to a tidier format and will also, if requested, filter it to a certain
#' date range and to certain weather variables.
#'
#' @export
#' @inheritParams ghcnd
#' @param date_min A character string giving the earliest
#' date of the daily weather time series that the user would
#' like in the final output. This character string should be formatted as
#' "yyyy-mm-dd". If not specified, the default is to keep all daily data for
#' the queried weather site from the earliest available date.
#' @param date_max A character string giving the latest
#' date of the daily weather time series that the user would
#' like in the final output. This character string should be formatted as
#' "yyyy-mm-dd". If not specified, the default is to keep all daily data for
#' the queried weather site through the most current available date.
#' @param var A character vector specifying either \code{"all"} (pull all
#' available weather parameters for the site) or the weather parameters to
#' keep in the final data (e.g., \code{c("TMAX", "TMIN")} to only keep
#' maximum and minimum temperature). Example choices for this argument
#' include:
#' \itemize{
#' \item \code{PRCP}: Precipitation, in tenths of millimeters
#' \item \code{TAVG}: Average temperature, in tenths of degrees Celsius
#' \item \code{TMAX}: Maximum temperature, in tenths of degrees Celsius
#' \item \code{TMIN}: Minimum temperature, in tenths of degrees Celsius
#' }
#' A full list of possible weather variables is available in NOAA's README
#' file for the GHCND data
#' (\url{https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt}).
#' Most weather stations will only have a small subset of all the possible
#' weather variables, so the data generated by this function may not include
#' all of the variables the user specifies through this argument.
#'
#' @return A list object with slots for each of the available specified
#' weather variables. Each element in the list is a separate time series
#' dataframe with daily observations, as well as flag values, for one of
#' the weather variables. The flag values give information on the quality
#' and source of each observation; see the NOAA README file linked above
#' for more information.
#'
#' @author Scott Chamberlain \email{myrmecocystus@@gmail.com},
#' Adam Erickson \email{adam.erickson@@ubc.ca}
#'
#' @note This function calls \code{\link{ghcnd}}, which will download and save
#' data from all available dates and weather variables for the queried
#' weather station. The step of limiting the dataset to only certain dates
#' and / or weather variables, using the \code{date_min}, \code{date_max},
#' and \code{var} arguments, does not occur until after the full data has
#' been pulled.
#'
#' @seealso \code{\link{meteo_pull_monitors}}, \code{\link{meteo_tidy_ghcnd}}
#'
#' @examples \dontrun{
#' # Search based on variable and/or date
#' ghcnd_search("AGE00147704", var = "PRCP")
#' ghcnd_search("AGE00147704", var = "PRCP", date_min = "1920-01-01")
#' ghcnd_search("AGE00147704", var = "PRCP", date_max = "1915-01-01")
#' ghcnd_search("AGE00147704", var = "PRCP", date_min = "1920-01-01",
#' date_max = "1925-01-01")
#' ghcnd_search("AGE00147704", date_min = "1920-01-01", date_max = "1925-01-01")
#' ghcnd_search("AGE00147704", var = c("PRCP","TMIN"))
#' ghcnd_search("AGE00147704", var = c("PRCP","TMIN"), date_min = "1920-01-01")
#' ghcnd_search("AGE00147704", var = "adfdf")
#' }
ghcnd_search <- function(stationid, date_min = NULL, date_max = NULL,
var = "all", ...) {
calls <- names(sapply(match.call(), deparse))[-1]
calls_vec <- "path" %in% calls
if (any(calls_vec)) {
stop("The parameter path has been removed, see docs for ?ghcnd_search",
call. = FALSE)
}
dat <- ghcnd_splitvars(ghcnd(stationid))
possvars <- paste0(names(dat), collapse = ", ")
if (any(var != "all")) {
vars_null <- sort(tolower(var))[!sort(tolower(var)) %in% sort(names(dat))]
dat <- dat[tolower(var)]
}
if (any(sapply(dat, is.null))) {
dat <- noaa_compact(dat)
warning(
sprintf("%s not in the dataset\nAvailable variables: %s",
paste0(vars_null, collapse = ", "), possvars), call. = FALSE)
}
if (!is.null(date_min)) {
dat <- lapply(dat, function(z) z %>% dplyr::filter(date >= date_min))
}
if (!is.null(date_max)) {
dat <- lapply(dat, function(z) z %>% dplyr::filter(date <= date_max))
}
dat
}
#' Get all GHCND data from a single weather site
#'
#' This function uses ftp to access the Global Historical Climatology Network
#' daily weather data from NOAA's FTP server for a single weather site. It
#' requires the site identification number for that site and will pull the
#' entire weather dataset for the site.
#'
#' @export
#' @param stationid A character string giving the identification of the weather
#' station for which the user would like to pull data. To get a full and
#' current list of stations, the user can use the \code{\link{ghcnd_stations}}
#' function. To identify stations within a certain radius of a location, the
#' user can use the \code{\link{meteo_nearby_stations}} function.
#' @param ... Additional curl options to pass through to \code{\link[httr]{GET}}.
#'
#' @return A tibble (data.frame) which contains data pulled from NOAA's FTP
#' server for the queried weather site. A README file with more information
#' about the format of this file is available from NOAA
#' (\url{http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt}).
#' This file is formatted so each line of the file gives the daily weather
#' observations for a single weather variable for all days of one month of
#' one year. In addition to measurements, columns are included for certain
#' flags, which add information on observation sources and quality and are
#' further explained in NOAA's README file for the data.
#'
#' @details This function saves the full set of weather data for the queried
#' site locally in the directory specified by the \code{path} argument.
#'
#' You can access the path for the cached file via \code{attr(x, "source")}
#'
#' @author Scott Chamberlain \email{myrmecocystus@@gmail.com},
#' Adam Erickson \email{adam.erickson@@ubc.ca}
#'
#' @seealso To generate a weather dataset for a single weather site that has
#' been cleaned to a tidier weather format, the user should use the
#' \code{\link{ghcnd_search}} function, which calls \code{\link{ghcnd}} and then
#' processes the output, or \code{\link{meteo_tidy_ghcnd}}, which wraps the
#' \code{\link{ghcnd_search}} function to output a tidy dataframe. To pull
#' GHCND data from multiple monitors, see \code{\link{meteo_pull_monitors}}.
#'
#' @section File storage:
#' We use \pkg{rappdirs} to store files, see
#' \code{\link[rappdirs]{user_cache_dir}} for how we determine the directory on
#' your machine to save files to, and run
#' \code{rappdirs::user_cache_dir("rnoaa/ghcnd")} to get that directory.
#'
#' Note that between versions of \pkg{rnoaa} you may want to clear your
#' cache of ghcnd files IF there are changes in ghcnd functions. See
#' \code{\link{ghcnd_clear_cache}} or you can do so manually.
#'
#' @examples \dontrun{
#' # Get data
#' ghcnd(stationid = "AGE00147704")
#'
#' stations <- ghcnd_stations()
#' ghcnd(stations$id[40])
#' ghcnd(stations$id[4000])
#' ghcnd(stations$id[10000])
#' ghcnd(stations$id[80000])
#' ghcnd(stations$id[80300])
#'
#' library("dplyr")
#' ghcnd(stations$id[80300]) %>% select(id, element) %>% slice(1:3)
#'
#' # manipulate data
#' ## using built in fxns
#' dat <- ghcnd(stationid = "AGE00147704")
#' (alldat <- ghcnd_splitvars(dat))
#' library("ggplot2")
#' ggplot(subset(alldat$tmax, tmax >= 0), aes(date, tmax)) + geom_point()
#'
#' ## using dplyr
#' library("dplyr")
#' dat <- ghcnd(stationid = "AGE00147704")
#' dat %>%
#' filter(element == "PRCP", year == 1909)
#' }
ghcnd <- function(stationid, ...){
calls <- names(sapply(match.call(), deparse))[-1]
calls_vec <- "path" %in% calls
if (any(calls_vec)) {
stop("The parameter path has been removed, see docs for ?ghcnd",
call. = FALSE)
}
path <- file.path(rnoaa_cache_dir(), "ghcnd")
csvpath <- ghcnd_local(stationid, path)
if (!is_ghcnd(x = csvpath)) {
res <- ghcnd_GET(path, stationid, ...)
} else {
res <- read.csv(csvpath, stringsAsFactors = FALSE)
colClasses = ghcnd_col_classes)
}
res <- tibble::as_data_frame(res)
attr(res, 'source') <- csvpath
res
}
ghcnd_col_classes <- c(
"character", "integer", "integer", "character",
rep(c("integer", "character", "character", "character"), times = 31)
)
fm <- function(n) {
gsub("\\s", "0", n)
}
#' Get information on the GHCND weather stations
#'
#' This function returns an object with a dataframe with meta-information about
#' all available GHCND weather stations.
#'
#' @export
#' @inheritParams ghcnd
#'
#' @return This function returns a tibble (dataframe) with a weather
#' station on each row with the following columns:
#' \itemize{
#' \item \code{id}: The weather station's ID number. The first two letters
#' denote the country (using FIPS country codes).
#' \item \code{latitude}: The station's latitude, in decimal degrees.
#' Southern latitudes will be negative.
#' \item \code{longitude}: The station's longitude, in decimal degrees.
#' Western longitudes will be negative.
#' \item \code{elevation}: The station's elevation, in meters.
#' \item \code{name}: The station's name.
#' \item \code{gsn_flag}: "GSN" if the monitor belongs to the GCOS Surface
#' Network (GSN). Otherwise either blank or missing.
#' \item \code{wmo_id}: If the station has a WMO number, this column gives
#' that number. Otherwise either blank or missing.
#' \item \code{element}: A weather variable recorded at some point during
#' that station's history. See the link below in "References" for definitions
#' of the abbreviations used for this variable.
#' \item \code{first_year}: The first year of data available at that station
#' for that weather element.
#' \item \code{last_year}: The last year of data available at that station
#' for that weather element.
#' }
#'
#' If a weather station has data on more than one weather variable, it will
#' be represented in multiple rows of this output dataframe.
#'
#' @note Since this function is pulling a large dataset by ftp, it may take
#' a while to run.
#'
#' @references
#'
#' For more documentation on the returned dataset, see
#' \url{http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt}.
#'
#' @examples \dontrun{
#' # Get stations, ghcnd-stations and ghcnd-inventory merged
#' (stations <- ghcnd_stations())
#'
#' library(dplyr)
#' # filter by state
#' stations %>% filter(state == "IL")
#' stations %>% filter(state == "OR")
#' # those without state values
#' stations %>% filter(state == "")
#' # filter by element
#' stations %>% filter(element == "PRCP")
#' # filter by id prefix
#' stations %>% filter(grepl("^AF", id))
#' stations %>% filter(grepl("^AFM", id))
#' # filter by station long name
#' stations %>% filter(name == "CALLATHARRA")
#' }
ghcnd_stations <- function(...){
sta <- get_stations(...)
inv <- get_inventory(...)
tibble::as_data_frame(merge(sta, inv[, -c(2, 3)], by = "id"))
}
get_stations <- function(...){
res <- GET_retry(
"ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt",
...)
df <- read.fwf(
textConnection(utcf8(res)),
widths = c(11, 9, 11, 7, 2, 31, 5, 10),
header = FALSE, strip.white = TRUE, comment.char = "",
stringsAsFactors = FALSE)
nms <- c("id","latitude", "longitude", "elevation",
"state", "name", "gsn_flag", "wmo_id")
stats::setNames(df, nms)
}
get_inventory <- function(...){
res <- GET_retry(
"ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt", ...)
df <- read.fwf(
textConnection(utcf8(res)),
widths = c(11, 9, 10, 5, 5, 5),
header = FALSE, strip.white = TRUE, comment.char = "",
stringsAsFactors = FALSE)
nms <- c("id","latitude", "longitude",
"element", "first_year", "last_year")
stats::setNames(df, nms)
}
strex <- function(x) str_extract_(x, "[0-9]+")
#' Split variables in data returned from \code{ghcnd}
#'
#' This function is a helper function for \code{\link{ghcnd_search}}. It helps
#' with cleaning up the data returned from \code{\link{ghcnd}}, to get it in a
#' format that is easier to work with.
#'
#' @param x An object returned from \code{\link{ghcnd}}.
#'
#' @author Scott Chamberlain \email{myrmecocystus@@gmail.com},
#' Adam Erickson \email{adam.erickson@@ubc.ca}
#'
#' @export
ghcnd_splitvars <- function(x){
if (!inherits(x, "data.frame")) stop("input must be a data.frame", call. = FALSE)
if (!"id" %in% names(x)) stop("input not of correct format", call. = FALSE)
x <- x[!is.na(x$id), ]
out <- lapply(as.character(unique(x$element)), function(y){
ydat <- x[ x$element == y, ]
dd <- ydat %>%
dplyr::select(-dplyr::contains("FLAG")) %>%
tidyr::gather(var, value, -id, -year, -month, -element) %>%
dplyr::mutate(
day = strex(var),
date = as.Date(sprintf("%s-%s-%s", year, month, day), "%Y-%m-%d")) %>%
dplyr::filter(!is.na(date)) %>%
dplyr::select_(
quote(-element),
quote(-var),
quote(-year),
quote(-month),
quote(-day))
dd <- stats::setNames(dd, c("id", tolower(y), "date"))
mflag <- ydat %>%
dplyr::select(-dplyr::contains("VALUE"), -dplyr::contains("QFLAG"),
-dplyr::contains("SFLAG")) %>%
tidyr::gather(var, value, -id, -year, -month, -element) %>%
dplyr::mutate(
day = strex(var),
date = as.Date(sprintf("%s-%s-%s", year, month, day), "%Y-%m-%d")) %>%
dplyr::filter(!is.na(date)) %>%
dplyr::select(value) %>%
dplyr::rename(mflag = value)
qflag <- ydat %>%
dplyr::select(-dplyr::contains("VALUE"), -dplyr::contains("MFLAG"),
-dplyr::contains("SFLAG")) %>%
tidyr::gather(var, value, -id, -year, -month, -element) %>%
dplyr::mutate(
day = strex(var),
date = as.Date(sprintf("%s-%s-%s", year, month, day), "%Y-%m-%d")) %>%
dplyr::filter(!is.na(date)) %>%
dplyr::select(value) %>%
dplyr::rename(qflag = value)
sflag <- ydat %>%
dplyr::select(-dplyr::contains("VALUE"), -dplyr::contains("QFLAG"),
-dplyr::contains("MFLAG")) %>%
tidyr::gather(var, value, -id, -year, -month, -element) %>%
dplyr::mutate(
day = strex(var),
date = as.Date(sprintf("%s-%s-%s", year, month, day), "%Y-%m-%d")) %>%
dplyr::filter(!is.na(date)) %>%
dplyr::select(value) %>%
dplyr::rename(sflag = value)
dplyr::tbl_df(cbind(dd, mflag, qflag, sflag))
})
stats::setNames(out, tolower(unique(x$element)))
}
# ghcnd_mergevars <- function(x){
# merge(x[[2]], x[[3]] %>% select(-id), by='date')
# }
#' Get meta-data on the GHCND daily data
#'
#' These function allow you to pull the current versions of certain meta-datasets
#' for the GHCND, including lists of country and state abbreviations used in some
#' of the weather station IDs and information about the current version of the
#' data.
#'
#' @inheritParams ghcnd
#'
#' @author Scott Chamberlain \email{myrmecocystus@@gmail.com},
#' Adam Erickson \email{adam.erickson@@ubc.ca}
#'
#' @details Functions:
#' \itemize{
#' \item \code{ghcnd_version}: Get current version of GHCND data
#' \item \code{ghcnd_states}: Get US/Canada state names and 2-letter codes
#' \item \code{ghcnd_countries}: Get country names and 2-letter codes
#' }
#'
#' @examples \dontrun{
#' # Get metadata
#' ghcnd_states()
#' ghcnd_countries()
#' ghcnd_version()
#' }
#'
#' @export
ghcnd_states <- function(...){
res <- GET_retry(
"ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-states.txt", ...)
df <- read.fwf(textConnection(utcf8(res)), widths = c(2, 27),
header = FALSE, strip.white = TRUE, comment.char = "",
stringsAsFactors = FALSE, col.names = c("code","name"))
df[ -NROW(df) ,]
}
#' @export
#' @rdname ghcnd_states
ghcnd_countries <- function(...){
res <- GET_retry(
"ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-countries.txt", ...)
df <- read.fwf(textConnection(utcf8(res)), widths = c(2, 47),
header = FALSE, strip.white = TRUE, comment.char = "",
stringsAsFactors = FALSE, col.names = c("code","name"))
df[ -NROW(df) ,]
}
#' @export
#' @rdname ghcnd_states
ghcnd_version <- function(...){
res <- GET_retry(
"ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-version.txt", ...)
utcf8(res)
}
GET_retry <- function(url, ..., times = 3) {
res <- suppressWarnings(GET(url, ...))
if (res$status_code > 226) {
message("Request failed - Retrying")
stat <- 500
i <- 0
while (stat > 226 && i <= times) {
i <- i + 1
res <- suppressWarnings(GET(url, ...))
stat <- res$status_code
}
if (res$status_code > 226) stop("Request failed, try again", call. = FALSE)
}
return(res)
}
ghcnd_zip <- function(x){
"adf"
}
ghcnd_GET <- function(bp, stationid, ...){
dir.create(bp, showWarnings = FALSE, recursive = TRUE)
fp <- ghcnd_local(stationid, bp)
res <- suppressWarnings(httr::GET(ghcnd_remote(stationid), ...))
tt <- utcf8(res)
vars <- c("id","year","month","element",
do.call("c",
lapply(1:31, function(x) paste0(c("VALUE","MFLAG","QFLAG","SFLAG"), x))))
df <- read.fwf(textConnection(tt), c(11,4,2,4,rep(c(5,1,1,1), 31)),
na.strings = "-9999")
df[] <- Map(function(a, b) {
if (b == "integer") {
a <- as.character(a)
}
suppressWarnings(eval(parse(text = paste0("as.", b)))(a))
}, df, ghcnd_col_classes)
dat <- stats::setNames(df, vars)
write.csv(dat, fp, row.names = FALSE)
return(dat)
}
ghcndbase <- function() "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/all"
ghcnd_remote <- function(stationid) {
file.path(ghcndbase(), paste0(stationid, ".dly"))
}
ghcnd_local <- function(stationid, path) {
file.path(path, paste0(stationid, ".dly"))
}
is_ghcnd <- function(x) if (file.exists(x)) TRUE else FALSE
str_extract_ <- function(string, pattern) {
regmatches(string, regexpr(pattern, string))
}
str_extract_all_ <- function(string, pattern) {
regmatches(string, gregexpr(pattern, string))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.