R/ghcnd.R

Defines functions ghcnd_search ghcnd

Documented in ghcnd ghcnd_search

#' Get a cleaned version of GHCND data from a single weather site
#'
#' This function uses ftp to access the Global Historical Climatology Network
#' daily weather data from NOAA's FTP server for a single weather monitor site.
#' It requires the site identification number for that site and will pull the
#' entire weather dataset for the site. It will then clean this data to convert
#' it to a tidier format and will also, if requested, filter it to a certain
#' date range and to certain weather variables.
#'
#' @export
#' @inheritParams ghcnd
#' @param date_min A character string giving the earliest
#'    date of the daily weather time series that the user would
#'    like in the final output. This character string should be formatted as
#'    "yyyy-mm-dd". If not specified, the default is to keep all daily data for
#'    the queried weather site from the earliest available date.
#' @param date_max A character string giving the latest
#'    date of the daily weather time series that the user would
#'    like in the final output. This character string should be formatted as
#'    "yyyy-mm-dd". If not specified, the default is to keep all daily data for
#'    the queried weather site through the most current available date.
#' @param var A character vector specifying either \code{"all"} (pull all
#'    available weather parameters for the site) or the weather parameters to
#'    keep in the final data (e.g., \code{c("TMAX", "TMIN")} to only keep
#'    maximum and minimum temperature). Example choices for this argument
#'    include:
#'    \itemize{
#'    \item \code{PRCP}: Precipitation, in tenths of millimeters
#'    \item \code{TAVG}: Average temperature, in tenths of degrees Celsius
#'    \item \code{TMAX}: Maximum temperature, in tenths of degrees Celsius
#'    \item \code{TMIN}: Minimum temperature, in tenths of degrees Celsius
#'    }
#'    A full list of possible weather variables is available in NOAA's README
#'    file for the GHCND data
#'    (\url{https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt}).
#'    Most weather stations will only have a small subset of all the possible
#'    weather variables, so the data generated by this function may not include
#'    all of the variables the user specifies through this argument.
#'
#' @return A list object with slots for each of the available specified
#'    weather variables. Each element in the list is a separate time series
#'    dataframe with daily observations, as well as flag values, for one of
#'    the weather variables. The flag values give information on the quality
#'    and source of each observation; see the NOAA README file linked above
#'    for more information.
#'
#' @author Scott Chamberlain \email{myrmecocystus@@gmail.com},
#' Adam Erickson \email{adam.erickson@@ubc.ca}
#'
#' @note This function calls \code{\link{ghcnd}}, which will download and save
#'    data from all available dates and weather variables for the queried
#'    weather station. The step of limiting the dataset to only certain dates
#'    and / or weather variables, using the \code{date_min}, \code{date_max},
#'    and \code{var} arguments, does not occur until after the full data has
#'    been pulled.
#'
#' @seealso \code{\link{meteo_pull_monitors}}, \code{\link{meteo_tidy_ghcnd}}
#'
#' @examples \dontrun{
#' # Search based on variable and/or date
#' ghcnd_search("AGE00147704", var = "PRCP")
#' ghcnd_search("AGE00147704", var = "PRCP", date_min = "1920-01-01")
#' ghcnd_search("AGE00147704", var = "PRCP", date_max = "1915-01-01")
#' ghcnd_search("AGE00147704", var = "PRCP", date_min = "1920-01-01",
#'              date_max = "1925-01-01")
#' ghcnd_search("AGE00147704", date_min = "1920-01-01", date_max = "1925-01-01")
#' ghcnd_search("AGE00147704", var = c("PRCP","TMIN"))
#' ghcnd_search("AGE00147704", var = c("PRCP","TMIN"), date_min = "1920-01-01")
#' ghcnd_search("AGE00147704", var = "adfdf")
#' }
ghcnd_search <- function(stationid, date_min = NULL, date_max = NULL,
                         var = "all", ...) {
  calls <- names(sapply(match.call(), deparse))[-1]
  calls_vec <- "path" %in% calls
  if (any(calls_vec)) {
    stop("The parameter path has been removed, see docs for ?ghcnd_search",
         call. = FALSE)
  }

  dat <- ghcnd_splitvars(ghcnd(stationid))
  possvars <- paste0(names(dat), collapse = ", ")

  if (any(var != "all")) {
    vars_null <- sort(tolower(var))[!sort(tolower(var)) %in% sort(names(dat))]
    dat <- dat[tolower(var)]
  }
  if (any(sapply(dat, is.null))) {
    dat <- noaa_compact(dat)
    warning(
      sprintf("%s not in the dataset\nAvailable variables: %s",
              paste0(vars_null, collapse = ", "), possvars), call. = FALSE)
  }
  if (!is.null(date_min)) {
    dat <- lapply(dat, function(z) z %>% dplyr::filter(date >= date_min))
  }
  if (!is.null(date_max)) {
    dat <- lapply(dat, function(z) z %>% dplyr::filter(date <= date_max))
  }
  dat
}

#' Get all GHCND data from a single weather site
#'
#' This function uses ftp to access the Global Historical Climatology Network
#' daily weather data from NOAA's FTP server for a single weather site. It
#' requires the site identification number for that site and will pull the
#' entire weather dataset for the site.
#'
#' @export
#' @param stationid A character string giving the identification of the weather
#'    station for which the user would like to pull data. To get a full and
#'    current list of stations, the user can use the \code{\link{ghcnd_stations}}
#'    function. To identify stations within a certain radius of a location, the
#'    user can use the \code{\link{meteo_nearby_stations}} function.
#' @param ... Additional curl options to pass through to \code{\link[httr]{GET}}.
#'
#' @return A tibble (data.frame) which contains data pulled from NOAA's FTP
#' server for the queried weather site. A README file with more information
#' about the format of this file is available from NOAA
#' (\url{http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt}).
#' This file is formatted so each line of the file gives the daily weather
#' observations for a single weather variable for all days of one month of
#' one year. In addition to measurements, columns are included for certain
#' flags, which add information on observation sources and quality and are
#' further explained in NOAA's README file for the data.
#'
#' @details  This function saves the full set of weather data for the queried
#' site locally in the directory specified by the \code{path} argument.
#'
#' You can access the path for the cached file via \code{attr(x, "source")}
#'
#' @author Scott Chamberlain \email{myrmecocystus@@gmail.com},
#' Adam Erickson \email{adam.erickson@@ubc.ca}
#'
#' @seealso To generate a weather dataset for a single weather site that has
#' been cleaned to a tidier weather format, the user should use the
#' \code{\link{ghcnd_search}} function, which calls \code{\link{ghcnd}} and then
#' processes the output, or \code{\link{meteo_tidy_ghcnd}}, which wraps the
#' \code{\link{ghcnd_search}} function to output a tidy dataframe. To pull
#' GHCND data from multiple monitors, see \code{\link{meteo_pull_monitors}}.
#'
#' @section File storage:
#' We use \pkg{rappdirs} to store files, see
#' \code{\link[rappdirs]{user_cache_dir}} for how we determine the directory on
#' your machine to save files to, and run
#' \code{rappdirs::user_cache_dir("rnoaa/ghcnd")} to get that directory.
#'
#' Note that between versions of \pkg{rnoaa} you may want to clear your
#' cache of ghcnd files IF there are changes in ghcnd functions. See
#' \code{\link{ghcnd_clear_cache}} or you can do so manually.
#'
#' @examples \dontrun{
#' # Get data
#' ghcnd(stationid = "AGE00147704")
#'
#' stations <- ghcnd_stations()
#' ghcnd(stations$id[40])
#' ghcnd(stations$id[4000])
#' ghcnd(stations$id[10000])
#' ghcnd(stations$id[80000])
#' ghcnd(stations$id[80300])
#'
#' library("dplyr")
#' ghcnd(stations$id[80300]) %>% select(id, element) %>% slice(1:3)
#'
#' # manipulate data
#' ## using built in fxns
#' dat <- ghcnd(stationid = "AGE00147704")
#' (alldat <- ghcnd_splitvars(dat))
#' library("ggplot2")
#' ggplot(subset(alldat$tmax, tmax >= 0), aes(date, tmax)) + geom_point()
#'
#' ## using dplyr
#' library("dplyr")
#' dat <- ghcnd(stationid = "AGE00147704")
#' dat %>%
#'  filter(element == "PRCP", year == 1909)
#' }
ghcnd <- function(stationid, ...){
  calls <- names(sapply(match.call(), deparse))[-1]
  calls_vec <- "path" %in% calls
  if (any(calls_vec)) {
    stop("The parameter path has been removed, see docs for ?ghcnd",
         call. = FALSE)
  }

  path <- file.path(rnoaa_cache_dir(), "ghcnd")
  csvpath <- ghcnd_local(stationid, path)
  if (!is_ghcnd(x = csvpath)) {
    res <- ghcnd_GET(path, stationid, ...)
  } else {
    res <- read.csv(csvpath, stringsAsFactors = FALSE)
                    colClasses = ghcnd_col_classes)
  }
  res <- tibble::as_data_frame(res)
  attr(res, 'source') <- csvpath
  res
}

ghcnd_col_classes <- c(
  "character", "integer", "integer", "character",
  rep(c("integer", "character", "character", "character"), times = 31)
)

fm <- function(n) {
  gsub("\\s", "0", n)
}

#' Get information on the GHCND weather stations
#'
#' This function returns an object with a dataframe with meta-information about
#' all available GHCND weather stations.
#'
#' @export
#' @inheritParams ghcnd
#'
#' @return This function returns a tibble (dataframe) with a weather
#' station on each row with the following columns:
#' \itemize{
#'    \item \code{id}: The weather station's ID number. The first two letters
#'    denote the country (using FIPS country codes).
#'    \item \code{latitude}: The station's latitude, in decimal degrees.
#'    Southern latitudes will be negative.
#'    \item \code{longitude}: The station's longitude, in decimal degrees.
#'    Western longitudes will be negative.
#'    \item \code{elevation}: The station's elevation, in meters.
#'    \item \code{name}: The station's name.
#'    \item \code{gsn_flag}: "GSN" if the monitor belongs to the GCOS Surface
#'    Network (GSN). Otherwise either blank or missing.
#'    \item \code{wmo_id}: If the station has a WMO number, this column gives
#'    that number. Otherwise either blank or missing.
#'    \item \code{element}: A weather variable recorded at some point during
#'    that station's history. See the link below in "References" for definitions
#'    of the abbreviations used for this variable.
#'    \item \code{first_year}: The first year of data available at that station
#'    for that weather element.
#'    \item \code{last_year}: The last year of data available at that station
#'    for that weather element.
#' }
#'
#' If a weather station has data on more than one weather variable, it will
#' be represented in multiple rows of this output dataframe.
#'
#' @note Since this function is pulling a large dataset by ftp, it may take
#' a while to run.
#'
#' @references
#'
#' For more documentation on the returned dataset, see
#' \url{http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt}.
#'
#' @examples \dontrun{
#' # Get stations, ghcnd-stations and ghcnd-inventory merged
#' (stations <- ghcnd_stations())
#'
#' library(dplyr)
#' # filter by state
#' stations %>% filter(state == "IL")
#' stations %>% filter(state == "OR")
#' # those without state values
#' stations %>% filter(state == "")
#' # filter by element
#' stations %>% filter(element == "PRCP")
#' # filter by id prefix
#' stations %>% filter(grepl("^AF", id))
#' stations %>% filter(grepl("^AFM", id))
#' # filter by station long name
#' stations %>% filter(name == "CALLATHARRA")
#' }
ghcnd_stations <- function(...){
  sta <- get_stations(...)
  inv <- get_inventory(...)
  tibble::as_data_frame(merge(sta, inv[, -c(2, 3)], by = "id"))
}

get_stations <- function(...){
  res <- GET_retry(
    "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt",
    ...)
  df <- read.fwf(
    textConnection(utcf8(res)),
    widths = c(11, 9, 11, 7, 2, 31, 5, 10),
    header = FALSE, strip.white = TRUE, comment.char = "",
    stringsAsFactors = FALSE)
  nms <- c("id","latitude", "longitude", "elevation",
           "state", "name", "gsn_flag", "wmo_id")
  stats::setNames(df, nms)
}

get_inventory <- function(...){
  res <- GET_retry(
    "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt", ...)
  df <- read.fwf(
    textConnection(utcf8(res)),
    widths = c(11, 9, 10, 5, 5, 5),
    header = FALSE, strip.white = TRUE, comment.char = "",
    stringsAsFactors = FALSE)
  nms <- c("id","latitude", "longitude",
           "element", "first_year", "last_year")
  stats::setNames(df, nms)
}

strex <- function(x) str_extract_(x, "[0-9]+")

#' Split variables in data returned from \code{ghcnd}
#'
#' This function is a helper function for \code{\link{ghcnd_search}}. It helps
#' with cleaning up the data returned from \code{\link{ghcnd}}, to get it in a
#' format that is easier to work with.
#'
#' @param x An object returned from \code{\link{ghcnd}}.
#'
#' @author Scott Chamberlain \email{myrmecocystus@@gmail.com},
#' Adam Erickson \email{adam.erickson@@ubc.ca}
#'
#' @export
ghcnd_splitvars <- function(x){
  if (!inherits(x, "data.frame")) stop("input must be a data.frame", call. = FALSE)
  if (!"id" %in% names(x)) stop("input not of correct format", call. = FALSE)
  x <- x[!is.na(x$id), ]
  out <- lapply(as.character(unique(x$element)), function(y){
    ydat <- x[ x$element == y, ]

    dd <- ydat %>%
      dplyr::select(-dplyr::contains("FLAG")) %>%
      tidyr::gather(var, value, -id, -year, -month, -element) %>%
      dplyr::mutate(
        day = strex(var),
        date = as.Date(sprintf("%s-%s-%s", year, month, day), "%Y-%m-%d")) %>%
      dplyr::filter(!is.na(date)) %>%
      dplyr::select_(
        quote(-element),
        quote(-var),
        quote(-year),
        quote(-month),
        quote(-day))
    dd <- stats::setNames(dd, c("id", tolower(y), "date"))

    mflag <- ydat %>%
      dplyr::select(-dplyr::contains("VALUE"), -dplyr::contains("QFLAG"),
                    -dplyr::contains("SFLAG")) %>%
      tidyr::gather(var, value, -id, -year, -month, -element) %>%
      dplyr::mutate(
        day = strex(var),
        date = as.Date(sprintf("%s-%s-%s", year, month, day), "%Y-%m-%d")) %>%
      dplyr::filter(!is.na(date)) %>%
      dplyr::select(value) %>%
      dplyr::rename(mflag = value)

    qflag <- ydat %>%
      dplyr::select(-dplyr::contains("VALUE"), -dplyr::contains("MFLAG"),
                    -dplyr::contains("SFLAG")) %>%
      tidyr::gather(var, value, -id, -year, -month, -element) %>%
      dplyr::mutate(
        day = strex(var),
        date = as.Date(sprintf("%s-%s-%s", year, month, day), "%Y-%m-%d")) %>%
      dplyr::filter(!is.na(date)) %>%
      dplyr::select(value) %>%
      dplyr::rename(qflag = value)

    sflag <- ydat %>%
      dplyr::select(-dplyr::contains("VALUE"), -dplyr::contains("QFLAG"),
                    -dplyr::contains("MFLAG")) %>%
      tidyr::gather(var, value, -id, -year, -month, -element) %>%
      dplyr::mutate(
        day = strex(var),
        date = as.Date(sprintf("%s-%s-%s", year, month, day), "%Y-%m-%d")) %>%
      dplyr::filter(!is.na(date)) %>%
      dplyr::select(value) %>%
      dplyr::rename(sflag = value)

    dplyr::tbl_df(cbind(dd, mflag, qflag, sflag))
  })
  stats::setNames(out, tolower(unique(x$element)))
}

# ghcnd_mergevars <- function(x){
#   merge(x[[2]], x[[3]] %>% select(-id), by='date')
# }

#' Get meta-data on the GHCND daily data
#'
#' These function allow you to pull the current versions of certain meta-datasets
#' for the GHCND, including lists of country and state abbreviations used in some
#' of the weather station IDs and information about the current version of the
#' data.
#'
#' @inheritParams ghcnd
#'
#' @author Scott Chamberlain \email{myrmecocystus@@gmail.com},
#' Adam Erickson \email{adam.erickson@@ubc.ca}
#'
#' @details Functions:
#' \itemize{
#'  \item \code{ghcnd_version}: Get current version of GHCND data
#'  \item \code{ghcnd_states}: Get US/Canada state names and 2-letter codes
#'  \item \code{ghcnd_countries}: Get country names and 2-letter codes
#' }
#'
#' @examples \dontrun{
#' # Get metadata
#' ghcnd_states()
#' ghcnd_countries()
#' ghcnd_version()
#' }
#'
#' @export
ghcnd_states <- function(...){
  res <- GET_retry(
    "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-states.txt", ...)
  df <- read.fwf(textConnection(utcf8(res)), widths = c(2, 27),
                 header = FALSE, strip.white = TRUE, comment.char = "",
                 stringsAsFactors = FALSE, col.names = c("code","name"))
  df[ -NROW(df) ,]
}

#' @export
#' @rdname ghcnd_states
ghcnd_countries <- function(...){
  res <- GET_retry(
    "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-countries.txt", ...)
  df <- read.fwf(textConnection(utcf8(res)), widths = c(2, 47),
                 header = FALSE, strip.white = TRUE, comment.char = "",
                 stringsAsFactors = FALSE, col.names = c("code","name"))
  df[ -NROW(df) ,]
}

#' @export
#' @rdname ghcnd_states
ghcnd_version <- function(...){
  res <- GET_retry(
    "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-version.txt", ...)
  utcf8(res)
}

GET_retry <- function(url, ..., times = 3) {
  res <- suppressWarnings(GET(url, ...))
  if (res$status_code > 226) {
    message("Request failed - Retrying")
    stat <- 500
    i <- 0
    while (stat > 226 && i <= times) {
      i <- i + 1
      res <- suppressWarnings(GET(url, ...))
      stat <- res$status_code
    }
    if (res$status_code > 226) stop("Request failed, try again", call. = FALSE)
  }
  return(res)
}

ghcnd_zip <- function(x){
  "adf"
}

ghcnd_GET <- function(bp, stationid, ...){
  dir.create(bp, showWarnings = FALSE, recursive = TRUE)
  fp <- ghcnd_local(stationid, bp)
  res <- suppressWarnings(httr::GET(ghcnd_remote(stationid), ...))
  tt <- utcf8(res)
  vars <- c("id","year","month","element",
            do.call("c",
      lapply(1:31, function(x) paste0(c("VALUE","MFLAG","QFLAG","SFLAG"), x))))
  df <- read.fwf(textConnection(tt), c(11,4,2,4,rep(c(5,1,1,1), 31)),
                 na.strings = "-9999")
  df[] <- Map(function(a, b) {
    if (b == "integer") {
      a <- as.character(a)
    }
    suppressWarnings(eval(parse(text = paste0("as.", b)))(a))
  }, df, ghcnd_col_classes)
  dat <- stats::setNames(df, vars)
  write.csv(dat, fp, row.names = FALSE)
  return(dat)
}

ghcndbase <- function() "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/all"
ghcnd_remote <- function(stationid) {
  file.path(ghcndbase(), paste0(stationid, ".dly"))
}
ghcnd_local <- function(stationid, path) {
  file.path(path, paste0(stationid, ".dly"))
}
is_ghcnd <- function(x) if (file.exists(x)) TRUE else FALSE
str_extract_ <- function(string, pattern) {
  regmatches(string, regexpr(pattern, string))
}
str_extract_all_ <- function(string, pattern) {
  regmatches(string, gregexpr(pattern, string))
}
leighseverson/rnoaa documentation built on May 21, 2019, 3:06 a.m.