R/get_station_data.R

Defines functions get_region_data get_state_data get_station_data

Documented in get_region_data get_state_data get_station_data

#' Organize .dly weather data downloaded from the web.
#'
#' These functions are designed to take NOAA weather data organized in .dly
#' format, extract snow depth, WESD, or other climate information, and return
#' a neatly organized data frame.
#'
#' @describeIn get_station_data Uses station identification codes to gather
#'   data from the proper .dly files.
#'
#' @param stations A vector of station identification codes. See the internal
#'   data set ghcnd_stations for station id's and descriptions.
#' @param source If downloading from the web, source should be
#'   "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/all/".
#' @param elem The climate variables to extract. Default is snow depth in mm
#'   ("SNWD") and water depth equivalent of snowfall in mm ("WESD"). For other
#'   climate variables see section three of
#'   ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt.
#' @param progress If true, a text progress bar is printed to the console.
#'   Only relevant when more than one station is requested.
#'
#' @return A data.frame where each row is a measurement of a climate variable
#'   in elem for a single day and station. Columns are:
#'   \describe{
#'     \item{}{\emph{ID} - The station identification code. See the internal
#'     data set ghcnd_stations for station id's and descriptions.}
#'     \item{}{\emph{DATE} - The date of the measurement.}
#'     \item{}{\emph{ELEMENT} - The climate variable measured. See
#'     ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt, section 3  for
#'     a description of climate variables.}
#'     \item{}{\emph{VALUE} - The measurement of the ELEMENT on the given
#'     DATE.}
#'     \item{}{\emph{MFLAG} - A measurement flag. See
#'     ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt, section 3  for
#'     a description of the flag.}
#'     \item{}{\emph{QFLAG} - A quality flag. See
#'     ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt, section 3  for
#'     a description of the flag.}
#'     \item{}{\emph{SFLAG} - A source flag. See
#'     ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt, section 3  for
#'     a description of the flag.}
#'     \item{}{\emph{ECO3} - Identifier for the EPA level 3 ecoregion where the station
#'     is located.}
#'   }
#'
#'
#' @examples
#' get_station_data(ghcnd_stations$ID[10000],
#'                  "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/all/")
#'
#'
#' @export
get_station_data <- function(stations, source, elem = c("SNWD", "WESD"), progress = TRUE){
  # Set constants
  #=============================================================================
  files <- paste0(source, stations, ".dly")
  month_days <- 1:31
  max_days <- max(month_days)

  # Indices for fixed width file item locations identified from:
  # ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
  # Section 3
  id_start <- 1
  id_end <- 11
  year_start <- 12
  year_end <- 15
  month_start <- 16
  month_end <- 17
  elem_start <- 18
  elem_end <- 21
  value_start <- month_days * 8 + 14
  value_end <- value_start + 4
  mflag_start <- value_end + 1
  mflag_end <- mflag_start
  qflag_start <- mflag_start + 1
  qflag_end <- qflag_start
  sflag_start <- qflag_start + 1
  sflag_end <- sflag_start


  # Define helper functions
  #=============================================================================
  # extract()
  #
  # A wrapper for vapply with substring that extracts a vector of characters
  # from each string
  extract <- function(strings, start, end) {
    vapply(strings,
           substring,
           character(length(start)),
           USE.NAMES = FALSE,
           start,
           end)
  }

  # extract_dates()
  #
  # A wrapper for extract that extracts a vector of dates for a given month in
  # each string
  extract_dates <- function(strings) {
    paste(rep(extract(strings, year_start, year_end), each = max_days),
          rep(extract(strings, month_start, month_end), each = max_days),
          rep(month_days, times = length(strings)),
          sep = "-")
  }


  # Extract data for each station in station vector
  #=============================================================================
  data_list <- vector("list", length(files))
  # Create a progress bar for the data download if requested. (utils package)
  if(progress){
    pb <- utils::txtProgressBar(min = 0, max = length(files), style = 3)
  }
  for (i in 1:length(files)) {
    # Read the lines from the FTP file
    dly <- try(readLines(files[i]), silent = TRUE)
    if(inherits(dly, "try-error")){
      print(paste("No viable file found at ", files[i], sep = ""))
      data_list[[i]] <- NULL
      next
    }

    # Remove lines that do not represent requried elements
    dly <- dly[substring(dly, elem_start, elem_end) %in% elem]


    # Manipulate lines into data.frame
    data_list[[i]] <- data.frame(
      "ID" = rep(substring(dly, id_start, id_end),
                 each = max_days),
      "DATE" = as.Date(extract_dates(dly)),
      "ELEMENT" = rep(substring(dly, elem_start, elem_end),
                      each = max_days),
      "VALUE" = as.numeric(extract(dly, value_start, value_end)),
      "MFLAG" = as.character(extract(dly, mflag_start, mflag_end)),
      "QFLAG" = as.character(extract(dly, qflag_start, qflag_end)),
      "SFLAG" = as.character(extract(dly, sflag_start, sflag_end)),
      stringsAsFactors = FALSE
    )

    if (nrow(data_list[[i]]) == 0) {
      data_list[[i]] <- NULL
    }

    if(progress){
      utils::setTxtProgressBar(pb, i)
    }
  }
  if(progress){
    message("") # hack to start next messages on new line
  }

  # Combine into single data.frame and remove missing values
  #=============================================================================
  tdata <- dplyr::bind_rows(data_list)

  # Return data.frame
  if (is.null(tdata) || nrow(tdata) == 0) {
    return(NULL)
  } else {
    tdata <- tdata[tdata$VALUE != -9999,] # -9999 is defined as missing
    return(tdata)
  }

}



#' @describeIn get_station_data Wrapper for get_station_data that extracts all
#'   station data for a given state(s) by passing all stations from internal
#'   data source ghcnd_stations with corresponding state(s).
#' @param states A vector of states, see the internal data set ghcnd_stations
#'   for valid states and associated stations.
#' @export
get_state_data <- function(states, source, elem = c("SNWD", "WESD"), progress = TRUE) {
  ghcnd_stations <- ghcnd_stations
  stations <- ghcnd_stations$ID[ghcnd_stations$STATE %in% states]
  get_station_data(stations, source, elem, progress)
}



#' @describeIn get_station_data Wrapper for get_station_data that extracts all
#'   station data for a given eco region(s) by passing all stations from internal
#'   data source ghcnd_stations with corresponding eco region(s).
#' @param eco_regions A vector of eco_regions, see the internal data set
#'   ghcnd_stations for valid eco_regions and associated stations. eco_regions
#'   can be either level 3 eco regions (e.g. "8.2.4"), level 2 regions
#'   (e.g. "8.2"), or level 1 regions (e.g. "8").
#' @export
get_region_data <- function(eco_regions, source, elem = c("SNWD", "WESD"), progress = TRUE) {
  # Change eco_regions to search for any given level 1, 2, or 3 vaules
  level1or2 <- suppressWarnings(!is.na(as.numeric(eco_regions)))
  eco_regions[level1or2] <- paste0(eco_regions[level1or2], "\\.")
  eco_regions <- paste0("^", eco_regions, collapse = "|")

  # Get stations
  ghcnd_stations <- ghcnd_stations
  stations <- ghcnd_stations$ID[grepl(eco_regions, ghcnd_stations$ECO3)]
  get_station_data(stations, source, elem, progress)
}
scoutiii/HTSoutliers documentation built on April 4, 2021, 4:47 p.m.