R/download-data.R

Defines functions qlt_download_data

Documented in qlt_download_data

#' Download QLT data
#'
#' Downloads QLT data from SQLite database.
#'
#' @param station A character vector of the station codes.
#' @param frequency A string of the period to average by (by default 'hourly' but can also be 'raw' or 'daily').
#' @param start_date A Date element of the beginning of the time series.
#' @param end_date A Date element of the end of the time series.
#' @param status A string indicating which type of values should be included possible
#' values are 'reasonable', 'questionable' and 'erroneous'. The status of the values
#' is indicated in the returned data frame.
#' @param surrogate A string indicating whether to exclude surrogates ('none'),
#' replace missing values with surrogates ('missing')
#' or include all surrogates ('duplicate')
#' @param add_missing A flag indicating whether to add missing values.
#' @param file A string of the name of the SQLite database to create (without the extension .sqlite).
#' @param dir A string of the directory.
#' @return A tbl data frame.
#' @export
qlt_download_data <- function(
  station = "DDM",
  frequency = "hourly",
  start_date = as.Date("2016-01-01"),
  end_date = Sys.Date(),
  status = "questionable",
  surrogate = "missing",
  add_missing = TRUE,
  file = getOption("kootqlt.file", "kootqlt"),
  dir = getOption("kootqlt.dir", ".")) {

  check_vector(station, kootqlt::station$Station, length = c(1, Inf))
  check_vector(frequency, c("raw", "hourly", "daily"), length = 1)
  check_date(start_date)
  check_vector(end_date)
  check_vector(status, c("reasonable", "questionable", "erroneous"), length = 1)
  check_vector(surrogate, c("none", "missing", "duplicate"), length = 1)
  check_flag(add_missing)
  check_string(dir)
  check_string(file)

  if (end_date > Sys.Date()) ps_error("end_date cannot be in the future")
  if (end_date < start_date) ps_error("end_date must be after start_date")

  if(dir != ".") file %<>% file.path(dir, .)
  file %<>% paste0(".sqlite")

  if (!file.exists(file)) ps_error("file '", file, "' does not exist")

  conn <- DBI::dbConnect(RSQLite::SQLite(), file)
  DBI::dbGetQuery(conn, "PRAGMA foreign_keys = ON;")
  on.exit(DBI::dbDisconnect(conn))

  data <- dplyr::tbl(conn, "Download") %>%
    dplyr::filter_(~Station %in% station) %>%
    dplyr::collect(n = Inf)

  data$DateTimeReading %<>% as.POSIXct(tz = "Etc/GMT+8")

  data$Date <- lubridate::date(data$DateTimeReading)
  data %<>% dplyr::filter_(~Date >= start_date, ~Date <= end_date)

  if (frequency != "raw") {
    if (frequency == "daily") lubridate::hour(data$DateTimeReading) <- 0L
    lubridate::minute(data$DateTimeReading) <- 0L
    lubridate::second(data$DateTimeReading) <- 0L
    data %<>%
      dplyr::group_by(.data$Station, .data$DateTimeReading, .data$Date, .data$Surrogate) %>%
      dplyr::summarise_("Value" = "mean(Value)", "Status" = "max(Status)") %>%
      dplyr::ungroup()
  }

  status <- switch(status, "reasonable" = 1, "questionable" = 2, "erroneous" = 3)

  data %<>% dplyr::filter_(~Status <= status)

  data$Status %<>% ordered(levels = 1:3)
  levels(data$Status) <- list(Reasonable = "1",
                              Questionable = "2",
                              Erroneous = "3",
                              Missing = "4")

  data %<>%
    dplyr::inner_join(kootqlt::station, by = "Station") %>%
    dplyr::select_(~DateTimeReading, ~Station,
                   ~Value, ~Variable, ~Units,
                   ~Surrogate, ~Status, ~Location)

  is.na(data$Surrogate[data$Surrogate == "no"]) <- TRUE

  if (surrogate == "none") {
    data %<>% dplyr::filter_(~is.na(Surrogate))
  } else if(surrogate == "missing") {
    surrogate_data <- dplyr::filter_(data, ~!is.na(Surrogate)) %>%
      dplyr::filter_(~!is.na(Value))

    data %<>% dplyr::filter_(~is.na(Surrogate))

    if(nrow(surrogate_data)) {
      surrogate_data %<>% dplyr::anti_join(data, by = c("Station", "DateTimeReading"))
      dplyr::bind_rows(data, surrogate_data)
    }
  }
  if(add_missing) {
    data %<>% poisdata::ps_add_missing_sequence(
      "DateTimeReading", by = c("Station", "Variable", "Units", "Location"))
    data$Status[is.na(data$Status)] <- "Missing"
  }
  data %<>% dplyr::arrange(.data$Station, .data$DateTimeReading) %>%
    dplyr::as.tbl()
  data
}
poissonconsulting/kootqlt documentation built on May 25, 2019, 10:25 a.m.