# R/import.R
#
# Defines functions importGiosFromCSV importOneCSV importGiosFromXLSX importOneXLSX isAvailable
#
# Documented in importGiosFromCSV importGiosFromXLSX importOneCSV importOneXLSX isAvailable

#' Check whether measurements are available for a given station, year and pollutant.
#'
#' @param chosenYear year of measurements
#' @param chosenPolutant measured pollutant
#' @param station station of interest. Please provide the current station names
#'        (the values, not the names, of the stationCodes vector)
#'
#' @return logical value: TRUE if the data is available, FALSE if it's not
#'
#' @export
#'

isAvailable <- function(chosenYear, chosenPolutant, station) {
  # Looks up the package-level `availability` table: one row per
  # year/polutant pair, one column per station.
  #
  # chosenYear, chosenPolutant: values compared against the `year` and
  #   `polutant` columns.
  # station: name(s) of the station column(s) to return.
  #
  # Returns the unnamed values of the requested station column(s) for the
  # matching rows (an empty vector when no row matches).

  # which() discards comparisons that evaluate to NA, so rows with a missing
  # year or polutant are never selected.
  matchingRows <- which(availability[["year"]] == chosenYear &
                          availability[["polutant"]] == chosenPolutant)

  # Columns are selected by their literal names. The previous
  # dplyr::select_(.dots = station) is deprecated and parsed the string as an
  # R expression, which misreads non-syntactic column names.
  unlist(availability[matchingRows, station, drop = FALSE], use.names = FALSE)
}


#' Import from one GIOŚ .xlsx file.
#'
#' @param station chr, name of the chosen station.
#' @param polutant chr, name of chosen polutant.
#' @param year chr, chosen year (yyyy).
#' @param path chr, path to folder containing .xlsx file or path to .xlsx file.
#' @param noHours chr, 1 or 24, defaults to 1 (hourly meas.)
#' @param skip int, number of rows to skip, defaults to 3.
#'        Do not change if unchanged GIOŚ files are used.
#' @param exact lgl, TRUE if path is a path to .xlsx file.
#'
#' @return tibble
#'

importOneXLSX <- function(station, polutant, year, path = getwd(),
                          noHours = "1", skip = 3, exact = FALSE) {
  # Reads the measurements of one station from a single .xlsx file and
  # returns a tibble with columns measDate, measurement, station, polutant.
  # An empty tibble is returned when no file or no matching station column
  # is found; an error is raised when the request matches several files.

  # Placeholder for "nothing found". `measurement` is numeric so that it has
  # the same type as in a non-empty result and can be row-bound with it.
  emptyFrame <- tibble::tibble(measDate = character(0),
                               station = character(0),
                               polutant = character(0),
                               measurement = numeric(0))

  if (exact) {
    # `path` already points at the file, so it must not be prefixed with
    # itself (the previous paste(path, path) never named an existing file).
    srcFile <- path
  } else {
    # list.files() expects a regular expression, not a glob: anchor the
    # extension so that only names ending in ".xlsx" are considered.
    fileList <- list.files(path, pattern = "\\.xlsx$") %>%
      grep(pattern = year, value = TRUE, fixed = TRUE) %>%
      grep(pattern = polutant, value = TRUE, fixed = TRUE) %>%
      grep(pattern = paste0(noHours, "g"), value = TRUE, fixed = TRUE)
    srcFile <- file.path(path, fileList)
  }

  if (length(srcFile) == 0) return(emptyFrame)
  if (length(srcFile) > 1) {
    # Substring matching can hit several files (e.g. one polutant name
    # contained in another); reading them as one file is not possible.
    stop("More than one file matches the request: ",
         paste(srcFile, collapse = ", "), call. = FALSE)
  }

  colNames <- colnames(readxl::read_excel(srcFile))

  if (!(station %in% colNames)) {
    # The file may list the station under its other code. names(stationCodes)
    # are the old codes and the values are the current ones, so try the
    # translation in both directions and keep what the file actually has.
    alternatives <- c(
      unname(stationCodes[names(stationCodes) %in% station]),
      names(stationCodes)[stationCodes %in% station]
    )
    alternatives <- intersect(alternatives, colNames)
    if (length(alternatives) == 0) return(emptyFrame)
    station <- alternatives[1]
  }

  # Second read: skip the header rows and re-attach the column names taken
  # from the first row of the sheet.
  tmpFrame <- tibble::as_tibble(readxl::read_excel(srcFile, skip = skip,
                                                   col_names = FALSE))
  colnames(tmpFrame) <- colNames
  colnames(tmpFrame)[1] <- "measDate"
  tmpFrame <- tmpFrame[, c("measDate", station)]
  colnames(tmpFrame)[2] <- "measurement"

  # Values may use a decimal comma; normalise before converting to numeric.
  tmpFrame %>%
    dplyr::mutate(measurement = stringr::str_replace_all(measurement, ",", ".")) %>%
    dplyr::mutate(measurement = as.numeric(measurement),
                  station = station,
                  polutant = polutant)

}


#' Import data for one station, multiple years and polutants.
#'
#' @param station chr, name of the chosen station.
#' @param polutants chr, names of chosen polutants.
#' @param years chr, chosen years (yyyy).
#' @param path chr, path to folder containing .xlsx file or path to .xlsx file.
#' @param noHours chr, 1 or 24, defaults to 1 (hourly meas.)
#' @param skip int, number of rows to skip, defaults to 3.
#'        Do not change if unchanged GIOŚ files are used.
#' @param exact lgl, TRUE if path is a path to .xlsx file.
#'
#' @return tibble
#'
#' @export
#'
#' @examples
#' \dontrun{
#' importGiosFromXLSX("DsWrocKorzA", c("NOx", "SO2"), c("2015", "2014"))
#' # Default settings, .xlsx files are in the working directory.
#' importGiosFromXLSX("DsWrocKorzA", c("NOx", "SO2"), c("2015", "2014"), "path-to-the-folder")
#' # Importing from a directory other than the working directory.
#' }
#'

importGiosFromXLSX <- function(station, polutants = NULL, years = NULL, path = getwd(),
                               noHours = "1", skip = 3, exact = FALSE) {
  # Imports data for one station from .xlsx files, once per polutant/year
  # combination, and returns a single tibble with columns station, polutant,
  # measDate (rounded to the hour) and measurement.

  # Scalar conditions: use short-circuit operators. identical() also copes
  # with `path` holding more than one element.
  if (!exact && (is.null(polutants) || is.null(years))) {
    stop("Years and polutants must be given if exact = FALSE")
  }
  if (exact && identical(path, getwd())) {
    stop("Paths to files must be given if exact = TRUE")
  }

  # With exact = TRUE polutants/years are optional; fall back to a single
  # unlabelled pass so that the file is still read.
  polutantLabels <- if (is.null(polutants)) NA_character_ else polutants
  yearLabels <- if (is.null(years)) NA_character_ else years

  # Fill the preallocated slots by position (assigning by name appended new
  # elements and left the preallocated ones unused).
  tmpResult <- vector("list", length(polutantLabels) * length(yearLabels))
  slot <- 0L
  for (i in polutantLabels) {
    for (j in yearLabels) {
      slot <- slot + 1L
      tmpResult[[slot]] <- importOneXLSX(station, i, j, path, noHours, skip, exact)
    }
  }

  # Empty frames are placeholders for missing files; their column types need
  # not match real data, so drop them before binding.
  tmpResult <- Filter(function(frame) nrow(frame) > 0, tmpResult)
  if (length(tmpResult) == 0) {
    return(tibble::tibble(station = character(0),
                          polutant = character(0),
                          measDate = as.POSIXct(character(0), tz = "UTC"),
                          measurement = numeric(0)))
  }

  # Columns are selected by string so they cannot be confused with the
  # function arguments of the same name.
  result <- tmpResult %>%
    dplyr::bind_rows() %>%
    dplyr::select("station", "polutant", "measDate", "measurement") %>%
    dplyr::mutate(measDate = lubridate::ymd_hms(measDate)) %>%
    dplyr::mutate(measDate = lubridate::round_date(measDate, unit = "hour"))

  # Without requested years there is nothing to filter on.
  if (is.null(years)) return(result)

  # Keep only rows that fall into the requested years.
  dplyr::filter(result, lubridate::year(measDate) %in% years)

}


#' Import from one .csv file.
#'
#' @inheritParams importOneXLSX
#'
#' @return tibble
#'

importOneCSV <- function(station, polutant, year, path = getwd(),
                         noHours = "1", skip = 3, exact = FALSE) {
  # Reads the measurements of one station from a single .csv file and
  # returns a tibble with columns measDate, measurement, station, polutant.
  # An empty tibble is returned when no file or no matching station column
  # is found; an error is raised when the request matches several files.

  # Placeholder for "nothing found". `measurement` is numeric so that it has
  # the same type as in a non-empty result and can be row-bound with it.
  emptyFrame <- tibble::tibble(measDate = character(0),
                               station = character(0),
                               polutant = character(0),
                               measurement = numeric(0))

  if (exact) {
    # `path` already points at the file, so it must not be prefixed with
    # itself (the previous paste(path, path) never named an existing file).
    srcFile <- path
  } else {
    # list.files() expects a regular expression, not a glob: anchor the
    # extension so that only names ending in ".csv" are considered.
    fileList <- list.files(path, pattern = "\\.csv$") %>%
      grep(pattern = year, value = TRUE, fixed = TRUE) %>%
      grep(pattern = polutant, value = TRUE, fixed = TRUE) %>%
      grep(pattern = paste0(noHours, "g"), value = TRUE, fixed = TRUE)
    srcFile <- file.path(path, fileList)
  }

  if (length(srcFile) == 0) return(emptyFrame)
  if (length(srcFile) > 1) {
    # Substring matching can hit several files (e.g. one polutant name
    # contained in another); do not silently mix them.
    stop("More than one file matches the request: ",
         paste(srcFile, collapse = ", "), call. = FALSE)
  }

  tmpFrame <- readr::read_csv(srcFile, col_names = TRUE)
  colnames(tmpFrame)[1] <- "measDate"

  # The first line is consumed as column names, so `skip - 1` further header
  # rows remain to be dropped (2 for the default skip = 3). The guard avoids
  # x[-integer(0), ], which would drop every row.
  extraHeaderRows <- max(skip - 1, 0)
  if (extraHeaderRows > 0) {
    tmpFrame <- tmpFrame[-seq_len(extraHeaderRows), ]
  }
  colNames <- colnames(tmpFrame)

  if (!(station %in% colNames)) {
    # The file may list the station under its other code. names(stationCodes)
    # are the old codes and the values are the current ones, so try the
    # translation in both directions and keep what the file actually has.
    alternatives <- c(
      unname(stationCodes[names(stationCodes) %in% station]),
      names(stationCodes)[stationCodes %in% station]
    )
    alternatives <- intersect(alternatives, colNames)
    if (length(alternatives) == 0) return(emptyFrame)
    station <- alternatives[1]
  }

  tmpFrame <- tmpFrame[, c("measDate", station)]
  colnames(tmpFrame)[2] <- "measurement"

  # Values may use a decimal comma; normalise before converting to numeric.
  tmpFrame %>%
    dplyr::mutate(measurement = stringr::str_replace_all(measurement,
                                                         ",", ".")) %>%
    dplyr::mutate(measurement = as.numeric(measurement),
                  station = station,
                  polutant = polutant)
}


#' Import data for one station, multiple years and polutants from .csv files.
#'
#' @inheritParams importGiosFromXLSX
#'
#' @return tibble
#'
#' @export
#'
#' @examples
#' \dontrun{
#' importGiosFromCSV("DsWrocKorzA", c("NOx", "SO2"), c("2015", "2014"))
#' # Default settings, .csv files are in the working directory.
#' importGiosFromCSV("DsWrocKorzA", c("NOx", "SO2"), c("2015", "2014"), "path-to-the-folder")
#' # Importing from a directory other than the working directory.
#' }
#'

importGiosFromCSV <- function(station, polutants = NULL, years = NULL, path = getwd(),
                              noHours = "1", skip = 3, exact = FALSE) {
  # Imports data for one station from .csv files, once per polutant/year
  # combination, and returns a single tibble with columns station, polutant,
  # measDate (rounded to the hour) and measurement.

  # Scalar conditions: use short-circuit operators. identical() also copes
  # with `path` holding more than one element.
  if (!exact && (is.null(polutants) || is.null(years))) {
    stop("Years and polutants must be given if exact = FALSE")
  }
  if (exact && identical(path, getwd())) {
    stop("Paths to files must be given if exact = TRUE")
  }

  # With exact = TRUE polutants/years are optional; fall back to a single
  # unlabelled pass so that the file is still read.
  polutantLabels <- if (is.null(polutants)) NA_character_ else polutants
  yearLabels <- if (is.null(years)) NA_character_ else years

  # Fill the preallocated slots by position (assigning by name appended new
  # elements and left the preallocated ones unused).
  tmpResult <- vector("list", length(polutantLabels) * length(yearLabels))
  slot <- 0L
  for (i in polutantLabels) {
    for (j in yearLabels) {
      slot <- slot + 1L
      tmpResult[[slot]] <- importOneCSV(station, i, j, path, noHours, skip, exact)
    }
  }

  # Empty frames are placeholders for missing files; their column types need
  # not match real data, so drop them before binding.
  tmpResult <- Filter(function(frame) nrow(frame) > 0, tmpResult)
  if (length(tmpResult) == 0) {
    return(tibble::tibble(station = character(0),
                          polutant = character(0),
                          measDate = as.POSIXct(character(0), tz = "UTC"),
                          measurement = numeric(0)))
  }

  # Columns are selected by string so they cannot be confused with the
  # function arguments of the same name.
  result <- tmpResult %>%
    dplyr::bind_rows() %>%
    dplyr::select("station", "polutant", "measDate", "measurement") %>%
    dplyr::mutate(measDate = lubridate::ymd_hms(measDate)) %>%
    dplyr::mutate(measDate = lubridate::round_date(measDate, unit = "hour"))

  # Without requested years there is nothing to filter on.
  if (is.null(years)) return(result)

  # Keep only rows that fall into the requested years.
  dplyr::filter(result, lubridate::year(measDate) %in% years)
}
# mstaniak/giosDownloader documentation built on May 17, 2018, 12:42 a.m.