# Copyright (C) 2016 Ramon Novoa <ramonnovoa AT gmail DOT com>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Load the magnitude and station datasets into this environment.
data(madrid.air.magnitudes, madrid.air.stations, envir = environment())
# Suppress warnings: the names below are only used as bare column names inside
# the tidyr/dplyr calls further down, so give each one a top-level binding.
Month <- Day <- Hour <- Value <- ValidationCode <- NULL
################################################################################
#' Air quality data parser.
#'
#' Parses raw air quality data published by \url{http://datos.madrid.es/} and
#' returns a clean data frame.
#'
#' The first record of the file is inspected to decide how to read it: a period
#' code of 2 selects the hourly parser and a period code of 4 selects the daily
#' parser. Anything else, including a first record that is too short to hold a
#' period code, is rejected with the error \code{"Invalid file format."}.
#'
#' @param input Input file containing the raw air quality data.
#' @param output Optional CSV file where the air quality data frame will be
#'   saved. Nothing is written when it is \code{NA} or \code{NULL}.
#' @param station Filter data by station code (see
#'   \code{\link{madrid.air.stations}}). May be a single code or a vector of
#'   codes; \code{NA} or \code{NULL} disables the filter.
#' @param magnitude Filter data by magnitude name (see
#'   \code{\link{madrid.air.magnitudes}}). May be a single name or a vector of
#'   names; \code{NA} or \code{NULL} disables the filter.
#' @return A data frame containing air quality data.
#' @examples
#' # Download and parse daily data.
#' download.file('http://datos.madrid.es/egob/catalogo/201410-14-calidad-aire-diario.txt',
#'               '201410-14-calidad-aire-diario.txt')
#' madrid.air.parse('201410-14-calidad-aire-diario.txt')
#'
#' # Save air quality data to disk.
#' madrid.air.parse('201410-14-calidad-aire-diario.txt',
#'                  output='201410-14-calidad-aire-diario.csv')
#'
#' # Download and parse hourly data.
#' download.file('http://datos.madrid.es/egob/catalogo/201200-29-calidad-aire-horario.zip',
#'               '201200-29-calidad-aire-horario.zip')
#' unzip('201200-29-calidad-aire-horario.zip')
#' madrid.air.parse('Ene_mo01.txt')
#'
#' # Check station information.
#' data(madrid.air.stations)
#' madrid.air.stations
#'
#' # Filter by station.
#' madrid.air.parse('Ene_mo01.txt', station=28079001)
#'
#' # Check magnitude information.
#' data(madrid.air.magnitudes)
#' madrid.air.magnitudes
#'
#' # Filter by magnitude.
#' madrid.air.parse('Ene_mo01.txt', magnitude='CO')
#' @export
################################################################################
madrid.air.parse <- function(input,
                             output = NA,
                             station = NA,
                             magnitude = NA) {
  # An optional argument counts as supplied unless it is NULL, empty or all NA.
  # This keeps `if` conditions scalar even when a vector of codes is passed.
  is_supplied <- function(x) {
    length(x) > 0 && !all(is.na(x))
  }
  # TRUE when a header field was read as a usable number.
  is_valid_code <- function(x) {
    is.numeric(x) && length(x) == 1 && !is.na(x)
  }

  # Read the first record to parse the year and period.
  header <- utils::read.fwf(input, widths = c(8, 2, 2, 2, 2, 2, 2),
                            stringsAsFactors = FALSE, n = 1)
  period <- header[1, 4]
  year_code <- header[1, 5]

  # A truncated or non-numeric first record leaves the period as NA or text;
  # report it as a format problem instead of failing inside `if`.
  if (!is_valid_code(period)) {
    stop("Invalid file format.")
  }

  # Parse air quality data.
  if (period == 2) {
    air_data <- madrid.air.parser.hourly(input)
  } else if (period == 4) {
    # Only the daily layout depends on the year, so only validate it here.
    if (!is_valid_code(year_code)) {
      stop("Invalid file format.")
    }
    air_data <- madrid.air.parser.daily(input, year_code + 2000)
  } else {
    stop("Invalid file format.")
  }

  # Filter by station. %in% accepts one code or several.
  if (is_supplied(station)) {
    air_data <- air_data[air_data$Station %in% station, ]
  }
  # Filter by magnitude. %in% accepts one name or several.
  if (is_supplied(magnitude)) {
    air_data <- air_data[air_data$Magnitude %in% magnitude, ]
  }
  # Save the dataset to disk.
  if (is_supplied(output)) {
    utils::write.csv(air_data, file = output, row.names = FALSE, quote = FALSE)
  }
  air_data
}
################################################################################
################################################################################
## Helper functions.
################################################################################
################################################################################
################################################################################
# Parses daily air quality data and returns a data frame.
#
# Args:
#   input: Source file containing the raw air quality data.
#   year: The year the data was collected. Files from before 2011 are read
#     with one extra two-character column, which is then discarded.
#   output: Unused. Kept so that existing calls passing it keep working.
#
# Returns:
#   A data frame with the columns Station, Year, Month, Day, Magnitude and
#   Value, one row per valid daily observation, sorted by date.
################################################################################
madrid.air.parser.daily <- function(input, year, output = NA) {
  # Local binding for the bare column name used in arrange() below (same
  # purpose as the file-level NULL assignments for Month, Day, ...).
  Year <- NULL

  # There was an extra column before 2011 with the value 00 in daily
  # data.
  if (year < 2011) {
    air_data <- utils::read.fwf(input,
                                widths = c(8, 2, 2, 2, 2, 2, 2, rep(6, 31)),
                                stringsAsFactors = FALSE)
    # Remove the extra column.
    air_data <- air_data[, -7]
  } else {
    air_data <- utils::read.fwf(input,
                                widths = c(8, 2, 2, 2, 2, 2, rep(6, 31)),
                                stringsAsFactors = FALSE)
  }
  # Remove the measurement technique and period columns.
  air_data <- air_data[, -c(3, 4)]
  # Set appropriate column names; the 31 value columns are named after the
  # day of the month they hold.
  names(air_data) <- c("Station", "Magnitude", "Year", "Month", c(1:31))
  # Leave one daily observation per row.
  air_data <- tidyr::gather(air_data, Day, Value, 5:35)
  # Perform common parsing operations.
  air_data <- madrid.air.parser.common(air_data)
  # Rearrange the columns.
  air_data <- air_data[, c("Station", "Year", "Month", "Day", "Magnitude",
                           "Value")]
  # Sort by date. Year leads so that input covering more than one year is not
  # interleaved by month and day.
  air_data <- dplyr::arrange(air_data, Year, Month, Day)
  air_data
}
################################################################################
# Parses hourly air quality data and returns a data frame.
#
# Args:
#   input: Source file containing the raw air quality data.
#
# Returns:
#   A data frame with the columns Station, Year, Month, Day, Hour, Magnitude
#   and Value, one row per valid hourly observation, sorted by date and hour.
################################################################################
madrid.air.parser.hourly <- function(input) {
  # Local binding for the bare column name used in arrange() below (same
  # purpose as the file-level NULL assignments for Month, Day, ...).
  Year <- NULL

  air_data <- utils::read.fwf(input,
                              widths = c(8, 2, 2, 2, 2, 2, 2, rep(6, 24)),
                              stringsAsFactors = FALSE)
  # Remove the measurement technique and period columns.
  air_data <- air_data[, -c(3, 4)]
  # Set appropriate column names; the 24 value columns are named after the
  # hour they hold.
  names(air_data) <- c("Station", "Magnitude", "Year", "Month", "Day",
                       c(1:24))
  # Leave one hourly observation per row.
  air_data <- tidyr::gather(air_data, Hour, Value, 6:29)
  # Perform common parsing operations.
  air_data <- madrid.air.parser.common(air_data)
  # Rearrange the columns.
  air_data <- air_data[, c("Station", "Year", "Month", "Day", "Hour",
                           "Magnitude", "Value")]
  # gather() stores the former column names as text, so make Hour an integer
  # before sorting on it.
  air_data$Hour <- as.integer(air_data$Hour)
  # Sort by date. Year leads so that input covering more than one year is not
  # interleaved by month and day.
  air_data <- dplyr::arrange(air_data, Year, Month, Day, Hour)
  air_data
}
################################################################################
# Performs the parsing steps shared by daily and hourly data.
#
# Args:
#   air_data: A data frame containing daily or hourly air quality data, with
#     one raw observation per row in the Value column.
#
# Returns:
#   A data frame holding only validated observations of known magnitudes, with
#   current station codes, four-digit years and fixed column types.
################################################################################
madrid.air.parser.common <- function(air_data) {
  # The first five characters of each raw reading are the value; whatever
  # follows is the validation code.
  air_data <- tidyr::separate(air_data, Value, c("Value", "ValidationCode"),
                              sep = 5)

  # Only validated readings (code "V") are kept. Ending up with none means the
  # input did not have the expected layout.
  air_data <- dplyr::filter(air_data, ValidationCode == "V")
  if (nrow(air_data) == 0) {
    stop("Invalid file format.")
  }

  # The validation code has served its purpose; drop the column.
  keep_column <- !(names(air_data) %in% "ValidationCode")
  air_data <- air_data[, keep_column]

  # Readings become double precision numbers.
  air_data$Value <- as.double(air_data$Value)

  # Magnitude codes become a factor labelled with their abbreviations.
  air_data$Magnitude <- factor(air_data$Magnitude,
                               levels = madrid.air.magnitudes[, "Magnitude"],
                               labels = madrid.air.magnitudes[, "Abbreviation"])
  # Codes missing from the magnitude table turn into NA above; discard them.
  # Note: There is no description for magnitude 85 in the docs!!!
  known_magnitude <- !is.na(air_data$Magnitude)
  air_data <- air_data[known_magnitude, ]

  # Some station codes changed after 2011. Each row maps an old code (first
  # column) to the code that replaced it (second column).
  station_renames <- rbind(
    c(28079003, 28079035),  # Pza. del Carmen.
    c(28079005, 28079039),  # Barrio del Pilar.
    c(28079010, 28079038),  # Cuatro Caminos.
    c(28079013, 28079040),  # Vallecas.
    c(28079020, 28079036),  # Moratalaz.
    c(28079086, 28079060)   # Tres Olivos.
  )
  for (i in seq_len(nrow(station_renames))) {
    old_code <- station_renames[i, 1]
    new_code <- station_renames[i, 2]
    air_data[air_data$Station == old_code, "Station"] <- new_code
  }

  # Two-digit years are offsets from 2000; store the full year as an integer.
  air_data$Year <- as.integer(air_data$Year + 2000)
  # Fix the remaining column types.
  air_data$Month <- as.integer(air_data$Month)
  air_data$Day <- as.integer(air_data$Day)
  air_data$Station <- as.factor(air_data$Station)
  air_data
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.