hkdata:

Documented in aqhi_24hr aqhi_24hr_retrieve aqhi_24hr_url aqhi_current aqhi_current_retrieve aqhi_current_url aqhi_range_forecast aqhi_range_forecast_retrieve aqhi_range_forecast_url lamp_posts_data lamp_posts_data_url past_aqhi past_aqhi_url past_pollution_index past_pollution_index_url pollutant_24hr pollutant_24hr_retrieve pollutant_24hr_url towngas_performance_data towngas_performance_data_url

#-----------------------------------------------------------------------------#
# HONG KONG POLLUTION DATA
#-----------------------------------------------------------------------------#

# Air Quality Health Index ------------------------------------------------

#' URL for Air Quality Health Index (AQHI) at individual general and roadside Air Quality Monitoring stations for the past 24 hours
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past24hr-aqhi-of-individual-air-quality-monitoring-stations}
#'
#' @param lang select the language of the available data, available choices are
#'   "en" for English, "tc" for traditional Chinese, "sc" for simplified Chinese
#'
#' @export
#'
aqhi_24hr_url <- function(lang = c("en", "tc", "sc")) {
  lang = match.arg(lang)
  switch(lang,
         en = "http://www.aqhi.gov.hk/epd/ddata/html/out/24aqhi_Eng.xml",
         tc = "http://www.aqhi.gov.hk/epd/ddata/html/out/24aqhi_ChT.xml",
         sc = "http://www.aqhi.gov.hk/epd/ddata/html/out/24aqhi_ChS.xml"
         )
}

#' Retrieve Air Quality Health Index (AQHI) at individual general and roadside
#' Air Quality Monitoring stations for the past 24 hours
#' 
#' @references
#' \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past24hr-aqhi-of-individual-air-quality-monitoring-stations}
#'
#' @param data_url the url to the specific file, e.g. as returned by
#'   hist_file_url() for a given timestamp
#'
#' @export
#' 
aqhi_24hr_retrieve <- function(data_url) {
  require(rvest)
  require(httr)
  require(dplyr)
  
  res <- content(GET(data_url), encoding = 'UTF-8')
  if (identical("NOT FOUND", res$message)) {
    stop("Unable to retrieve information, input timestamp may not be available
         - try to find the available timestamps.")
  }
  # pick out the components
  res_headers <- sapply(c("title", "link", "description", "language", "copyright", "webMaster", "lastBuildDate"),
                        FUN = function(n) {res %>% html_node(xpath = n) %>% html_text()},
                        simplify = FALSE, USE.NAMES = TRUE)
  res_items <- lapply(res %>% html_nodes(xpath = "item"),
                      function(n) {
                        sapply(c("type", "StationName", "DateTime", "aqhi"),
                               function(x) {n %>% html_node(xpath = x) %>% html_text()},
                               simplify = FALSE, USE.NAMES = TRUE) %>%
                          data.frame(stringsAsFactors = FALSE)
                      })
  res_data <- do.call("rbind", res_items)
  
  list(header = res_headers, data = res_data)
}

#' Retrieve Air Quality Health Index (AQHI) at individual general and roadside Air Quality Monitoring stations for the past 24 hours
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past24hr-aqhi-of-individual-air-quality-monitoring-stations}
#'
#' @param timestamp if null then current, otherwise historical weather it should
#'   be in format of \%Y\%m\%d-\%H\%M, e.g. 20180905-1306
#' @param lang select the language of the available data, available choices are
#'   "en" for English, "tc" for traditional Chinese, "sc" for simplified Chinese
#'
#' @export
#'
aqhi_24hr <- function(timestamp = NULL, lang = c("en", "tc", "sc")) {
  aqhi_24hr_retrieve(data_file_url(aqhi_24hr_url(lang), timestamp))
}


# Past 24-hour Pollutant Concentration ------------------------------------

#' URL for Past 24-hour Pollutant Concentration of individual Air Quality Monitoring stations
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past24hr-pc-of-individual-air-quality-monitoring-stations}
#'
#' @param lang select the language of the available data, available choices are
#'   "en" for English
#'   
#' @export
#'
pollutant_24hr_url <- function(lang = "en") {
  # TODO: lang
  "http://www.aqhi.gov.hk/epd/ddata/html/out/24pc_Eng.xml"
}

#' Retrieve Past 24-hour Pollutant Concentration of individual Air Quality Monitoring stations
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past24hr-pc-of-individual-air-quality-monitoring-stations}
#'
#' @param data_url the url to the specific file, e.g. as returned by
#'   hist_file_url() for a given timestamp
#'
#' @export
#'
pollutant_24hr_retrieve <- function(data_url) {
  require(rvest)
  require(httr)
  require(dplyr)

  res <- content(GET(data_url), encoding = 'UTF-8')
  if (identical("NOT FOUND", res$message)) {
    stop("Unable to retrieve information, input timestamp may not be available
         - try to find the available timestamps.")
  }
  # pick out the components
  res_headers <- sapply(c("title", "link", "description", "language", "copyright", "webMaster", "lastBuildDate"),
                        FUN = function(n) {res %>% html_node(xpath = n) %>% html_text()},
                        simplify = FALSE, USE.NAMES = TRUE)
  res_items <- lapply(res %>% html_nodes(xpath = "PollutantConcentration"),
                      function(n) {
                        sapply(c("StationName", "DateTime", "NO2", "O3", "SO2", "CO", "PM10", "PM2.5"),
                               function(x) {n %>% html_node(xpath = x) %>% html_text()},
                               simplify = FALSE, USE.NAMES = TRUE) %>%
                          data.frame(stringsAsFactors = FALSE)
                      })
  res_data <- do.call("rbind", res_items)
  
  list(header = res_headers, data = res_data)
}

#' Retrieve Past 24-hour Pollutant Concentration of individual Air Quality Monitoring stations
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past24hr-pc-of-individual-air-quality-monitoring-stations}
#'
#' @param timestamp if null then current, otherwise historical weather
#' it should be in format of \%Y\%m\%d-\%H\%M, e.g. 20180905-0100
#'
#' @export
#'
pollutant_24hr <- function(timestamp = NULL) {
  pollutant_24hr_retrieve(data_file_url(pollutant_24hr_url(), timestamp))
}

# Current Air Quality Health Index ----------------------------------------

#' URL for Current Air Quality Health Index of individual Air Quality Monitoring stations
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-current-aqhi-of-individual-air-quality-monitoring-stations}
#'
#' @param lang select the language of the available data, available choices are
#'   "en" for English, "tc" for traditional Chinese, "sc" for simplified Chinese
#'   
#' @export
#'
aqhi_current_url <- function(lang = c("en", "tc", "sc"))  {
  lang = match.arg(lang)
  switch(lang,
         en = "http://www.aqhi.gov.hk/epd/ddata/html/out/aqhi_ind_rss_Eng.xml",
         tc = "http://www.aqhi.gov.hk/epd/ddata/html/out/aqhi_ind_rss_ChT.xml",
         sc = "http://www.aqhi.gov.hk/epd/ddata/html/out/aqhi_ind_rss_ChS.xml"
  )
}

#' Retrieve Current Air Quality Health Index of individual Air Quality Monitoring stations
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-current-aqhi-of-individual-air-quality-monitoring-stations}
#'
#' @param data_url the url to the specific file, e.g. as returned by
#'   hist_file_url() for a given timestamp
#'
#' @export
#'
aqhi_current_retrieve <- function(data_url) {
  require(rvest)
  require(httr)
  require(dplyr)

  res <- content(GET(data_url), encoding = 'UTF-8')
  if (identical("NOT FOUND", res$message)) {
    stop("Unable to retrieve information, input timestamp may not be available
         - try to find the available timestamps.")
  }
  # pick out the components
  res_headers <- sapply(c(title = "//channel/title",
                          link = "//channel/link",
                          image_title = "//channel/image/title",
                          image_link = "//channel/image/link",
                          image_url = "//channel/image/url",
                          description = "//channel/description",
                          language = "//channel/language",
                          copyright = "//channel/copyright",
                          webMaster = "//channel/webMaster",
                          pubDate = "//channel/pubDate",
                          lastBuildDate = "//channel/lastBuildDate"),
                        FUN = function(n) {res %>% html_node(xpath = n) %>% html_text()},
                        simplify = FALSE, USE.NAMES = TRUE)

  res_items <- lapply(res %>% html_nodes("item"),
                      function(n) {
                        dat <- sapply(c("title", "guid", "link", "pubDate", "description"),
                                      function(x) {n %>% html_node(xpath = x) %>% html_text()},
                                      simplify = FALSE, USE.NAMES = TRUE)
                        loc_qahi <- trimws(strsplit(dat$title, split = ":")[[1]], "both")
                        data.frame(station = loc_qahi[1],
                                   aqhi = loc_qahi[2],
                                   risk = loc_qahi[3],
                                   guid = dat$guid,
                                   link = dat$link,
                                   pubDate = dat$pubDate,
                                   description = dat$description,
                                   stringsAsFactors = FALSE)
                      })
  res_data <- do.call("rbind", res_items)

  list(header = res_headers, data = res_data)
}

#' Retrieve Current Air Quality Health Index of individual Air Quality Monitoring stations
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-current-aqhi-of-individual-air-quality-monitoring-stations}
#'
#' @param timestamp if null then current, otherwise historical weather
#' it should be in format of \%Y\%m\%d-\%H\%M, e.g. 20180905-0100
#' @param lang select the language of the available data, available choices are
#'   "en" for English, "tc" for traditional Chinese, "sc" for simplified Chinese
#'
#' @export
#'
aqhi_current <- function(timestamp = NULL, lang = c("en", "tc", "sc")) {
  aqhi_current_retrieve(data_file_url(aqhi_current_url(lang), timestamp))
}

# Current Air Quality Health Index Range and Forecast ---------------------

#' URL for Current Air Quality Health Index Range and Forecast
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-current-air-quality-health-index-range-and-forecast}
#'
#' @param lang select the language of the available data, available choices are
#'   "en" for English, "tc" for traditional Chinese, "sc" for simplified Chinese
#'
#' @export
#'
aqhi_range_forecast_url <- function(lang = c("en", "tc", "sc")) {
  lang = match.arg(lang)
  switch(lang,
         en = "http://www.aqhi.gov.hk/epd/ddata/html/out/aqhirss_Eng.xml",
         tc = "http://www.aqhi.gov.hk/epd/ddata/html/out/aqhirss_ChT.xml",
         sc = "http://www.aqhi.gov.hk/epd/ddata/html/out/aqhirss_ChS.xml"
  )
}

#' Retrieve Current Air Quality Health Index Range and Forecast
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-current-air-quality-health-index-range-and-forecast}
#'
#' @param data_url the url to the specific file, e.g. as returned by
#'   hist_file_url() for a given timestamp
#'
#' @export
#'
aqhi_range_forecast_retrieve <- function(data_url) {
  require(rvest)
  require(httr)
  require(dplyr)

  res <- content(GET(data_url), encoding = 'UTF-8')
  if (identical("NOT FOUND", res$message)) {
    stop("Unable to retrieve information, input timestamp may not be available
         - try to find the available timestamps.")
  }
  # pick out the components
  res_headers <- sapply(c(title = "//channel/title",
                          link = "//channel/link",
                          description = "//channel/description",
                          language = "//channel/language",
                          copyright = "//channel/copyright",
                          webMaster = "//channel/webMaster",
                          lastBuildDate = "//channel/lastBuildDate"),
                        FUN = function(n) {res %>% html_node(xpath = n) %>% html_text()},
                        simplify = FALSE, USE.NAMES = TRUE)

  res_items <- lapply(res %>% html_nodes("item"),
                      function(n) {
                        sapply(c("title", "guid", "link", "description"),
                               function(x) {n %>% html_node(xpath = x) %>% html_text()},
                               simplify = FALSE, USE.NAMES = TRUE)
                      })

  # the first is for now
  res_current <- res_items[[1]]
  type_date <- trimws(strsplit(res_current$title, split = ": |：")[[1]], "both")
  res_current$data_desc <- type_date[1]
  res_current$dateTime <- strsplit(type_date[2], split = "[\t\n]")[[1]][1]
  aqhi_dat <- res_current$description %>% read_html() %>% html_nodes("p") %>% html_text() %>%
    strsplit(split = "[:()]|：") %>%
    lapply(FUN=function(x) {
      dat <- trimws(x, which = "both")
      data.frame(
        station_type = dat[1],
        aqhi = dat[2],
        risk = dat[4],
        stringsAsFactors = FALSE)
      })
  res_current_data <- do.call("rbind", aqhi_dat) %>%
    mutate(
      data_type = "current",
      data_desc = res_current$data_desc,
      dateTime = res_current$dateTime,
      guid = res_current$guid,
      link = res_current$link) %>%
    select(data_type, data_desc, dateTime, station_type, aqhi, risk, guid, link, everything())

  # the second is the forecast
  res_forecast <- res_items[[2]]
  type_date <- trimws(strsplit(res_forecast$title, split = ": |：")[[1]], "both")
  res_forecast$data_desc <- type_date[1]
  aqhi_dat <- res_forecast$description %>% read_html() %>% html_nodes("p") %>% html_text()
  res_forecast_data <- do.call(
    "rbind",
    lapply(list(list(data_type = aqhi_dat[1], type_risk = aqhi_dat[2]),
                list(data_type = aqhi_dat[1], type_risk = aqhi_dat[3]),
                list(data_type = aqhi_dat[4], type_risk = aqhi_dat[5]),
                list(data_type = aqhi_dat[4], type_risk = aqhi_dat[6])),
           function(x) {
             tmp <- strsplit(x$type_risk, split = ":|：")[[1]] %>% trimws(which = "both")
             data.frame(data_type = x$data_type,
                        station_type = tmp[1],
                        risk = tmp[2])
           })
  ) %>%
    mutate(
      data_desc = res_forecast$data_desc,
      dateTime = NA_character_,
      aqhi = NA_character_,
      guid = res_forecast$guid,
      link = res_forecast$link
    ) %>%
    select(data_type, data_desc, dateTime, station_type, aqhi, risk, guid, link, everything())

  #
  list(header = res_headers, data = rbind(res_current_data, res_forecast_data))
}

#' Retrieve Current Air Quality Health Index Range and Forecast
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-current-air-quality-health-index-range-and-forecast}
#'
#' @param timestamp if null then current, otherwise historical weather
#' it should be in format of \%Y\%m\%d-\%H\%M, e.g. 20180905-0100
#' @param lang select the language of the available data, available choices are
#'   "en" for English, "tc" for traditional Chinese, "sc" for simplified Chinese
#'
#' @export
#'
aqhi_range_forecast <- function(timestamp = NULL, lang = c("en", "tc", "sc")) {
  aqhi_range_forecast_retrieve(data_file_url(aqhi_range_forecast_url(lang), timestamp))
}

# Past hourly record of Air Pollution Index -------------------------------

#' URL for Past hourly record of Air Pollution Index at a given year and month
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past-record-of-air-pollution-index-en}
#' Only available from July 1999 to Decemenber 2013.
#'
#' @param year year of the past record
#' @param month month of the past record
#'
#' @export
#'
past_pollution_index_url <- function(year, month) {
  if(floor(month)!=month || month < 1 || month > 12) stop("Month should be integer from 1 to 12. Got ", month)
  sprintf(
    "http://www.aqhi.gov.hk/api_history/download/hourly/eng/hr%02d%04d.csv",
    month,
    year
  )
}

#' Retrieve Past hourly record of Air Pollution Index at a given year and month
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past-record-of-air-pollution-index-en}
#' Only available from July 1999 to Decemenber 2013.
#'
#' @param year year of the past record
#' @param month month of the past record
#' @param path the directory where the raw file should be save, if NULL it
#' will not be saved
#'
#' @export
#'
past_pollution_index <- function(year, month, path = NULL) {
  require(readr)
  require(tidyr)
  
  url <- past_pollution_index_url(year, month)
  if (is.null(path)) {
    path <- file.path(tempdir(), basename(url))
    on.exit(unlink(path))
  } else {
    path <- file.path(path, basename(url))
  }
  download.file(url, path, quiet = TRUE)
  # since the file contains variable number of rows (ranging from 8 to 10) before the header row,
  # so use grep to find the first row with header, containing "Date"
  r <- read_lines(path, n_max = 15)
  dt <- min(grep("^Date", r))
  
  # not sure if the fields may have something like "4*", so read as characters to be safe
  data <- read_csv(path, skip = dt - 1, col_types = cols(.default = "c"))
  # still some clean up needed, only hour 0 has Date, the Date for other hours are omitted
  data %>% fill(Date)
}


# Past record of Air Quality Health Index ---------------------------------

#' URL for Past record of Air Quality Health Index
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past-record-of-air-quality-health-index-en}
#' Only available from Decemenber 2013 onward.
#'
#' @param year year of the past record
#' @param month month of the past record
#'
#' @export
#'
past_aqhi_url <- function(year, month) {
  if(floor(month)!=month || month < 1 || month > 12) stop("Month should be integer from 1 to 12. Got ", month)
  sprintf(
    "http://www.aqhi.gov.hk/epd/ddata/html/history/%04d/%04d%02d_Eng.csv",
    year,
    year,
    month
  )
}

#' Retrieve Past record of Air Quality Health Index
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-epd-airteam-past-record-of-air-quality-health-index-en}
#' Only available from Decemenber 2013 onward.
#'
#' @param year year of the past record
#' @param month month of the past record
#' @param path the directory where the raw file should be save, if NULL it
#' will not be saved
#'
#' @export
#'
past_aqhi <- function(year, month, path = NULL) {
  require(readr)
  require(tidyr)
  
  url <- past_aqhi_url(year, month)
  if (is.null(path)) {
    path <- file.path(tempdir(), basename(url))
    on.exit(unlink(path))
  } else {
    path <- file.path(path, basename(url))
  }
  download.file(url, path, quiet = TRUE)
  # the files all have the header at line 8, but would still use the dynamic approach to be sure.
  # to use grep to find the first row with header, containing "Date"
  r <- read_lines(path, n_max = 15)
  dt <- min(grep("^Date", r))
  # the fields for aqhi are not all numeric, some are "6*", so keep them as character
  # even "Hour" has "Daily Max", so not numeric
  data <- read_csv(path, skip = dt - 1, col_types = cols(.default = "c"))
  # still some clean up needed, only hour 1 has Date, the Date for other hours are omitted
  data %>% fill(Date)
}


# Towngas Environmental Performance Data ----------------------------------

#' URL for Towngas Environmental Performance Data
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/towngas-towngas-environment}
#'
#' @param fromYear integer, the starting year of the performance data.
#' @param toYear integer, the ending (inclusive) year of the performance data.
#'
#' @export
#'
towngas_performance_data_url <- function(fromYear, toYear = fromYear) {
  if(floor(fromYear) != fromYear) stop("fromYear should be integer. Got ", fromYear)
  if(floor(toYear) != toYear) stop("toYear should be integer. Got ", toYear)
  sprintf(
    "https://appapi.towngas.com/opendata/v1/environment/filter/%04d/%04d",
    fromYear, toYear
  )
}

#' Retrieve Towngas Environmental Performance Data
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/towngas-towngas-environment}
#'
#' @param fromYear integer, the starting year of the performance data.
#' @param toYear integer, the ending (inclusive) year of the performance data.
#' @param path the directory where the raw file should be save, if NULL it
#' will not be saved
#'
#' @export
#'
towngas_performance_data <- function(fromYear, toYear = fromYear, path = NULL) {
  require(tidyr)
  url <- towngas_performance_data_url(fromYear, toYear)
  data <- get_file_json(url, path)
  # the returned data has two levels of nesting after the default simplification
  # of arrays of dataframe
  unnest(unnest(data))
}


# Real-time city data collected by multi-purpose lamp posts ---------------

#' URL for Real-time city data collected by multi-purpose lamp posts in Kowloon East
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-devb-mplp-mplp-sensor-data}
#' Note that only the most update value is available.
#' @export
#'
lamp_posts_data_url <- function() {
  "https://mplpssl.wisx.io/nodered/getlampposts/"
}

#' Retrieve Real-time city data collected by multi-purpose lamp posts in Kowloon East
#' 
#' @references \url{https://data.gov.hk/en-data/dataset/hk-devb-mplp-mplp-sensor-data}
#' Note that only the most update value is available.
#'
#'#' @param path the directory where the raw file should be save, if NULL it
#' will not be saved
#' 
#' @export
#'
lamp_posts_data <- function(path = NULL) {
  require(tidyr)
  url <- "https://mplpssl.wisx.io/nodered/getlampposts/"
  data <- get_file_json(url, path)
  # the returned data has one level of nesting after the default simplification
  # of arrays of dataframe
  unnest(data)
}