R/cso_get_data.R

Defines functions cso_download_tbl cso_get_data

Documented in cso_get_data

#' Return a CSO table as a data frame
#'
#' Returns a CSO table from the CSO PxStat Application Programming Interface
#' (API) as a data frame, with the option to give it in wide format (default)
#' very wide or long format.
#'
#' The data is pulled from the ResponseInstance service on the CSO API in
#' JSON-Stat fromat, using the GET method from the httr package.
#'
#' @param table_code string. If the table_code is a filename or a path to a
#' file, e.g. "QNQ22.json", it is imported from that file. Otherwise if it is
#' only a table code e.g. "QNQ22", the file is downloaded from data.cso and
#' checked to see if it is a valid table.
#' @param wide_format string, one of "wide", "very_wide" or "tall". If "wide"
#' (default) the table is returned in wide (human readable) format, with
#' statistic as a column (if it exists). If "very_wide" the table is returned
#' wide format and spreads the statistic column to rows. If "tall" the table is
#' returned in tall (statistic and value) format.
#' @param include_ids logical. The JSON-stat format stores variables as ids
#' i.e. IE11 and labels i.e. Border. While the label is generally preferred,
#' sometimes it is useful to have the ids to match on. If \code{include_ids}
#' is TRUE (default) then ids are retrieved and appended to the table to the
#' right of the original column with the name <columnName>.id.
#' @param id_list either NULL (default) or a character vector of columns that
#' should have ids appended if include_ids is TRUE.
#' if NULL then every column that is not included in the vector
#' \code{remove_id} will be used.
#' @param use_factors logical. If TRUE (default) factors will be used in
#' strings.
#' @param cache logical. if TRUE (default) csodata will cache the result using
#' R.cache. The raw data downloaded from the data.csi.ie is cached, which means
#' that calling \code{cso_get_data} with the same table_code but different
#' parameterswill result in cached data being used.
#' @param flush_cache logical. If TRUE (default) the cache will be checked for 
#' old, unused files. Any files which have not been accessed in the last month
#'  will be deleted
#' @return data frame of the requested CSO table.
#' @export
#' @examples
#' \dontrun{
#' tbl1 <- cso_get_data("QNQ22")
#' tbl2 <- cso_get_data("QLF07.json")
#' }
cso_get_data <- function(table_code, wide_format = "wide", include_ids = FALSE,
                         id_list = NULL, use_factors = TRUE, cache = TRUE, flush_cache = TRUE) {
  # Set path to or download data --------
  if (substr(
    table_code, nchar(table_code) - 4, nchar(table_code)
  ) == ".json") {
    if (file.exists(table_code)) {
      json_data <- table_code
    } else {
      stop("Not a valid path to a .json file")
    }
  } else {
    json_data <- cso_download_tbl(table_code, cache = cache, flush_cache = flush_cache)
    # Error Checking ----------------------
    if (is.null(json_data)) {
      return(NULL)
    }
  }
  
  # Load data ---------------------------
  data <- rjstat::fromJSONstat(json_data,
                               naming = "label", use_factors = use_factors
  )
  names(data) <- make.names(names(data), unique = TRUE)
  
  # Append ids as new column ------------
  if (include_ids) {
    data_id <- rjstat::fromJSONstat(json_data,
                                    naming = "id", use_factors = use_factors
    )
    names(data_id)[substr(names(data_id),0,1) == "C"] <-paste0(names(data)[substr(names(data_id),0,1) == "C"],".id")
    
    #Composing the list of id columns to be appended
    if (is.null(id_list)) {
      
      concat_id <- names(data_id)[grepl(".*id$",names(data_id))]
    } else if (all(id_list %in% names(data))){
      concat_id <- paste0(id_list,".id")
    } else {
      #There was an issue with the id_list
      stop("One or more columns supplied to id_list not found")
    }
    
    for (ID in concat_id) {
      id_list <- list(as.vector(as.matrix(data_id[ID])))
      names(id_list) <- ID
      data <- data.frame(append(data, id_list, after = match(substr(ID,0,nchar(ID)-3), names(data)) ),
                         stringsAsFactors = TRUE
      )
    }
  }
  
  # Pivot to wide table -----------------
  if (wide_format == "wide" || wide_format == "very_wide") {
    string <- names(data)
    remove <- c(
      "Year", "Quarter", "Month", "value", "CensusYear",
      "Census.Year", "HalfYear", "Intercensal.Period"
    )
    
    if (wide_format == "very_wide") {
      remove <- append(remove, "Statistic")
    }
    
    row_vars <- string [!string %in% remove]
    
    wide_data <- reshape2::dcast(data, formula = paste(
      paste(row_vars, collapse = " + "), " ~ ... "
    ))
    
    if (!use_factors) {
      wide_data <- dplyr::mutate_if(wide_data, is.factor, as.character)
    }
    
    return(wide_data)
  } else {
    return(data)
  }
}

#' Download a CSO table as a data frame
#'
#' Internal function to return a CSO table from the CSO PxStat Application
#' Programming Interface (API) as a JSON-stat dataset.
#'
#' The data is pulled from the ReadDataset service on the CSO API in
#' JSON-Stat format, using the GET method from the httr package.
#'
#' To improve performance, the result is cached by default.
#'
#' @param table_code string. The code uniquely identifying one table.
#' @param cache logical. Indicates whether to cache the result using R.cache.
#' @param suppress_messages logical. If FALSE (default) a message is printed
#' when loading a previously cached data table.
#' @param flush_cache logical. If TRUE (default) the cache will be checked for 
#' old, unused files. Any files which have not been accessed in the last month 
#' will be deleted
#' @return a character object, containing the data in JSON-stat format.
#' @noRd
#' @importFrom utils fileSnapshot


cso_download_tbl <- function(table_code, cache = TRUE,
                             suppress_messages = FALSE, flush_cache = TRUE) {
  url <- paste0(
    "https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/",
    table_code, "/JSON-stat/2.0/en"
  )
  
  # Attempt to retrieve cached data -----
  if (cache) {
    toc <- cso_get_toc(suppress_messages = TRUE)
    last_update <- toc[toc$id == table_code, 1]
    data <- R.cache::loadCache(list(table_code, last_update), dirs = "csodata")
    if (!is.null(data)) {
      if (!suppress_messages) {
        message("Loaded cached data\n")
      }
      return(data)
    }
  }
  
  #Empty out the cache of unused files if a new file is being downloaded
  if(flush_cache){
    file.remove(
      rownames(
        fileSnapshot(paste0(R.cache::getCacheRootPath(),"/csodata"), full.names = T, recursive = T)$info[!lubridate::`%within%`(
          fileSnapshot(paste0(R.cache::getCacheRootPath(),"/csodata"), full.names = T, recursive = T)$info[,"mtime"],
          lubridate::interval(start = Sys.Date() - lubridate::days(2) , end = Sys.Date() + lubridate::days(1))) , ]
      )
    )
  } 
  
  
  # No caching, or cache empty ----------
  
  # Check for errors using trycatch since PxStat API does not support
  # html head requests.
  error_message =  paste0("Failed retrieving table. Please check internet ",
                          "connection and that data .cso.ie is online")
  
  response <- tryCatch({
    httr::GET(url)
  }, warning = function(w) {
    print(paste0("Warning: ", error_message))
    return(NULL)
  }, error = function(e) {
    message(paste0("Connection Error: ", error_message))
    return(NULL)
  })
  
  #Cut off if theres issues
  if(is.null(response)){return(NULL)}
  
  # Check if data valid -------------
  if (httr::status_code(response) == 200 &&
      !all(response[["content"]][1:32] ==
           charToRaw("invalid Maintable format entered"))) {
    json_data <- rawToChar(response[["content"]])
    if (cache) {
      toc <- cso_get_toc(suppress_messages = TRUE)
      last_update <- toc[toc$id == table_code, 1]
      R.cache::saveCache(json_data,
                         key = list(table_code, last_update), dirs = "csodata"
      )
    }
    return(json_data)
  } else {
    stop("Not a valid table code. See cso_get_toc() for all valid tables.")
  }
}

Try the csodata package in your browser

Any scripts or data that you put into this service are public.

csodata documentation built on Dec. 11, 2021, 9:13 a.m.