R/abs-cat-functions.R

### ABS Catalogue functions

#' @name abs_urls
#' @title ABS URL addresses and paths used in accessing ABS Catalogue data calls
#' @description This function returns a list of URLs and data paths used to construct ABS Catalogue
#'   data access calls. It is used in other functions in this package and need not be called
#'   directly.
#' @return A list containing the ABS base URL and the URL path components used to format ABS
#'   Catalogue statistics calls.
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
#' @keywords internal
abs_urls <- function()
{
  list(base_url = "https://www.abs.gov.au",
       ausstats_path = "ausstats/abs@.nsf",
       mf_path = "mf",
       downloads_regex = "Downloads",
       releases_regex = "Past.*Future.*Releases");
}
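
## Example (sketch): the components returned by abs_urls() compose ABS catalogue page URLs,
## e.g. the 'main features' page for catalogue no. 5206.0:
##   file.path(abs_urls()$base_url, abs_urls()$ausstats_path, abs_urls()$mf_path, "5206.0")
##   ## "https://www.abs.gov.au/ausstats/abs@.nsf/mf/5206.0"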


#' @name abs_filetypes
#' @title Valid ABS file types
#' @description This function returns a named vector of valid ABS file (MIME) types, used to
#'   validate files downloaded from the ABS website. It is used in other functions in this
#'   package and need not be called directly.
#' @return A named character vector of valid ABS file MIME types.
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
#' @keywords internal
abs_filetypes <- function()
{
  c(zip_files = "application/x-zip",
    excel_files = "application/vnd.ms-excel",
    openxml_files = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    pdf_files = "application/pdf");
}
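
## Example (sketch): abs_filetypes() is used to validate the content type of downloaded files,
## e.g. in abs_cat_download():
##   httr::http_type(resp) %in% abs_filetypes()   ## 'resp' is a httr response object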


#' @name abs_cat_stats
#' @title Get ABS catalogue series data
#' @description This function downloads ABS catalogue series statistics, by ABS catalogue number.
#' @importFrom rvest html_session follow_link html_attr jump_to
#' @importFrom xml2 read_xml read_html
#' @param cat_no Character vector specifying one or more ABS collections or catalogue numbers to
#'   download.
#' @param tables A character vector of regular expressions denoting tables to download. The default
#'   ('All') downloads all time series spreadsheet tables for each specified catalogue. Use a list
#'   to specify different table sets for each specified ABS catalogue number.
#' @param releases Date or character string object specifying the month and year denoting which
#'   release to download. Default is "Latest", which downloads the latest available data. See
#'   examples for further details.
#' @param types One of either 'tss' -- ABS time series spreadsheet (the default) or 'css' -- ABS
#'   data cube (cross-section spreadsheet).
#' @param na.rm logical (default: \code{TRUE}) - remove observations containing missing values.
#' @return Returns a data frame, in long format, containing the selected ABS catalogue statistics.
#' @export
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
#' @examples
#'   \donttest{
#'     ## Download quarterly Australian National Accounts, Tables 1 & 2 
#'     ana_q <- abs_cat_stats("5206.0", tables=c("Table 1\\W+", "Table 2\\W+"));
#'
#'     ## Download December 2017 Australian National Accounts, Table 1
#'     ana_q_2017q4 <- abs_cat_stats("5206.0", tables="Table 1\\W+", releases="Dec 2017");
#'   }
abs_cat_stats <- function(cat_no, tables="All", releases="Latest", types="tss", na.rm=TRUE)
{
  if (missing(cat_no))
    stop("No cat_no supplied.");
  ## Validate releases: must be a Date or a character vector (e.g. "Latest" or "Dec 2017")
  if (!inherits(releases, "Date") && !is.character(releases))
    stop("releases must be 'Latest', a Date, or a 'Mon YYYY' character string.");
  if (any(!types %in% c("tss","css")))
    stop("Allowable type arguments limited to one or both: 'tss' and 'css'.");
  ## Get available catalogue tables
  cat_tables <- abs_cat_tables(cat_no=cat_no, releases=releases, types=types, include_urls=TRUE)
  ## Select only the user specified tables ('sel_tables')
  if (length(tables) == 1 && tolower(tables) == "all") {
    ## If 'all' tables, download all
    sel_tables <- if (any(grepl("^all time series.*", cat_tables$item_name, ignore.case=TRUE))) {
                    ## If all tables provided as single compressed archive, select that
                    cat_tables[grepl("^all time series.*", cat_tables$item_name, ignore.case=TRUE),]
                  } else {
                    ## Else, select all tables
                    cat_tables
                  };
  } else {
    ## Else, return only selected tables
    sel_tables <- cat_tables[grepl(sprintf("(%s)", paste(tables, collapse="|")),
                                   cat_tables$item_name, ignore.case=TRUE),]
    ## Stop if regular expression does not return any tables
    if (nrow(sel_tables) == 0)
      stop("Specified table regular expressions do not match any table names; re-specify.");
  }
  ## Select the download URL(s) for each selected table ('sel_urls')
  sel_urls <- apply(sel_tables, 1,
                    function(y) {
                      ## If zip in path_zip, select zip file, else select xls(x) file
                      if (any(grepl("\\.zip", y, ignore.case=TRUE))) {
                        unique(grep("\\.zip", unlist(y), ignore.case=TRUE, value=TRUE))
                      } else {
                        unique(grep("\\.xlsx*", unlist(y), ignore.case=TRUE, value=TRUE))
                      }
                    });
  ## Download ABS TSS/Data Cubes ..
  z <- lapply(sel_urls, abs_cat_download);
  z <- lapply(z,
              function(x) 
                if (!grepl("\\.zip", x, ignore.case=TRUE)) {
                  x
                } else {
                  abs_cat_unzip(files=x)
                });
  ## .. and combine into single data frame
  data <- lapply(z, function(x) abs_read_tss(x, na.rm=na.rm));
  data <- do.call(rbind, data);
  rownames(data) <- 1:nrow(data);
  return(data);
}


#' @name abs_cat_tables
#' @title Return ABS catalogue tables
#' @description Return list of data tables available from specified ABS catalogue number.
#' @importFrom rvest html_session html_text html_nodes html_attr follow_link
#' @importFrom httr http_error
#' @importFrom dplyr case_when bind_rows
#' @param cat_no ABS catalogue numbers.
#' @param releases Date or character string object specifying the month and year denoting which
#'   release to download. Default is "Latest", which downloads the latest available data. See
#'   examples for further details.
#' @param types ABS publication types to return. Permissible options include one or more of: 'tss'
#'   -- ABS Time Series Spreadsheets, 'css' - ABS Data Cubes and 'pub' -- ABS Publications. The
#'   default returns all Time Series Spreadsheets and Data Cubes.
#' @param include_urls Include full URLs to returned ABS data files. Default (FALSE) does not
#'   include data file URLs.
#' @return Returns a data frame listing the data collection tables and URLs for Excel (column:
#'   \code{path_xls}) and, if available, Zip (column: \code{path_zip}) files.
#' @export
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
#' @examples
#'   \donttest{
#'     ## List latest available quarterly National Accounts tables
#'     ana_tables <- abs_cat_tables("5206.0", releases="Latest");
#'     ana_tables_url <- abs_cat_tables("5206.0", releases="Latest", include_urls=TRUE);
#'
#'     ## List latest available CPI Time Series Spreadsheet tables only
#'     cpi_tables <- abs_cat_tables("6401.0", releases="Latest", types="tss");
#'     cpi_tables_url <- abs_cat_tables("6401.0", releases="Latest", types="tss", include_urls=TRUE);
#'   
#'     ## List latest available ASGS Volume 3 Data Cubes
#'     asgs_vol3_tables <- abs_cat_tables("1270.0.55.003", releases="Latest", types="css");
#'     asgs_vol3_tables_url <- abs_cat_tables("1270.0.55.003", releases="Latest",
#'                                            types="css", include_urls=TRUE);
#'   
#'     ## List latest available ANZSIC publication (PDF) files
#'     anzsic_2006 <- abs_cat_tables("1292.0", releases="Latest", types="pub", include_urls=TRUE);
#'   }
abs_cat_tables <- function(cat_no, releases="Latest", types=c("tss", "css"), include_urls=FALSE)
{
  if (missing(cat_no))
    stop("No cat_no supplied.");
  if (any(!types %in% c("tss", "css", "pub")))
    stop("Allowable type arguments limited to one or more of: 'tss', 'css' or 'pub'.");
  if (!is.logical(include_urls))
    stop("include_urls must be either TRUE or FALSE");
  ## Spell out type -- for ABS website scraping
  types <- sapply(types,
                  function(x) switch(x,
                                     "tss" = "Time Series Spreadsheet",
                                     "css" = "Data Cubes",
                                     "pub" = "Publication"));
  ## Create ABS URL and open session 
  url <- file.path(abs_urls()$base_url, abs_urls()$ausstats_path, abs_urls()$mf_path, cat_no);
  ## Check for HTTP errors
  raustats_check_url_available(url);
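  ##   (raustats_check_url_available() is a package-internal helper, defined elsewhere in
  ##    this package, that stops with an informative error if the URL cannot be reached)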
  ## Open html session
  suppressWarnings(s <- html_session(url));
  releases <- unique(releases);
  if (length(releases) == 1 && tolower(releases) == "latest") {
    .paths <- "";
  } else {
    ## Get path to 'Past & Future Releases' page
    .paths <- html_nodes(s, "a");
    .paths <- .paths[grepl(abs_urls()$releases_regex, .paths)];
    .paths <- html_attr(.paths, "href");
    s <- jump_to(s, .paths)
    .paths <- html_nodes(s, "a");
    .paths <- .paths[grepl(paste(releases, collapse="|"), .paths, ignore.case=TRUE)];
    .paths <- html_attr(.paths, "href");
  }
  ## Return list of all downloadable files, for specified catalogue tables ('cat_tables')
  v <- lapply(.paths,
              function(x) {
                y <- jump_to(s, x)
                l <- follow_link(y, abs_urls()$downloads_regex)
                ht <- html_nodes(html_nodes(l, "table"), "table")
                ## Return data table
                ## The ABS data catalogue lists the data inside a HTML table within a table, i.e.
                ##  <table>
                ##    <table> </table>
                ##  </table>
                ## The following nested apply functions, exploits this structure to extract the
                ## list of available publication types and associated links.
                all_nodes <- lapply(sapply(ht, function(x) html_nodes(x, "tr")),
                                    function(x)
                                      c(html_text(html_nodes(x, "td")),
                                        paste0(abs_urls()$base_url,
                                               html_attr(html_nodes(html_nodes(x, "td"), "a"), "href")))
                                    );
                ## Remove ABS data download section headings from all_nodes.
                ##   Section titles that include links are picked up in the node set but
                ##   do not conform to the publication information structure; the
                ##   following code block removes these entries.
                data_nodes <- lapply(all_nodes,
                                     function(x) {
                                       if (grepl(paste(c("(^\\W{0,1}$)",
                                                         "(^data\\s*cubes\\W*$)",
                                                         "(^time series spreadsheet\\W*$)"),
                                                       collapse="|"),
                                                 x[1], ignore.case=TRUE)) {
                                         NULL
                                       } else {
                                         x
                                       }
                                     })
                ## Drop NULL entries ('-which()' subsetting would return an empty list
                ## when no entries are NULL)
                data_nodes <- data_nodes[!sapply(data_nodes, is.null)];
                ## Tidy and return data set names and urls
                nodes <- data_nodes[unlist(lapply(data_nodes,
                                                  function(x)
                                                    any(grepl(sprintf("(%s)",
                                                                      paste(types, collapse="|")),
                                                              x, ignore.case=TRUE)) &
                                                    any(grepl("ausstats", x, ignore.case=TRUE))
                                                  ))];
                ## Remove non-breaking spaces (&nbsp;), and blank entries
                nodes <- lapply(nodes,
                                function(x) {
                                  z <- trimws(gsub("\u00a0", "", x));      ## Remove non-breaking spaces
                                  z <- replace(z, z == "", NA_character_); ## Replace blank objects with NA
                                  ## Set entries not starting with 'https*' with 'NA_character_'
                                  z[-1] <- replace(z[-1],                          
                                                   !grepl("^https*.+", z[-1], ignore.case=TRUE),
                                                   NA_character_);
                                  ## Set entries containing 'INotes' with 'NA_character_'
                                  z <- replace(z,                          
                                               grepl("INotes", z, ignore.case=TRUE),
                                               NA_character_);
                                  z <- z[!is.na(z)];                       ## Remove NA objects
                                  ## Set object names: First element = 'item_name'
                                  names(z)[1] <- "item_name";
                                  names(z)[-1] <- case_when(
                                    grepl("\\.xlsx*", z[-1], ignore.case=TRUE) ~ "path_xls",
                                    grepl("\\.zip", z[-1], ignore.case=TRUE) ~ "path_zip",
                                    grepl("\\.pdf", z[-1], ignore.case=TRUE) ~ "path_pdf",
                                    TRUE ~ NA_character_)
                                  z <- as.data.frame(t(cbind.data.frame(z, deparse.level=1)),
                                                     stringsAsFactors=FALSE);
                                  return(z);
                                });
                ## Tidy nodes into data.frame (using dplyr::bind_rows)
                dt <- suppressWarnings(bind_rows(nodes))
                ## Lastly replace spaces in all URL paths with '%20' string
                for(name in grep("^path_", names(dt), ignore.case=TRUE, value=TRUE))
                  dt[,name] <- gsub("\\s+", "%20", dt[,name]);
                return(dt);
              });
  ## Add catalogue number and release information to table
  v <- lapply(seq_along(v),
              function(i) {
                v[[i]]$release <- sub("^$", "Latest", releases[i]);
                v[[i]]$cat_no <- cat_no;
                v[[i]];
              });
  ## Bind all results together
  z <- do.call(rbind, v);
  z <- if (!include_urls) {
         z[,c("cat_no", "release", "item_name")]
       } else {
         z[,c("cat_no", "release", "item_name",
              names(z)[!names(z) %in% c("cat_no", "release", "item_name")])]
       }
  row.names(z) <- seq_len(nrow(z));
  return(z)
}


#' @name abs_cat_releases
#' @title Return ABS catalogue table releases
#' @description Return list of all releases available for specified ABS catalogue number.
#' @importFrom rvest html_session html_table html_text html_nodes html_attr follow_link
#' @importFrom httr http_error
#' @param cat_no ABS catalogue numbers.
#' @param include_urls Include full path URL to specified ABS catalogue releases. Default (FALSE)
#'   does not include release URLs.
#' @return Returns a data frame listing available ABS catalogue releases.
#' @export
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
#' @examples
#'   \donttest{
#'     ## List all available quarterly National Accounts tables
#'     ana_releases <- abs_cat_releases("5206.0");
#'     ana_release_urls <- abs_cat_releases("5206.0", include_urls=TRUE);
#'   
#'     ## List latest available CPI Time Series Spreadsheet tables only
#'     cpi_releases <- abs_cat_releases("6401.0");
#'     cpi_release_urls <- abs_cat_releases("6401.0", include_urls=TRUE);
#'   }
abs_cat_releases <- function(cat_no, include_urls=FALSE)
{
  if (missing(cat_no))
    stop("No cat_no supplied.");
  if (!is.logical(include_urls))
    stop("include_urls must be either TRUE or FALSE");
  ## Create ABS URL and open session 
  url <- file.path(abs_urls()$base_url, abs_urls()$ausstats_path, abs_urls()$mf_path, cat_no);
  ## Check for HTTP errors
  raustats_check_url_available(url)
  suppressWarnings(s <- html_session(url));
  ## Get path to 'Past & Future Releases' page
  .paths <- html_nodes(s, "a");
  .paths <- .paths[grepl(abs_urls()$releases_regex, .paths)];
  .paths <- html_attr(.paths, "href");
  s <- jump_to(s, .paths)
  ## Get list of available ABS catalogue releases (See: https://devhints.io/xpath for Xpath hints)
  .tables <- html_nodes(s, "table");
  .tables <- .tables[grepl("Past Releases", .tables, ignore.case=TRUE)];
  .paths <- html_nodes(.tables, "a");
  ## Return results 
  if (!include_urls) {
    z <- data.frame(releases = html_text(.paths))
  } else {
    z <- data.frame(releases = html_text(.paths),
                    urls = file.path(abs_urls()$base_url,
                                     abs_urls()$ausstats_path,
                                     html_attr(.paths, "href")))
  }
  row.names(z) <- seq_len(nrow(z));
  return(z)
}


#' @name abs_cat_download
#' @title Function to download files from the ABS website and store locally
#' @description Downloads specified ABS catalogue data files from the ABS website, using a valid ABS
#'   data table URL.
#' @importFrom httr GET http_type http_error progress status_code write_disk
#' @param data_url Character vector specifying an ABS data URLs.
#' @param exdir Target directory for downloaded files (defaults to \code{tempdir()}). Directory is
#'   created if it doesn't exist.
#' @return Downloads data from the ABS website and returns a character vector listing the location
#'   where files are saved.
#' @export
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
abs_cat_download <- function(data_url, exdir=tempdir()) {
  if (!dir.exists(exdir)) dir.create(exdir);
  local_filenames <-
    sapply(data_url[!is.na(data_url)],
           function(url) {
             this_filename <- abs_local_filename(url);
             ## Check if any data_urls are not ABS data URLs
             if (!grepl("^https*:\\/\\/www\\.abs\\.gov\\.au\\/ausstats.+",
                        url, ignore.case=TRUE))	
               stop(sprintf("Invalid ABS url: %s", url));
             ##
             ## -- Download files --
             cat(sprintf("Downloading: %s\n", this_filename));
             ## Check for errors
             raustats_check_url_available(url)
             resp <- GET(url, write_disk(file.path(exdir, this_filename), overwrite=TRUE),
                         raustats_ua(), progress());
             ## Check content-type is compliant
             if (!http_type(resp) %in% abs_filetypes()) {
               stop("ABS file request did not return Excel, Zip or PDF file", call. = FALSE)
             }
             return(file.path(exdir, this_filename));
           })
  ## Return results
  return(local_filenames);
}
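
## Example usage (sketch; assumes an active internet connection):
##   tables <- abs_cat_tables("5206.0", include_urls=TRUE)
##   local_files <- abs_cat_download(tables$path_xls)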


#' @name abs_local_filename
#' @title Create local file names for storing downloaded ABS data files
#' @description Function to create local filename from web-based file name.
#' @param url Character vector specifying one or more ABS data URLs.
#' @return Returns a local file names (character vector) in which downloaded files will be saved.
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
#' @keywords internal
abs_local_filename <- function(url)
{
  sprintf("%s_%s.%s",
          sub("^.+&(.+)\\.(zip|xlsx*|pdf)&.+$", "\\1", url),
          sub("^.+(\\d{2}).(\\d{2}).(\\d{4}).+$", "\\3\\2\\1", url),
          sub("^.+&(.+)\\.(zip|xlsx*|pdf)&.+$", "\\2", url));
}
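
## Example (sketch, using a hypothetical ABS-style download URL):
##   url <- paste0("https://www.abs.gov.au/ausstats/ABS@.nsf/log?openagent",
##                 "&5206001_key_aggregates.xls&5206.0&Time%20Series%20Spreadsheet",
##                 "&ABC123&0&Sep%202018&05.12.2018&Latest")
##   abs_local_filename(url)
##   ## "5206001_key_aggregates_20181205.xls"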


#' @name abs_cat_unzip
#' @title Uncompress locally-stored ABS Catalogue data file archives
#' @description Function to uncompress locally-stored ABS Catalogue data file archives.
#' @importFrom utils unzip
#' @param files One or more local zip files.
#' @param exdir Target directory for extracted archive files. Directory is created if it doesn't
#'   exist. If missing, each archive is extracted to a new subdirectory of \code{tempdir()},
#'   named after the respective zip file (specified in \code{files}).
#' @return Returns a character vector listing the names of all files extracted.
#' @export
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
abs_cat_unzip <- function(files, exdir) {
  if (any(!file.exists(files)))
    stop(sprintf("Files %s do not exist",
                 paste(files[!file.exists(files)], collapse=", ")));
  if (missing(exdir))
    exdir <- tempdir();
  ## Only extract from zip files
  files <- files[grepl("\\.zip$", files, ignore.case=TRUE)];
  xl_files <- sapply(files,
                     function(x) {
                       ## If exdir is the default (tempdir()), extract each archive into
                       ## its own subdirectory, named after the zip file
                       if (exdir == tempdir()) {
                         exdir <- file.path(exdir, sub("\\.zip", "", basename(x)));
                       } else {
                         ## Else use the user-supplied exdir, creating it if necessary
                         if (!dir.exists(exdir))
                           dir.create(exdir)
                       }
                       unzip(x, exdir=exdir);
                       ## Return the full paths of the extracted files
                       file.path(exdir, unzip(x, list=TRUE)$Name);
                     });
  return(xl_files);
}
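
## Example usage (sketch): extract Excel files from previously downloaded ABS archives:
##   zip_files <- abs_cat_download(tables$path_zip)  ## 'tables' from abs_cat_tables(..., include_urls=TRUE)
##   xl_files <- abs_cat_unzip(zip_files)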


### Function: abs_read_tss
#' @name abs_read_tss
#' @title Extract data from an ABS time series data file
#' @description This function extracts time series data from ABS data files.
#' @param files Names of one or more ABS data files
#' @param type One of either 'tss' -- ABS Time Series Spreadsheet (the default) or 'css' -- ABS
#'   Data Cube.
#' @param na.rm logical. If \code{TRUE} (default), remove observations containing missing values.
#' @return Returns a data frame, in long format, containing the extracted ABS time series data.
#' @export
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
#' @examples
#'   \donttest{
#'     ## Read specified ABS Excel time series files
#'     tables <- abs_cat_tables("5206.0", releases="Latest", include_urls=TRUE);
#'     downloaded_tables <- abs_cat_download(tables$path_zip, exdir=tempdir())
#'     extracted_files <- abs_cat_unzip(downloaded_tables)
#'     x <- abs_read_tss(extracted_files);
#'   }
abs_read_tss <- function(files, type="tss", na.rm=TRUE) {
  x <- lapply(files,
              function(file)
                abs_read_tss_(file, type=type, na.rm=na.rm));
  z <- do.call(rbind, x);
  rownames(z) <- seq_len(nrow(z));
  return(z);
}


### Function: abs_read_tss_
#' @name abs_read_tss_
#' @title Read ABS time series data file(s)
#' @description This is the internal function that extracts time series data from ABS data files.
#' @importFrom readxl read_excel excel_sheets
#' @importFrom dplyr left_join
#' @importFrom tidyr gather
#' @importFrom stats complete.cases
#' @param file Name of an ABS data file.
#' @param type One of either 'tss' -- ABS Time Series Spreadsheet (the default) or 'css' -- ABS
#'   Data Cube.
#' @param na.rm logical. If \code{TRUE} (default), remove observations containing missing values.
#' @author David Mitchell <david.pk.mitchell@@gmail.com>
#' @keywords internal
abs_read_tss_ <- function(file, type="tss", na.rm=TRUE) {
  ## Avoid 'No visible binding for global variables' note
  { series_start <- series_end <- no_obs <- collection_month <- series_id <- value <- NULL }
  
  sheet_names <- tolower(excel_sheets(file));
  if (!all(c("index", "data1")  %in% sheet_names))
    stop(sprintf("File: %s is not a valid ABS time series file.", basename(file)));
  ## -- Read metadata --
  .meta <- read_excel(file,
                      sheet = grep("index", excel_sheets(file), ignore.case=TRUE, value=TRUE),
                      .name_repair = "minimal");
  ## Return pre-header information from ABS files 
  header_row <- which(sapply(1:nrow(.meta),
                             function(i)
                               grepl("series\\s*id", paste(.meta[i,], collapse=" "),
                                     ignore.case=TRUE)));
  metadata <- .meta;
  names(metadata) <- tolower(gsub("\\s","_",
                                  gsub("\\.", "",
                                       .meta[header_row,])));         ## Rename variables
  metadata <- metadata[-(1:header_row), !is.na(names(metadata))];     ## Drop header rows & empty columns
  metadata <- metadata[complete.cases(metadata),];                    ## Drop NA rows
  metadata <- metadata[grepl("\\w\\d{4,7}\\w", metadata$series_id),]; ## Drop if Series ID invalid 
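  ## excel2Date() is a package-internal helper that converts Excel serial date numbers
  ## to R Date objects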
  metadata <- transform(metadata,
                        series_start     = excel2Date(as.integer(series_start)),
                        series_end       = excel2Date(as.integer(series_end)),
                        no_obs           = as.integer(no_obs),
                        collection_month = as.integer(collection_month));
  ##
  ## Get publication details
  ## -- Catalogue number & name --
  regex_catno_name <- "^.*(\\d{4}\\.\\d+(\\.\\d+)*)\\s+(.+)$";
  catno_name <- sapply(1:header_row,
                       function(i)
                         grep(regex_catno_name, paste(.meta[i,], collapse=" "),
                              ignore.case=TRUE, value=TRUE));
  catno_name <- gsub("(\\s*NA)+", "",
                     sub(regex_catno_name, "\\1|\\3", unlist(catno_name), ignore.case=TRUE));
  catno_name <- trimws(unlist(strsplit(catno_name, split="\\|")));
  ##
  ## -- Table number & name --
  ## Note use of 'word' character    \/here               \/here for 13a, 6b, etc.
  regex_table_name <- "^.*Tables*\\s+(\\w+(\\s+\\w+\\s+\\w+)*)(\\.|:)*\\s+(.+)$";
  ## Note use of alternative separators: .|:                      ^here
  tableno_name <- sapply(1:header_row,
                         function(i)
                           grep(regex_table_name,
                                paste(.meta[i,], collapse=" "),
                                ignore.case=TRUE, value=TRUE));
  tableno_name <- gsub("(\\s*NA)+", "",
                       sub(regex_table_name, "\\1|\\4", unlist(tableno_name), ignore.case=TRUE));
  tableno_name <- trimws(unlist(strsplit(tableno_name, split="\\|")));
  ##
  ## Add publication details to metadata table
  metadata  <- transform(metadata,
                         catalogue_no      = catno_name[1],
                         publication_title = catno_name[2],
                         table_no          = tableno_name[1],
                         table_title       = tableno_name[2]);
  ## Extract data
  data <- lapply(grep("data", excel_sheets(file), ignore.case=TRUE, value=TRUE),
                 function(sheet_name) {
                   z <- read_excel(file, sheet=sheet_name, .name_repair = "minimal");
                   ## Return pre-header information from ABS files 
                   header_row <- which(sapply(1:nrow(z),
                                              function(i)
                                                grepl("series\\s*id", paste(z[i,], collapse=" "), 
                                                      ignore.case=TRUE)));
                   names(z) <- gsub("\\s","_",
                                    gsub("\\.","", z[header_row,]));      ## Rename variables
                   names(z) <- sub("series_id", "date", names(z),         ## Rename Series_ID field
                                   ignore.case=TRUE); 
                   z <- z[-(1:header_row), !is.na(names(z))];             ## Drop empty columns
                   z <- gather(z, series_id, value, -date, convert=TRUE); ## Transform data to key:value pairs
                   z <- transform(z,
                                  date = excel2Date(as.integer(date)),
                                  value = as.numeric(value));
                   names(z) <- tolower(names(z));
                   return(z);
                 });
  data <- do.call(rbind, data);
  data <- left_join(data, metadata, by="series_id");
  if (na.rm)
    data <- data[complete.cases(data),]
  names(data) <- tolower(names(data));
  return(data);
}