R/ONS.R

#' Read and return name data for England and Wales from the
#'   Office of National Statistics
#'
#' Download data from the ONS website and convert into a single data frame
#' Downloading and converting data will take some time. The resultant dataset is
#' provided as \code{\link{ewnames}}
#'
#' @return Data frame with columns for Name, Year, and counts for 
#'   gender incidence
#' @keywords england
#' @seealso \code{\link{readScotlandNames}}, \code{\link{readNISRANames}}, 
#'   \code{\link{readSSANames}}
#' @export
#' @importFrom RCurl getBinaryURL

readONSNames <- function() {
  # if needed, the path to perl can be set as an argument here
  if (length(xlsFormats()) != 2) {
    installXLSXsupport()
  }

  ## ONS download
  downloadONS <- function() {
    ons.base.url <- "http://www.ons.gov.uk/ons"
    ons.index <- "rel/vsob1/baby-names--england-and-wales/index.html"
    xpath.release <- "//div[@class = 'previous-releases-results']//a"
    xpath.xls <- "//a[contains(@href, 'xls')]"
    table.selector <- "//div[@class = 'srp-pubs-list']//a"
    summary.years <- "1904-1994"
    # somewhat fragile path to individual data pages
    indexGet <- function() {
      # index page for data 
      year.pages <- docsFromIndex(file.path(ons.base.url, ons.index),
                                  xpath.release)
      
      year.pages <- paste0(ons.base.url, year.pages)
      # drop summary page for now
      year.pages <- year.pages[!grepl(summary.years, year.pages)]
      return(year.pages)
    }
    
    tableGet <- function(url, filter = "reference") {
      tables <- docsFromIndex(url, table.selector)

      tables <- tables[grepl(filter, tables)]
      
      excel.out <- docsFromIndex(paste0(ons.base.url, tables), xpath.xls)

      return(excel.out)
    }
    
    assets.path <- file.path(tempdir(), "assets", "ons")
    dir.create(assets.path, recursive = TRUE)
    year.pages <- indexGet()
    # call tableGet for each year linked in the index
    # flatten output further
    year.tables <- paste0(ons.base.url, sapply(year.pages, tableGet))

    # download function passed to lapply because
    # we have multiple excel files from a single index
    dlname <- function(url) {
      fname <- uuid()
      writeBin(getBinaryURL(url), file.path(assets.path, fname))
    }
    lapply(year.tables, dlname)
    return(assets.path)
  }
  # reading excel files and converting into tractable form
  # wrapped into a function because ONS stores many excel 
  # sheets (2 per year)
  wrapXLS <- function(file) {
    year.regex <- "[0-9]{4}"
    # find which sheet we need to look at as well as
    # which gender are we looking at.
    sheet.names <- sheetNames(file)
    sheet.loc <- grepl("(Boy|Girl)([^ ]*)? names", sheet.names)
    # Infer year from table of contents sheet
    contents <- read.xls(file, sheet = 1)
    year.string <- names(contents)[grepl(year.regex, names(contents))]
    year <- regmatches(year.string, regexpr(year.regex, year.string))
    if (any(sheet.loc)) {
      # only will happen if ONS changes structure of xls files
      if (sum(sheet.loc) > 1) {
        stop("too many sheets found")
      }
      sheet.number <- which(sheet.loc)

      xls.df <- read.xls(file, sheet = sheet.number, method = "csv",
                         skip = 2, stringsAsFactors = FALSE)
      # sheets contain a consierable number of empty columns.
      good.cols <- names(xls.df)[!grepl("X(\\.?[0-9]*)?", names(xls.df))]
      xls.df <- xls.df[, good.cols]
      xls.df[, "Sex"] <- ifelse(grepl("Boy", sheet.names[sheet.loc]), 
                                "M", "F")
      xls.df[, "Year"] <- as.numeric(year)
    } else {
      # may happen if ONS changes structure or download is corrupted
      stop("no full sheet found")
    }

    # cleanup df
    xls.df <- xls.df[, c("Name", "Count", "Sex", "Year")]
    xls.df <- cleanupNC(xls.df)

    return(xls.df)
  }

  files <- downloadONS()

  alluk.df <- do.call(rbind, lapply(
    list.files(files, full.names = TRUE),
    wrapXLS
  ))

  alluk.df <- ddply(alluk.df, "Year", function(x) {
                    cbind(matchSexes(x), Year = x[1, "Year"])
                  })
  unlink(files, recursive = TRUE)
  closeAllConnections()
  return(alluk.df)
}
OpenGenderTracking/globalnamedata documentation built on May 7, 2019, 9:03 p.m.