R/utils.r

Defines functions: get_remote_file, validate_directory, get_package_name, get_geo_lookup, is_urban


##' Gets a file from a remote URL
##' 
##' Since the URLs of UK government data appear to change quite
##' frequently, this function automatically caches a local copy.
##'
##' @param url the address of the file to download
##' @param dir the directory to store the cached file in.  If
##' undefined, it defaults to $TEMP/decctools.
##' @param update_cache boolean.  Should the cached file be
##' overwritten?  Default = FALSE
##' @return the file name (local path)
##' @import stringr RCurl
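##' @examples
##' \dontrun{
##' ## Illustrative sketch only: the URL below is a placeholder, not a
##' ## real data source.  The first call downloads and caches the file
##' ## in the given directory; repeating the call reuses the cache.
##' f <- get_remote_file("https://example.org/data.zip", tempdir())
##' f <- get_remote_file("https://example.org/data.zip", tempdir())
##' }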
get_remote_file <- function(url, dir, update_cache=FALSE) {

  ## Currently only processes one URL
  if (length(url) == 0 || is.na(url[1])) {
    stop("A valid URL must be specified.")
  } else if (length(url) > 1) {
    warning("Only one URL at a time is currently supported.  Using the first URL only.")
    url <- url[1]
  }

  dir <- validate_directory(dir)
  
  ## Split the URL to identify the filename
  tmp <- unlist(str_split(url, "/"))
  file_name <- file.path(dir, tmp[length(tmp)])

  ## If the file doesn't exist, then download it
  if (!file.exists(file_name) || update_cache) {

      ## see if the certificates are installed
      certs <- system.file("CurlSSL", "cacert.pem", package="RCurl")
      if (file.exists(certs)) {
          options(RCurlOptions=list(cainfo=certs))
      } else {
          warning(sprintf("Unable to find SSL certificates file.  Running without SSL validation.",
                          certs))
          options(RCurlOptions=list(ssl.verifypeer=FALSE))
      }

      if (!url.exists("http://www.google.com")) {
          stop("You must be connected to the internet to download this file.")
      } else {
          f <- CFILE(file_name, mode="wb")
          curlPerform(url=url, writedata=f@ref)
          close(f)
      }    
  }
  
  return(file_name)
}

##' Validates a user-specified directory
##'
##' Ensures that a user-specified directory exists.  If the argument
##' is missing or NA, a default directory is created in
##' $TEMP/decctools.
##' 
##' @param dir the directory name
##' @return the validated directory name
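##' @examples
##' \dontrun{
##' ## Illustrative sketch: with no argument (or NA) the default cache
##' ## directory $TEMP/decctools is used; otherwise the given path
##' ## (here a made-up example) is created if necessary.
##' validate_directory()
##' validate_directory(file.path(tempdir(), "my_cache"))
##' }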
validate_directory <- function(dir) {
    
  ## Define the cache directory
  if (missing(dir) || is.na(dir)) dir <- file.path(tempdir(), get_package_name())

  ## Make sure it exists before trying to download
  if (!file.exists(dir)) dir.create(dir)

  return(dir)
}

##' Gets the name of this package
##'
##' @return a character string
get_package_name <- function() {
  return ("decctools")
}  

##' Gets a lookup table to match IDs across geographies
##'
##' Provides a lookup table giving the local authority name and
##' corresponding IDs for LAD, MSOA, and LSOA geographies.
##'
##' @note As SOA energy data are not available for Northern Ireland,
##' this method only returns a lookup for England, Wales, and
##' Scotland.  These lookup tables are based on a mix of 2011 and 2012
##' geographies so when fetching data for other years, users may find
##' that codes don't align.  The recommended strategy is to merge on
##' the name where possible.
##' 
##' @return a data frame
##' @export
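##' @examples
##' \dontrun{
##' ## Illustrative sketch: \code{my_data} is a hypothetical data frame
##' ## with a \code{name} column of local authority names; merging on
##' ## name follows the recommendation in the note above.
##' lookup <- get_geo_lookup()
##' merged <- merge(my_data, lookup, by="name")
##' }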
get_geo_lookup <- function() {

    ## Start with the full list of UK local authorities, using the
    ## 2012 codes.
    url <- "https://geoportal.statistics.gov.uk/Docs/Names%20and%20Codes/Local_authority_districts_(UK)_2012_Names_and_Codes.zip"
    file_name <- get_remote_file(url, NA)
    unzip(file_name, exdir=tempdir())
    lad <- read.csv(file.path(tempdir(), "LAD_2012_UK_NC.csv"),
                   stringsAsFactors=FALSE)[,-2]
    names(lad) <- c("LAD", "name")
    
    ## For England and Wales, this is quite straightforward as ONS
    ## provides a single lookup table.
    url <- "https://geoportal.statistics.gov.uk/Docs/Lookups/Output_areas_(2011)_to_lower_layer_super_output_areas_(2011)_to_middle_layer_super_output_areas_(2011)_to_local_authority_districts_(2011)_E+W_lookup.zip"
    file_name <- get_remote_file(url, NA)
    unzip(file_name, exdir=tempdir())
    df <- read.csv(file.path(tempdir(), "OA11_LSOA11_MSOA11_LAD11_EW_LUv2.csv"),
                   stringsAsFactors=FALSE)
    df <- df[,c("LSOA11CD", "MSOA11CD", "LAD11CD")]
    names(df) <- c("LSOA", "MSOA", "LAD")
    
    ## Scotland, however, does things its own way.  The first file
    ## provides the lookup between IGZ/MSOA geographies and local
    ## authorities.  But their Scottish Executive Local Authority
    ## codes don't match up to the ONS codes.  I've manually built a
    ## lookup table from
    ## \url{https://www.scotxed.net/Survey%20documentation/List%20of%20local%20authority%20codes.xls}
    url <- "http://www.scotland.gov.uk/Resource/Doc/933/0109431.txt"
    file_name <- get_remote_file(url, NA)
    scot.df <- read.csv(file_name)
    data(scotland_igz, envir=environment())
    scot.df <- merge(scot.df, get("scotland_igz"), by.x="LA_CODE", by.y="scotex")
    scot.df <- scot.df[,c("IZ_CODE", "lad")]
    names(scot.df) <- c("MSOA", "LAD")
    scot.df <- cbind(LSOA=NA, scot.df)

    ## Northern Ireland does not provide lower-level energy data, so
    ## it is omitted from this lookup.
    tmp <- rbind(df, scot.df)

    ## Merge the full list together, starting from LADs
    tmp <- merge(lad, tmp, all=TRUE)

    ## Reorder and tidy
    tmp <- tmp[,c("name", "LAD", "MSOA", "LSOA")]
    return(tmp)
}

##' Checks if a specified local authority is urban
##'
##' Checks whether a local authority is urban according to a
##' user-specified classification.  See the details section for the
##' classification definitions.
##'
##' @details The urban classifications are described at
##' \url{http://www.ons.gov.uk/ons/guide-method/geography/products/area-classifications/rural-urban-definition-and-la/rural-urban-local-authority--la--classification--england-/index.html}
##' and are summarized as follows:
##'
##' \itemize{
##' 
##' \item MU = Major Urban: districts with either 100,000 people or 50
##' per cent of their population in urban areas with a population of
##' more than 750,000
##' 
##' \item LU = Large Urban: districts with either 50,000 people or 50
##' per cent of their population in one of 17 urban areas with a
##' population between 250,000 and 750,000
##' 
##' \item OU = Other Urban: districts with fewer than 37,000 people or
##' less than 26 per cent of their population in rural settlements and
##' larger market towns
##' 
##' \item SR = Significant Rural: districts with more than 37,000
##' people or more than 26 per cent of their population in rural
##' settlements and larger market towns
##' 
##' \item R50 = Rural-50: districts with at least 50 per cent but less
##' than 80 per cent of their population in rural settlements and
##' larger market towns
##' 
##' \item R80 = Rural-80: districts with at least 80 per cent of their
##' population in rural settlements and larger market towns
##' 
##' }
##'
##'
##' Unfortunately these classifications are only defined for
##' England. For all other countries, assumptions have been made about
##' which LAUs are urban (LU) or rural (SR) using the following data
##' sources:
##' 
##' \itemize{
##' 
##' \item Wales,
##' \url{http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/wales/unitary-authorities/index.html}
##' 
##' \item Scotland,
##' \url{http://www.scotland.gov.uk/Publications/2004/06/19498/38788}
##'
##' \item Northern Ireland,
##' \url{http://www.nisra.gov.uk/geography/default.asp10.htm}
##'
##' }
##'
##' @param lad a character vector of LAD ids (post-2011 format)
##' @param urban a character vector specifying which of the above
##' classifications to consider as urban
##' @return a boolean vector of \code{length(lad)} indicating whether
##' the corresponding local authority is urban
##' @export
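##' @examples
##' \dontrun{
##' ## Illustrative sketch: \code{lads} is a hypothetical vector of
##' ## post-2011 LAD codes.
##' lads <- c("E08000025", "E07000030")
##' is_urban(lads)
##' ## Count only Major Urban districts as urban
##' is_urban(lads, urban="MU")
##' }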
is_urban <- function(lad, urban=c("MU", "LU", "OU")) {
    LAD_metadata <- NULL # avoids an R CMD check NOTE about an undefined global
    data(LAD_metadata, envir=environment())
    ## Match in input order so the result has one entry per element of lad
    tmp <- LAD_metadata[match(lad, LAD_metadata$new), ]
    return(tmp$urban_class %in% urban)
}