data-raw/load_lifetable.R

require(XLConnect)

# Load ABS lifetable(s)
#
# Load and ABS lifetable by year and state from Excel spreadsheets
#
# Reads the Excel spreadsheet file for a given year using the \code{XLConnect}
# package. All worksheets are processed by \link{get_lifetable()}. The
# spreadsheet will contain data for all of Australia, and for later datasets,
# it will provide data broken down by state and the two main territories.
#
# @param year integer year specifying the first year of the lifetable, e.g.
#   2002 for the 2002-2004 lifetable.
# @param path character value providing directory to search for Excel
#   spreadsheet files.
# @return data.frame of lifetable in a long format: state, age, sex,
#   measurement type (e.g. lx, px) and the value.
load_abs_lifetable <- function(year,
                               path='./') {

    all_states <- c('NSW', 'VIC', 'QLD',  'SA',
                    'WA', 'TAS',  'NT', 'ACT',
                    'AU')

    filename <- file.path(path, abs_lifetable_filename(year))

    wb <- loadWorkbook(filename)

    raw_list <- lapply(getSheets(wb), function(sheet) get_lifetable(wb, sheet))

    table_ind <- which(sapply(raw_list, function(x) length(dim(x)) > 0))

    if (length(table_ind) == 1) {
        message(paste('load_abs_lifetable: National data (only) found for',
                      'year', paste0(year, '.')))
        names(raw_list)[table_ind] <- 'AU'
    } else if (length(table_ind) == 9) {
        message(paste('load_abs_lifetable: National, state, and territory',
                      'data found for year',
                      paste0(year, '.')))
        names(raw_list)[table_ind] <- all_states
    }

    do.call(rbind,
            lapply(factor(names(raw_list[table_ind]), levels=all_states),
                   function(s) {
                       data.frame(raw_list[[as.character(s)]], loc=s)
                   }))

}

# Get ABS lifetable from worksheet
#
# Creates a long style data.frame from an XLConnect worksheet object.
#
# Reads a XLConnect worksheet object and attempts to locate lifetable data as
# per the layout of the ABS lifetable Excel spreadsheets. Once located, the
# data is read into a 'long' format data.frame with columns age, sex,
# measurement type and value.
#
# @param wb an XLConnect workbook object.
# @param sheet an XLConnect sheet object within the workbook.
# @return data.frame of lifetable in a long format: age, sex, measurement
#   type (e.g. lx, px) and value of measurement.
get_lifetable <- function(wb, sheet) {

    df <- readWorksheet(wb, sheet, header=FALSE, readStrategy='fast')

    row_ind <- find_age_rows(df)
    if (length(row_ind) == 0 ||
        any(sapply(row_ind, is.na))) return(NA)

    col_ind <- find_lifetable_cols(df)
    if (length(col_ind) == 0 ||
        any(sapply(col_ind, is.na))) return(NA)

    # create a long data frame by age, sex and measure

    df <- readWorksheet(wb,
                        sheet,
                        startRow=min(row_ind),
                        endRow=max(row_ind),
                        startCol=1,
                        endCol=max(col_ind$j),
                        colTypes='numeric',
                        header=FALSE)

    do.call(rbind,
            lapply(1:nrow(col_ind),
                   function(i)
                       data.frame(sex=rep(col_ind$sex[i], 101),
                                  measure=rep(col_ind$measure[i], 101),
                                  age=0:100,
                                  value=df[1:101,col_ind$j[i]])))

}

# Determine filename for ABS lifetable spreadsheet for a given year.
abs_lifetable_filename <- function(year) {

    paste0('3302055001do001_',
           as.character(year),
           as.character(year+2),
           '.xls')
}

# Identify the range of rows in a data.frame with lifetable data
find_age_rows <- function(sheet) {

    if (length(sheet) == 0) return(NA)

    ind <- match(0:99, sheet[,1])

    if (!any(sapply(ind, is.na)) &&
        all(0:99 == (ind-ind[1])) &&
        sheet[ind[1]+100,1] %in% c('100', '100 and over')) {
        c(ind, ind[1]+100)
    } else NA

}

# Identify the columns in a data.frame with lifetable data
find_lifetable_cols <- function(df) {

    mf_row <- which(sapply(1:nrow(df),
                           function(i) {
                               any(grepl('^male', tolower(df[i,]))) &&
                               any(grepl('^female', tolower(df[i,])))
                            }))
    lx_row <- which(sapply(1:nrow(df),
                           function(i) {
                               any(grepl('^lx', tolower(df[i,]))) &&
                               any(grepl('^qx', tolower(df[i,])))
                            }))

    if (length(mf_row) == 0 ||
        length(lx_row) == 0) return(NA)

    male_col   <- which(grepl('^male', tolower(df[mf_row,])))
    female_col <- which(grepl('^female', tolower(df[mf_row,])))

    data.frame(sex=factor(c(rep('M',4), rep('F',4))),
               j=c(male_col+(0:3), female_col+(0:3)),
               measure=factor(as.character(df[lx_row,
                                              c(male_col+(0:3),
                                                female_col+(0:3))]))
              )

}
stephematician/lifetable documentation built on May 30, 2019, 3:17 p.m.