This notebook does two things:

Setup

devtools::install_github('ukgovdatascience/eesectors')
library(eesectors)
#devtools::load_all()


knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  error = FALSE
)
# Note that in places I used :: to avoid confusion: there are duplicated
# functions with common sames like select and filter

library(dplyr)
library(tidyr)
library(testthat)
#library(eesectors)

First, load the data extracted from the spreadsheet OFFICIAL_working_file_dcms_V13.xlsm. These are stored as *.Rds files within the ../../OFFICIAL/cleaned/ folder. Note that you must remove the password from this file for this to work.

# These data come from the worksheets `1997 Use`:`2015 Use`.

input <- '~/Documents/DSprojects/eesectors_working/working_file_dcms_V13.xlsm'

ABS <- extract_ABS_data(input)
# DCMS_sectors <- extract_DCMS_sectors(input): for now use data built into package
GVA <- extract_GVA_data(input)
SIC91 <- extract_SIC91_data(input)
tourism <- extract_tourism_data(input)

Recreating the Working File worksheet

To recreate the working file as it appears in the spreadsheet we must:

  1. Deal with the SIC 91 data (i.e. remove SIC 91 data from ABS and swap in values from `SIC91)
  2. Duplicate the 2014 ABS values to use for 2015 (2015 values not being available)
  3. Merge the DCMS sectors to ABS_2015 to get the 2 digit SIC.
  4. Calculate sums across sectors and years.
  5. Add in total UK GVA from gva.
  6. Match in Tourism data.
  7. Add tourism overlap.
  8. Build the GVA_by_sector dataframe expected by the eesectors package.
  9. Test the dataframe against the eesectors::GVA_by_sector_2015.
  10. Print differences between GVA_by_sector and eesectors::GVA_by_sector_2015.

1. Deal with SIC 91

The SIC91 data needs to be merged into ABS as the first operation. This replaces the SIC 91 data from the Annual Business Survey (ABS) with sales data.

# Remove all the rows that correspond to SIC 91 based on the unique SIC codes in
# SIC91.

SIC91_mask <- unique(SIC91$SIC)

SIC91_mask

ABS_no91 <- ABS %>% dplyr::filter(!SIC %in% SIC91_mask)

ABS_no91 

Our expectation is now that there are not any 91 SIC codes in the ABS dataset that are also classed as DCMS sectors.

Test this here:

test_that(
  'sectors with 91 SIC 2 will match properly.',
  {

    # Create objects that contain just the rows which have SIC codes beginning
    # with 91 (there are some remaining even after the above step)

    sectors_91 <- DCMS_sectors$SIC[grepl('^91', DCMS_sectors$SIC)]
    ABS_91 <- ABS_no91$SIC[grepl('^91', ABS_no91$SIC)]

    # Check that none of the remaining 91 values in ABS are going to get matched
    # against DCMS sectors in sectors:

    expect_equal(0, sum(sectors_91 %in% ABS_91))

    # Check that when we merge ABS with SIC91, the new SIC91 entries are going 
    # to match the SIC codes for DMCS sectors that are in sectors. Note that not
    # all of the SIC4 codes in SIC91 are present in DCMS sectors, so check the 
    # other way, i.e. all SIC codes starting 91 in sectors$SIC are present in 
    # SIC91$SIC

    expect_equal(length(sectors_91), sum(sectors_91 %in% SIC91$SIC))

    }
  )

Now merge ABS_no91 with SIC91.

ABS_91 <- dplyr::full_join(
  ABS_no91,
  SIC91
)

ABS_91

Test that this merge worked.

test_that(
  'merge of ABS_91 and SIC91 worked as expected.',
  {

    # Our expectation is that there should now be the same number of rows in
    # ABS91 as ABS, since we have replaced the 91 codes.

    expect_equal(nrow(SIC91), nrow(ABS) - nrow(ABS_no91))
    expect_equal(nrow(ABS), nrow(ABS_91))

    # Also we expect that there are valid abs values for these rows!
    # Extract out those rows:

    merged_rows <- ABS_91$ABS[grepl('^91\\.',ABS_91$SIC)]

    expect_true(all(!is.na(merged_rows)))
    expect_true(all(merged_rows > 0))

    }
  )

2. Duplicate 2014 ABS data for 2015

Since the ABS_91 data only run to 2014, first duplicate the 2014 to be used for 2015:

# duplicate the 2014 data for 2015

ABS_2015 <- ABS_91 %>%
  dplyr::filter(year == 2014) %>%
  dplyr::mutate(year = 2015) %>%
  dplyr::bind_rows(ABS_91)

ABS_2015 
test_that(
  '2014 ABS data was replicated as 2015.',
  {

    # Check that the 2014 and 2015 ABS GVA are identical for 2014 and 2015

    GVA_2014 <- ABS_2015 %>%
      dplyr::filter(year == 2014)

    GVA_2015 <- ABS_2015 %>%
      dplyr::filter(year == 2015)

    expect_equal(GVA_2014$ABS_ind_GVA, GVA_2015$ABS_ind_GVA)

  }
)

3. Merge sectors and ABS datasets

GVA_sectors <- DCMS_sectors %>%
  dplyr::left_join(ABS_2015) %>%
  dplyr::rename(ABS_ind_GVA = ABS)

GVA_sectors
test_that(
  'merge of sectors and ABS_2015 worked as expected',
  {

    # Check that all of the columns from sectors are now in GVA

    expect_true(all(colnames(DCMS_sectors) %in% colnames(GVA_sectors)))

    # Check that all of the columns from ABS_2015 are now in GVA. Note
    # that we renamed abs, so we should check for that too.

    expect_equal(sum(colnames(ABS_2015) %in% colnames(GVA_sectors)), 3)

    # Check that there aren't a crazy amount of NAs in the dataframe

    expect_equal(c('DOMVAL', 'ABS', 'SIC'), na_cols(ABS_2015))

    # Check the proportion of these cols that are NA. NOte that these values are
    # somewhat arbitrary. In future years these will need to be reset.

    expect_lt(sum(is.na(ABS_2015$SIC))/nrow(ABS_2015), 0.01)

    expect_lt(sum(is.na(ABS_2015$DOMVAL))/nrow(ABS_2015), 0.01)

    expect_lt(sum(is.na(ABS_2015$abs))/nrow(ABS_2015), 0.2)

  }
)

Then calculate the 2 digit SIC total GVA (from ABS_91) for each of the DCMS sectors. Extract unique 2 digit SICs:

# Extract all the unique 2 digit SICs

SIC2_unique <- GVA_sectors %>%
  dplyr::filter(!is.na(SIC2)) %$%
  SIC2 %>%
  unique

SIC2_unique

We expect there to be 23 of these, so run a quick check:

test_that(
  'there are the right number of unique SIC2s',
  {
    expect_equal(length(SIC2_unique), 23)
  }
)

Next we use SIC2_unique to extract the 2 digit SIC totals from ABS_91.

# This will form the denominator in our division

denom <- ABS_2015 %>%
  dplyr::filter(SIC %in% SIC2_unique) %>%
  dplyr::mutate(ABS_2digit_GVA = ABS) %>%
  dplyr::select(year, SIC2 = SIC, ABS_2digit_GVA)

denom

Now join this back into GVA.

# Join back into GVA for division

GVA_sectors <- GVA_sectors %>% 
  dplyr::filter(present) %>%
  dplyr::left_join(denom) %>%
  dplyr::mutate(
    perc_split = ABS_ind_GVA / ABS_2digit_GVA
  )

# At this point there will be two rows that contain NAs, both as a result of
# tourism, so check for and drop these.

GVA_sectors <- GVA_sectors %>% 
  dplyr::filter(!(is.na(year) & is.na(ABS_ind_GVA)))

GVA_sectors

Test that this worked correctly.

test_that(
  'there are no unexpectedly missing values in GVA.',
  {

    # See http://stackoverflow.com/questions/10574061/show-columns-with-nas-in-a-data-frame

    na_cols <- function(df) colnames(df)[unlist(lapply(df, function(x) any(is.na(x))))]

    # We only expect NAs in DOMVAL at this point

    expect_equal('DOMVAL', na_cols(GVA_sectors))
  }
)

Now, multiply perc_split by GVA after joining with GVA to get the BB16_GVA (column Q in Working File worksheet).

GVA_sectors <- GVA_sectors %>%
   dplyr::left_join(
     GVA, by = c('SIC2' = 'SIC', 'year')
   ) %>%
  dplyr::mutate(
    BB16_GVA = perc_split * GVA
  )

GVA_sectors

4. Calculate sums across sectors and years

Finally calculate the sums across all sectors and years.

GVA_by_sector <- GVA_sectors %>%
  group_by(year, sector) %>%
  dplyr::summarise(
    GVA = sum(BB16_GVA)
    )

GVA_by_sector 

Total UK GVA is included in the gva dataset, this can be added to the data here:

5. Add in total UK GVA from gva

# Use create a table to merge in with GVA_by_sector

GVA_UK <- GVA %>%
  dplyr::filter(grepl("total.*intermediate.*", SIC)) %>%
  dplyr::transmute(
    year, 
    sector = "UK", 
    GVA
  )

GVA_UK

Merge this into GVA_by_sector:

# Full join it together

GVA_by_sector <- GVA_by_sector %>%
  dplyr::full_join(GVA_UK)

GVA_by_sector

6. Match in Tourism data

Tourism data is provided in a separate spreadsheet and imported in the tourism object

GVA_by_sector <- tourism %>%
  dplyr::transmute(year, sector = 'tourism', GVA) %>%
  dplyr::full_join(GVA_by_sector)

GVA_by_sector 
test_that(
  'GVA_by_sector tourism values match those in tourism',
  {
    # Extract tourism gva

    tourism_gva <- tourism %>% 
      dplyr::arrange(year)

    GVA_by_sector_gva <- GVA_by_sector %>% 
      dplyr::filter(sector == 'tourism') %>% 
      dplyr::arrange(year)

    # Check that this merged without error.

    expect_equal(tourism_gva$GVA, GVA_by_sector_gva$GVA)

    }
  )

7. Add tourism overlap

Also need to add the $overlap from tourism to the all_dcms totals in GVA_by_sector

GVA_by_sector <- tourism %>%
  dplyr::transmute(
    year, sector = 'all_dcms', overlap
  ) %>%
  right_join(GVA_by_sector) %>%
  dplyr::mutate(
    GVA = ifelse(!is.na(overlap), overlap + GVA, GVA)
    ) %>%
  dplyr::select(-overlap)


GVA_by_sector

8. Build the GVA_by_sector dataframe expected by the eesectors package

# Build df to match eesectors::GVA_by_sector_2016

GVA_by_sector <- GVA_by_sector %>%
  # The 2016 SFR only inc
  dplyr::filter(
    year %in% 2010:2015
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    # Round values
    GVA = round(GVA, 2),
    # Convert sector to factor
    sector = factor(sector),
    # Convert year to integer
    year = as.integer(year)
  ) %>%
  dplyr::select(
    sector, year, GVA
  ) %>%
  dplyr::arrange(year, sector)

GVA_by_sector

9. Test the dataframe against the eesectors::GVA_by_sector_2015

# Tourism comes from Tourism tab in excel workbook

# Abs data not sue for some SIC codes (SIC 91)

test_case_2016 <- GVA_by_sector_2016 %>% 
  dplyr::mutate(GVA = round(GVA)) %>%
  dplyr::arrange(sector)
test_that(
  'calculated values match those in the SFR.',
  {
      # Some general checks about the object

    expect_is(GVA_by_sector, c("tbl_df", "tbl", "data.frame"))
    expect_equal(nrow(GVA_by_sector), 54)
    expect_equal(levels(eesectors::GVA_by_sector_2016$sector), levels(GVA_by_sector$sector))

    test_case_new <- GVA_by_sector %>%
      dplyr::arrange(sector, year)

    test_case_2016 <- eesectors::GVA_by_sector_2016 %>%
      dplyr::arrange(sector, year)

    # Check equality of GVA within a given tolerance

    expect_equal(test_case_new$GVA, test_case_2016$GVA, tolerance = 1)

    # Check that years are equal

    expect_equal(test_case_new$year, test_case_2016$year)

    # Check that sectors are equal

    expect_equal(test_case_new$sector, test_case_2016$sector)
    }

  )

# If all good, save the dataframe out

GVA_by_sector %>% saveRDS('GVA_by_sector.Rds')

10. Print differences between GVA_by_sector and eesectors::GVA_by_sector_2015

Create a table of comparisons to see all values which do not match perfectly (due to rounding errors).

test <- dplyr::full_join(GVA_by_sector, (eesectors::GVA_by_sector_2016 %>% dplyr::rename(GVA_2016 = GVA)))

test %>%
  dplyr::mutate(abs_diff = abs(GVA - GVA_2016)) %>%
  dplyr::filter(abs_diff > 0) %>%
  knitr::kable()


ukgovdatascience/eesectors documentation built on Sept. 11, 2020, 12:19 p.m.