This notebook does two things:
*.Rds
files.devtools::install_github('ukgovdatascience/eesectors') library(eesectors) #devtools::load_all() knitr::opts_chunk$set( echo = TRUE, message = FALSE, warning = FALSE, error = FALSE )
# Note that in places I used :: to avoid confusion: there are duplicated # functions with common sames like select and filter library(dplyr) library(tidyr) library(testthat) #library(eesectors)
First, load the data extracted from the spreadsheet OFFICIAL_working_file_dcms_V13.xlsm
. These are stored as *.Rds
files within the ../../OFFICIAL/cleaned/
folder.
Note that you must remove the password from this file for this to work.
# These data come from the worksheets `1997 Use`:`2015 Use`. input <- '~/Documents/DSprojects/eesectors_working/working_file_dcms_V13.xlsm' ABS <- extract_ABS_data(input) # DCMS_sectors <- extract_DCMS_sectors(input): for now use data built into package GVA <- extract_GVA_data(input) SIC91 <- extract_SIC91_data(input) tourism <- extract_tourism_data(input)
To recreate the working file as it appears in the spreadsheet we must:
ABS
and swap in values from `SIC91)sectors
to ABS_2015
to get the 2 digit SIC.gva
.GVA_by_sector
dataframe expected by the eesectors
package.eesectors::GVA_by_sector_2015
.GVA_by_sector
and eesectors::GVA_by_sector_2015
.The SIC91
data needs to be merged into ABS
as the first operation.
This replaces the SIC 91 data from the Annual Business Survey (ABS) with sales data.
# Remove all the rows that correspond to SIC 91 based on the unique SIC codes in # SIC91. SIC91_mask <- unique(SIC91$SIC) SIC91_mask ABS_no91 <- ABS %>% dplyr::filter(!SIC %in% SIC91_mask) ABS_no91
Our expectation is now that there are not any 91 SIC codes in the ABS dataset that are also classed as DCMS sectors.
Test this here:
test_that( 'sectors with 91 SIC 2 will match properly.', { # Create objects that contain just the rows which have SIC codes beginning # with 91 (there are some remaining even after the above step) sectors_91 <- DCMS_sectors$SIC[grepl('^91', DCMS_sectors$SIC)] ABS_91 <- ABS_no91$SIC[grepl('^91', ABS_no91$SIC)] # Check that none of the remaining 91 values in ABS are going to get matched # against DCMS sectors in sectors: expect_equal(0, sum(sectors_91 %in% ABS_91)) # Check that when we merge ABS with SIC91, the new SIC91 entries are going # to match the SIC codes for DMCS sectors that are in sectors. Note that not # all of the SIC4 codes in SIC91 are present in DCMS sectors, so check the # other way, i.e. all SIC codes starting 91 in sectors$SIC are present in # SIC91$SIC expect_equal(length(sectors_91), sum(sectors_91 %in% SIC91$SIC)) } )
Now merge ABS_no91
with SIC91
.
ABS_91 <- dplyr::full_join( ABS_no91, SIC91 ) ABS_91
Test that this merge worked.
test_that( 'merge of ABS_91 and SIC91 worked as expected.', { # Our expectation is that there should now be the same number of rows in # ABS91 as ABS, since we have replaced the 91 codes. expect_equal(nrow(SIC91), nrow(ABS) - nrow(ABS_no91)) expect_equal(nrow(ABS), nrow(ABS_91)) # Also we expect that there are valid abs values for these rows! # Extract out those rows: merged_rows <- ABS_91$ABS[grepl('^91\\.',ABS_91$SIC)] expect_true(all(!is.na(merged_rows))) expect_true(all(merged_rows > 0)) } )
Since the ABS_91
data only run to 2014, first duplicate the 2014 to be used for 2015:
# duplicate the 2014 data for 2015 ABS_2015 <- ABS_91 %>% dplyr::filter(year == 2014) %>% dplyr::mutate(year = 2015) %>% dplyr::bind_rows(ABS_91) ABS_2015
test_that( '2014 ABS data was replicated as 2015.', { # Check that the 2014 and 2015 ABS GVA are identical for 2014 and 2015 GVA_2014 <- ABS_2015 %>% dplyr::filter(year == 2014) GVA_2015 <- ABS_2015 %>% dplyr::filter(year == 2015) expect_equal(GVA_2014$ABS_ind_GVA, GVA_2015$ABS_ind_GVA) } )
GVA_sectors <- DCMS_sectors %>% dplyr::left_join(ABS_2015) %>% dplyr::rename(ABS_ind_GVA = ABS) GVA_sectors
test_that( 'merge of sectors and ABS_2015 worked as expected', { # Check that all of the columns from sectors are now in GVA expect_true(all(colnames(DCMS_sectors) %in% colnames(GVA_sectors))) # Check that all of the columns from ABS_2015 are now in GVA. Note # that we renamed abs, so we should check for that too. expect_equal(sum(colnames(ABS_2015) %in% colnames(GVA_sectors)), 3) # Check that there aren't a crazy amount of NAs in the dataframe expect_equal(c('DOMVAL', 'ABS', 'SIC'), na_cols(ABS_2015)) # Check the proportion of these cols that are NA. NOte that these values are # somewhat arbitrary. In future years these will need to be reset. expect_lt(sum(is.na(ABS_2015$SIC))/nrow(ABS_2015), 0.01) expect_lt(sum(is.na(ABS_2015$DOMVAL))/nrow(ABS_2015), 0.01) expect_lt(sum(is.na(ABS_2015$abs))/nrow(ABS_2015), 0.2) } )
Then calculate the 2 digit SIC total GVA (from ABS_91
) for each of the DCMS sectors.
Extract unique 2 digit SICs:
# Extract all the unique 2 digit SICs SIC2_unique <- GVA_sectors %>% dplyr::filter(!is.na(SIC2)) %$% SIC2 %>% unique SIC2_unique
We expect there to be 23 of these, so run a quick check:
test_that( 'there are the right number of unique SIC2s', { expect_equal(length(SIC2_unique), 23) } )
Next we use SIC2_unique
to extract the 2 digit SIC totals from ABS_91
.
# This will form the denominator in our division denom <- ABS_2015 %>% dplyr::filter(SIC %in% SIC2_unique) %>% dplyr::mutate(ABS_2digit_GVA = ABS) %>% dplyr::select(year, SIC2 = SIC, ABS_2digit_GVA) denom
Now join this back into GVA
.
# Join back into GVA for division GVA_sectors <- GVA_sectors %>% dplyr::filter(present) %>% dplyr::left_join(denom) %>% dplyr::mutate( perc_split = ABS_ind_GVA / ABS_2digit_GVA ) # At this point there will be two rows that contain NAs, both as a result of # tourism, so check for and drop these. GVA_sectors <- GVA_sectors %>% dplyr::filter(!(is.na(year) & is.na(ABS_ind_GVA))) GVA_sectors
Test that this worked correctly.
test_that( 'there are no unexpectedly missing values in GVA.', { # See http://stackoverflow.com/questions/10574061/show-columns-with-nas-in-a-data-frame na_cols <- function(df) colnames(df)[unlist(lapply(df, function(x) any(is.na(x))))] # We only expect NAs in DOMVAL at this point expect_equal('DOMVAL', na_cols(GVA_sectors)) } )
Now, multiply perc_split
by GVA
after joining with GVA
to get the BB16_GVA
(column Q in Working File
worksheet).
GVA_sectors <- GVA_sectors %>% dplyr::left_join( GVA, by = c('SIC2' = 'SIC', 'year') ) %>% dplyr::mutate( BB16_GVA = perc_split * GVA ) GVA_sectors
Finally calculate the sums across all sectors and years.
GVA_by_sector <- GVA_sectors %>% group_by(year, sector) %>% dplyr::summarise( GVA = sum(BB16_GVA) ) GVA_by_sector
Total UK GVA is included in the gva
dataset, this can be added to the data here:
gva
# Use create a table to merge in with GVA_by_sector GVA_UK <- GVA %>% dplyr::filter(grepl("total.*intermediate.*", SIC)) %>% dplyr::transmute( year, sector = "UK", GVA ) GVA_UK
Merge this into GVA_by_sector
:
# Full join it together GVA_by_sector <- GVA_by_sector %>% dplyr::full_join(GVA_UK) GVA_by_sector
Tourism data is provided in a separate spreadsheet and imported in the tourism
object
GVA_by_sector <- tourism %>% dplyr::transmute(year, sector = 'tourism', GVA) %>% dplyr::full_join(GVA_by_sector) GVA_by_sector
test_that( 'GVA_by_sector tourism values match those in tourism', { # Extract tourism gva tourism_gva <- tourism %>% dplyr::arrange(year) GVA_by_sector_gva <- GVA_by_sector %>% dplyr::filter(sector == 'tourism') %>% dplyr::arrange(year) # Check that this merged without error. expect_equal(tourism_gva$GVA, GVA_by_sector_gva$GVA) } )
Also need to add the $overlap
from tourism to the all_dcms
totals in GVA_by_sector
GVA_by_sector <- tourism %>% dplyr::transmute( year, sector = 'all_dcms', overlap ) %>% right_join(GVA_by_sector) %>% dplyr::mutate( GVA = ifelse(!is.na(overlap), overlap + GVA, GVA) ) %>% dplyr::select(-overlap) GVA_by_sector
GVA_by_sector
dataframe expected by the eesectors package# Build df to match eesectors::GVA_by_sector_2016 GVA_by_sector <- GVA_by_sector %>% # The 2016 SFR only inc dplyr::filter( year %in% 2010:2015 ) %>% dplyr::ungroup() %>% dplyr::mutate( # Round values GVA = round(GVA, 2), # Convert sector to factor sector = factor(sector), # Convert year to integer year = as.integer(year) ) %>% dplyr::select( sector, year, GVA ) %>% dplyr::arrange(year, sector) GVA_by_sector
eesectors::GVA_by_sector_2015
# Tourism comes from Tourism tab in excel workbook # Abs data not sue for some SIC codes (SIC 91) test_case_2016 <- GVA_by_sector_2016 %>% dplyr::mutate(GVA = round(GVA)) %>% dplyr::arrange(sector)
test_that( 'calculated values match those in the SFR.', { # Some general checks about the object expect_is(GVA_by_sector, c("tbl_df", "tbl", "data.frame")) expect_equal(nrow(GVA_by_sector), 54) expect_equal(levels(eesectors::GVA_by_sector_2016$sector), levels(GVA_by_sector$sector)) test_case_new <- GVA_by_sector %>% dplyr::arrange(sector, year) test_case_2016 <- eesectors::GVA_by_sector_2016 %>% dplyr::arrange(sector, year) # Check equality of GVA within a given tolerance expect_equal(test_case_new$GVA, test_case_2016$GVA, tolerance = 1) # Check that years are equal expect_equal(test_case_new$year, test_case_2016$year) # Check that sectors are equal expect_equal(test_case_new$sector, test_case_2016$sector) } ) # If all good, save the dataframe out GVA_by_sector %>% saveRDS('GVA_by_sector.Rds')
GVA_by_sector
and eesectors::GVA_by_sector_2015
Create a table of comparisons to see all values which do not match perfectly (due to rounding errors).
test <- dplyr::full_join(GVA_by_sector, (eesectors::GVA_by_sector_2016 %>% dplyr::rename(GVA_2016 = GVA))) test %>% dplyr::mutate(abs_diff = abs(GVA - GVA_2016)) %>% dplyr::filter(abs_diff > 0) %>% knitr::kable()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.