data-raw/oes_employment_read.R

#' Read in the employment data from OES
#'
#' This function loads in the employment data at occupation level from the Bureau of Labor Statistics (BLS). Variable
#' names are made consistent across the annual datasets, SOC2000 and SOC2018 data is converted to SOC2010 using the BLS crosswalks
#' and all files are appended into a single occupational panel covering 2004 to 2019.
#'
#' @return A clean database of US occupational employment figures
#' @export
oes_employment_read <- function() {


  ####### read in the detailed soc codes to filter the excel sheet, using the SOC definitions available from BLS

  ## SOC2000 - for years 2004-2009
detailed.soc.2000 <- as.data.table(readxl::read_xlsx("data/oes employment/Copy of soc-definitions-2000.xlsx",
                                      range="A7:C828")[,"2000 SOC code"])
detailed.soc.2000 <- detailed.soc.2000 %>% dplyr::rename("occ_code" = "2000 SOC code")
detailed.soc.2000$level <- rep("detailed",dim(detailed.soc.2000)[1])

  ## SOC2010 - for years 2010-2018
detailed.soc.2010 <- as.data.table(readxl::read_xls("data/oes employment/soc_2010_definitions.xls",
                                                     range="A7:C847")[,"SOC Code"])
detailed.soc.2010 <- detailed.soc.2010 %>% dplyr::rename("occ_code" = "SOC Code")
detailed.soc.2010$level <- rep("detailed",dim(detailed.soc.2010)[1])

  ## SOC2018 - for years 2019 onwards
detailed.soc.2018 <- as.data.table(readxl::read_xlsx("data/oes employment/Copy of soc_2018_definitions.xlsx",
                                                    range="A8:C1455")[,c("SOC Group","SOC Code")])
detailed.soc.2018 <- detailed.soc.2018 %>% dplyr::rename("occ_code" = "SOC Code","level"="SOC Group") %>% filter(level=="Detailed")

  ## read in each years data file

data.2004 <- as.data.table(readxl::read_xls("data/oes employment/oesm04nat/national_may2004_dl.xls")[,c("occ_code","tot_emp")])
data.2005 <- as.data.table(readxl::read_xls("data/oes employment/oesm05nat/national_may2005_dl.xls")[,c("occ_code","tot_emp")])
data.2006 <- as.data.table(readxl::read_xls("data/oes employment/oesm06nat/national_may2006_dl.xls")[,c("occ_code","tot_emp")])
data.2007 <- as.data.table(readxl::read_xls("data/oes employment/oesm07nat/national_May2007_dl.xls")[,c("occ_code","tot_emp")])
data.2008 <- as.data.table(readxl::read_xls("data/oes employment/oesm08nat/national__M2008_dl.xls")[,c("occ_code","tot_emp")])
data.2009 <- as.data.table(readxl::read_xls("data/oes employment/oesm09nat/national_dl.xls")[,c("occ_code","tot_emp")])
data.2010 <- as.data.table(readxl::read_xls("data/oes employment/oesm10nat/national_M2010_dl.xls")[,c("OCC_CODE","TOT_EMP")])
data.2011 <- as.data.table(readxl::read_xls("data/oes employment/oesm11nat/national_M2011_dl.xls")[,c("OCC_CODE","TOT_EMP")])
data.2012 <- as.data.table(readxl::read_xls("data/oes employment/oesm12nat/oesm12nat/national_M2012_dl.xls")[,c("OCC_CODE","TOT_EMP")])
data.2013 <- as.data.table(readxl::read_xls("data/oes employment/oesm13nat/oesm13nat/national_M2013_dl.xls")[,c("OCC_CODE","TOT_EMP")])

data.2014 <- as.data.table(readxl::read_xlsx("data/oes employment/oesm14nat/oesm14nat/national_M2014_dl.xlsx")[,c("OCC_CODE","TOT_EMP")])
data.2015 <- as.data.table(readxl::read_xlsx("data/oes employment/oesm15nat/oesm15nat/national_M2015_dl.xlsx")[,c("OCC_CODE","TOT_EMP")])
data.2016 <- as.data.table(readxl::read_xlsx("data/oes employment/oesm16nat/oesm16nat/national_M2016_dl.xlsx")[,c("OCC_CODE","TOT_EMP")])
data.2017 <- as.data.table(readxl::read_xlsx("data/oes employment/oesm17nat/oesm17nat/national_M2017_dl.xlsx")[,c("OCC_CODE","TOT_EMP")])
data.2018 <- as.data.table(readxl::read_xlsx("data/oes employment/oesm18nat/oesm18nat/national_M2018_dl.xlsx")[,c("OCC_CODE","TOT_EMP")])
data.2019 <- as.data.table(readxl::read_xlsx("data/oes employment/oesm19nat/oesm19nat/national_M2019_dl.xlsx")[,c("occ_code","tot_emp")])

  ## for each annual dataset, merge in the list of detailed SOC occupations, keep only those occupations, retain only occupation code and total employment

data.2004 <- merge(data.2004,detailed.soc.2000, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))
data.2005 <- merge(data.2005,detailed.soc.2000, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))
data.2006 <- merge(data.2006,detailed.soc.2000, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))
data.2007 <- merge(data.2007,detailed.soc.2000, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))
data.2008 <- merge(data.2008,detailed.soc.2000, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))
data.2009 <- merge(data.2009,detailed.soc.2000, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))

data.2010 <- data.2010 %>% select(c("OCC_CODE","TOT_EMP")) %>% dplyr::rename("occ_code" = "OCC_CODE","tot_emp" = "TOT_EMP")
data.2010 <- merge(data.2010,detailed.soc.2010, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))

data.2011 <- data.2011 %>% select(c("OCC_CODE","TOT_EMP")) %>% dplyr::rename("occ_code" = "OCC_CODE","tot_emp" = "TOT_EMP")
data.2011 <- merge(data.2011,detailed.soc.2010, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))

data.2012 <- data.2012 %>% select(c("OCC_CODE","TOT_EMP")) %>% dplyr::rename("occ_code" = "OCC_CODE","tot_emp" = "TOT_EMP")
data.2012 <- merge(data.2012,detailed.soc.2010, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))

data.2013 <- data.2013 %>% select(c("OCC_CODE","TOT_EMP")) %>% dplyr::rename("occ_code" = "OCC_CODE","tot_emp" = "TOT_EMP")
data.2013 <- merge(data.2013,detailed.soc.2010, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))

data.2014 <- data.2014 %>% select(c("OCC_CODE","TOT_EMP")) %>% dplyr::rename("occ_code" = "OCC_CODE","tot_emp" = "TOT_EMP")
data.2014 <- merge(data.2014,detailed.soc.2010, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))

data.2015 <- data.2015 %>% select(c("OCC_CODE","TOT_EMP")) %>% dplyr::rename("occ_code" = "OCC_CODE","tot_emp" = "TOT_EMP")
data.2015 <- merge(data.2015,detailed.soc.2010, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))

data.2016 <- data.2016 %>% select(c("OCC_CODE","TOT_EMP")) %>% dplyr::rename("occ_code" = "OCC_CODE","tot_emp" = "TOT_EMP")
data.2016 <- merge(data.2016,detailed.soc.2010, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))

data.2017 <- data.2017 %>% select(c("OCC_CODE","TOT_EMP")) %>% dplyr::rename("occ_code" = "OCC_CODE","tot_emp" = "TOT_EMP")
data.2017 <- merge(data.2017,detailed.soc.2010, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))

data.2018 <- data.2018 %>% select(c("OCC_CODE","TOT_EMP")) %>% dplyr::rename("occ_code" = "OCC_CODE","tot_emp" = "TOT_EMP")
data.2018 <- merge(data.2018,detailed.soc.2010, by = "occ_code",all.y=TRUE) %>% filter(level == "detailed") %>% select(c("occ_code","tot_emp"))


data.2019 <- merge(data.2019,detailed.soc.2018, by = "occ_code",all.y=TRUE) %>% filter(level == "Detailed") %>% select(c("occ_code","tot_emp"))

### create a year variable for each dataset

data.2004 <- as.data.table(data.2004)
data.2005 <- as.data.table(data.2005)
data.2006 <- as.data.table(data.2006)
data.2007 <- as.data.table(data.2007)
data.2008 <- as.data.table(data.2008)
data.2009 <- as.data.table(data.2009)
data.2010 <- as.data.table(data.2010)
data.2011 <- as.data.table(data.2011)
data.2012 <- as.data.table(data.2012)
data.2013 <- as.data.table(data.2013)
data.2014 <- as.data.table(data.2014)
data.2015 <- as.data.table(data.2015)
data.2016 <- as.data.table(data.2016)
data.2017 <- as.data.table(data.2017)
data.2018 <- as.data.table(data.2018)
data.2019 <- as.data.table(data.2019)

data.2004$year = 2004
data.2005$year = 2005
data.2006$year = 2006
data.2007$year = 2007
data.2008$year = 2008
data.2009$year = 2009
data.2010$year = 2010
data.2011$year = 2011
data.2012$year = 2012
data.2013$year = 2013
data.2014$year = 2014
data.2015$year = 2015
data.2016$year = 2016
data.2017$year = 2017
data.2018$year = 2018
data.2019$year = 2019

#### convert the SOC 2000 years (2004 - 2009) to SOC-2010

## read in the BLS crosswalk table, construct weights to spread SOC2000 employment evenly across recipient SOC2010 occupations
soc00to10 <- as.data.table(readxl::read_xls("data/oes employment/soc_2000_to_2010_crosswalk.xls",
                                                     range="A7:C868")[,c(1,3)])
soc00to10 <- as.data.table(soc00to10[2:861,])
soc00to10$id <- rep(1,dim(soc00to10)[1])
soc00to10 <- soc00to10 %>%
  mutate(id = rep(1,dim(soc00to10)[1])) %>%
  dplyr::rename("occ_code" = "2000 SOC code",
                "SOC2010" = "2010 SOC code",
                "id" = "id") %>%
  group_by(occ_code) %>%
  mutate(dup.SOC2000 = sum(id)) %>%
  ungroup() %>%
  mutate(weight = (100/dup.SOC2000)/100 ) %>%
  select(c("occ_code","SOC2010","weight"))

## merge in the employment figures from each year of data and convert

convert.2004 <- merge(soc00to10,data.2004,by="occ_code") %>%
  mutate(tot_emp = as.numeric(tot_emp)) %>%  # ensure employment is numeric
  mutate(empl = tot_emp*weight) %>%          # multiply employment by the SOC2010 weight to get SOC2010 employment from SOC2000 totals
  select(-c(tot_emp)) %>%                    # remove the tot_emp variable
  group_by(SOC2010) %>%                      # group by SOC2010
  mutate(tot_emp = sum(empl)) %>%            # calculate total employment for each SOC2010 occupation
  select(c("SOC2010","tot_emp")) %>%         # retain only SOC2010 code and total employment
  distinct() %>%                             # remove duplicates
  dplyr::rename("occ_code" = "SOC2010",      # rename for consistency
                "tot_emp" = "tot_emp") %>%   # create a year variable
  mutate(year = 2004)

convert.2005 <- merge(soc00to10,data.2005,by="occ_code") %>%
  mutate(tot_emp = as.numeric(tot_emp)) %>%
  mutate(empl = tot_emp*weight) %>%
  select(-c(tot_emp)) %>%
  group_by(SOC2010) %>%
  mutate(tot_emp = sum(empl)) %>%
  select(c("SOC2010","tot_emp")) %>%
  distinct() %>%
  dplyr::rename("occ_code" = "SOC2010",
                "tot_emp" = "tot_emp") %>%
  mutate(year = 2005)

convert.2006 <- merge(soc00to10,data.2006,by="occ_code") %>%
  mutate(tot_emp = as.numeric(tot_emp)) %>%
  mutate(empl = tot_emp*weight) %>%
  select(-c(tot_emp)) %>%
  group_by(SOC2010) %>%
  mutate(tot_emp = sum(empl)) %>%
  select(c("SOC2010","tot_emp")) %>%
  distinct() %>%
  dplyr::rename("occ_code" = "SOC2010",
                "tot_emp" = "tot_emp") %>%
  mutate(year = 2006)

convert.2007 <- merge(soc00to10,data.2007,by="occ_code") %>%
  mutate(tot_emp = as.numeric(tot_emp)) %>%
  mutate(empl = tot_emp*weight) %>%
  select(-c(tot_emp)) %>%
  group_by(SOC2010) %>%
  mutate(tot_emp = sum(empl)) %>%
  select(c("SOC2010","tot_emp")) %>%
  distinct() %>%
  dplyr::rename("occ_code" = "SOC2010",
                "tot_emp" = "tot_emp") %>%
  mutate(year = 2007)

convert.2008 <- merge(soc00to10,data.2008,by="occ_code") %>%
  mutate(tot_emp = as.numeric(tot_emp)) %>%
  mutate(empl = tot_emp*weight) %>%
  select(-c(tot_emp)) %>%
  group_by(SOC2010) %>%
  mutate(tot_emp = sum(empl)) %>%
  select(c("SOC2010","tot_emp")) %>%
  distinct() %>%
  dplyr::rename("occ_code" = "SOC2010",
                "tot_emp" = "tot_emp") %>%
  mutate(year = 2008)

convert.2009 <- merge(soc00to10,data.2009,by="occ_code") %>%
  mutate(tot_emp = as.numeric(tot_emp)) %>%
  mutate(empl = tot_emp*weight) %>%
  select(-c(tot_emp)) %>%
  group_by(SOC2010) %>%
  mutate(tot_emp = sum(empl)) %>%
  select(c("SOC2010","tot_emp")) %>%
  distinct() %>%
  dplyr::rename("occ_code" = "SOC2010",
                "tot_emp" = "tot_emp") %>%
  mutate(year = 2009)

#### convert the SOC 2018 years (2019+) to SOC-2010

## read in the BLS crosswalk table, construct weights to spread SOC2000 employment evenly across recipient SOC2010 occupations
soc18to10 <- as.data.table(readxl::read_xlsx("data/oes employment/soc_2010_to_2018_crosswalk.xlsx",
                                            range="A9:D909")[,c(1,3)])

soc18to10 <- soc18to10 %>%
  mutate(id = rep(1,dim(soc18to10)[1])) %>%
  dplyr::rename("occ_code" = "2018 SOC Code",
                "SOC2010" = "2010 SOC Code",
                "id" = "id") %>%
  group_by(occ_code) %>%
  mutate(dup.SOC2018 = sum(id)) %>%
  ungroup() %>%
  mutate(weight = (100/dup.SOC2018)/100 ) %>%
  select(c("occ_code","SOC2010","weight"))

## merge in the employment figures from each year of data and convert

convert.2019 <- merge(soc18to10,data.2019,by="occ_code")  %>%
  mutate(tot_emp = as.numeric(tot_emp)) %>%
  mutate(empl = tot_emp*weight) %>%
  select(-c(tot_emp)) %>%
  group_by(SOC2010) %>%
  mutate(tot_emp = sum(empl)) %>%
  select(c("SOC2010","tot_emp")) %>%
  distinct() %>%
  dplyr::rename("occ_code" = "SOC2010",
                "tot_emp" = "tot_emp") %>%
  mutate(year = 2019)


### append years
### regression-based imputation for missing years

oes <- dplyr::bind_rows(convert.2004,convert.2005,convert.2006,convert.2007,convert.2008,convert.2009,data.2010,data.2011,
                         data.2012,data.2013,data.2014,data.2015,data.2016,data.2017,data.2018,convert.2019) %>%
  dplyr::filter(substring(occ_code,1,2)!="55") %>%                # drop military occupations with no employment data
  dplyr::group_by(occ_code) %>%
  dplyr::mutate(tot_emp_imp = na.approx(tot_emp,na.rm=FALSE)) %>%
  dplyr::mutate(tot_emp_imp = nafill(tot_emp_imp,type="nocb")) %>%
  dplyr::mutate(tot_emp_imp = nafill(tot_emp_imp,type="locf"))         # impute missing values (linear interpolation, nocb, and locf)


return(oes)

}
djmorris1989/onetmappinguk documentation built on June 14, 2020, 10:04 a.m.