data-raw/CoAData.R

# Pull the 6 digit NAICS associated with crops/livestock from the loaded 2012 NAICS codes and names. Manually create
# 8 digit NAICS for Census of Agriculture (CoA) Crops and Livestock. The purpose of NAICS8 are *only* to help aggregate
# CoA data to NAICS6. NAICS8 are unofficial and are not used again after initial aggregation to NAICS6. NAICS8 are based
# on NAICS definitions from the Census. 


# Generate Census of Agriculture NAICS8 from 2012 Census NAICS6
getCoANAICS8 <- function(){
  
  ####### Pull NAICS6 from NAICS 2012 names and codes ########

  # NAICS 2012 file used in "NAICS.R"
  FileName <- "inst/extdata/2-digit_2012_Codes.xls"
  NAICS_raw <- as.data.frame(readxl::read_excel(FileName, sheet = 1, col_names = TRUE))[-1,-1]

  # modify column names
  NAICS <- NAICS_raw
  colnames(NAICS) <- c("NAICS_Code", "NAICS_Name")
  
  # trim any possible whitespace
  NAICS$NAICS_Code <- trimws(NAICS$NAICS_Code)
  NAICS$NAICS_Name <- trimws(NAICS$NAICS_Name)
  
  # for some data (like aquaculture), the NAICS activity is duplicated, but the NAICS Codes are different, even when 
  # standardized to 8 digit NAICS. Remove duplicates in activity - first need to sort the dataframe
  NAICS <- NAICS[order(NAICS$NAICS_Code),]
  NAICS <- NAICS[!duplicated(NAICS$NAICS_Name),]

  # extract rows related to agriculture/livestock
  df <- subset(NAICS, substr(NAICS$NAICS_Code, 1, 2) == '11')
  
  # right pad to make all NAICS 8 digits
  df$NAICS_Code <- stringr::str_pad(df$NAICS_Code, 8, "right", pad="0")
  
  # more specifically, don't want rows where first 3 digits are 113, 114, 115
  df <- subset(df, !substr(df$NAICS_Code, 1, 3) =='113')
  df <- subset(df, !substr(df$NAICS_Code, 1, 3) =='114')
  df <- subset(df, !substr(df$NAICS_Code, 1, 3) =='115')
  
  # remove duplicate rows introduced by standardizing length of naics
  df <- df[!duplicated(df), ]
  
  
  ####### Add CoA NAICS8 by hard coding ##############
  
  # CoA NAICS names are formatted to match those generated by NASS Quickstats
  # https://quickstats.nass.usda.gov/
  
  # create new df to incorporate coa naics
  df2 <- data.frame(NAICS8=integer(),
                    Activity=character(),
                    stringsAsFactors=FALSE)
  
  #### CROPS ####################3
  
  # coa equivalent to agriculture, forestry, fishing, and hunting 
  df2[nrow(df2) + 1,] = c('11000000', 'AG LAND')
  
  # coa equivalent to crop production: 111000
  df2[nrow(df2) + 1,] = c('11100000', 'AG LAND, CROPLAND')
  df2[nrow(df2) + 1,] = c('1110000A', 'FIELD CROPS, OTHER') 
  
  # coa equivalent to Animal Production and Aquaculture: 112000
  df2[nrow(df2) + 1,] = c('11200000', 'AG LAND, PASTURELAND')
  
  ## coa equivalent to soybean farming: 111110
  df2[nrow(df2) + 1,] = c('11111000', 'SOYBEANS')
  
  # coa aggregates to oilseed (except soybean) farming: 111120
  df2[nrow(df2) + 1,] = c('1111200A', 'CANOLA')
  df2[nrow(df2) + 1,] = c('1111200B', 'FLAXSEED')
  df2[nrow(df2) + 1,] = c('1111200C', 'MUSTARD')
  df2[nrow(df2) + 1,] = c('1111200D', 'RAPESEED')
  df2[nrow(df2) + 1,] = c('1111200E', 'SAFFLOWER')
  df2[nrow(df2) + 1,] = c('1111200F', 'SESAME')
  df2[nrow(df2) + 1,] = c('1111200G', 'SUNFLOWER')
  
  # coa aggregates to dry pea and bean farming: 111130
  df2[nrow(df2) + 1,] = c('1111300A', 'BEANS, DRY EDIBLE, (EXCL LIMA), INCL CHICKPEAS')
  df2[nrow(df2) + 1,] = c('1111300B', 'BEANS, DRY EDIBLE, (EXCL CHICKPEAS & LIMA)')
  df2[nrow(df2) + 1,] = c('1111300C', 'BEANS, DRY EDIBLE, LIMA')
  df2[nrow(df2) + 1,] = c('1111300D', 'CHICKPEAS')
  df2[nrow(df2) + 1,] = c('1111300E', 'LENTILS')
  df2[nrow(df2) + 1,] = c('1111300F', 'PEAS, AUSTRIAN WINTER')
  df2[nrow(df2) + 1,] = c('1111300G', 'PEAS, DRY EDIBLE')
  df2[nrow(df2) + 1,] = c('1111300H', 'PEAS, DRY, SOUTHERN (COWPEAS)')
  #df2[nrow(df2) + 1,] = c('', 'BEANS, MUNG') # last year published 2002
  
  # coa equivalent to wheat farming: 111140
  df2[nrow(df2) + 1,] = c('11114000', 'WHEAT')
  
  # coa aggregates to corn farming: 111150
  df2[nrow(df2) + 1,] = c('11115000', 'CORN')
  df2[nrow(df2) + 1,] = c('1111500A', 'CORN, GRAIN') 
  df2[nrow(df2) + 1,] = c('1111500B', 'CORN, SILAGE')
  df2[nrow(df2) + 1,] = c('1111500C', 'POPCORN')
  
  # coa equivalent to rice farming: 111160
  df2[nrow(df2) + 1,] = c('11116000', 'RICE')
  
  # coa equates to other grain farming: 111190
  df2[nrow(df2) + 1,] = c('11119000', 'SMALL GRAINS, WHEAT & BARLEY & OATS & RYE')
  
  # coa aggregates to all other grain farming: 111199
  df2[nrow(df2) + 1,] = c('1111990A', 'BARLEY')
  df2[nrow(df2) + 1,] = c('1111990B', 'BUCKWHEAT')
  df2[nrow(df2) + 1,] = c('1111990C', 'MILLET, PROSO')
  df2[nrow(df2) + 1,] = c('1111990D', 'OATS')
  df2[nrow(df2) + 1,] = c('1111990E', 'RYE')
  df2[nrow(df2) + 1,] = c('1111990F', 'SORGHUM, GRAIN') 
  df2[nrow(df2) + 1,] = c('1111990G', 'SORGHUM, SILAGE')
  df2[nrow(df2) + 1,] = c('1111990H', 'SORGHUM, SYRUP') 
  df2[nrow(df2) + 1,] = c('1111990I', 'TRITICALE')
  df2[nrow(df2) + 1,] = c('1111990J', 'WILD RICE')
  # df2[nrow(df2) + 1,] = c('', 'SWEET RICE') # last year published 2002
  
  # coa equivalent to vegetable and melon farming: 111200
  df2[nrow(df2) + 1,] = c('11120000', 'VEGETABLE TOTALS') # this category does include melons
  
  # coa aggregates to fruit and tree nut farming: 111300
  df2[nrow(df2) + 1,] = c('11130000', 'ORCHARDS')
  df2[nrow(df2) + 1,] = c('1113000A', 'BERRY TOTALS')
  
  # coa aggregates to other noncitrus fruit farming: 111339
  # missing pinneapples
  
  # coa aggregates to greenhouse nursery and floriculture production: 111400
  df2[nrow(df2) + 1,] = c('11140000', 'HORTICULTURE TOTALS')
  df2[nrow(df2) + 1,] = c('1114000A', 'CUT CHRISTMAS TREES')
  df2[nrow(df2) + 1,] = c('11140008', 'SHORT TERM WOODY CROPS')
  
  # coa equivalent to other crop farming: 111900
  df2[nrow(df2) + 1,] = c('11190000', 'CROPS, OTHER')
  
  # coa equivalent to tobacco farming: 111910
  df2[nrow(df2) + 1,] = c('11191000', 'TOBACCO')
  
  # coa aggregates to cotton: 111920
  df2[nrow(df2) + 1,] = c('11192000', 'COTTON')
  
  # coa aggregates to sugarcane farming: 111930
  df2[nrow(df2) + 1,] = c('1119300A', 'SUGARCANE, SUGAR') 
  df2[nrow(df2) + 1,] = c('1119300B', 'SUGARCANE, SEED') 
  
  # coa aggregates to hay farming: 111940
  df2[nrow(df2) + 1,] = c('11194000', 'HAY & HAYLAGE')
  df2[nrow(df2) + 1,] = c('1119400A', 'HAY & HAYLAGE (EXCL ALFALFA)')
  df2[nrow(df2) + 1,] = c('1119400B', 'HAY & HAYLAGE, ALFALFA')
  df2[nrow(df2) + 1,] = c('11194010', 'HAY')
  df2[nrow(df2) + 1,] = c('1119401A', 'HAY (EXCL ALFALFA)')
  df2[nrow(df2) + 1,] = c('1119401B', 'HAY, ALFALFA')
  df2[nrow(df2) + 1,] = c('11194020', 'HAYLAGE') 
  df2[nrow(df2) + 1,] = c('1119402A', 'HAYLAGE (EXCL ALFALFA)') 
  df2[nrow(df2) + 1,] = c('1119402B', 'HAYLAGE, ALFALFA') 
  
  # coa aggregates to all other crop farming: 111990
  df2[nrow(df2) + 1,] = c('1119900A', 'DILL')
  df2[nrow(df2) + 1,] = c('1119900C', 'GRASSES & LEGUMES TOTALS')
  df2[nrow(df2) + 1,] = c('1119900D', 'GUAR')
  df2[nrow(df2) + 1,] = c('1119900E', 'HERBS, DRY')
  df2[nrow(df2) + 1,] = c('1119900F', 'HOPS')
  df2[nrow(df2) + 1,] = c('1119900G', 'JOJOBA')
  df2[nrow(df2) + 1,] = c('1119900H', 'MISCANTHUS')
  df2[nrow(df2) + 1,] = c('1119900I', 'PEANUTS')
  df2[nrow(df2) + 1,] = c('1119900J', 'SUGARBEETS')
  
  
  
  
  ###### ANIMALS ###################3
  
  # cattle ranching and farming: 112100
  
  # beef cattle ranching and farming including feedlots: 112110		
  df2[nrow(df2) + 1,] = c('1121100A', 'BISON')
  df2[nrow(df2) + 1,] = c('1121100B', 'CATTLE, (EXCL COWS)')
  df2[nrow(df2) + 1,] = c('1121100C', 'CATTLE, COWS')
  df2[nrow(df2) + 1,] = c('1121100D', 'CATTLE, INCL CALVES')
  
  # beef cattle ranching and farming including feedlots: 112111	
  df2[nrow(df2) + 1,] = c('11211100', 'CATTLE, COWS, BEEF')
  
  # cattle feedlots: 112112	
  df2[nrow(df2) + 1,] = c('112112', 'CATTLE, ON FEED')                                 
  
  # dairy cattle and milk production: 112120	
  df2[nrow(df2) + 1,] = c('1121200A', 'CATTLE, COWS, MILK')
  
  # hog and pig farming: 112200	
  df2[nrow(df2) + 1,] = c('11220000', 'HOGS')
  
  
  # poultry and egg production: 112300	
  
  # chicken egg production: 112310	
  df2[nrow(df2) + 1,] = c('1123100A', 'CHICKENS, LAYERS')
  
  # broilers and other meat-type chicken production: 112320	
  df2[nrow(df2) + 1,] = c('1123200A', 'CHICKENS, BROILERS')
  df2[nrow(df2) + 1,] = c('1123200B', 'CHICKENS, PULLETS, REPLACEMENT')
  df2[nrow(df2) + 1,] = c('1123200C', 'CHICKENS, ROOSTERS')
  
  # turkey production: 112330	
  df2[nrow(df2) + 1,] = c('11233000', 'TURKEYS')
  
  # poultry hatcheries: 112340                               
  
  # other poultry production: 112390
  df2[nrow(df2) + 1,] = c('1123900A',	'CHUKARS')
  df2[nrow(df2) + 1,] = c('1123900B',	'DUCKS')
  df2[nrow(df2) + 1,] = c('1123900C',	'EMUS')
  df2[nrow(df2) + 1,] = c('1123900D',	'GEESE')
  df2[nrow(df2) + 1,] = c('1123900E',	'GUINEAS') 
  df2[nrow(df2) + 1,] = c('1123900F',	'OSTRICHES') 
  df2[nrow(df2) + 1,] = c('1123900G', 'PARTRIDGES, HUNGARIAN')	
  df2[nrow(df2) + 1,] = c('1123900H',	'PEAFOWL, HENS & COCKS') 
  df2[nrow(df2) + 1,] = c('1123900J',	'PHEASANTS') 
  df2[nrow(df2) + 1,] = c('1123900K',	'PIGEONS & SQUAB')  
  df2[nrow(df2) + 1,] = c('1123900L',	'POULTRY, OTHER')  
  df2[nrow(df2) + 1,] = c('1123900M',	'QUAIL')  
  df2[nrow(df2) + 1,] = c('1123900N',	'RHEAS')  
  
  
  # sheep and goat farming: 112400
  df2[nrow(df2) + 1,] = c('11240000','SHEEP & GOATS TOTALS')
  
  # sheep farming: 112410	
  df2[nrow(df2) + 1,] = c('11241000', 'SHEEP, INCL LAMBS')
  
  # goat farming: 112420	
  df2[nrow(df2) + 1,] = c('11242000',	'GOATS')
  
  # animal aquaculture: 112500
  df2[nrow(df2) + 1,] = c('11250000', 'AQUACULTURE TOTALS')                             
  # part of Finfish farming                                 
  df2[nrow(df2) + 1,] = c('1125110A',	'FOOD FISH, CATFISH') 
  df2[nrow(df2) + 1,] = c('1125110B',	'FOOD FISH, TROUT') 
  df2[nrow(df2) + 1,] = c('1125110C', 'FOOD FISH, (EXCL CATFISH & TROUT)')
  df2[nrow(df2) + 1,] = c('1125110D',	'BAITFISH') 
  df2[nrow(df2) + 1,] = c('1125110E',	'ORNAMENTAL FISH') 
  df2[nrow(df2) + 1,] = c('1125110F', 'SPORT FISH')
  # part of Shellfish farming                                 
  df2[nrow(df2) + 1,] = c('1125120A',	'CRUSTACEANS') 
  df2[nrow(df2) + 1,] = c('1125120B',	'MOLLUSKS')
  df2[nrow(df2) + 1,] = c('11251900',	'AQUACULTURE, OTHER')                          
  
  # other animal production: 112900	
  
  
  # apiculture: 112910                                      
  df2[nrow(df2) + 1,] = c('1129100A',	'HONEY') 
  df2[nrow(df2) + 1,] = c('1129100B',	'HONEY, BEE COLONIES')
  
  # horse and other equine production: 112920	
  df2[nrow(df2) + 1,] = c('11292000','EQUINE, (HORSES & PONIES) & (MULES & BURROS & DONKEYS)')
  df2[nrow(df2) + 1,] = c('1129200A','EQUINE, HORSES AND POINIES')
  df2[nrow(df2) + 1,] = c('1129200B','EQUINE, MULES & BURROS & DONKEYS')
  
  # fur-bearing animal and rabbit production: 112930	
  df2[nrow(df2) + 1,] = c('1129300A',	'MINK, LIVE')
  df2[nrow(df2) + 1,] = c('1129300B',	'RABBITS, LIVE')
  
  
  # all other animal production: 112990	
  df2[nrow(df2) + 1,] = c('1129900A',	'ALPACAS') 
  df2[nrow(df2) + 1,] = c('1129900B',	'DEER') 
  df2[nrow(df2) + 1,] = c('1129900C',	'ELK') 
  df2[nrow(df2) + 1,] = c('1129900D',	'LLAMAS') 
  
  
  
  ##### JOIN imported naics and manually created naics dataframes
  
  df3 <- merge(df, df2, by.x = "NAICS_Code", by.y = "NAICS8", all = TRUE)
  
  # add additional column of original NAICS 2-6 digit codes pulled from NAICS csv file
  df4 <- merge(df3, NAICS, by = "NAICS_Name", all.x = TRUE)
  
  # rename columns and reorder 
  names(df4)[names(df4) == 'NAICS_Code.y'] <- 'NAICS_2012_Code'
  names(df4)[names(df4) == 'NAICS_Code.x'] <- 'NAICS8_Code'
  
  df4 <-df4[ , c("NAICS_2012_Code", "NAICS8_Code", "NAICS_Name", "Activity")]
  
  # sort data by NAICS8 code
  df5 <- df4[order(df4$NAICS8_Code, df4$NAICS_2012_Code),]
  
  # remove rows where the NAICS_2012_Code ends in "0". Because in all cases, this is a 6 digit NAICS
  # and the 5 digit NAICS also exists and has the same NAICS code name. Want to avoid possible double counting.
  df6 <- subset(df5, (!substr(df5$NAICS_2012_Code, 6, 6)=='0') | is.na(df5$NAICS_2012_Code)) 
  
  # At times, one NAICS8 corresponds directly to one NAICS 5/6 digit. In other cases, multiple NAICS8 must be summed to
  # calculate the value of one NAICS 5/6 digit. This next section of code determines occurances of the later situation
  # and assigns a "NAICS_2012_Code" alongside a NAICS8_Code. If "NAICS_2012_Code" is left as NA, then the row of data 
  # does not need to be summed.
  
  # add NAICS7 column used to match two dataframes - drop later
  df6$NAICS7 <- substr(df6$NAICS8_Code, 1, 7)
  
  # create subset of df where Activity = NA
  df_s <- subset(df6, is.na(Activity))
  
  # modify naics8 to naics7
  df_s$NAICS7 <- substr(df_s$NAICS8_Code, 1, 7)
  
  # rename column
  names(df_s)[names(df_s) == 'NAICS_2012_Code'] <- 'NAICS_Replacement'
  
  # select columns
  df_s <- df_s[ , c("NAICS_Replacement", "NAICS7")]
  
  # merge master dataframe with subset dataframe by naics7, then drop naics7
  df7 <- merge(df6, df_s, by = "NAICS7", all.x = TRUE)
  df7 <- subset(df7, select = -c(NAICS7))
  
  # sort data by NAICS8 code
  df8 <- df7[order(df7$NAICS8_Code, df7$NAICS_2012_Code),]
  
  # if NAICS_12_Code is NA, replace with NAICS_Replacement column (some of which are also NA)
  df8$NAICS_2012_Code[is.na(df8$NAICS_2012_Code)] <- df8$NAICS_Replacement[is.na(df8$NAICS_2012_Code)]
  
  #drop NAICS replacement column
  df8 <- subset(df8, select = -c(NAICS_Replacement))
  
  ### finished assigning NAICS codes. Now, modify presentation
  
  
  # if the coa activity definition colum is NA, replace with the 2012 NAICS name
  df9 <- df8
  df9$Activity[is.na(df9$Activity)] <- as.character(df9$NAICS_Name[is.na(df9$Activity)])
  
  # select specific columns
  df9 <-df9[ , c("NAICS_2012_Code", "NAICS8_Code", "Activity")]
  
  # sort df
  CoAtoNAICS <- df9[order(df9$NAICS8_Code, df9$NAICS_2012_Code),]
  
  # write as csv
  
  utils::write.csv(CoAtoNAICS, "inst/extdata/Crosswalk_CoAtoNAICS.csv", na = "", row.names = FALSE, fileEncoding="UTF-8") 
  
  
}
USEPA/useeior documentation built on April 12, 2024, 1:36 p.m.