knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
library(survey)
library(tidyr)
library(dplyr)
library(readr)
library(tibble)
devtools::load_all() # load package functions

Description

This notebook cleans the full Ethiopia dataset using included survey weights.

devtools is required for loading package functions.

(Terminology: field is just a sub-parcel)

Custom Functions

# auxiliary function for handling sums of NA vectors
# handles NA by returning NA if all are NA
sumNA = function(x) {
  if (all(is.na(x))) {
    return(NA)
  } else {
    return(sum(x, na.rm=TRUE))
  }
}

# take a vector of inputs: if at least 1 input is 1, then return 1 else 0.
# handles NA by returning NA if all are NA
indicator = function(x) {
  if (all(is.na(x))) {
    return(NA)
  } else if (sum(x, na.rm=TRUE) > 0) {
    return(1)
  } else {
    return(0)
  }
}

# recoding 2s to 0s. there's a dplyr method that does this as well
recode = function(x) ifelse(x == 2 | x == 0, 0, 1)

# used for setting other variables to 0 if x is 0
handleNA = function(x, y) ifelse(x == 0, 0, y)

LSMS Agricultural Data

# filenames are equal to variable names
# sect3 is plot level fertilizer use
sect3_pp_w1 = load_csv_from_googledrive('1YG64BoNKdscBocfA_0I5fl7lnJbz8GkA') 
sect3_pp_w2 = load_dta_from_googledrive('1EhgkkmrMIieR7G2rnbTgeaKmsveGPVU0')
sect3_pp_w3 = load_csv_from_googledrive('1sFeyFvVmOk9hOhGuhkNyVpJzUXVv5jBQ')

#sect4 is plot level herb/pest/fungicide use
sect4_pp_w1 = load_csv_from_googledrive('1-kpraW7alL4MQFYL7HYSEJqcHwt6Vy7O')
sect4_pp_w2 = load_dta_from_googledrive('1jFho2eZarM1Zd7IZaYWsuq6vTskRydZF')
sect4_pp_w3 = load_csv_from_googledrive('11GnBZGI8TANCy50C6ECwTyUPzczkecak')
# first analyze fertilizer data

# selecting relevant columns. we drop any columns that have missing hhid or fert indicator, since if fert = NA then we cannot draw conclusions from their data.
wave1 = sect3_pp_w1 %>% select(hhid=household_id, holder_id, parcel_id, field_id, weights = pw, region=saq01, zone=saq02, fert=pp_s3q14, urea=pp_s3q15, dap=pp_s3q18, manure=pp_s3q21, compost=pp_s3q23, organic_fert=pp_s3q25, quant_urea=pp_s3q16_a, quant_dap=pp_s3q19_a, area=pp_s3q05_a) %>% drop_na(fert, hhid)

wave2 = sect3_pp_w2 %>% select(hhid=household_id2, holder_id, parcel_id, field_id, weights = pw2, region=saq01, zone=saq02, fert=pp_s3q14, urea=pp_s3q15, dap=pp_s3q18, manure=pp_s3q21, compost=pp_s3q23, organic_fert=pp_s3q25, quant_urea=pp_s3q16_a, quant_dap=pp_s3q19_a, area=pp_s3q05_a) %>% drop_na(fert, hhid)

wave3 = sect3_pp_w3 %>% select(hhid=household_id2, holder_id, parcel_id, field_id, weights = pw_w3, region=saq01, zone=saq02, fert=pp_s3q14, urea=pp_s3q15, dap=pp_s3q18, manure=pp_s3q21, compost=pp_s3q23, organic_fert=pp_s3q25, quant_urea=pp_s3q16, quant_dap=pp_s3q19, area=pp_s3q05_a) %>% drop_na(fert, hhid)
# add countryname/year columns, and combine into one frame
w1_fert = data.frame(country="ethiopia", start_yr = 2011, end_yr = 2012, wave1, stringsAsFactors = F) 
w2_fert = data.frame(country="ethiopia", start_yr = 2013, end_yr = 2014, wave2, stringsAsFactors = F)
w3_fert = data.frame(country="ethiopia", start_yr = 2015, end_yr = 2016, wave3, stringsAsFactors = F)
fert = rbind(w1_fert, w2_fert, w3_fert)
fert

Cleaning/Transforming Fertilizer Data

# LSMS data has 1 encoded as YES, 2 encoded as NO - we will change all 2s to 0s

fert_df = fert %>% mutate_at(c("fert", "urea", "dap", "manure", "compost", "organic_fert"), recode)

# handling NAs - assume that if fert is 0 then no fertilizer was used, so set other columns to 0
# remark 1: the . in list is a placeholder for all variables that we want to pass to it
# remark 2: funs is deprecated in favor of list in dplyr 0.8.0

fert_df = fert_df %>% mutate_at(c("urea", "dap", "manure", "compost", "organic_fert", "quant_urea", "quant_dap"), list(~handleNA(fert, .)))
fert_df = fert_df %>% mutate(quant_urea = handleNA(urea, quant_urea)) %>% mutate(quant_dap = handleNA(dap, quant_dap))
# summarize fertilizer use into 3 dummy vars: organic, inorganic, overall
# note that there is a fertilizer column from the LSMS data, but we're creating our own just in case the
# data is unreliable
fert_indic = fert_df %>%
  mutate(organic_fert_dummy = as.numeric(select(., matches('manure|compost|organic_fert')) %>% rowSums(na.rm=TRUE) > 0),
    inorganic_fert_dummy = as.numeric(select(., matches('urea|dap')) %>% rowSums(na.rm=TRUE) > 0),
    fert_dummy = as.numeric(select(., matches('fert|urea|dap|manure|compost')) %>% rowSums(na.rm=TRUE) >0))

# remark: the above regex matches any column that contains any of the above words, which may not be desirable later on (i.e. if we want to add quantity data only). however, it doesn't matter if quantity is counted into the indicator, so we'll leave it as is. 
# obtain weights indexed by start_yr, hhid
weights = fert_indic %>% select('start_yr', 'hhid', 'weights') %>% unique()
# group indicator data only, summarize by another indicator for the household (did the household use x?)
fert_indic_gr = fert_indic %>% select(-c('holder_id', 'parcel_id', 'field_id', 'weights', 'region', 'zone', 'area')) %>% group_by(country, start_yr, end_yr, hhid) %>% summarize_all(indicator)
fert_weights = fert_indic_gr %>% inner_join(weights, by = c('start_yr', 'hhid'))
fw1 = fert_weights %>% filter(start_yr==2011)
fw2 = fert_weights %>% filter(start_yr==2013)
fw3 = fert_weights %>% filter(start_yr==2015)
fw1des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = fw1)
fw2des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = fw2)
fw3des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = fw3)
cols = c('fert', 'urea', 'dap', 'organic_fert_dummy', 'inorganic_fert_dummy', 'fert_dummy')
d = length(cols)

calc_means = function(design) {
  means = c(0, d)
  for (var in 1:d) {
    f = as.formula(paste('~', cols[var]))
    means[var] = svymean(f, design, na.rm=TRUE)
  }
  return(means)
}
res = matrix(0, nrow=3, ncol=d)
res[1,] = calc_means(fw1des)
res[2,] = calc_means(fw2des)
res[3,] = calc_means(fw3des)

colnames(res) = cols
rownames(res) = c(2011, 2013, 2015)
res
eth_res = data.frame(res)
eth_res = eth_res %>% add_column(year=rownames(res), .before='fert') %>% add_column(country=rep('Ethiopia', 3), .before='year')
eth_res 

Pesticide Data

Drop rows that have NA prevention (prevention is binary: any method of prevention used).

w1_phf = sect4_pp_w1 %>% select(hhid=household_id, weights = pw, prevention=pp_s4q04, pest=pp_s4q05, herb=pp_s4q06, fung=pp_s4q07) %>% drop_na(hhid, prevention)

w2_phf = sect4_pp_w2 %>% select(hhid=household_id2, weights = pw2, prevention=pp_s4q04, pest=pp_s4q05, herb=pp_s4q06, fung=pp_s4q07) %>% drop_na(hhid, prevention)

w3_phf = sect4_pp_w3 %>% select(hhid=household_id2, weights = pw_w3, prevention=pp_s4q04, pest=pp_s4q05, herb=pp_s4q06, fung=pp_s4q07) %>% drop_na(hhid, prevention)
w1 = data.frame(start_yr = 2011, end_yr = 2012, w1_phf, stringsAsFactors = F) 
w2 = data.frame(start_yr = 2013, end_yr = 2014, w2_phf, stringsAsFactors = F)
w3 = data.frame(start_yr = 2015, end_yr = 2016, w3_phf, stringsAsFactors = F)
phf = rbind(w1, w2, w3)
phf
weights = phf %>% select(start_yr, hhid, weights) %>% unique() 
# dropping weights before groupby summarize, will join back later
# change 2s to 0s, 
phf1 = phf %>% select(-c(weights)) %>% mutate_at(c("prevention", "pest", "herb", "fung"), recode) 

# handling NAs - assume that if prevention is 0 then set other columns to 0
phf2 = phf1 %>% mutate_at(c("pest", "herb", "fung"), list(~handleNA(prevention, .)) )
# summarize w/ indicators, join with weights
phf_weights = phf2 %>% group_by(start_yr, hhid) %>% summarize_all(indicator) %>% inner_join(weights, by=c('start_yr', 'hhid'))
w1 = phf_weights %>% filter(start_yr==2011)
w2 = phf_weights %>% filter(start_yr==2013)
w3 = phf_weights %>% filter(start_yr==2015)
pw1des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = w1)
pw2des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = w2)
pw3des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = w3)
cols = c('prevention', 'pest', 'herb', 'fung')
d = length(cols)

calc_means = function(design) {
  means = c(0, d)
  for (var in 1:d) {
    f = as.formula(paste('~', cols[var]))
    means[var] = svymean(f, design, na.rm=TRUE)
  }
  return(means)
}
res1 = matrix(0, nrow=3, ncol=d)
res1[1,] = calc_means(pw1des)
res1[2,] = calc_means(pw2des)
res1[3,] = calc_means(pw3des)

colnames(res1) = cols
rownames(res1) = c(2011, 2013, 2015)
res1
eth_res_final = rbind(eth_res, res1)
write_csv(eth_res, 'natl_agg_results/eth_res.csv')


cmhoove14/AgSchistOverlap documentation built on Nov. 18, 2019, 6:56 a.m.