In cmhoove14/AgSchistOverlap: Agrochemical Pollution and Schistosomiasis areas of concern

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

library(survey)
library(tidyr)
library(dplyr)
library(readr)
library(tibble)
devtools::load_all() # load package functions

Description

This notebook cleans the full Ethiopia dataset using included survey weights.

devtools is required for loading package functions.

(Terminology: field is just a sub-parcel)

Custom Functions

# auxiliary function for handling sums of NA vectors
# handles NA by returning NA if all are NA
sumNA = function(x) {
  if (all(is.na(x))) {
    return(NA)
  } else {
    return(sum(x, na.rm=TRUE))
  }
}

# take a vector of inputs: if at least 1 input is 1, then return 1 else 0.
# handles NA by returning NA if all are NA
indicator = function(x) {
  if (all(is.na(x))) {
    return(NA)
  } else if (sum(x, na.rm=TRUE) > 0) {
    return(1)
  } else {
    return(0)
  }
}

# recoding 2s to 0s. there's a dplyr method that does this as well
recode = function(x) ifelse(x == 2 | x == 0, 0, 1)

# used for setting other variables to 0 if x is 0
handleNA = function(x, y) ifelse(x == 0, 0, y)

LSMS Agricultural Data

# filenames are equal to variable names
# sect3 is plot level fertilizer use
sect3_pp_w1 = load_csv_from_googledrive('1YG64BoNKdscBocfA_0I5fl7lnJbz8GkA') 
sect3_pp_w2 = load_dta_from_googledrive('1EhgkkmrMIieR7G2rnbTgeaKmsveGPVU0')
sect3_pp_w3 = load_csv_from_googledrive('1sFeyFvVmOk9hOhGuhkNyVpJzUXVv5jBQ')

#sect4 is plot level herb/pest/fungicide use
sect4_pp_w1 = load_csv_from_googledrive('1-kpraW7alL4MQFYL7HYSEJqcHwt6Vy7O')
sect4_pp_w2 = load_dta_from_googledrive('1jFho2eZarM1Zd7IZaYWsuq6vTskRydZF')
sect4_pp_w3 = load_csv_from_googledrive('11GnBZGI8TANCy50C6ECwTyUPzczkecak')

# first analyze fertilizer data

# selecting relevant columns. we drop any columns that have missing hhid or fert indicator, since if fert = NA then we cannot draw conclusions from their data.
wave1 = sect3_pp_w1 %>% select(hhid=household_id, holder_id, parcel_id, field_id, weights = pw, region=saq01, zone=saq02, fert=pp_s3q14, urea=pp_s3q15, dap=pp_s3q18, manure=pp_s3q21, compost=pp_s3q23, organic_fert=pp_s3q25, quant_urea=pp_s3q16_a, quant_dap=pp_s3q19_a, area=pp_s3q05_a) %>% drop_na(fert, hhid)

wave2 = sect3_pp_w2 %>% select(hhid=household_id2, holder_id, parcel_id, field_id, weights = pw2, region=saq01, zone=saq02, fert=pp_s3q14, urea=pp_s3q15, dap=pp_s3q18, manure=pp_s3q21, compost=pp_s3q23, organic_fert=pp_s3q25, quant_urea=pp_s3q16_a, quant_dap=pp_s3q19_a, area=pp_s3q05_a) %>% drop_na(fert, hhid)

wave3 = sect3_pp_w3 %>% select(hhid=household_id2, holder_id, parcel_id, field_id, weights = pw_w3, region=saq01, zone=saq02, fert=pp_s3q14, urea=pp_s3q15, dap=pp_s3q18, manure=pp_s3q21, compost=pp_s3q23, organic_fert=pp_s3q25, quant_urea=pp_s3q16, quant_dap=pp_s3q19, area=pp_s3q05_a) %>% drop_na(fert, hhid)

# add countryname/year columns, and combine into one frame
w1_fert = data.frame(country="ethiopia", start_yr = 2011, end_yr = 2012, wave1, stringsAsFactors = F) 
w2_fert = data.frame(country="ethiopia", start_yr = 2013, end_yr = 2014, wave2, stringsAsFactors = F)
w3_fert = data.frame(country="ethiopia", start_yr = 2015, end_yr = 2016, wave3, stringsAsFactors = F)
fert = rbind(w1_fert, w2_fert, w3_fert)
fert

Cleaning/Transforming Fertilizer Data

# LSMS data has 1 encoded as YES, 2 encoded as NO - we will change all 2s to 0s

fert_df = fert %>% mutate_at(c("fert", "urea", "dap", "manure", "compost", "organic_fert"), recode)

# handling NAs - assume that if fert is 0 then no fertilizer was used, so set other columns to 0
# remark 1: the . in list is a placeholder for all variables that we want to pass to it
# remark 2: funs is deprecated in favor of list in dplyr 0.8.0

fert_df = fert_df %>% mutate_at(c("urea", "dap", "manure", "compost", "organic_fert", "quant_urea", "quant_dap"), list(~handleNA(fert, .)))
fert_df = fert_df %>% mutate(quant_urea = handleNA(urea, quant_urea)) %>% mutate(quant_dap = handleNA(dap, quant_dap))

# summarize fertilizer use into 3 dummy vars: organic, inorganic, overall
# note that there is a fertilizer column from the LSMS data, but we're creating our own just in case the
# data is unreliable
fert_indic = fert_df %>%
  mutate(organic_fert_dummy = as.numeric(select(., matches('manure|compost|organic_fert')) %>% rowSums(na.rm=TRUE) > 0),
    inorganic_fert_dummy = as.numeric(select(., matches('urea|dap')) %>% rowSums(na.rm=TRUE) > 0),
    fert_dummy = as.numeric(select(., matches('fert|urea|dap|manure|compost')) %>% rowSums(na.rm=TRUE) >0))

# remark: the above regex matches any column that contains any of the above words, which may not be desirable later on (i.e. if we want to add quantity data only). however, it doesn't matter if quantity is counted into the indicator, so we'll leave it as is.

# obtain weights indexed by start_yr, hhid
weights = fert_indic %>% select('start_yr', 'hhid', 'weights') %>% unique()

# group indicator data only, summarize by another indicator for the household (did the household use x?)
fert_indic_gr = fert_indic %>% select(-c('holder_id', 'parcel_id', 'field_id', 'weights', 'region', 'zone', 'area')) %>% group_by(country, start_yr, end_yr, hhid) %>% summarize_all(indicator)

fert_weights = fert_indic_gr %>% inner_join(weights, by = c('start_yr', 'hhid'))

fw1 = fert_weights %>% filter(start_yr==2011)
fw2 = fert_weights %>% filter(start_yr==2013)
fw3 = fert_weights %>% filter(start_yr==2015)
fw1des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = fw1)
fw2des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = fw2)
fw3des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = fw3)

cols = c('fert', 'urea', 'dap', 'organic_fert_dummy', 'inorganic_fert_dummy', 'fert_dummy')
d = length(cols)

calc_means = function(design) {
  means = c(0, d)
  for (var in 1:d) {
    f = as.formula(paste('~', cols[var]))
    means[var] = svymean(f, design, na.rm=TRUE)
  }
  return(means)
}
res = matrix(0, nrow=3, ncol=d)
res[1,] = calc_means(fw1des)
res[2,] = calc_means(fw2des)
res[3,] = calc_means(fw3des)

colnames(res) = cols
rownames(res) = c(2011, 2013, 2015)
res

eth_res = data.frame(res)
eth_res = eth_res %>% add_column(year=rownames(res), .before='fert') %>% add_column(country=rep('Ethiopia', 3), .before='year')
eth_res

Pesticide Data

Drop rows that have NA prevention (prevention is binary: any method of prevention used).

w1_phf = sect4_pp_w1 %>% select(hhid=household_id, weights = pw, prevention=pp_s4q04, pest=pp_s4q05, herb=pp_s4q06, fung=pp_s4q07) %>% drop_na(hhid, prevention)

w2_phf = sect4_pp_w2 %>% select(hhid=household_id2, weights = pw2, prevention=pp_s4q04, pest=pp_s4q05, herb=pp_s4q06, fung=pp_s4q07) %>% drop_na(hhid, prevention)

w3_phf = sect4_pp_w3 %>% select(hhid=household_id2, weights = pw_w3, prevention=pp_s4q04, pest=pp_s4q05, herb=pp_s4q06, fung=pp_s4q07) %>% drop_na(hhid, prevention)

w1 = data.frame(start_yr = 2011, end_yr = 2012, w1_phf, stringsAsFactors = F) 
w2 = data.frame(start_yr = 2013, end_yr = 2014, w2_phf, stringsAsFactors = F)
w3 = data.frame(start_yr = 2015, end_yr = 2016, w3_phf, stringsAsFactors = F)
phf = rbind(w1, w2, w3)
phf

weights = phf %>% select(start_yr, hhid, weights) %>% unique()

# dropping weights before groupby summarize, will join back later
# change 2s to 0s, 
phf1 = phf %>% select(-c(weights)) %>% mutate_at(c("prevention", "pest", "herb", "fung"), recode) 

# handling NAs - assume that if prevention is 0 then set other columns to 0
phf2 = phf1 %>% mutate_at(c("pest", "herb", "fung"), list(~handleNA(prevention, .)) )

# summarize w/ indicators, join with weights
phf_weights = phf2 %>% group_by(start_yr, hhid) %>% summarize_all(indicator) %>% inner_join(weights, by=c('start_yr', 'hhid'))

w1 = phf_weights %>% filter(start_yr==2011)
w2 = phf_weights %>% filter(start_yr==2013)
w3 = phf_weights %>% filter(start_yr==2015)
pw1des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = w1)
pw2des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = w2)
pw3des = svydesign(id = ~hhid,
                        weights = ~weights,
                        nest = TRUE,
                        data = w3)

cols = c('prevention', 'pest', 'herb', 'fung')
d = length(cols)

calc_means = function(design) {
  means = c(0, d)
  for (var in 1:d) {
    f = as.formula(paste('~', cols[var]))
    means[var] = svymean(f, design, na.rm=TRUE)
  }
  return(means)
}
res1 = matrix(0, nrow=3, ncol=d)
res1[1,] = calc_means(pw1des)
res1[2,] = calc_means(pw2des)
res1[3,] = calc_means(pw3des)

colnames(res1) = cols
rownames(res1) = c(2011, 2013, 2015)
res1

eth_res_final = rbind(eth_res, res1)

write_csv(eth_res, 'natl_agg_results/eth_res.csv')

cmhoove14/AgSchistOverlap documentation built on Nov. 18, 2019, 6:56 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cmhoove14/AgSchistOverlap
Agrochemical Pollution and Schistosomiasis areas of concern

In cmhoove14/AgSchistOverlap: Agrochemical Pollution and Schistosomiasis areas of concern

Description

Custom Functions

LSMS Agricultural Data

Cleaning/Transforming Fertilizer Data

Pesticide Data

R Package Documentation

Browse R Packages

We want your feedback!

cmhoove14/AgSchistOverlap Agrochemical Pollution and Schistosomiasis areas of concern

In cmhoove14/AgSchistOverlap: Agrochemical Pollution and Schistosomiasis areas of concern

Description

Custom Functions

LSMS Agricultural Data

Cleaning/Transforming Fertilizer Data

Pesticide Data

R Package Documentation

Browse R Packages

We want your feedback!

cmhoove14/AgSchistOverlap
Agrochemical Pollution and Schistosomiasis areas of concern