knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE)
library(survey) library(sf) library(tidyr) library(dplyr) library(readr) library(stringr) library(dummies) devtools::load_all()
# auxiliary function for handling sums of NA vectors # handles NA by returning NA if all are NA sumNA <- function(x) { if (all(is.na(x))) { return(NA) } else { return(sum(x, na.rm=TRUE)) } } # take a vector of inputs: if at least 1 input is 1, then return 1 else 0. # handles NA by returning NA if all are NA indicator <- function(x) { if (all(is.na(x))) { return(NA) } else if (sum(x, na.rm=TRUE) > 0) { return(1) } else { return(0) } } #' Converts a column of quantity data to kg based on unit coding provided #' by units. 1 = kg, 2 = g, 3 = liter, 4 = centiliter. Does not mutate original df. #' #' @param quant quantity (character) #' @param units specifies units of quant (integer) #' @param data (dataframe) #' #' @return converted quant vector unit_conv <- function(quant, units) { return(case_when( units == 1 ~ quant, units == 2 ~ quant/1000, units == 3 ~ quant, units == 4 ~ quant/100 )) } #' If x is 2 or 0, set to 0. This recodes an x column. Note that NAs are recoded to 0. recode <- function(x) ifelse(is.na(x) | x == 2 | x == 0, 0, 1) #' this is mainly used for taking a y/n column x, and wanting to return 0 if x is 0 else y for each observation. #' for example: if x = (0, 0, 1) and y = (NA, NA, 15), then we want to return (0, 0, 15). handleNA <- function(x, y) ifelse(x == 0, 0, y)
This notebook generates plot level fertilizer information. To keep consistency, we will only analyze B files. A files denote the first visit (Jan-Jun), while B files denote the second visit (Jul-Dec) in the year. This is because only 1/2 of the A files were intact, so we'll just use the B files (since all are intact).
We will also analyze 09 last since its format differs slightly.
Plot Area data is in AGSEC4, so we'll need those too.
#AGSEC3B w1_3B <- load_dta_from_googledrive('19Ah0MdOilhUqd-lrLXIbeklcvC-lTZGU') w2_3B <- load_dta_from_googledrive('1BVQD632HvuyVSUuGMojwWa-49OvZmscx') w3_3B <- load_dta_from_googledrive('1H4xv_8xMMS1E794ILtAFNHCIngbqy89P') w4_3B <- load_csv_from_googledrive('1d0y5MZtQmR6FTiS_nu5ETdVgCSV4rI3l') #AGSEC4B w1_4B <- load_dta_from_googledrive('1A73gVw8ZW2jQoktHowKtpMnOtfm5yYP9') w2_4B <- load_dta_from_googledrive('1J77rX_YqiUrfcZwFka5G9DlwnUd84DXs') w3_4B <- load_dta_from_googledrive('1-PJACIKbzvaMHzsY0Yo8bQ0ackBtNCuH') w4_4B <- load_csv_from_googledrive('1TSF51Buqy6Vtv_zZkzLMzkPz6wdI4RqE') #UNPS_GEOVARS_xxxx.dta w1_geo <- load_dta_from_googledrive('1DfdCow-OWianLigs9tnaVGQY8np73fK_') w2_geo <- load_dta_from_googledrive('1zy5I8KIxrN5v1FM2fwverG4mvjzKy5-F') w3_geo <- load_dta_from_googledrive('1kGQdP0NTGp4I--gjcBZmzDJf4yTvZpOS') # Weight data: GSEC1.dta w1_w = load_dta_from_googledrive('1653pmufjZDtxTZ3uCohb74rkUB_eggMl') w2_w = load_dta_from_googledrive('1-YE9mG85w5-VSgsEIDx9ihwLcSVtS3vm') w3_w = load_dta_from_googledrive('1ikEk0ejSIz-_8YDhdaXhIfPtIELkKQ_3') w4_w = load_csv_from_googledrive('1F7vwOz_OZKzb6IkgLZBrI_cdRKI1ZL1u')
w1_weight = w1_w %>% select(HHID, weight=wgt09) %>% mutate(start_yr=2009)# wgt09 w2_weight = w2_w %>% select(HHID, weight=wgt10) %>% mutate(start_yr = 2010)# wgt10 w3_weight = w3_w %>% select(HHID, weight= mult) %>% mutate(start_yr = 2011) # mult w4_weight = w4_w %>% select(HHID, weight= wgt_X) %>% mutate(start_yr = 2015) # wgt_X w4_weight$HHID = as.numeric(gsub("^0+", x=gsub('\\D+', x=w4_weight$HHID, ""), "")) # convert HHIDs back to numeric form ugaweights = rbind(w1_weight, w2_weight, w3_weight, w4_weight)
This includes reported org/inorg fert, quantities, and types if applicable. Quantities are coded as kg/liters, but I am making the assumption that 1 kg = 1 liter.
w1_fert <- w1_3B %>% select(hhid=HHID, parcelid=a3bq1, plotid=a3bq3, ofert_rep_yn = a3bq4, quant_ofert = a3bq5, iofert_rep_yn = a3bq14, type_iofert = a3bq15, quant_iofert = a3bq16, pest_yn = a3bq26, quant_pest = a3bq28b) w2_fert <- w2_3B %>% select(hhid=HHID, parcelid=prcid, plotid=pltid, ofert_rep_yn = a3bq4, quant_ofert = a3bq5, iofert_rep_yn = a3bq14, type_iofert = a3bq15, quant_iofert = a3bq16, pest_yn = a3bq26, quant_pest = a3bq28b) w3_fert <- w3_3B %>% select(hhid=HHID, parcelid = parcelID, plotid=plotID, ofert_rep_yn = a3bq4, quant_ofert = a3bq5, iofert_rep_yn = a3bq13, type_iofert = a3bq14, quant_iofert = a3bq15, pest_yn = a3bq22, quant_pest = a3bq24b) w4_fert <- w4_3B %>% select(hhid=HHID, parcelid=parcelID, plotid=plotID, ofert_rep_yn = a3bq4, quant_ofert = a3bq5, iofert_rep_yn = a3bq13, type_iofert = a3bq14, quant_iofert = a3bq15, pest_yn = a3bq22, quant_pest = a3bq24b)
Area data is one level deeper than the plot data, so each plot's area will be the sum of the areas of each 'subplot', where a subplot corresponds to a crop. Thus we need to group by plot_id. Also area is in acres.
w1_area <- w1_4B %>% select(hhid=HHID, parcelid = a4bq2, plotid=a4bq4, area=a4bq8) %>% group_by(hhid, parcelid, plotid) %>% summarize(area=sum(area)) w2_area <- w2_4B %>% select(hhid=HHID, parcelid = prcid, plotid=pltid, area=a4bq8) %>% group_by(hhid, parcelid, plotid) %>% summarize(area=sum(area)) w3_area <- w3_4B %>% select(hhid=HHID, parcelid = parcelID, plotid = plotID, area=a4bq7) %>% group_by(hhid, parcelid, plotid) %>% summarize(area=sum(area)) w4_area <- w4_4B %>% select(hhid=HHID, parcelid=parcelID, plotid=plotID, area=a4bq7) %>% group_by(hhid, parcelid, plotid) %>% summarize(area=sum(area))
w1 <- w1_fert %>% left_join(w1_area, by = c('hhid', 'parcelid', 'plotid')) w2 <- w2_fert %>% left_join(w2_area, by = c('hhid', 'parcelid', 'plotid')) w3 <- w3_fert %>% left_join(w3_area, by = c('hhid', 'parcelid', 'plotid')) w4 <- w4_fert %>% left_join(w4_area, by = c('hhid', 'parcelid', 'plotid'))
w1t <- data.frame(country='uganda', start_yr=2009, end_yr=2010, w1, stringsAsFactors = FALSE) w2t <- data.frame(country='uganda', start_yr=2010, end_yr=2011, w2, stringsAsFactors = FALSE) w3t <- data.frame(country='uganda', start_yr=2011, end_yr=2012, w3, stringsAsFactors = FALSE) w4t <- data.frame(country='uganda', start_yr=2015, end_yr=2016, w4, stringsAsFactors = FALSE) uga <- rbind(w1t, w2t, w3t, w4t)
1) need to create dummy vars for each type of inorganic fert 2) need to handle NAs for nonreported data
uga
Since there is no question of 'was fertilizer used' to discern whether NAs in iofert/ofert correspond to 0s, we will make the assumption that NAs mean 0 here. This assumption was NOT made in the other LSMS surveys. This occurs in approximately 1.9k cases.
uga[is.na(uga$ofert_rep_yn) | is.na(uga$iofert_rep_yn | is.na(uga$pest_yn)), c('ofert_rep_yn', 'iofert_rep_yn', 'pest_yn')]
# we will recode all 2s to 1s. also, if no org. fert, inorg fert. or pest is used, then the corresponding quantities/types will be set to 0. uga <- uga %>% mutate_at(c('ofert_rep_yn', 'iofert_rep_yn', 'pest_yn'), recode) uga <- uga %>% mutate_at('quant_ofert', list(~handleNA(ofert_rep_yn, .))) %>% mutate_at(c('quant_iofert', 'type_iofert'), list(~handleNA(iofert_rep_yn, .))) %>% mutate_at('quant_pest', list(~handleNA(pest_yn, .))) # handle more NAs (if fert was reported but the type is unreported, then categorize it as other/mixed (4)) uga[(uga$iofert_rep_yn == 1) & (is.na(uga$type_iofert)), ] <- 4
# this function creates a quantity col for a corresponding fertilizer type. agg_quant <- function(type, df) { return(ifelse(df$type_iofert == type, df$quant_iofert, 0)) } # this function creates a type col for a corresponding fertilizer type agg_yn <- function(type, df) { return(as.numeric((!is.na(df$type_iofert)) & (df$type_iofert == type))) } uga_mod <- uga %>% mutate(nitrate_quant=agg_quant(1, uga), phosphate_quant=agg_quant(2, uga), potash_quant=agg_quant(3, uga), mixed_quant=agg_quant(4, uga)) uga_mod <- uga_mod %>% mutate(nitrate_yn=agg_yn(1, uga), phosphate_yn=agg_yn(2, uga), potash_yn=agg_yn(3, uga), mixed_yn=agg_yn(4, uga))
Scale area (acres) to meters^2. Conv factor is 1 acre = 4046.86 m^2
uga_mod$area <- uga_mod$area*4046.86 uga_final <- uga_mod
uga<- uga_final %>% select(country, start_yr, end_yr, hhid, plotid, parcelid, ofert_rep_yn, quant_ofert, iofert_rep_yn, type_iofert, quant_iofert, pest_yn, quant_pest, nitrate_quant, phosphate_quant, potash_quant, mixed_quant, nitrate_yn, phosphate_yn, potash_yn, mixed_yn, area) %>% rename(quant_nitrate = nitrate_quant, quant_phosphate = phosphate_quant, quant_potash = potash_quant, quant_mixed = mixed_quant) uga = uga %>% mutate(fert_dummy = as.numeric(iofert_rep_yn + ofert_rep_yn > 0)) uga
uga_joined = uga %>% inner_join(ugaweights, by = c('start_yr', 'hhid'='HHID')) uga_joined = uga_joined %>% drop_na(weight) # 15 missing weights - we can try to impute later?
sum(is.na(uga_joined$weight))
w1 = uga_joined %>% filter(start_yr==2009) w2 = uga_joined %>% filter(start_yr==2010) w3 = uga_joined %>% filter(start_yr==2011) w4 = uga_joined %>% filter(start_yr==2015) pw1des = svydesign(id = ~hhid, weights = ~weight, nest = TRUE, data = w1) pw2des = svydesign(id = ~hhid, weights = ~weight, nest = TRUE, data = w2) pw3des = svydesign(id = ~hhid, weights = ~weight, nest = TRUE, data = w3) pw4des = svydesign(id= ~hhid, weights = ~weight, nest = TRUE, data = w4)
uga_joined
cols = c('fert_dummy', 'ofert_rep_yn', 'iofert_rep_yn', 'pest_yn', 'quant_ofert', 'quant_iofert', 'quant_pest') d = length(cols) calc_means = function(design) { means = c(0, d) for (var in 1:d) { f = as.formula(paste('~', cols[var])) means[var] = svymean(f, design, na.rm=TRUE) } return(means) } res1 = matrix(0, nrow=4, ncol=d) res1[1,] = calc_means(pw1des) res1[2,] = calc_means(pw2des) res1[3,] = calc_means(pw3des) res1[4,] = calc_means(pw4des) colnames(res1) = cols rownames(res1) = c(2009, 2010, 2011, 2015) res1 = data.frame(res1)
write_csv(res1, 'natl_agg_results/uga_res.csv')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.