In cmhoove14/AgSchistOverlap: Agrochemical Pollution and Schistosomiasis areas of concern

knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE)

library(survey)
library(sf)
library(tidyr)
library(dplyr)
library(readr)
library(stringr)
library(dummies)
devtools::load_all()

# auxiliary function for handling sums of NA vectors
# handles NA by returning NA if all are NA
sumNA <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  } else {
    return(sum(x, na.rm=TRUE))
  }
}

# take a vector of inputs: if at least 1 input is 1, then return 1 else 0.
# handles NA by returning NA if all are NA
indicator <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  } else if (sum(x, na.rm=TRUE) > 0) {
    return(1)
  } else {
    return(0)
  }
}

#' Converts a column of quantity data to kg based on unit coding provided
#' by units. 1 = kg, 2 = g, 3 = liter, 4 = centiliter. Does not mutate original df.
#'
#' @param quant quantity (character) 
#' @param units specifies units of quant (integer)
#' @param data (dataframe)
#'
#' @return converted quant vector
unit_conv <- function(quant, units) {
  return(case_when(
    units == 1 ~ quant,
    units == 2 ~ quant/1000,
    units == 3 ~ quant,
    units == 4 ~ quant/100
  ))
}

#' If x is 2 or 0, set to 0. This recodes an x column. Note that NAs are recoded to 0.
recode <- function(x) ifelse(is.na(x) | x == 2 | x == 0, 0, 1)

#' this is mainly used for taking a y/n column x, and wanting to return 0 if x is 0 else y for each observation.
#' for example: if x = (0, 0, 1) and y = (NA, NA, 15), then we want to return (0, 0, 15).
handleNA <- function(x, y) ifelse(x == 0, 0, y)

Description

This notebook generates plot level fertilizer information. To keep consistency, we will only analyze B files. A files denote the first visit (Jan-Jun), while B files denote the second visit (Jul-Dec) in the year. This is because only 1/2 of the A files were intact, so we'll just use the B files (since all are intact).

We will also analyze 09 last since its format differs slightly.

Plot Area data is in AGSEC4, so we'll need those too.

#AGSEC3B
w1_3B <- load_dta_from_googledrive('19Ah0MdOilhUqd-lrLXIbeklcvC-lTZGU')
w2_3B <- load_dta_from_googledrive('1BVQD632HvuyVSUuGMojwWa-49OvZmscx')
w3_3B <- load_dta_from_googledrive('1H4xv_8xMMS1E794ILtAFNHCIngbqy89P')
w4_3B <- load_csv_from_googledrive('1d0y5MZtQmR6FTiS_nu5ETdVgCSV4rI3l')

#AGSEC4B
w1_4B <- load_dta_from_googledrive('1A73gVw8ZW2jQoktHowKtpMnOtfm5yYP9')
w2_4B <- load_dta_from_googledrive('1J77rX_YqiUrfcZwFka5G9DlwnUd84DXs')
w3_4B <- load_dta_from_googledrive('1-PJACIKbzvaMHzsY0Yo8bQ0ackBtNCuH')
w4_4B <- load_csv_from_googledrive('1TSF51Buqy6Vtv_zZkzLMzkPz6wdI4RqE')

#UNPS_GEOVARS_xxxx.dta
w1_geo <- load_dta_from_googledrive('1DfdCow-OWianLigs9tnaVGQY8np73fK_')
w2_geo <- load_dta_from_googledrive('1zy5I8KIxrN5v1FM2fwverG4mvjzKy5-F')
w3_geo <- load_dta_from_googledrive('1kGQdP0NTGp4I--gjcBZmzDJf4yTvZpOS')

# Weight data: GSEC1.dta
w1_w = load_dta_from_googledrive('1653pmufjZDtxTZ3uCohb74rkUB_eggMl')
w2_w = load_dta_from_googledrive('1-YE9mG85w5-VSgsEIDx9ihwLcSVtS3vm')
w3_w = load_dta_from_googledrive('1ikEk0ejSIz-_8YDhdaXhIfPtIELkKQ_3')
w4_w = load_csv_from_googledrive('1F7vwOz_OZKzb6IkgLZBrI_cdRKI1ZL1u')

w1_weight = w1_w %>% select(HHID, weight=wgt09) %>% mutate(start_yr=2009)# wgt09
w2_weight = w2_w %>% select(HHID, weight=wgt10) %>% mutate(start_yr = 2010)# wgt10
w3_weight = w3_w %>% select(HHID, weight= mult) %>% mutate(start_yr = 2011) # mult
w4_weight = w4_w %>% select(HHID, weight= wgt_X) %>% mutate(start_yr = 2015) # wgt_X
w4_weight$HHID = as.numeric(gsub("^0+", x=gsub('\\D+', x=w4_weight$HHID, ""), "")) # convert HHIDs back to numeric form

ugaweights = rbind(w1_weight, w2_weight, w3_weight, w4_weight)

Fertilizer/Pesticide data

This includes reported org/inorg fert, quantities, and types if applicable. Quantities are coded as kg/liters, but I am making the assumption that 1 kg = 1 liter.

w1_fert <- w1_3B %>% select(hhid=HHID, parcelid=a3bq1, plotid=a3bq3,  ofert_rep_yn = a3bq4, quant_ofert = a3bq5, iofert_rep_yn = a3bq14, type_iofert = a3bq15, quant_iofert = a3bq16, pest_yn = a3bq26, quant_pest = a3bq28b) 

w2_fert <- w2_3B  %>% select(hhid=HHID, parcelid=prcid, plotid=pltid, ofert_rep_yn = a3bq4, quant_ofert = a3bq5, iofert_rep_yn = a3bq14, type_iofert = a3bq15, quant_iofert = a3bq16, pest_yn = a3bq26, quant_pest = a3bq28b)


w3_fert <- w3_3B %>% select(hhid=HHID, parcelid = parcelID, plotid=plotID, ofert_rep_yn = a3bq4, quant_ofert = a3bq5, iofert_rep_yn = a3bq13, type_iofert = a3bq14, quant_iofert = a3bq15, pest_yn = a3bq22, quant_pest = a3bq24b)

w4_fert <- w4_3B %>% select(hhid=HHID, parcelid=parcelID, plotid=plotID, ofert_rep_yn = a3bq4, quant_ofert = a3bq5, iofert_rep_yn = a3bq13, type_iofert = a3bq14, quant_iofert = a3bq15, pest_yn = a3bq22, quant_pest = a3bq24b)

Area data is one level deeper than the plot data, so each plot's area will be the sum of the areas of each 'subplot', where a subplot corresponds to a crop. Thus we need to group by plot_id. Also area is in acres.

w1_area <- w1_4B %>% select(hhid=HHID, parcelid = a4bq2, plotid=a4bq4, area=a4bq8) %>% group_by(hhid, parcelid, plotid) %>% summarize(area=sum(area))

w2_area <- w2_4B %>% select(hhid=HHID, parcelid = prcid, plotid=pltid, area=a4bq8) %>% group_by(hhid, parcelid, plotid) %>% summarize(area=sum(area))

w3_area <- w3_4B %>% select(hhid=HHID, parcelid = parcelID, plotid = plotID, area=a4bq7) %>% group_by(hhid, parcelid, plotid) %>% summarize(area=sum(area))

w4_area <- w4_4B %>% select(hhid=HHID, parcelid=parcelID, plotid=plotID, area=a4bq7) %>% group_by(hhid, parcelid, plotid) %>% summarize(area=sum(area))

w1 <- w1_fert %>% left_join(w1_area, by = c('hhid', 'parcelid', 'plotid'))
w2 <- w2_fert %>% left_join(w2_area, by = c('hhid', 'parcelid', 'plotid'))
w3 <- w3_fert %>% left_join(w3_area, by = c('hhid', 'parcelid', 'plotid'))
w4 <- w4_fert %>% left_join(w4_area, by = c('hhid', 'parcelid', 'plotid'))

w1t <- data.frame(country='uganda', start_yr=2009, end_yr=2010, w1, stringsAsFactors = FALSE)
w2t <- data.frame(country='uganda', start_yr=2010, end_yr=2011, w2, stringsAsFactors = FALSE)
w3t <- data.frame(country='uganda', start_yr=2011, end_yr=2012, w3, stringsAsFactors = FALSE)
w4t <- data.frame(country='uganda', start_yr=2015, end_yr=2016, w4, stringsAsFactors = FALSE)
uga <- rbind(w1t, w2t, w3t, w4t)

Transforming Fertilizer data

1) need to create dummy vars for each type of inorganic fert 2) need to handle NAs for nonreported data

uga

Handling NAs

Since there is no question of 'was fertilizer used' to discern whether NAs in iofert/ofert correspond to 0s, we will make the assumption that NAs mean 0 here. This assumption was NOT made in the other LSMS surveys. This occurs in approximately 1.9k cases.

uga[is.na(uga$ofert_rep_yn) | is.na(uga$iofert_rep_yn | is.na(uga$pest_yn)), c('ofert_rep_yn', 'iofert_rep_yn', 'pest_yn')]

# we will recode all 2s to 1s. also, if no org. fert, inorg fert. or pest is used, then the corresponding quantities/types will be set to 0.
uga <- uga %>% mutate_at(c('ofert_rep_yn', 'iofert_rep_yn', 'pest_yn'), recode)

uga <- uga %>% mutate_at('quant_ofert', list(~handleNA(ofert_rep_yn, .))) %>% mutate_at(c('quant_iofert', 'type_iofert'), list(~handleNA(iofert_rep_yn, .))) %>% mutate_at('quant_pest', list(~handleNA(pest_yn, .)))

# handle more NAs (if fert was reported but the type is unreported, then categorize it as other/mixed (4))
uga[(uga$iofert_rep_yn == 1) & (is.na(uga$type_iofert)), ] <- 4

# this function creates a quantity col for a corresponding fertilizer type.
agg_quant <- function(type, df) {
  return(ifelse(df$type_iofert == type, df$quant_iofert, 0))
}

# this function creates a type col for a corresponding fertilizer type
agg_yn <- function(type, df) {
  return(as.numeric((!is.na(df$type_iofert)) & (df$type_iofert == type)))
}

uga_mod <- uga %>% mutate(nitrate_quant=agg_quant(1, uga), phosphate_quant=agg_quant(2, uga), potash_quant=agg_quant(3, uga), mixed_quant=agg_quant(4, uga))

uga_mod <- uga_mod %>% mutate(nitrate_yn=agg_yn(1, uga), phosphate_yn=agg_yn(2, uga), potash_yn=agg_yn(3, uga), mixed_yn=agg_yn(4, uga))

Scale area (acres) to meters^2. Conv factor is 1 acre = 4046.86 m^2

uga_mod$area <- uga_mod$area*4046.86
uga_final <- uga_mod

uga<- uga_final %>% select(country, start_yr, end_yr, hhid, plotid, parcelid, ofert_rep_yn, quant_ofert, iofert_rep_yn, type_iofert, quant_iofert, pest_yn, quant_pest, nitrate_quant, phosphate_quant, potash_quant, mixed_quant, nitrate_yn, phosphate_yn, potash_yn, mixed_yn, area) %>% rename(quant_nitrate = nitrate_quant, quant_phosphate = phosphate_quant, quant_potash = potash_quant, quant_mixed = mixed_quant)
uga = uga %>% mutate(fert_dummy = as.numeric(iofert_rep_yn + ofert_rep_yn > 0))
uga

Survey Weighting

uga_joined = uga %>% inner_join(ugaweights, by = c('start_yr', 'hhid'='HHID'))
uga_joined = uga_joined %>% drop_na(weight) # 15 missing weights - we can try to impute later?

sum(is.na(uga_joined$weight))

w1 = uga_joined %>% filter(start_yr==2009)
w2 = uga_joined %>% filter(start_yr==2010)
w3 = uga_joined %>% filter(start_yr==2011)
w4 = uga_joined %>% filter(start_yr==2015)

pw1des = svydesign(id = ~hhid,
                        weights = ~weight,
                        nest = TRUE,
                        data = w1)
pw2des = svydesign(id = ~hhid,
                        weights = ~weight,
                        nest = TRUE,
                        data = w2)
pw3des = svydesign(id = ~hhid,
                        weights = ~weight,
                        nest = TRUE,
                        data = w3)
pw4des = svydesign(id= ~hhid,
                   weights = ~weight,
                   nest = TRUE,
                   data = w4)

uga_joined

cols = c('fert_dummy', 'ofert_rep_yn', 'iofert_rep_yn', 'pest_yn', 'quant_ofert', 'quant_iofert', 'quant_pest')  
d = length(cols)

calc_means = function(design) {
  means = c(0, d)
  for (var in 1:d) {
    f = as.formula(paste('~', cols[var]))
    means[var] = svymean(f, design, na.rm=TRUE)
  }
  return(means)
}
res1 = matrix(0, nrow=4, ncol=d)
res1[1,] = calc_means(pw1des)
res1[2,] = calc_means(pw2des)
res1[3,] = calc_means(pw3des)
res1[4,] = calc_means(pw4des)

colnames(res1) = cols
rownames(res1) = c(2009, 2010, 2011, 2015)
res1 = data.frame(res1)

write_csv(res1, 'natl_agg_results/uga_res.csv')

cmhoove14/AgSchistOverlap documentation built on Nov. 18, 2019, 6:56 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cmhoove14/AgSchistOverlap
Agrochemical Pollution and Schistosomiasis areas of concern

In cmhoove14/AgSchistOverlap: Agrochemical Pollution and Schistosomiasis areas of concern

Description

Fertilizer/Pesticide data

Transforming Fertilizer data

Handling NAs

Survey Weighting

R Package Documentation

Browse R Packages

We want your feedback!

cmhoove14/AgSchistOverlap Agrochemical Pollution and Schistosomiasis areas of concern

In cmhoove14/AgSchistOverlap: Agrochemical Pollution and Schistosomiasis areas of concern

Description

Fertilizer/Pesticide data

Transforming Fertilizer data

Handling NAs

Survey Weighting

R Package Documentation

Browse R Packages

We want your feedback!

cmhoove14/AgSchistOverlap
Agrochemical Pollution and Schistosomiasis areas of concern