R/clean_functions.R

#' parity_adj
#'
#' \code{parity_adj} calculates adjusted parity ratios.
#'
#' @param df a data frame with a key / value columns.
#' @param col indicator key column.
#' @param a indicator for 'disadvantaged' group (numerator).
#' @param b indicator for 'advantaged' group  (denominator).
#' @param varname name for calculated indice (character).
#' @param val_status For use with data with flags for estimated observations. If
#'   `TRUE` will calculate flag for indice (requires flag column to be named
#'   `val_status` and estimates lablled as `E`.
#' @return A data frame.
#' @export
#' @family summarise
#' @examples
#'
#' parity_adj(df, indicator, ger.female, ger.male, "ger.gpia", val_status = FALSE)
#'
#' parity_indices <- list(df = list("df","df"),
#' col = list("ind",  "ind"),
#' a = list("adult.profiliteracy.f","adult.profinumeracy.f"),
#' b = list("adult.profiliteracy.m", "adult.profinumeracy.m"),
#' varname = list("adult.profiliteracy.gpia", "adult.profinumeracy.gpia")) %>%
#' pmap(parity_adj) %>%


parity_adj <- function(df, col, a, b, varname, val_status = FALSE) {

  df <- dynGet(df)
  col <- as.name(col)

  indice <- df %>%
    dplyr::filter(!!col %in% c(a, b))

  indice <- dplyr::group_by(indice, iso2c, year) %>%
    dplyr::filter(n()==2 ) %>%
    {if(val_status == FALSE) dplyr::summarise(., value = ifelse(value[!!col == !!a] > value[!!col == !!b], 2 - (1/(value[!!col == !!a]/value[!!col == !!b])), value[!!col == !!a]/value[!!col == !!b]))
      else
        dplyr::summarise(., value = ifelse(value[!!col == !!a] > value[!!col == !!b], 2 - (1/(value[!!col == !!a]/value[!!col == !!b])), value[!!col == !!a]/value[!!col == !!b]),
                  val_status = ifelse(val_status[!!col == !!a] == "E" | val_status[!!col == !!b] == "E", "E", "A")) } %>%
    dplyr::mutate(ind = varname) %>%
    dplyr::filter(!is.na(value))
}


#' uis_clean
#'
#' \code{uis_clean} is a function to clean UIS data.
#'
#' Cleans data from UIS api queries and computes several variables
#' (admi.grade2or3prim, admi.endofprim, admi.endoflowersec, Comp.02, Free.02,
#' Comp.2t3, Free.2t3, Read.Primary.GPIA, Math.Primary.GPIA, Read.LowerSec.GPIA,
#' Math.LowerSec.GPIA, LR.Ag15t24.GPIA, LR.Ag15t99.GPIA, Read.Primary.WPIA,
#' Math.Primary.WPIA, Read.LowerSec.WPIA, Math.LowerSec.WPIA)
#'@family clean

uis_clean <- function(df) {

  clean1 <- df %>%
    tidyr::unite(col = var_concat, STAT_UNIT, UNIT_MEASURE, EDU_LEVEL, EDU_CAT, SEX, AGE, GRADE, SECTOR_EDU, EDU_ATTAIN, WEALTH_QUINTILE, LOCATION,
          EDU_TYPE, EDU_FIELD, SUBJECT, INFRASTR, SE_BKGRD, TEACH_EXPERIENCE, CONTRACT_TYPE, COUNTRY_ORIGIN, REGION_DEST,
          EXPENDITURE_TYPE, SOURCE_FUND, FUND_FLOW) %>%
    dplyr::select(iso2c = REF_AREA, var_concat, year = TIME_PERIOD, value = OBS_VALUE, val_status = OBS_STATUS) %>%
    dplyr::mutate(value = as.numeric(value),
           value = ifelse(val_status == "Z", NA, value),
           year = as.numeric(year)) %>%
    dplyr::filter(!is.na(value)) %>%
    unique()

  clean1 %>%
    dplyr::group_by(iso2c, var_concat) %>%
    dplyr::filter(year == max(year)) %>%
    dplyr::ungroup() %>%
    R.cache::saveCache(key=list("uis_comp"), comment="uis_comp")


  clean2 <- clean1 %>%
    dplyr::inner_join(.gemrtables.pkg.env$indicators[, 1:2], by = "var_concat")

  admin_assessment <- clean1 %>%
    dplyr::filter(stringr::str_detect(var_concat, "ADMIN_NB")) %>%
    dplyr::group_by(iso2c, year) %>%
    tidyr::spread(key = var_concat, value = value) %>%
    dplyr::mutate(admi.grade2or3prim = ifelse(ADMIN_NB_L1__T__T__T_G2_3_INST_T__Z__Z__T__T__T_MATH__Z__T__Z__Z_W00_W00_NA_NA_NA == 1 | ADMIN_NB_L1__T__T__T_G2_3_INST_T__Z__Z__T__T__T_READING__Z__T__Z__Z_W00_W00_NA_NA_NA == 1, 1, 0),
           admi.endofprim = ifelse(ADMIN_NB_L1__T__T__T_GLAST_INST_T__Z__Z__T__T__T_MATH__Z__T__Z__Z_W00_W00_NA_NA_NA == 1 | ADMIN_NB_L1__T__T__T_GLAST_INST_T__Z__Z__T__T__T_READING__Z__T__Z__Z_W00_W00_NA_NA_NA == 1, 1, 0),
           admi.endoflowersec = ifelse(ADMIN_NB_L2__T__T__T_GLAST_INST_T__Z__Z__T__T__T_READING__Z__T__Z__Z_W00_W00_NA_NA_NA == 1 | ADMIN_NB_L2__T__T__T_GLAST_INST_T__Z__Z__T__T__T_MATH__Z__T__Z__Z_W00_W00_NA_NA_NA == 1, 1, 0),
           val_status = "A") %>%
    dplyr::select(iso2c, year, contains("admi."), val_status) %>%
    tidyr::gather(key = "ind", value = "value", -iso2c, -year, -val_status) %>%
    dplyr::mutate(value = ifelse(is.na(value), 0, value))

  # More accurate calculation based on published net flow of int'l mobile students,
  # but currently inbound rate is more available.
  # inbound_stu_add <- clean1 %>%
  #   dplyr::filter(var_concat %in% c(
  #     "MENF_PER_L5T8__T__T__T__T_INST_T__Z__Z__T__T__T__T__Z__Z__Z__Z_W00_W00_NA_NA_NA",
  #     "OE_PER_L5T8__T__T__T__T_INST_T__Z__Z__T__T__T__T__Z__Z__Z__Z_W00_W00_NA_NA_NA"
  #   )) %>%
  #   dplyr::group_by(iso2c, year) %>%
  #   dplyr::filter(n()==2) %>%
  #   dplyr::summarise(
  #     value = sum(value),
  #     val_status = ifelse(any(val_status == "E"), "E", "A")) %>%
  #   dplyr::mutate(ind = "IE.5t8.40510") %>%
  #   dplyr::select(iso2c, year, ind, value, val_status) %>%
  #   group_by(iso2c, ind) %>%
  #   filter(year == max(year)) %>%
  #   ungroup()

  inbound_stu <- clean1 %>%
    dplyr::filter(var_concat %in% c(
        "STU_PER_L5T8__T__T__T__T_INST_T__Z__Z__T__T__T__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
        "MSEP_PT_L5T8__T__T__T__T_INST_T__Z__Z__T__T__T__T__Z__Z__Z__Z_W00_W00_NA_NA_NA"
                             )) %>%
    dplyr::group_by(iso2c, year) %>%
    dplyr::filter(n()==2) %>%
    dplyr::summarise(
      value = value[1] * (value[2]/100),
      val_status = ifelse(any(val_status == "E"), "E", "A")) %>%
    dplyr::mutate(ind = "IE.5t8.40510") %>%
    dplyr::select(iso2c, year, ind, value, val_status) %>%
    group_by(iso2c, ind) %>%
    filter(year == max(year)) %>%
    ungroup()

  parity_indices <- list(df = list("clean1", "clean1", "clean1", "clean1", "clean1", "clean1", "clean1", "clean1", "clean1", "clean1",
                                   "clean1", "clean1", "clean1", "clean1"),
                         col = list("var_concat",  "var_concat", "var_concat", "var_concat", "var_concat",  "var_concat", "var_concat",
                                    "var_concat", "var_concat", "var_concat", "var_concat", "var_concat",  "var_concat", "var_concat"),
                         a = list("STU_PT_L1__T_F__T_GLAST_INST_T__Z__T__T__T_ISC_F00_READING__Z__T__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L1__T_F__T_GLAST_INST_T__Z__T__T__T_ISC_F00_MATH__Z__T__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L2__T_F__T_GLAST_INST_T__Z__T__T__T_ISC_F00_READING__Z__T__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L2__T_F__T_GLAST_INST_T__Z__T__T__T_ISC_F00_MATH__Z__T__Z__Z_W00_W00_NA_NA_NA",
                                  "LR_PT__Z__Z_F_Y15T24__Z__Z__Z__Z__T__Z__Z__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "LR_PT__Z__Z_F_Y_GE15__Z__Z__Z__Z__T__Z__Z__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L1__T__T__T_GLAST_INST_T__Z_Q1__T__T_ISC_F00_READING__Z_LOW__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L1__T__T__T_GLAST_INST_T__Z_Q1__T__T_ISC_F00_MATH__Z_LOW__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L2__T__T__T_GLAST_INST_T__Z_Q1__T__T_ISC_F00_READING__Z_LOW__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L2__T__T__T_GLAST_INST_T__Z_Q1__T__T_ISC_F00_MATH__Z_LOW__Z__Z_W00_W00_NA_NA_NA",
                                  "GER_PT_L02__T_F_SCH_AGE_GROUP__T_INST_T__Z__Z__T__T__T__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "GER_PT_L1__T_F_SCH_AGE_GROUP__T_INST_T__Z__Z__T__T__T__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "GER_PT_L2_3__T_F_SCH_AGE_GROUP__T_INST_T__Z__Z__T__T__T__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "GER_PT_L5T8__T_F_SCH_AGE_GROUP__T_INST_T__Z__Z__T__T__T__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA"),
                         b = list("STU_PT_L1__T_M__T_GLAST_INST_T__Z__T__T__T_ISC_F00_READING__Z__T__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L1__T_M__T_GLAST_INST_T__Z__T__T__T_ISC_F00_MATH__Z__T__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L2__T_M__T_GLAST_INST_T__Z__T__T__T_ISC_F00_READING__Z__T__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L2__T_M__T_GLAST_INST_T__Z__T__T__T_ISC_F00_MATH__Z__T__Z__Z_W00_W00_NA_NA_NA",
                                  "LR_PT__Z__Z_M_Y15T24__Z__Z__Z__Z__T__Z__Z__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "LR_PT__Z__Z_M_Y_GE15__Z__Z__Z__Z__T__Z__Z__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L1__T__T__T_GLAST_INST_T__Z_Q5__T__T_ISC_F00_READING__Z_HIGH__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L1__T__T__T_GLAST_INST_T__Z_Q5__T__T_ISC_F00_MATH__Z_HIGH__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L2__T__T__T_GLAST_INST_T__Z_Q5__T__T_ISC_F00_READING__Z_HIGH__Z__Z_W00_W00_NA_NA_NA",
                                  "STU_PT_L2__T__T__T_GLAST_INST_T__Z_Q5__T__T_ISC_F00_MATH__Z_HIGH__Z__Z_W00_W00_NA_NA_NA",
                                  "GER_PT_L02__T_M_SCH_AGE_GROUP__T_INST_T__Z__Z__T__T__T__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "GER_PT_L1__T_M_SCH_AGE_GROUP__T_INST_T__Z__Z__T__T__T__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "GER_PT_L2_3__T_M_SCH_AGE_GROUP__T_INST_T__Z__Z__T__T__T__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA",
                                  "GER_PT_L5T8__T_M_SCH_AGE_GROUP__T_INST_T__Z__Z__T__T__T__Z__Z__Z__Z__Z_W00_W00_NA_NA_NA"),
                         varname = list("Read.Primary.GPIA", "Math.Primary.GPIA", "Read.LowerSec.GPIA", "Math.LowerSec.GPIA",
                                        "LR.Ag15t24.GPIA", "LR.Ag15t99.GPIA", "Read.Primary.WPIA", "Math.Primary.WPIA",
                                        "Read.LowerSec.WPIA", "Math.LowerSec.WPIA", "GER.02.GPIA", "GER.1.GPIA", "GER.2t3.GPIA",
                                        "GER.5t8.GPIA"),
                         val_status = list(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE)) %>%
    purrr::pmap(parity_adj) %>%
    purrr::reduce(dplyr::bind_rows)

  cleaned <- dplyr::bind_rows(clean2, inbound_stu, parity_indices) %>%
    dplyr::mutate(source = "UIS", year = as.numeric(year)) %>%
    dplyr::select(iso2c, year, ind, value, val_status, source) %>%
    dplyr::filter(!is.na(value))

}

#' cedar_clean
#'
#' \code{cedar_clean} is a function to clean data imported by \code{read_cedar}.
#'
#' Cleans data imported from cedar database and computes several variables
#' (CR.1.GPIA, CR.1.LPIA, CR.1.WPIA, CR.2.GPIA, CR.2.LPIA, CR.2.WPIA, CR.3.GPIA,
#' CR.3.LPIA, CR.3.WPIA, chores.28plus.12t14.GPIA)
#'@family clean
#'@seealso \code{\link{read_cedar}}

cedar_clean <- function(df) {

  clean1 <- df %>%
    dplyr::mutate(ind = dplyr::case_when(indicator == "trans_prim_m" & is.na(sex_id) ~ "TranRA.2.GPV.cp",
                           indicator == "comp_prim_v2_m" & is.na(sex_id)  & is.na(location_id) & is.na(wealth_id) ~ "CR.1",
                           indicator == "comp_prim_v2_m" & sex_id == 1 & is.na(wealth_id) ~ "CR.1.f",
                           indicator == "comp_prim_v2_m" & sex_id == 2 & is.na(wealth_id) ~ "CR.1.m",
                           indicator == "comp_prim_v2_m" & location_id == 5 ~ "CR.1.rural",
                           indicator == "comp_prim_v2_m" & location_id == 6 ~ "CR.1.urban",
                           indicator == "comp_prim_v2_m" & wealth_id == 6 & is.na(sex_id) ~ "CR.1.q1",
                           indicator == "comp_prim_v2_m" & wealth_id == 10 & is.na(sex_id) ~ "CR.1.q5",
                           indicator == "comp_prim_v2_m" & wealth_id == 6 & sex_id == 1 ~ "CR.1.q1.f",
                           indicator == "comp_prim_v2_m" & wealth_id == 6 & sex_id == 2 ~ "CR.1.q1.m",
                           indicator == "comp_lowsec_v2_m"  & is.na(sex_id)  & is.na(location_id) & is.na(wealth_id) ~ "CR.2",
                           indicator == "comp_lowsec_v2_m" & sex_id == 1 & is.na(location_id) & is.na(wealth_id) ~ "CR.2.f",
                           indicator == "comp_lowsec_v2_m" & sex_id == 2 & is.na(location_id) & is.na(wealth_id) ~ "CR.2.m",
                           indicator == "comp_lowsec_v2_m" & location_id == 5 ~ "CR.2.rural",
                           indicator == "comp_lowsec_v2_m" & location_id == 6 ~ "CR.2.urban",
                           indicator == "comp_lowsec_v2_m" & wealth_id == 6 & is.na(sex_id) ~ "CR.2.q1",
                           indicator == "comp_lowsec_v2_m" & wealth_id == 10 & is.na(sex_id) ~ "CR.2.q5",
                           indicator == "comp_lowsec_v2_m" & wealth_id == 6 & sex_id == 1 ~ "CR.2.q1.f",
                           indicator == "comp_lowsec_v2_m" & wealth_id == 6 & sex_id == 2 ~ "CR.2.q1.m",
                           indicator == "comp_upsec_v2_m" & is.na(sex_id)  & is.na(location_id) & is.na(wealth_id) ~ "CR.3",
                           indicator == "comp_upsec_v2_m" & sex_id == 1 & is.na(wealth_id) ~ "CR.3.f",
                           indicator == "comp_upsec_v2_m" & sex_id == 2 & is.na(wealth_id) ~ "CR.3.m",
                           indicator == "comp_upsec_v2_m" & location_id == 5 ~ "CR.3.rural",
                           indicator == "comp_upsec_v2_m" & location_id == 6 ~ "CR.3.urban",
                           indicator == "comp_upsec_v2_m" & wealth_id == 6 & is.na(sex_id) ~ "CR.3.q1",
                           indicator == "comp_upsec_v2_m" & wealth_id == 10 & is.na(sex_id) ~ "CR.3.q5",
                           indicator == "comp_upsec_v2_m" & wealth_id == 6 & sex_id == 1 ~ "CR.3.q1.f",
                           indicator == "comp_upsec_v2_m" & wealth_id == 6 & sex_id == 2 ~ "CR.3.q1.m",
                           indicator == "u5_posit_home_learn"  ~ "home.lrn.env.3t7",
                           indicator == "u5_child_book"  ~ "home.book.u5",
                           indicator == "school_child_track"  ~ "OnTrack.three.domains",
                           indicator == "stu_exper_violence_13_17" & sex_id ==4  ~ "stu.viol.13t17",
                           indicator == "stu_exper_bully_13_17"  & sex_id == 4 ~ "stu.bully.13t17",
                           indicator == "esd_gced_curr_ge"  ~ "esd.curr.ge",
                           indicator == "esd_gced_curr_hr"  ~ "esd.curr.hr",
                           indicator == "esd_gced_glo_cit"  ~ "esd.curr.cit",
                           indicator == "esd_gced_sus_dev"  ~ "esd.sus.dev",
                           indicator == "child_chores_more_28_12_14" & sex_id == 4 ~ "chores.28plus.12t14",
                           indicator == "child_chores_more_28_12_14" & sex_id == 1 ~ "chores.28plus.12t14.f",
                           indicator == "child_chores_more_28_12_14" & sex_id == 2 ~ "chores.28plus.12t14.m"),
                  value = ifelse(stringr::str_detect(ind, stringr::regex("CR\\.|TranRA")), value*100, value),
                  val_status = ifelse(stringr::str_detect(ind, stringr::regex("CR\\.|TranRA")), "A", NA)) %>%

    dplyr::inner_join(.gemrtables.pkg.env$regions, by = c("country_code" = "iso3c")) %>%
    dplyr::filter(!is.na(ind))

  parity_indices <- list(df = list("clean1", "clean1", "clean1", "clean1", "clean1", "clean1", "clean1", "clean1", "clean1", "clean1"),
                         col = list("ind",  "ind", "ind", "ind", "ind",  "ind", "ind", "ind", "ind", "ind"),
                         a = list("CR.1.f", "CR.1.rural", "CR.1.q1", "CR.2.f", "CR.2.rural", "CR.2.q1", "CR.3.f", "CR.3.rural", "CR.3.q1", "chores.28plus.12t14.f"),
                         b = list("CR.1.m", "CR.1.urban", "CR.1.q5","CR.2.m", "CR.2.urban", "CR.2.q5", "CR.3.m", "CR.3.urban", "CR.3.q5", "chores.28plus.12t14.m"),
                         varname = list("CR.1.GPIA", "CR.1.LPIA", "CR.1.WPIA", "CR.2.GPIA", "CR.2.LPIA", "CR.2.WPIA",
                                        "CR.3.GPIA", "CR.3.LPIA", "CR.3.WPIA", "chores.28plus.12t14.GPIA")) %>%
    purrr::pmap(parity_adj) %>%
    purrr::reduce(dplyr::bind_rows)

  cleaned <- dplyr::bind_rows(clean1, parity_indices) %>%
    dplyr::ungroup() %>%
    dplyr::select(iso2c, year, ind, value, val_status) %>%
    dplyr::mutate(source = "cedar") %>%
    dplyr::filter(!is.na(value)) %>%
    dplyr::group_by(iso2c, ind) %>%
    dplyr::filter(year == max(year)) %>%
    dplyr::ungroup() %>%
    unique()

}

#' wb_clean
#'
#' \code{wb_clean} is a function to clean data imported by \code{[wbstats]{wb}} within \code{\link{other}} .
#'
#' Cleans data imported by \code{[wbstats]{wb}} and computes two variables
#' (adult.profiliteracy.gpia, adult.profinumeracy.gpia)
#'@family clean
#'@seealso \code{\link[wbstats]{wb}}, \code{\link{other}}

wb_clean <- function(df) {

  clean1 <- df %>%
    dplyr::mutate(ind = dplyr::case_when(indicatorID == "SH.STA.STNT.ZS" ~ "stunt.u5",
                                         indicatorID == "LO.TIMSS.SCI8.LOW" | stringr::str_detect(indicatorID, "PISA")  ~ "sci.lowerSec",
                                         stringr::str_detect(indicatorID, "PIAAC.LIT.YOU")  ~ "youth.profiliteracy",
                                         stringr::str_detect(indicatorID, "PIAAC.NUM.YOU")  ~ "youth.profinumeracy",
                                         stringr::str_detect(indicatorID, "LIT.BE|LIT.1")  ~ "adult.profiliteracy",
                                         stringr::str_detect(indicatorID, "NUM.BE|NUM.1")  ~ "adult.profinumeracy",
                                         stringr::str_detect(indicatorID, "LIT.MA.BE|LIT.MA.1")  ~ "adult.profiliteracy.m",
                                         stringr::str_detect(indicatorID, "LIT.FE.BE|LIT.FE.1")  ~ "adult.profiliteracy.f",
                                         stringr::str_detect(indicatorID, "NUM.MA.BE|NUM.MA.1")  ~ "adult.profinumeracy.m",
                                         stringr::str_detect(indicatorID, "NUM.FE.BE|NUM.FE.1")  ~ "adult.profinumeracy.f"),
                  source = dplyr::case_when(indicatorID == "SH.STA.STNT.ZS" ~ "World Bank",
                                            indicatorID == "LO.TIMSS.SCI8.LOW" ~ "TIMSS",
                                            stringr::str_detect(indicatorID, "PISA")  ~ "PISA",
                                            stringr::str_detect(indicatorID, "PIAAC")  ~ "PIAAC"))

  clean2 <- clean1 %>%
    dplyr::filter(stringr::str_detect(indicatorID, "PISA|PIAAC")) %>%
    dplyr::group_by(source, iso2c, ind, date) %>%
    dplyr::summarise(value = 100 - (sum(value)))  %>%
    dplyr::ungroup() %>%
    dplyr::bind_rows(clean1) %>%
    dplyr::filter(indicatorID %in% c("SH.STA.STNT.ZS", "LO.TIMSS.SCI8.LOW", NA)) %>%
    dplyr::select(iso2c, ind, year = date, value, source)

  parity_indices <- list(df = list("clean2","clean2"),
                         col = list("ind",  "ind"),
                         a = list("adult.profiliteracy.f","adult.profinumeracy.f"),
                         b = list("adult.profiliteracy.m", "adult.profinumeracy.m"),
                         varname = list("adult.profiliteracy.gpia", "adult.profinumeracy.gpia")) %>%
    purrr::pmap(parity_adj) %>%
    purrr::reduce(dplyr::bind_rows) %>%
    dplyr::mutate(source = "PIAAC")

  cleaned <- dplyr::bind_rows(clean2, parity_indices) %>%
    dplyr::group_by(iso2c, ind) %>%
    dplyr::filter(year == max(year), !is.na(value)) %>%
    dplyr::mutate(val_status = "A",
           year = as.numeric(year)) %>%
    dplyr::select(iso2c, year, ind, value, val_status, source)

}

#' eurostat_clean
#'
#' \code{eurostat_clean} is a function to clean data from eurostat API queries
#' defined in \code{other}.
#'
#'@family clean function
#'@seealso \code{\link{other}}

eurostat_clean <- function(df) {

  cleaned <- df %>%
    dplyr::mutate(ind = dplyr::case_when(TRAINING == "FE_NFE" ~ "prya.25t64",
                                         INDIC_IS == "I_CCPY"  ~ "yadult.porcentICTskill.copi"),
                  source = "eurostat",
                  val_status = "A",
                  obsTime = as.numeric(obsTime)) %>%
    dplyr::select(iso2c = GEO, year = obsTime, ind, source, value = obsValue, val_status ) %>%
    dplyr::group_by(iso2c, ind) %>%
    dplyr::filter(year == max(year))

}

#' oecd_clean
#'
#' \code{oecd_clean} is a function to clean data from OECD API queries defined
#' in \code{other}.
#'
#' Cleans data from OECD api queries and computes one variable.
#' (sal.rel.2t3,)
#'
#'@family clean
#'@seealso \code{\link{other}}

oecd_clean <- function(df) {

  schol_unspec <- df[[1]] %>%
    dplyr::mutate(ind = dplyr::case_when(AIDTYPE == "E02" ~ "odaflow.imputecost",
                                         AIDTYPE == "E01"  ~ "odaflow.volumescholarship"),
                  RECIPIENT = as.numeric(RECIPIENT)) %>%
    dplyr::filter(RECIPIENT %in% c(89, 57, 189, 289, 298, 380, 389, 489, 498, 589, 619, 679, 689, 789, 798, 889, 9998)) %>%
    dplyr::group_by(ind) %>%
    dplyr::summarise(value = (sum(obsValue))*1000000)

  R.cache::saveCache(schol_unspec, key=list("schol_unspec"), comment="schol_unspec")

  schol <- df[[1]] %>%
    dplyr::mutate(ind = dplyr::case_when(AIDTYPE == "E02" ~ "odaflow.imputecost",
                                         AIDTYPE == "E01"  ~ "odaflow.volumescholarship"),
                  RECIPIENT = as.numeric(RECIPIENT)) %>%
    dplyr::left_join(.gemrtables.pkg.env$regions, by = c("RECIPIENT" = "oecd.crs.recipientcode")) %>%
    dplyr::select(iso2c, year = REFERENCEPERIOD, ind, value = obsValue) %>%
    dplyr::mutate(value = value*1000000)

  sal1 <- df[[2]] %>%
    dplyr::mutate(ind = dplyr::case_when(ISC11 == "L0" ~ "sal.rel.02",
                                         ISC11 == "L1" ~ "sal.rel.1",
                                         ISC11 == "L2_C4" ~ "sal.rel.2",
                                         ISC11 == "L3_C4" ~ "sal.rel.3")) %>%
    dplyr::inner_join(.gemrtables.pkg.env$regions, by = c("COUNTRY" = "iso3c")) %>%
    dplyr::select(iso2c, year = YEAR, ind, value = obsValue)

  sal2 <- df[[3]] %>%
    dplyr::mutate(ind = dplyr::case_when(ISC11_LEVEL == "L2" ~ "teachers.2",
                                         ISC11_LEVEL == "L3" ~ "teachers.3")) %>%
    dplyr::inner_join(.gemrtables.pkg.env$regions, by = c("COUNTRY" = "iso3c")) %>%
    dplyr::select(iso2c, year = obsTime, ind, value = obsValue) %>%
    dplyr::bind_rows(sal1)  %>%
    tidyr::spread(key = ind, value = value) %>%
    dplyr::mutate(value = ((sal.rel.2 * teachers.2) + (sal.rel.3 * teachers.3))/(teachers.2 + teachers.3), ind = "sal.rel.2t3") %>%
    dplyr::filter(!is.na(value)) %>%
    dplyr::select(iso2c, year, ind, value)

  cleaned <- dplyr::bind_rows(schol, sal1, sal2) %>%
    dplyr::mutate(source = "OECD",
                  val_status = ifelse(ind == "sal.rel.2t3", "E", "A"),
                  year = as.numeric(year)) %>%
    dplyr::group_by(iso2c, ind) %>%
    dplyr::filter(year == max(year)) %>%
    dplyr::filter(!is.na(iso2c), !is.na(value))

}

#' un_aids_clean
#'
#' \code{un_aids_clean} is a function to clean data from the UN AIDS spreadsheet
#' on google drive.
#'
#' Information on survey used per observation is removed.
#'
#'@family clean
#'@seealso \code{\link{other}}

un_aids_clean <- function(df) {

  cleaned <- df %>%
    dplyr::mutate(year = paste0(stringi::stri_extract_first(source, regex ="[0-9]{2}"), stringi::stri_extract_last(source, regex ="[0-9]{2}"), sep =""),
                  year = suppressWarnings(ifelse(year == "NANA", NA, as.numeric(year))),
                  survey = stringr::str_replace_all(source, "Source: |[^A-Za-z ]+", ""),
                  source = "UNAIDS",
                  value = as.numeric(value),
                  ind = "hiv.prev.15t24",
                  iso2c = countrycode::countrycode(sourcevar = df$Country, origin = "country.name", destination = "iso2c"),
                  val_status = "A") %>%
    dplyr::filter(!is.na(year)) %>%
    dplyr::select(iso2c, year, ind, value, val_status, source) %>%
    dplyr::group_by(iso2c, ind) %>%
    dplyr::filter(year == max(year))

}

#' gcpea_clean
#'
#' \code{gcpea_clean} is a function to clean data from Global Campaign to
#' protect education from attack (GCPEA) on google drive.
#'
#' 'm' val_status indicates observation comes from a multi-year period and does
#' not reflect the number of attacks on education in a single year period).
#'
#'@family clean
#'@seealso \code{\link{other}}

gcpea_clean <- function(df) {

  cleaned <- df %>%
    tidyr::gather(key = "year", value = "value", - Country) %>%
    dplyr::mutate(year =  stringr::str_replace(year, pattern = "X", replacement = ""),
                  year = as.numeric(year),
                  ind = "edattacks",
                  val_status = ifelse(stringr::str_detect(value, "\\*"), "m", "A"),
                  value = stringr::str_replace(value, pattern = "\\*|-", replacement = ""),
                  value = as.numeric(value),
                  source = "GCPEA",
                  iso2c = countrycode::countrycode(sourcevar = .$Country, origin = "country.name", destination = "iso2c", warn = FALSE)) %>%
    dplyr::select(iso2c, year, ind, value, val_status, source) %>%
    dplyr::filter(!is.na(iso2c), !is.na(value)) %>%
    dplyr::group_by(iso2c, ind) %>%
    dplyr::filter(year == max(year))

}

#' unicef_ecce_clean
#'
#' \code{unicef_ecce_clean} is a function to clean UNICEF ECCE survey data on
#' google drive.
#'
#' Information on survey used per observation is removed.
#'@param df a datadrame in the same format as
#'  \url{https://drive.google.com/uc?export=download&id=1JzOo7rt8O3ZE9eSAL3v1jKexLwbICtM3}
#'
#'@param ind name of the indicator of the `value` column in the csv file
#'  (character).
#'@param source name of the source of the data (i.e. UNICEF)
#'@family clean
#'@seealso \code{\link{other}}


unicef_ecce_clean <- function(df, ind, source) {

  cleaned <- df %>%
    dplyr::mutate(year = paste0(stringi::stri_extract_first(survey, regex ="[0-9]{2}"), stringi::stri_extract_last(survey, regex ="[0-9]{2}"), sep =""),
                  year = suppressWarnings(ifelse(year == "NANA", NA, as.numeric(year))),
                  value = stringr::str_replace(value, "-|\\?", ""), value = suppressWarnings(as.numeric(value)),
                  survey = stringr::str_replace_all(survey, "Source: |[^A-Za-z ]+", ""),
                  source = source,
                  ind = ind,
                  iso2c = suppressWarnings(countrycode::countrycode(sourcevar = df$Country, origin = "country.name", destination = "iso2c")),
                  val_status = "A") %>%
    dplyr::filter(!is.na(year)) %>%
    dplyr::select(iso2c, year, ind, value, val_status, source) %>%
    dplyr::group_by(iso2c, ind) %>%
    dplyr::filter(year == max(year))

}

#' unicef_wash_clean
#'
#' \code{unicef_wash_clean} is a function to clean data from the UNICEF WASH
#' file on google drive.
#'
#'@family clean
#'@seealso \code{\link{other}}


unicef_wash_clean <- function(df) {

  cleaned <- df %>%
    tidyr::gather(key = "ind", value = "value", - Country, - year) %>%
    dplyr::mutate(value = stringr::str_replace(value, "-", ""),
                  value = as.numeric(value),
                  iso2c = countrycode::countrycode(sourcevar = .$Country, origin = "country.name", destination = "iso2c", warn = FALSE),
                  year = as.numeric(year),
                  ind = dplyr::case_when(stringr::str_detect(ind, "drinking") ~ "SchBSP.WPoWat",
                                         stringr::str_detect(ind, "sanitation") ~ "SchBSP.WToilssx",
                                         stringr::str_detect(ind, "handwashing")~ "SchBSP.WHwash"),
                  val_status = "E",
                  source = "UNICEF") %>%
    dplyr::filter(!is.na(iso2c), !is.na(value)) %>%
    dplyr::select(iso2c, year, ind, value, val_status, source)

}

#' bullying_clean
#'
#' \code{bullying_clean} is a function to clean the Innocenti bullying data on
#' google drive.
#'
#' Information on survey used per observation is removed.
#'
#'@family clean
#'@seealso \code{\link{other}}

bullying_clean <- function(df) {

  cleaned <- df %>%
    dplyr::mutate(iso2c = countrycode::countrycode(sourcevar = .$Country, origin = "country.name", destination = "iso2c", warn = FALSE),
                  ind = "stu.bully",
                  value = stu.bully,
                  val_status = "A",
                  source = "Innocenti",
                  iso2c =  dplyr::case_when(Country == "Dominicana" ~ "DO",
                                            Country == "United Arab" ~ "AE",
                                            TRUE ~ iso2c),
                  value =  dplyr::case_when(value == "Low" ~ 1,
                                    value == "Medium" ~ 2,
                                    value == "High" ~ 3)) %>%
    dplyr::filter(!is.na(iso2c), !is.na(value)) %>%
    dplyr::select(iso2c, year, ind, value, val_status, source)
}

#' ict_skills_clean
#'
#' \code{ict_skills_clean} is a function to clean the ITU ICT data on
#' google drive.
#'
#'@family clean
#'@seealso \code{\link{other}}

ict_skills_clean <- function(df) {

  cleaned <- df %>%
    tidyr::gather(key = "ind", value = "value", -Country, -year) %>%
    dplyr::mutate(iso2c = countrycode::countrycode(sourcevar = .$Country, origin = "country.name", destination = "iso2c"),
                  val_status = "A",
                  source = "ITU") %>%
    dplyr::filter(!is.na(iso2c), !is.na(value)) %>%
    dplyr::select(iso2c, year, ind, value, val_status, source)
}

#' chores_clean
#'
#' \code{chores_clean} is a function to clean the child chores data on
#' google drive.
#'
#'@family clean
#'@seealso \code{\link{other}}

  chores_clean <- function(df) {
    cleaned <- df %>%
      dplyr::mutate(chores.28plus.12t14 = total,
                    chores.28plus.12t14.GPIA = ifelse(GPI > 1, 1+(1-(1/GPI)), GPI),
                    iso2c = countrycode::countrycode(sourcevar = .$country, origin = "country.name", destination = "iso2c", warn = T),
                    year = 2016,
                    val_status = "A",
                    source = "UNICEF") %>%
      tidyr::gather(key = "ind", value = "value", -total, - source, - GPI, -iso2c, -val_status, -country, -year, -variable) %>%
      dplyr::select(-country, -total, -GPI, -variable)
  }

#' weights_clean
#'
#' \code{weights_clean} is a function to clean weights data imported from SDMX
#' queries in `weights` function.
#'
#' Several variables are calculated (Y25T65, Y_GE25, Y15T64, L1_GLAST_Q1_F,
#' L1_GLAST_Q1_M, L2_GLAST_Q1_F, L2_GLAST_Q1_M, L3_Q1_F = L3_F, L3_Q1_M = L3_M).
#'@family clean
#'@seealso \code{\link{weights}}

weights_clean <- function(df) {

  uis <- df[[1]] %>%
    unique() %>%
    dplyr::mutate(wt_var = dplyr::case_when(AGE =="SCH_AGE_GROUP" & SEX == "_T" ~ EDU_LEVEL,
                                            AGE =="SCH_AGE_GROUP" & SEX != "_T" ~ paste(EDU_LEVEL, SEX, sep = "_"),
                                            AGE == "TH_ENTRY_GLAST" & SEX == "_T" ~ paste(EDU_LEVEL, "GLAST", sep = "_"),
                                            AGE == "TH_ENTRY_GLAST" & SEX != "_T" ~ paste(EDU_LEVEL, "GLAST", SEX, sep = "_"),
                                            is.na(EDU_LEVEL) | EDU_LEVEL == "_T"  ~ AGE,
                                            STAT_UNIT == "TEACH" ~ paste(STAT_UNIT, EDU_LEVEL, sep = "_"),
                                            GRADE == "GLAST" ~ paste(EDU_LEVEL, STAT_UNIT, GRADE, sep = "_"),
                                            STAT_UNIT == "ILLPOP" ~ paste(AGE, STAT_UNIT, sep = "_"),
                                            STAT_UNIT == "STU" & GRADE == "_T" ~ paste(EDU_LEVEL, STAT_UNIT, sep ="_")),
                 wt_value = dplyr::case_when(STAT_UNIT == "POP" ~ as.numeric(OBS_VALUE) *1000,
                                             OBS_STATUS == "Z" ~ NA_real_,
                                             TRUE ~ as.numeric(OBS_VALUE))) %>%
    dplyr::select(iso2c = REF_AREA, year = TIME_PERIOD, wt_var, wt_value) %>%
    dplyr::filter(!is.na(iso2c), !is.na(wt_value), nchar(iso2c) == 2)

  unpd <- df[[2]] %>%
    dplyr::mutate(REF_AREA = as.numeric(stringi::stri_replace_all_regex(REF_AREA, "\\b0*(\\d+)\\b", "$1"))) %>%
    dplyr::mutate(wt_value = as.numeric(obsValue) * 1000,
                  iso2c = countrycode::countrycode(sourcevar = .$REF_AREA, origin = "iso3n", destination = "iso2c")) %>%
    dplyr::select(iso2c, year = obsTime, wt_var = AGE, wt_value)

  clean1 <- bind_rows(uis, unpd) %>%
    dplyr::mutate(year = as.numeric(year)) %>%
    dplyr::filter(!is.na(iso2c), nchar(iso2c) == 2)

  clean2 <- clean1 %>%
    dplyr::group_by(iso2c, year) %>%
    tidyr::spread(key = wt_var, value = wt_value) %>%
    dplyr::summarise(Y25T64 = `_T` - (Y_LT5 + Y10T14 + Y15T24 + Y_GE65),
                     Y_GE25 = `_T` - (Y_LT5 + Y10T14 + Y15T24),
                     Y15T64 =  `_T` - (Y_LT5 + Y10T14 + Y_GE65),
                     L1_GLAST_Q1_F = L1_GLAST_F * .2,
                     L1_GLAST_Q1_M = L1_GLAST_M * .2,
                     L2_GLAST_Q1_F = L2_GLAST_F * .2,
                     L2_GLAST_Q1_M = L2_GLAST_M * .2,
                     L3_Q1_F = L3_F * .2,
                     L3_Q1_M = L3_M * .2) %>%
    tidyr::gather(key = "wt_var", value = "wt_value", -iso2c, -year)

  clean3 <- dplyr::bind_rows(clean1, clean2) %>%
    dplyr::group_by(iso2c, wt_var) %>%
    dplyr::filter(!all(is.na(wt_value))) %>%
    ungroup
  #   dplyr::filter(!is.na(wt_value)) %>%
  #   dplyr::mutate(wt_value_z = wt_value / mean(wt_value))

  cleaned <-
    clean3 %>%
    dplyr::mutate(year = as.numeric(year)) %>%
    dplyr::filter(dplyr::between(year, 2006, .gemrtables.pkg.env$ref_year+1)) %>%
    tidyr::complete(tidyr::nesting(iso2c, wt_var), year) %>%
    # Approx insists on at least two values to interpolate;
    # since anyhow we want the interpolation to hold the latest value constant,
    # a value two years following the reference year is imputed here to be identical to the latest available.
    {dplyr::bind_rows(
      .,
      stats::na.omit(.) %>%
        dplyr::group_by(iso2c, wt_var) %>%
        dplyr::filter(year == max(year)) %>%
        dplyr::ungroup() %>%
        dplyr::mutate(year = .gemrtables.pkg.env$ref_year + 2)
    )} %>%
    dplyr::group_by(iso2c, wt_var) %>% #na.omit %>% filter(n() == 1)
    dplyr::mutate(wt_value = stats::approx(year, wt_value, xout = year, rule = 2)$y) %>%
    dplyr::filter(year == .gemrtables.pkg.env$ref_year) %>%
    dplyr::ungroup()
}


#' format_wide
#'
#' \code{format_wide} is a function to format stat table data to 'wide' format.
#'
#' Rounds values, converts binary values to 'Yes/No'; converts value to unicode
#' with subscript flags; converts to list of dataframes for export to xlsx.
#'@family clean

format_wide <- function(df) {

  redenominate_6 <- c("IllPop.Ag15t24", "IllPop.Ag15t99", "OFST.1.cp", "OFST.2.cp", "OFST.3.cp", "SAP.02", "SAP.1", "SAP.2t3",
                      "SAP.5t8", "stu.per.02", "stu.per.1", "stu.per.2t3", "stu.per.5t8",
                      "odaflow.volumescholarship", "odaflow.imputecost")

  redenominate_3 <- c("IE.5t8.40510", "teach.per.02", "OE.5t8.40510", "teach.per.1", "teach.per.2t3")

  wide_data <-
    df %>%
    dplyr::mutate(value = dplyr::case_when(ind %in% redenominate_6 ~ value/1000000,
                                           ind %in% redenominate_3 ~ value/1000,
                                           TRUE ~ value)) %>%
    dplyr::group_by(ind) %>%
    dplyr::mutate(
      digits = case_when(
        max(value, na.rm = TRUE) < 2 | stringr::str_detect(ind, 'sal.rel') ~ 2,
        # (ind %in% redenominate_6 | ind %in% redenominate_3) & value >= 1000000 ~ 1,
        # (ind %in% redenominate_6 | ind %in% redenominate_3) & value < 1000000 ~ 3,
        value < 0.5 | stringr::str_detect(ind, "XGDP|XGovExp") ~ 1,
        TRUE ~ 0)) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      value_str = format(round(value, digits),
                     big.mark = ',', trim = TRUE,
                     zero.print = '-',
                     nsmall = digits,
                     drop0trailing = FALSE,
                     scientific = FALSE,
                     digits = digits
                     )) %>%
    dplyr::mutate(
      value_str = ifelse(is.na(value) | entity != "country" | !stringr::str_detect(ind, "bully|esd|attack|admi"), value_str,
        ifelse(stringr::str_detect(ind, "bully"), c('Low', 'Medium', 'High')[value],
        ifelse(stringr::str_detect(ind, "esd"), c('None', 'Low', 'Medium', 'High')[value + 1],
        ifelse(stringr::str_detect(ind, "attack"), c('None', 'Sporadic', 'Affected', 'Heavy', 'Very heavy')[value + 1],
        ifelse(stringr::str_detect(ind, "admi"), c('No', 'Yes')[value + 1], value_str)
      ))))) %>%
    # dplyr::select(entity, value, val_status, ind, value_str) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(
                  val_status = ifelse(val_status == "A", "", tolower(val_status)),
                  year_diff = year - .gemrtables.pkg.env$ref_year,
                  year_diff = ifelse(year_diff == 0, "", year_diff),
                  val_status_utf = dplyr::case_when(val_status == "e" ~ "\u1d62",
                                                    # val_status == "m" ~ "\u2099",
                                                    TRUE ~ ''),
                  year_diff_utf = dplyr::case_when(year_diff  ==  1 ~ "\u208A\u2081",
                                                    year_diff == -1 ~ "\u208B\u2081",
                                                    year_diff == -2 ~ "\u208B\u2082",
                                                    year_diff == -3 ~ "\u208B\u2083",
                                                    year_diff == -4 ~ "\u208B\u2084",
                                                   TRUE ~ ''),
                  val_utf = ifelse(value_str == 'NA' | is.na(value_str),
                                    "\u2026",
                                    paste0(stringr::str_trim(value_str), ifelse(stringr::str_detect(ind, "admi|esd|odaflow|Entry\\.age|Years\\.dur"), "", year_diff_utf), val_status_utf, sep = ""))) %>%
    dplyr::select(sheet, annex_name, !!.gemrtables.pkg.env$region, ind, val_utf, entity) %>%
    dplyr::mutate(ind = factor(ind, levels = unique(ind)),
                  is_aggregate = ifelse(entity == "country", "country", "aggregate"),
                  sheet = paste("sheet", sheet),
                  val_utf = stringr::str_replace_all(val_utf, "NA", ""),
                  regionx = !!.gemrtables.pkg.env$region,
                  regionx = ifelse(is_aggregate == "aggregate", annex_name, regionx)) %>%
    dplyr::left_join(.gemrtables.pkg.env$regions2[, c(1,4)], by = c("regionx" = "annex_name")) %>%
    split(list(.$is_aggregate, .$sheet)) %>%
    purrr::map(tidyr::spread, key=ind, value = val_utf) %>%
    purrr::map(function(.) dplyr::mutate(., annex_name = ifelse(entity == "subregion" | entity == "income_subgroup", paste("  ", annex_name), annex_name))) %>%
    purrr::map(function(.) dplyr::mutate(., entity = dplyr::case_when(entity == "subregion" ~ "region",
                                                                      entity == "income_subgroup" ~ "income_group",
                                                                      TRUE ~ entity))) %>%
    purrr::map(function(.) dplyr::arrange(., region_order, annex_name)) %>%
    purrr::map(mutate_all, as.character) %>%
    purrr::map_if(suppressWarnings(stringr::str_detect(., "country")), function(.) dplyr::left_join(., .gemrtables.pkg.env$regions[, c("annex_name", "iso3c")], by = "annex_name")) %>%
    purrr::map(function(.) data.table::setDT(.)[.[, c(.I, NA), entity]$V1][!.N]) %>%
    purrr::map(function(.) data.table::setDT(.)[.[, c(.I, NA), eval(.gemrtables.pkg.env$region)]$V1][!.N]) %>%
    purrr::map(function(.) dplyr::select(., -sheet, -is_aggregate, -entity, -!!.gemrtables.pkg.env$region, - regionx, -region_order))
}
northeastloon/gemrtables documentation built on May 4, 2019, 3:09 a.m.