R/mics_survey_extract.R

Defines functions make_asfr_inputs join_survey_areas transform_mics filter_mics create_surveys_mics

Documented in create_surveys_mics filter_mics join_survey_areas make_asfr_inputs transform_mics

#' Download MICS surveys
#' @param iso3 ISO3 code
#' @export

create_surveys_mics <- function(iso3, mics_indicators) {

  # sharepoint <- spud::sharepoint$new(Sys.getenv("SHAREPOINT_URL"))
  # folder <- sharepoint$folder(site = Sys.getenv("SHAREPOINT_SITE"), path = Sys.getenv("MICS_ORDERLY_PATH"))
  # 
  # mics_file_names <- folder$list() %>%
  #   dplyr::filter(stringr::str_detect(name, tolower(iso3))) %>%
  #   .$name %>%
  #   sort()
  #  
  # mics_survey_names <- toupper(stringr::str_replace(mics_file_names, ".rds", ""))
  # 
  #   mics_file_names <- lapply(tolower(unique(dplyr::filter(mics_indicators, survey_id %in% mics_survey_names)$survey_id)),
  #          grep, x = mics_file_names, value = TRUE
  #   ) %>%
  #     unlist() %>%
  #     sort()
  
  path <- "~/Imperial College London/HIV Inference Group - WP - Documents/Data/household surveys/MICS/datasets/archive/mics_rds/20220112-013554-d0ff0834/mics_datasets_rds"
  files <- grep(tolower(iso3), list.files(path, full.names = T), value = T)
  mics_survey_names <- toupper(str_sub(files, 172, -5))
  names(files) <- mics_survey_names
  mics_survey_names <- mics_survey_names[mics_survey_names %in% unique(mics_indicators$survey_id)]
  files <- files[mics_survey_names]
  
  rename_datasets_key <- mics_indicators %>%
    dplyr::filter(survey_id %in% mics_survey_names) %>%
    dplyr::filter(label == "dataset name") %>%
    dplyr::arrange(survey_id, filetype) %>%
    dplyr::group_by(survey_id) %>%
    dplyr::group_split()

  # paths <- file.path("sites", Sys.getenv("SHAREPOINT_SITE"), Sys.getenv("MICS_ORDERLY_PATH"), mics_file_names)
  # files <- lapply(paths, spud::sharepoint_download, sharepoint_url = Sys.getenv("SHAREPOINT_URL"))

  mics_dat <- lapply(files, readRDS)

  extract_rename_datasets <- function(mics_dat, rename_datasets_key) {

    mics_dat <- mics_dat[rename_datasets_key$value]

    names(mics_dat) <- rename_datasets_key$id

    return(mics_dat)

  }

  mics_dat <- Map(extract_rename_datasets, mics_dat, rename_datasets_key)

  names(mics_dat) <- mics_indicators %>%
    dplyr::filter(survey_id %in% mics_survey_names) %>%
    dplyr::distinct(survey_id) %>%
    .$survey_id %>%
    sort()

  return(mics_dat)

}

#' Filter MICS datasets
#' @description Filter MICS household, women, and birth history datasets to key variables, and rename to ensure consistent column names between surveys
#' @export

filter_mics <- function(dat, mics_indicators, survey_id_i) {

  indicators <- dplyr::filter(mics_indicators,
                       survey_id == survey_id_i,
                       label != "dataset name"
  )

  wm <- dat$wm
  colnames(wm) <- tolower(colnames(wm))
  wm <- wm %>%
    dplyr::select(dplyr::filter(indicators, filetype == "wm")$value)
  colnames(wm) <- dplyr::filter(indicators, filetype == "wm")$id

  bh <- dat$bh
  colnames(bh) <- tolower(colnames(bh))
  bh <- bh %>%
    dplyr::select(dplyr::filter(indicators, filetype == "bh")$value)
  colnames(bh) <- dplyr::filter(indicators, filetype == "bh")$id


  hh <- dat$hh
  colnames(hh) <- tolower(colnames(hh))
  hh <- hh %>%
    dplyr::select(dplyr::filter(indicators, filetype == "hh")$value)
  colnames(hh) <- dplyr::filter(indicators, filetype == "hh")$id

  df <- list()
  df$wm <- wm %>%
    dplyr::mutate(survey_id = survey_id_i) %>%
    dplyr::filter(!is.na(wdob), !is.na(cluster), !is.na(hh_number), !is.na(line_number), !is.na(doi)) %>%
    dplyr::arrange(cluster, hh_number, line_number) %>%
    dplyr::group_by(cluster, hh_number, line_number) %>%
    dplyr::mutate(unique_id = dplyr::cur_group_id())

  df$bh <- bh %>%
    dplyr::mutate(survey_id = survey_id_i)

  df$hh <- hh %>%
    dplyr::mutate(survey_id = survey_id_i,
           area_level = as.numeric(dplyr::filter(indicators, id == "mics_area_level")$value)
    )

  return(df)


}

#' Transform MICS dataframes
#' @description Convert lists by survey to lists by dataset type
#' @export
transform_mics <- function(mics_survey_data, mics_indicators) {

  mics_dat <- Map(filter_mics, mics_survey_data, list(mics_indicators), names(mics_survey_data))

  wm <- mics_dat %>%
    lapply("[[", "wm") %>%
    dplyr::bind_rows(.id = "survey_id")

  hh <- mics_dat %>%
    lapply("[[", "hh") %>%
    lapply(function(x) {
      x %>%
        haven::zap_labels() %>%
        dplyr::left_join(data.frame(mics_area_name = attr(x$mics_area_name, "labels"),
                             mics_area_name_label = stringr::str_to_title(
                               names(attr(x$mics_area_name, "labels")))
                             )
                  ) %>%
        dplyr::select(-mics_area_name) %>%
        dplyr::mutate(mics_area_name_label = stringr::str_remove_all(mics_area_name_label, "\\u0093|\\u0094"),
               mics_area_name_label = stringr::str_trim(mics_area_name_label))

    }) %>%
    dplyr::bind_rows(.id = "survey_id")


  bh <- mics_dat %>%
    lapply("[[", "bh") %>%
    dplyr::bind_rows(.id = "survey_id")

  df <- list()
  df$wm <- wm
  df$hh <- hh
  df$bh <- bh

  return(df)
}

#' Join household datasets with area hierarchy
#' @export

join_survey_areas <- function(fertility_mics_data, areas, warn=FALSE) {

  errfun <- if (warn)
    warning
  else stop

  areas <- sf::st_drop_geometry(areas)

  dat <- fertility_mics_data$hh

  lvl <- dat %>%
    dplyr::distinct(survey_id, area_level)

  area_survey_level <- areas %>%
    dplyr::left_join(lvl)

  dat_merge <- dat %>%
    dplyr::full_join(area_survey_level %>%
                dplyr::select(survey_id, area_id, area_name, area_level) %>%
                dplyr::filter(!is.na(survey_id)),
              by = c("mics_area_name_label" = "area_name", "area_level", "survey_id")
    )


  if (any(is.na(dat_merge$area_id))) {

    missing_areas <- dat_merge %>%
      dplyr::filter(is.na(area_id)) %>%
      dplyr::select(survey_id, mics_area_name_label, area_id) %>%
      dplyr::distinct() %>%
      dplyr::rename(mics_area_name = mics_area_name_label)

    errfun("\n\nSurvey regions were not matched to areas:\n",
         paste0(utils::capture.output(missing_areas), collapse = "\n"),
         "\n\nThis must be corrected \n \n"
    )
  }

  if(any(is.na(dat_merge$cluster))) {

    missing_areas <- dat_merge %>%
      dplyr::filter(is.na(cluster)) %>%
      dplyr::select(survey_id, mics_area_name_label, area_id) %>%
      dplyr::distinct() %>%
      dplyr::rename(area_name = mics_area_name_label)

    warning("\n\nAreas were not found in MICS survey:\n",
            paste0(utils::capture.output(missing_areas), collapse = "\n"),
            "\n\nThis may be because the survey did not sample these regions"
          )

  }

  fertility_mics_data$hh <- dat %>%
    dplyr::left_join(area_survey_level %>%
                dplyr::select(survey_id, area_id, area_name, area_level),
              by = c("mics_area_name_label" = "area_name", "area_level", "survey_id")
    )

  fertility_mics_data

}

#' Transform survey datasets into inputs for calc_asfr
#' @export
make_asfr_inputs <- function(mics_survey_areas, mics_survey_data) {

  dat <- mics_survey_areas
  df <- list()

  df$wm <- dat$wm %>%
    dplyr::left_join(dat$hh %>% 
                       ungroup() %>% 
                       dplyr::select(survey_id, cluster, area_id) %>%
                       distinct())

  df$births_to_women <- dat$wm %>%
    dplyr::select(survey_id, cluster, hh_number, line_number, unique_id) %>%
    dplyr::left_join(dat$bh %>% dplyr::select(survey_id, cluster, hh_number, line_number, cdob)) %>%
    dplyr::select(survey_id, unique_id, cdob) %>%
    dplyr::filter(!is.na(cdob))

  df
}
osymandius/dfertility documentation built on Jan. 28, 2024, 4:18 a.m.