R/index.R

#' DHS data is delivered in .zip files and the names of these zip files
#' usually (but not always) encode the survey code (country, round, and
#' release).  These scheme means that at the listing stage the contents
#' can not be easily renamed althought _we_ assume case-ignorant naming
#' and later convert all names to lower case.  
#'
#' @param f paths to .zip files from DHS.
#' @return Here for each .zip file we calculate a list of contained 
#' files and the survey code encoded in the names of these nested files.  
#' @export
list_files <- function(f) {
  all_files <- lapply(f, unzip, list=TRUE) %>% 
    lapply(function(x) x[['Name']] %>% list())
  names(all_files) <- f
  all_files <- lapply(all_files, function(x) list(
    putative_survey = x[[1]] %>% basename %>% tolower %>% substr(1,6),
    original_file = x[[1]])
  )
  return(all_files)
}

#' While DHS data _usually_ encodes the survey code (country, round, and
#' release) in the file names, sometimes these are not correct.  It is 
#' possible to run a straightforward check by looking in the contained
#' Stata files and generating an independent label from the 'v000' column
#' which concatenates the country code and the DHS round (not release) 
#' code.  This function takes a list produced by `list_files` and 
#' inserts the internal country and round codes.
#'
#' @param l output of `list_files` function.
#' @return same as input with (where possible) inserted `internal_country_code`
#'         and `internal_round_code` elements.
#' @export
insert_internal_codes <- function(file_index) {
  scratch = tempdir()
  for (zip_file in names(file_index)) {
    file_index[[zip_file]][['internal_country_code']] <- list()
    file_index[[zip_file]][['internal_round_code']] <- list()
    file_index[[zip_file]][['bad_dta_file']] <- vector(mode='character', length=0)
    internal_files = file_index[[zip_file]][['original_file']]
    dta_files = internal_files[internal_files %>% sapply(has_extension, e='dta')]
    unzip(zipfile = zip_file, files = dta_files, exdir = scratch)
    for (f in dta_files) {
      dta_f = try(readstata13::read.dta13(file = file.path(scratch, f)))
      if (isTRUE(length(dta_f) == 1) && class(dta_f) == 'try-error') {
        file_index[[zip_file]][['bad_dta_file']] <- c(
          file_index[[zip_file]][['bad_dta_file']], dta_f)
        next
      }
      if (!is.null(dta_f[['v000']])) {
        code = unique(dta_f[['v000']])
        if (length(code) != 1) 
          stop("Multiple survey codes in one survey file.")
        file_index[[zip_file]][['internal_country_code']] = c(
          file_index[[zip_file]][['internal_country_code']], substr(code, 1, 2))
        file_index[[zip_file]][['internal_round_code']] = c(
          file_index[[zip_file]][['internal_round_code']], substr(code, 3, 3))
      }
      rm(dta_f); gc()
    }
    junk <- dir(path = scratch, full.names=TRUE)
    file.remove(junk, recursive=TRUE)
  }
  return(file_index)
}
  

#' Extract .dta files from DHS .zip files and insert data on
#' extracted .dta files into the appropriate file_index elements.
#' Files are copied to the output path all lower-case as is
#' our convention.
#'
#' @param file_index holding info on zip files and contents
#'        generated by, at least, `list_files`.
#' @param output_path where extracted .dta files are written.
#' @return file_index updated with an 'extracted_file` element
#'         for each entry that contained a .dta file.
#' @export
extract_dta_files <- function(file_index, output_path) {
  scratch = tempdir()
    zip_file[['extracted_file']] <- list()
  for (zip_file in names(file_index)) {
    internal_files = file_index[[zip_file]][['original_file']]
    dta_files = internal_files[internal_files %>% sapply(has_extension, e='dta')]
    unzip(zipfile = zip_file, files = dta_files, exdir = scratch)
    for (f in dta_files) file.copy(
      from = file.path(scratch, f), 
      to = file.path(output_path, tolower(f))
    )
    zip_file[['extracted_file']] <- c(zip_file[['extracted_file']], tolower(dta_files))
  }
  return(file_index)
}  
sakrejda/pdhs documentation built on May 28, 2019, 9:51 a.m.