#' DHS data is delivered in .zip files and the names of these zip files
#' usually (but not always) encode the survey code (country, round, and
#' release). These scheme means that at the listing stage the contents
#' can not be easily renamed althought _we_ assume case-ignorant naming
#' and later convert all names to lower case.
#'
#' @param f paths to .zip files from DHS.
#' @return Here for each .zip file we calculate a list of contained
#' files and the survey code encoded in the names of these nested files.
#' @export
list_files <- function(f) {
all_files <- lapply(f, unzip, list=TRUE) %>%
lapply(function(x) x[['Name']] %>% list())
names(all_files) <- f
all_files <- lapply(all_files, function(x) list(
putative_survey = x[[1]] %>% basename %>% tolower %>% substr(1,6),
original_file = x[[1]])
)
return(all_files)
}
#' While DHS data _usually_ encodes the survey code (country, round, and
#' release) in the file names, sometimes these are not correct. It is
#' possible to run a straightforward check by looking in the contained
#' Stata files and generating an independent label from the 'v000' column
#' which concatenates the country code and the DHS round (not release)
#' code. This function takes a list produced by `list_files` and
#' inserts the internal country and round codes.
#'
#' @param l output of `list_files` function.
#' @return same as input with (where possible) inserted `internal_country_code`
#' and `internal_round_code` elements.
#' @export
insert_internal_codes <- function(file_index) {
scratch = tempdir()
for (zip_file in names(file_index)) {
file_index[[zip_file]][['internal_country_code']] <- list()
file_index[[zip_file]][['internal_round_code']] <- list()
file_index[[zip_file]][['bad_dta_file']] <- vector(mode='character', length=0)
internal_files = file_index[[zip_file]][['original_file']]
dta_files = internal_files[internal_files %>% sapply(has_extension, e='dta')]
unzip(zipfile = zip_file, files = dta_files, exdir = scratch)
for (f in dta_files) {
dta_f = try(readstata13::read.dta13(file = file.path(scratch, f)))
if (isTRUE(length(dta_f) == 1) && class(dta_f) == 'try-error') {
file_index[[zip_file]][['bad_dta_file']] <- c(
file_index[[zip_file]][['bad_dta_file']], dta_f)
next
}
if (!is.null(dta_f[['v000']])) {
code = unique(dta_f[['v000']])
if (length(code) != 1)
stop("Multiple survey codes in one survey file.")
file_index[[zip_file]][['internal_country_code']] = c(
file_index[[zip_file]][['internal_country_code']], substr(code, 1, 2))
file_index[[zip_file]][['internal_round_code']] = c(
file_index[[zip_file]][['internal_round_code']], substr(code, 3, 3))
}
rm(dta_f); gc()
}
junk <- dir(path = scratch, full.names=TRUE)
file.remove(junk, recursive=TRUE)
}
return(file_index)
}
#' Extract .dta files from DHS .zip files and insert data on
#' extracted .dta files into the appropriate file_index elements.
#' Files are copied to the output path all lower-case as is
#' our convention.
#'
#' @param file_index holding info on zip files and contents
#' generated by, at least, `list_files`.
#' @param output_path where extracted .dta files are written.
#' @return file_index updated with an 'extracted_file` element
#' for each entry that contained a .dta file.
#' @export
extract_dta_files <- function(file_index, output_path) {
scratch = tempdir()
zip_file[['extracted_file']] <- list()
for (zip_file in names(file_index)) {
internal_files = file_index[[zip_file]][['original_file']]
dta_files = internal_files[internal_files %>% sapply(has_extension, e='dta')]
unzip(zipfile = zip_file, files = dta_files, exdir = scratch)
for (f in dta_files) file.copy(
from = file.path(scratch, f),
to = file.path(output_path, tolower(f))
)
zip_file[['extracted_file']] <- c(zip_file[['extracted_file']], tolower(dta_files))
}
return(file_index)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.