#' Create a data frame of datasets that your log in can download
#'
#' DHS datasets that can be downloaded
#' @param config Object of class `rdhs_config` as produced by `read_rdhs_config`
#' that must contain a valid `email`, `project` and `password`.
#' @param datasets_api_results Data.table for the api results for the datasets
#' endpoint. Default = NULL and
#' generated by default if not declared.
#' @param surveys_api_results Data.table for the api results for the surveys
#' endpoint. Default = NULL and
#' generated by default if not declared.
#'
#' @note Inspiration for function to
#' \url{https://github.com/ajdamico/lodown/blob/master/R/dhs.R}
#'
#' @return Returns \code{"data.frame"} of length 14:
#' \itemize{
#' \item{"FileFormat"}
#' \item{"FileSize"}
#' \item{"DatasetType"}
#' \item{"SurveyNum"}
#' \item{"SurveyId"}
#' \item{"FileType"}
#' \item{"FileDateLastModified"}
#' \item{"SurveyYearLabel"}
#' \item{"SurveyType"}
#' \item{"SurveyYear"}
#' \item{"DHS_CountryCode"}
#' \item{"FileName"}
#' \item{"CountryName"}
#' \item{"URLS"}
#' }
#'
available_datasets <- function(config,
datasets_api_results = NULL,
surveys_api_results = NULL) {
# fetch all the datasets meta from the api if to already passed
if (is.null(datasets_api_results) || is.null(surveys_api_results)) {
datasets_api_results <- dhs_datasets()
surveys_api_results <- dhs_surveys()
}
# set up temp file for unpacking bins
tf <- tempfile(fileext = ".txt")
values <- authenticate_dhs(config)
# grab project number here
project_number <- values$proj_id
# re-access the download-datasets page
z <- httr::POST(
"https://dhsprogram.com/data/dataset_admin/download-datasets.cfm",
body = list(proj_id = project_number)
)
# write the information from the `countries` page to a local file
writeBin(z$content, tf)
# load the text
y <- brio::read_lines(tf)
# DHS Website has changed and the POST request data requires two steps here
# Create post request for the download manager
values <- list(
Proj_ID = project_number,
action = "getdatasets"
)
# Head to download page
z <- httr::POST("https://dhsprogram.com/data/dataset_admin/index.cfm",
body = values)
# Create post request for the download manager
values <- list(
Proj_ID = project_number,
action = "downloadmanager"
)
# Head to download page
z <- httr::POST("https://dhsprogram.com/data/dataset_admin/index.cfm",
body = values)
# Grab the content from that and start creation for last post request
writeBin(z$content, tf)
# load the text
y <- brio::read_lines(tf)
# Donqwload manager post creation
ctrycodelist_lines <- grep("name=\"ctrycodelist\" value=", y, value = TRUE)
ctrycodelist <- qdapRegex::rm_between(
ctrycodelist_lines, 'value=\"', '\"', extract = TRUE
)
names(ctrycodelist) <- rep("ctrycodelist", length(ctrycodelist))
class(ctrycodelist) <- "list"
filedatatypelist_DHS_line <- grep("name=\"filedatatypelist_", y, value = TRUE)
filedatatypelist_DHS <- qdapRegex::rm_between(
filedatatypelist_DHS_line, 'value=\"', '\"', extract = TRUE
)
names(filedatatypelist_DHS) <- paste0(
"filedatatypelist_",
qdapRegex::rm_between(filedatatypelist_DHS_line,
"filedatatypelist_", "\" value", extract = TRUE)
)
class(filedatatypelist_DHS) <- "list"
formatlist <- grep("fformatlist", y, value = TRUE)
formatlist <- qdapRegex::rm_between(
formatlist, 'value=\"', '\"', extract = TRUE
)
names(formatlist) <- rep("fformatlist", length(formatlist))
class(formatlist) <- "list"
values <- list(
surveymode = "all",
Proj_ID = project_number,
action = "downloadmanager",
subaction = "Build URL File List",
sub = "submit",
submit = "Build URL File List",
FileDataTypeCode = "",
ctrycode = ""
)
values <- append(values,
values = c(ctrycodelist, filedatatypelist_DHS, formatlist))
# submit request for all the possible datasets
message("Creating Download url list from DHS website...")
z <- httr::POST(
"https://dhsprogram.com/data/dataset_admin/index.cfm",
body = values
)
link.urls <- xml2::xml_find_all(httr::content(z), "//a")
# pull all links download and read in
url_link <- paste0("https://dhsprogram.com", grep(
pattern = "/data/download/urlslist",
xml2::xml_attr(link.urls, "href"), value = TRUE
))
httr::GET(
url_link, httr::user_agent("https://github.com/ropensci/rdhs"),
destfile = tf, httr::write_disk(tf, overwrite = TRUE)
)
urls <- brio::read_lines(tf)
urls <- urls[-which(!nzchar(urls))]
# start filling in the end result data frame of all available datasets
res <- matrix(data = "", nrow = length(urls),
ncol = dim(datasets_api_results)[2] + 1)
colnames(res) <- c(names(datasets_api_results), "URLS")
res <- as.data.frame(res, stringsAsFactors = FALSE)
res$URLS <- urls
res$FileName <- qdapRegex::rm_between(urls, "Filename=", "&Tp",
extract = TRUE) %>% unlist()
res$DHS_CountryCode <- qdapRegex::rm_between(urls, "Ctry_Code=", "&surv_id",
extract = TRUE) %>% unlist()
# match meta using filenames and countrycodes
# (India has subnational datasets that clash)
fileName_matches <- match(
paste0(toupper(res$FileName),
toupper(res$DHS_CountryCode)),
paste0(toupper(datasets_api_results$FileName),
toupper(datasets_api_results$DHS_CountryCode))
)
res_matches <- which(!is.na(fileName_matches))
# remove any missing matches (shouldn't happen if API up to date)
if (sum(is.na(fileName_matches)) > 0) {
message("\nSome of your available datasets are not found in the DHS API. \n",
"This is likely due to the DHS API being out of date and as such \n",
"some of the meta information about your available datasets \n",
"may not be available.")
fileName_matches <- fileName_matches[-which(is.na(fileName_matches))]
}
s <- seq_len(length(datasets_api_results))
res[res_matches, s] <- datasets_api_results[fileName_matches, ]
return(res)
}
#' Create a data frame of datasets that your log in can download
#'
#' Download datasets specified using output of \code{available_datasets}.
#' @param config Object of class `rdhs_config` as produced by `read_rdhs_config`
#' that must contain a valid `email`, `project` and `password`.
#' @param desired_dataset Row from \code{available_datasets}
#' @param download_option Character dictating how the survey is stored when
#' downloaded. Must be one of:
#' \itemize{
#' \item{"zip"} - Just the zip. "z", "i", "p" or "zip" will match
#' \item{"rds"} - Just the read in and saved rds. "r", "d", "s" or "rdhs"
#' will match
#' \item{"both"} - Both the rds and extract. "b", "o", "t", "h" or "both"
#' will match
#' }
#' @param reformat Boolean detailing whether dataset rds should be
#' reformatted for ease of use later. Default = TRUE
#' @param all_lower Logical indicating whether all value labels should be
#' lower case. Default to `TRUE`.
#' @param output_dir_root Directory where files are to be downloaded to
#' @param ... Any other arguments to be passed to
#' \code{\link{read_dhs_dataset}}
#'
download_datasets <- function(config,
desired_dataset,
download_option = "both",
reformat=TRUE,
all_lower=TRUE,
output_dir_root=NULL,
...) {
# possible download options:
download_possibilities <- c("zip", "rds", "both")
download_option <- grep(
paste0(strsplit(download_option, "") %>% unlist(), collapse = "|"),
download_possibilities)
if (!is.element(download_option, 1:3)) {
stop("Download option specified not one of zip,rds,both")
}
# handle output dir
dataset_dir <- file.path(output_dir_root)
if (reformat) {
dataset_dir <- paste0(dataset_dir, "_reformatted")
}
# make sure the folder exists and create the zip path
dir.create(dataset_dir, showWarnings = FALSE, recursive = TRUE)
zip_path <- file.path(dataset_dir, desired_dataset$FileName)
# download our zip and parse the response for any errors
message("Downloading: \n", paste0(desired_dataset$CountryName, " ",
desired_dataset$SurveyYear, " ",
desired_dataset$SurveyType, " ",
desired_dataset$FileType, " ",
desired_dataset$FileFormat, " ",
"[", desired_dataset$FileName, "]",
collapse = ", "))
# set up temp file for unpacking bins
# annoyingly we have to do this because some zips have
# been zipped with the same name several times
# so they can not be unzipped to the same directory.
# Thus we bounce unzips between these two dirs.
tf <- tempfile()
tdir <- tempfile()
on.exit(unlink(c(tf, tdir), recursive = TRUE, force = TRUE))
# create a simple while loop on file size check and carry this
# out three times befoe stopping
file_size_check <- TRUE
attempts <- 3
# if the downloaded file is not the size we expect
# then re log in the first time
while (file_size_check & attempts > 0) {
# download zip to our tempfile
if (Sys.getenv("rdhs_LOUD_DOWNLOAD") == TRUE) {
resp <- httr::GET(desired_dataset$URLS[1],
destfile = tf,
httr::user_agent("https://github.com/ropensci/rdhs"),
httr::write_disk(tf, overwrite = TRUE),
httr::progress()
) %>% handle_api_response(to_json = FALSE)
} else {
resp <- httr::GET(desired_dataset$URLS[1],
destfile = tf,
httr::user_agent("https://github.com/ropensci/rdhs"),
httr::write_disk(tf, overwrite = TRUE)
) %>% handle_api_response(to_json = FALSE)
}
# if it's not the right size and first time we've tried then log in
if (file.size(tf) != desired_dataset$FileSize[1] & attempts == 3) {
# do updated authentication procedure
auth_downloads(config)
} else if (file.size(tf) == desired_dataset$FileSize[1]) {
file_size_check <- FALSE
attempts <- 0
}
attempts <- attempts - 1
}
## If the zip contains a nested zip file of the same name as the desired file,
## unzip and replace the file with the nested zip.
nest_zf <- grep(desired_dataset$FileName,
unzip(tf, list = TRUE)$Name,
ignore.case = TRUE,
value = TRUE)
while (length(nest_zf)){
tf <- unzip(tf, nest_zf[1], exdir = tdir)
nest_zf <- grep(desired_dataset$FileName,
unzip(tf, list = TRUE)$Name,
ignore.case = TRUE,
value = TRUE)
}
## DOWNLOAD OPTIONS HANDLING:
# 1. Just the zip - we'll always do this and then if it's 2 remove it later
# if it's just the zip then we copy it to the
# directory return the file path for this
res <- file.copy(tf, to = zip_path, overwrite = TRUE)
res <- if (res) {
zip_path
} else {
stop("Failed to download zip to where client root is")
}
# 2/3. rds or both
if (download_option >= 2) {
# now read the dataset in with the requested reformat options
res <- read_dhs_dataset(zip_path,
dataset = desired_dataset,
reformat, all_lower, ...
)
# handle results. If it's character it's because we
# haven't yet got a parser we are happy with
if (!is.character(res)) {
# let's assign the file name attribute to the res
attr(res$dataset, which = "filename") <- desired_dataset$file
# set up the rds_path to save the dataset
rds_path <- file.path(dataset_dir, paste0(desired_dataset$file, ".rds"))
# save the dataset out
saveRDS(res$dataset, rds_path)
# if the class of the object is from a geo or geo_covariates file then we
# will just return the rds path
if (inherits(res$dataset, "sf") ||
desired_dataset$FileType == "Geographic Data" ||
desired_dataset$FileType == "Geospatial Covariates") {
res <- rds_path
} else {
# if its a dataset then we return the path and the
# code_descriptions as these are useful to have cached
res$dataset <- rds_path
}
}
# 3. If not both then delete the zip
if (download_option != 3) {
file.remove(zip_path)
}
}
message("Dataset download finished")
return(res)
}
#' Authenticate Users for DHS website
#'
#' @title DHS Website Authentication
#' @param config Object of class `rdhs_config` as produced by `read_rdhs_config`
#' that must contain a valid `email`, `project` and `password`.
#'
#' @details If the user has more than one project that contains the first
#' 30 characters of the provided project they will be prompted to choose
#' which project they want. This choice will be saved so they do
#' not have to enter it again in this R session.
#'
#' @note Credit for some of the function to
#' \url{https://github.com/ajdamico/lodown/blob/master/R/dhs.R}
#'
#' @return Returns list of length 3:
#' \itemize{
#' \item user_name: your email usually
#' \item user_pass: your password you provided
#' \item proj_id: your project number
#' }
#'
#'
#'
authenticate_dhs <- function(config) {
your_email <- config$email
your_project <- config$project
your_password <- config$password
# Argument Checking
if (!is.character(your_email)) stop("your_email is not a string")
if (!is.character(your_project)) stop("your_project is not a string")
if (!is.character(your_password)) stop("your_password is not a string")
# authentication page
terms <- "https://dhsprogram.com/data/dataset_admin/login_main.cfm"
# create a temporary file
tf <- tempfile(fileext = ".txt")
# set the username and password
values <- list(
UserName = your_email,
UserPass = your_password,
Submitted = 1,
UserType = 2
)
# log in.
message("Logging into DHS website...")
z <- httr::POST(terms, body = values) %>% handle_api_response(to_json = FALSE)
# extract the available countries from the projects page
# write the information from the `projects` page to a local file
writeBin(z$content, tf)
# load the text
y <- brio::read_lines(tf)
# figure out the project number - only use first 30 chars due to ellipsis
# formation if it is longer than 30
if (nchar(your_project) > 30) {
project_lines <- unique(
y[grepl("option value", y) &
grepl(paste0(strsplit(your_project, "")[[1]][1:30], collapse = ""),
y, fixed = TRUE)
])
} else {
project_lines <- unique(
y[grepl("option value", y) &
grepl(paste0(strsplit(your_project, "")[[1]], collapse = ""),
y, fixed = TRUE)
])
}
# confirm only one project and handle if more than
if (length(project_lines) == 1) {
} else {
if (length(project_lines) > 1) {
# if they have more than one project that is similar
# then have they encoutnereed this before:
pl <- config$project_choice
# if nothing is set then ask them which one:
if (is.null(pl)) {
# get the names of the projects
projs <- unlist(qdapRegex::ex_between(project_lines, ">", "<"))
nums <- unlist(qdapRegex::ex_between(project_lines, "value=\"", "\">"))
nums <- as.numeric(nums)
oldest <- sort.int(nums, index.return = TRUE)$ix
# prompt for an option until they give is a good one
valid_prompt <- FALSE
while (!valid_prompt) {
pl <- readline(
prompt = cat(
"You have multiple projects that have similar names. Which one",
"did you want to use? The oldest project is number 1.",
"(Enter the correct number for your project)\n",
paste(seq_len(length(project_lines)), projs[oldest], sep = ": "),
"\nYour choice will be remembered within this R session,",
"but will need to be entered each time you load a new R session.",
sep = "\n"
)) %>% as.integer()
if (is.element(pl, seq_len(length(project_lines)))) {
valid_prompt <- TRUE
}
}
# set the option for the future
pl <- as.numeric(pl)
config$project_choice <- nums[oldest][pl]
update_rdhs_config(project_choice = nums[oldest][pl])
}
project_lines <- project_lines[oldest[pl]]
} else {
stop(
"Your log in credentials were not recognised by the DHS website.\n",
"Please check your credentials are right (?get_rdhs_config), ",
"and your internet connection for possible error."
)
}
}
# extract the project number from the line above
project_number <- gsub("(.*)<option value=\"([0-9]*)\">(.*)",
"\\2",
project_lines)
# remove the tf
suppressWarnings(file.remove(tf))
# return these credentials to be used for downloading datasets
res <- list(
user_name = your_email,
user_pass = your_password,
proj_id = project_number
)
return(res)
}
#' @noRd
auth_downloads <- function(config){
# authenticate
values <- authenticate_dhs(config)
# grab project number here
project_number <- values$proj_id
# Create post request for the download manager
values <- list(
Proj_ID = project_number,
action = "getdatasets"
)
# re-access the download-datasets page
z <- httr::POST(
"https://dhsprogram.com/data/dataset_admin/download-datasets.cfm",
body = list(proj_id = project_number)
)
# Head to download page
z <- httr::POST("https://dhsprogram.com/data/dataset_admin/index.cfm",
body = values)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.