R/authentication.R

Defines functions auth_downloads authenticate_dhs download_datasets available_datasets

Documented in authenticate_dhs available_datasets download_datasets

#' Create a data frame of datasets that your log in can download
#'
#' DHS datasets that can be downloaded
#' @param config Object of class `rdhs_config` as produced by `read_rdhs_config`
#'   that must contain a valid `email`, `project` and `password`.
#' @param datasets_api_results Data.table for the api results for the datasets
#'   endpoint. Default = NULL and
#' generated by default if not declared.
#' @param surveys_api_results  Data.table for the api results for the surveys
#'   endpoint. Default = NULL and
#' generated by default if not declared.
#'
#' @note Inspiration for function to
#'   \url{https://github.com/ajdamico/lodown/blob/master/R/dhs.R}
#'
#' @return Returns \code{"data.frame"} of length 14:
#' \itemize{
#'       \item{"FileFormat"}
#'       \item{"FileSize"}
#'       \item{"DatasetType"}
#'       \item{"SurveyNum"}
#'       \item{"SurveyId"}
#'       \item{"FileType"}
#'       \item{"FileDateLastModified"}
#'       \item{"SurveyYearLabel"}
#'       \item{"SurveyType"}
#'       \item{"SurveyYear"}
#'       \item{"DHS_CountryCode"}
#'       \item{"FileName"}
#'       \item{"CountryName"}
#'       \item{"URLS"}
#'       }
#'
available_datasets <- function(config,
                               datasets_api_results = NULL,
                               surveys_api_results = NULL) {


  # fetch all the datasets meta from the api if to already passed
  if (is.null(datasets_api_results) || is.null(surveys_api_results)) {
    datasets_api_results <- dhs_datasets()
    surveys_api_results <- dhs_surveys()
  }

  # set up temp file for unpacking bins
  tf <- tempfile(fileext = ".txt")
  values <- authenticate_dhs(config)

  # grab project number here
  project_number <- values$proj_id

  # re-access the download-datasets page
  z <- httr::POST(
    "https://dhsprogram.com/data/dataset_admin/download-datasets.cfm",
    body = list(proj_id = project_number)
  )

  # write the information from the `countries` page to a local file
  writeBin(z$content, tf)

  # load the text
  y <- brio::read_lines(tf)

  # DHS Website has changed and the POST request data requires two steps here

  # Create post request for the download manager
  values <- list(
    Proj_ID = project_number,
    action = "getdatasets"
  )

  # Head to download page
  z <- httr::POST("https://dhsprogram.com/data/dataset_admin/index.cfm",
                  body = values)

  # Create post request for the download manager
  values <- list(
    Proj_ID = project_number,
    action = "downloadmanager"
  )

  # Head to download page
  z <- httr::POST("https://dhsprogram.com/data/dataset_admin/index.cfm",
                  body = values)


  # Grab the content from that and start creation for last post request
  writeBin(z$content, tf)
  # load the text
  y <- brio::read_lines(tf)


  # Donqwload manager post creation
  ctrycodelist_lines <- grep("name=\"ctrycodelist\" value=", y, value = TRUE)
  ctrycodelist <- qdapRegex::rm_between(
    ctrycodelist_lines, 'value=\"', '\"', extract = TRUE
  )

  names(ctrycodelist) <- rep("ctrycodelist", length(ctrycodelist))
  class(ctrycodelist) <- "list"

  filedatatypelist_DHS_line <- grep("name=\"filedatatypelist_", y, value = TRUE)
  filedatatypelist_DHS <- qdapRegex::rm_between(
    filedatatypelist_DHS_line, 'value=\"', '\"', extract = TRUE
  )

  names(filedatatypelist_DHS) <- paste0(
    "filedatatypelist_",
    qdapRegex::rm_between(filedatatypelist_DHS_line,
                          "filedatatypelist_", "\" value", extract = TRUE)
  )
  class(filedatatypelist_DHS) <- "list"

  formatlist <- grep("fformatlist", y, value = TRUE)
  formatlist <- qdapRegex::rm_between(
    formatlist, 'value=\"', '\"', extract = TRUE
  )

  names(formatlist) <- rep("fformatlist", length(formatlist))
  class(formatlist) <- "list"

  values <- list(
    surveymode = "all",
    Proj_ID = project_number,
    action = "downloadmanager",
    subaction = "Build URL File List",
    sub = "submit",
    submit = "Build URL File List",
    FileDataTypeCode = "",
    ctrycode = ""
  )

  values <- append(values,
                   values = c(ctrycodelist, filedatatypelist_DHS, formatlist))

  # submit request for all the possible datasets
  message("Creating Download url list from DHS website...")
  z <- httr::POST(
    "https://dhsprogram.com/data/dataset_admin/index.cfm",
    body = values
  )

  link.urls <- xml2::xml_find_all(httr::content(z), "//a")

  # pull all links download and read in
  url_link <- paste0("https://dhsprogram.com", grep(
    pattern = "/data/download/urlslist",
    xml2::xml_attr(link.urls, "href"), value = TRUE
  ))

  httr::GET(
    url_link, httr::user_agent("https://github.com/ropensci/rdhs"),
    destfile = tf, httr::write_disk(tf, overwrite = TRUE)
  )

  urls <- brio::read_lines(tf)
  urls <- urls[-which(!nzchar(urls))]

  # start filling in the end result data frame of all available datasets
  res <- matrix(data = "", nrow = length(urls),
                ncol = dim(datasets_api_results)[2] + 1)
  colnames(res) <- c(names(datasets_api_results), "URLS")
  res <- as.data.frame(res, stringsAsFactors = FALSE)
  res$URLS <- urls
  res$FileName <- qdapRegex::rm_between(urls, "Filename=", "&Tp",
                                        extract = TRUE) %>% unlist()
  res$DHS_CountryCode <- qdapRegex::rm_between(urls, "Ctry_Code=", "&surv_id",
                                               extract = TRUE) %>% unlist()

  # match meta using filenames and countrycodes
  # (India has subnational datasets that clash)
  fileName_matches <- match(
    paste0(toupper(res$FileName),
           toupper(res$DHS_CountryCode)),
    paste0(toupper(datasets_api_results$FileName),
           toupper(datasets_api_results$DHS_CountryCode))
  )
  res_matches <- which(!is.na(fileName_matches))

  # remove any missing matches (shouldn't happen if API up to date)
  if (sum(is.na(fileName_matches)) > 0) {
    message("\nSome of your available datasets are not found in the DHS API. \n",
            "This is likely due to the DHS API being out of date and as such \n",
            "some of the meta information about your available datasets \n",
            "may not be available.")
    fileName_matches <- fileName_matches[-which(is.na(fileName_matches))]
  }

  s <- seq_len(length(datasets_api_results))
  res[res_matches, s] <- datasets_api_results[fileName_matches, ]

  return(res)
}


#' Create a data frame of datasets that your log in can download
#'
#' Download datasets specified using output of \code{available_datasets}.
#' @param config Object of class `rdhs_config` as produced by `read_rdhs_config`
#'   that must contain a valid `email`, `project` and `password`.
#' @param desired_dataset Row from \code{available_datasets}
#' @param download_option Character dictating how the survey is stored when
#'   downloaded. Must be one of:
#'   \itemize{
#'       \item{"zip"} - Just the zip. "z", "i", "p" or "zip" will match
#'       \item{"rds"} - Just the read in and saved rds. "r", "d", "s" or "rdhs"
#'       will match
#'       \item{"both"} - Both the rds and extract. "b", "o", "t", "h" or "both"
#'       will match
#'       }
#' @param reformat Boolean detailing whether dataset rds should be
#'    reformatted for ease of use later. Default = TRUE
#' @param all_lower Logical indicating whether all value labels should be
#'   lower case. Default to `TRUE`.
#' @param output_dir_root Directory where files are to be downloaded to
#' @param ... Any other arguments to be passed to
#'   \code{\link{read_dhs_dataset}}
#'
download_datasets <- function(config,
                              desired_dataset,
                              download_option = "both",
                              reformat=TRUE,
                              all_lower=TRUE,
                              output_dir_root=NULL,
                              ...) {

  # possible download options:
  download_possibilities <- c("zip", "rds", "both")
  download_option <- grep(
    paste0(strsplit(download_option, "") %>% unlist(), collapse = "|"),
    download_possibilities)

  if (!is.element(download_option, 1:3)) {
    stop("Download option specified not one of zip,rds,both")
  }

  # handle output dir
  dataset_dir <- file.path(output_dir_root)
  if (reformat) {
    dataset_dir <- paste0(dataset_dir, "_reformatted")
  }

  # make sure the folder exists and create the zip path
  dir.create(dataset_dir, showWarnings = FALSE, recursive = TRUE)
  zip_path <- file.path(dataset_dir, desired_dataset$FileName)

  # download our zip and parse the response for any errors
  message("Downloading: \n", paste0(desired_dataset$CountryName, " ",
                                   desired_dataset$SurveyYear, " ",
                                   desired_dataset$SurveyType, " ",
                                   desired_dataset$FileType, " ",
                                   desired_dataset$FileFormat, " ",
                                   "[", desired_dataset$FileName, "]",
                                   collapse = ", "))

  # set up temp file for unpacking bins
  # annoyingly we have to do this because some zips have
  # been zipped with the same name several times
  # so they can not be unzipped to the same directory.
  # Thus we bounce unzips between these two dirs.
  tf <- tempfile()
  tdir <- tempfile()
  on.exit(unlink(c(tf, tdir), recursive = TRUE, force = TRUE))

  # create a simple while loop on file size check and carry this
  # out three times befoe stopping
  file_size_check <- TRUE
  attempts <- 3

  # if the downloaded file is not the size we expect
  # then re log in the first time
  while (file_size_check & attempts > 0) {

    # download zip to our tempfile
    if (Sys.getenv("rdhs_LOUD_DOWNLOAD") == TRUE) {
      resp <- httr::GET(desired_dataset$URLS[1],
                        destfile = tf,
                        httr::user_agent("https://github.com/ropensci/rdhs"),
                        httr::write_disk(tf, overwrite = TRUE),
                        httr::progress()
      ) %>% handle_api_response(to_json = FALSE)
    } else {
      resp <- httr::GET(desired_dataset$URLS[1],
                        destfile = tf,
                        httr::user_agent("https://github.com/ropensci/rdhs"),
                        httr::write_disk(tf, overwrite = TRUE)
      ) %>% handle_api_response(to_json = FALSE)
    }

    # if it's not the right size and first time we've tried then log in
    if (file.size(tf) != desired_dataset$FileSize[1] & attempts == 3) {

      # do updated authentication procedure
      auth_downloads(config)

    } else if (file.size(tf) == desired_dataset$FileSize[1]) {
      file_size_check <- FALSE
      attempts <- 0
    }

    attempts <- attempts - 1
  }


  ## If the zip contains a nested zip file of the same name as the desired file,
  ## unzip and replace the file with the nested zip.

  nest_zf <- grep(desired_dataset$FileName,
                  unzip(tf, list = TRUE)$Name,
                  ignore.case = TRUE,
                  value = TRUE)
  while (length(nest_zf)){
    tf <- unzip(tf, nest_zf[1], exdir = tdir)
    nest_zf <- grep(desired_dataset$FileName,
                    unzip(tf, list = TRUE)$Name,
                    ignore.case = TRUE,
                    value = TRUE)
  }

  ## DOWNLOAD OPTIONS HANDLING:

  # 1. Just the zip - we'll always do this and then if it's 2 remove it later

  # if it's just the zip then we copy it to the
  # directory return the file path for this
  res <- file.copy(tf, to = zip_path, overwrite = TRUE)
  res <- if (res) {
    zip_path
  } else {
    stop("Failed to download zip to where client root is")
  }

  # 2/3. rds or both
  if (download_option >= 2) {

    # now read the dataset in with the requested reformat options
    res <- read_dhs_dataset(zip_path,
                            dataset = desired_dataset,
                            reformat, all_lower, ...
    )

    # handle results. If it's character it's because we
    # haven't yet got a parser we are happy with
    if (!is.character(res)) {

      # let's assign the file name attribute to the res
      attr(res$dataset, which = "filename") <- desired_dataset$file

      # set up the rds_path to save the dataset
      rds_path <- file.path(dataset_dir, paste0(desired_dataset$file, ".rds"))

      # save the dataset out
      saveRDS(res$dataset, rds_path)



      # if the class of the object is from a geo or geo_covariates file then we
      # will just return the rds path
      if (inherits(res$dataset, "sf") ||
          desired_dataset$FileType == "Geographic Data" ||
          desired_dataset$FileType == "Geospatial Covariates") {
        res <- rds_path
      } else {
        # if its a dataset then we return the path and the
        # code_descriptions as these are useful to have cached
        res$dataset <- rds_path
      }
    }

    # 3. If not both then delete the zip
    if (download_option != 3) {
      file.remove(zip_path)
    }
  }

  message("Dataset download finished")
  return(res)
}



#' Authenticate Users for DHS website
#'
#' @title DHS Website Authentication
#' @param config Object of class `rdhs_config` as produced by `read_rdhs_config`
#'   that must contain a valid `email`, `project` and `password`.
#'
#' @details If the user has more than one project that contains the first
#'   30 characters of the provided project they will be prompted to choose
#'   which project they want. This choice will be saved so they do
#'   not have to enter it again in this R session.
#'
#' @note Credit for some of the function to
#'   \url{https://github.com/ajdamico/lodown/blob/master/R/dhs.R}
#'
#' @return Returns list of length 3:
#'   \itemize{
#'       \item user_name: your email usually
#'       \item user_pass: your password you provided
#'       \item proj_id: your project number
#'       }
#'
#'
#'

authenticate_dhs <- function(config) {

  your_email <- config$email
  your_project <- config$project
  your_password <- config$password

  # Argument Checking
  if (!is.character(your_email)) stop("your_email is not a string")
  if (!is.character(your_project)) stop("your_project is not a string")
  if (!is.character(your_password)) stop("your_password is not a string")

  # authentication page
  terms <- "https://dhsprogram.com/data/dataset_admin/login_main.cfm"

  # create a temporary file
  tf <- tempfile(fileext = ".txt")

  # set the username and password
  values <- list(
    UserName = your_email,
    UserPass = your_password,
    Submitted = 1,
    UserType = 2
  )

  # log in.
  message("Logging into DHS website...")
  z <- httr::POST(terms, body = values) %>% handle_api_response(to_json = FALSE)

  # extract the available countries from the projects page
  # write the information from the `projects` page to a local file
  writeBin(z$content, tf)

  # load the text
  y <- brio::read_lines(tf)

  # figure out the project number - only use first 30 chars due to ellipsis
  # formation if it is longer than 30
  if (nchar(your_project) > 30) {
    project_lines <- unique(
      y[grepl("option value", y) &
          grepl(paste0(strsplit(your_project, "")[[1]][1:30], collapse = ""),
                y, fixed = TRUE)
        ])
  } else {
    project_lines <- unique(
      y[grepl("option value", y) &
          grepl(paste0(strsplit(your_project, "")[[1]], collapse = ""),
                y, fixed = TRUE)
        ])
  }

  # confirm only one project and handle if more than
  if (length(project_lines) == 1) {

  } else {
    if (length(project_lines) > 1) {

      # if they have more than one project that is similar
      # then have they encoutnereed this before:
      pl <- config$project_choice

      # if nothing is set then ask them which one:
      if (is.null(pl)) {

        # get the names of the projects
        projs <- unlist(qdapRegex::ex_between(project_lines, ">", "<"))
        nums <- unlist(qdapRegex::ex_between(project_lines, "value=\"", "\">"))
        nums <- as.numeric(nums)
        oldest <- sort.int(nums, index.return = TRUE)$ix
        # prompt for an option until they give is a good one
        valid_prompt <- FALSE
        while (!valid_prompt) {
          pl <- readline(
            prompt = cat(
              "You have multiple projects that have similar names. Which one",
              "did you want to use? The oldest project is number 1.",
              "(Enter the correct number for your project)\n",
              paste(seq_len(length(project_lines)), projs[oldest], sep = ": "),
              "\nYour choice will be remembered within this R session,",
              "but will need to be entered each time you load a new R session.",
              sep = "\n"
            )) %>% as.integer()

          if (is.element(pl, seq_len(length(project_lines)))) {
            valid_prompt <- TRUE
          }
        }

        # set the option for the future
        pl <- as.numeric(pl)
        config$project_choice <- nums[oldest][pl]
        update_rdhs_config(project_choice = nums[oldest][pl])

      }
      project_lines <- project_lines[oldest[pl]]
    } else {
      stop(
        "Your log in credentials were not recognised by the DHS website.\n",
        "Please check your credentials are right (?get_rdhs_config), ",
        "and your internet connection for possible error."
      )
    }
  }

  # extract the project number from the line above
  project_number <- gsub("(.*)<option value=\"([0-9]*)\">(.*)",
                         "\\2",
                         project_lines)

  # remove the tf
  suppressWarnings(file.remove(tf))

  # return these credentials to be used for downloading datasets
  res <- list(
    user_name = your_email,
    user_pass = your_password,
    proj_id = project_number
  )

  return(res)
}

#' @noRd
auth_downloads <- function(config){

  # authenticate
  values <- authenticate_dhs(config)

  # grab project number here
  project_number <- values$proj_id

  # Create post request for the download manager
  values <- list(
    Proj_ID = project_number,
    action = "getdatasets"
  )

  # re-access the download-datasets page
  z <- httr::POST(
    "https://dhsprogram.com/data/dataset_admin/download-datasets.cfm",
    body = list(proj_id = project_number)
  )

  # Head to download page
  z <- httr::POST("https://dhsprogram.com/data/dataset_admin/index.cfm",
                  body = values)

}
OJWatson/rdhs documentation built on April 4, 2024, 10:46 a.m.