R/download_survey.r

Defines functions download_survey

Documented in download_survey

#' Download a survey from its Zenodo repository
#'
#' @description Downloads survey data
#' @param survey a URL (see [list_surveys()])
#' @param dir a directory to save the files to; if not given, will save to a temporary directory
#' @importFrom httr GET content status_code http_error
#' @importFrom jsonlite fromJSON toJSON
#' @importFrom curl curl_download
#' @importFrom utils read.csv
#' @importFrom xml2 xml_text xml_find_first
#' @autoglobal
#' @examples
#' \dontrun{
#' list_surveys()
#' peru_survey <- download_survey("https://doi.org/10.5281/zenodo.1095664")
#' }
#' @return a vector of filenames that can be used with [load_survey]
#  @seealso load_survey
#' @export
download_survey <- function(survey, dir = NULL) {
  survey <- sub("^(https?:\\/\\/(dx\\.)?doi\\.org\\/|doi:)", "", survey)
  survey <- sub("#.*$", "", survey)
  is.doi <- (length(survey) > 0) && all(grepl("^10.[0-9.]{4,}/[-._;()/:A-z0-9]+$", survey))
  is.url <- (length(survey) > 0) && (is.doi || grepl("^https?:\\/\\/", survey))

  if (is.url & length(survey) > 1) {
    stop("'survey' must be of length 1")
  }

  if (is.doi) url <- paste0("https://doi.org/", survey) else url <- survey

  if (is.url) {
    temp_body <- GET(url, config = list(followlocation = TRUE))
    if (status_code(temp_body) == 404) stop("DOI '", survey, "' not found")
    if (http_error(temp_body)) {
      stop(
        "Could not fetch the resource. ",
        "This could an issue with the website server or your own connection."
      )
    }

    parsed_body <- content(temp_body, encoding = "UTF-8")
    parsed_cite <- fromJSON(
      xml_text(
        xml_find_first(parsed_body, '//script[@type="application/ld+json"]')
      )
    )

    reference <- list(
      title = parsed_cite$name,
      bibtype = "Misc",
      author = parsed_cite$creator$name,
      year = data.table::year(parsed_cite$datePublished)
    )
    if ("version" %in% names(parsed_cite)) {
      reference[["note"]] <- paste("Version", parsed_cite$version)
    }
    reference[[ifelse(is.doi, "doi", "url")]] <- survey

    data <- data.table(parsed_cite$distribution)
    ## only download csv files
    data <- data[encodingFormat == "csv"]
    data[, file_name := tolower(basename(contentUrl))]

    if (length(unique(data$file_name)) < length(data$file_name)) {
      warning("Zenodo repository contains files with names that only differ by case. ",
              "This will cause unpredictable behaviour on case-insensitive file systems. ",
              "Please contact the authors to get this fixed.")
      data <- data[!duplicated(file_name)]
    }

    if (is.null(dir)) {
      dir <- tempdir()
    }

    message("Getting ", parsed_cite$name, ".")

    # find initial longest common subequence of file names
    i <- 1
    end <- FALSE
    lcs <- ""
    while (!end) {
      initial_bits <- vapply(data$file_name, function(x) {
        substr(x, 1, i)
      }, "x")
      if (length(unique(initial_bits)) > 1) {
        end <- TRUE
      } else {
        lcs <- unique(initial_bits)
        i <- i + 1
      }
    }
    reference_file_path <- file.path(dir, paste0(lcs, "reference.json"))
    reference_json <- toJSON(reference)
    write(reference_json, reference_file_path)

    files <- c(reference_file_path, vapply(seq_len(nrow(data)), function(i) {
      url <- data[i, ]$contentUrl
      temp <- file.path(dir, data[i, ]$file_name)
      message("Downloading ", url)
      dl <- curl_download(url, temp)
      return(temp)
    }, ""))
  } else {
    stop("'survey' is not a DOI or URL.")
  }

  return(files)
}

Try the socialmixr package in your browser

Any scripts or data that you put into this service are public.

socialmixr documentation built on Oct. 29, 2022, 1:09 a.m.