R/url_funs.R

Defines functions format_preference clean_attr get_rounds_link get_href extract_country_html get_download_url grab_url_sddf_late_rounds grab_url country_url_sddf_late_rounds country_url_sddf country_url round_url

round_url <- function(rounds, format = NULL) { # nocov start
  # Get unique rounds to avoid repeting rounds
  rounds <- sort(unique(rounds))

  # Check rounds
  check_rounds(rounds)

  round_codes <- paste0("ESS", rounds)

  # Extract download urls from selected_rounds
  round_links <- sort(grep(pattern = paste0(round_codes, collapse = "|"),
                           x = get_rounds_link(.global_vars$ess_website),
                           value = TRUE))

  full_url <- get_download_url(round_links, format)

  full_url

} # nocov end

country_url <- function(country, rounds, format = NULL) { # nocov start
  # And also the rounds for that country
  check_country_rounds(country, rounds)

  # Only select the rounds that match this:
  # /download.html\\?file= for the downoad section
  # ESS[0-9] for any of the rounds
  # [A-Z]{1,1} followed by a capital letter for
  # the country name, as in DK
  # (.*) anything in between
  # [0-9]{4, } for the year of round
  country_regex <- "^/download.html\\?file=ESS[0-9][A-Z]{1,2}(.*)[0-9]{4, }$"


  full_urls <- grab_url(country, rounds, country_regex, format)

  full_urls
} # nocov end


country_url_sddf <- function(country, rounds, format = NULL) { # nocov start

  # And also the rounds for that country
  check_country_sddf_rounds(country, rounds)

  # Only select the rounds that match this:
  # /download.html\\?file= for the downoad section
  # ESS[0-9]{1,} for any of the rounds
  # [A-Z]{1,1} followed by a capital letter for
  # the country name, as in DK
  # (.*) anything in between
  # [0-9]{4, } for the year of round

  sddf_regex <-
    "^/download.html\\?file=ESS[0-9]{1,}_[A-Z]{1,2}_SDDF(.*)[0-9]{4, }$"


  full_urls <- grab_url(country, rounds, sddf_regex, format)

  full_urls
} # nocov end

country_url_sddf_late_rounds <- function(country, rounds, format = NULL) { # nocov start

  # And also the rounds for that country
  check_country_sddf_rounds(country, rounds)

  # No need to add regex as in country_url because grabbing the rounds
  # returns the direct paths
  full_urls <- grab_url_sddf_late_rounds(rounds, format)

  full_urls
} # nocov end


grab_url <- function(country, rounds, regex, format) { # nocov start

  # Get unique rounds to avoid repeting rounds
  rounds <- sort(unique(rounds))
  # Get unique country to avoid repetitions
  country <- sort(unique(country))

  # Returns the chosen countries html that contains
  # the links to all round website.
  country_round_html <- extract_country_html(country,
                                             .global_vars$country_index)

  # Go deeper in the node to grab that countries url to the rounds
  country_node <- xml2::xml_find_all(country_round_html, "//ul //li //a")

  # Here we have all href from the website
  country_href <- xml2::xml_attrs(country_node, "href")

  incomplete_links <-
    sort(grep(regex,
              country_href,
              value = TRUE))

  # Extract round numbers
  round_numbers <- as.numeric(string_extract(incomplete_links,
                                             "[0-9]{1,2}"))

  # Build final ESS round links
  round_links <- incomplete_links[which(round_numbers %in% rounds)]

  # Using each separate round html, extract the stata or spss direct link
  # to download the data. Return the full direct path.
  full_urls <- get_download_url(round_links, format)

  full_urls
} # nocov end


grab_url_sddf_late_rounds <- function(rounds, format) { #nocov start

  # Get unique rounds to avoid repeting rounds
  rounds <- sort(unique(rounds))

  # Returns the all late SDDF rounds main page
   late_sddf_mainpage <-
    get_href(
      .global_vars$ess_website,
      .global_vars$sddf_index,
      "//ul [@class='docliste']//a"
    )

  # Here we have all late SDDF main websites
  late_sddf_href <- xml2::xml_attrs(late_sddf_mainpage, "href")

  # Extract round numbers
  round_numbers <-
    as.numeric(
      string_extract(late_sddf_href, "[0-9]{1,}")
    )

  # Build final ESS round links
  round_links <- late_sddf_href[which(round_numbers %in% rounds)]

  # Using each separate round html, extract the stata or spss direct link
  # to download the data. Return the full direct path.
  full_urls <- get_download_url(round_links, format)

  full_urls
} #nocov end

# Given a country/round website such as https://www.europeansocialsurvey.org/download.html?file=ESS1ES&c=ES&y=2002,
# extracts the stata (or spss, in that specific order if one is not available) path and
# returns the complete url path to download the data
get_download_url <- function(round_links, format) { # nocov start
  # Create empty character slot with the same length as round links
  # to populate it in the loop
  format.files <- character(length(round_links))

  # Build paths for each round
  for (index in seq_along(round_links)) {
    download_page <- safe_GET(paste0(.global_vars$ess_website,
                                     round_links[index]))

    html_ess <- xml2::read_html(download_page)
    z <- xml2::xml_text(xml2::xml_find_all(html_ess, "//a/@href"))
    format.files[index] <- format_preference(z, format)
  }

  full_urls <- sort(paste0(.global_vars$ess_website, format.files))

  full_urls

} # nocov end

# Function accepts the chosen country and the list
# of all available countries and returns the html doc
# for the chosen country that contains the whole list of
# links to download that countries rounds.
extract_country_html <- function(chosen_module, module_index) { # nocov start

  country_refs  <- get_href(.global_vars$ess_website, module_index)
  # Returns "/data/country.html?c=ukraine" for all countries
  all_module_links <-
    xml2::xml_attr(
      country_refs,
      "href"
    )

  all_module_countries <- xml2::xml_text(country_refs)

  # Build full url for chosen country
  chosen_module_link <-
    paste0(
      all_module_links[which(chosen_module == all_module_countries)]
      # index where the country because all_module_links and
      # all_module_countries are in the same order
    )

  # Extract html from country link to donwnload rounds
  module_rounds <- safe_GET(chosen_module_link)

  module_round_html <- xml2::read_html(module_rounds)

  module_round_html
} # nocov end

# Function to grab <a href="/data/country.html?c=latvia">Latvia</a>
# for each country. The default href path should take all td lists
# of class labelled (most ESS HTML lists) but for SDDF this changes
# to all li lists of clas docliste
get_href <- function(ess_website,
                     module_index,
                     href_xpath = "//td [@class='label']//a") { # nocov start

  download_page <- safe_GET(paste0(ess_website, module_index))

  country_node <-
    xml2::xml_find_all(
      xml2::read_html(download_page),
      href_xpath
    )

  country_node
} # nocov end

# This will return a link like "/download.html?file=ESS8e01&y=2016"
# for every round available in the ess website.
get_rounds_link <- function(ess_website) { # nocov start
  download_page <- safe_GET(paste0(ess_website, "/data/download.html?r="))
  html_ess <- xml2::read_html(download_page)
  z <- xml2::xml_text(xml2::xml_find_all(html_ess, "//a/@href"))

  downloads <- unique(grep("^/download.html(.*)[0-9]{4, }$", z, value = TRUE))
  downloads
} # nocov end

# Function to automatically clean attributes such as
# <a> something <\a>. This will clean to: something
# if a and /a are specified as arguments.
clean_attr <- function(x, ...) { # nocov start
  other_attrs <- paste0((list(...)), collapse = "|")
  gsub(paste0(">|", "<|", other_attrs), "", x)
} # nocov end

# Function to find regex in x with preference in `formats`
format_preference <- function(x, format) { # nocov start
  format <- if (is.null(format)) c("stata", "spss", "sas") else format

  # If the user didn't supply the format (user can only supply only one
  # format and the format arg was manually imputed from above)
  if (length(format) > 1) {

    # Read each format and return the one that has a match
    for (i in format) {
      format_link <- grep(i, x, value = TRUE)
      if (length(format_link) > 0)  return(format_link)
    }

  } else {

    # If the user supplied the format, try reading that format
    # return the link if found or an error otherwise
    format_link <- grep(format, x, value = TRUE)
    if (length(format_link) > 0)  {
      return(format_link)
    } else {
      stop(paste0("Format '", format, "' not available."))
    }

  }
} # nocov end
cimentadaj/essurvey documentation built on Jan. 10, 2022, 2:25 p.m.