R/extract.R

Defines functions vector_to_df_columnwise data_list_correct extract_endpath extract_publisher_name extract_publisher_code extract_modified_date extract_release_date extract_language extract_dataset_name extract_url_format extract_access_url extract_url extract_description extract_publisher_data_url extract_keywords extract_metadata return_metadata extract_data

Documented in data_list_correct extract_access_url extract_data extract_dataset_name extract_description extract_endpath extract_keywords extract_language extract_metadata extract_modified_date extract_publisher_code extract_publisher_name extract_release_date extract_url extract_url_format

#' Extract data from a data_list.
#'
#' @param data_list A data_list similar to resp$result$items[[1]] that contains information on a dataset
#' @param encoding The encoding passed to read (all) the files. Most cases should be resolved with either
#' 'UTF-8', 'latin1' or 'ASCII'.
#'
#' @param guess_encoding A logical stating whether to guess the encoding. This is set to TRUE by default.
#' Whenever guess_encoding is set to TRUE, the 'encoding' argument is ignored. If \code{\link[readr]{guess_encoding}}
#' fails to guess the encoding, openes_load falls back to the encoding argument.
#' @param ... Arguments passed to \code{\link[readr]{read_csv}} and all other \code{read_*} functions.
#'
#' @details \code{get_data} will accept the end path of a data base and it will search for the access url.
#' If the dataset is either a csv, xls, xlsx or xml, then it will
#' attempt to read it. If it succeeds, it will return the data frame. If not, it will return
#' the data frame with only one column containing all available access URL's.
#'
#' For example, this URL: http://datos.gob.es/es/catalogo/a02002834-numero-de-centros-segun-ancho-de-banda-de-la-conexion-a-internet-que-tiene-el-centro6
#' says that it has a XML file but once you click on the 'download' XML, it redirects to a JavaScript based
#' website that has the table. This file unfortunately is unreadable to the package.
extract_data <- function(data_list, encoding, guess_encoding, ...) {

  # Check if the data_list is readable
  is_file_readable <- determine_dataset_url(data_list)

  if (length(is_file_readable) != 0) {
    # Grab the dataset names from the actual names from datos.gob.es
    names_datasets <- determine_dataset_name(data_list)

    if (!all(names_datasets == is_file_readable)) stop('Data cannot be read because it is not in correct order')

    output_data <- vector("list", length(is_file_readable))

    # Loop over each chosen file
    for (index in seq_along(is_file_readable)) {

      data_url <- names(is_file_readable)[index]

      if (guess_encoding) encoding <- determine_dataset_encoding(data_url, encoding)

      custom_locale <- readr::locale("es", encoding = encoding)

      # Determine the delimited of the file
      read_generic <- determine_read_generic(data_url, custom_locale)

      # Try reading the data and saving it in **that** data frame's slot
      output_data[[index]] <-
        try(read_generic(file = data_url,
                         locale = custom_locale,
                         ...),
            silent = TRUE)

      # If the data that was read raised an error, return
      # the format and URL as we would have done if there are no
      # files to read at all from this data_list
      if (inherits(output_data[[index]], "try-error")) {
        output_data[[index]] <-
          tibble::tibble(format = is_file_readable[index],
                         URL = names(is_file_readable[index]))
      }
    }

    # Assign the the same name of ech dataset to each slot
    names(output_data) <- names(names_datasets)

  } else {
    # If no file to read, return all urls and formats
    output_data <- list(return_metadata(data_list))
    names(output_data) <- 'unavailable_formats'
  }


  output_data
}


return_metadata <- function(data_list) {
  # If there's any error, this means that none of the formats
  # could be read. So we return the the tibble with the url
  # formats and the access urls
  output_data <- tibble::tibble(extract_dataset_name(data_list),
                                extract_url_format(data_list),
                                extract_access_url(data_list))

  names(output_data) <- c("name", "format", "URL")

  output_data
}

#' Extract all metadata from a data_list
#'
#' @param data_list A data_list similar to resp$result$items[[1]] that contains information on a dataset
extract_metadata <- function(data_list) {

  keywords <- extract_keywords(data_list)
  description <- extract_description(data_list)
  languages <- extract_language(data_list)
  url_path <- extract_url(data_list)
  url_formats <- extract_url_format(data_list)
  date_data <- extract_release_date(data_list)
  modified_data <- extract_modified_date(data_list)
  publisher <- extract_publisher_name(data_list)
  publisher_data_url <- extract_publisher_data_url(data_list)


  # Title, description, languages and url_path
  # are vector of the same length as the number of
  # languages. We construct them with N rows
  # and then add the url_path (which is
  # the path to the info of the dataset
  # and no the accessURL). That is, the data frame
  # is in long format

  first_df <-
    tibble::tibble(
      language = languages,
      description = description
    )

  first_df$url <- url_path

  # Do not change this to tibble because
  # first_df is a df and it will throw an error.
  # data.frame in this case allows to pass a data frrame
  # and bins the columns together
  data_df <-
    data.frame(
      keywords = keywords,
      first_df,
      date_issued = date_data,
      date_modified = modified_data,
      publisher = publisher,
      publisher_data_url = publisher_data_url,
      stringsAsFactors = FALSE
    )

  final_df <- tibble::as_tibble(data_df)
  final_df
}

#' Extract keywords from data_list
#'
#' @inheritParams extract_metadata
extract_keywords <- function(data_list) {

  if (!'keyword' %in% names(data_list)) {
    return("No keywords available")
  }

  keywords <- paste0(unlist(data_list$keyword), collapse = "; ")
  keywords
}

extract_publisher_data_url <- function(data_list) {

  if (!'identifier' %in% names(data_list)) {
    return("No identifier available")
  }

  data_list$identifier
}

#' Extract description from data_list
#'
#' @inheritParams extract_metadata
extract_description <- function(data_list) {

  if (!'_value' %in% names(unlist(data_list$description))) {
    return("No description available")
  }

  desc <- unlist(data_list$description)
  descriptions <- unname(desc[names(desc) == "_value"])

  descriptions
}

#' Extract URL from datos.gob.es from data_list
#'
#' @inheritParams extract_metadata
extract_url <- function(data_list) {

  if (!'_about' %in% names(data_list)) {
    return("No URL available")
  }

  info_url <- data_list$`_about`
  info_url
}

#' Extract access URL to the actual data from data_list
#'
#' @inheritParams extract_metadata
extract_access_url <- function(data_list) {

  if (!'accessURL' %in% names(unlist(data_list$distribution))) {
    return("No access URL available")
  }

  if (is.null(getElement(data_list$distribution, "accessURL"))) {
    access_url <- vapply(data_list$distribution, function(x) x$accessURL,
                         FUN.VALUE = character(1))
  } else {
    access_url <- data_list$distribution$accessURL
  }

  distr <- unlist(data_list['distribution'])

  access_url <- distr[names(distr) == 'distribution.accessURL']

  access_url
}

#' Extract the format of the dataset from data_list
#'
#' For example, csv or xml
#' @inheritParams extract_metadata
extract_url_format <- function(data_list) {

  if (!'format.value' %in% names(unlist(data_list$distribution))) {
    return("No format available")
  }

  distr <- unlist(data_list['distribution'])
  raw_formats <- unname(distr[names(distr) == 'distribution.format.value'])
  # mimemap is a vector in R/utils.R containing all formats. For more info
  # go to utils.R
  correct_formats <- names(mimemap)[match(raw_formats, mimemap, nomatch = numeric())]

  correct_formats
}

#' Extract the dataset names from data_list
#'
#' For example, elecciones2016.csv and elecciones2014.csv
#' @inheritParams extract_metadata
extract_dataset_name <- function(data_list) {

  # We've had problems with this. A data_list['distribution']
  # might be a simple list with the details of a dataset (if that path_id only has
  # ONE dataset). However, if that dataset has several files (xml, csv, whatever..)
  # then it is a list with several slots where each slot contains the details of each
  # datasets. Because we will use a vapply below, we always want the distribution (object below)
  # to be a list where we can loop through each slot and grab the dataset name we need to check
  # whether the data_list has only one dataset or many.

  # If there is only ONE dataset, then the condition below should find that subsetting the
  # distribution should already contain the details. For that reason, we test whether
  # the slot _about is present. Why not test directly whether the 'title' of the dataset
  # is in there? We are indeed looking for that! We don't do that because there are
  # datasets which might not have a name (that is NO slot, but named as distribution without a name on the website).

  # We assume that the dataset must have a _about field. If the dataset does, then we subset to keep
  # that dataset as a list where the first slot contains the data of the only dataset. If we can't find
  # the _about field it means that there's a lot of datasets and we double subset to keep each dataset
  # as a slot, where each slot contains that format's details.
  if ("_about" %in% names(data_list[['distribution']])) {
    distribution <- data_list['distribution']
  }  else {
    distribution <- data_list[['distribution']]
  }

  # The name of the data, in principle, is according to the language.
  # That is, if there's english, catalan and spanish, there will be
  # three dataset names such as votos_politica.csv, votos_politica.csv, etc..
  # In practice, no one names their datasets differently per language
  # But to avoid creating a complex chain of which names to pick, I
  # always pick the first language, assuming that there's at least one
  # because the previous check makes sure there is at least one

  check_if_title_exists <- function(x) if ("title" %in% names(x[[1]])) x[[1]]$title[[1]][['_value']] else "Distribucion sin nombre"

  data_set_names <-
    vapply(distribution,
           check_if_title_exists, FUN.VALUE = character(1))

  data_set_names
}

#' Extract access languages available from data_list
#'
#' @inheritParams extract_metadata
extract_language <- function(data_list) {

  if (!'_lang' %in% names(unlist(data_list$description))) {
    return("No language available")
  }

  desc <- unlist(data_list$description)

  languages <- unname(desc[names(desc) == "_lang"])

  languages
}

#' Extract the date at which the data was submitted from data_list
#'
#' The date is currently exported as a string but
#' should be turned into a a date class
#' @inheritParams extract_metadata
extract_release_date <- function(data_list) {

  if (!'issued' %in% names(data_list)) {
    return("No release date available")
  }

  # So that we can read abbriated months
  # such as ene, mar
  spanish_locale <- if (.Platform$OS.type == "windows") "Spanish" else "es_ES.UTF-8"
  # So that we can read abbreviated months
  # such as ene, mar
  orig_locale <- Sys.getlocale("LC_TIME")
  Sys.setlocale("LC_TIME", spanish_locale)
  on.exit(Sys.setlocale("LC_TIME", orig_locale))

  issued <- as.POSIXct(substr(data_list$issued, 6, 25),
                       format = "%d %b %Y %H:%M:%S", tz = "GMT")

  # For now, but this should be converted
  # to date time
  issued
}

#' Extract the date at which the data was modified from data_list
#'
#' The date is currently exported as a string but
#' should be turned into a a date class
#' @inheritParams extract_metadata
extract_modified_date <- function(data_list) {

  if (!'modified' %in% names(data_list)) {
    return("No modification date available")
  }

  spanish_locale <- if (.Platform$OS.type == "windows") "Spanish" else "es_ES.UTF-8"
  # So that we can read abbriated months
  # such as ene, mar
  orig_locale <- Sys.getlocale("LC_TIME")
  Sys.setlocale("LC_TIME", spanish_locale)
  on.exit(Sys.setlocale("LC_TIME", orig_locale))

  modified <- as.POSIXct(substr(data_list$modified, 6, 25),
                         format = "%d %b %Y %H:%M:%S", tz = "GMT")

  # For now, but this should be converted
  # to date time
  modified
}

#' Extract the publisher code of the dataset from data_list
#'
#' @inheritParams extract_metadata
extract_publisher_code <- function(data_list) {

  if (!'publisher' %in% names(data_list)) {
    return("No publisher available")
  }

  publisher_code <- sub(".*\\/", "", data_list$publisher)

  publisher_code
}

#' Extract the publisher name of the dataset from data_list
#'
#' @inheritParams extract_metadata
extract_publisher_name <- function(data_list) {

  if (!'publisher' %in% names(data_list)) {
    return("No publisher available")
  }

  publisher_code <- sub(".*\\/", "", data_list$publisher)
  publisher_name <- translate_publisher(publisher_code)

  publisher_name
}


#' Extract the end path of the dataset that directs to datos.gob.es from a data_list
#'
#' @inheritParams extract_metadata
extract_endpath <- function(data_list) {

  if (!'_about' %in% names(data_list)) {
    return("No link to the data in datos.gob.es")
  }

  end_path <- sub(".*\\/", "", data_list[["_about"]])

  end_path
}



#' Check data_list is in correct formats
#'
#' When new checks come up, add them in
#' the same format: logical tests first
#' and then add them to the if statement
#'
#' @param raw_json Raw JSON response from datos.gob.es
data_list_correct <- function(raw_json) {

  no_items <- !"items" %in% names(raw_json$result)
  no_datasets <- length(raw_json$result$items) == 0

  if (no_items || no_datasets) return(FALSE)

  data_list <- raw_json$result$items[[1]]

  wrong_length <- length(data_list) == 0
  no_names <- is.null(attr(data_list, 'names'))
  # Because the URL slot is in the distribution
  # and all data_lists must follow the same structure
  no_distribution_slot <- !'distribution' %in% names(data_list)
  no_description_slot <- !'description' %in% names(data_list)

  failure_tests <- c(
    wrong_length,
    no_names,
    no_distribution_slot,
    no_description_slot
  )

  if (any(failure_tests)) {
    return(FALSE)
  }

  TRUE
}

# Some of the extract_components return vectors of length > 1
# that refers to many languages. Like the description
# comes in three langauges. We want those vectors to be turned
# into columns with the language prefix in the column names
vector_to_df_columnwise <- function(vec, column_names) { # nocov start
  semi_df <- tibble::as_tibble(matrix(vec, ncol = length(vec)))
  names(semi_df) <- column_names
  semi_df
} # nocov end
cimentadaj/datos_gob documentation built on April 16, 2021, 11:47 a.m.