R/internal-utils.R
In spanishoddata: Get Spanish Origin-Destination Data

Documented in spod_assert_package spod_convert_dates_to_ranges spod_dates_argument_to_dates_seq spod_expand_dates_from_regex spod_get_valid_dates spod_graphql_valid_dates spod_infer_data_v_from_dates spod_is_data_version_overlaps spod_match_data_type spod_match_data_type_for_local_folders spod_request_length spod_unique_separated_ids spod_zone_names_en2es

#' Convert multiple formates of date arguments to a sequence of dates
#'
#' This function processes the date arguments provided to various functions in the package. It can handle single dates and arbitratry sequences (vectors) of dates in ISO (YYYY-MM-DD) and YYYYMMDD format. It can also handle date ranges in the format 'YYYY-MM-DD_YYYY-MM-DD' (or 'YYYYMMDD_YYYYMMDD'), date ranges in named vec and regular expressions to match dates in the format `YYYYMMDD`.
#'
#' @param dates A `character` or `Date` vector of dates to process. Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true dates range is checked against the available data for each version on every function run.
#'
#' The possible values can be any of the following:
#'
#'  * For the `spod_get()` and `spod_convert()` functions, the `dates` can be set to "cached_v1" or "cached_v2" to request data from cached (already previously downloaded) v1 (2020-2021) or v2 (2022 onwards) data. In this case, the function will identify and use all data files that have been downloaded and cached locally, (e.g. using an explicit run of `spod_download()`, or any data requests made using the `spod_get()` or `spod_convert()` functions).
#'
#'  * A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. `character` or `Date` object.
#'
#' * A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. `character` or `Date` object. Can be any non-consecutive sequence of dates.
#'
#' * A date range
#'
#'   * eigher a `character` or `Date` object of length 2 with clearly named elements `start` and `end` in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. `c(start = "2020-02-15", end = "2020-02-17")`;
#'
#'   * or a `character` object of the form `YYYY-MM-DD_YYYY-MM-DD` or `YYYYMMDD_YYYYMMDD`. For example, `2020-02-15_2020-02-17` or `20200215_20200217`.
#'
#' * A regular expression to match dates in the format `YYYYMMDD`. `character` object. For example, `^202002` will match all dates in February 2020.
#'
#'
#' @return A `character` vector of dates in ISO format (YYYY-MM-DD).
#' @keywords internal
spod_dates_argument_to_dates_seq <- function(dates) {
  if (is.null(dates) || (!is.character(dates) && !inherits(dates, "Date"))) {
    stop(
      "Invalid date input format. Please provide a character vector or Date object."
    )
  }
  if (length(dates) == 1 && dates %in% c("cached_v1", "cached_v2")) {
    return(dates)
  }

  range_regex <- "^\\d{4}(-\\d{2}){2}_\\d{4}(-\\d{2}){2}$|^\\d{8}_\\d{8}$"
  single_date_regex <- "^(\\d{4}-\\d{2}-\\d{2}|\\d{8})$"
  # If dates is a vector of length one
  # Check if is single date, date range, or regex pattern
  if (length(dates) == 1) {
    # Check if date range
    # match both YYYY-MM-DD_YYYY-MM-DD and YYYYMMDD_YYYYMMDD
    if (grepl(range_regex, dates)) {
      date_parts <- strsplit(dates, "_")[[1]]
      date_parts <- lubridate::ymd(date_parts)
      dates <- seq.Date(date_parts[1], date_parts[2], by = "day")

      # if dates does not match the date range pattern
      # check if it is just a single day in YYYY-MM-DD or YYYYMMDD format
    } else if (grepl(single_date_regex, dates)) {
      dates <- lubridate::ymd(dates)

      # assume it is a regex pattern
    } else {
      dates <- spod_expand_dates_from_regex(dates)
      # since spod_expand_dates_from_regex already uses the metadata to generate valid dates we can skip any checks that are required for other date formats and only check for date overlaps between data versions
      if (isFALSE(spod_is_data_version_overlaps(dates))) {
        return(dates)
      }
    }

    # If dates if a vector of multiple values
  } else if (length(dates) > 1) {
    # Check if it is of length 2, then it may be a date range
    if (length(dates) == 2 & !is.null(names(dates))) {
      # if the vector is named with 'start' and 'end', we can assume it is a date range
      if (all(names(dates) %in% c("start", "end"))) {
        date_parts <- lubridate::ymd(dates)
        names(date_parts) <- names(dates)
        # check if start is before end
        if (date_parts["start"] > date_parts["end"]) {
          stop("Start date must be before end date.")
        }
        dates <- seq.Date(date_parts["start"], date_parts["end"], by = "day")
      }
    } else {
      # this is apparantly a sequence of dates
      dates <- lubridate::ymd(dates)
    }
  }

  # now that we have a clean sequence of dates, we can check for overlaps between data versions
  if (isFALSE(spod_is_data_version_overlaps(dates))) {
    return(dates)
  }
}


#' Check if specified dates span both data versions
#'
#' This function checks if the specified dates or date ranges span both v1 and v2 data versions.
#'
#' @param dates A `Dates` vector of dates to check.
#' @return `TRUE` if the dates span both data versions, `FALSE` otherwise.
#' @keywords internal
spod_is_data_version_overlaps <- function(dates) {
  all_dates_v1 <- spod_get_valid_dates(ver = 1)
  all_dates_v2 <- spod_get_valid_dates(ver = 2)

  if (any(dates %in% all_dates_v1) && any(dates %in% all_dates_v2)) {
    stop(paste0(
      "Dates found in both v1 and v2 data. The v1 and v2 data sets may not be comparable. Please see the respective codebooks and methodology documents: run `spod_codebook(1)` and `spod_codebook(2)`.\nThe valid dates range for v1 is: ",
      paste(spod_convert_dates_to_ranges(all_dates_v1), collapse = ", "),
      "\nThe valid dates range for v2 is: ",
      paste(spod_convert_dates_to_ranges(all_dates_v2), collapse = ", "),
      "\n To get the list of valid dates, you can use `spod_get_valid_dates(1)` and `spod_get_valid_dates(2)`."
    ))
  }
  return(FALSE)
}

#' Infer data version from dates
#'
#' @inheritParams spod_download
#' @inheritParams spod_dates_argument_to_dates_seq
#' @return An `integer` indicating the inferred data version.
#' @keywords internal
spod_infer_data_v_from_dates <- function(
  dates,
  ignore_missing_dates = FALSE
) {
  # check if user is requesting to just get all cached data
  if (length(dates) == 1) {
    if (as.character(dates) %in% c("cached_v1", "cached_v2")) {
      return(
        as.integer(stringr::str_extract(as.character(dates), "[0-9]$"))
      )
    }
  }
  cached_data_requested <- length(dates) == 1 &&
    all(as.character(dates) %in% c("cached_v1", "cached_v2"))

  # in case of overlap
  # will throw an error from the spod_is_data_version_overlaps
  if (spod_is_data_version_overlaps(dates)) {
    invisible(return(NULL))
  }

  # if no overlap identified above, compare with date ranges
  v1_dates <- spod_get_valid_dates(ver = 1)
  v2_dates <- spod_get_valid_dates(ver = 2)

  # Check if all dates are missing
  missing_dates <- dates[!dates %in% c(v1_dates, v2_dates)]
  if (length(missing_dates) == length(dates)) {
    stop(paste0(
      "All requested dates are missing from the available data.\nThe valid dates range for v1 is: ",
      paste(spod_convert_dates_to_ranges(v1_dates), collapse = ", "),
      "\nThe valid dates range for v2 is: ",
      paste(spod_convert_dates_to_ranges(v2_dates), collapse = ", "),
      ".\nYou requested the following missing dates: ",
      paste0(missing_dates, collapse = ", "),
      "\nSome of these dates are missing 'naturally' because of the mobile network outages. You can check the up to date list of the missing dates on the source data page at https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/opendata-movilidad",
      "\nTo get the list of valid dates, you can use `spod_get_valid_dates(1)` and `spod_get_valid_dates(2)`.",
      "\nConsider revising your date range or handling missing dates appropriately."
    ))
  }

  if (all(dates %in% v1_dates)) {
    return(1)
  } else if (all(dates %in% v2_dates)) {
    return(2)
  } else {
    # Handle missing dates based on ignore_missing_dates argument
    missing_dates <- dates[!dates %in% c(v1_dates, v2_dates)]
    if (length(missing_dates) > 0) {
      if (ignore_missing_dates == TRUE) {
        # Filter out missing dates and infer version based on the remaining ones
        valid_dates <- dates[dates %in% c(v1_dates, v2_dates)]
        if (all(valid_dates %in% v1_dates)) {
          return(1)
        } else if (all(valid_dates %in% v2_dates)) {
          return(2)
        } else {
          # If no valid dates remain, or none fully match a version
          return(NULL)
        }
      } else if (ignore_missing_dates == FALSE) {
        # Stop with an error if ignore_missing_dates is FALSE
        stop(paste0(
          "Some dates do not match the available data.\nThe valid dates range for v1 is: ",
          paste(spod_convert_dates_to_ranges(v1_dates), collapse = ", "),
          "\nThe valid dates range for v2 is: ",
          paste(spod_convert_dates_to_ranges(v2_dates), collapse = ", "),
          ".\nYou requested the following missing dates: ",
          paste0(missing_dates, collapse = ", "),
          "\nSome of these dates are missing 'naturally' because of the mobile network outages. You can check the up to date list of the missing dates on the source data page at https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/opendata-movilidad",
          "\nTo get the list of valid dates, you can use `spod_get_valid_dates(1)` and `spod_get_valid_dates(2)`.",
          "\nYou can also set `ignore_missing_dates` argument when using `spod_get()`, `spod_convert()`, or `spod_download()` to TRUE to ignore missing dates in your requested date range. Use that with caution, as skipping the missing dates may lead to misleading results if you then intend to calculate monthly or weekly flows including the periods with missing data. Consider using imputation methods to fill in the missing data from the available dates."
        ))
      }
    }
  }
}

#' Function to expand dates from a regex
#'
#' This function generates a sequence of dates from a regular expression pattern based on the provided regular expression.
#'
#' @param date_regex A regular expression to match dates in the format 'yyyymmdd'.
#' @return A `character` vector of dates matching the regex.
#' @keywords internal
spod_expand_dates_from_regex <- function(date_regex) {
  all_dates_v1 <- spod_get_valid_dates(ver = 1)
  all_dates_v2 <- spod_get_valid_dates(ver = 2)

  # Filter dates matching the regex for both versions
  matching_dates_v1 <- all_dates_v1[grepl(
    date_regex,
    format(all_dates_v1, "%Y%m%d")
  )]
  matching_dates_v2 <- all_dates_v2[grepl(
    date_regex,
    format(all_dates_v2, "%Y%m%d")
  )]

  # if both vectors are empty, throw an error
  if (length(matching_dates_v1) == 0 && length(matching_dates_v2) == 0) {
    stop(paste0(
      "No matching dates found in the available data.",
      "\nThe valid dates range for v1 is: ",
      paste(spod_convert_dates_to_ranges(all_dates_v1), collapse = ", "),
      "\nThe valid dates range for v2 is: ",
      paste(spod_convert_dates_to_ranges(all_dates_v2), collapse = ", "),
      "."
    ))
  }
  # If checks above have passed, we can combine the matching dates as only one contains dates and the other is empty
  matching_dates <- sort(c(matching_dates_v1, matching_dates_v2))

  return(matching_dates)
}

#' Get valid dates for the specified data version
#'
#' @description
#'
#' `r lifecycle::badge("stable")`
#'
#' Get all metadata for requested data version and identify all dates available for download.
#'
#' @inheritParams spod_available_data
#'
#' @return A vector of type `Date` with all possible valid dates for the specified data version (v1 for 2020-2021 and v2 for 2020 onwards).
#' @export
#' @examplesIf interactive()
#' \donttest{
#' # Get all valid dates for v1 (2020-2021) data
#' spod_get_valid_dates(ver = 1)
#'
#' # Get all valid dates for v2 (2020 onwards) data
#' spod_get_valid_dates(ver = 2)
#' }
#'
spod_get_valid_dates <- function(ver = NULL) {
  # Validate input
  checkmate::assertIntegerish(ver, max.len = 1)
  if (!ver %in% c(1, 2)) {
    stop(
      "Invalid version number. Must be 1 (for v1 2020-2021 data) or 2 (for v2 2022 onwards)."
    )
  }

  if (ver == 1) {
    # available_data <- spod_available_data_v1(check_local_files = FALSE, quiet = TRUE)
    # all_dates <- unique(available_data[grepl("maestra1.*diarios", available_data$target_url),]$data_ymd, na.rm = TRUE)
    # perahps it is worth hardcoding at lest the v1 data range as it is unlikely to change at this point
    all_dates <- seq.Date(
      from = as.Date("2020-02-14"),
      to = as.Date("2021-05-09"),
      by = "day"
    )
  } else if (ver == 2) {
    available_data <- spod_available_data_v2(quiet = TRUE)
    all_dates <- unique(
      available_data[
        grepl("viajes.*diarios", available_data$target_url),
      ]$data_ymd,
      na.rm = TRUE
    )
  }
  all_dates <- sort(all_dates)
  return(all_dates)
}
# TODO: currently checks for date range for od data only. not all datasets may be available for all dates, so this function may need to be updated to check for the availability of the specific for the requested dates. spod_match_data_type() helper in the same file may be useful here.

#' Translate zone names from English to Spanish
#' @inheritParams spod_download
#' @return A `character` string with the translated zone name. Or `NULL` if the zone name is not recognized.
#' @keywords internal
spod_zone_names_en2es <- function(
  zones = c(
    "districts",
    "dist",
    "distr",
    "distritos",
    "municipalities",
    "muni",
    "municip",
    "municipios",
    "lua",
    "large_urban_areas",
    "gau",
    "grandes_areas_urbanas"
  )
) {
  zones <- tolower(zones)
  zones <- match.arg(zones)
  if (zones %in% c("districts", "dist", "distr", "distritos")) {
    return("distritos")
  } else if (zones %in% c("municipalities", "muni", "municip", "municipios")) {
    return("municipios")
  } else if (
    zones %in% c("lua", "large_urban_areas", "gau", "grandes_areas_urbanas")
  ) {
    return("gau")
  }
}

#' Match data types to folders
#' @inheritParams spod_available_data
#' @return A `character` string with the folder name for the specified data type. Or `NULL` if the data type is not recognized.
#' @keywords internal
spod_match_data_type_for_local_folders <- function(
  type = c(
    "od",
    "origin-destination",
    "os",
    "overnight_stays",
    "nt",
    "number_of_trips"
  ),
  ver = c(1, 2)
) {
  if (!ver %in% c(1, 2)) {
    stop("Invalid version number. Must be 1 or 2.")
  }

  type <- tolower(type)
  type <- match.arg(type)

  if (ver == 1) {
    if (type %in% c("od", "origin-destination")) {
      return("maestra1")
    } else if (type %in% c("nt", "number_of_trips")) {
      return("maestra2")
    }
  }

  if (ver == 2) {
    if (type %in% c("od", "origin-destination")) {
      return("viajes")
    } else if (type %in% c("os", "overnight_stays")) {
      return("pernoctaciones")
    } else if (type %in% c("nt", "number_of_trips")) {
      return("personas")
    }
  }

  # need to add a warning here that the type is not recognized
  return(NULL)
}


#' Match data types for normalisation
#' @param type The type of data to match. Can be "od", "origin-destination", "os", "overnight_stays", or "nt", "number_of_trips".
#' @return A `character` string with the folder name for the specified data type. Or `NULL` if the type is not recognized.
#' @keywords internal
spod_match_data_type <- function(
  type = c(
    "od",
    "origin-destination",
    "viajes",
    "os",
    "overnight_stays",
    "pernoctaciones",
    "nt",
    "number_of_trips",
    "personas"
  )
) {
  type <- tolower(type)
  type <- match.arg(type)

  if (type %in% c("od", "origin-destination", "viajes")) {
    return("od")
  } else if (type %in% c("os", "overnight_stays", "pernoctaciones")) {
    return("os")
  } else if (type %in% c("nt", "number_of_trips", "personas")) {
    return("nt")
  }

  # need to add a warning here that the type is not recognized
  return(NULL)
}

#' Remove duplicate values in a semicolon-separated string
#'
#' @description
#' Remove duplicate IDs in a semicolon-separated string in a selected column in a data frame
#' @param column A `character` vector column in a data frame to remove duplicates from.
#'
#' @return A `character` vector with semicolon-separated unique IDs.
#' @keywords internal
spod_unique_separated_ids <- function(column) {
  purrr::map_chr(
    column,
    ~ {
      unique_ids <- unique(stringr::str_split(.x, ";\\s*")[[1]]) # Split by semicolon and remove duplicates
      stringr::str_c(unique_ids, collapse = "; ") # Join them back with semicolons
    }
  )
}

#' Convert dates to ranges
#'
#' This internal helper function reduces a vector of dates to a vector of date ranges to shorten the warning and error messages that mention the valid date ranges.
#' @param dates A `character` vector of dates.
#' @importFrom rlang .data
#' @return A `character` vector of date ranges.
#' @keywords internal
#'
spod_convert_dates_to_ranges <- function(dates) {
  dates <- as.Date(dates) # Ensure dates are in Date format
  ranges <- tibble::tibble(date = dates) |>
    dplyr::arrange(date) |>
    dplyr::mutate(
      diff = c(0, diff(date)), # Calculate differences
      group = cumsum(diff != 1) # Create groups for consecutive ranges
    ) |>
    dplyr::group_by(.data$group) |>
    dplyr::summarise(
      start = dplyr::first(date),
      end = dplyr::last(date),
      .groups = "drop"
    )

  # Create a character vector of ranges
  range_strings <- ranges |>
    dplyr::mutate(range = paste(.data$start, "to", .data$end)) |>
    dplyr::pull(range)

  return(range_strings)
}

#' Get valid dates from the GraphQL API
#' @return A `Date` vector of dates that are valid to request data with \code{spod_quick_get_od()}.
#' @keywords internal
spod_graphql_valid_dates <- function() {
  graphql_endpoint <- getOption("spanishoddata.graphql_api_endpoint")

  # Define the GraphQL query
  graphql_query <- list(
    query = "query {dates_find_available_dates_basic{startDate endDate } }",
    variables = structure(list(), .Names = character(0)) # Empty named list so that it serialises to {} not []
  )

  # Generate signature for the query
  session_token_and_signature <- spod_session_token_and_signature()
  x_session_token <- session_token_and_signature$x_session_token
  x_signature <- session_token_and_signature$x_signature

  # Send the POST request
  req <- httr2::request(graphql_endpoint) |>
    httr2::req_headers(
      "Content-Length" = spod_request_length(graphql_query),
      "Content-Type" = "application/json",
      "User-Agent" = getOption("spanishoddata.user_agent"),
      "X-Session-Token" = x_session_token,
      "X-Signature" = x_signature
    ) |>
    httr2::req_body_json(graphql_query) |>
    httr2::req_progress(type = "down")

  resp <- req |> httr2::req_perform()

  # parse the response
  response_data <- httr2::resp_body_json(resp, simplifyVector = TRUE)
  dates_table <- response_data$data$dates_find_available_dates_basic |>
    dplyr::rename(start_date = .data$startDate, end_date = .data$endDate) |>
    dplyr::mutate(
      end_date = dplyr::if_else(
        condition = .data$end_date == "2024-09-31",
        true = "2024-09-30",
        false = .data$end_date
      )
    )

  # Convert start_date and end_date columns to Date class
  dates_table$start_date <- as.Date(dates_table$start_date)
  dates_table$end_date <- as.Date(dates_table$end_date)

  # Generate a single vector of all dates in the intervals
  # Generate all dates for each interval
  date_sequences <- mapply(
    seq.Date,
    from = dates_table$start_date,
    to = dates_table$end_date,
    by = "day"
  )

  # Flatten the list of date sequences into a single vector and remove duplicates
  dates <- as.Date(unique(unlist(date_sequences)))

  return(dates)
}

#' @importFrom memoise memoise
spod_graphql_valid_dates_memoised <- memoise::memoise(spod_graphql_valid_dates)


spod_session_token_and_signature <- function() {
  # in-memory session token
  session_token <- getOption("spanishoddata.session_token")
  if (is.null(session_token)) {
    session_token <- paste0("rand-", floor(stats::runif(1, 1e6, 1e9)))
    options(spanishoddata.session_token = session_token)
  }

  # get HMAC secret
  hmac_secret <- spod_get_hmac_secret_memoised()

  # compute the raw HMAC-SHA256 bytes
  raw_sig <- digest::hmac(
    key = hmac_secret,
    object = session_token,
    algo = "sha256",
    serialize = FALSE,
    raw = TRUE # <= return raw bytes, not hex
  )

  # base64-encode those raw bytes
  signature_b64 <- openssl::base64_encode(raw_sig)

  x_session_token_and_signature <- list(
    x_session_token = session_token,
    x_signature = signature_b64
  )

  return(x_session_token_and_signature)
}

#' Get the length of the request payload
#' @param graphql_query Character. The GraphQL query.
#' @return Numeric. The length of the request payload.
#' @keywords internal
spod_request_length <- function(graphql_query) {
  payload_json <- jsonlite::toJSON(graphql_query, auto_unbox = TRUE)
  content_length <- nchar(payload_json, type = "bytes")

  return(content_length)
}

# reworked is_installed2 from https://github.com/dataheld/elf/blob/main/R/dependencies.R
#' Checks if a package is installed and *informs* the user if not
#'
#' This is wrapper around [rlang::check_installed];
#' instead of erroring out if the check fails it returns `FALSE`.
#' However, unlike [rlang::is_installed], it emits a message to the user.
#'
#' @inheritParams rlang::check_installed
#' @inheritDotParams rlang::check_installed
#' @keywords internal
spod_assert_package <- function(...) {
  if (rlang::is_installed(...)) {
    return(TRUE)
  }

  withRestarts(
    tryCatch(
      rlang::check_installed(...),
      error = function(cnd) {
        if (inherits(cnd, "rlib_error_package_not_found")) {
          message("The required package is not installed.")
          stop(cnd) # Re-throw the error
        }
      }
    ),
    abort = function(cnd) {
      message("The required package is not installed.")
      stop(cnd) # Re-throw the error
    }
  )

  rlang::is_installed(...)
}