R/duckdb-helpers.R

#' Creates a duckdb connection to origin-destination data
#'
#' This function creates a duckdb connection to the origin-destination data stored in CSV.gz files.
#'
#' @param con A duckdb connection object. If not specified, a new in-memory connection will be created.
#' @inheritParams spod_available_data
#' @inheritParams spod_download
#' @return A `duckdb` connection object with 2 views:
#'
#'  * `od_csv_raw` - a raw table view of all origin-destination CSV files previously cached in $SPANISH_OD_DATA_DIR
#'
#'  * `od_csv_clean` - a cleaned-up table view of `od_csv_raw` with column names and values translated and mapped to English. This still includes all cached data.
#'
#' The structure of the cleaned-up view `od_csv_clean` is as follows:
#'
#' \describe{
#'   \item{date}{\code{Date}. The full date of the trip, including year, month, and day.}
#'   \item{id_origin}{\code{factor}. The identifier for the origin location of the trip, formatted as a code (e.g., '01001_AM').}
#'   \item{id_destination}{\code{factor}. The identifier for the destination location of the trip, formatted as a code (e.g., '01001_AM').}
#'   \item{activity_origin}{\code{factor}. The type of activity at the origin location (e.g., 'home', 'work'). \strong{Note:} Only available for district level data.}
#'   \item{activity_destination}{\code{factor}. The type of activity at the destination location (e.g., 'home', 'other'). \strong{Note:} Only available for district level data.}
#'   \item{residence_province_ine_code}{\code{factor}. The province of residence for the group of individuals making the trip, encoded according to the INE classification. \strong{Note:} Only available for district level data.}
#'   \item{residence_province_name}{\code{factor}. The province of residence for the group of individuals making the trip (e.g., 'Cuenca', 'Girona'). \strong{Note:} Only available for district level data.}
#'   \item{hour}{\code{integer}. The time slot (the hour of the day) during which the trip started, represented as an integer (e.g., 0, 1, 2).}
#'   \item{distance}{\code{factor}. The distance category of the trip, represented as a code (e.g., '002-005' for 2-5 km).}
#'   \item{n_trips}{\code{double}. The number of trips taken within the specified time slot and distance.}
#'   \item{trips_total_length_km}{\code{double}. The total length of all trips in kilometers for the specified time slot and distance.}
#'   \item{year}{\code{double}. The year of the trip.}
#'   \item{month}{\code{double}. The month of the trip.}
#'   \item{day}{\code{double}. The day of the trip.}
#' }
#'
#' The structure of the original data in `od_csv_raw` is as follows:
#'
#' \describe{
#'   \item{fecha}{\code{Date}. The date of the trip, including year, month, and day.}
#'   \item{origen}{\code{character}. The identifier for the origin location of the trip, formatted as a character string (e.g., '01001_AM').}
#'   \item{destino}{\code{character}. The identifier for the destination location of the trip, formatted as a character string (e.g., '01001_AM').}
#'   \item{actividad_origen}{\code{character}. The type of activity at the origin location (e.g., 'casa', 'trabajo').}
#'   \item{actividad_destino}{\code{character}. The type of activity at the destination location (e.g., 'otros', 'trabajo').}
#'   \item{residencia}{\code{character}. The code representing the residence of the individual making the trip (e.g., '01') according to the official INE classification.}
#'   \item{edad}{\code{character}. The age of the individual making the trip. This data is actually filled with 'NA' values, which is why this column is removed in the cleaned-up and translated view described above.}
#'   \item{periodo}{\code{integer}. The time period during which the trip started, represented as an integer (e.g., 0, 1, 2).}
#'   \item{distancia}{\code{character}. The distance category of the trip, represented as a character string (e.g., '002-005' for 2-5 km).}
#'   \item{viajes}{\code{double}. The number of trips taken within the specified time period and distance.}
#'   \item{viajes_km}{\code{double}. The total length of all trips in kilometers for the specified time period and distance.}
#'   \item{day}{\code{double}. The day of the trip.}
#'   \item{month}{\code{double}. The month of the trip.}
#'   \item{year}{\code{double}. The year of the trip.}
#' }
#' @keywords internal
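#' @examples
#' \dontrun{
#' # A minimal usage sketch (internal function; assumes the v1 district
#' # origin-destination CSV files are already cached in $SPANISH_OD_DATA_DIR):
#' con <- spod_duckdb_od(zones = "districts", ver = 1)
#' DBI::dbGetQuery(con, "SELECT * FROM od_csv_clean LIMIT 5")
#' DBI::dbDisconnect(con, shutdown = TRUE)
#' }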
spod_duckdb_od <- function(
    con = DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:", read_only = FALSE),
    zones = c(
      "districts", "dist", "distr", "distritos",
      "municipalities", "muni", "municip", "municipios",
      "lua", "large_urban_areas", "gau", "grandes_areas_urbanas"
    ),
    ver = NULL,
    data_dir = spod_get_data_dir()
) {
  
  locale <- "en" # hardcode locale for now, but be ready to make it a function argument across the package #TODO

  ver <- as.integer(ver)
  if (length(ver) != 1 || !ver %in% c(1, 2)) {
    stop("Invalid version number. Must be 1 or 2.")
  }
  
  zones <- match.arg(zones)
  zones <- spod_zone_names_en2es(zones)

  # check that gau is not selected with ver = 1
  if (ver == 1 && zones == "gau") {
    stop("The 'gau' zones are not available in v1 data.")
  }

  if (ver == 1) {
    # use district files for v1 even when municipal data is requested,
    # to avoid the issues with the v1 municipality files described in detail in:
    # http://www.ekotov.pro/mitma-data-issues/issues/011-v1-tpp-mismatch-zone-ids-in-table-and-spatial-data.html
    # http://www.ekotov.pro/mitma-data-issues/issues/012-v1-tpp-district-files-in-municipality-folders.html
    # the decision was to use district data and aggregate it to replicate municipal data
    csv_folder <- paste0(
      data_dir, "/",
      spod_subfolder_raw_data_cache(ver = ver),
      "/maestra1-mitma-distritos",
      "/ficheros-diarios/"
    )
  } else if (ver == 2) {
    csv_folder <- paste0(
      data_dir, "/",
      spod_subfolder_raw_data_cache(ver = ver),
      "/estudios_basicos/por-", spod_zone_names_en2es(zones),
      "/viajes/ficheros-diarios/"
    )
  }

  # create view to the raw TXT/CSV.gz files
  DBI::dbExecute(
    con,
    spod_read_sql(glue::glue("v{ver}-od-{zones}-raw-csv-view.sql"))
  )

  # create ENUMs

  # zones ENUMs from unique ids of relevant zones
  spatial_data <- spod_get_zones(zones,
    ver = ver,
    data_dir = data_dir,
    quiet = TRUE
  )
  
  if (ver == 1) {
    unique_ids <- unique(spatial_data$id)
  } else if (ver == 2) {
    if (locale == "en") {
      unique_ids <- c("external", unique(spatial_data$id))
    } else if (locale == "es") {
      unique_ids <- c("externo", unique(spatial_data$id))
    }
  }
  
  DBI::dbExecute(
    con,
    dplyr::sql(
      paste0(
        "CREATE TYPE ZONES_ENUM AS ENUM ('",
        paste0(unique_ids, collapse = "','"),
        "');"
      )
    )
  )

  # create ACTIV_ENUM (for all zones except municipalities in v1 data)
  DBI::dbExecute(
    con,
    spod_read_sql(glue::glue("v{ver}-od-enum-activity-{locale}.sql"))
  )

  # create DISTANCE_ENUM
  DBI::dbExecute(
    con,
    spod_read_sql(glue::glue("v{ver}-od-enum-distance.sql"))
  )

  con <- spod_duckdb_create_province_enum(con)
  
  
  # v2 only enums
  if (ver == 2) {
    # income ENUM
    DBI::dbExecute(
      con,
      spod_read_sql(glue::glue("v{ver}-od-enum-income.sql"))
    )
    
    # age ENUM
    DBI::dbExecute(
      con,
      spod_read_sql(glue::glue("v{ver}-od-enum-age.sql"))
    )
    
    # sex ENUM
    DBI::dbExecute(
      con,
      spod_read_sql(glue::glue("v{ver}-od-enum-sex-{locale}.sql"))
    )
  }

  # create od_csv_clean view
  if (ver == 1 && zones == "municipios") {
    # this path is picked up via glue::glue() inside spod_read_sql() below if necessary
    relations_districts_municipalities <- here::here(
        data_dir,
        spod_subfolder_raw_data_cache(1),
        "relaciones_distrito_mitma.csv"
    )
  }
  DBI::dbExecute(
    con,
    spod_read_sql(glue::glue("v{ver}-od-{zones}-clean-csv-view-{locale}.sql"))
  )

  # return the connection as duckdb object
  return(con)
}

#' Create a duckdb number of trips table
#' 
#' @description
#' This function creates a duckdb connection to the number of trips data stored in a folder of CSV.gz files.
#' @inheritParams spod_duckdb_od
#' @inheritParams spod_available_data
#' @inheritParams spod_download
#' 
#' @return A `duckdb` connection object with 2 views:
#'
#'  * a raw table view of all cached CSV files with the number of trips data previously cached in $SPANISH_OD_DATA_DIR
#'
#'  * a cleaned-up table view of the raw view with column names and values translated and mapped to English. This still includes all cached data.
#' 
#' @keywords internal
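#' @examples
#' \dontrun{
#' # A minimal usage sketch (internal function; assumes the relevant
#' # number of trips CSV files are already cached):
#' con <- spod_duckdb_number_of_trips(zones = "districts", ver = 2)
#' DBI::dbListTables(con) # lists the raw and clean views
#' DBI::dbDisconnect(con, shutdown = TRUE)
#' }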
spod_duckdb_number_of_trips <- function(
  con = DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:", read_only = FALSE),
  zones = c(
    "districts", "dist", "distr", "distritos",
    "municipalities", "muni", "municip", "municipios",
    "lua", "large_urban_areas", "gau", "grandes_areas_urbanas"
  ),
  ver = NULL,
  data_dir = spod_get_data_dir()
) {
  
  locale <- "en" # TODO: add support for Spanish, hardcode for now

  ver <- as.integer(ver)
  if (length(ver) != 1 || !ver %in% c(1, 2)) {
    stop("Invalid version number. Must be 1 or 2.")
  }
  
  zones <- match.arg(zones)
  zones <- spod_zone_names_en2es(zones)

  # check that gau is not selected with ver = 1
  if (ver == 1 && zones == "gau") {
    stop("The 'gau' zones are not available in v1 data.")
  }

  if (ver == 1) {
    # use district files for v1 even when municipal data is requested,
    # to avoid the issues with the v1 municipality files described in detail in:
    # http://www.ekotov.pro/mitma-data-issues/issues/011-v1-tpp-mismatch-zone-ids-in-table-and-spatial-data.html
    # http://www.ekotov.pro/mitma-data-issues/issues/012-v1-tpp-district-files-in-municipality-folders.html
    # the decision was to use district data and aggregate it to replicate municipal data
    csv_folder <- paste0(
      data_dir, "/",
      spod_subfolder_raw_data_cache(ver = ver),
      "/maestra2-mitma-distritos",
      "/ficheros-diarios/"
    )
  } else if (ver == 2) {
    csv_folder <- paste0(
      data_dir, "/",
      spod_subfolder_raw_data_cache(ver = ver),
      "/estudios_basicos/por-", spod_zone_names_en2es(zones),
      "/personas/ficheros-diarios/"
    )
  }

  # create a view of the raw TXT/CSV.gz files with preset variable types
  DBI::dbExecute(
    con,
    spod_read_sql(glue::glue("v{ver}-nt-{zones}-raw-csv-view.sql"))
  )

  # create ENUMs

  # zones ENUMs from unique ids of relevant zones
  spatial_data <- spod_get_zones(zones,
    ver = ver,
    data_dir = data_dir,
    quiet = TRUE
  )
  
  unique_ids <- unique(spatial_data$id)

  DBI::dbExecute(
    con,
    dplyr::sql(
      paste0(
        "CREATE TYPE ZONES_ENUM AS ENUM ('",
        paste0(unique_ids, collapse = "','"),
        "');"
      )
    )
  )

  # create N_TRIPS_ENUM
  DBI::dbExecute(
    con,
    spod_read_sql(glue::glue("v{ver}-nt-enum-ntrips.sql"))
  )

  
  # v2 only enums
  if (ver == 2) {
    # age ENUM
    DBI::dbExecute(
      con,
      spod_read_sql(glue::glue("v{ver}-nt-enum-age.sql"))
    )

    # sex ENUM
    DBI::dbExecute(
      con,
      spod_read_sql(glue::glue("v{ver}-nt-enum-sex-{locale}.sql"))
    )
  }

  # create od_csv_clean view
  if (ver == 1 && zones == "municipios") {
    # this path is picked up via glue::glue() inside spod_read_sql() below if necessary
    relations_districts_municipalities <- here::here(
        data_dir,
        spod_subfolder_raw_data_cache(1),
        "relaciones_distrito_mitma.csv"
    )
  }
  DBI::dbExecute(
    con,
    spod_read_sql(glue::glue("v{ver}-nt-{zones}-clean-csv-view-{locale}.sql"))
  )

  return(con)
}

#' Create a duckdb overnight stays table
#' 
#' @description
#' This function creates a duckdb connection to the overnight stays data stored in a folder of CSV.gz files.
#' @inheritParams spod_duckdb_od
#' @inheritParams spod_available_data
#' @inheritParams spod_download
#' 
#' @return A `duckdb` connection object with 2 views:
#'
#'  * a raw table view of all cached CSV files with the overnight stays data previously cached in $SPANISH_OD_DATA_DIR
#'
#'  * a cleaned-up table view of the raw view with column names and values translated and mapped to English. This still includes all cached data.
#' 
#' @keywords internal
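#' @examples
#' \dontrun{
#' # A minimal usage sketch (internal function; v2 data only, assumes the
#' # overnight stays CSV files are already cached):
#' con <- spod_duckdb_overnight_stays(zones = "districts", ver = 2)
#' DBI::dbListTables(con) # lists the raw and clean views
#' DBI::dbDisconnect(con, shutdown = TRUE)
#' }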
spod_duckdb_overnight_stays <- function(
  con = DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:", read_only = FALSE),
  zones = c(
    "districts", "dist", "distr", "distritos",
    "municipalities", "muni", "municip", "municipios",
    "lua", "large_urban_areas", "gau", "grandes_areas_urbanas"
  ),
  ver = NULL,
  data_dir = spod_get_data_dir()
) {
  
  locale <- "en" # TODO: add support for Spanish, hardcode for now

  ver <- as.integer(ver)
  if (ver == 1) {
    stop("Overnight stays data is only available in v2 data (2022-01-01 onwards).")
  }
  
  zones <- match.arg(zones)
  zones <- spod_zone_names_en2es(zones)


  csv_folder <- paste0(
    data_dir, "/",
    spod_subfolder_raw_data_cache(ver = ver),
    "/estudios_basicos/por-", spod_zone_names_en2es(zones),
    "/pernoctaciones/ficheros-diarios/"
  )

  # create a view of the raw TXT/CSV.gz files with preset variable types
  DBI::dbExecute(
    con,
    spod_read_sql(glue::glue("v{ver}-os-{zones}-raw-csv-view.sql"))
  )

  # create ENUMs

  # zones ENUM for residence; residence zones are always reported at the "districts" level, regardless of the selected zones
  spatial_data_residence <- spod_get_zones("distr",
    ver = ver,
    data_dir = data_dir,
    quiet = TRUE
  )
  
  unique_ids_residence <- unique(spatial_data_residence$id)
  
  DBI::dbExecute(
    con,
    dplyr::sql(
      paste0(
        "CREATE TYPE RESID_ZONES_ENUM AS ENUM ('",
        paste0(unique_ids_residence, collapse = "','"),
        "');"
      )
    )
  )

  # zones ENUM for overnight stays; these always match the selected zones
  spatial_data_overnight <- spod_get_zones(zones,
    ver = ver,
    data_dir = data_dir,
    quiet = TRUE
  )

  unique_ids_overnight <- unique(spatial_data_overnight$id)
  
  DBI::dbExecute(
    con,
    dplyr::sql(
      paste0(
        "CREATE TYPE OVERNIGHT_ZONES_ENUM AS ENUM ('",
        paste0(unique_ids_overnight, collapse = "','"),
        "');"
      )
    )
  )


  DBI::dbExecute(
    con,
    spod_read_sql(glue::glue("v{ver}-os-{zones}-clean-csv-view-{locale}.sql"))
  )

  return(con)
}

#' Filter a duckdb connection by dates
#' 
#' @description
#' IMPORTANT: This function assumes that the table or view being filtered has separate `year`, `month` and `day` columns with integer values. This makes the filtering fast on CSV files that are stored in a hive-style folder structure such as `/year=2020/month=2/day=14/`.
#' 
#' @param con A duckdb connection.
#' @param source_view_name The name of the source duckdb "view" (the virtual table, in the context of the current package likely connected to a folder of CSV files).
#' @param new_view_name The name of the new duckdb "view" to create with the date filter applied.
#' @inheritParams spod_dates_argument_to_dates_seq
#' @keywords internal
#' @return A `duckdb` connection with the original views and a new filtered view.
#' 
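#' @examples
#' \dontrun{
#' # Sketch: expose two days of data as a new filtered view (assumes `con`
#' # already has an `od_csv_clean` view with integer `year`, `month` and
#' # `day` columns):
#' con <- spod_duckdb_filter_by_dates(
#'   con,
#'   source_view_name = "od_csv_clean",
#'   new_view_name = "od_csv_clean_filtered",
#'   dates = seq.Date(as.Date("2020-02-14"), as.Date("2020-02-15"), by = "day")
#' )
#' }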
spod_duckdb_filter_by_dates <- function(con, source_view_name, new_view_name, dates) {
  # prepare query to filter by dates
  query <- dplyr::sql(
    glue::glue(
      'CREATE VIEW "{new_view_name}" AS SELECT * FROM "{source_view_name}" ',
      spod_sql_where_dates(dates),
      ";"
    )
  )

  # create a view with a filter to the desired dates;
  # use dbExecute rather than dbSendStatement so the result is cleared
  DBI::dbExecute(con, query)

  return(con)
}

#' Create province names ENUM in a duckdb connection
#' @param con A `duckdb` connection.
#' @return A `duckdb` connection with `INE_PROV_NAME_ENUM` and `INE_PROV_CODE_ENUM` created.
#' @keywords internal
#' 
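#' @examples
#' \dontrun{
#' # Sketch: add the province ENUM types to an existing connection so that
#' # later views can cast province columns to factor-like ENUM values
#' con <- spod_duckdb_create_province_enum(con)
#' }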
spod_duckdb_create_province_enum <- function(con) {
  # LOAD SQL STATEMENT to create province names ENUM
  province_names_enum_sql <- readLines(
    system.file("extdata/sql-queries/province_names_enum.sql",
      package = "spanishoddata"
    )
  ) |>
    paste(collapse = "\n") |> 
    dplyr::sql()

  # create INE_PROV_NAME_ENUM
  DBI::dbExecute(
    con,
    province_names_enum_sql
  )

  # also create INE encoded INE_PROV_CODE_ENUM
  
  # format codes 1:52 as 2 digits with leading zeros
  ine_codes <- sprintf("%02d", 1:52)

  # create INE_PROV_CODE_ENUM
  DBI::dbExecute(
    con,
    dplyr::sql(
      paste0(
        "CREATE TYPE INE_PROV_CODE_ENUM AS ENUM ('",
        paste0(ine_codes, collapse = "','"),
        "');"
      )
    )
  )

  return(con)
}

#' Generate the WHERE part of an SQL query from a sequence of dates
#' @param dates A `Date` vector of dates to process.
#' @return A character string with the SQL WHERE clause.
#' @keywords internal
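#' @examples
#' # A worked example of the generated clause (dates are illustrative;
#' # the result is a single string, wrapped here for readability):
#' # spod_sql_where_dates(as.Date(c("2020-02-14", "2020-02-15", "2020-03-01")))
#' # "WHERE (year = 2020 AND month = 02 AND day IN (14, 15)) OR
#' #  (year = 2020 AND month = 03 AND day IN (01))"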
spod_sql_where_dates <- function(dates) {
  # Extract unique year, month, and day combinations from the dates
  date_parts <- data.frame(
    year = format(dates, "%Y"),
    month = format(dates, "%m"),
    day = format(dates, "%d")
  )

  # Get distinct rows and sort them by year, month, and day
  date_parts <- date_parts[!duplicated(date_parts), ]
  date_parts <- date_parts[order(date_parts$year, date_parts$month, date_parts$day), ]

  # Create the WHERE conditions for each unique date
  where_conditions <- stats::aggregate(day ~ year + month, data = date_parts, FUN = function(x) paste(x, collapse = ", "))
  where_conditions$condition <- paste0(
    "(year = ", where_conditions$year,
    " AND month = ", where_conditions$month,
    " AND day IN (", where_conditions$day, "))"
  )

  # Combine all conditions into a single WHERE clause
  sql_query <- paste0(
    "WHERE ",
    paste(where_conditions$condition, collapse = " OR ")
  )

  return(sql_query)
}

#' Set maximum memory and number of threads for a `DuckDB` connection
#' @param con A `duckdb` connection
#' @param max_mem_gb The maximum memory to use in GB. Defaults to the available RAM minus 4 GB, but no less than 4 GB; this should be enough for re-saving the data to `DuckDB` from a folder of CSV.gz files while leaving memory for the rest of the system. For data analysis on the already converted data (in `DuckDB` or Parquet format) or on the raw CSV.gz data, it is recommended to increase it according to the available resources.
#' @param max_n_cpu The maximum number of threads to use. Defaults to the number of available cores minus 1, but no less than 1.
#' @return A `duckdb` connection.
#' @keywords internal
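#' @examples
#' \dontrun{
#' # Sketch: cap a connection at 8 GB of memory and 4 threads
#' con <- spod_duckdb_limit_resources(con, max_mem_gb = 8, max_n_cpu = 4)
#' }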
spod_duckdb_limit_resources <- function(
  con,
  max_mem_gb = max(4, spod_available_ram() - 4),
  max_n_cpu = max(1, parallelly::availableCores() - 1)
) {
  DBI::dbExecute(
    con,
    dplyr::sql(
      glue::glue("SET max_memory='{max_mem_gb}GB';")
    )
  )

  DBI::dbExecute(
    con,
    dplyr::sql(
      glue::glue("SET threads='{max_n_cpu}';")
    )
  )

  return(con)
}

#' Load an SQL query, glue it, dplyr::sql it
#' @description
#' Load an SQL query from a file in the package installation directory, collapse it into a single string with `glue::glue_collapse()`, interpolate any variables from the calling environment with `glue::glue()`, and wrap it with `dplyr::sql()` for additional safety.
#' 
#' @param sql_file_name The name of the SQL file to load from the package installation directory.
#' @return Text of the SQL query of class `sql`/`character`.
#' @keywords internal
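#' @examples
#' \dontrun{
#' # Sketch: any {placeholders} in the SQL template are interpolated from the
#' # caller's environment, e.g. `csv_folder` set before the call:
#' csv_folder <- "/path/to/cached/csv/files/" # hypothetical path
#' sql <- spod_read_sql("v1-od-distritos-raw-csv-view.sql")
#' }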
spod_read_sql <- function(sql_file_name) {
  sql_file_path <- glue::glue("extdata/sql-queries/{sql_file_name}")
  
  sql_query <- readLines(
    system.file(sql_file_path, package = "spanishoddata")) |> 
    glue::glue_collapse(sep = "\n") |> 
    glue::glue(.envir = parent.frame()) |> 
    dplyr::sql()
  
  return(sql_query)
}

#' Set temp file for DuckDB connection
#' @param con A duckdb connection
#' @param temp_path The path to the temp folder DuckDB uses for \href{https://duckdb.org/2024/07/09/memory-management.html#intermediate-spilling}{intermediate spilling} when the memory limit and/or the physical memory of the computer is too low to perform a query. By default this is set to the `temp` directory inside the data folder defined by the SPANISH_OD_DATA_DIR environment variable. Without this setting, DuckDB would spill to the current R working directory, which is usually undesirable: the working directory may be on slower storage, or storage with less free space, than the data folder.
#' @return A `duckdb` connection.
#' @keywords internal
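#' @examples
#' \dontrun{
#' # Sketch: point DuckDB intermediate spilling at a custom temp folder
#' con <- spod_duckdb_set_temp(con, temp_path = tempdir())
#' }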
spod_duckdb_set_temp <- function(
  con,
  temp_path = spod_get_temp_dir()
) {
  DBI::dbExecute(
    con,
    dplyr::sql(
      glue::glue("SET temp_directory='{temp_path}';")
    )
  )

  return(con)
}
