R/data_read.R

Defines functions .get_data_path_suggestions .update_data_with_hash data_spec_get data_info .calculate_file_hash .get_data_record data_read_or_cache list_data data_list data_read

Documented in .calculate_file_hash data_info data_list data_read data_read_or_cache data_spec_get .get_data_path_suggestions .get_data_record .update_data_with_hash

#' Read data using dot notation path or direct file path
#'
#' Supports CSV, TSV, RDS, Excel (.xlsx, .xls), Stata (.dta), SPSS (.sav, .zsav, .por),
#' and SAS (.sas7bdat, .xpt) file formats.
#'
#' @param path Dot notation path (e.g. "source.private.example") or direct file path
#' @param delim Optional delimiter for CSV files ("comma", "tab", "semicolon", "space")
#' @param keep_attributes Logical flag to preserve special attributes (e.g., haven labels). Default: FALSE (strips attributes)
#' @param ... Additional arguments passed to read functions (readr::read_delim, readxl::read_excel, haven::read_*, etc.)
#' @return The loaded data, typically a data frame or tibble.
#' @export
data_read <- function(path, delim = NULL, keep_attributes = FALSE, ...) {
  # Validate arguments
  checkmate::assert_string(path, min.chars = 1)
  checkmate::assert_choice(delim, c("comma", "tab", "semicolon", "space", ",", "\t", ";", " "), null.ok = TRUE)
  checkmate::assert_flag(keep_attributes)

  # Check if path is a direct file path
  if (file.exists(path)) {
    # Direct file path - create a minimal spec
    file_ext <- tolower(sub(".*\\.", "", path))
    spec <- list(
      path = path,
      type = switch(file_ext,
        csv = "csv",
        tsv = "tsv",
        txt = "tsv",  # Assume .txt files are tab-delimited
        dat = "tsv",  # Assume .dat files are tab-delimited
        rds = "rds",
        xlsx = "excel",
        xls = "excel",
        dta = "stata",
        sav = "spss",
        zsav = "spss",
        por = "spss_por",
        sas7bdat = "sas",
        sas7bcat = "sas",
        xpt = "sas_xpt",
        parquet = "parquet",
        stop(sprintf("Unsupported file extension: %s", file_ext))
      ),
      locked = FALSE,
      delimiter = NULL
    )
  } else {
    # Try to get data specification from config
    spec <- tryCatch(
      data_info(path),
      error = function(e) {
        stop(sprintf("Path '%s' is not a valid file and failed to get data specification: %s", path, e$message))
      }
    )

    if (is.null(spec)) {
      # Get suggestions for similar paths
      suggestions <- .get_data_path_suggestions(path)

      error_msg <- sprintf("No data specification found for path: %s", path)
      if (length(suggestions) > 0) {
        error_msg <- paste0(
          error_msg,
          "\n\nAvailable data paths:\n  ",
          paste(suggestions, collapse = "\n  "),
          "\n\nTip: Use settings('data') to explore the full catalog, or narrow it down with settings('data.source.private')"
        )
      }
      stop(error_msg)
    }
  }

  # Check if file exists
  if (!file.exists(spec$path)) {
    stop(sprintf("File not found: %s", spec$path))
  }

  # Only do hash checking for config-based paths (not direct file paths)
  if (!file.exists(path)) {
    # Calculate current file hash
    current_hash <- tryCatch(
      .calculate_file_hash(spec$path),
      error = function(e) {
        stop(sprintf("Failed to calculate file hash: %s", e$message))
      }
    )

    # Get existing data record
    data_record <- tryCatch(
      .get_data_record(path),
      error = function(e) {
        warning(sprintf("Failed to get data record: %s", e$message))
        NULL
      }
    )

    # Handle hash checking
    if (is.null(data_record)) {
      # No record exists, create one with current hash
      tryCatch(
        .set_data(
          name = path,
          path = spec$path,
          type = spec$type,
          delimiter = if (!is.null(spec$delimiter)) spec$delimiter else NA,
          locked = if (!is.null(spec$locked)) spec$locked else FALSE,
          hash = current_hash
        ),
        error = function(e) {
          warning(sprintf("Failed to set data record: %s", e$message))
        }
      )
    } else {
      # Check if file has changed
      if (data_record$hash != current_hash) {
        if (spec$locked) {
          stop(sprintf(
            "Hash mismatch for locked data '%s'\nExpected hash: %s\nCurrent hash:  %s",
            path,
            substr(data_record$hash, 1, 12),
            substr(current_hash, 1, 12)
          ))
        } else {
          warning(sprintf(
            "File hash changed for '%s'\nPrevious hash: %s\nCurrent hash:  %s",
            path,
            substr(data_record$hash, 1, 12),
            substr(current_hash, 1, 12)
          ))
        }
      }
      # Update hash and metadata
      tryCatch(
        .set_data(
          name = path,
          path = spec$path,
          type = spec$type,
          delimiter = if (!is.null(spec$delimiter)) spec$delimiter else NA,
          locked = if (!is.null(spec$locked)) spec$locked else FALSE,
          hash = current_hash
        ),
        error = function(e) {
          warning(sprintf("Failed to update data record: %s", e$message))
        }
      )
    }
  }

  # Helper function to get delimiter character
  get_delimiter <- function(delimiter_name) {
    switch(delimiter_name,
      comma = ",",
      "," = ",",
      tab = "\t",
      "\t" = "\t",
      semicolon = ";",
      ";" = ";",
      space = " ",
      " " = " ",
      stop(sprintf("Unknown delimiter: %s. Must be one of: comma, tab, semicolon, space", delimiter_name))
    )
  }

  # Helper function to guess delimiter from extension
  guess_delimiter_from_ext <- function(path) {
    ext <- tolower(sub(".*\\.", "", path))
    switch(ext,
      csv = "comma",
      tsv = "tab",
      txt = "tab", # Common for tab-delimited
      dat = "tab", # Common for tab-delimited
      xlsx = NA,   # Excel files don't use delimiters
      xls = NA,    # Excel files don't use delimiters
      NULL
    )
  }

  # Determine delimiter (only for delimited file types)
  if (is.null(delim) && spec$type %in% c("csv", "tsv")) {
    # Use delimiter from spec if available, otherwise guess from extension
    if (is.null(spec$delimiter) || is.na(spec$delimiter)) {
      delim <- guess_delimiter_from_ext(spec$path)
    } else {
      delim <- spec$delimiter
    }

    if (is.null(delim)) {
      warning(sprintf("Could not determine delimiter from extension for %s, defaulting to comma", spec$path))
      delim <- "comma"
    }
  }

  # Helper to check for haven package
  require_haven <- function(file_type) {
    if (!requireNamespace("haven", quietly = TRUE)) {
      stop(
        sprintf("%s files require the haven package.\n", file_type),
        "Install with: install.packages('haven')"
      )
    }
  }

  # Helper to check for readxl package
  require_readxl <- function() {
    if (!requireNamespace("readxl", quietly = TRUE)) {
      stop(
        "Excel files require the readxl package.\n",
        "Install with: install.packages('readxl')"
      )
    }
  }

  require_arrow <- function() {
    if (!requireNamespace("arrow", quietly = TRUE)) {
      stop(
        "Parquet files require the arrow package.\n",
        "Install with: install.packages('arrow')"
      )
    }
  }

  # Load data
  data <- tryCatch(
    switch(spec$type,
      csv = {
        readr::read_delim(spec$path, show_col_types = FALSE, delim = get_delimiter(delim), ...)
      },
      tsv = {
        readr::read_delim(spec$path, show_col_types = FALSE, delim = "\t", ...)
      },
      rds = readRDS(spec$path),
      excel = ,
      xlsx = ,
      xls = {
        require_readxl()
        readxl::read_excel(spec$path, ...)
      },
      stata = ,
      dta = {
        require_haven("Stata")
        haven::read_dta(spec$path, ...)
      },
      spss = ,
      sav = ,
      zsav = {
        require_haven("SPSS")
        haven::read_sav(spec$path, ...)
      },
      spss_por = ,
      por = {
        require_haven("SPSS portable")
        haven::read_por(spec$path, ...)
      },
      sas = ,
      sas7bdat = ,
      sas7bcat = {
        require_haven("SAS")
        haven::read_sas(spec$path, ...)
      },
      sas_xpt = ,
      xpt = {
        require_haven("SAS transport")
        haven::read_xpt(spec$path, ...)
      },
      parquet = {
        require_arrow()
        arrow::read_parquet(spec$path, as_data_frame = TRUE, ...)
      },
      stop(sprintf("Unsupported file type: %s", spec$type))
    ),
    error = function(e) {
      stop(sprintf("Failed to load data: %s", e$message))
    }
  )

  # Strip haven attributes if requested (default behavior)
  if (!keep_attributes && spec$type %in% c("stata", "dta", "spss", "sav", "zsav", "spss_por", "por", "sas", "sas7bdat", "sas7bcat", "sas_xpt", "xpt")) {
    data <- haven::zap_formats(data)
    data <- haven::zap_labels(data)
    data <- haven::zap_label(data)
    data <- as.data.frame(data)
  }

  # Output description if available
  if (!is.null(spec$description) && nzchar(spec$description)) {
    message(sprintf("Note: %s: %s", path, spec$description))
  }

  data
}

#' List all data entries from config
#'
#' Lists all data specifications defined in the configuration, showing the
#' data key, path, type, and description (if available).
#'
#' @return A data frame with columns: name, path, type, locked, description
#' @export
#'
#' @examples
#' \donttest{
#' if (FALSE) {
#' # List all data entries
#' data_list()
#'
#' # Use the alias
#' list_data()
#' }
#' }
data_list <- function() {
  config <- settings_read()

  if (is.null(config$data) || length(config$data) == 0) {
    message("No data entries found in configuration")
    return(invisible(data.frame(
      name = character(),
      path = character(),
      type = character(),
      locked = logical(),
      description = character(),
      stringsAsFactors = FALSE
    )))
  }

  # Recursively extract all data entries
  extract_entries <- function(obj, prefix = "") {
    entries <- list()

    for (name in names(obj)) {
      item <- obj[[name]]
      full_name <- if (nzchar(prefix)) paste(prefix, name, sep = ".") else name

      if (is.list(item) && !is.null(item$path)) {
        # This is a data entry
        entries[[full_name]] <- list(
          name = full_name,
          path = item$path,
          type = if (!is.null(item$type)) item$type else NA_character_,
          locked = if (!is.null(item$locked)) item$locked else FALSE,
          description = if (!is.null(item$description)) item$description else NA_character_
        )
      } else if (is.list(item) && is.null(item$path)) {
        # This is a nested structure, recurse
        nested <- extract_entries(item, full_name)
        entries <- c(entries, nested)
      }
    }

    entries
  }

  entries_list <- extract_entries(config$data)

  if (length(entries_list) == 0) {
    message("No data entries found in configuration")
    return(invisible(NULL))
  }

  # Print formatted output
  message(sprintf("\n%d data %s found:\n", length(entries_list), if (length(entries_list) == 1) "entry" else "entries"))

  for (entry in entries_list) {
    # Name with type badge
    type_badge <- sprintf("[%s]", toupper(entry$type))
    message(sprintf("- %s %s", entry$name, type_badge))

    # Path
    message(sprintf("  Path: %s", entry$path))

    # Flags (locked)
    if (entry$locked) {
      message("  [locked]")
    }

    # Description (if available)
    if (!is.na(entry$description) && nzchar(entry$description)) {
      message(sprintf("  %s", entry$description))
    }

    message("")  # Blank line between entries
  }

  invisible(NULL)
}

# Backward compatibility alias (not exported - use data_list instead)
list_data <- function() {
  data_list()
}

#' Read data with caching (DEPRECATED)
#'
#' @description
#' `r lifecycle::badge("deprecated")`
#'
#' This function is deprecated. Use `cache_remember()` with `data_read()` instead:
#'
#' ```r
#' df <- cache_remember("my_data", data_read("source.private.example"))
#' ```
#'
#' @param path Dot notation path to load data (e.g. "source.private.example")
#' @param expire_after Optional expiration time in hours (default: from config)
#' @param refresh Optional boolean or function that returns boolean to force refresh
#' @return The loaded data, either from cache or file
#' @keywords internal
data_read_or_cache <- function(path, expire_after = NULL, refresh = FALSE) {
 .Deprecated(
    msg = paste0(
      "data_read_or_cache() is deprecated.\n",
      "Use cache_remember() with data_read() instead:\n",
      "  df <- cache_remember(\"my_data\", data_read(\"", path, "\"))"
    )
  )

  # Validate arguments
  checkmate::assert_string(path)
  checkmate::assert_number(expire_after, lower = 0, null.ok = TRUE)
  checkmate::assert(
    checkmate::check_flag(refresh),
    checkmate::check_function(refresh)
  )

  cache_key <- sprintf("data.%s", path)
  cache_remember(cache_key,
    {
      message(sprintf("Loading data from file: %s (cached as %s)", path, cache_key))
      data_read(path)
    },
    expire_after = expire_after,
    refresh = refresh
  )
}

#' Get a data value
#' @param name The data name
#' @return The data metadata (hash), or NULL if not found
#' @keywords internal
.get_data_record <- function(name) {
  con <- .get_db_connection()
  result <- DBI::dbGetQuery(
    con,
    "SELECT hash FROM data WHERE name = ?",
    list(name)
  )
  DBI::dbDisconnect(con)

  if (nrow(result) == 0) {
    return(NULL)
  }

  result
}

#' Calculate hash of a file
#' @param file_path Path to the file
#' @return The hash of the file as a character string
#' @keywords internal
.calculate_file_hash <- function(file_path) {
  # Read file content
  content <- readBin(file_path, "raw", n = file.size(file_path))
  # Calculate hash and convert to character
  as.character(openssl::sha256(content))
}

#' Get data specification from config
#'
#' Gets the data specification for a given dot notation path from settings.yml.
#' Supports dot notation (e.g., "source.private.example"), relative paths, and
#' absolute paths. Auto-detects file type from extension and applies intelligent
#' defaults for common formats.
#'
#' @param path Dot notation path (e.g. "source.private.example"), relative path,
#'   or absolute path to a data file
#'
#' @return A list with data specification including:
#'   \itemize{
#'     \item \code{path} - Full file path
#'     \item \code{type} - File type (csv, rds, stata, spss, sas, etc.)
#'     \item \code{delimiter} - Delimiter for CSV files (comma, tab, etc.)
#'     \item \code{locked} - Whether file is locked for integrity checking
#'     \item \code{private} - Whether file is in private data directory
#'     \item \code{description} - Optional description of the dataset (displayed when loading)
#'   }
#'
#' @examples
#' \donttest{
#' if (FALSE) {
#' # Get info from dot notation
#' info <- data_info("source.private.my_data")
#'
#' # Get info from file path
#' info <- data_info("data/public/example.csv")
#' }
#' }
#'
#' @export
data_info <- function(path) {
  # Validate arguments
  checkmate::assert_string(path, min.chars = 1)

  # Get config file path to resolve relative paths correctly
  config_file <- .get_settings_file()
  config_dir <- if (!is.null(config_file)) {
    # Normalize config_file to get absolute path, then take dirname
    dirname(normalizePath(config_file, winslash = "/", mustWork = TRUE))
  } else {
    getwd()
  }

  config <- settings_read()

  # Get file type info
  get_file_type_info <- function(path) {
    # For RDS files, we can be explicit
    if (grepl("\\.rds$", path, ignore.case = TRUE)) {
      return(list(type = "rds", delimiter = NULL))
    }

    # For TSV files
    if (grepl("\\.tsv$", path, ignore.case = TRUE)) {
      return(list(type = "csv", delimiter = "tab"))
    }

    # For CSV files
    if (grepl("\\.csv$", path, ignore.case = TRUE)) {
      return(list(type = "csv", delimiter = "comma"))
    }

    # Excel files
    if (grepl("\\.(xlsx|xls)$", path, ignore.case = TRUE)) {
      return(list(type = "excel", delimiter = NULL))
    }

    # Parquet files
    if (grepl("\\.parquet$", path, ignore.case = TRUE)) {
      return(list(type = "parquet", delimiter = NULL))
    }

    # Stata files
    if (grepl("\\.dta$", path, ignore.case = TRUE)) {
      return(list(type = "stata", delimiter = NULL))
    }

    # SPSS files
    if (grepl("\\.(sav|zsav)$", path, ignore.case = TRUE)) {
      return(list(type = "spss", delimiter = NULL))
    }

    if (grepl("\\.por$", path, ignore.case = TRUE)) {
      return(list(type = "spss_por", delimiter = NULL))
    }

    # SAS files
    if (grepl("\\.(sas7bdat|sas7bcat)$", path, ignore.case = TRUE)) {
      return(list(type = "sas", delimiter = NULL))
    }

    if (grepl("\\.xpt$", path, ignore.case = TRUE)) {
      return(list(type = "sas_xpt", delimiter = NULL))
    }

    # For everything else, let readr figure it out
    return(list(type = "csv", delimiter = NULL))
  }

  # Create base spec with defaults
  create_base_spec <- function(path) {
    type_info <- get_file_type_info(path)
    list(
      path = path,
      type = type_info$type,
      delimiter = type_info$delimiter,
      locked = FALSE,
      private = basename(dirname(path)) == "private"
    )
  }

  # Create spec with optional existing spec
  create_spec <- function(path, existing_spec = NULL) {
    # Normalize relative paths against config directory
    # (paths from config are relative to config file, not current directory)
    # Check for both Unix absolute (/...) and Windows absolute (C:/... or C:\...)
    is_absolute <- grepl("^/", path) || grepl("^[A-Za-z]:[/\\\\]", path)
    if (!is_absolute) {
      # Relative path - resolve against config directory
      full_path <- file.path(config_dir, path)
      # Normalize to handle .. and . in path
      path <- normalizePath(full_path, winslash = "/", mustWork = FALSE)
    }

    spec <- create_base_spec(path)
    if (!is.null(existing_spec)) {
      for (key in names(existing_spec)) {
        # Skip 'path' key - use the normalized path we calculated
        if (key != "path" && !is.null(existing_spec[[key]])) {
          spec[[key]] <- existing_spec[[key]]
        }
      }
    }
    spec
  }

  # Handle dot notation lookup
  get_dot_notation_spec <- function(parts, config) {
    current <- config$data
    config_spec <- current
    for (part in parts) {
      if (is.null(config_spec[[part]])) {
        return(NULL)
      }
      config_spec <- config_spec[[part]]
    }

    if (is.character(config_spec) && length(config_spec) == 1) {
      create_spec(config_spec)
    } else {
      create_spec(config_spec$path, config_spec)
    }
  }

  # Get spec based on path type
  get_spec_by_path_type <- function(path, config) {
    if (grepl("^/", path)) {
      # Handle absolute paths directly
      create_spec(path)
    } else if (grepl("[/\\\\]", path)) {
      # Handle relative paths by normalizing them against the working directory
      full_path <- normalizePath(file.path(getwd(), path), mustWork = FALSE)
      create_spec(full_path)
    } else {
      get_dot_notation_spec(strsplit(path, "\\.")[[1]], config)
    }
  }

  get_spec_by_path_type(path, config)
}

#' Get data specification (DEPRECATED)
#'
#' @description
#' `r lifecycle::badge("deprecated")`
#'
#' This function has been renamed to [data_info()]. Please use that instead.
#'
#' @param path Dot notation path or file path
#' @return A list with data specification
#' @keywords internal
data_spec_get <- function(path) {
  .Deprecated("data_info", msg = "data_spec_get() is deprecated. Use data_info() instead.")
  data_info(path)
}

#' Update data with hash in the data table
#' @param name The data name
#' @param hash The hash to store
#' @keywords internal
.update_data_with_hash <- function(name, hash) {
  con <- .get_db_connection()
  now <- lubridate::now()

  # Check if entry exists
  entry_exists <- DBI::dbGetQuery(
    con,
    "SELECT 1 FROM data WHERE name = ?",
    list(name)
  )

  if (nrow(entry_exists) > 0) {
    # Update existing entry
    DBI::dbExecute(
      con,
      "UPDATE data SET hash = ?, last_read_at = ?, updated_at = ? WHERE name = ?",
      list(hash, now, now, name)
    )
  } else {
    # Insert new entry
    DBI::dbExecute(
      con,
      "INSERT INTO data (name, hash, last_read_at, created_at, updated_at) VALUES (?, ?, ?, ?, ?)",
      list(name, hash, now, now, now)
    )
  }

  DBI::dbDisconnect(con)
}

#' Get suggestions for available data paths
#'
#' Helper function that extracts all available data paths from config.
#' Used to provide helpful suggestions when data_read() fails.
#'
#' @param attempted_path The path that the user tried (optional, for future fuzzy matching)
#' @return Character vector of available data paths
#' @keywords internal
.get_data_path_suggestions <- function(attempted_path = NULL) {
  config <- tryCatch(
    settings_read(),
    error = function(e) {
      return(character())
    }
  )

  if (is.null(config$data) || length(config$data) == 0) {
    return(character())
  }

  # Recursively extract all data entry paths
  extract_paths <- function(obj, prefix = "") {
    paths <- character()

    for (name in names(obj)) {
      item <- obj[[name]]
      full_name <- if (nzchar(prefix)) paste(prefix, name, sep = ".") else name

      if (is.list(item) && !is.null(item$path)) {
        # This is a data entry
        paths <- c(paths, full_name)
      } else if (is.list(item) && is.null(item$path)) {
        # This is a nested structure, recurse
        nested <- extract_paths(item, full_name)
        paths <- c(paths, nested)
      }
    }

    paths
  }

  all_paths <- extract_paths(config$data)

  # Limit to first 20 suggestions
  if (length(all_paths) > 20) {
    all_paths <- c(all_paths[1:20], sprintf("... and %d more", length(all_paths) - 20))
  }

  all_paths
}

Try the framework package in your browser

Any scripts or data that you put into this service are public.

framework documentation built on Feb. 18, 2026, 1:07 a.m.