knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE )
library(eyeris) library(dplyr) library(DBI)
The eyeris package provides powerful database functionality that serves as an alternative to CSV file storage for your pupillometry data. This guide will walk you through the complete database workflow, from creation to data extraction, and explain why you might want to choose database storage over traditional CSV files.
1. Scalability and Performance
R, databases can filter and aggregate data at the storage level, dramatically reducing memory usage and processing time.2. Cloud Computing Advantages
3. Data Integrity and Management
4. Analysis Efficiency
SQL queries: Leverage the power of SQL for complex filtering, aggregation, and joins.DuckDB databases work across R, Python, and other analysis platforms.Use CSV files when:
Use databases when:
eyeris DatabaseThe simplest way to create an eyeris database is by enabling database output during the bidsify() process:
# preprocess and epoch your data with eyeris glassbox functions # Note: Pass the file path directly to glassbox() - eyeris handles .asc reading internally processed_data <- "path/to/your/data.asc" %>% glassbox() %>% epoch( events = "TRIAL_START_{trial_type}_{trial_number}", limits = c(-0.5, 2.0), label = "trial_epochs" ) # Enable eyeris database alongside CSV files bidsify( processed_data, bids_dir = "~/my_eyetracking_study", participant_id = "001", session_num = "01", task_name = "memory_task", csv_enabled = TRUE, # still create CSV files db_enabled = TRUE, # while also creating your eyeris project database db_path = "study_database" # creates study_database.eyerisdb )
For cloud computing or large-scale processing, you can (and probably should) considering skipping CSV creation entirely:
bidsify( processed_data, bids_dir = "~/my_eyetracking_study", participant_id = "001", session_num = "01", task_name = "memory_task", csv_enabled = FALSE, # skip CSV creation db_enabled = TRUE, # use an eyeris project database only db_path = "study_database" )
Here's how to efficiently process multiple subjects into a single database:
subjects <- c("001", "002", "003", "004", "005") data_dir <- "~/raw_eyetracking_data" bids_dir <- "~/processed_study_data" for (subject_id in subjects) { cat("Processing subject", subject_id, "\n") subject_data <- file.path( data_dir, paste0("sub-", subject_id), "eye", paste0("sub-", subject_id, ".asc") ) %>% glassbox() %>% epoch( events = "STIMULUS_{condition}_{trial}", limits = c(-1, 3), label = "stimulus_response" ) # then add to eyeris database (which automatically handles subject cleanup) bidsify( subject_data, bids_dir = bids_dir, participant_id = subject_id, session_num = "01", task_name = "attention_task", csv_enabled = FALSE, db_enabled = TRUE, db_path = "attention_study_db" ) }
eyeris Databasescon <- eyeris_db_connect( bids_dir = "~/processed_study_data", db_path = "attention_study_db" # will look for attention_study_db.eyerisdb ) # be sure to always disconnect when done (or use on.exit to ensure cleanup) on.exit(eyeris_db_disconnect(con))
eyeris Database Overview and Exploration# first get a comprehensive summary of your eyeris project database summary_info <- eyeris_db_summary( "~/processed_study_data", "attention_study_db" ) summary_info$subjects # all subjects in database summary_info$data_types # available data types summary_info$sessions # session information summary_info$tasks # task names summary_info$total_tables # total number of tables # list all available tables all_tables <- eyeris_db_list_tables(con) print(all_tables) # filter tables by data type timeseries_tables <- eyeris_db_list_tables(con, data_type = "timeseries") confounds_tables <- eyeris_db_list_tables(con, data_type = "run_confounds") # filter tables by subject subject_001_tables <- eyeris_db_list_tables(con, subject = "001")
The eyeris_db_collect() function provides easy one-liner access to your data:
# extract ALL data for ALL subjects (returns a named list) all_data <- eyeris_db_collect("~/processed_study_data", "attention_study_db") # view available data types names(all_data) # access specific data types timeseries_data <- all_data$timeseries events_data <- all_data$events confounds_data <- all_data$run_confounds
# extract data for specific subjects only subset_subjects <- eyeris_db_collect( bids_dir = "~/processed_study_data", db_path = "attention_study_db", subjects = c("001", "002", "003") ) # extract specific data types only behavioral_data <- eyeris_db_collect( bids_dir = "~/processed_study_data", db_path = "attention_study_db", data_types = c("events", "epochs", "confounds_summary") ) # extract data for specific sessions and tasks session_01_data <- eyeris_db_collect( bids_dir = "~/processed_study_data", db_path = "attention_study_db", sessions = "01", tasks = "attention_task" )
# extract data from both eyes binocular_data <- eyeris_db_collect( bids_dir = "~/processed_study_data", db_path = "attention_study_db" ) # the function automatically combines left and right eye data # check if you have binocular data unique(binocular_data$timeseries$eye_suffix) # should show "eyeL" and "eyeR" # extract data for specific eye only left_eye_data <- eyeris_db_collect( bids_dir = "~/processed_study_data", db_path = "attention_study_db", eye_suffixes = "eyeL" )
# extract specific epoch data trial_epochs <- eyeris_db_collect( bids_dir = "~/processed_study_data", db_path = "attention_study_db", data_types = c("epochs", "confounds_events", "confounds_summary"), epoch_labels = "stimulus_response" # match your epoch label ) # multiple epoch types multiple_epochs <- eyeris_db_collect( bids_dir = "~/processed_study_data", db_path = "attention_study_db", data_types = "epochs", epoch_labels = c("stimulus_response", "baseline_period") )
list_format <- eyeris_db_collect("~/processed_study_data") # access individual data types pupil_data <- list_format$timeseries trial_data <- list_format$epochs
For maximum flexibility, you can also write and execute custom SQL queries:
# first connect to your eyeris project database con <- eyeris_db_connect("~/processed_study_data", "attention_study_db") # write your custom SQL query custom_query <- " SELECT subject_id, session_id, task_name, AVG(pupil_raw_deblink_detransient_interpolate_lpfilt_z) as mean_pupil, COUNT(*) as n_samples FROM timeseries_001_01_attention_task_run01_eyeL WHERE pupil_clean IS NOT NULL GROUP BY subject_id, session_id, task_name " results <- DBI::dbGetQuery(con, custom_query) print(results) # a complex cross-table query complex_query <- " SELECT e.subject_id, e.matched_event, e.text_unique, AVG(t.pupil_raw_deblink_detransient_interpolate_lpfilt_z) as mean_pupil_in_epoch, c.blink_rate_hz FROM epochs_001_01_attention_task_run01_stimulus_response_eyeL e JOIN timeseries_001_01_attention_task_run01_eyeL t ON e.subject_id = t.subject_id AND t.time_orig BETWEEN e.epoch_start AND e.epoch_end JOIN run_confounds_001_01_attention_task_run01_eyeL c ON e.subject_id = c.subject_id GROUP BY e.subject_id, e.matched_event, e.text_unique, c.blink_rate_hz " complex_results <- DBI::dbGetQuery(con, complex_query) print(complex_results)
# read a specific table directly specific_table <- eyeris_db_read( con = con, table_name = "timeseries_001_01_attention_task_run01_eyeL" ) # read from eyeris database with filters filtered_data <- eyeris_db_read( con = con, data_type = "events", subject = "001", session = "01", task = "attention_task" ) # read epoch data from eyeris database with specific epoch label epoch_data <- eyeris_db_read( con = con, data_type = "epochs", subject = "001", epoch_label = "stimulus_response" )
# extract all timeseries data pupil_data <- eyeris_db_collect( "~/processed_study_data", data_types = "timeseries" )$timeseries # analyze pupil responses by subject and condition pupil_summary <- pupil_data %>% filter(!is.na(pupil_clean)) %>% group_by(subject_id, session_id) %>% summarise( mean_pupil = mean(pupil_clean), sd_pupil = sd(pupil_clean), samples_per_subject = n(), .groups = 'drop' ) print(pupil_summary) # compare to loading individual CSV files (which should be much slower!) # csv_files <- list.files("~/processed_study_data", # pattern = "*timeseries*.csv", # recursive = TRUE, full.names = TRUE) # csv_data <- map_dfr(csv_files, read_csv)
eyeris reports the proportion of missing (NA) pupil samples as a
prop_missing column (and its raw count, n_missing) at two levels: per
recording block (in run_confounds) and per epoched event/trial (in
confounds_events). eyeris does not apply a fixed missing-data exclusion
cutoff for you. Instead, these columns are exposed so you can define your own
exclusion thresholds at whichever level (block, epoch, or trial) is appropriate
for your design.
# extract confounds data for quality control confounds_data <- eyeris_db_collect( "~/processed_study_data", data_types = c("run_confounds", "confounds_events") ) # block-level QC: identify subjects with poor data quality # (all thresholds below are illustrative -- choose values appropriate for your # study; nothing here is enforced by eyeris) quality_control <- confounds_data$run_confounds %>% group_by(subject_id, session_id) %>% summarise( mean_blink_rate = mean(blink_rate_hz, na.rm = TRUE), mean_prop_missing = mean(prop_missing, na.rm = TRUE), mean_prop_invalid = mean(prop_invalid, na.rm = TRUE), mean_gaze_variance = mean(gaze_x_var_px, na.rm = TRUE), .groups = 'drop' ) %>% mutate( high_blink_rate = mean_blink_rate > 0.5, # user-defined thresholds high_missing_data = mean_prop_missing > 0.5, high_invalid_data = mean_prop_invalid > 0.3, high_gaze_variance = mean_gaze_variance > 10000, exclude_subject = high_blink_rate | high_missing_data | high_invalid_data | high_gaze_variance ) # then view the subjects you have flagged for exclusion exclude_list <- quality_control %>% filter(exclude_subject) %>% select(subject_id, session_id, exclude_subject) print(exclude_list) # trial-level QC: flag individual epochs/trials to drop using your own # missing-data threshold (e.g., here, > 50% missing on the deblinked signal) trial_exclusions <- confounds_data$confounds_events %>% filter(grepl("deblink", step)) %>% mutate(exclude_trial = prop_missing > 0.5) %>% # user-defined threshold select( subject_id, session_id, run_number, epoch_label, matched_event, n_samples, n_missing, prop_missing, exclude_trial ) print(trial_exclusions)
# benchmark database approach system.time({ db_data <- eyeris_db_collect( "~/processed_study_data", subjects = c("001", "002", "003", "004", "005"), data_types = "timeseries" ) }) # benchmark CSV approach # system.time({ # csv_files <- list.files("~/processed_study_data", # pattern = "*timeseries*.csv", # recursive = TRUE, full.names = TRUE) # csv_data <- map_dfr(csv_files[1:5], read_csv) # only first 5 subjects # }) # memory usage comparison object.size(db_data) # database extraction # object.size(csv_data) # CSV loading # file size comparison db_file_size <- file.size("~/processed_study_data/derivatives/attention_study_db.eyerisdb") csv_total_size <- sum(file.size(list.files("~/processed_study_data", pattern = "*.csv", recursive = TRUE, full.names = TRUE))) cat("Database file size:", round(db_file_size / 1024^2, 2), "MB\n") cat("Total CSV file size:", round(csv_total_size / 1024^2, 2), "MB\n") cat("Storage efficiency:", round(db_file_size / csv_total_size * 100, 1), "% of CSV size\n")
# 1. Always use descriptive database names bidsify(data, db_path = "study_name_pilot_2024") # good # bidsify(data, db_path = "my-project") # default, not descriptive # 2. Use database-only mode for large studies bidsify(data, csv_enabled = FALSE, db_enabled = TRUE) # efficient # 3. Create separate databases for different experiments bidsify(data, db_path = "experiment_1_attention") bidsify(data, db_path = "experiment_2_memory") # 4. always disconnect from databases con <- eyeris_db_connect("~/data", "study_db") # ... do your work ... # then disconnect ... eyeris_db_disconnect(con) # or use on.exit for automatic cleanup process_data <- function() { con <- eyeris_db_connect("~/data", "study_db") on.exit(eyeris_db_disconnect(con)) # ... your analysis code here ... results <- eyeris_db_collect("~/data", "study_db") return(results) }
# 1. Use database-only workflow to minimize I/O costs process_cloud_data <- function(subject_list, input_bucket, output_bucket) { for (subject in subject_list) { # ... for demo purposes only -- download raw data ... download_from_cloud(subject, input_bucket) # ... process and add to database (no CSV files) ... eyeris_data <- glassbox(local_file) %>% epoch(...) bidsify( eyeris_data, bids_dir = "local_processing", participant_id = subject, csv_enabled = FALSE, # skip CSV for cloud efficiency db_enabled = TRUE, db_path = "cloud_study_db" ) # clean up local files unlink(local_file) } # upload final database back to your cloud upload_to_cloud("cloud_study_db.eyerisdb", output_bucket) } # 2. use database for distributed analysis analyze_cloud_data <- function() { # download only the database file download_from_cloud("cloud_study_db.eyerisdb") # extract only the data you need analysis_data <- eyeris_db_collect( "local_processing", data_types = c("epochs", "confounds_summary"), subjects = target_subjects ) # run analysis on extracted subset results <- run_statistical_analysis(analysis_data) return(results) }
# safe eyeris project database operations with error handling safe_extract <- function(bids_dir, db_path, ...) { tryCatch({ data <- eyeris_db_collect(bids_dir, db_path, ...) return(data) }, error = function(e) { cat("Error extracting data:", e$message, "\n") # first check if eyeris project database exists db_file <- file.path(bids_dir, "derivatives", paste0(db_path, ".eyerisdb")) if (!file.exists(db_file)) { cat("eyeris project database file not found:", db_file, "\n") return(NULL) } # try connecting to your eyeris project database con <- tryCatch({ eyeris_db_connect(bids_dir, db_path) }, error = function(e2) { cat("Cannot connect to eyeris project database:", e2$message, "\n") return(NULL) }) if (!is.null(con)) { # list available tables for debugging tables <- eyeris_db_list_tables(con) cat("Available tables:\n") print(tables) eyeris_db_disconnect(con) } return(NULL) }) } # example usage data <- safe_extract("~/my_study", "study_database", subjects = c("001", "002"), data_types = "timeseries")
# if you have existing eyeris-derived CSV files and want to migrate to a database migrate_csv_to_database <- function(bids_dir, db_path) { # ... find all CSV files ... csv_files <- list.files(bids_dir, pattern = "\\.csv$", recursive = TRUE, full.names = TRUE) # ... connect to an eyeris database ... con <- eyeris_db_connect(bids_dir, db_path) on.exit(eyeris_db_disconnect(con)) for (csv_file in csv_files) { cat("Processing:", basename(csv_file), "\n") # parse filename to extract metadata # ... (which of course depends on your CSV naming convention) ... filename_parts <- parse_bids_filename(basename(csv_file)) # ... read CSV data ... csv_data <- read.csv(csv_file) # ... then write to the eyeris project database ... write_eyeris_data_to_db( data = csv_data, con = con, data_type = filename_parts$data_type, sub = filename_parts$subject, ses = filename_parts$session, task = filename_parts$task, # ... other parameters ) } cat("Migration complete!\n") }
# export specific data back to CSV format (if needed) export_database_subset <- function(bids_dir, db_path, output_dir) { # ... extract data from the eyeris project database ... data <- eyeris_db_collect(bids_dir, db_path, subjects = c("001", "002"), data_types = c("timeseries", "events")) # ... create an output directory ... dir.create(output_dir, recursive = TRUE) # ... then export each data type ... for (data_type in names(data)) { filename <- file.path(output_dir, paste0(data_type, "_subset.csv")) write.csv(data[[data_type]], filename, row.names = FALSE) cat("Exported:", filename, "\n") } }
The eyeris database functionality provides a powerful, scalable alternative to CSV files for eye-tracking data storage and analysis. Key advantages include:
SQL queries and cross-platform compatibilityWhether you're running a small pilot study or a large-scale multi-session experiment, eyeris databases can streamline your workflow and improve analysis efficiency. Start with the simple eyeris_db_collect() function for most use cases, and leverage direct SQL queries when you need maximum flexibility.
For questions or issues with database functionality, please refer to the eyeris documentation or open an issue on the GitHub repository.
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.