using-parquet.R
In riemtan: Riemannian Metrics for Symmetric Positive Definite Matrices

## ----include = FALSE----------------------------------------------------------
# Vignette-wide knitr chunk defaults: `collapse = TRUE` and `comment = "#>"`.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----eval=FALSE---------------------------------------------------------------
# install.packages("arrow")

## ----eval=FALSE---------------------------------------------------------------
# library(riemtan)
# library(Matrix)
# 
# # Create example connectomes (4x4 SPD matrices)
# set.seed(42)
# connectomes <- lapply(1:50, function(i) {
#   mat <- diag(4) + matrix(rnorm(16, 0, 0.1), 4, 4)
#   mat <- (mat + t(mat)) / 2  # Make symmetric
#   mat <- mat + diag(4) * 0.5  # Ensure positive definite
#   Matrix::pack(Matrix::Matrix(mat, sparse = FALSE))
# })
# 
# # Write to Parquet format
# write_connectomes_to_parquet(
#   connectomes,
#   output_dir = "my_connectomes",
#   subject_ids = paste0("subject_", 1:50),
#   provenance = list(
#     study = "Example Study",
#     acquisition_date = "2024-01-01",
#     preprocessing = "Standard pipeline v1.0"
#   )
# )

## ----eval=FALSE---------------------------------------------------------------
# # Detailed validation with verbose output
# validate_parquet_directory("my_connectomes", verbose = TRUE)

## ----eval=FALSE---------------------------------------------------------------
# # Load AIRM metric
# data(airm)
# 
# # Create Parquet backend (default cache size: 10 matrices)
# backend <- create_parquet_backend(
#   "my_connectomes",
#   cache_size = 10
# )
# 
# # Create CSample with the backend
# sample <- CSample$new(
#   backend = backend,
#   metric_obj = airm
# )
# 
# # Sample info
# print(paste("Sample size:", sample$sample_size))
# print(paste("Matrix dimension:", sample$matrix_size))

## ----eval=FALSE---------------------------------------------------------------
# # Compute tangent images
# sample$compute_tangents()
# 
# # Compute vectorized images
# sample$compute_vecs()
# 
# # Compute Frechet mean
# sample$compute_fmean(tol = 0.01, max_iter = 50)
# 
# # Center the sample
# sample$center()
# 
# # Compute variation
# sample$compute_variation()
# print(paste("Variation:", sample$variation))

## ----eval=FALSE---------------------------------------------------------------
# # Access specific connectome (loads from disk if not cached)
# conn_1 <- sample$connectomes[[1]]
# 
# # Access all connectomes (loads all from disk)
# all_conns <- sample$connectomes
# 
# # Cache management
# backend$get_cache_size()  # Check current cache usage
# backend$clear_cache()     # Clear cache to free memory

## ----eval=FALSE---------------------------------------------------------------
# # Create two samples with different backends
# backend1 <- create_parquet_backend("study1_connectomes")
# backend2 <- create_parquet_backend("study2_connectomes")
# 
# sample1 <- CSample$new(backend = backend1, metric_obj = airm)
# sample2 <- CSample$new(backend = backend2, metric_obj = airm)
# 
# # Create super sample
# super_sample <- CSuperSample$new(list(sample1, sample2))
# 
# # Gather all connectomes
# super_sample$gather()
# 
# # Compute statistics
# super_sample$compute_fmean()
# super_sample$compute_variation()

## ----eval=FALSE---------------------------------------------------------------
# # Traditional approach (all in memory)
# sample_memory <- CSample$new(
#   conns = connectomes,
#   metric_obj = airm
# )
# 
# # Parquet approach (lazy loading)
# backend <- create_parquet_backend("my_connectomes")
# sample_parquet <- CSample$new(
#   backend = backend,
#   metric_obj = airm
# )
# 
# # Both work identically
# sample_memory$compute_fmean()
# sample_parquet$compute_fmean()

## ----eval=FALSE---------------------------------------------------------------
# # Small cache (memory-constrained environments)
# backend_small <- ParquetBackend$new("my_connectomes", cache_size = 5)
# 
# # Large cache (memory-rich environments)
# backend_large <- ParquetBackend$new("my_connectomes", cache_size = 50)

## ----eval=FALSE---------------------------------------------------------------
# # Process in chunks
# n <- sample$sample_size
# batch_size <- 10
# 
# for (start in seq(1, n, by = batch_size)) {
#   end <- min(start + batch_size - 1, n)
# 
#   # Load batch
#   batch <- lapply(start:end, function(i) backend$get_matrix(i))
# 
#   # Process batch...
# 
#   # Clear cache to free memory
#   backend$clear_cache()
# }

## ----eval=FALSE---------------------------------------------------------------
# library(riemtan)
# 
# # Enable parallel processing (works on all platforms including Windows!)
# set_parallel_plan("multisession", workers = 4)
# 
# # Check status
# is_parallel_enabled()  # TRUE
# get_n_workers()        # 4
# 
# # Create Parquet-backed sample
# backend <- create_parquet_backend("large_dataset", cache_size = 20)
# sample <- CSample$new(backend = backend, metric_obj = airm)

## ----eval=FALSE---------------------------------------------------------------
# # Parallel tangent computations with progress bar
# sample$compute_tangents(progress = TRUE)   # 3-8x faster
# 
# # Parallel vectorization
# sample$compute_vecs(progress = TRUE)       # 2-4x speedup
# 
# # Parallel Frechet mean computation
# sample$compute_fmean(progress = TRUE)      # 2-5x faster for large samples

## ----eval=FALSE---------------------------------------------------------------
# # Load specific subset in batches
# subset_conns <- sample$load_connectomes_batched(
#   indices = 1:500,        # Load first 500 matrices
#   batch_size = 50,        # 50 matrices per batch
#   progress = TRUE         # Show progress
# )
# 
# # This loads 500 matrices in 10 batches, clearing cache between batches
# # Each batch is loaded in parallel for 5-10x speedup

## ----eval=FALSE---------------------------------------------------------------
# # Sequential (default if not configured)
# set_parallel_plan("sequential")
# system.time(sample$compute_tangents())  # Baseline
# 
# # Parallel with 4 workers
# set_parallel_plan("multisession", workers = 4)
# system.time(sample$compute_tangents())  # 3-4x faster
# 
# # Parallel with 8 workers
# set_parallel_plan("multisession", workers = 8)
# system.time(sample$compute_tangents())  # 6-8x faster

## ----eval=FALSE---------------------------------------------------------------
# # Install progressr for progress bars (optional)
# install.packages("progressr")
# 
# # Enable parallel processing
# set_parallel_plan("multisession", workers = 4)
# 
# # All operations support progress parameter
# sample$compute_tangents(progress = TRUE)
# sample$compute_vecs(progress = TRUE)
# sample$compute_fmean(progress = TRUE)
# 
# # Batch loading with progress
# conns <- sample$load_connectomes_batched(
#   indices = 1:1000,
#   batch_size = 100,
#   progress = TRUE  # Shows "Batch 1/10: loading matrices 1-100"
# )

## ----eval=FALSE---------------------------------------------------------------
# # Conservative (leave cores for system)
# set_parallel_plan("multisession", workers = parallel::detectCores() - 1)
# 
# # Maximum performance (use all cores)
# set_parallel_plan("multisession", workers = parallel::detectCores())

## ----eval=FALSE---------------------------------------------------------------
# # Each worker loads its own data copy
# # For 4 workers with 100 matrices of 200x200 and cache_size = 20, expect:
# # Memory ≈ workers × cache_size × matrix_size ≈ 4 × 20 × 320 KB ≈ 25 MB
# 
# # Use smaller cache with more workers
# backend <- create_parquet_backend("dataset", cache_size = 10)
# set_parallel_plan("multisession", workers = 8)

## ----eval=FALSE---------------------------------------------------------------
# # Compute with parallelization
# set_parallel_plan("multisession", workers = 4)
# sample$compute_tangents(progress = TRUE)
# sample$compute_fmean(progress = TRUE)
# 
# # Reset to free worker resources
# reset_parallel_plan()

## ----eval=FALSE---------------------------------------------------------------
# # Small dataset (n < 10): Uses sequential processing automatically
# small_sample <- CSample$new(conns = small_connectomes[1:5], metric_obj = airm)
# small_sample$compute_tangents()  # Sequential (no overhead)
# 
# # Large dataset (n >= 10): Uses parallel processing automatically
# large_sample <- CSample$new(backend = backend, metric_obj = airm)
# large_sample$compute_tangents()  # Parallel (if plan is active)

## ----eval=FALSE---------------------------------------------------------------
# # multisession: Works on all platforms (Windows, Mac, Linux)
# set_parallel_plan("multisession", workers = 4)
# 
# # multicore: Unix only, lower overhead
# set_parallel_plan("multicore", workers = 4)  # Auto-fallback to multisession on Windows
# 
# # cluster: For remote/distributed computing
# set_parallel_plan("cluster", workers = c("node1", "node2"))

## ----eval=FALSE---------------------------------------------------------------
# # Get metadata
# metadata <- backend$get_metadata()
# 
# # Access subject IDs
# subject_ids <- metadata$subject_ids
# 
# # Access provenance information
# provenance <- metadata$provenance
# print(provenance$study)
# print(provenance$preprocessing)

## ----eval=FALSE---------------------------------------------------------------
# write_connectomes_to_parquet(
#   connectomes,
#   output_dir = "custom_naming",
#   file_pattern = "conn_%03d.parquet"
# )
# 
# # Files will be named: conn_001.parquet, conn_002.parquet, ...

## ----eval=FALSE---------------------------------------------------------------
# # Use minimal cache
# backend <- ParquetBackend$new("large_dataset", cache_size = 3)
# 
# # Compute statistics without loading all matrices at once
# sample <- CSample$new(backend = backend, metric_obj = airm)
# sample$compute_tangents()
# sample$compute_vecs()
# 
# # Operations that don't need all matrices in memory
# sample$compute_fmean(batch_size = 32)  # Uses batching

## ----eval=FALSE---------------------------------------------------------------
# # Check cache usage
# cache_size <- backend$get_cache_size()
# print(paste("Cached matrices:", cache_size))
# 
# # Free memory when needed
# backend$clear_cache()

Any scripts or data that you put into this service are public.

riemtan documentation built on Nov. 11, 2025, 1:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

riemtan
Riemannian Metrics for Symmetric Positive Definite Matrices

inst/doc/using-parquet.R
In riemtan: Riemannian Metrics for Symmetric Positive Definite Matrices

Try the riemtan package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

riemtan Riemannian Metrics for Symmetric Positive Definite Matrices

inst/doc/using-parquet.R In riemtan: Riemannian Metrics for Symmetric Positive Definite Matrices

Try the riemtan package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

riemtan
Riemannian Metrics for Symmetric Positive Definite Matrices

inst/doc/using-parquet.R
In riemtan: Riemannian Metrics for Symmetric Positive Definite Matrices