# Nothing
## ----include = FALSE----------------------------------------------------------
# Vignette-wide knitr chunk defaults: `collapse` merges each chunk's source and
# its text output into a single block, and `comment` sets the prefix printed
# before every output line.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
## ----eval=FALSE---------------------------------------------------------------
# install.packages("arrow")
## ----eval=FALSE---------------------------------------------------------------
# library(riemtan)
# library(Matrix)
#
# # Create example connectomes (4x4 SPD matrices)
# set.seed(42)
# connectomes <- lapply(1:50, function(i) {
# mat <- diag(4) + matrix(rnorm(16, 0, 0.1), 4, 4)
# mat <- (mat + t(mat)) / 2 # Make symmetric
# mat <- mat + diag(4) * 0.5 # Ensure positive definite
# Matrix::pack(Matrix::Matrix(mat, sparse = FALSE))
# })
#
# # Write to Parquet format
# write_connectomes_to_parquet(
# connectomes,
# output_dir = "my_connectomes",
# subject_ids = paste0("subject_", 1:50),
# provenance = list(
# study = "Example Study",
# acquisition_date = "2024-01-01",
# preprocessing = "Standard pipeline v1.0"
# )
# )
## ----eval=FALSE---------------------------------------------------------------
# # Detailed validation with verbose output
# validate_parquet_directory("my_connectomes", verbose = TRUE)
## ----eval=FALSE---------------------------------------------------------------
# # Load AIRM metric
# data(airm)
#
# # Create Parquet backend (default cache size: 10 matrices)
# backend <- create_parquet_backend(
# "my_connectomes",
# cache_size = 10
# )
#
# # Create CSample with the backend
# sample <- CSample$new(
# backend = backend,
# metric_obj = airm
# )
#
# # Sample info
# print(paste("Sample size:", sample$sample_size))
# print(paste("Matrix dimension:", sample$matrix_size))
## ----eval=FALSE---------------------------------------------------------------
# # Compute tangent images
# sample$compute_tangents()
#
# # Compute vectorized images
# sample$compute_vecs()
#
# # Compute Frechet mean
# sample$compute_fmean(tol = 0.01, max_iter = 50)
#
# # Center the sample
# sample$center()
#
# # Compute variation
# sample$compute_variation()
# print(paste("Variation:", sample$variation))
## ----eval=FALSE---------------------------------------------------------------
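# # Access specific connectome (loads from disk if not cached)
# # NOTE(review): R evaluates `sample$connectomes` before applying `[[1]]`; if
# # that accessor loads every matrix (as described below), prefer
# # backend$get_matrix(1) to read a single matrix -- confirm.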
# conn_1 <- sample$connectomes[[1]]
#
# # Access all connectomes (loads all from disk)
# all_conns <- sample$connectomes
#
# # Cache management
# backend$get_cache_size() # Check current cache usage
# backend$clear_cache() # Clear cache to free memory
## ----eval=FALSE---------------------------------------------------------------
# # Create two samples with different backends
# backend1 <- create_parquet_backend("study1_connectomes")
# backend2 <- create_parquet_backend("study2_connectomes")
#
# sample1 <- CSample$new(backend = backend1, metric_obj = airm)
# sample2 <- CSample$new(backend = backend2, metric_obj = airm)
#
# # Create super sample
# super_sample <- CSuperSample$new(list(sample1, sample2))
#
# # Gather all connectomes
# super_sample$gather()
#
# # Compute statistics
# super_sample$compute_fmean()
# super_sample$compute_variation()
## ----eval=FALSE---------------------------------------------------------------
# # Traditional approach (all in memory)
# sample_memory <- CSample$new(
# conns = connectomes,
# metric_obj = airm
# )
#
# # Parquet approach (lazy loading)
# backend <- create_parquet_backend("my_connectomes")
# sample_parquet <- CSample$new(
# backend = backend,
# metric_obj = airm
# )
#
# # Both work identically
# sample_memory$compute_fmean()
# sample_parquet$compute_fmean()
## ----eval=FALSE---------------------------------------------------------------
# # Small cache (memory-constrained environments)
# backend_small <- ParquetBackend$new("my_connectomes", cache_size = 5)
#
# # Large cache (memory-rich environments)
# backend_large <- ParquetBackend$new("my_connectomes", cache_size = 50)
## ----eval=FALSE---------------------------------------------------------------
# # Process in chunks
# n <- sample$sample_size
# batch_size <- 10
#
# for (start in seq(1, n, by = batch_size)) {
# end <- min(start + batch_size - 1, n)
#
# # Load batch
# batch <- lapply(start:end, function(i) backend$get_matrix(i))
#
# # Process batch...
#
# # Clear cache to free memory
# backend$clear_cache()
# }
## ----eval=FALSE---------------------------------------------------------------
# library(riemtan)
#
# # Enable parallel processing (works on all platforms including Windows!)
# set_parallel_plan("multisession", workers = 4)
#
# # Check status
# is_parallel_enabled() # TRUE
# get_n_workers() # 4
#
# # Create Parquet-backed sample
# backend <- create_parquet_backend("large_dataset", cache_size = 20)
# sample <- CSample$new(backend = backend, metric_obj = airm)
## ----eval=FALSE---------------------------------------------------------------
# # Parallel tangent computations with progress bar
# sample$compute_tangents(progress = TRUE) # 3-8x faster
#
# # Parallel vectorization
# sample$compute_vecs(progress = TRUE) # 2-4x speedup
#
# # Parallel Frechet mean computation
# sample$compute_fmean(progress = TRUE) # 2-5x faster for large samples
## ----eval=FALSE---------------------------------------------------------------
# # Load specific subset in batches
# subset_conns <- sample$load_connectomes_batched(
# indices = 1:500, # Load first 500 matrices
# batch_size = 50, # 50 matrices per batch
# progress = TRUE # Show progress
# )
#
# # This loads 500 matrices in 10 batches, clearing cache between batches
# # Each batch is loaded in parallel for 5-10x speedup
## ----eval=FALSE---------------------------------------------------------------
# # Sequential (default if not configured)
# set_parallel_plan("sequential")
# system.time(sample$compute_tangents()) # Baseline
#
# # Parallel with 4 workers
# set_parallel_plan("multisession", workers = 4)
# system.time(sample$compute_tangents()) # 3-4x faster
#
# # Parallel with 8 workers
# set_parallel_plan("multisession", workers = 8)
# system.time(sample$compute_tangents()) # 6-8x faster
## ----eval=FALSE---------------------------------------------------------------
# # Install progressr for progress bars (optional)
# install.packages("progressr")
#
# # Enable parallel processing
# set_parallel_plan("multisession", workers = 4)
#
# # All operations support progress parameter
# sample$compute_tangents(progress = TRUE)
# sample$compute_vecs(progress = TRUE)
# sample$compute_fmean(progress = TRUE)
#
# # Batch loading with progress
# conns <- sample$load_connectomes_batched(
# indices = 1:1000,
# batch_size = 100,
# progress = TRUE # Shows "Batch 1/10: loading matrices 1-100"
# )
## ----eval=FALSE---------------------------------------------------------------
# # Conservative (leave cores for system)
# set_parallel_plan("multisession", workers = parallel::detectCores() - 1)
#
# # Maximum performance (use all cores)
# set_parallel_plan("multisession", workers = parallel::detectCores())
## ----eval=FALSE---------------------------------------------------------------
# # Each worker loads its own data copy
# # For 4 workers, cache_size = 20, and 100 double-precision matrices of
# # 200x200 (200 * 200 * 8 bytes ≈ 320 KB each), expect:
# # Memory ≈ workers × cache_size × bytes per matrix ≈ 4 × 20 × 320 KB ≈ 25 MB
#
# # Use smaller cache with more workers
# backend <- create_parquet_backend("dataset", cache_size = 10)
# set_parallel_plan("multisession", workers = 8)
## ----eval=FALSE---------------------------------------------------------------
# # Compute with parallelization
# set_parallel_plan("multisession", workers = 4)
# sample$compute_tangents(progress = TRUE)
# sample$compute_fmean(progress = TRUE)
#
# # Reset to free worker resources
# reset_parallel_plan()
## ----eval=FALSE---------------------------------------------------------------
# # Small dataset (n < 10): Uses sequential processing automatically
# small_sample <- CSample$new(conns = small_connectomes[1:5], metric_obj = airm)
# small_sample$compute_tangents() # Sequential (no overhead)
#
# # Large dataset (n >= 10): Uses parallel processing automatically
# large_sample <- CSample$new(backend = backend, metric_obj = airm)
# large_sample$compute_tangents() # Parallel (if plan is active)
## ----eval=FALSE---------------------------------------------------------------
# # multisession: Works on all platforms (Windows, Mac, Linux)
# set_parallel_plan("multisession", workers = 4)
#
# # multicore: Unix only, lower overhead
# set_parallel_plan("multicore", workers = 4) # Auto-fallback to multisession on Windows
#
# # cluster: For remote/distributed computing
# set_parallel_plan("cluster", workers = c("node1", "node2"))
## ----eval=FALSE---------------------------------------------------------------
# # Get metadata
# metadata <- backend$get_metadata()
#
# # Access subject IDs
# subject_ids <- metadata$subject_ids
#
# # Access provenance information
# provenance <- metadata$provenance
# print(provenance$study)
# print(provenance$preprocessing)
## ----eval=FALSE---------------------------------------------------------------
# write_connectomes_to_parquet(
# connectomes,
# output_dir = "custom_naming",
# file_pattern = "conn_%03d.parquet"
# )
#
# # Files will be named: conn_001.parquet, conn_002.parquet, ...
## ----eval=FALSE---------------------------------------------------------------
# # Use minimal cache
# backend <- ParquetBackend$new("large_dataset", cache_size = 3)
#
# # Compute statistics without loading all matrices at once
# sample <- CSample$new(backend = backend, metric_obj = airm)
# sample$compute_tangents()
# sample$compute_vecs()
#
# # Operations that don't need all matrices in memory
# sample$compute_fmean(batch_size = 32) # Uses batching
## ----eval=FALSE---------------------------------------------------------------
# # Check cache usage
# cache_size <- backend$get_cache_size()
# print(paste("Cached matrices:", cache_size))
#
# # Free memory when needed
# backend$clear_cache()
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.