# Nothing
# Genome Synthesis: Generate synthetic genomes from stratified Markov models
# Helper: collapse per-dimension bin indices into a single flat bin index.
# Works on whole matrices at once via one matrix product rather than a
# per-position stride loop.
.compute_flat_indices <- function(per_dim_indices, dim_sizes) {
  n_dims <- length(dim_sizes)
  out <- rep(NA_integer_, nrow(per_dim_indices))
  # A position is usable only when every dimension produced a bin index
  usable <- stats::complete.cases(per_dim_indices)
  if (any(usable)) {
    # Column-major strides: the first dimension varies fastest,
    # i.e. stride = c(1, s1, s1*s2, ...)
    stride <- c(1L, cumprod(dim_sizes[-n_dims]))
    idx_mat <- per_dim_indices[usable, , drop = FALSE]
    # flat = sum_d((idx_d - 1) * stride_d) + 1, algebraically expanded to
    #        sum_d(idx_d * stride_d) - sum(stride) + 1
    # so a single matrix product handles all usable positions at once
    out[usable] <- as.integer(idx_mat %*% stride - sum(stride) + 1L)
  }
  out
}
# Default chunk size threshold for parallel processing (1 billion bases).
# Genomes whose total interval length exceeds this are split into one chunk
# per interval and processed with parallel::mclapply; smaller genomes take
# the caller's serial path (see .gsynth_process_parallel).
.GSYNTH_MAX_CHUNK_SIZE <- 1e9
#' Process large genomes in parallel chunks
#'
#' Internal helper function that extracts the common parallel processing pattern
#' from gsynth functions. Handles both file and vector output modes.
#'
#' @param intervals Genomic intervals to process
#' @param output_format Output format: "misha" (binary), "fasta" (text), or "vector"
#' @param output_path Path to output file (ignored for vector mode)
#' @param process_chunk_fn Function to process each chunk. Should accept:
#' \itemize{
#' \item chunk_interval: Single-row intervals data frame
#' \item chunk_index: Integer index of the chunk (1-based)
#' \item chunk_seed: Optional seed for this chunk (if seed provided)
#' \item ...: Additional arguments passed through
#' }
#' For file output, should return path to temp file.
#' For vector output, should return character vector.
#' @param max_chunk_size Maximum total bases before triggering parallel processing
#' @param seed Optional seed for reproducible chunk seeds
#' @param ... Additional arguments passed to process_chunk_fn
#'
#' @return NULL if parallel processing not needed (genome too small).
#' For file output: invisible(NULL) after writing combined file.
#' For vector output: Character vector of combined results.
#'
#' @noRd
.gsynth_process_parallel <- function(intervals,
                                     output_format,
                                     output_path,
                                     process_chunk_fn,
                                     max_chunk_size = .GSYNTH_MAX_CHUNK_SIZE,
                                     seed = NULL,
                                     ...) {
  # Calculate total bases; small genomes are handled by the caller's
  # serial path, signalled by returning NULL
  total_bases <- sum(intervals$end - intervals$start)
  if (total_bases <= max_chunk_size) {
    return(NULL)
  }
  # Number of chunks = number of intervals (one per row)
  n_chunks <- nrow(intervals)
  message(sprintf(
    "Large genome detected (%s bases). Processing %d chunks in parallel...",
    format(total_bases, big.mark = ","), n_chunks
  ))
  # Determine number of cores to use
  n_cores <- min(getOption("gmax.processes", 1L), n_chunks)
  message(sprintf("Using %d cores for parallel processing", n_cores))
  # Generate reproducible per-chunk seeds if a master seed is provided;
  # otherwise NA sentinels mean "no seed" for each chunk
  chunk_seeds <- if (!is.null(seed)) {
    set.seed(seed)
    sample.int(.Machine$integer.max, n_chunks)
  } else {
    rep(NA, n_chunks)
  }
  # Capture extra arguments forwarded to process_chunk_fn
  extra_args <- list(...)
  # Shared per-chunk driver (previously duplicated for the vector and
  # file output modes). Runs every chunk and validates worker results.
  run_chunks <- function() {
    results <- parallel::mclapply(seq_len(n_chunks), function(i) {
      chunk_interval <- intervals[i, , drop = FALSE]
      chunk_seed <- if (!is.na(chunk_seeds[i])) chunk_seeds[i] else NULL
      do.call(process_chunk_fn, c(
        list(
          chunk_interval = chunk_interval,
          chunk_index = i,
          chunk_seed = chunk_seed
        ),
        extra_args
      ))
    }, mc.cores = n_cores, mc.preschedule = FALSE)
    # mclapply does not raise on worker failure; it returns try-error
    # objects in place of results. Surface the first failure explicitly
    # instead of silently combining error conditions downstream.
    failed <- which(vapply(results, inherits, logical(1), "try-error"))
    if (length(failed) > 0) {
      stop(sprintf(
        "Chunk %d failed: %s",
        failed[1], as.character(results[[failed[1]]])
      ), call. = FALSE)
    }
    results
  }
  if (output_format == "vector") {
    # Vector output mode: combine the per-chunk character vectors in order
    return(unlist(run_chunks()))
  }
  # File output mode: workers wrote temp files; concatenate them in order
  temp_files <- run_chunks()
  message("Combining results...")
  for (i in seq_along(temp_files)) {
    temp_file <- temp_files[[i]]
    if (!file.exists(temp_file)) {
      stop(sprintf("Failed to generate chunk %d", i), call. = FALSE)
    }
    if (i == 1) {
      # First chunk: copy to output
      file.copy(temp_file, output_path, overwrite = TRUE)
    } else if (output_format == "fasta") {
      # Text mode: read lines and append
      temp_content <- readLines(temp_file)
      cat(temp_content, file = output_path, sep = "\n", append = TRUE)
    } else {
      # Binary mode (misha): read raw bytes and append
      temp_content <- readBin(temp_file, "raw", n = file.info(temp_file)$size)
      con <- file(output_path, "ab")
      writeBin(temp_content, con)
      close(con)
    }
    # Clean up temp file
    unlink(temp_file)
  }
  invisible(NULL)
}
#' Create a bin mapping from value-based merge specifications
#'
#' Converts value-based bin merge specifications into a bin_map named vector
#' that can be used with \code{\link{gsynth.train}}. This allows you to
#' specify merges using actual track values rather than bin indices.
#'
#' @param breaks Numeric vector of bin boundaries (same as used in
#' \code{\link{gsynth.train}})
#' @param merge_ranges List of merge specifications. Each specification is a
#' named list with:
#' \describe{
#' \item{from}{Numeric vector of length 2 \code{c(min, max)} defining the
#' source value range to merge. Use \code{-Inf} or \code{Inf} for
#' open-ended ranges. Can also be a single number (shorthand for
#' \code{c(value, Inf)}).}
#' \item{to}{Numeric vector of length 2 \code{c(min, max)} defining the
#' target bin that source bins should map to. Must match an existing
#' bin defined by \code{breaks}.}
#' }
#'
#' @return A named vector (bin_map) compatible with \code{bin_map} parameter
#' in \code{\link{gsynth.train}}. The names are source bin indices
#' (1-based), and values are target bin indices (1-based).
#'
#' @examples
#' # Define breaks for GC content [0, 1] in 0.025 increments
#' breaks <- seq(0, 1, 0.025)
#'
#' # Merge all GC content above 70% (0.7) into the bin (0.675, 0.7]
#' bin_map <- gsynth.bin_map(
#'   breaks = breaks,
#'   merge_ranges = list(
#'     list(from = 0.7, to = c(0.675, 0.7))
#'   )
#' )
#'
#' # Multiple merges: merge low GC (< 0.3) and high GC (> 0.7) into middle bins
#' bin_map2 <- gsynth.bin_map(
#'   breaks = breaks,
#'   merge_ranges = list(
#'     list(from = c(-Inf, 0.3), to = c(0.4, 0.425)), # low GC -> (0.4, 0.425]
#'     list(from = 0.7, to = c(0.675, 0.7)) # high GC -> (0.675, 0.7]
#'   )
#' )
#'
#' @seealso \code{\link{gsynth.train}}
#' @export
gsynth.bin_map <- function(breaks, merge_ranges = NULL) {
  if (!is.numeric(breaks) || length(breaks) < 2) {
    stop("breaks must be a numeric vector with at least 2 elements", call. = FALSE)
  }
  breaks <- sort(breaks)
  num_bins <- length(breaks) - 1
  # Start with identity mapping (names = source bins, values = targets)
  bin_map_vec <- seq_len(num_bins)
  names(bin_map_vec) <- as.character(bin_map_vec)
  if (is.null(merge_ranges) || length(merge_ranges) == 0) {
    return(bin_map_vec)
  }
  # Precompute bin edges and centers once; reused for both the target-bin
  # lookup and the vectorized source-range membership test below
  bin_lo <- breaks[-length(breaks)]
  bin_hi <- breaks[-1]
  bin_center <- (bin_lo + bin_hi) / 2
  # Map a c(min, max) target range to the bin whose edges it matches
  # (within absolute tolerance 1e-10); errors if no bin matches
  find_bin_for_range <- function(range) {
    if (length(range) != 2) {
      stop("'to' range must be a vector of length 2: c(min, max)", call. = FALSE)
    }
    hit <- which(abs(range[1] - bin_lo) < 1e-10 & abs(range[2] - bin_hi) < 1e-10)
    if (length(hit) == 0) {
      stop(sprintf(
        "Target range [%.6f, %.6f] does not match any bin defined by breaks",
        range[1], range[2]
      ), call. = FALSE)
    }
    hit[1]
  }
  # Process each merge specification in order (later specs may override
  # mappings set by earlier ones)
  for (spec in merge_ranges) {
    if (!is.list(spec) || !("from" %in% names(spec)) || !("to" %in% names(spec))) {
      stop("Each merge specification must be a list with 'from' and 'to' elements", call. = FALSE)
    }
    from_range <- spec$from
    # Handle shorthand: single number means [value, Inf)
    if (length(from_range) == 1 && is.numeric(from_range)) {
      from_range <- c(from_range, Inf)
    }
    if (length(from_range) != 2) {
      stop("'from' must be a vector of length 1 or 2", call. = FALSE)
    }
    # Find target bin index
    target_bin <- find_bin_for_range(spec$to)
    # Vectorized membership: a source bin is merged when its center lies
    # inside from_range (inclusive); non-finite bounds (Inf/-Inf/NA) are
    # treated as always satisfied, matching the loop-based original
    in_range <- (!is.finite(from_range[1]) | bin_center >= from_range[1]) &
      (!is.finite(from_range[2]) | bin_center <= from_range[2])
    bin_map_vec[in_range] <- target_bin
  }
  # Return as named integer vector (source -> target)
  result <- as.integer(bin_map_vec)
  names(result) <- as.character(seq_len(num_bins))
  result
}
#' Train a stratified Markov-5 model from genome sequences
#'
#' Computes a 5th-order Markov model optionally stratified by bins of one or more
#' track expressions (e.g., GC content and CG dinucleotide frequency). This model
#' can be used to generate synthetic genomes that preserve the k-mer statistics of
#' the original genome within each stratification bin. When called with no dimension
#' specifications, trains a single unstratified model.
#'
#' @param ... Zero or more dimension specifications. Each specification is a list
#' containing:
#' \describe{
#' \item{expr}{Track expression for this dimension (required)}
#' \item{breaks}{Numeric vector of bin boundaries for this dimension (required)}
#' \item{bin_merge}{Optional list of merge specifications for merging sparse bins.
#' Each specification is a named list with 'from' and 'to' elements.}
#' }
#' If no dimensions are provided, trains an unstratified model with a single bin.
#' @param mask Optional intervals to exclude from training. Regions in the mask
#' will not contribute to k-mer counts. Can be computed using \code{gscreen()}.
#' @param intervals Genomic intervals to process. If NULL, uses all chromosomes.
#' @param iterator Iterator for track evaluation, determines the resolution at which
#' track values are computed.
#' @param pseudocount Pseudocount added to all k-mer counts to avoid zero probabilities.
#' Default is 1.
#' @param min_obs Minimum number of observations (6-mers) required per bin. Bins with
#' fewer observations will be marked as NA (not learned) and a warning will be
#' issued. Default is 0 (no minimum). During sampling, NA bins will fall back
#' to uniform sampling unless merged via \code{bin_merge}.
#'
#' @details
#' \strong{Strand symmetry:} The training process counts both the forward strand
#' 6-mer and its reverse complement for each position, ensuring strand-symmetric
#' transition probabilities. This means the reported total_kmers is approximately
#' double the number of genomic positions processed.
#'
#' \strong{N bases:} Positions where the 6-mer contains any N (unknown) bases are
#' skipped during training and counted in \code{total_n}. The model only learns
#' from valid A/C/G/T sequences.
#'
#' @return A \code{gsynth.model} object containing:
#' \describe{
#' \item{n_dims}{Number of stratification dimensions}
#' \item{dim_specs}{List of dimension specifications (expr, breaks, num_bins, bin_map)}
#' \item{dim_sizes}{Vector of bin counts per dimension}
#' \item{total_bins}{Total number of bins (product of dim_sizes)}
#' \item{total_kmers}{Total number of valid 6-mers counted}
#' \item{per_bin_kmers}{Number of 6-mers counted per bin}
#' \item{total_masked}{Number of positions skipped due to mask}
#' \item{total_n}{Number of positions skipped due to N bases}
#' \item{model_data}{Internal model data (counts and CDFs)}
#' }
#'
#' @examples
#' gdb.init_examples()
#'
#' # Create virtual tracks for stratification
#' gvtrack.create("g_frac", NULL, "kmer.frac", kmer = "G")
#' gvtrack.create("c_frac", NULL, "kmer.frac", kmer = "C")
#' gvtrack.create("cg_frac", NULL, "kmer.frac", kmer = "CG")
#' gvtrack.create("masked_frac", NULL, "masked.frac")
#'
#' # Define repeat mask
#' repeats <- gscreen("masked_frac > 0.5",
#' intervals = gintervals.all(),
#' iterator = 100
#' )
#'
#' # Train unstratified model (no stratification)
#' model_0d <- gsynth.train(
#' mask = repeats,
#' intervals = gintervals.all(),
#' iterator = 200
#' )
#'
#' # Train model with 2D stratification (GC content and CG dinucleotide)
#' model <- gsynth.train(
#' list(
#' expr = "g_frac + c_frac",
#' breaks = seq(0, 1, 0.025),
#' bin_merge = list(list(from = 0.7, to = c(0.675, 0.7)))
#' ),
#' list(
#' expr = "cg_frac",
#' breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.2),
#' bin_merge = list(list(from = 0.04, to = c(0.03, 0.04)))
#' ),
#' mask = repeats,
#' intervals = gintervals.all(),
#' iterator = 200
#' )
#'
#' @seealso \code{\link{gsynth.sample}}, \code{\link{gsynth.save}},
#' \code{\link{gsynth.load}}, \code{\link{gsynth.bin_map}}
#' @export
gsynth.train <- function(...,
                         mask = NULL,
                         intervals = NULL,
                         iterator = NULL,
                         pseudocount = 1,
                         min_obs = 0) {
  # Requires an initialized misha database root
  .gcheckroot()
  # Capture all dimension specs from ...
  args <- list(...)
  # Allow zero dimensions for unstratified model
  zero_dim_model <- (length(args) == 0)
  # Validate and process each dimension spec into a normalized form
  # (expr, sorted breaks, num_bins, per-dimension bin_map)
  dim_specs <- list()
  for (i in seq_along(args)) {
    spec <- args[[i]]
    if (!is.list(spec)) {
      stop(sprintf("Dimension %d must be a list", i), call. = FALSE)
    }
    if (!("expr" %in% names(spec))) {
      stop(sprintf("Dimension %d must have an 'expr' element", i), call. = FALSE)
    }
    if (!("breaks" %in% names(spec))) {
      stop(sprintf("Dimension %d must have a 'breaks' element", i), call. = FALSE)
    }
    # Validate breaks (sorted ascending; bins are the gaps between breaks)
    breaks <- sort(spec$breaks)
    if (!is.numeric(breaks) || length(breaks) < 2) {
      stop(sprintf("Dimension %d breaks must be numeric with at least 2 elements", i),
        call. = FALSE
      )
    }
    num_bins <- length(breaks) - 1
    # Process bin_merge -> bin_map for this dimension
    bin_map <- seq_len(num_bins) # Identity mapping
    if (!is.null(spec$bin_merge)) {
      bin_map_result <- gsynth.bin_map(breaks, spec$bin_merge)
      # Convert from named vector format (names = source bins, values =
      # target bins) into a plain integer lookup vector, defensively
      # skipping any NA or out-of-range entries
      for (j in seq_along(bin_map_result)) {
        src_bin <- as.integer(names(bin_map_result)[j])
        tgt_bin <- as.integer(bin_map_result[j])
        if (!is.na(src_bin) && !is.na(tgt_bin) &&
          src_bin >= 1 && src_bin <= num_bins &&
          tgt_bin >= 1 && tgt_bin <= num_bins) {
          bin_map[src_bin] <- tgt_bin
        }
      }
    }
    dim_specs[[i]] <- list(
      expr = spec$expr,
      breaks = breaks,
      num_bins = num_bins,
      bin_map = as.integer(bin_map) # 1-based for R
    )
  }
  # Set up iterator (needed for both 0D and multi-D models).
  # substitute() keeps the caller's unevaluated iterator expression so
  # .giterator can resolve it in the caller's frame.
  .iterator <- do.call(.giterator, list(substitute(iterator)), envir = parent.frame())
  # Get intervals (default to all chromosomes)
  if (is.null(intervals)) {
    intervals <- gintervals.all()
  }
  # Branch based on dimensionality
  if (zero_dim_model) {
    # Zero-dimensional model: single bin, no stratification
    n_dims <- 0L
    dim_sizes <- integer(0)
    total_bins <- 1L
    # Get number of positions for creating flat indices.
    # Use a minimal gextract (constant expression "1") just to enumerate
    # the iterator positions and their chromosomes.
    message("Setting up iterator positions...")
    iter_info <- gextract("1", intervals = intervals, iterator = .iterator)
    if (is.null(iter_info) || nrow(iter_info) == 0) {
      stop("No positions extracted. Check that intervals and iterator are valid.", call. = FALSE)
    }
    n_positions <- nrow(iter_info)
    # All positions map to bin 1 in R (will be converted to 0-based for C++)
    flat_indices <- rep(1L, n_positions)
    # Get chromosome info for processing (0-based chromosome ids for C++)
    chrom_sizes <- gintervals.chrom_sizes(intervals)
    chrom_ids <- match(as.character(intervals$chrom), chrom_sizes$chrom) - 1L
    chrom_starts <- intervals$start
    chrom_ends <- intervals$end
    # Prepare iterator position data
    iter_chroms <- match(as.character(iter_info$chrom), chrom_sizes$chrom) - 1L
    iter_starts <- as.integer(iter_info$start)
  } else {
    # Multi-dimensional model: stratify by the track expressions
    n_dims <- length(dim_specs)
    dim_sizes <- sapply(dim_specs, function(d) d$num_bins)
    total_bins <- prod(dim_sizes)
    # Extract track values for ALL dimensions in a single call
    message("Extracting track values...")
    exprs <- sapply(dim_specs, function(d) d$expr)
    # gextract can handle multiple expressions; result has one column
    # per expression, named after the expression
    track_data <- gextract(exprs, intervals = intervals, iterator = .iterator)
    if (is.null(track_data) || nrow(track_data) == 0) {
      stop("No track data extracted. Check that intervals and iterator are valid.", call. = FALSE)
    }
    n_positions <- nrow(track_data)
    # Compute per-dimension bin indices (1-based); NA marks positions
    # whose value falls outside this dimension's break range
    per_dim_indices <- matrix(NA_integer_, nrow = n_positions, ncol = n_dims)
    for (d in seq_len(n_dims)) {
      spec <- dim_specs[[d]]
      track_values <- track_data[[spec$expr]]
      bin_idx <- findInterval(track_values, spec$breaks, rightmost.closed = TRUE)
      bin_idx[bin_idx == 0] <- NA # Values below first break
      bin_idx[bin_idx > spec$num_bins] <- NA # Values above last break
      # Apply bin mapping for this dimension (sparse-bin merges)
      valid_mask <- !is.na(bin_idx) & bin_idx >= 1 & bin_idx <= spec$num_bins
      if (any(valid_mask)) {
        bin_idx[valid_mask] <- spec$bin_map[bin_idx[valid_mask]]
      }
      per_dim_indices[, d] <- bin_idx
    }
    # Compute flat bin indices using vectorized helper; NA in any
    # dimension propagates to an NA flat index
    flat_indices <- .compute_flat_indices(per_dim_indices, dim_sizes)
    # Get chromosome info for processing (0-based chromosome ids for C++)
    chrom_sizes <- gintervals.chrom_sizes(intervals)
    chrom_ids <- match(as.character(intervals$chrom), chrom_sizes$chrom) - 1L
    chrom_starts <- intervals$start
    chrom_ends <- intervals$end
    # Prepare iterator position data
    iter_chroms <- match(as.character(track_data$chrom), chrom_sizes$chrom) - 1L
    iter_starts <- as.integer(track_data$start)
  }
  # Handle NA flat indices: NA -> 0L here, then the global -1L shift below
  # turns them into the -1 sentinel; valid indices become 0-based.
  flat_indices[is.na(flat_indices)] <- 0L
  flat_indices <- as.integer(flat_indices) - 1L # 0-based for C++
  # Prepare bin_map for C++ (0-based, for applying during training).
  # We've already applied bin_map in R, so just pass identity mapping.
  bin_map_vec <- seq_len(total_bins) - 1L # 0-based identity
  message("Training Markov model...")
  # Call C++ training function.
  # We pass flat indices directly and create dummy breaks that give total_bins
  # bins: the C++ code computes num_bins = length(breaks) - 1, so we need
  # breaks with total_bins + 1 elements.
  dummy_breaks <- seq(0, total_bins, length.out = total_bins + 1)
  result <- .gcall(
    "C_gsynth_train",
    as.integer(chrom_ids),
    as.integer(chrom_starts),
    as.integer(chrom_ends),
    flat_indices,
    iter_starts,
    iter_chroms,
    as.numeric(dummy_breaks),
    bin_map_vec,
    mask,
    as.numeric(pseudocount),
    .misha_env()
  )
  # Override C++ result with our multi-dimensional metadata
  result$n_dims <- n_dims
  result$dim_specs <- dim_specs
  result$dim_sizes <- dim_sizes
  result$total_bins <- total_bins
  result$num_bins <- total_bins # For compatibility
  result$iterator <- .iterator
  # Store all breaks for reference (per-dimension breaks live in dim_specs)
  result$breaks <- NULL # Remove single breaks
  # Check for sparse bins (fewer than min_obs observations)
  result$min_obs <- min_obs
  sparse_bins <- which(result$per_bin_kmers < min_obs)
  result$sparse_bins <- sparse_bins
  if (length(sparse_bins) > 0) {
    # Mark CDFs for sparse bins as NA so sampling can detect them
    for (bin_idx in sparse_bins) {
      result$model_data$cdf[[bin_idx]][] <- NA_real_
    }
    # Notify user about sparse bins
    n_sparse <- length(sparse_bins)
    warning(sprintf(
      "%d out of %d bins have fewer than %d observations and are marked as NA.\n%s",
      n_sparse,
      total_bins,
      min_obs,
      "Use bin_merge to merge sparse bins before sampling, or they will use uniform sampling."
    ), call. = FALSE)
    # If we have dimension info, decode flat sparse-bin indices back to
    # per-dimension coordinates for user reference.
    # NOTE(review): coords are only reported for n_dims > 1; for a 1-D
    # model the flat index already equals the bin index — confirm intended.
    if (n_dims > 1 && n_sparse <= 50) {
      sparse_coords <- matrix(NA_integer_, nrow = n_sparse, ncol = n_dims)
      for (i in seq_along(sparse_bins)) {
        flat_idx <- sparse_bins[i] - 1L # 0-based
        for (d in seq_len(n_dims)) {
          # Inverse of the column-major flattening: first dim varies fastest
          sparse_coords[i, d] <- (flat_idx %% dim_sizes[d]) + 1L
          flat_idx <- flat_idx %/% dim_sizes[d]
        }
      }
      colnames(sparse_coords) <- sapply(dim_specs, function(s) s$expr)
      result$sparse_coords <- sparse_coords
    }
  }
  class(result) <- "gsynth.model"
  if (n_dims == 0) {
    message(sprintf(
      "Trained unstratified model: %s 6-mers (no stratification)",
      format(result$total_kmers, big.mark = ",")
    ))
  } else {
    message(sprintf(
      "Trained model: %s 6-mers across %d bins (%d dimensions)",
      format(result$total_kmers, big.mark = ","),
      result$total_bins,
      result$n_dims
    ))
  }
  if (length(sparse_bins) > 0) {
    message(sprintf(
      " Note: %d bins have < %d observations (marked as NA)",
      length(sparse_bins), min_obs
    ))
  }
  result
}
#' Print summary of a gsynth.model
#'
#' @param x A gsynth.model object
#' @param ... Additional arguments (ignored)
#'
#' @return The input \code{x}, invisibly.
#' @export
print.gsynth.model <- function(x, ...) {
  cat("Synthetic Genome Markov-5 Model\n")
  cat("----------------------------\n")
  if (!is.null(x$n_dims) && x$n_dims == 0) {
    # Zero-dimensional (unstratified) model
    cat("Stratification: None (single global model)\n")
    cat("Total bins: 1\n")
  } else if (!is.null(x$n_dims)) {
    # Multi-dimensional model: describe each stratification dimension
    cat(sprintf("Dimensions: %d\n", x$n_dims))
    for (d in seq_len(x$n_dims)) {
      spec <- x$dim_specs[[d]]
      cat(sprintf("\n Dimension %d:\n", d))
      cat(sprintf(" Expression: %s\n", spec$expr))
      cat(sprintf(" Bins: %d\n", spec$num_bins))
      cat(sprintf(" Range: [%.4f, %.4f]\n", min(spec$breaks), max(spec$breaks)))
    }
    cat(sprintf("\nTotal bins: %d (", x$total_bins))
    cat(paste(x$dim_sizes, collapse = " x "))
    cat(")\n")
  } else {
    # Legacy single-dimension model (predates multi-dimensional support)
    cat(sprintf("Expression: %s\n", x$expr))
    cat(sprintf("Number of bins: %d\n", x$num_bins))
  }
  cat(sprintf("Total k-mers: %s\n", format(x$total_kmers, big.mark = ",")))
  cat(sprintf("Masked positions: %s\n", format(x$total_masked, big.mark = ",")))
  cat(sprintf("N positions: %s\n", format(x$total_n, big.mark = ",")))
  # Legacy models have no total_bins field; fall back to num_bins so the
  # condition below does not error with "argument is of length zero"
  n_bins <- if (!is.null(x$total_bins)) x$total_bins else x$num_bins
  # Show per-bin k-mer counts (abbreviated for many-bin models)
  if (n_bins <= 20) {
    cat("\nPer-bin k-mer counts:\n")
    for (i in seq_along(x$per_bin_kmers)) {
      if (x$per_bin_kmers[i] > 0) {
        sparse_marker <- if (i %in% x$sparse_bins) " (NA - sparse)" else ""
        cat(sprintf(
          " Bin %d: %s%s\n",
          i,
          format(x$per_bin_kmers[i], big.mark = ","),
          sparse_marker
        ))
      }
    }
  } else {
    non_empty <- sum(x$per_bin_kmers > 0)
    cat(sprintf("\nNon-empty bins: %d / %d\n", non_empty, n_bins))
  }
  # Show sparse bins info
  if (!is.null(x$sparse_bins) && length(x$sparse_bins) > 0) {
    cat(sprintf(
      "\nSparse bins (< %d obs): %d bins marked as NA\n",
      x$min_obs,
      length(x$sparse_bins)
    ))
    if (length(x$sparse_bins) <= 10) {
      cat(sprintf(" Bin indices: %s\n", paste(x$sparse_bins, collapse = ", ")))
    }
  }
  invisible(x)
}
#' Save a gsynth.model to disk
#'
#' Persists a trained Markov model as an RDS file so it can be reloaded
#' later with \code{\link{gsynth.load}}.
#'
#' @param model A gsynth.model object from \code{\link{gsynth.train}}
#' @param file Path to save the model
#'
#' @seealso \code{\link{gsynth.load}}, \code{\link{gsynth.train}}
#' @export
gsynth.save <- function(model, file) {
  # Refuse anything that is not a trained model object
  is_model <- inherits(model, "gsynth.model")
  if (!is_model) {
    stop("model must be a gsynth.model object", call. = FALSE)
  }
  saveRDS(object = model, file = file)
  invisible(file)
}
#' Load a gsynth.model from disk
#'
#' Reads back a Markov model previously written by \code{\link{gsynth.save}}
#' and verifies it is a genuine gsynth.model before returning it.
#'
#' @param file Path to the saved model file
#'
#' @return A gsynth.model object
#'
#' @seealso \code{\link{gsynth.save}}, \code{\link{gsynth.train}}
#' @export
gsynth.load <- function(file) {
  # Fail fast on a missing path before attempting to deserialize
  if (!file.exists(file)) {
    stop(sprintf("File not found: %s", file), call. = FALSE)
  }
  obj <- readRDS(file)
  # Guard against RDS files holding unrelated objects
  if (!inherits(obj, "gsynth.model")) {
    stop("File does not contain a valid gsynth.model", call. = FALSE)
  }
  obj
}
#' Sample a synthetic genome from a trained Markov model
#'
#' Generates a synthetic genome by sampling from a trained stratified Markov-5
#' model. The generated genome preserves the k-mer statistics of the original
#' genome within each stratification bin.
#'
#' @param model A gsynth.model object from \code{\link{gsynth.train}}
#' @param output_path Path to the output file (ignored when output_format = "vector")
#' @param output_format Output format:
#' \itemize{
#' \item "misha": .seq binary format (default)
#' \item "fasta": FASTA text format
#' \item "vector": Return sequences as a character vector (does not write to file)
#' }
#' @param mask_copy Optional intervals to copy from the original genome instead of
#' sampling. Use this to preserve specific regions (e.g., repeats, regulatory
#' elements) exactly as they appear in the reference. Regions not in mask_copy
#' will be sampled using the Markov model.
#' Note: mask_copy intervals should be non-overlapping and sorted by start
#' position within each chromosome. Overlapping intervals may result in
#' only the first overlapping region being copied, with subsequent overlaps
#' skipped due to cursor advancement during sequential processing.
#' @param seed Random seed for reproducibility. If NULL, uses current random state.
#' @param intervals Genomic intervals to sample. If NULL, uses all chromosomes.
#' @param n_samples Number of samples to generate per interval. Default is 1.
#' When n_samples > 1 and output_format = "fasta", headers include "_sampleN".
#' When output_format = "vector", returns n_samples * n_intervals sequences.
#' @param bin_merge Optional list of bin merge specifications to apply during sampling,
#' one per dimension (length must equal model$n_dims). Each element should be:
#' \itemize{
#' \item A list of merge specifications (same format as in
#' \code{\link{gsynth.train}}: each spec is \code{list(from = ..., to = ...)})
#' \item Or NULL to use the bin mapping from training for that dimension
#' }
#' This allows merging sparse bins at sampling time without re-training.
#' Example for a 2D model:
#' \preformatted{
#' bin_merge = list(
#' # Dimension 1: merge bins with values >= 0.8 to bin [0.7, 0.8)
#' list(list(from = c(0.8, Inf), to = c(0.7, 0.8))),
#' # Dimension 2: use training-time bin_map (no override)
#' NULL
#' )
#' }
#'
#' @details
#' \strong{N bases during sampling:} When the sampler needs to initialize the
#' first 5-mer context and encounters regions with only N bases, it falls back
#' to uniform random base selection until a valid context is established.
#' Similarly, if a bin has no learned statistics (sparse bin with NA CDF),
#' uniform sampling is used for that position.
#'
#' \strong{Sparse bins:} If the model has sparse bins (from \code{min_obs} during
#' training), a warning is issued when sampling regions that fall into these bins.
#' Consider using \code{bin_merge} to redirect sparse bins to well-populated ones.
#'
#' @return When output_format is "misha" or "fasta", returns invisible NULL and
#' writes the synthetic genome to output_path. When output_format is "vector",
#' returns a character vector of sequences (length = n_intervals * n_samples).
#'
#' @examples
#' gdb.init_examples()
#'
#' # Create virtual tracks
#' gvtrack.create("g_frac", NULL, "kmer.frac", kmer = "G")
#' gvtrack.create("c_frac", NULL, "kmer.frac", kmer = "C")
#' gvtrack.create("cg_frac", NULL, "kmer.frac", kmer = "CG")
#' gvtrack.create("masked_frac", NULL, "masked.frac")
#'
#' # Define repeat mask (regions to preserve from original)
#' repeats <- gscreen("masked_frac > 0.5",
#' intervals = gintervals.all(),
#' iterator = 100
#' )
#'
#' # Train model (excluding repeats from training)
#' model <- gsynth.train(
#' list(expr = "g_frac + c_frac", breaks = seq(0, 1, 0.025)),
#' list(expr = "cg_frac", breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.2)),
#' mask = repeats,
#' iterator = 200,
#' min_obs = 1000
#' )
#'
#' # Sample with mask_copy to preserve repeats from original genome
#' temp_dir <- tempdir()
#' synthetic_genome_file <- file.path(temp_dir, "synthetic_genome.fa")
#' gsynth.sample(model, synthetic_genome_file,
#' output_format = "fasta",
#' mask_copy = repeats,
#' seed = 60427,
#' bin_merge = list(
#' list(list(from = 0.7, to = c(0.675, 0.7))),
#' list(list(from = 0.04, to = c(0.03, 0.04)))
#' )
#' )
#'
#' @seealso \code{\link{gsynth.train}}, \code{\link{gsynth.save}}
#' @export
gsynth.sample <- function(model,
output_path = NULL,
output_format = c("misha", "fasta", "vector"),
mask_copy = NULL,
seed = NULL,
intervals = NULL,
n_samples = 1,
bin_merge = NULL) {
.gcheckroot()
if (!inherits(model, "gsynth.model")) {
stop("model must be a gsynth.model object", call. = FALSE)
}
output_format <- match.arg(output_format)
# Validate output_path requirement
if (output_format != "vector" && is.null(output_path)) {
stop("output_path is required when output_format is not 'vector'", call. = FALSE)
}
# Validate n_samples
n_samples <- as.integer(n_samples)
if (is.na(n_samples) || n_samples < 1) {
stop("n_samples must be a positive integer", call. = FALSE)
}
# Set random seed if provided
if (!is.null(seed)) {
set.seed(seed)
}
# Get intervals (default to all chromosomes)
if (is.null(intervals)) {
intervals <- gintervals.all()
}
# Get iterator from model (with validation)
.iterator <- model$iterator
if (is.null(.iterator)) {
stop("Model iterator is NULL. The model may be corrupted or from an incompatible version.", call. = FALSE)
}
n_dims <- model$n_dims
dim_sizes <- model$dim_sizes
message("Extracting track values for sampling...")
if (n_dims == 0) {
# Zero-dimensional model: all positions use bin 1 (R 1-based)
# Still need iterator info to know positions
iter_info <- gextract("1", intervals = intervals, iterator = .iterator)
if (is.null(iter_info) || nrow(iter_info) == 0) {
stop("No positions extracted. Check that intervals are valid.", call. = FALSE)
}
n_positions <- nrow(iter_info)
# All positions map to bin 1 in R (will be converted to 0-based for C++)
flat_indices <- rep(1L, n_positions)
# Get chromosome info
chrom_sizes <- gintervals.chrom_sizes(intervals)
iter_chroms <- match(as.character(iter_info$chrom), chrom_sizes$chrom) - 1L
iter_starts <- as.integer(iter_info$start)
} else {
# Multi-dimensional model: extract track values and compute bins
# Extract track values for all dimensions
exprs <- sapply(model$dim_specs, function(d) d$expr)
track_data <- gextract(exprs, intervals = intervals, iterator = .iterator)
if (is.null(track_data) || nrow(track_data) == 0) {
stop("No track data extracted. Check that intervals are valid.", call. = FALSE)
}
n_positions <- nrow(track_data)
# Process sampling-time bin_merge if provided
sample_bin_maps <- vector("list", n_dims)
if (!is.null(bin_merge)) {
if (!is.list(bin_merge) || length(bin_merge) != n_dims) {
stop(sprintf(
"bin_merge must be a list with %d elements (one per dimension)",
n_dims
), call. = FALSE)
}
for (d in seq_len(n_dims)) {
spec <- model$dim_specs[[d]]
dim_merge <- bin_merge[[d]]
if (is.null(dim_merge)) {
# Use training-time bin_map for this dimension
sample_bin_maps[[d]] <- spec$bin_map
} else {
# Compute new bin_map from sampling-time bin_merge
bin_map_result <- gsynth.bin_map(spec$breaks, dim_merge)
new_bin_map <- seq_len(spec$num_bins)
for (j in seq_along(bin_map_result)) {
src_bin <- as.integer(names(bin_map_result)[j])
tgt_bin <- as.integer(bin_map_result[j])
if (!is.na(src_bin) && !is.na(tgt_bin) &&
src_bin >= 1 && src_bin <= spec$num_bins &&
tgt_bin >= 1 && tgt_bin <= spec$num_bins) {
new_bin_map[src_bin] <- tgt_bin
}
}
sample_bin_maps[[d]] <- as.integer(new_bin_map)
}
}
} else {
# Use training-time bin_maps for all dimensions
for (d in seq_len(n_dims)) {
sample_bin_maps[[d]] <- model$dim_specs[[d]]$bin_map
}
}
# Compute per-dimension bin indices (1-based)
per_dim_indices <- matrix(NA_integer_, nrow = n_positions, ncol = n_dims)
for (d in seq_len(n_dims)) {
spec <- model$dim_specs[[d]]
track_values <- track_data[[spec$expr]]
bin_idx <- findInterval(track_values, spec$breaks, rightmost.closed = TRUE)
bin_idx[bin_idx == 0] <- NA
bin_idx[bin_idx > spec$num_bins] <- NA
# Apply bin mapping for this dimension (use sample-time bin_map)
valid_mask <- !is.na(bin_idx) & bin_idx >= 1 & bin_idx <= spec$num_bins
if (any(valid_mask)) {
bin_idx[valid_mask] <- sample_bin_maps[[d]][bin_idx[valid_mask]]
}
per_dim_indices[, d] <- bin_idx
}
# Compute flat bin indices using vectorized helper
flat_indices <- .compute_flat_indices(per_dim_indices, dim_sizes)
# Get chromosome info
chrom_sizes <- gintervals.chrom_sizes(intervals)
iter_chroms <- match(as.character(track_data$chrom), chrom_sizes$chrom) - 1L
iter_starts <- as.integer(track_data$start)
}
# Handle NA flat indices (convert to -1 for C++)
flat_indices[is.na(flat_indices)] <- 0L
flat_indices <- as.integer(flat_indices) - 1L # 0-based
# Get CDF data from model (make a copy to avoid modifying the original)
cdf_list <- model$model_data$cdf
# Check for sparse bins that will be used during sampling
sparse_bins <- model$sparse_bins
if (!is.null(sparse_bins) && length(sparse_bins) > 0) {
# Find which sparse bins will be hit
unique_flat <- unique(flat_indices[flat_indices >= 0]) + 1L # 1-based
sparse_used <- intersect(unique_flat, sparse_bins)
if (length(sparse_used) > 0) {
warning(sprintf(
"%d sparse bins (with < %d observations) will be used during sampling.\n%s",
length(sparse_used),
model$min_obs,
"These bins will use uniform base distribution. Consider using bin_merge to merge them."
), call. = FALSE)
# Replace NA CDFs with uniform distribution for sampling
# Uniform CDF: [0.25, 0.5, 0.75, 1.0]
uniform_cdf <- matrix(
rep(c(0.25, 0.5, 0.75, 1.0), each = 1024),
nrow = 1024, ncol = 4
)
for (bin_idx in sparse_used) {
cdf_list[[bin_idx]] <- uniform_cdf
}
}
}
# Output format: 0 = misha, 1 = fasta, 2 = vector
output_format_int <- switch(output_format,
misha = 0L,
fasta = 1L,
vector = 2L
)
if (n_samples > 1 || output_format == "vector") {
message(sprintf("Sampling synthetic genome (%d samples per interval)...", n_samples))
} else {
message("Sampling synthetic genome...")
}
# Call C++ sampling function
# Create dummy breaks that give total_bins bins (length = total_bins + 1)
dummy_breaks <- seq(0, model$total_bins, length.out = model$total_bins + 1)
# Use empty string for output_path if vector mode
output_path_str <- if (is.null(output_path)) "" else as.character(output_path)
result <- .gcall(
"C_gsynth_sample",
cdf_list,
as.numeric(dummy_breaks),
flat_indices,
iter_starts,
iter_chroms,
intervals,
mask_copy,
output_path_str,
output_format_int,
n_samples,
.misha_env()
)
if (output_format == "vector") {
message(sprintf("Generated %d sequence(s)", length(result)))
return(result)
}
message(sprintf("Synthetic genome written to: %s", output_path))
invisible(NULL)
}
#' Generate random genome sequences
#'
#' Generates random DNA sequences based on nucleotide probabilities without
#' using a trained Markov model. Each nucleotide is sampled independently
#' according to the specified probabilities.
#'
#' @param intervals Genomic intervals to sample. If NULL, uses all chromosomes.
#' @param output_path Path to the output file (ignored when output_format = "vector")
#' @param output_format Output format:
#' \itemize{
#' \item "misha": .seq binary format (default)
#' \item "fasta": FASTA text format
#' \item "vector": Return sequences as a character vector (does not write to file)
#' }
#' @param nuc_probs Nucleotide probabilities. Can be specified as:
#' \itemize{
#' \item A named vector: \code{c(A = 0.3, C = 0.2, G = 0.2, T = 0.3)}
#' \item An unnamed vector in A, C, G, T order: \code{c(0.3, 0.2, 0.2, 0.3)}
#' }
#' Probabilities are automatically normalized to sum to 1. Default is uniform
#' (0.25 each).
#' @param mask_copy Optional intervals to copy from the original genome instead of
#' random sampling. Use this to preserve specific regions exactly as they
#' appear in the reference.
#' @param seed Random seed for reproducibility. If NULL, uses current random state.
#' @param n_samples Number of samples to generate per interval. Default is 1.
#' @param iterator Iterator for position resolution. Default is 1 (base-pair resolution).
#' Larger values may speed up processing but are typically not needed for
#' random sampling.
#'
#' @details
#' Unlike \code{\link{gsynth.sample}} which uses a trained Markov model to generate
#' sequences that preserve k-mer statistics, \code{gsynth.random} generates purely
#' random sequences where each nucleotide is sampled independently. This is useful
#' for generating baseline random sequences or sequences with specific GC content.
#'
#' \strong{Nucleotide ordering:} When using an unnamed vector for \code{nuc_probs},
#' the order is A, C, G, T. Named vectors can be in any order.
#'
#' @return When output_format is "misha" or "fasta", returns invisible NULL and
#' writes the random sequences to output_path. When output_format is "vector",
#' returns a character vector of sequences (length = n_intervals * n_samples).
#'
#' @examples
#' gdb.init_examples()
#'
#' # Generate random sequences with uniform nucleotide probabilities
#' seqs <- gsynth.random(
#' intervals = gintervals(1, 0, 1000),
#' output_format = "vector",
#' seed = 42
#' )
#'
#' # Generate GC-rich sequences (60% GC)
#' gc_rich <- gsynth.random(
#' intervals = gintervals(1, 0, 1000),
#' output_format = "vector",
#' nuc_probs = c(A = 0.2, C = 0.3, G = 0.3, T = 0.2),
#' seed = 42
#' )
#'
#' # Generate AT-rich sequences
#' at_rich <- gsynth.random(
#' intervals = gintervals(1, 0, 1000),
#' output_format = "vector",
#' nuc_probs = c(A = 0.35, C = 0.15, G = 0.15, T = 0.35),
#' seed = 42
#' )
#'
#' @seealso \code{\link{gsynth.sample}}, \code{\link{gsynth.train}}
#' @export
gsynth.random <- function(intervals = NULL,
                          output_path = NULL,
                          output_format = c("misha", "fasta", "vector"),
                          nuc_probs = c(A = 0.25, C = 0.25, G = 0.25, T = 0.25),
                          mask_copy = NULL,
                          seed = NULL,
                          n_samples = 1,
                          iterator = 1) {
  .gcheckroot()
  output_format <- match.arg(output_format)

  # Validate output_path requirement: file formats need a destination.
  if (output_format != "vector" && is.null(output_path)) {
    stop("output_path is required when output_format is not 'vector'", call. = FALSE)
  }

  # Validate n_samples
  n_samples <- as.integer(n_samples)
  if (is.na(n_samples) || n_samples < 1) {
    stop("n_samples must be a positive integer", call. = FALSE)
  }

  # Validate and normalize nuc_probs. Reject NA values up front: previously an
  # NA slipped through to `any(nuc_probs < 0)`, which errors with a cryptic
  # "missing value where TRUE/FALSE needed".
  if (!is.numeric(nuc_probs) || length(nuc_probs) != 4 || anyNA(nuc_probs)) {
    stop("nuc_probs must be a numeric vector of length 4 (A, C, G, T)", call. = FALSE)
  }
  if (any(nuc_probs < 0)) {
    stop("nuc_probs values must be non-negative", call. = FALSE)
  }
  if (sum(nuc_probs) == 0) {
    stop("nuc_probs must have at least one positive value", call. = FALSE)
  }

  # Handle named vector - reorder to A, C, G, T.
  # BUGFIX: require each of A, C, G, T to appear exactly once. The previous
  # membership-only check (`%in%`) accepted duplicated names such as
  # c(A=, A=, C=, G=); match() then returned an NA index and NA probabilities
  # propagated silently into the CDF passed to C++.
  if (!is.null(names(nuc_probs))) {
    expected_names <- c("A", "C", "G", "T")
    given_names <- toupper(names(nuc_probs))
    if (anyDuplicated(given_names) > 0 || !setequal(given_names, expected_names)) {
      stop("nuc_probs names must be A, C, G, T", call. = FALSE)
    }
    # Reorder to A, C, G, T
    nuc_probs <- nuc_probs[match(expected_names, given_names)]
  }

  # Normalize to sum to 1
  nuc_probs <- nuc_probs / sum(nuc_probs)

  # Set random seed if provided
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Get intervals (default to all chromosomes)
  if (is.null(intervals)) {
    intervals <- gintervals.all()
  }

  # Chunk processor for parallel execution over large genomes.
  # Captures output_format, nuc_probs, mask_copy, n_samples, iterator from
  # the enclosing scope; recursively calls gsynth.random on one chunk.
  .process_random_chunk <- function(chunk_interval, chunk_index, chunk_seed = NULL, ...) {
    # Restrict the copy-mask to this chunk's chromosome so the recursive
    # call does not carry irrelevant intervals.
    chunk_mask <- if (!is.null(mask_copy)) {
      mask_copy[mask_copy$chrom == chunk_interval$chrom[1], ]
    } else {
      NULL
    }
    if (output_format == "vector") {
      # Vector mode: return sequences directly
      gsynth.random(
        intervals = chunk_interval,
        output_path = NULL,
        output_format = "vector",
        nuc_probs = nuc_probs,
        mask_copy = chunk_mask,
        seed = chunk_seed,
        n_samples = n_samples,
        iterator = iterator
      )
    } else {
      # File mode: write to temp file and return its path for later merging
      temp_file <- tempfile(fileext = if (output_format == "fasta") ".fa" else ".seq")
      gsynth.random(
        intervals = chunk_interval,
        output_path = temp_file,
        output_format = output_format,
        nuc_probs = nuc_probs,
        mask_copy = chunk_mask,
        seed = chunk_seed,
        n_samples = n_samples,
        iterator = iterator
      )
      temp_file
    }
  }

  # Try parallel processing for large genomes; returns NULL when the genome
  # is small enough to handle inline (or, for file output, after writing).
  parallel_result <- .gsynth_process_parallel(
    intervals = intervals,
    output_format = output_format,
    output_path = output_path,
    process_chunk_fn = .process_random_chunk,
    max_chunk_size = .GSYNTH_MAX_CHUNK_SIZE,
    seed = seed
  )
  # If parallel processing handled it, return the result
  if (!is.null(parallel_result)) {
    message(sprintf("Generated %d random sequence(s)", length(parallel_result)))
    return(parallel_result)
  }
  if (is.null(parallel_result) && sum(intervals$end - intervals$start) > .GSYNTH_MAX_CHUNK_SIZE) {
    # File output was written by parallel processing
    message(sprintf("Random sequences written to: %s", output_path))
    return(invisible(NULL))
  }

  # Set up iterator (evaluated in the caller's frame, misha convention)
  .iterator <- do.call(.giterator, list(substitute(iterator)), envir = parent.frame())

  # Get position info using gextract
  message("Setting up random sampling positions...")
  iter_info <- gextract("1", intervals = intervals, iterator = .iterator)
  if (is.null(iter_info) || nrow(iter_info) == 0) {
    stop("No positions extracted. Check that intervals are valid.", call. = FALSE)
  }
  n_positions <- nrow(iter_info)

  # All positions map to bin 0 (single bin; no stratification for pure random)
  flat_indices <- rep(0L, n_positions)

  # Get chromosome info (0-based chromosome codes for C++)
  chrom_sizes <- gintervals.chrom_sizes(intervals)
  iter_chroms <- match(as.character(iter_info$chrom), chrom_sizes$chrom) - 1L
  iter_starts <- as.integer(iter_info$start)

  # Create CDF from probabilities: [P(A), P(A)+P(C), P(A)+P(C)+P(G), 1]
  # Order is A=0, C=1, G=2, T=3 (same as C++ code)
  cdf_row <- cumsum(as.numeric(nuc_probs))
  # Create CDF matrix: same for all 1024 contexts (no context dependency)
  # Each row is the same CDF - pure random, context-independent sampling
  cdf_matrix <- matrix(rep(cdf_row, 1024), nrow = 1024, ncol = 4, byrow = TRUE)
  cdf_list <- list(cdf_matrix)

  # Output format: 0 = misha, 1 = fasta, 2 = vector
  output_format_int <- switch(output_format,
    misha = 0L,
    fasta = 1L,
    vector = 2L
  )
  if (n_samples > 1 || output_format == "vector") {
    message(sprintf("Generating random sequences (%d samples per interval)...", n_samples))
  } else {
    message("Generating random sequences...")
  }

  # Use empty string for output_path if vector mode
  output_path_str <- if (is.null(output_path)) "" else as.character(output_path)
  # Create dummy breaks for single bin
  dummy_breaks <- c(0, 1)
  result <- .gcall(
    "C_gsynth_sample",
    cdf_list,
    as.numeric(dummy_breaks),
    flat_indices,
    iter_starts,
    iter_chroms,
    intervals,
    mask_copy,
    output_path_str,
    output_format_int,
    n_samples,
    .misha_env()
  )
  if (output_format == "vector") {
    message(sprintf("Generated %d random sequence(s)", length(result)))
    return(result)
  }
  message(sprintf("Random sequences written to: %s", output_path))
  invisible(NULL)
}
#' Iteratively replace a k-mer in the genome
#'
#' Performs an iterative replacement of a \code{target} k-mer with a
#' \code{replacement} sequence. This is useful for creating synthetic genomes
#' with specific motifs removed (e.g., creating a CpG-null genome by iteratively
#' swapping CG to GC).
#'
#' @param target The k-mer sequence to remove (e.g., "CG").
#' @param replacement The replacement sequence (e.g., "GC").
#' @param output_path Path to the output file (ignored when output_format = "vector").
#' @param output_format Output format:
#' \itemize{
#' \item "misha": .seq binary format (default)
#' \item "fasta": FASTA text format
#' \item "vector": Return sequences as a character vector (does not write to file)
#' }
#' @param intervals Genomic intervals to process. If NULL, uses all chromosomes.
#' @param check_composition Logical. If TRUE (default), ensures target and
#' replacement have the same nucleotide composition (preserving exact
#' base counts).
#'
#' @details
#' \strong{Bubble Sort / Iterative Logic:} The function scans the sequence and
#' replaces occurrences of \code{target} with \code{replacement}. If a replacement
#' creates a new instance of \code{target} (e.g., removing "CG" with "GC" in
#' the sequence "CCG" -> "CGC"), the new instance is also replaced. This continues
#' until the sequence is free of the \code{target} k-mer.
#'
#' When \code{target} and \code{replacement} are permutations of each other
#' (e.g., "CG" and "GC"), this acts as a "bubble sort" of nucleotides, moving
#' bases locally without altering the total GC content or base counts of the genome.
#'
#' @return When output_format is "misha" or "fasta", returns invisible NULL and
#' writes to output_path. When output_format is "vector", returns a
#' character vector of modified sequences.
#'
#' @examples
#' \dontrun{
#' # Robust removal of all CpG dinucleotides (preserving GC%)
#' gsynth.replace_kmer(
#' target = "CG",
#' replacement = "GC",
#' output_path = "genome_no_cpg.seq",
#' output_format = "misha"
#' )
#' }
#' @export
gsynth.replace_kmer <- function(target,
                                replacement,
                                output_path = NULL,
                                output_format = c("misha", "fasta", "vector"),
                                intervals = NULL,
                                check_composition = TRUE) {
  .gcheckroot()
  output_format <- match.arg(output_format)

  # --- Input validation (guard clauses) ---
  if (!is.character(target) || length(target) != 1 || nchar(target) == 0) {
    stop("target must be a non-empty string", call. = FALSE)
  }
  if (!is.character(replacement) || length(replacement) != 1 || nchar(replacement) == 0) {
    stop("replacement must be a non-empty string", call. = FALSE)
  }
  if (target == replacement) {
    stop("target and replacement cannot be identical", call. = FALSE)
  }

  # Composition check: target and replacement must be permutations of each
  # other (same multiset of bases) so the swap preserves exact base counts.
  if (check_composition) {
    target_bases <- sort(strsplit(target, "")[[1]])
    replacement_bases <- sort(strsplit(replacement, "")[[1]])
    if (!identical(target_bases, replacement_bases)) {
      stop(sprintf(
        "Composition mismatch between '%s' and '%s'. Set check_composition=FALSE to override.",
        target, replacement
      ), call. = FALSE)
    }
  }

  # File formats require a destination path.
  if (output_format != "vector" && is.null(output_path)) {
    stop("output_path is required when output_format is not 'vector'", call. = FALSE)
  }

  if (is.null(intervals)) {
    intervals <- gintervals.all()
  }

  # Worker for chunked (parallel) processing of large genomes. Captures
  # target/replacement/output_format from this frame. Composition was
  # validated above, so the recursive per-chunk calls skip the check.
  .replace_chunk_worker <- function(chunk_interval, chunk_index, chunk_seed = NULL, ...) {
    if (output_format == "vector") {
      # Vector mode: return the modified sequences directly.
      return(gsynth.replace_kmer(
        target = target,
        replacement = replacement,
        output_path = NULL,
        output_format = "vector",
        intervals = chunk_interval,
        check_composition = FALSE
      ))
    }
    # File mode: write this chunk to a temp file and return its path.
    chunk_file <- tempfile(fileext = switch(output_format, fasta = ".fa", ".seq"))
    gsynth.replace_kmer(
      target = target,
      replacement = replacement,
      output_path = chunk_file,
      output_format = output_format,
      intervals = chunk_interval,
      check_composition = FALSE
    )
    chunk_file
  }

  # Attempt chunked parallel processing for large genomes.
  parallel_result <- .gsynth_process_parallel(
    intervals = intervals,
    output_format = output_format,
    output_path = output_path,
    process_chunk_fn = .replace_chunk_worker,
    max_chunk_size = .GSYNTH_MAX_CHUNK_SIZE
  )
  if (!is.null(parallel_result)) {
    # Parallel processing produced the result (vector mode).
    return(parallel_result)
  }
  total_span <- sum(intervals$end - intervals$start)
  if (total_span > .GSYNTH_MAX_CHUNK_SIZE) {
    # File output was already written by the parallel path.
    message(sprintf("Modified genome written to: %s", output_path))
    return(invisible(NULL))
  }

  # Map format to the integer code understood by C++ (0 = misha, 1 = fasta,
  # 2 = vector) and resolve the (possibly NULL) output path.
  format_code <- switch(output_format,
    misha = 0L,
    fasta = 1L,
    vector = 2L
  )
  resolved_path <- if (is.null(output_path)) "" else as.character(output_path)

  message(sprintf("Iteratively replacing '%s' with '%s'...", target, replacement))
  sequences <- .gcall(
    "C_gsynth_replace_kmer",
    as.character(target),
    as.character(replacement),
    intervals,
    resolved_path,
    format_code,
    .misha_env()
  )
  if (output_format == "vector") {
    return(sequences)
  }
  message(sprintf("Modified genome written to: %s", output_path))
  invisible(NULL)
}
# NOTE(review): the lines below are website-boilerplate residue from a scraped
# documentation page; commented out because bare prose is a parse error in R.
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.