Nothing
# Internal helper function to check if the current database is in indexed format
# @return Logical. TRUE if the database is indexed, FALSE otherwise
.gdb.is_indexed <- function() {
groot <- get("GROOT", envir = .misha)
if (is.null(groot) || groot == "") {
return(FALSE)
}
seq_dir <- file.path(groot, "seq")
if (!dir.exists(seq_dir)) {
return(FALSE)
}
index_path <- file.path(seq_dir, "genome.idx")
genome_seq_path <- file.path(seq_dir, "genome.seq")
return(file.exists(index_path) && file.exists(genome_seq_path))
}
#' Change Database to Indexed Genome Format
#'
#' Converts a per-chromosome database to indexed genome format
#' with a single consolidated genome.seq file and genome.idx index.
#' Optionally also converts tracks and interval sets to indexed format.
#'
#' @param groot Root directory of the database to change to indexed format. If NULL, uses the currently active database.
#' @param remove_old_files Logical. If TRUE, removes old per-chromosome files after successful conversion. Default: FALSE.
#' @param force Logical. If TRUE, forces the conversion without confirmation. Default: FALSE.
#' @param validate Logical. If TRUE, validates the conversion by comparing sequences. Default: TRUE.
#' @param convert_tracks Logical. If TRUE, also converts all eligible tracks to indexed format. Default: FALSE.
#' @param convert_intervals Logical. If TRUE, also converts all eligible interval sets to indexed format. Default: FALSE.
#' @param verbose Logical. If TRUE, prints verbose messages. Default: FALSE.
#' @param chunk_size Integer. The size of the chunk to read from the sequence files. Default: 104857600 (100MB). Reduce if
#' you are running into memory issues.
#'
#' @return Invisible NULL
#'
#' @details
#' This function converts a per-chromosome database (with separate .seq files per contig) to
#' indexed format (single genome.seq + genome.idx). The indexed format
#' provides better performance and scalability, especially for genomes with many contigs.
#'
#' \strong{Important: Preserving Chromosome Order}
#'
#' For exact conversion that produces bit-for-bit identical results before and after conversion,
#' you should load the source database first using \code{gsetroot()} or \code{gdb.init()}:
#' \itemize{
#' \item If database is loaded: Uses chromosome order from ALLGENOME (exact preservation)
#' \item If database is not loaded: Uses order from chrom_sizes.txt (may differ from ALLGENOME)
#' }
#'
#' This ensures that the converted database has the exact same chromosome ordering, which affects
#' iteration order, interval IDs, and other operations that depend on chromosome order.
#'
#' The conversion process:
#' \enumerate{
#' \item Checks if database is already in indexed format
#' \item Gets chromosome order from ALLGENOME (if loaded) or chrom_sizes.txt
#' \item Consolidates all per-chromosome .seq files into genome.seq
#' \item Creates genome.idx with CRC64 checksum
#' \item Optionally validates the conversion
#' \item Optionally removes old .seq files
#' \item If convert_tracks=TRUE, converts all eligible 1D tracks (dense, sparse, array)
#' \item If convert_intervals=TRUE, converts all eligible interval sets (1D and 2D)
#' }
#'
#' Tracks and intervals that cannot be converted (and are skipped):
#' \itemize{
#' \item Tracks: 2D tracks, virtual tracks, single-file tracks, already converted tracks
#' \item Intervals: Single-file interval sets, already converted interval sets
#' }
#'
#' @examples
#' \dontrun{
#' # Recommended: Load database first for exact conversion
#' gsetroot("/path/to/database")
#' gdb.convert_to_indexed(
#' convert_tracks = TRUE,
#' convert_intervals = TRUE,
#' remove_old_files = TRUE,
#' verbose = TRUE
#' )
#'
#' # Convert current database to indexed format (genome only)
#' gdb.convert_to_indexed()
#'
#' # Convert specific database without loading it first
#' # Note: chromosome order may differ from ALLGENOME
#' gdb.convert_to_indexed(groot = "/path/to/database")
#'
#' # Convert genome and all tracks to indexed format
#' gdb.convert_to_indexed(convert_tracks = TRUE)
#'
#' # Full conversion with validation and cleanup
#' gsetroot("/path/to/database") # Load first for exact order preservation
#' gdb.convert_to_indexed(
#' convert_tracks = TRUE,
#' convert_intervals = TRUE,
#' remove_old_files = TRUE,
#' validate = TRUE,
#' verbose = TRUE
#' )
#' }
#'
#' @seealso \code{\link{gdb.create}}, \code{\link{gdb.init}}, \code{\link{gtrack.convert_to_indexed}}, \code{\link{gintervals.convert_to_indexed}}, \code{\link{gintervals.2d.convert_to_indexed}}
#' @export
gdb.convert_to_indexed <- function(groot = NULL, remove_old_files = FALSE, force = FALSE, validate = TRUE, convert_tracks = FALSE, convert_intervals = FALSE, verbose = FALSE, chunk_size = 104857600) {
# Validate database and get setup information
setup_info <- .gdb.convert_to_indexed.validate_and_setup(groot, verbose)
# Return early if already indexed
if (setup_info$already_indexed && !force && !convert_tracks && !convert_intervals) {
return(invisible(NULL))
}
if (!setup_info$already_indexed) {
# Get user confirmation
if (!.gdb.convert_to_indexed.get_confirmation(setup_info$groot, setup_info$chrom_sizes, remove_old_files, force)) {
return(invisible(NULL))
}
# Convert genome sequences
.gdb.convert_to_indexed.genome(setup_info, validate, remove_old_files, verbose = verbose, chunk_size = chunk_size)
}
# Convert tracks if requested
if (convert_tracks) {
.gdb.convert_to_indexed.tracks(setup_info$groot, verbose)
}
# Convert intervals if requested
if (convert_intervals) {
.gdb.convert_to_indexed.intervals(setup_info$groot, remove_old_files, verbose)
}
if (verbose) message("\n=== Conversion Complete ===")
invisible(NULL)
}
# Helper function to validate database and get chromosome information
.gdb.convert_to_indexed.validate_and_setup <- function(groot, verbose = FALSE) {
# Use current database if not specified
if (is.null(groot)) {
groot <- get("GROOT", envir = .misha)
if (is.null(groot) || groot == "") {
stop("No database is currently active. Please call gdb.init() or specify groot parameter.", call. = FALSE)
}
}
# Check if database exists
if (!dir.exists(groot)) {
stop(sprintf("Database directory does not exist: %s", groot), call. = FALSE)
}
seq_dir <- file.path(groot, "seq")
if (!dir.exists(seq_dir)) {
stop(sprintf("seq directory does not exist: %s", seq_dir), call. = FALSE)
}
# Check if already in indexed format
index_path <- file.path(seq_dir, "genome.idx")
genome_seq_path <- file.path(seq_dir, "genome.seq")
if (file.exists(index_path) && file.exists(genome_seq_path)) {
if (verbose) message("Database is already in indexed format.")
return(list(already_indexed = TRUE, groot = groot))
}
# Get canonical chromosome names and order from ALLGENOME if database is currently loaded
# This ensures the converted database has the exact same order as the source
# IMPORTANT: For exact conversion, initialize the database first with gsetroot(groot)
canonical_names <- NULL
# Check if the database being converted is currently loaded
if (exists("GROOT", envir = .misha, inherits = FALSE) &&
exists("ALLGENOME", envir = .misha, inherits = FALSE)) {
current_groot <- get("GROOT", envir = .misha)
# Only use ALLGENOME if it's from the database we're converting
if (!is.null(current_groot) && normalizePath(current_groot) == normalizePath(groot)) {
allgenome <- get("ALLGENOME", envir = .misha)
if (!is.null(allgenome[[1]]) && "chrom" %in% colnames(allgenome[[1]])) {
canonical_names <- as.character(allgenome[[1]]$chrom)
if (verbose) {
message(sprintf("Using chromosome order from loaded database (%d chromosomes)", length(canonical_names)))
}
}
} else {
if (verbose) {
message("Database not currently loaded - using chrom_sizes.txt order")
message("For exact conversion preserving chromosome order, run gsetroot() first")
}
}
} else {
if (verbose) {
message("No database currently loaded - using chrom_sizes.txt order")
message("For exact conversion preserving chromosome order, run gsetroot() first")
}
}
# Read chromosome information
chrom_sizes_path <- file.path(groot, "chrom_sizes.txt")
if (!file.exists(chrom_sizes_path)) {
stop(sprintf("chrom_sizes.txt not found: %s", chrom_sizes_path), call. = FALSE)
}
chrom_sizes <- read.table(chrom_sizes_path, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
colnames(chrom_sizes) <- c("chrom", "size")
# Store original order index to preserve order even when modifying chromosome names
original_order_index <- seq_len(nrow(chrom_sizes))
# If we have canonical names from ALLGENOME, use them AND their order
# This ensures exact same chromosome order before and after conversion
if (!is.null(canonical_names) && length(canonical_names) == nrow(chrom_sizes)) {
# ALLGENOME order is the source of truth - preserve it exactly
# This ensures that converting per-chromosome -> indexed produces identical results
# Create mapping from chrom_sizes names to ALLGENOME names
original_chrom_names <- chrom_sizes$chrom
# Build reverse lookup: for each chrom_sizes name, find its corresponding ALLGENOME index
allgenome_indices <- integer(length(original_chrom_names))
for (i in seq_along(original_chrom_names)) {
orig_chrom <- original_chrom_names[i]
# Try exact match first
match_idx <- which(canonical_names == orig_chrom)
if (length(match_idx) > 0) {
allgenome_indices[i] <- match_idx[1]
} else {
# Try with/without chr prefix
if (startsWith(orig_chrom, "chr")) {
no_chr <- sub("^chr", "", orig_chrom)
match_idx <- which(canonical_names == no_chr)
} else {
chr_version <- paste0("chr", orig_chrom)
match_idx <- which(canonical_names == chr_version)
}
if (length(match_idx) > 0) {
allgenome_indices[i] <- match_idx[1]
} else {
# Fallback: maintain original position
allgenome_indices[i] <- i
}
}
}
# Reorder chrom_sizes to match ALLGENOME order
chrom_sizes <- chrom_sizes[order(allgenome_indices), ]
chrom_sizes$chrom <- canonical_names
original_order_index <- original_order_index[order(allgenome_indices)]
}
# Note: When ALLGENOME is available, we use its order to ensure exact equivalence
# When not available, we preserve chrom_sizes.txt order
# Check that per-chromosome .seq files exist
# Handle chr prefix mismatch between chrom_sizes.txt and .seq files
# Also preserve the actual chromosome names from the seq files
seq_files <- character(nrow(chrom_sizes))
actual_chrom_names <- character(nrow(chrom_sizes))
for (i in seq_len(nrow(chrom_sizes))) {
chrom <- chrom_sizes$chrom[i]
found_chrom <- chrom # Track the actual chromosome name found
# Try the chromosome name as-is first
seq_file <- file.path(seq_dir, paste0(chrom, ".seq"))
if (file.exists(seq_file)) {
seq_files[i] <- seq_file
actual_chrom_names[i] <- chrom
next
}
# If not found, try with chr prefix
if (!startsWith(chrom, "chr")) {
chr_seq_file <- file.path(seq_dir, paste0("chr", chrom, ".seq"))
if (file.exists(chr_seq_file)) {
seq_files[i] <- chr_seq_file
actual_chrom_names[i] <- paste0("chr", chrom)
next
}
}
# If not found, try without chr prefix
if (startsWith(chrom, "chr")) {
no_chr_chrom <- sub("^chr", "", chrom)
no_chr_seq_file <- file.path(seq_dir, paste0(no_chr_chrom, ".seq"))
if (file.exists(no_chr_seq_file)) {
seq_files[i] <- no_chr_seq_file
actual_chrom_names[i] <- no_chr_chrom
next
}
}
# If still not found, this is a missing file
seq_files[i] <- seq_file # Use original name for error reporting
actual_chrom_names[i] <- chrom
}
missing_files <- seq_files[!file.exists(seq_files)]
if (length(missing_files) > 0) {
stop(sprintf("Missing sequence files: %s", paste(basename(missing_files), collapse = ", ")), call. = FALSE)
}
# Only update chromosome names if we don't have canonical names from ALLGENOME
# If ALLGENOME is available, we already set the canonical names and reordered to match
if (is.null(canonical_names)) {
# Update chrom_sizes with actual chromosome names found in files
# This preserves the chr prefix if files are named with chr prefix
chrom_sizes$chrom <- actual_chrom_names
}
return(list(
already_indexed = FALSE,
groot = groot,
seq_dir = seq_dir,
chrom_sizes = chrom_sizes,
seq_files = seq_files,
index_path = index_path,
genome_seq_path = genome_seq_path,
chrom_sizes_path = chrom_sizes_path,
original_order_index = original_order_index
))
}
# Helper function to get user confirmation for conversion
.gdb.convert_to_indexed.get_confirmation <- function(groot, chrom_sizes, remove_old_files, force) {
if (interactive() && !force) {
cat(sprintf("About to convert database to indexed format: %s\n", groot))
cat(sprintf(" Chromosomes: %d\n", nrow(chrom_sizes)))
cat(sprintf(" Total size: %.2f MB\n", sum(chrom_sizes$size) / 1024^2))
if (remove_old_files) {
cat(" Old .seq files will be REMOVED after conversion\n")
}
response <- readline("Proceed with conversion? (yes/no): ")
if (!(tolower(response) %in% c("yes", "y"))) {
message("Conversion cancelled.")
return(FALSE)
}
}
return(TRUE)
}
# Helper function to convert genome sequences to indexed format
.gdb.convert_to_indexed.genome <- function(setup_info, validate = TRUE, remove_old_files = FALSE, verbose = FALSE, chunk_size = 104857600) {
groot <- setup_info$groot
chrom_sizes <- setup_info$chrom_sizes
seq_files <- setup_info$seq_files
index_path <- setup_info$index_path
genome_seq_path <- setup_info$genome_seq_path
chrom_sizes_path <- setup_info$chrom_sizes_path
if (verbose) message("Converting database to indexed format...")
# Create temporary FASTA file from .seq files
temp_fasta <- tempfile(fileext = ".fasta")
on.exit(unlink(temp_fasta), add = TRUE)
tryCatch(
{
# Write multi-FASTA file
if (verbose) message("Creating temporary multi-FASTA file...")
fasta_con <- file(temp_fasta, "wb") # Use binary mode for faster writes
for (i in seq_len(nrow(chrom_sizes))) {
chrom <- chrom_sizes$chrom[i]
seq_file <- seq_files[i]
# Write header
writeBin(charToRaw(sprintf(">%s\n", chrom)), fasta_con)
# Copy sequence file directly - no line breaking needed
# The C++ parser handles long lines just fine
seq_con <- file(seq_file, "rb")
repeat {
chunk <- readBin(seq_con, "raw", n = chunk_size)
if (length(chunk) == 0) break
writeBin(chunk, fasta_con)
}
close(seq_con)
# Write newline after sequence
writeBin(charToRaw("\n"), fasta_con)
if ((i %% 10) == 0 || i == nrow(chrom_sizes)) {
if (verbose) message(sprintf(" Processed %d/%d chromosomes", i, nrow(chrom_sizes)))
}
}
close(fasta_con)
# Call C++ import function
# Use sort=FALSE to preserve the chromosome order from chrom_sizes.txt
if (verbose) message("Creating indexed format...")
contig_info <- .gcall(
"gseq_multifasta_import",
temp_fasta,
genome_seq_path,
index_path,
FALSE, # sort=FALSE to preserve chrom_sizes.txt order
.misha_env()
)
if (verbose) message("Index created successfully")
# Update chrom_sizes.txt with the canonical chromosome names from the index
# This ensures consistency between chrom_sizes.txt and genome.idx
# contig_info is now in FASTA order (same as our input order) since C++ no longer sorts
if (verbose) message("Updating chrom_sizes.txt with canonical chromosome names...")
updated_chrom_sizes <- data.frame(
chrom = contig_info$name,
size = contig_info$size,
stringsAsFactors = FALSE
)
write.table(updated_chrom_sizes, chrom_sizes_path,
quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
)
# Validate if requested
if (validate) {
if (verbose) message("Validating conversion...")
# Save current state before validation
old_groot <- NULL
if (exists("GROOT", envir = .misha, inherits = FALSE)) {
old_groot <- get("GROOT", envir = .misha)
}
# Sample validation: check first 100 bases of each chromosome
validation_failed <- FALSE
tryCatch({
# Init the converted database once for all validations
suppressMessages(gdb.init(groot))
for (i in seq_len(min(10, nrow(chrom_sizes)))) { # Check first 10 chroms
chrom_name <- updated_chrom_sizes$chrom[i]
seq_file <- seq_files[i]
# Read from old file
old_con <- file(seq_file, "rb")
old_seq <- toupper(rawToChar(readBin(old_con, "raw", n = 100)))
close(old_con)
# Extract from indexed format
new_seq <- toupper(gseq.extract(gintervals(chrom_name, 0, min(100, updated_chrom_sizes$size[i]))))
if (old_seq != new_seq) {
if (verbose) {
message(sprintf("\nValidation mismatch for %s:", chrom_name))
message(sprintf(" Old (first 60): %s", substr(old_seq, 1, 60)))
message(sprintf(" New (first 60): %s", substr(new_seq, 1, 60)))
message(sprintf(" Seq file: %s", seq_file))
}
warning(sprintf("Validation failed for chromosome %s", chrom_name))
validation_failed <- TRUE
} else if (verbose && i <= 3) {
message(sprintf(" [OK] %s validated successfully", chrom_name))
}
}
}, finally = {
# Restore old state once after all validations
if (!is.null(old_groot) && old_groot != "") {
suppressMessages(gdb.init(old_groot))
}
})
if (validation_failed) {
stop("Validation failed! Conversion may be corrupted. Old files have NOT been removed.", call. = FALSE)
} else {
if (verbose) message("Validation passed")
}
}
# Remove old files if requested
if (remove_old_files) {
if (verbose) message("Removing old .seq files...")
for (seq_file in seq_files) {
unlink(seq_file)
}
if (verbose) message(sprintf("Removed %d old .seq files", length(seq_files)))
}
if (verbose) message(sprintf("Database sequence conversion complete: %s", groot))
},
error = function(e) {
# Clean up partial files on error
if (file.exists(genome_seq_path)) {
unlink(genome_seq_path)
}
if (file.exists(index_path)) {
unlink(index_path)
}
stop(sprintf("Conversion failed: %s", conditionMessage(e)), call. = FALSE)
}
)
}
# Helper function to convert tracks to indexed format
.gdb.convert_to_indexed.tracks <- function(groot, verbose = FALSE) {
if (verbose) message("\n=== Converting Tracks ===")
# Temporarily init the database to get track list
old_groot <- NULL
if (exists("GROOT", envir = .misha, inherits = FALSE)) {
old_groot <- get("GROOT", envir = .misha)
}
tryCatch({
suppressMessages(gdb.init(groot))
all_tracks <- gtrack.ls()
if (length(all_tracks) == 0) {
if (verbose) message("No tracks found in database")
} else {
if (verbose) message(sprintf("Found %d tracks in database", length(all_tracks)))
# Filter to only 1D tracks that can be converted
convertible_tracks <- c()
skipped_tracks <- list()
for (track in all_tracks) {
# Get track info to determine type
info <- tryCatch(gtrack.info(track), error = function(e) NULL)
if (is.null(info)) {
skipped_tracks[[track]] <- "failed to get info"
next
}
# Only 1D tracks (dense, sparse, array) can be converted
if (!info$type %in% c("dense", "sparse", "array")) {
skipped_tracks[[track]] <- sprintf("unsupported type (%s)", info$type)
next
}
# Check if it's a Big Set track (directory format)
trackstr <- gsub("\\.", "/", track)
trackdir <- sprintf("%s.track", paste(groot, "tracks", trackstr, sep = "/"))
if (!dir.exists(trackdir)) {
skipped_tracks[[track]] <- "single-file format"
next
}
# Check if already converted
idx_path <- file.path(trackdir, "track.idx")
if (file.exists(idx_path)) {
skipped_tracks[[track]] <- "already converted"
next
}
convertible_tracks <- c(convertible_tracks, track)
}
# Report what we found
if (length(convertible_tracks) > 0) {
if (verbose) message(sprintf(" Convertible: %d tracks", length(convertible_tracks)))
} else {
if (verbose) message(" No tracks need conversion")
}
if (length(skipped_tracks) > 0) {
if (verbose) message(sprintf(" Skipped: %d tracks", length(skipped_tracks)))
for (track in names(skipped_tracks)) {
if (verbose) message(sprintf(" - %s: %s", track, skipped_tracks[[track]]))
}
}
# Convert tracks
if (length(convertible_tracks) > 0) {
converted_count <- 0
failed_tracks <- c()
for (track in convertible_tracks) {
if (verbose) message(sprintf(" Converting track: %s", track))
tryCatch(
{
gtrack.convert_to_indexed(track)
converted_count <- converted_count + 1
},
error = function(e) {
warning(sprintf("Failed to convert track %s: %s", track, conditionMessage(e)))
failed_tracks <<- c(failed_tracks, track)
}
)
}
if (verbose) message(sprintf("Successfully converted %d/%d tracks", converted_count, length(convertible_tracks)))
if (length(failed_tracks) > 0) {
warning(sprintf(
"Failed to convert %d tracks: %s",
length(failed_tracks),
paste(failed_tracks, collapse = ", ")
))
}
}
}
}, finally = {
# Restore old state
if (!is.null(old_groot) && old_groot != "") {
suppressMessages(gdb.init(old_groot))
}
})
}
# Helper function to convert interval sets to indexed format
.gdb.convert_to_indexed.intervals <- function(groot, remove_old_files = FALSE, verbose = FALSE) {
if (verbose) message("\n=== Converting Interval Sets ===")
# Temporarily init the database to get interval list
old_groot <- NULL
if (exists("GROOT", envir = .misha, inherits = FALSE)) {
old_groot <- get("GROOT", envir = .misha)
}
tryCatch({
suppressMessages(gdb.init(groot))
all_intervals <- gintervals.ls()
if (length(all_intervals) == 0) {
if (verbose) message("No interval sets found in database")
} else {
if (verbose) message(sprintf("Found %d interval sets in database", length(all_intervals)))
# Filter to only Big Set intervals that can be converted
convertible_1d <- c()
convertible_2d <- c()
skipped_intervals <- list()
for (intervset in all_intervals) {
# Construct path
path <- gsub("\\.", "/", intervset)
intervset_path <- paste0(groot, "/tracks/", path, ".interv")
# Check if it's a Big Set (directory format)
if (!file.exists(intervset_path)) {
skipped_intervals[[intervset]] <- "does not exist"
next
}
if (!dir.exists(intervset_path)) {
skipped_intervals[[intervset]] <- "single-file format"
next
}
# Determine if 1D or 2D
idx_path_1d <- file.path(intervset_path, "intervals.idx")
idx_path_2d <- file.path(intervset_path, "intervals2d.idx")
# Check for 2D files
pair_files <- list.files(intervset_path, pattern = "-")
has_2d <- length(pair_files) > 0
if (has_2d) {
# 2D interval set
if (file.exists(idx_path_2d)) {
skipped_intervals[[intervset]] <- "already converted (2D)"
} else {
convertible_2d <- c(convertible_2d, intervset)
}
} else {
# 1D interval set
if (file.exists(idx_path_1d)) {
skipped_intervals[[intervset]] <- "already converted (1D)"
} else {
convertible_1d <- c(convertible_1d, intervset)
}
}
}
# Report what we found
if (length(convertible_1d) > 0) {
if (verbose) message(sprintf(" Convertible 1D: %d interval sets", length(convertible_1d)))
}
if (length(convertible_2d) > 0) {
if (verbose) message(sprintf(" Convertible 2D: %d interval sets", length(convertible_2d)))
}
if (length(convertible_1d) == 0 && length(convertible_2d) == 0) {
if (verbose) message(" No interval sets need conversion")
}
if (length(skipped_intervals) > 0) {
if (verbose) message(sprintf(" Skipped: %d interval sets", length(skipped_intervals)))
for (intervset in names(skipped_intervals)) {
if (verbose) message(sprintf(" - %s: %s", intervset, skipped_intervals[[intervset]]))
}
}
# Convert 1D intervals
converted_1d_count <- 0
failed_1d <- c()
if (length(convertible_1d) > 0) {
for (intervset in convertible_1d) {
if (verbose) message(sprintf(" Converting 1D interval set: %s", intervset))
tryCatch(
{
gintervals.convert_to_indexed(intervset, remove.old = remove_old_files)
converted_1d_count <- converted_1d_count + 1
},
error = function(e) {
warning(sprintf("Failed to convert 1D interval set %s: %s", intervset, conditionMessage(e)))
failed_1d <<- c(failed_1d, intervset)
}
)
}
}
# Convert 2D intervals
converted_2d_count <- 0
failed_2d <- c()
if (length(convertible_2d) > 0) {
for (intervset in convertible_2d) {
if (verbose) message(sprintf(" Converting 2D interval set: %s", intervset))
tryCatch(
{
gintervals.2d.convert_to_indexed(intervset, remove.old = remove_old_files)
converted_2d_count <- converted_2d_count + 1
},
error = function(e) {
warning(sprintf("Failed to convert 2D interval set %s: %s", intervset, conditionMessage(e)))
failed_2d <<- c(failed_2d, intervset)
}
)
}
}
# Report results
total_converted <- converted_1d_count + converted_2d_count
total_convertible <- length(convertible_1d) + length(convertible_2d)
if (total_convertible > 0) {
if (verbose) {
message(sprintf(
"Successfully converted %d/%d interval sets (%d 1D, %d 2D)",
total_converted,
total_convertible,
converted_1d_count,
converted_2d_count
))
}
if (length(failed_1d) > 0 || length(failed_2d) > 0) {
all_failed <- c(failed_1d, failed_2d)
warning(sprintf(
"Failed to convert %d interval sets: %s",
length(all_failed),
paste(all_failed, collapse = ", ")
))
}
}
}
}, finally = {
# Restore old state
if (!is.null(old_groot) && old_groot != "") {
suppressMessages(gdb.init(old_groot))
}
})
}
#' Get Database Information
#'
#' Returns information about a misha genome database including format, number of chromosomes,
#' total genome size, and whether it uses the indexed format.
#'
#' @param groot Root directory of the database. If NULL, uses the currently active database.
#' @return A list with database information:
#' \itemize{
#' \item \code{path} - Full path to the database
#' \item \code{is_db} - TRUE if this is a valid misha database
#' \item \code{format} - "indexed" or "per-chromosome"
#' \item \code{num_chromosomes} - Number of chromosomes/contigs
#' \item \code{genome_size} - Total length of genome in bases
#' \item \code{chromosomes} - Data frame with chromosome names and sizes
#' }
#'
#' @examples
#' \dontrun{
#' # Get info about currently active database
#' info <- gdb.info()
#' cat("Database format:", info$format, "\n")
#' cat("Genome size:", info$genome_size / 1e6, "Mb\n")
#'
#' # Get info about specific database
#' info <- gdb.info("/path/to/database")
#' }
#'
#' @export gdb.info
gdb.info <- function(groot = NULL) {
# Use current database if not specified
if (is.null(groot)) {
if (!exists("GROOT", envir = .misha) || is.null(get("GROOT", envir = .misha))) {
stop("No database is currently active. Please call gdb.init() or specify groot parameter.", call. = FALSE)
}
groot <- get("GROOT", envir = .misha)
}
# Normalize path
groot <- normalizePath(groot, mustWork = FALSE)
# Check if directory exists
if (!dir.exists(groot)) {
return(list(
path = groot,
is_db = FALSE,
error = "Directory does not exist"
))
}
# Check for chrom_sizes.txt
chrom_sizes_path <- file.path(groot, "chrom_sizes.txt")
if (!file.exists(chrom_sizes_path)) {
return(list(
path = groot,
is_db = FALSE,
error = "Not a misha database (chrom_sizes.txt not found)"
))
}
# Read chromosome information
chrom_sizes <- tryCatch(
read.csv(chrom_sizes_path,
sep = "\t", header = FALSE,
col.names = c("chrom", "size"), colClasses = c("character", "numeric")
),
error = function(e) NULL
)
if (is.null(chrom_sizes)) {
return(list(
path = groot,
is_db = FALSE,
error = "Invalid chrom_sizes.txt format"
))
}
# Detect format
idx_path <- file.path(groot, "seq", "genome.idx")
genome_seq_path <- file.path(groot, "seq", "genome.seq")
if (file.exists(idx_path) && file.exists(genome_seq_path)) {
format <- "indexed"
} else {
format <- "per-chromosome"
}
# Calculate total genome size
genome_size <- sum(chrom_sizes$size)
list(
path = groot,
is_db = TRUE,
format = format,
num_chromosomes = nrow(chrom_sizes),
genome_size = genome_size,
chromosomes = chrom_sizes
)
}
#' Convert a track to indexed format
#'
#' Converts a per-chromosome track to indexed format (track.dat + track.idx).
#'
#' This function converts a track from the per-chromosome file format to
#' single-file indexed format. The indexed format dramatically reduces file descriptor
#' usage for genomes with many contigs and provides better performance for parallel access.
#'
#' The function performs the following steps:
#' \enumerate{
#' \item Validates that all per-chromosome files have consistent metadata
#' \item Creates track.dat by concatenating all per-chromosome files
#' \item Creates track.idx with offset/length information for each chromosome
#' \item Uses atomic operations (fsync + rename) to ensure data integrity
#' \item Removes the old per-chromosome files after successful conversion
#' }
#'
#' @param track track name to convert
#' @return None
#' @seealso \code{\link{gtrack.create}}, \code{\link{gtrack.create_sparse}}, \code{\link{gtrack.create_dense}}
#' @examples
#' \dontrun{
#' # Convert a track to indexed format
#' gtrack.convert_to_indexed("my_track")
#' }
#' @export gtrack.convert_to_indexed
gtrack.convert_to_indexed <- function(track = NULL) {
if (is.null(substitute(track))) {
stop("Usage: gtrack.convert_to_indexed(track)", call. = FALSE)
}
.gcheckroot()
trackstr <- do.call(.gexpr2str, list(substitute(track)), envir = parent.frame())
if (is.na(match(trackstr, get("GTRACKS", envir = .misha)))) {
stop(sprintf("Track %s does not exist", trackstr), call. = FALSE)
}
trackdir <- .track_dir(trackstr)
idx_path <- file.path(trackdir, "track.idx")
dat_path <- file.path(trackdir, "track.dat")
# Check if already converted
if (file.exists(idx_path)) {
message(sprintf("Track %s is already in indexed format.", trackstr))
return(invisible(0))
}
# Get track info to determine type
info <- gtrack.info(track)
track_type <- info$type
# Only 1D tracks can be converted
if (!track_type %in% c("dense", "sparse", "array")) {
stop(sprintf("Cannot convert track %s: only 1D tracks (dense, sparse, array) can be converted", trackstr), call. = FALSE)
}
# Call C++ function to perform the conversion (always remove old files)
success <- FALSE
tryCatch(
{
.gcall("gtrack_convert_to_indexed_format", trackstr, TRUE, .misha_env())
success <- TRUE
},
error = function(e) {
# Clean up temporary files on error
if (file.exists(paste0(dat_path, ".tmp"))) {
unlink(paste0(dat_path, ".tmp"))
}
if (file.exists(paste0(idx_path, ".tmp"))) {
unlink(paste0(idx_path, ".tmp"))
}
stop(sprintf("Failed to convert track %s: %s", trackstr, e$message), call. = FALSE)
}
)
invisible(0)
}
#' Convert 1D interval set to indexed format
#'
#' Converts a per-chromosome interval set to indexed format
#' (intervals.dat + intervals.idx) which reduces file descriptor usage.
#'
#' @param set.name name of interval set to convert
#' @param remove.old if TRUE, removes old per-chromosome files after successful conversion
#' @param force if TRUE, re-converts even if already in indexed format
#' @return invisible NULL
#' @details
#' The indexed format stores all chromosomes in a single intervals.dat file
#' with an intervals.idx index file. This reduces file descriptor usage from
#' N files (one per chromosome) to just 2 files.
#'
#' The conversion process:
#' \enumerate{
#' \item Creates temporary intervals.dat.tmp and intervals.idx.tmp files
#' \item Concatenates all per-chromosome files into intervals.dat.tmp
#' \item Builds index with offsets and checksums
#' \item Atomically renames temporary files to final names
#' \item Optionally removes old per-chromosome files
#' }
#'
#' The indexed format is 100% backward compatible with all existing misha functions.
#'
#' @examples
#' \dontrun{
#' # Convert an interval set
#' gintervals.convert_to_indexed("my_intervals")
#'
#' # Convert and remove old files
#' gintervals.convert_to_indexed("my_intervals", remove.old = TRUE)
#'
#' # Force re-conversion
#' gintervals.convert_to_indexed("my_intervals", force = TRUE)
#' }
#' @seealso \code{\link{gintervals.save}}, \code{\link{gintervals.load}}
#' @export
gintervals.convert_to_indexed <- function(set.name = NULL, remove.old = FALSE, force = FALSE) {
if (is.null(set.name) || !is.character(set.name) || length(set.name) != 1) {
stop("Usage: gintervals.convert_to_indexed(set.name, remove.old = FALSE, force = FALSE)", call. = FALSE)
}
.gcheckroot()
# Get interval set path - mimic C++ interv2path logic
path <- gsub("\\.", "/", set.name)
intervset_path <- paste0(get("GWD", envir = .misha), "/", path, ".interv")
# Check if it's a Big Set (directory) or single-file format
if (!file.exists(intervset_path)) {
stop(sprintf("Interval set %s does not exist", set.name), call. = FALSE)
}
is_bigset <- dir.exists(intervset_path)
if (!is_bigset) {
message(sprintf("Interval set %s is in single-file format and does not need conversion.", set.name))
return(invisible(NULL))
}
# Check if already converted (check index file instead of directory for robustness)
idx_path <- file.path(intervset_path, "intervals.idx")
dat_path <- file.path(intervset_path, "intervals.dat")
if (file.exists(idx_path) && !force) {
message(sprintf("Interval set %s is already in indexed format. Use force=TRUE to re-convert.", set.name))
return(invisible(NULL))
}
# Call C++ function to perform the conversion
tryCatch(
{
.gcall("ginterv_convert", set.name, remove.old, .misha_env())
},
error = function(e) {
# Clean up temporary files on error
tmp_dat <- paste0(dat_path, ".tmp")
tmp_idx <- paste0(idx_path, ".tmp")
if (file.exists(tmp_dat)) unlink(tmp_dat)
if (file.exists(tmp_idx)) unlink(tmp_idx)
stop(sprintf("Failed to convert interval set %s: %s", set.name, e$message), call. = FALSE)
}
)
invisible(NULL)
}
#' Convert 2D interval set to indexed format
#'
#' Converts a per-chromosome interval set to indexed format
#' (intervals2d.dat + intervals2d.idx) which reduces file descriptor usage.
#'
#' @param set.name name of 2D interval set to convert
#' @param remove.old if TRUE, removes old per-chromosome files after successful conversion
#' @param force if TRUE, re-converts even if already in indexed format
#' @return invisible NULL
#' @details
#' The indexed format stores all chromosome pairs in a single intervals2d.dat file
#' with an intervals2d.idx index file. This dramatically reduces file descriptor
#' usage, especially for genomes with many chromosomes (N*(N-1)/2 files to just 2).
#'
#' Only non-empty pairs are stored in the index, avoiding O(N^2) space overhead.
#'
#' The conversion process:
#' \enumerate{
#' \item Scans directory for existing per-pair files
#' \item Creates temporary intervals2d.dat.tmp and intervals2d.idx.tmp files
#' \item Concatenates all per-pair files into intervals2d.dat.tmp
#' \item Builds index with pair offsets and checksums
#' \item Atomically renames temporary files to final names
#' \item Optionally removes old per-pair files
#' }
#'
#' The indexed format is 100% backward compatible with all existing misha functions.
#'
#' @examples
#' \dontrun{
#' # Convert a 2D interval set
#' gintervals.2d.convert_to_indexed("my_2d_intervals")
#'
#' # Convert and remove old files
#' gintervals.2d.convert_to_indexed("my_2d_intervals", remove.old = TRUE)
#'
#' # Force re-conversion
#' gintervals.2d.convert_to_indexed("my_2d_intervals", force = TRUE)
#' }
#'
#' @export
gintervals.2d.convert_to_indexed <- function(set.name = NULL, remove.old = FALSE, force = FALSE) {
if (is.null(set.name) || !is.character(set.name) || length(set.name) != 1) {
stop("Usage: gintervals.2d.convert_to_indexed(set.name, remove.old = FALSE, force = FALSE)", call. = FALSE)
}
.gcheckroot()
# Get interval set path - mimic C++ interv2path logic
path <- gsub("\\.", "/", set.name)
intervset_path <- paste0(get("GWD", envir = .misha), "/", path, ".interv")
# Check if it's a Big Set (directory) or single-file format
if (!file.exists(intervset_path)) {
stop(sprintf("2D interval set %s does not exist", set.name), call. = FALSE)
}
is_bigset <- dir.exists(intervset_path)
if (!is_bigset) {
message(sprintf("2D interval set %s is in single-file format and does not need conversion.", set.name))
return(invisible(NULL))
}
# Check if already converted (check index file instead of directory for robustness)
idx_path <- file.path(intervset_path, "intervals2d.idx")
dat_path <- file.path(intervset_path, "intervals2d.dat")
if (file.exists(idx_path) && !force) {
message(sprintf("2D interval set %s is already in indexed format. Use force=TRUE to re-convert.", set.name))
return(invisible(NULL))
}
# Call C++ function to perform the conversion
tryCatch(
{
.gcall("ginterv2d_convert", set.name, remove.old, .misha_env())
},
error = function(e) {
# Clean up temporary files on error
tmp_dat <- paste0(dat_path, ".tmp")
tmp_idx <- paste0(idx_path, ".tmp")
if (file.exists(tmp_dat)) unlink(tmp_dat)
if (file.exists(tmp_idx)) unlink(tmp_idx)
stop(sprintf("Failed to convert 2D interval set %s: %s", set.name, e$message), call. = FALSE)
}
)
invisible(NULL)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.