R/file_index.R

Defines functions read_file_index save_file_index make_file_index

Documented in make_file_index read_file_index save_file_index

#' Index a file for faster access to parts of the file
#'
#' @param pathname (character) The file to be indexed.
#'
#' @param offset (numeric) The number of bytes to skip before indexing starts.
#'
#' @param index (numeric; optional) An existing index (file byte offsets)
#' generated on an earlier version of the input file. If specified, the
#' indexing will continue at the very last known index, instead of
#' re-indexing from the beginning.
#'
#' @param skip (numeric) The number of `newline` matches to ignore before
#' recording them.
#'
#' @param n_max (numeric) The maximum number of bytes to scan.
#'
#' @param newline (character) The single-byte character to scan for.
#'
#' @param drop_eof (logical) If TRUE, the last identified byte offset is
#' dropped if at the very end of the file, i.e. when there is nothing 
#' available to read from that position.
#'
#' @param bfr_size (numeric) The number of bytes to read in each iteration.
#'
#' @return A numeric vector of file byte offsets that corresponds to the
#' beginning of a line, i.e. a position in the file that was preceded
#' by a `newline` character.  The first line is at file byte offset
#' `0`, which is also always the first element in the returned vector.
#'
#' @example incl/make_file_index.R
#'
#' @importFrom utils file_test
#' @importFrom progressr progressor
#' @export
make_file_index <- function(pathname, offset = NULL, skip = 0L, index = NULL,
                            n_max = Inf, newline = "\n", drop_eof = TRUE,
                            bfr_size = 50e6) {
  ## Scans 'pathname' in chunks of 'bfr_size' bytes and returns the file byte
  ## offsets that directly follow each 'newline' byte, optionally extending a
  ## previously computed 'index'.

  ## --- Validate arguments --------------------------------------------------
  stopifnot(length(pathname) == 1L, file_test("-f", pathname))
  if (!is.null(index)) {
    stopifnot(is.numeric(index), !anyNA(index))
    index_range <- range(index)
    stopifnot(index_range[1] >= 0, index_range[2] < Inf)
    ## Resume scanning right before the last known offset
    if (is.null(offset)) offset <- index_range[2] - 1
  }
  if (is.null(offset)) offset <- 0
  stopifnot(length(offset) == 1L, is.numeric(offset), is.finite(offset),
            offset >= 0)
  stopifnot(length(skip) == 1L, is.numeric(skip), is.finite(skip), skip >= 0)
  stopifnot(length(n_max) == 1L, is.numeric(n_max), !is.na(n_max), n_max >= 0)
  stopifnot(length(newline) == 1L, is.character(newline), !is.na(newline))
  stopifnot(length(bfr_size) == 1L, is.numeric(bfr_size), is.finite(bfr_size),
            bfr_size > 0)

  nl <- charToRaw(newline)
  ## The scan below compares one byte at a time; a multi-byte (or empty)
  ## 'newline' would be silently recycled and produce meaningless offsets.
  stopifnot(length(nl) == 1L)

  file_size <- file.size(pathname)
  if (!is.null(index)) {
    ## The existing index must refer to positions inside the current file
    stopifnot(file_size > index_range[2])
  }
  con <- file(pathname, open = "rb")
  on.exit(close(con), add = TRUE)

  ## Step back one byte so that the byte just before 'offset' is also
  ## inspected for a newline
  if (offset > 0) {
    offset <- offset - 1L
    if (offset > 0) seek(con, where = offset, origin = "start", rw = "read")
  }

  ## Coerce to double to avoid integer overflow for large files
  offset <- as.double(offset)

  ## Report on progress (either by MBs or newlines read)
  max_steps <- if (is.infinite(n_max)) (file_size - offset) / 1e6 else n_max
  p <- progressor(max_steps)

  ## 'pos' collects one vector of offsets per chunk; the starting offset
  ## is always the first entry
  count <- 1
  pos <- list(offset)
  repeat {
    raw <- readBin(con, what = raw(), n = bfr_size)
    nraw <- length(raw)
    ## Reached end of file?
    if (nraw == 0) break
    idxs <- which(raw == nl)
    raw <- NULL  ## release the buffer as early as possible
    ## 1-based position within the chunk + chunk start = file offset of the
    ## byte that follows the newline
    idxs <- idxs + offset
    offset <- offset + nraw
    pos[[length(pos) + 1]] <- idxs
    count <- count + length(idxs)
    ## 'count' is a double; "%.0f" prints it like "%d" would, but without
    ## failing once it exceeds the integer range
    msg <- sprintf("%.0f indices", count)
    if (is.infinite(n_max)) {
      p(msg, amount = nraw / 1e6)
    } else {
      p(msg, amount = length(idxs))
      if (offset > n_max) break
    }
    idxs <- NULL
  }
  pos <- unlist(pos, use.names = FALSE)
  if (is.finite(n_max)) pos <- pos[pos <= n_max]

  ## Skip?
  if (skip > 0) pos <- pos[-seq_len(skip)]

  ## Drop last position if at the very end of the file?
  ## 'pos' may be empty at this point (e.g. everything was skipped), in
  ## which case there is nothing to drop.
  if (drop_eof) {
    n <- length(pos)
    if (n > 0L && pos[n] == file_size) pos <- pos[-n]
  }

  if (!is.null(index)) {
    ## Keep only offsets beyond the ones already known, then merge
    drop <- which(pos <= index_range[2])
    if (length(drop) > 0) pos <- pos[-drop]
    pos <- c(index, pos)

    ## Check for duplicated. Sorting will set ALTREP sort flag,
    ## which will speed up future sorting and duplication checks.
    pos <- sort(pos)
    dups <- anyDuplicated(pos)
    stopifnot(length(dups) == 1, dups == 0)
  }

  pos
}

#' @param index (numeric vector) A sorted index of file byte positions.
#'
#' @param file A pathname to a \file{*.index} file to be
#' created or read from.
#'
#' @rdname make_file_index
#' @export
save_file_index <- function(index, file) {
  stopifnot(is.numeric(index))
  ## Always store the offsets as little-endian doubles, regardless of
  ## whether 'index' was handed in as integer or double.
  writeBin(as.double(index), con = file, endian = "little")
}


#' @rdname make_file_index
#' @importFrom utils file_test
#' @export
read_file_index <- function(file) {
  stopifnot(file_test("-f", file))
  ## The number of entries is derived from the file size, at 8 bytes per
  ## stored double.
  n_entries <- file.size(file) / 8
  readBin(con = file, what = "double", endian = "little", n = n_entries)
}
UCSF-HPC/wyntonquery documentation built on March 6, 2025, 1:12 a.m.