#' Index a file for faster access to parts of the file
#'
#' The file is read in binary mode in chunks of `bfr_size` bytes, and the
#' byte offset following each occurrence of `newline` is recorded.
#'
#' @param pathname (character) The file to be indexed.
#'
#' @param offset (numeric) The number of bytes to skip before start indexing.
#' If `NULL` (default), it is derived from the largest value in `index`,
#' when `index` is specified, and is otherwise `0`.
#'
#' @param index (numeric; optional) An existing index (file byte offsets)
#' generated on an earlier version of the input file. If specified, the
#' indexing will continue at the very last known index, instead of
#' re-indexing from the beginning.
#'
#' @param skip (numeric) The number of `newline` matches to ignore before
#' recording them.
#'
#' @param n_max (numeric) The maximum number of bytes to scan.
#'
#' @param newline (character) The character to scan for. It must be
#' encoded as exactly one byte.
#'
#' @param drop_eof (logical) If TRUE, the last identified byte offset is
#' dropped if at the very end of the file, i.e. when there is nothing
#' available to read from that position.
#'
#' @param bfr_size (numeric) The number of bytes to read in each iteration.
#'
#' @return A numeric vector of file byte offsets that corresponds to the
#' beginning of a line, i.e. a position in the file that was preceded
#' by a `newline` character. With the default `offset` and `skip`, the
#' first line is at file byte offset `0`, which is then also the first
#' element in the returned vector. The vector is empty if `skip` removes
#' all identified offsets.
#'
#' @example incl/make_file_index.R
#'
#' @importFrom utils file_test
#' @importFrom progressr progressor
#' @export
make_file_index <- function(pathname, offset = NULL, skip = 0L, index = NULL,
                            n_max = Inf, newline = "\n", drop_eof = TRUE,
                            bfr_size = 50e6) {
  stopifnot(length(pathname) == 1L, file_test("-f", pathname))
  if (!is.null(index)) {
    stopifnot(is.numeric(index), !anyNA(index))
    index_range <- range(index)
    stopifnot(index_range[1] >= 0, index_range[2] < Inf)
    ## Resume scanning just before the last known offset
    if (is.null(offset)) offset <- index_range[2] - 1
  }
  if (is.null(offset)) offset <- 0
  stopifnot(length(offset) == 1L, is.numeric(offset), is.finite(offset),
            offset >= 0)
  stopifnot(length(skip) == 1L, is.numeric(skip), is.finite(skip), skip >= 0)
  stopifnot(length(n_max) == 1L, is.numeric(n_max), !is.na(n_max), n_max >= 0)
  stopifnot(length(newline) == 1L, is.character(newline), !is.na(newline))
  stopifnot(length(bfr_size) == 1L, is.numeric(bfr_size), is.finite(bfr_size),
            bfr_size > 0)

  nl <- charToRaw(newline)
  ## The scan below compares each byte against 'nl'.  A zero- or multi-byte
  ## 'newline' would be silently recycled and produce meaningless offsets.
  stopifnot(length(nl) == 1L)

  file_size <- file.size(pathname)
  if (!is.null(index)) {
    ## The file must have grown beyond the last indexed position
    stopifnot(file_size > index_range[2])
  }

  con <- file(pathname, open = "rb")
  on.exit(close(con), add = TRUE)

  if (offset > 0) {
    ## Step back one byte so that the byte just before 'offset' is scanned too
    offset <- offset - 1L
    if (offset > 0) seek(con, where = offset, origin = "start", rw = "read")
  }

  ## Coerce to double to avoid integer overflow for large files
  offset <- as.double(offset)

  ## Report on progress (either by MBs or newlines read)
  max_steps <- if (is.infinite(n_max)) (file_size - offset) / 1e6 else n_max
  p <- progressor(max_steps)

  count <- 1
  ## The starting offset is always recorded as the first position
  pos <- list(offset)
  repeat {
    raw <- readBin(con, what = raw(), n = bfr_size)
    nraw <- length(raw)

    ## Reached end of file?
    if (nraw == 0) break

    ## One-based positions within the chunk equal the zero-based file
    ## offsets of the bytes that follow each match, once shifted by 'offset'
    idxs <- which(raw == nl)
    raw <- NULL  ## release the buffer as early as possible
    idxs <- idxs + offset
    offset <- offset + nraw
    pos[[length(pos) + 1]] <- idxs
    count <- count + length(idxs)

    ## 'count' is a double; "%d" fails for doubles beyond the integer range
    msg <- sprintf("%.0f indices", count)
    if (is.infinite(n_max)) {
      p(msg, amount = nraw / 1e6)
    } else {
      p(msg, amount = length(idxs))
      if (offset > n_max) break
    }
    idxs <- NULL
  }
  pos <- unlist(pos, use.names = FALSE)

  ## The last chunk may extend beyond 'n_max'
  if (is.finite(n_max)) pos <- pos[pos <= n_max]

  ## Skip?
  if (skip > 0) pos <- pos[-seq_len(skip)]

  ## Drop last position if at the very end of the file?
  if (drop_eof) {
    n <- length(pos)
    ## 'pos' may be empty here, e.g. when 'skip' removed all positions
    if (n > 0L && pos[n] == file_size) pos <- pos[-n]
  }

  if (!is.null(index)) {
    ## Keep only offsets beyond those already known, then merge
    drop <- which(pos <= index_range[2])
    if (length(drop) > 0) pos <- pos[-drop]
    pos <- c(index, pos)

    ## Check for duplicated. Sorting will set ALTREP sort flag,
    ## which will speed up future sorting and duplication checks.
    pos <- sort(pos)
    dups <- anyDuplicated(pos)
    stopifnot(length(dups) == 1, dups == 0)
  }

  pos
}
#' @param index (numeric vector) A sorted index of file byte positions.
#'
#' @param file A pathname to a \file{*.index} file to be
#' created or read from.
#'
#' @rdname make_file_index
#' @export
save_file_index <- function(index, file) {
  stopifnot(is.numeric(index))
  ## Store every offset as an 8-byte little-endian double, regardless of
  ## whether the index was passed as integer or double
  offsets <- as.double(index)
  writeBin(offsets, con = file, size = 8L, endian = "little")
}
#' @rdname make_file_index
#' @importFrom utils file_test
#' @export
read_file_index <- function(file) {
  stopifnot(file_test("-f", file))
  ## Each offset occupies eight bytes, so the file size gives the
  ## number of values to read
  n_offsets <- file.size(file) / 8
  readBin(con = file, what = "double", n = n_offsets, endian = "little")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.