warc: Tools to Work with the Web Archive Ecosystem

Documented in find_sequence gz_close gz_eof gz_flush gz_fseek gz_gets gz_gets_raw gzip_inflate_from_pos gz_offset gz_open gz_read_char gz_read_raw gz_seek gz_tell gz_write_char gz_write_raw

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

# Internal wrapper (no roxygen block or export tag in this file): forwards its
# four arguments, unchanged and in order, to the compiled routine
# 'warc_int_create_cdx_from_warc' in the 'warc' package and returns that
# routine's result invisibly.
# NOTE(review): presumably builds a CDX index at `cdx_path` from the WARC file
# at `warc_path` -- confirm against the C++ source.
int_create_cdx_from_warc <- function(warc_path, warc_record_types, field_spec, cdx_path) {
  res <- .Call(
    'warc_int_create_cdx_from_warc',
    PACKAGE = 'warc',
    warc_path,
    warc_record_types,
    field_spec,
    cdx_path
  )
  invisible(res)
}

#' Open a gzip file in read, write or append mode
#'
#' @param path path to the file
#' @param mode one of "\code{write}", "\code{append}" or "\code{read}"
#' @return a handle to the opened file
#' @export
gz_open <- function(path, mode) {
  handle <- .Call('warc_gz_open', PACKAGE = 'warc', path, mode)
  handle
}

#' Get the current offset in the raw (compressed) file
#'
#' @param gzfile file handle
#' @return the offset position (integer)
#' @export
gz_offset <- function(gzfile) {
  pos <- .Call('warc_gz_offset', PACKAGE = 'warc', gzfile)
  pos
}

#' Get the current offset in the uncompressed data stream
#'
#' @param gzfile file handle
#' @return the offset position (integer)
#' @export
gz_tell <- function(gzfile) {
  pos <- .Call('warc_gz_tell', PACKAGE = 'warc', gzfile)
  pos
}

#' Set the starting position (compressed stream) for the next read or write
#'
#' Positions the handle for the next \code{gz_read_*()} or
#' \code{gz_write_*()} call.
#'
#' @param gzfile file handle
#' @param offset a number of bytes in the compressed data stream
#' @param from one of "\code{start}", "\code{end}" or "\code{current}"
#' @return \code{TRUE} if successful
#' @export
gz_fseek <- function(gzfile, offset, from) {
  ok <- .Call('warc_gz_fseek', PACKAGE = 'warc', gzfile, offset, from)
  ok
}

#' Set the starting position (uncompressed stream) for the next read or write
#'
#' Positions the handle for the next \code{gz_read_*()} or
#' \code{gz_write_*()} call.
#'
#' @param gzfile file handle
#' @param offset a number of bytes in the uncompressed data stream
#' @param from either "\code{start}" or "\code{current}"
#' @return the resulting offset, measured in bytes from the beginning of the
#'   uncompressed stream, or \code{-1} on error -- in particular when the file
#'   is opened for writing and the new starting position would lie before the
#'   current position.
#' @export
gz_seek <- function(gzfile, offset, from) {
  new_pos <- .Call('warc_gz_seek', PACKAGE = 'warc', gzfile, offset, from)
  new_pos
}

#' Read data from a gz file into a raw vector
#'
#' @param gzfile file handle
#' @param len number of characters to read
#' @return the data that was read, as a raw vector
#' @export
gz_read_raw <- function(gzfile, len) {
  bytes <- .Call('warc_gz_read_raw', PACKAGE = 'warc', gzfile, len)
  bytes
}

#' Read data from a gz file into a character vector
#'
#' @param gzfile file handle
#' @param len number of characters to read
#' @return the data that was read, as a character vector
#' @export
gz_read_char <- function(gzfile, len) {
  txt <- .Call('warc_gz_read_char', PACKAGE = 'warc', gzfile, len)
  txt
}

#' Check whether the end of the file has been reached
#'
#' @param gzfile file handle
#' @export
gz_eof <- function(gzfile) {
  at_end <- .Call('warc_gz_eof', PACKAGE = 'warc', gzfile)
  at_end
}

#' Read a single line from a gz file
#'
#' @param gzfile file handle
#' @note The line buffer holds at most 8,192 characters. This function is
#'   intended for use on well-known formats.
#' @export
gz_gets <- function(gzfile) {
  line <- .Call('warc_gz_gets', PACKAGE = 'warc', gzfile)
  line
}

#' Read a single line from a gz file (raw variant)
#'
#' @param gzfile file handle
#' @note The line buffer holds at most 8,192 characters. This function is
#'   intended for use on well-known formats.
#' @export
gz_gets_raw <- function(gzfile) {
  line <- .Call('warc_gz_gets_raw', PACKAGE = 'warc', gzfile)
  line
}

#' Write the contents of a raw vector to a gz file
#'
#' @param gzfile file handle
#' @param buffer raw vector to write
#' @export
gz_write_raw <- function(gzfile, buffer) {
  res <- .Call('warc_gz_write_raw', PACKAGE = 'warc', gzfile, buffer)
  invisible(res)
}

#' Write an atomic character vector to a gz file
#'
#' @param gzfile file handle
#' @param buffer atomic character vector to write
#' @export
gz_write_char <- function(gzfile, buffer) {
  res <- .Call('warc_gz_write_char', PACKAGE = 'warc', gzfile, buffer)
  invisible(res)
}

#' Flush the current gzip stream
#'
#' Flushes all zlib output buffers for the current file and terminates the
#' gzip stream. The next write (\code{gz_write_raw()} or
#' \code{gz_write_char()}) will start a new gzip stream.
#'
#' @param gzfile file handle
#' @export
gz_flush <- function(gzfile) {
  res <- .Call('warc_gz_flush', PACKAGE = 'warc', gzfile)
  invisible(res)
}

#' Close a gz file
#'
#' @param gzfile file handle
#' @note To properly flush the buffers and correctly terminate a gzip stream
#'   you \emph{must} call \code{gz_flush()} before closing the file.
#' @export
gz_close <- function(gzfile) {
  res <- .Call('warc_gz_close', PACKAGE = 'warc', gzfile)
  invisible(res)
}

#' Inflate a single gzip stream from a file
#'
#' For a gzip file built from concatenated individual gzip streams, expands
#' the contents of the stream into a \code{raw} vector and returns it.
#'
#' @param path path to a WARC file compressed as individual gzip streams
#' @param raw_stream_pos position in the raw file at \code{path} (not the
#'   "gzip stream position")
#' @note Because this works with compressed files, the returned value may
#'   occupy a large amount of memory.
#' @export
gzip_inflate_from_pos <- function(path, raw_stream_pos) {
  inflated <- .Call(
    'warc_gzip_inflate_from_pos',
    PACKAGE = 'warc',
    path,
    raw_stream_pos
  )
  inflated
}

# Internal wrapper (no roxygen block or export tag in this file): forwards
# `r_source` and `r_guess_size` to the compiled routine 'warc_gzuncompress' in
# the 'warc' package and returns that routine's result.
# NOTE(review): presumably `r_guess_size` is an initial estimate of the
# uncompressed size -- confirm against the C++ source.
gzuncompress <- function(r_source, r_guess_size) {
  out <- .Call('warc_gzuncompress', PACKAGE = 'warc', r_source, r_guess_size)
  out
}

#' Locate the first occurrence (if any) of a sequence of raw bytes
#' (\code{pattern}) within \code{buffer}.
#'
#' @param buffer vector to search in
#' @param pattern sequence of bytes to look for
#' @return the index in \code{buffer}, or \code{-1} if \code{pattern} is not
#'   found
#' @export
find_sequence <- function(buffer, pattern) {
  idx <- .Call('warc_find_sequence', PACKAGE = 'warc', buffer, pattern)
  idx
}

hrbrmstr/warc documentation built on May 17, 2019, 5:53 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

hrbrmstr/warc
Tools to Work with the Web Archive Ecosystem

R/RcppExports.R
In hrbrmstr/warc: Tools to Work with the Web Archive Ecosystem

Defines functions int_create_cdx_from_warc gz_open gz_offset gz_tell gz_fseek gz_seek gz_read_raw gz_read_char gz_eof gz_gets gz_gets_raw gz_write_raw gz_write_char gz_flush gz_close gzip_inflate_from_pos gzuncompress find_sequence

Documented in find_sequence gz_close gz_eof gz_flush gz_fseek gz_gets gz_gets_raw gzip_inflate_from_pos gz_offset gz_open gz_read_char gz_read_raw gz_seek gz_tell gz_write_char gz_write_raw

R Package Documentation

Browse R Packages

We want your feedback!

hrbrmstr/warc Tools to Work with the Web Archive Ecosystem

R/RcppExports.R In hrbrmstr/warc: Tools to Work with the Web Archive Ecosystem

Defines functions int_create_cdx_from_warc gz_open gz_offset gz_tell gz_fseek gz_seek gz_read_raw gz_read_char gz_eof gz_gets gz_gets_raw gz_write_raw gz_write_char gz_flush gz_close gzip_inflate_from_pos gzuncompress find_sequence

Documented in find_sequence gz_close gz_eof gz_flush gz_fseek gz_gets gz_gets_raw gzip_inflate_from_pos gz_offset gz_open gz_read_char gz_read_raw gz_seek gz_tell gz_write_char gz_write_raw

R Package Documentation

Browse R Packages

We want your feedback!

hrbrmstr/warc
Tools to Work with the Web Archive Ecosystem

R/RcppExports.R
In hrbrmstr/warc: Tools to Work with the Web Archive Ecosystem