R/cdx.r

Defines functions read_cdx print.cdx

Documented in print.cdx read_cdx

#' Read a WARC CDX index file
#'
#' CDX files are used to index the content of WARC files.
#'
#' The first line of the file must be a CDX header: a one-character
#' delimiter, the literal \code{CDX}, the delimiter again, and then the
#' delimiter-separated field letters that describe every following line.
#'
#' The returned object is a \code{tbl_df} but is also classed
#' \code{cdx}.
#'
#' @param path path to CDX file
#' @param warc_path path to the WARC files referenced in \code{path}. Defaults
#'     to the location of the CDX file
#' @return a \code{tbl_df}, also classed \code{cdx}, built from the index
#'     lines, with an added \code{warc_path} column
#' @references \url{https://iipc.github.io/warc-specifications/specifications/cdx-format/cdx-2015/}
#' @export
#' @examples \dontrun{
#' cdx <- read_cdx(system.file("extdata", "20160901.cdx", package="warc"))
#' }
read_cdx <- function(path, warc_path=NULL) {

  # Example header line: " CDX a b a m s k r M V g u"
  path <- path.expand(path)
  lines <- readr::read_lines(path)
  if (is.null(warc_path)) warc_path <- dirname(path)

  # An empty file has no header line at all; report it the same way as any
  # other non-CDX input instead of failing on an NA comparison below.
  if (length(lines) == 0) {
    stop("Not a CDX file", call.=FALSE)
  }

  header_line <- lines[1]

  # The very first character of the header doubles as the field delimiter.
  delim <- substr(header_line, 1, 1)

  if (substr(header_line, 2, 4) != "CDX") {
    stop("Not a CDX file", call.=FALSE)
  }

  if (substr(header_line, 5, 5) != delim) {
    stop("Malformed CDX file header", call.=FALSE)
  }

  # Everything after "<delim>CDX<delim>" is the list of field letters, which
  # are translated to column names via the field_trans lookup table.
  header <- stri_split_fixed(substr(header_line, 6, nchar(header_line)), delim)[[1]]
  header <- data_frame(field=header)
  header <- left_join(header, field_trans, by="field")

  # Split each index line on the delimiter and name the pieces after the
  # header's short names; map_df() row-binds the resulting named lists.
  df <- map_df(lines[-1], function(x) {
    bits <- stri_split_fixed(x, delim)[[1]]
    setNames(as.list(bits), header$short_name)
  })

  # Only parse the timestamp when a date column exists, mirroring the guards
  # used for the other optional columns below.
  if ("date" %in% colnames(df)) {
    df <- mutate(df, date=as.POSIXct(date, format="%Y%m%d%H%M%S", tz="GMT"))
  }

  df <- mutate(df, warc_path=warc_path)
  df <- suppressMessages(readr::type_convert(df))

  if ("response_code" %in% colnames(df)) {
    # A response code of 0 is recoded as missing.
    df <- mutate(df, response_code=ifelse(response_code==0, NA, response_code))
  }

  if ("original_url" %in% colnames(df)) {
    df <- mutate(df, original_url=remove_brackets(original_url))
  }

  class(df) <- c("cdx", class(df))
  df

}

#' Display a CDX object
#'
#' Prints the number of records, a legend that pairs each column name with
#' its description from the field translation table, and then a
#' \code{dplyr::glimpse()} of the data.
#'
#' @param x a \code{cdx} object
#' @param ... currently unused
#' @return \code{x}, invisibly
#' @export
print.cdx <- function(x, ...) {

  # Look up a description for every column; columns without an entry in
  # field_trans get an empty description, except the added warc_path column.
  fields <- left_join(data_frame(short_name=colnames(x)), field_trans, by="short_name")
  fields <- mutate(fields, description=ifelse(is.na(description), "", description))
  fields <- mutate(fields, description=ifelse(short_name == "warc_path", "WARC path", description))

  cat(sprintf("# A CDX WARC index with %s records and the following fields:\n\n",
              scales::comma(nrow(x))))

  # Skip the legend when there are no columns: max() of an empty vector
  # warns, and 1:0 would iterate over non-existent rows.
  if (nrow(fields) > 0) {

    # Right-align the names by padding them to the widest one.
    max_sn <- max(nchar(fields$short_name))

    for (i in seq_len(nrow(fields))) {
      cat(sprintf("%s: %s\n",
              stri_pad_left(fields$short_name[i], max_sn),
              fields$description[i]))
    }

  }

  cat("\n\n")

  dplyr::glimpse(x)

  invisible(x)

}
hrbrmstr/warc documentation built on May 17, 2019, 5:53 p.m.