# warc: Tools to Work with the Web Archive Ecosystem

# Error-capturing variant of gzip_inflate_from_pos(). Per purrr::safely(), the
# wrapped function does not raise; it returns a list with `result` and `error`
# elements (one of which is NULL). The arguments below are safely()'s
# documented defaults, written out explicitly.
sgzip_inflate_from_pos <- purrr::safely(
  .f = gzip_inflate_from_pos,
  otherwise = NULL,
  quiet = TRUE
)

# Lookup table translating single-letter field codes into a snake_case short
# name and a human-readable description. The result is a 46-row, three-column
# object (`field`, `short_name`, `description`) carrying the tibble classes.
#
# NOTE(review): the codes and descriptions appear to mirror the CDX index
# field legend -- confirm against the upstream spec. Spellings such as
# "uniqness", "tages", "multi-columm" and the capital in "canonized_Frame"
# are kept exactly as they were, since they are runtime data.
field_trans <- local({
  # One row per field: code, short name, description.
  spec <- matrix(
    c(
      "A", "canonized_url", "canonized url",
      "B", "news_group", "news group",
      "C", "rulespace_category", "rulespace category",
      "D", "compressed_dat_file_offset", "compressed dat file offset",
      "F", "canonized_Frame", "canonized frame",
      "G", "multi_column_lang_desc", "multi-columm language description",
      "H", "canonized_host", "canonized host",
      "I", "canonized_image", "canonized image",
      "J", "canonized_jump_point", "canonized jump point",
      "K", "fbis", "Some weird FBIS whats changed kinda thing",
      "L", "canonized_link", "canonized link",
      "M", "meta_tags", "meta tags (AIF)",
      "N", "massaged_url", "massaged url",
      "P", "canonized_path", "canonized path",
      "Q", "language_string", "language string",
      "R", "canonized_redirect", "canonized redirect",
      "S", "compressed_record_size", "compressed record size",
      "U", "uniqueness", "uniqness",
      "V", "compressed_arc_file_offset", "compressed arc file offset",
      "X", "canonized_url_href", "canonized url in other href tages",
      "Y", "canonized_url_src", "canonized url in other src tags",
      "Z", "canonized_url_script", "canonized url found in script",
      "a", "original_url", "original url",
      "b", "date", "date",
      "c", "old_style_checksum", "old style checksum",
      "d", "uncompressed_dat_file_offset", "uncompressed dat file offset",
      "e", "ip", "IP",
      "f", "frame", "frame",
      "g", "file_name", "file name",
      "h", "original_host", "original host",
      "i", "image", "image",
      "j", "original_jump_point", "original jump point",
      "k", "new_style_checksum", "new style checksum",
      "l", "link", "link",
      "m", "mime_type_orig", "mime type of original document",
      "n", "arc_document_length", "arc document length",
      "o", "port", "port",
      "p", "original_path", "original path",
      "r", "redirect", "redirect",
      "s", "response_code", "response code",
      "t", "title", "title",
      "u", "urn", "warc record id",
      "v", "uncompressed_arc_file_offset", "uncompressed arc file offset",
      "x", "url_href", "url in other href tages",
      "y", "url_src", "url in other src tags",
      "z", "url_script", "url found in script"
    ),
    ncol = 3L,
    byrow = TRUE
  )

  # Assemble the data-frame structure by hand (no tibble call needed); the
  # compact row.names form c(NA, -n) encodes n automatic row names.
  structure(
    list(
      field = spec[, 1L],
      short_name = spec[, 2L],
      description = spec[, 3L]
    ),
    row.names = c(NA, -nrow(spec)),
    class = c("tbl_df", "tbl", "data.frame")
  )
})

# HTTP status code -> reason phrase lookup (adapted from the httr source).
#
# A named character vector: names are status codes as strings, values are
# the descriptive phrases. Look up with e.g. http_statuses[["404"]].
#
# Names are kept unique. The original list carried a second "424" entry
# ("Method Failure (WebDAV)") after "Failed Dependency"; name-based lookup
# only ever returns the first match, so that entry was unreachable and has
# been dropped.
http_statuses <- c(
  "100" = "Continue",
  "101" = "Switching Protocols",
  "102" = "Processing (WebDAV; RFC 2518)",
  "200" = "OK",
  "201" = "Created",
  "202" = "Accepted",
  "203" = "Non-Authoritative Information",
  "204" = "No Content",
  "205" = "Reset Content",
  "206" = "Partial Content",
  "207" = "Multi-Status (WebDAV; RFC 4918)",
  "208" = "Already Reported (WebDAV; RFC 5842)",
  "226" = "IM Used (RFC 3229)",
  "300" = "Multiple Choices",
  "301" = "Moved Permanently",
  "302" = "Found",
  "303" = "See Other",
  "304" = "Not Modified",
  "305" = "Use Proxy",
  "306" = "Switch Proxy",
  "307" = "Temporary Redirect",
  "308" = "Permanent Redirect (experimental Internet-Draft)",
  "400" = "Bad Request",
  "401" = "Unauthorized",
  "402" = "Payment Required",
  "403" = "Forbidden",
  "404" = "Not Found",
  "405" = "Method Not Allowed",
  "406" = "Not Acceptable",
  "407" = "Proxy Authentication Required",
  "408" = "Request Timeout",
  "409" = "Conflict",
  "410" = "Gone",
  "411" = "Length Required",
  "412" = "Precondition Failed",
  "413" = "Request Entity Too Large",
  "414" = "Request-URI Too Long",
  "415" = "Unsupported Media Type",
  "416" = "Requested Range Not Satisfiable",
  "417" = "Expectation Failed",
  "418" = "I'm a teapot (RFC 2324)",
  "420" = "Enhance Your Calm (Twitter)",
  "422" = "Unprocessable Entity (WebDAV; RFC 4918)",
  "423" = "Locked (WebDAV; RFC 4918)",
  "424" = "Failed Dependency (WebDAV; RFC 4918)",
  "425" = "Unordered Collection (Internet draft)",
  "426" = "Upgrade Required (RFC 2817)",
  "428" = "Precondition Required (RFC 6585)",
  "429" = "Too Many Requests (RFC 6585)",
  "431" = "Request Header Fields Too Large (RFC 6585)",
  "444" = "No Response (Nginx)",
  "449" = "Retry With (Microsoft)",
  "450" = "Blocked by Windows Parental Controls (Microsoft)",
  "451" = "Unavailable For Legal Reasons (Internet draft)",
  "499" = "Client Closed Request (Nginx)",
  "500" = "Internal Server Error",
  "501" = "Not Implemented",
  "502" = "Bad Gateway",
  "503" = "Service Unavailable",
  "504" = "Gateway Timeout",
  "505" = "HTTP Version Not Supported",
  "506" = "Variant Also Negotiates (RFC 2295)",
  "507" = "Insufficient Storage (WebDAV; RFC 4918)",
  "508" = "Loop Detected (WebDAV; RFC 5842)",
  "509" = "Bandwidth Limit Exceeded (Apache bw/limited extension)",
  "510" = "Not Extended (RFC 2774)",
  "511" = "Network Authentication Required (RFC 6585)",
  "598" = "Network read timeout error (Unknown)",
  "599" = "Network connect timeout error (Unknown)"
)