# File src/library/tools/R/urltools.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 2015-2023 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/

## See RFC 3986 <https://www.rfc-editor.org/rfc/rfc3986> and
## <https://url.spec.whatwg.org/>.
get_IANA_URI_scheme_db <-
function() {
## See
## <https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml>.
baseurl <- "https://www.iana.org/assignments/uri-schemes/"
db <- utils::read.csv(
url(paste0(baseurl, "uri-schemes-1.csv")),
stringsAsFactors = FALSE,
encoding = "UTF-8"
)
names(db) <- chartr(".", "_", names(db))
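## Keep only the scheme token (drop any trailing annotation).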
db$URI_Scheme <- sub(" .*", "", db$URI_Scheme)
db
}
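## Illustrative usage (a sketch, assuming network access to iana.org):
##   db <- get_IANA_URI_scheme_db()
##   head(db$URI_Scheme)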
parse_URI_reference <-
function(x) {
re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
if (length(x)) {
y <- do.call(rbind, regmatches(x, regexec(re, x)))
y <- y[, c(3, 5, 6, 8, 10), drop = FALSE]
} else {
y <- matrix(character(), 0L, 5L)
}
colnames(y) <- c("scheme", "authority", "path", "query", "fragment")
y
}
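## Illustrative usage (a sketch): each component lands in its own
## column, empty when absent.
##   parse_URI_reference("https://user@example.org:8080/a/b?q=1#top")
##   ## scheme "https", authority "user@example.org:8080", path "/a/b",
##   ## query "q=1", fragment "top".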
.get_urls_from_Rd <-
function(x, href = TRUE, ifdef = FALSE) {
urls <- character()
recurse <- function(e) {
tag <- attr(e, "Rd_tag")
## Rd2HTML and Rd2latex remove whitespace and \n from URLs.
if (identical(tag, "\\url")) {
urls <<- c(urls, lines2str(.Rd_deparse(e, tag = FALSE)))
} else if (href && identical(tag, "\\href")) {
## One could also record the \href text argument in the
## names, but then one would need to process named and
## unnamed extracted URLs separately.
urls <<- c(urls, lines2str(.Rd_deparse(e[[1L]], tag = FALSE)))
} else if (ifdef && length(tag) && (tag %in% c("\\if", "\\ifelse"))) {
## cf. testRdConditional()
condition <- e[[1L]]
if (all(RdTags(condition) == "TEXT")) {
    conds <- trimws(strsplit(paste(condition, collapse = ""), ",")[[1L]])
    if (any(c("TRUE", "html") %in% conds)) {
        recurse(e[[2L]])
    } else if (tag == "\\ifelse") {
        recurse(e[[3L]])
    }
}
} else if (is.list(e)) lapply(e, recurse)
}
lapply(x, recurse)
unique(trimws(urls))
}
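## Illustrative usage (a sketch; parse_Rd() also accepts a connection):
##   rd <- parse_Rd(textConnection("\\description{\\url{https://www.r-project.org}}"))
##   .get_urls_from_Rd(rd)   # "https://www.r-project.org"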
.get_urls_from_HTML_file <-
function(f) {
doc <- xml2::read_html(f)
if (!inherits(doc, "xml_node")) return(character())
nodes <- xml2::xml_find_all(doc, "//a")
hrefs <- xml2::xml_attr(nodes, "href")
unique(hrefs[!is.na(hrefs) & !startsWith(hrefs, "#")])
}
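## Illustrative usage (a sketch, assuming the xml2 namespace is
## available and 'index.html' is a hypothetical rendered help page):
##   .get_urls_from_HTML_file("index.html")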
.get_urls_from_PDF_file <-
function(f) {
## Seems there is no straightforward way to extract hyperrefs from a
## PDF, hence first convert to HTML.
## Note that pdftohtml always outputs in cwd ...
owd <- getwd()
dir.create(d <- tempfile())
on.exit({
unlink(d, recursive = TRUE)
setwd(owd)
})
file.copy(normalizePath(f), d)
setwd(d)
g <- tempfile(tmpdir = d, fileext = ".xml")
system2(
"pdftohtml",
c("-s -q -i -c -xml", shQuote(basename(f)), shQuote(basename(g)))
)
## Oh dear: seems that pdftohtml can fail yet still exit with a zero
## status.
if (file.exists(g)) .get_urls_from_HTML_file(g) else character()
}
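## Illustrative usage (a sketch, assuming 'pdftohtml' is on the PATH
## and 'paper.pdf' is a hypothetical input file):
##   .get_urls_from_PDF_file("paper.pdf")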
url_db <-
function(urls, parents) {
## Some people get leading LFs in URLs, so trim before checking.
db <- list2DF(list(
URL = trimws(as.character(urls)),
Parent = as.character(parents)
))
class(db) <- c("url_db", "data.frame")
db
}
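## Illustrative usage (a sketch with hypothetical parents):
##   url_db(c("https://www.r-project.org", "https://cran.r-project.org"),
##          c("man/foo.Rd", "DESCRIPTION"))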
url_db_from_HTML_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE) {
urls <- parents <- character()
if (is.null(files))
files <- list.files(
dir,
pattern = "[.]html$",
full.names = TRUE,
recursive = recursive
)
urls <-
lapply(files, function(f) {
if (verbose)
message(sprintf("processing %s", .file_path_relative_to_dir(f, dir)))
.get_urls_from_HTML_file(f)
})
names(urls) <- files
urls <- Filter(length, urls)
if (length(urls)) {
parents <- rep.int(
.file_path_relative_to_dir(names(urls), dir),
lengths(urls)
)
urls <- unlist(urls, use.names = FALSE)
}
url_db(urls, parents)
}
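## Illustrative usage (a sketch, assuming a directory with rendered
## HTML files):
##   url_db_from_HTML_files("inst/doc", recursive = TRUE, verbose = TRUE)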
url_db_from_PDF_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE) {
urls <- parents <- character()
if (is.null(files))
files <- list.files(
dir,
pattern = "[.]pdf$",
full.names = TRUE,
recursive = recursive
)
urls <-
lapply(files, function(f) {
if (verbose)
message(sprintf("processing %s", .file_path_relative_to_dir(f, dir)))
.get_urls_from_PDF_file(f)
})
names(urls) <- files
urls <- Filter(length, urls)
if (length(urls)) {
parents <- rep.int(
.file_path_relative_to_dir(names(urls), dir),
lengths(urls)
)
urls <- unlist(urls, use.names = FALSE)
}
url_db(urls, parents)
}
url_db_from_package_Rd_db <-
function(db) {
urls <- Filter(length, lapply(db, .get_urls_from_Rd))
url_db(
unlist(urls, use.names = FALSE),
rep.int(file.path("man", names(urls)), lengths(urls))
)
}
url_db_from_package_metadata <-
function(meta) {
urls <- character()
fields <- c("URL", "BugReports")
for (v in meta[fields]) {
if (is.na(v)) next
pattern <-
"<(URL: *)?((https?|ftp)://[^[:space:],]*)[[:space:]]*>"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
regmatches(v, m) <- ""
pattern <- "(^|[^>\"])((https?|ftp)://[^[:space:],]*)"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
}
if (!is.na(v <- meta["Description"])) {
pattern <-
"<(URL: *)?((https?|ftp)://[^[:space:]]+)[[:space:]]*>"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
regmatches(v, m) <- ""
pattern <-
"([^>\"])((https?|ftp)://[[:alnum:]/.:@+\\_~%#?=&;,-]+[[:alnum:]/])"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
regmatches(v, m) <- ""
pattern <- "<([A-Za-z][A-Za-z0-9.+-]*:[^>]+)>"
## scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 2L))
}
url_db(urls, rep.int("DESCRIPTION", length(urls)))
}
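## Illustrative usage (a sketch; 'meta' is normally the DESCRIPTION
## metadata as a named character vector):
##   meta <- c(URL = "https://example.org, https://example.com",
##             BugReports = "https://example.org/issues",
##             Description = "See <https://example.net> for details.")
##   url_db_from_package_metadata(meta)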
url_db_from_package_citation <-
function(dir, meta, installed = FALSE) {
urls <- character()
path <- if (installed) "CITATION" else file.path("inst", "CITATION")
cfile <- file.path(dir, path)
if (file.exists(cfile)) {
cinfo <- .read_citation_quietly(cfile, meta)
if (!inherits(cinfo, "error"))
urls <- trimws(unique(unlist(cinfo$url, use.names = FALSE)))
}
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_news <-
function(dir, installed = FALSE) {
path <- if (installed) "NEWS.Rd" else file.path("inst", "NEWS.Rd")
nfile <- file.path(dir, path)
urls <-
if (file.exists(nfile)) {
macros <- initialRdMacros()
.get_urls_from_Rd(prepare_Rd(
parse_Rd(nfile, macros = macros),
stages = "install"
))
} else character()
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_HTML_files <-
function(dir, installed = FALSE) {
path <- if (installed) "doc" else file.path("inst", "doc")
files <- Sys.glob(file.path(dir, path, "*.html"))
if (installed && file.exists(rfile <- file.path(dir, "README.html")))
files <- c(files, rfile)
url_db_from_HTML_files(dir, files = files)
}
url_db_from_package_README_md <-
function(dir, installed = FALSE) {
urls <- path <- character()
rfile <- Filter(
file.exists,
c(
if (!installed) file.path(dir, "inst", "README.md"),
file.path(dir, "README.md")
)
)[1L]
if (!is.na(rfile) && nzchar(Sys.which("pandoc"))) {
path <- .file_path_relative_to_dir(rfile, dir)
tfile <- tempfile("README", fileext = ".html")
on.exit(unlink(tfile))
out <- .pandoc_md_for_CRAN(rfile, tfile)
if (!out$status) {
urls <- .get_urls_from_HTML_file(tfile)
}
}
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_NEWS_md <-
function(dir, installed = FALSE) {
urls <- path <- character()
nfile <- Filter(
file.exists,
c(
if (!installed) file.path(dir, "inst", "NEWS.md"),
file.path(dir, "NEWS.md")
)
)[1L]
if (!is.na(nfile) && nzchar(Sys.which("pandoc"))) {
path <- .file_path_relative_to_dir(nfile, dir)
tfile <- tempfile("NEWS", fileext = ".html")
on.exit(unlink(tfile))
out <- .pandoc_md_for_CRAN(nfile, tfile)
if (!out$status) {
urls <- .get_urls_from_HTML_file(tfile)
}
}
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_sources <-
function(dir, add = FALSE) {
meta <- .get_package_metadata(dir, FALSE)
db <- rbind(
url_db_from_package_metadata(meta),
url_db_from_package_Rd_db(Rd_db(dir = dir)),
url_db_from_package_citation(dir, meta),
url_db_from_package_news(dir)
)
if (requireNamespace("xml2", quietly = TRUE)) {
db <- rbind(
db,
url_db_from_package_HTML_files(dir),
url_db_from_package_README_md(dir),
url_db_from_package_NEWS_md(dir)
)
}
if (add) db$Parent <- file.path(basename(dir), db$Parent)
db
}
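## Illustrative usage (a sketch; 'mypkg' being a hypothetical unpacked
## package source directory):
##   db <- url_db_from_package_sources("mypkg")
##   head(db)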
url_db_from_installed_packages <-
function(packages, lib.loc = NULL, verbose = FALSE) {
if (!length(packages)) return()
one <- function(p) {
if (verbose) message(sprintf("processing %s", p))
dir <- system.file(package = p, lib.loc = lib.loc)
if (dir == "") return()
meta <- .read_description(file.path(dir, "DESCRIPTION"))
rddb <- Rd_db(p, lib.loc = dirname(dir))
db <- rbind(
url_db_from_package_metadata(meta),
url_db_from_package_Rd_db(rddb),
url_db_from_package_citation(dir, meta, installed = TRUE),
url_db_from_package_news(dir, installed = TRUE)
)
if (requireNamespace("xml2", quietly = TRUE)) {
db <- rbind(
db,
url_db_from_package_HTML_files(dir, installed = TRUE),
url_db_from_package_README_md(dir, installed = TRUE),
url_db_from_package_NEWS_md(dir, installed = TRUE)
)
}
db$Parent <- file.path(p, db$Parent)
db
}
do.call(rbind, c(lapply(packages, one), list(make.row.names = FALSE)))
}
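## Illustrative usage (a sketch):
##   db <- url_db_from_installed_packages(c("tools", "utils"), verbose = TRUE)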
get_IANA_HTTP_status_code_db <-
function() {
## See
## <https://www.iana.org/assignments/http-status-codes/http-status-codes.xhtml>
baseurl <- "https://www.iana.org/assignments/http-status-codes/"
db <- utils::read.csv(
url(paste0(baseurl, "http-status-codes-1.csv")),
stringsAsFactors = FALSE
)
## Drop "Unassigned".
db[db$Description != "Unassigned", ]
}
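## Illustrative usage (a sketch, assuming network access to iana.org):
##   head(get_IANA_HTTP_status_code_db())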
## See <https://en.wikipedia.org/wiki/List_of_FTP_server_return_codes>
## and <https://www.rfc-editor.org/rfc/rfc959>,
## Section 4.2.2 "Numeric Order List of Reply Codes",
## and <https://www.rfc-editor.org/rfc/rfc2228>,
## Section 5 "New FTP Replies".
## Only need those >= 400.
table_of_FTP_server_return_codes <-
c(
"421" = "Service not available, closing control connection.",
"425" = "Can't open data connection.",
"426" = "Connection closed; transfer aborted.",
"430" = "Invalid username or password",
"431" = "Need some unavailable resource to process security.",
"434" = "Requested host unavailable.",
"450" = "Requested file action not taken.",
"451" = "Requested action aborted: local error in processing.",
"452" = "Requested action not taken. Insufficient storage space in system.",
"500" = "Syntax error, command unrecognized.",
"501" = "Syntax error in parameters or arguments.",
"502" = "Command not implemented.",
"503" = "Bad sequence of commands.",
"504" = "Command not implemented for that parameter.",
"530" = "Not logged in.",
"532" = "Need account for storing files.",
"533" = "Command protection level denied for policy reasons.",
"534" = "Request denied for policy reasons.",
"535" = "Failed security check (hash, sequence, etc).",
"536" = "Requested PROT level not supported by mechanism.",
"537" = "Command protection level not supported by security mechanism.",
"550" = "Requested action not taken. File unavailable",
"551" = "Requested action aborted: page type unknown.",
"552" = "Requested file action aborted. Exceeded storage allocation (for current directory or dataset).",
"553" = "Requested action not taken. File name not allowed.",
"631" = "Integrity protected reply.",
"632" = "Confidentiality and integrity protected reply.",
"633" = "Confidentiality protected reply."
)
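## Illustrative lookup (a sketch):
##   table_of_FTP_server_return_codes["550"]
##   ## "Requested action not taken. File unavailable"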
check_url_db <-
function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL) {
use_curl <-
!parallel &&
config_val_to_logical(Sys.getenv("_R_CHECK_URLS_USE_CURL_", "TRUE")) &&
requireNamespace("curl", quietly = TRUE)
if (parallel && is.null(pool)) pool <- curl::new_pool()
.gather <- function(
u = character(),
p = list(),
s = rep.int("", length(u)),
m = rep.int("", length(u)),
new = rep.int("", length(u)),
cran = rep.int("", length(u)),
spaces = rep.int("", length(u)),
R = rep.int("", length(u))
) {
y <- list2DF(list(
URL = u,
From = p,
Status = s,
Message = m,
New = new,
CRAN = cran,
Spaces = spaces,
R = R
))
class(y) <- c("check_url_db", "data.frame")
y
}
.fetch_headers <-
if (parallel)
function(urls) .fetch_headers_via_curl(urls, verbose, pool) else
function(urls) .fetch_headers_via_base(urls, verbose)
.check_ftp <- function(u, h) {
if (inherits(h, "error")) {
s <- "-1"
msg <- sub("[[:space:]]*$", "", conditionMessage(h))
} else {
s <- as.character(attr(h, "status"))
msg <- table_of_FTP_server_return_codes[s]
}
c(s, msg, "", "")
}
.check_http <- if (remote)
function(u, h) c(.check_http_A(u, h), .check_http_B(u)) else
function(u, h) c(rep.int("", 3L), .check_http_B(u))
.check_http_A <- function(u, h) {
newLoc <- ""
if (inherits(h, "error")) {
s <- "-1"
msg <- sub("[[:space:]]*$", "", conditionMessage(h))
if (
grepl(
paste(
c(
"server certificate verification failed",
"failed to get server cert",
"libcurl error code (51|60)"
),
collapse = "|"
),
msg
)
) {
h2 <- tryCatch(curlGetHeaders(u, verify = FALSE), error = identity)
s2 <- as.character(attr(h2, "status"))
msg <- paste0(
msg,
"\n\t(Status without verification: ",
table_of_HTTP_status_codes[s2],
")"
)
}
} else {
s <- as.character(attr(h, "status"))
msg <- table_of_HTTP_status_codes[s]
}
## Look for redirected URLs
## According to
## <https://www.rfc-editor.org/rfc/rfc7230#section-3.1.2> the first
## line of a response is the status-line, with "a possibly empty
## textual phrase describing the status code", so only look for
## a 301 status code in the first line.
if (grepl(" 301 ", h[1L], useBytes = TRUE)) {
## Get the new location from the last consecutive 301
## obtained.
h <- split(h, c(0L, cumsum(h == "\r\n")[-length(h)]))
i <- vapply(h, function(e) grepl(" 301 ", e[1L], useBytes = TRUE), NA)
h <- h[[which(!i)[1L] - 1L]]
pos <- grep("^[Ll]ocation: ", h, useBytes = TRUE)
if (length(pos)) {
loc <- sub("^[Ll]ocation: ([^\r]*)\r\n", "\\1", h[pos[1L]])
## Ouch. According to RFC 7231, the location is a URI
## reference, and may be relative, in which case it needs
## resolving against the effective request URI.
## <https://www.rfc-editor.org/rfc/rfc7231#section-7.1.2>.
## Not quite straightforward, hence do not report such
## 301s.
## (Alternatively, could try reporting the 301 but no
## new location.)
newParts <- parse_URI_reference(loc)
if (nzchar(newParts[1L, "scheme"])) {
newLoc <- loc
## Handle fragments. If the new URL does have one,
## use it. Otherwise, if the old has one, use that.
## (From section 7.1.2).
if (newParts[1L, "fragment"] == "") {
uParts <- parse_URI_reference(u)
if (nzchar(uFragment <- uParts[1L, "fragment"])) {
newLoc <- paste0(newLoc, "#", uFragment)
}
}
}
}
}
##
if ((s != "200") && use_curl) {
g <- .curl_GET_status(u)
if (g == "200") {
s <- g
msg <- "OK"
}
}
## A mis-configured site
if (s == "503" && any(grepl("www.sciencedirect.com", c(u, newLoc))))
s <- "405"
c(s, msg, newLoc)
}
.check_http_B <- function(u) {
ul <- tolower(u)
cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) &&
!grepl(
"^https?://cran.r-project.org/web/packages/([.[:alnum:]_]+(html|pdf|rds))?$",
ul
)) ||
(grepl(
"^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$",
ul
)) ||
startsWith(ul, "http://cran.r-project.org") ||
any(startsWith(ul, mirrors)))
R <- grepl("^http://(www|bugs|journal).r-project.org", ul)
spaces <- grepl(" ", u)
c(if (cran) u else "", if (spaces) u else "", if (R) u else "")
}
bad <- .gather()
if (!NROW(db)) return(bad)
## Could also use utils::getCRANmirrors(local.only = TRUE).
mirrors <- c(
utils::read.csv(
file.path(R.home("doc"), "CRAN_mirrors.csv"),
as.is = TRUE,
encoding = "UTF-8"
)$URL,
"http://cran.rstudio.com/",
"https://cran.rstudio.com/"
)
mirrors <- tolower(sub("/$", "", mirrors))
if (inherits(db, "check_url_db")) {
## Allow re-checking check results.
parents <- db$From
urls <- db$URL
} else {
parents <- split(db$Parent, db$URL)
urls <- names(parents)
}
parts <- parse_URI_reference(urls)
## Empty URLs.
ind <- apply(parts == "", 1L, all)
if (any(ind)) {
len <- sum(ind)
bad <- rbind(
bad,
.gather(urls[ind], parents[ind], m = rep.int("Empty URL", len))
)
}
## Invalid URI schemes.
schemes <- parts[, 1L]
ind <- is.na(match(
tolower(schemes),
c(
"",
IANA_URI_scheme_db$URI_Scheme,
"arxiv",
## Also allow 'isbn' and 'issn', which in fact
## are registered URN namespaces but not
## registered URI schemes, see
## <https://www.iana.org/assignments/urn-formal/isbn>
## <https://www.iana.org/assignments/urn-formal/issn>
## <https://doi.org/10.17487/rfc3986>
## <https://doi.org/10.17487/rfc8141>.
"isbn",
"issn",
## Also allow 'javascript' scheme, see
## <https://tools.ietf.org/html/draft-hoehrmann-javascript-scheme-03>
## (but apparently never registered with IANA).
"javascript"
)
))
if (any(ind)) {
len <- sum(ind)
msg <- rep.int("Invalid URI scheme", len)
doi <- schemes[ind] == "doi"
if (any(doi))
msg[doi] <- paste(msg[doi], "(use \\doi for DOIs in Rd markup)")
bad <- rbind(bad, .gather(urls[ind], parents[ind], m = msg))
}
## Could check urn URIs at least for appropriate namespaces using
## <https://www.iana.org/assignments/urn-namespaces/urn-namespaces-1.csv>.

## ftp.
pos <- which(schemes == "ftp")
if (length(pos) && remote) {
urlspos <- urls[pos]
headers <- .fetch_headers(urlspos)
results <- do.call(rbind, Map(.check_ftp, urlspos, headers))
status <- as.numeric(results[, 1L])
ind <- (status < 0L) | (status >= 400L)
if (any(ind)) {
pos <- pos[ind]
s <- as.character(status[ind])
s[s == "-1"] <- "Error"
m <- results[ind, 2L]
m[is.na(m)] <- ""
bad <- rbind(bad, .gather(urls[pos], parents[pos], s, m))
}
}
## http/https.
pos <- which(schemes == "http" | schemes == "https")
if (length(pos) && remote) {
urlspos <- urls[pos]
## Check DOI URLs via the DOI handle API, as we nowadays do for
## checking DOIs.
myparts <- parts[pos, , drop = FALSE]
ind <- (((myparts[, 2L] == "doi.org") |
(myparts[, 2L] == "dx.doi.org")) &
startsWith(myparts[, 3L], "/10.") &
!nzchar(myparts[, 4L]) &
!nzchar(myparts[, 5L]))
if (any(ind))
urlspos[ind] <- paste0("https://doi.org/api/handles", myparts[ind, 3L])
## Could also use regexps, e.g.
## pat <- "^https?://(dx[.])?doi.org/10[.]([^?#]+)$"
## ind <- grep(pat, urlspos)
## if(length(ind))
## urlspos[ind] <-
## paste0("https://doi.org/api/handles/10.",
## sub(pat, "\\2", urlspos[ind]))
## but using the parts is considerably faster ...
headers <- .fetch_headers(urlspos)
results <- do.call(rbind, Map(.check_http, urlspos, headers))
status <- as.numeric(results[, 1L])
## 405 means the server does not allow HEAD requests.
## Maybe also skip 500, 503 and 504, as these are likely to be
## temporary issues.
ind <- is.na(match(status, c(200L, 405L, NA))) |
nzchar(results[, 3L]) |
nzchar(results[, 4L]) |
nzchar(results[, 5L]) |
nzchar(results[, 6L])
if (any(ind)) {
pos <- pos[ind]
s <- as.character(status[ind])
s[is.na(s)] <- ""
s[s == "-1"] <- "Error"
m <- results[ind, 2L]
m[is.na(m)] <- ""
bad <- rbind(
bad,
.gather(
urls[pos],
parents[pos],
s,
m,
results[ind, 3L],
results[ind, 4L],
results[ind, 5L],
results[ind, 6L]
)
)
}
}
bad
}
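## Illustrative usage (a sketch, assuming network access; 'mypkg' is a
## hypothetical package source directory):
##   db <- url_db_from_package_sources("mypkg")
##   bad <- check_url_db(db, verbose = TRUE)
##   if (NROW(bad)) print(bad)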
format.check_url_db <-
function(x, ...) {
if (!NROW(x)) return(character())
u <- x$URL
new <- x$New
ind <- nzchar(new)
if (any(ind)) {
u[ind] <- sprintf("%s (moved to %s)", u[ind], new[ind])
if (
config_val_to_logical(Sys.getenv(
"_R_CHECK_URLS_SHOW_301_STATUS_",
"FALSE"
))
) {
x$Message[ind] <- "Moved Permanently"
x$Status[ind] <- "301"
}
}
paste0(
sprintf("URL: %s", u),
sprintf("\nFrom: %s", vapply(x$From, paste, "", collapse = "\n ")),
ifelse((s <- x$Status) == "", "", sprintf("\nStatus: %s", s)),
ifelse(
(m <- x$Message) == "",
"",
sprintf("\nMessage: %s", gsub("\n", "\n ", m, fixed = TRUE))
),
ifelse((m <- x$Spaces) == "", "", "\nURL contains spaces"),
ifelse((m <- x$CRAN) == "", "", "\nCRAN URL not in canonical form"),
ifelse((m <- x$R) == "", "", "\nR-project URL not in canonical form")
)
}
print.check_url_db <-
function(x, ...) {
if (NROW(x)) writeLines(paste(format(x), collapse = "\n\n"))
invisible(x)
}
as.matrix.check_url_db <-
function(x, ...) {
n <- lengths(x[["From"]])
y <- do.call(
cbind,
c(
list(URL = rep.int(x[["URL"]], n), Parent = unlist(x[["From"]])),
lapply(x[-c(1L, 2L)], rep.int, n)
)
)
rownames(y) <- NULL
y
}
.fetch_headers_via_base <-
function(urls, verbose = FALSE, ids = urls)
Map(
function(u, verbose, i) {
if (verbose) message(sprintf("processing %s", i))
tryCatch(curlGetHeaders(u), error = identity)
},
urls,
verbose,
ids
)
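## Illustrative usage (a sketch):
##   h <- .fetch_headers_via_base("https://www.r-project.org")
##   attr(h[[1L]], "status")   # e.g. 200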
.fetch_headers_via_curl <-
function(urls, verbose = FALSE, pool = NULL) {
out <- .curl_multi_run_worker(urls, TRUE, verbose, pool)
ind <- !vapply(out, inherits, NA, "error")
if (any(ind))
out[ind] <- lapply(out[ind], function(x) {
y <- strsplit(rawToChar(x$headers), "(?<=\r\n)", perl = TRUE)[[1L]]
attr(y, "status") <- x$status_code
y
})
out
}
.curl_multi_run_worker <-
function(urls, nobody = FALSE, verbose = FALSE, pool = NULL) {
## Use 'nobody = TRUE' to fetch only headers.
.progress_bar <- function(length, msg = "") {
bar <- new.env(parent = baseenv())
if (is.null(length)) {
length <- 0L
}
done <- fmt <- NULL # make codetools happy
bar$length <- length
bar$done <- -1L
digits <- trunc(log10(length)) + 1L
bar$fmt <- paste0("\r", msg, "[ %", digits, "i / %", digits, "i ]")
bar$update <- function() {
assign("done", done + 1L, inherits = TRUE)
if (length <= 0L) {
return()
}
if (done >= length) {
cat("\r", strrep(" ", nchar(fmt)), "\r", sep = "")
} else {
cat(sprintf(fmt, done, length), sep = "")
}
}
environment(bar$update) <- bar
bar$update()
bar
}
if (is.null(pool)) pool <- curl::new_pool()
bar <- .progress_bar(if (verbose) length(urls), msg = "fetching ")
out <- vector("list", length(urls))
for (i in seq_along(out)) {
u <- urls[[i]]
h <- curl::new_handle(url = u)
curl::handle_setopt(
h,
nobody = nobody,
cookiesession = 1L,
followlocation = 1L,
http_version = 2L,
ssl_enable_alpn = 0L
)
timeout <- as.integer(getOption("timeout"))
if (!is.na(timeout) && (timeout > 0L))
curl::handle_setopt(h, connecttimeout = timeout, timeout = timeout)
if (
grepl("^https?://github[.]com", u) &&
nzchar(a <- Sys.getenv("GITHUB_PAT", ""))
) {
curl::handle_setheaders(h, "Authorization" = paste("token", a))
}
handle_result <- local({
i <- i
function(x) {
out[[i]] <<- x
bar$update()
}
})
handle_error <- local({
i <- i
function(x) {
out[[i]] <<-
structure(
list(message = x),
class = c("curl_error", "error", "condition")
)
bar$update()
}
})
curl::multi_add(h, done = handle_result, fail = handle_error, pool = pool)
}
curl::multi_run(pool = pool)
out
}
.curl_GET_status <-
function(u, verbose = FALSE) {
if (verbose) message(sprintf("processing %s", u))
## Configure curl handle for better luck with JSTOR URLs/DOIs.
## Alternatively, special-case requests to
## https?://doi.org/10.2307
## https?://www.jstor.org
h <- curl::new_handle()
curl::handle_setopt(
h,
cookiesession = 1,
followlocation = 1,
http_version = 2L,
ssl_enable_alpn = 0
)
timeout <- as.integer(getOption("timeout"))
if (!is.na(timeout) && (timeout > 0L))
curl::handle_setopt(h, connecttimeout = timeout, timeout = timeout)
if (
startsWith(u, "https://github.com") &&
nzchar(a <- Sys.getenv("GITHUB_PAT", ""))
)
curl::handle_setheaders(h, "Authorization" = paste("token", a))
g <- tryCatch(curl::curl_fetch_memory(u, handle = h), error = identity)
if (inherits(g, "error")) -1L else g$status_code
}
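## Illustrative usage (a sketch, assuming the curl package is
## available):
##   .curl_GET_status("https://www.r-project.org")   # 200 on success, -1L on error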