#  File src/library/tools/R/urltools.R
#  Part of the R package, https://www.R-project.org
#
#  Copyright (C) 2015-2021 The R Core Team
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  https://www.R-project.org/Licenses/

get_IANA_URI_scheme_db <-
function()
{
    ## See
    ## <https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml>.
    baseurl <- "https://www.iana.org/assignments/uri-schemes/"
    db <- utils::read.csv(url(paste0(baseurl, "uri-schemes-1.csv")),
                          stringsAsFactors = FALSE, encoding = "UTF-8")
    names(db) <- chartr(".", "_", names(db))
    db
}
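
## Illustrative usage (not run; needs network access, and assumes IANA
## keeps the current CSV layout so that a 'URI_Scheme' column exists):
##   db <- get_IANA_URI_scheme_db()
##   head(db$URI_Scheme)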

parse_URI_reference <-
function(x)
{
    ## See RFC 3986 <http://www.ietf.org/rfc/rfc3986.txt>.
    re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
    if(length(x)) {
        y <- do.call(rbind, regmatches(x, regexec(re, x)))
        y <- y[, c(3, 5, 6, 8, 10), drop = FALSE]
    } else {
        y <- matrix(character(), 0L, 5L)
    }
    colnames(y) <- c("scheme", "authority", "path", "query", "fragment")
    y
}
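
## Illustrative example of the RFC 3986 split (worked out from the
## regexp above rather than taken from R output):
##   parse_URI_reference("https://www.r-project.org/about.html?x=1#top")
## gives a one-row matrix with
##   scheme    "https"
##   authority "www.r-project.org"
##   path      "/about.html"
##   query     "x=1"
##   fragment  "top"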

.get_urls_from_Rd <-
function(x, href = TRUE, ifdef = FALSE)
{
    urls <- character()
    recurse <- function(e) {
        tag <- attr(e, "Rd_tag")
        ## Rd2HTML and Rd2latex remove whitespace and \n from URLs.
        if(identical(tag, "\\url")) {
            urls <<- c(urls, lines2str(.Rd_deparse(e, tag = FALSE)))
        } else if(href && identical(tag, "\\href")) {
            ## One could also record the \href text argument in the
            ## names, but then one would need to process named and
            ## unnamed extracted URLs separately.
            urls <<- c(urls, lines2str(.Rd_deparse(e[[1L]], tag = FALSE)))
        } else if(ifdef && length(tag) && (tag %in% c("\\if", "\\ifelse"))) {
            ## cf. testRdConditional()
            condition <- e[[1L]]
            if(all(RdTags(condition) == "TEXT")) {
                if(any(c("TRUE", "html") %in%
                       trimws(strsplit(paste(condition, collapse = ""), 
                                       ",")[[1L]])))
                    recurse(e[[2L]])
                else if(tag == "\\ifelse")
                    recurse(e[[3L]])
            }
        } else if(is.list(e))
            lapply(e, recurse)
    }
    lapply(x, recurse)
    unique(trimws(urls))
}
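
## Sketch of a direct call on a hand-made Rd fragment (illustrative
## only; parse_Rd() accepts a connection for small snippets like this):
##   rd <- parse_Rd(textConnection(
##       "\\name{x}\\title{t}\\description{See \\url{https://www.r-project.org}.}"))
##   .get_urls_from_Rd(rd)   # "https://www.r-project.org"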

.get_urls_from_HTML_file <-
function(f)
{
    doc <- xml2::read_html(f)
    if(!inherits(doc, "xml_node")) return(character())
    nodes <- xml2::xml_find_all(doc, "//a")
    hrefs <- xml2::xml_attr(nodes, "href")
    unique(hrefs[!is.na(hrefs) & !startsWith(hrefs, "#")])
}
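
## Illustrative (not run; needs the xml2 package, and the file name
## below is made up):
##   .get_urls_from_HTML_file("vignette.html")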

.get_urls_from_PDF_file <-
function(f)    
{
    ## Seems there is no straightforward way to extract hyperrefs from a
    ## PDF, hence first convert to HTML.
    ## Note that pdftohtml always outputs in cwd ...
    owd <- getwd()
    dir.create(d <- tempfile())
    on.exit({ unlink(d, recursive = TRUE); setwd(owd) })
    file.copy(normalizePath(f), d)
    setwd(d)
    g <- tempfile(tmpdir = d, fileext = ".xml")
    system2("pdftohtml",
            c("-s -q -i -c -xml", shQuote(basename(f)), shQuote(basename(g))))
    ## Oh dear: pdftohtml can fail yet still return a zero exit status,
    ## hence check for the output file rather than the status.
    if(file.exists(g))
        .get_urls_from_HTML_file(g)
    else
        character()
}

url_db <-
function(urls, parents)
{
    ## Some people get leading LFs in URLs, so trim before checking.
    db <- data.frame(URL = trimws(as.character(urls)),
                     Parent = as.character(parents),
                     stringsAsFactors = FALSE)
    class(db) <- c("url_db", "data.frame")
    db
}
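
## For example (illustrative only):
##   url_db("https://www.r-project.org", "DESCRIPTION")
## gives a one-row data frame of class "url_db" with columns URL and
## Parent.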

url_db_from_HTML_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
    urls <- parents <- character()
    if(is.null(files)) 
        files <- list.files(dir, pattern = "[.]html$",
                            full.names = TRUE,
                            recursive = recursive)
    urls <-
        lapply(files,
               function(f) {
                   if(verbose)
                       message(sprintf("processing %s",
                                       .file_path_relative_to_dir(f, dir)))
                   .get_urls_from_HTML_file(f)
               })
    names(urls) <- files
    urls <- Filter(length, urls)
    if(length(urls)) {
        parents <- rep.int(.file_path_relative_to_dir(names(urls), dir),
                           lengths(urls))
        urls <- unlist(urls, use.names = FALSE)
    }
    url_db(urls, parents)
}

url_db_from_PDF_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
    urls <- parents <- character()
    if(is.null(files))
        files <- list.files(dir, pattern = "[.]pdf$",
                            full.names = TRUE,
                            recursive = recursive)
    ## FIXME: this is simpler to do with full.names = FALSE and without
    ## tools:::.file_path_relative_to_dir().
    urls <-
        lapply(files,
               function(f) {
                   if(verbose)
                       message(sprintf("processing %s",
                                       .file_path_relative_to_dir(f, dir)))
                   .get_urls_from_PDF_file(f)
               })
    names(urls) <- files
    urls <- Filter(length, urls)
    if(length(urls)) {
        parents <- rep.int(.file_path_relative_to_dir(names(urls), dir),
                           lengths(urls))
        urls <- unlist(urls, use.names = FALSE)
    }
    url_db(urls, parents)
}

url_db_from_package_Rd_db <-
function(db)
{
    urls <- Filter(length, lapply(db, .get_urls_from_Rd))
    url_db(unlist(urls, use.names = FALSE),
           rep.int(file.path("man", names(urls)),
                   lengths(urls)))
}

url_db_from_package_metadata <-
function(meta)
{
    urls <- character()
    fields <- c("URL", "BugReports")
    for(v in meta[fields]) {
        if(is.na(v)) next
        pattern <-
            "<(URL: *)?((https?|ftp)://[^[:space:],]*)[[:space:]]*>"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
        regmatches(v, m) <- ""
        pattern <- "(^|[^>\"])((https?|ftp)://[^[:space:],]*)"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
    }
    if(!is.na(v <- meta["Description"])) {
        pattern <-
            "<(URL: *)?((https?|ftp)://[^[:space:]]+)[[:space:]]*>"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
        regmatches(v, m) <- ""
        pattern <-
            "([^>\"])((https?|ftp)://[[:alnum:]/.:@+\\_~%#?=&;,-]+[[:alnum:]/])"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
    }

    url_db(urls, rep.int("DESCRIPTION", length(urls)))
}
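
## Rough sketch on a hand-made metadata vector (the field values are
## made up; the real callers pass the parsed DESCRIPTION metadata):
##   meta <- c(URL = "https://www.r-project.org, https://bugs.r-project.org",
##             BugReports = NA,
##             Description = "See <https://www.r-project.org/about.html>.")
##   url_db_from_package_metadata(meta)
## should record all three URLs with Parent "DESCRIPTION".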

url_db_from_package_citation <-
function(dir, meta, installed = FALSE)
{
    urls <- character()
    path <- if(installed) "CITATION" else file.path("inst", "CITATION")
    cfile <- file.path(dir, path)
    if(file.exists(cfile)) {
        cinfo <- .read_citation_quietly(cfile, meta)
        if(!inherits(cinfo, "error"))
            urls <- trimws(unique(unlist(cinfo$url, use.names = FALSE)))
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_news <-
function(dir, installed = FALSE)
{
    path <- if(installed) "NEWS.Rd" else file.path("inst", "NEWS.Rd")
    nfile <- file.path(dir, path)
    urls <-
        if(file.exists(nfile)) {
            macros <- initialRdMacros()
            .get_urls_from_Rd(prepare_Rd(parse_Rd(nfile, macros = macros),
                                         stages = "install"))
        } else character()
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_HTML_files <-
function(dir, installed = FALSE)
{
    path <- if(installed) "doc" else file.path("inst", "doc")
    files <- Sys.glob(file.path(dir, path, "*.html"))
    if(installed && file.exists(rfile <- file.path(dir, "README.html")))
        files <- c(files, rfile)
    url_db_from_HTML_files(dir, files = files)
}

url_db_from_package_README_md <-
function(dir, installed = FALSE)
{
    urls <- path <- character()
    rfile <- Filter(file.exists,
                    c(if(!installed)
                          file.path(dir, "inst", "README.md"),
                      file.path(dir, "README.md")))[1L]
    if(!is.na(rfile) && nzchar(Sys.which("pandoc"))) {
        path <- .file_path_relative_to_dir(rfile, dir)
        tfile <- tempfile("README", fileext = ".html")
        on.exit(unlink(tfile))
        out <- .pandoc_md_for_CRAN(rfile, tfile)
        if(!out$status) {
            urls <- .get_urls_from_HTML_file(tfile)
        }
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_NEWS_md <-
function(dir, installed = FALSE)
{
    urls <- path <- character()
    nfile <- Filter(file.exists,
                    c(if(!installed)
                          file.path(dir, "inst", "NEWS.md"),
                      file.path(dir, "NEWS.md")))[1L]
    if(!is.na(nfile) && nzchar(Sys.which("pandoc"))) {
        path <- .file_path_relative_to_dir(nfile, dir)
        tfile <- tempfile("NEWS", fileext = ".html")
        on.exit(unlink(tfile))
        out <- .pandoc_md_for_CRAN(nfile, tfile)
        if(!out$status) {
            urls <- .get_urls_from_HTML_file(tfile)
        }
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_sources <-
function(dir, add = FALSE) {
    meta <- .read_description(file.path(dir, "DESCRIPTION"))
    db <- rbind(url_db_from_package_metadata(meta),
                url_db_from_package_Rd_db(Rd_db(dir = dir)),
                url_db_from_package_citation(dir, meta),
                url_db_from_package_news(dir))
    if(requireNamespace("xml2", quietly = TRUE)) {
        db <- rbind(db,
                    url_db_from_package_HTML_files(dir),
                    url_db_from_package_README_md(dir),
                    url_db_from_package_NEWS_md(dir)
                    )
    }
    if(add)
        db$Parent <- file.path(basename(dir), db$Parent)
    db
}
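
## Typical (not run) use on an unpacked source package; the path below
## is a placeholder:
##   db <- url_db_from_package_sources("path/to/pkg")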

url_db_from_installed_packages <-
function(packages, lib.loc = NULL, verbose = FALSE)
{
    if(!length(packages)) return()
    one <- function(p) {
        if(verbose)
            message(sprintf("processing %s", p))
        dir <- system.file(package = p, lib.loc = lib.loc)
        if(dir == "") return()
        meta <- .read_description(file.path(dir, "DESCRIPTION"))
        rddb <- Rd_db(p, lib.loc = dirname(dir))
        db <- rbind(url_db_from_package_metadata(meta),
                    url_db_from_package_Rd_db(rddb),
                    url_db_from_package_citation(dir, meta,
                                                 installed = TRUE),
                    url_db_from_package_news(dir, installed = TRUE))
        if(requireNamespace("xml2", quietly = TRUE)) {
            db <- rbind(db,
                        url_db_from_package_HTML_files(dir,
                                                       installed = TRUE),
                        url_db_from_package_README_md(dir,
                                                      installed = TRUE),
                        url_db_from_package_NEWS_md(dir,
                                                    installed = TRUE)
                        )
        }
        db$Parent <- file.path(p, db$Parent)
        db
    }
    do.call(rbind,
            c(lapply(packages, one),
              list(make.row.names = FALSE)))
}
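
## Illustrative (not run; the package name is arbitrary and the result
## depends on what is installed):
##   url_db_from_installed_packages("MASS", verbose = TRUE)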

get_IANA_HTTP_status_code_db <-
function()
{
    ## See
    ## <https://www.iana.org/assignments/http-status-codes/http-status-codes.xhtml>
    baseurl <- "https://www.iana.org/assignments/http-status-codes/"
    db <- utils::read.csv(url(paste0(baseurl, "http-status-codes-1.csv")),
                          stringsAsFactors = FALSE)
    ## Drop "Unassigned".
    db[db$Description != "Unassigned", ]
}

## See <https://en.wikipedia.org/wiki/List_of_FTP_server_return_codes>
## and <http://tools.ietf.org/html/rfc959>,
## Section 4.2.2 "Numeric Order List of Reply Codes",
## and <https://tools.ietf.org/html/rfc2228>,
## Section 5 "New FTP Replies".
## Only need those >= 400.
table_of_FTP_server_return_codes <-
    c("421" = "Service not available, closing control connection.",
      "425" = "Can't open data connection.",
      "426" = "Connection closed; transfer aborted.",
      "430" = "Invalid username or password",
      "431" = "Need some unavailable resource to process security.",
      "434" = "Requested host unavailable.",
      "450" = "Requested file action not taken.",
      "451" = "Requested action aborted: local error in processing.",
      "452" = "Requested action not taken.  Insufficient storage space in system.",
      "500" = "Syntax error, command unrecognized.",
      "501" = "Syntax error in parameters or arguments.",
      "502" = "Command not implemented.",
      "503" = "Bad sequence of commands.",
      "504" = "Command not implemented for that parameter.",
      "530" = "Not logged in.",
      "532" = "Need account for storing files.",
      "533" = "Command protection level denied for policy reasons.",
      "534" = "Request denied for policy reasons.",
      "535" = "Failed security check (hash, sequence, etc).",
      "536" = "Requested PROT level not supported by mechanism.",
      "537" = "Command protection level not supported by security mechanism.",
      "550" = "Requested action not taken.  File unavailable",
      "551" = "Requested action aborted: page type unknown.",
      "552" = "Requested file action aborted.  Exceeded storage allocation (for current directory or dataset).",
      "553" = "Requested action not taken.  File name not allowed.",
      "631" = "Integrity protected reply.",
      "632" = "Confidentiality and integrity protected reply.",
      "633" = "Confidentiality protected reply."
      )

check_url_db <-
function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)
{
    use_curl <-
        !parallel &&
        config_val_to_logical(Sys.getenv("_R_CHECK_URLS_USE_CURL_",
                                         "TRUE")) &&
        requireNamespace("curl", quietly = TRUE)

    if(parallel && is.null(pool))
        pool <- curl::new_pool()    

    .gather <- function(u = character(),
                        p = list(),
                        s = rep.int("", length(u)),
                        m = rep.int("", length(u)),
                        new = rep.int("", length(u)),
                        cran = rep.int("", length(u)),
                        spaces = rep.int("", length(u)),
                        R = rep.int("", length(u))) {
        y <- data.frame(URL = u, From = I(p), Status = s, Message = m,
                        New = new, CRAN = cran, Spaces = spaces, R = R,
                        row.names = NULL, stringsAsFactors = FALSE)
        y$From <- p
        class(y) <- c("check_url_db", "data.frame")
        y
    }
    
    .fetch_headers <-
        if(parallel)
            function(urls)
                .fetch_headers_via_curl(urls, verbose, pool)
        else
            function(urls)
                .fetch_headers_via_base(urls, verbose)

    .check_ftp <- function(u, h) {
        if(inherits(h, "error")) {
            s <- "-1"
            msg <- sub("[[:space:]]*$", "", conditionMessage(h))
        } else {
            s <- as.character(attr(h, "status"))
            msg <- table_of_FTP_server_return_codes[s]
        }
        c(s, msg, "", "")
    }

    .check_http <- if(remote)
                       function(u, h) c(.check_http_A(u, h),
                                        .check_http_B(u))
                   else
                       function(u, h) c(rep.int("", 3L),
                                        .check_http_B(u))

    .check_http_A <- function(u, h) {
        newLoc <- ""
        if(inherits(h, "error")) {
            s <- "-1"
            msg <- sub("[[:space:]]*$", "", conditionMessage(h))
            if(grepl(paste(c("server certificate verification failed",
                             "failed to get server cert",
                             "libcurl error code (51|60)"),
                           collapse = "|"),
                     msg)) {
                h2 <- tryCatch(curlGetHeaders(u, verify = FALSE),
                               error = identity)
                s2 <- as.character(attr(h2, "status"))
                msg <- paste0(msg, "\n\t(Status without verification: ",
                              table_of_HTTP_status_codes[s2], ")")
            }
        } else {
            s <- as.character(attr(h, "status"))
            msg <- table_of_HTTP_status_codes[s]
        }
        ## Look for redirected URLs
        ## According to
        ## <https://tools.ietf.org/html/rfc7230#section-3.1.2> the first
        ## line of a response is the status-line, with "a possibly empty
        ## textual phrase describing the status code", so only look for
        ## a 301 status code in the first line.
        if(grepl(" 301 ", h[1L], useBytes = TRUE) &&
           !startsWith(u, "https://doi.org/") &&
           !startsWith(u, "http://dx.doi.org/")) {
            ## Get the new location from the last consecutive 301
            ## obtained.
            h <- split(h, c(0L, cumsum(h == "\r\n")[-length(h)]))
            i <- vapply(h,
                        function(e)
                            grepl(" 301 ", e[1L], useBytes = TRUE),
                        NA)
            h <- h[[which(!i)[1L] - 1L]]
            pos <- grep("^[Ll]ocation: ", h, useBytes = TRUE)
            if(length(pos)) {
                loc <- sub("^[Ll]ocation: ([^\r]*)\r\n", "\\1",
                           h[pos[1L]])
                ## Ouch.  According to RFC 7231, the location is a URI
                ## reference, and may be relative, in which case it needs
                ## resolving against the effective request URI, see
                ## <https://tools.ietf.org/html/rfc7231#section-7.1.2>.
                ## Not quite straightforward, hence do not report such
                ## 301s.
                ## (Alternatively, could try reporting the 301 but no
                ## new location.)
                if(nzchar(parse_URI_reference(loc)[1L, "scheme"]))
                    newLoc <- loc
                ## (Note also that fragments would need extra care.)
            }
        }
        ##
        if((s != "200") && use_curl) {
            g <- .curl_GET_status(u)
            if(g == "200") {
                s <- g
                msg <- "OK"
            }
        }
        ## A mis-configured site
        if (s == "503" && any(grepl("www.sciencedirect.com", c(u, newLoc))))
            s <- "405"
        c(s, msg, newLoc)
    }

    .check_http_B <- function(u) {
        ul <- tolower(u)
        cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) &&
                  !grepl("^https?://cran.r-project.org/web/packages/[.[:alnum:]_]+(html|pdf|rds)$",
                         ul)) ||
                 (grepl("^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$",
                        ul)) ||
                 startsWith(ul, "http://cran.r-project.org") ||
                 any(startsWith(ul, mirrors)))
        R <- grepl("^http://(www|bugs|journal).r-project.org", ul)
        spaces <- grepl(" ", u)
        c(if(cran) u else "", if(spaces) u else "", if(R) u else "")
    }

    bad <- .gather()

    if(!NROW(db)) return(bad)

    ## Could also use utils::getCRANmirrors(local.only = TRUE).
    mirrors <- c(utils::read.csv(file.path(R.home("doc"),
                                           "CRAN_mirrors.csv"),
                                 as.is = TRUE, encoding = "UTF-8")$URL,
                 "http://cran.rstudio.com/",
                 "https://cran.rstudio.com/")
    mirrors <- tolower(sub("/$", "", mirrors))

    if(inherits(db, "check_url_db")) {
        ## Allow re-checking check results.
        parents <- db$From
        urls <- db$URL
    } else {
        parents <- split(db$Parent, db$URL)
        urls <- names(parents)
    }

    parts <- parse_URI_reference(urls)

    ## Empty URLs.
    ind <- apply(parts == "", 1L, all)
    if(any(ind)) {
        len <- sum(ind)
        bad <- rbind(bad,
                     .gather(urls[ind],
                             parents[ind],
                             m = rep.int("Empty URL", len)))
    }

    ## Invalid URI schemes.
    schemes <- parts[, 1L]
    ind <- is.na(match(schemes,
                       c("",
                         IANA_URI_scheme_db$URI_Scheme,
                         ## Also allow 'javascript' scheme, see
                         ## <https://tools.ietf.org/html/draft-hoehrmann-javascript-scheme-03>
                         ## (but apparently never registered with IANA).
                         "javascript")))
    if(any(ind)) {
        len <- sum(ind)
        msg <- rep.int("Invalid URI scheme", len)
        doi <- schemes[ind] == "doi"
        if(any(doi))
            msg[doi] <- paste(msg[doi], "(use \\doi for DOIs in Rd markup)")
        bad <- rbind(bad,
                     .gather(urls[ind], parents[ind], m = msg))
    }

    ## ftp.
    pos <- which(schemes == "ftp")
    if(length(pos) && remote) {
        urlspos <- urls[pos]
        headers <- .fetch_headers(urlspos)
        results <- do.call(rbind, Map(.check_ftp, urlspos, headers))
        status <- as.numeric(results[, 1L])
        ind <- (status < 0L) | (status >= 400L)
        if(any(ind)) {
            pos <- pos[ind]
            s <- as.character(status[ind])
            s[s == "-1"] <- "Error"
            m <- results[ind, 2L]
            m[is.na(m)] <- ""
            bad <- rbind(bad,
                         .gather(urls[pos], parents[pos], s, m))
        }
    }

    ## http/https.
    pos <- which(schemes == "http" | schemes == "https")
    if(length(pos)) {
        urlspos <- urls[pos]
        headers <- .fetch_headers(urlspos)
        results <- do.call(rbind, Map(.check_http, urlspos, headers))
        status <- as.numeric(results[, 1L])
        ## 405 is HTTP not allowing HEAD requests
        ## maybe also skip 500, 503, 504 as likely to be temporary issues
        ind <- is.na(match(status, c(200L, 405L, NA))) |
            nzchar(results[, 3L]) |
            nzchar(results[, 4L]) |
            nzchar(results[, 5L]) |
            nzchar(results[, 6L])
        if(any(ind)) {
            pos <- pos[ind]
            s <- as.character(status[ind])
            s[is.na(s)] <- ""
            s[s == "-1"] <- "Error"
            m <- results[ind, 2L]
            m[is.na(m)] <- ""
            bad <- rbind(bad,
                         .gather(urls[pos], parents[pos], s, m,
                                 results[ind, 3L],
                                 results[ind, 4L],
                                 results[ind, 5L],
                                 results[ind, 6L]))
        }
    }
    bad
}
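
## Illustrative (not run; performs network requests, and the source
## directory below is a placeholder):
##   db <- url_db_from_package_sources("path/to/pkg")
##   bad <- check_url_db(db, verbose = TRUE)
##   bad   # zero rows when all URLs check out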

format.check_url_db <-
function(x, ...)
{
    if(!NROW(x)) return(character())

    u <- x$URL
    new <- x$New
    ind <- nzchar(new)
    if(any(ind)) {
        u[ind] <- sprintf("%s (moved to %s)", u[ind], new[ind])
        if(config_val_to_logical(Sys.getenv("_R_CHECK_URLS_SHOW_301_STATUS_",
                                            "FALSE"))) {
            x$Message[ind] <- "Moved Permanently"
            x$Status[ind] <- "301"
        }
    }

    paste0(sprintf("URL: %s", u),
           sprintf("\nFrom: %s",
                   vapply(x$From, paste, "", collapse = "\n      ")),
           ifelse((s <- x$Status) == "",
                  "",
                  sprintf("\nStatus: %s", s)),
           ifelse((m <- x$Message) == "",
                  "",
                  sprintf("\nMessage: %s", gsub("\n", "\n  ", m, fixed=TRUE))),
           ifelse((m <- x$Spaces) == "",
                  "",
                  "\nURL contains spaces"),
           ifelse((m <- x$CRAN) == "",
                  "",
                  "\nCRAN URL not in canonical form"),
           ifelse((m <- x$R) == "",
                  "",
                  "\nR-project URL not in canonical form")
           )
}

print.check_url_db <-
function(x, ...)
{
    if(NROW(x))
        writeLines(paste(format(x), collapse = "\n\n"))
    invisible(x)
}

as.matrix.check_url_db <-
function(x, ...)
{
    n <- lengths(x[["From"]])
    y <- do.call(cbind,
                 c(list(URL = rep.int(x[["URL"]], n),
                        Parent = unlist(x[["From"]])),
                   lapply(x[-c(1L, 2L)], rep.int, n)))
    rownames(y) <- NULL
    y
}

.fetch_headers_via_base <- function(urls, verbose = FALSE, ids = urls)
    Map(function(u, verbose, i) {
            if(verbose) message(sprintf("processing %s", i))
            tryCatch(curlGetHeaders(u), error = identity)
        },
        urls, verbose, ids)

.fetch_headers_via_curl <- function(urls, verbose = FALSE, pool = NULL) {

    .progress_bar <- function(length, msg = "") {
        bar <- new.env(parent = baseenv())
        if(is.null(length)) {
            length <- 0L
        }
        ## <FIXME>
        ## make codetools happy
        done <- fmt <- NULL
        ## </FIXME>
        bar$length <- length
        bar$done <- -1L
        digits <- trunc(log10(length)) + 1L
        bar$fmt <- paste0("\r", msg, "[ %", digits, "i / %", digits, "i ]")
        bar$update <- function() {
            assign("done", inherits = TRUE, done + 1L)
            if (length <= 0L) {
                return()
            }
            if (done >= length) {
                cat("\r", strrep(" ", nchar(fmt)), "\r", sep = "")
            } else {
                cat(sprintf(fmt, done, length), sep = "")
            }
        }
        environment(bar$update) <- bar
        bar$update()
        bar
    }

    if(is.null(pool))
        pool <- curl::new_pool()

    hs <- vector("list", length(urls))

    bar <- .progress_bar(if (verbose) length(urls), msg = "fetching ")
    for(i in seq_along(hs)) {
        u <- urls[[i]]
        h <- curl::new_handle(url = u)
        curl::handle_setopt(h,
                            nobody = TRUE,
                            cookiesession = 1L,
                            followlocation = 1L,
                            http_version = 2L,
                            ssl_enable_alpn = 0L)
        timeout <- as.integer(getOption("timeout"))
        if(!is.na(timeout) && (timeout > 0L))
            curl::handle_setopt(h,
                                connecttimeout = timeout,
                                timeout = timeout)
        if(grepl("^https?://github[.]com", u) &&
           nzchar(a <- Sys.getenv("GITHUB_PAT", ""))) {
            curl::handle_setheaders(h, "Authorization" = paste("token", a))
        }
        handle_result <- local({
            i <- i
            function(x) {
                hs[[i]] <<- x
                bar$update()
            }
        })
        handle_error <- local({
            i <- i
            function(x) {
                hs[[i]] <<-
                    structure(list(message = x),
                              class = c("curl_error", "error", "condition"))
                bar$update()
            }
        })
        curl::multi_add(h,
                        done = handle_result,
                        fail = handle_error,
                        pool = pool)
    }

    curl::multi_run(pool = pool)
   
    out <- vector("list", length(hs))
    for(i in seq_along(out)) {
        if(inherits(hs[[i]], "error")) {
            out[[i]] <- hs[[i]]
        } else {
            out[[i]] <- strsplit(rawToChar(hs[[i]]$headers),
                                 "(?<=\r\n)",
                                 perl = TRUE)[[1L]]
            attr(out[[i]], "status") <- hs[[i]]$status_code
        }
    }
        
    out
}
    
.curl_GET_status <-
function(u, verbose = FALSE)
{
    if(verbose)
        message(sprintf("processing %s", u))
    ## Configure curl handle for better luck with JSTOR URLs/DOIs.
    ## Alternatively, special-case requests to
    ##   https?://doi.org/10.2307
    ##   https?://www.jstor.org
    h <- curl::new_handle()
    curl::handle_setopt(h,
                        cookiesession = 1,
                        followlocation = 1,
                        http_version = 2L,
                        ssl_enable_alpn = 0)
    timeout <- as.integer(getOption("timeout"))
    if(!is.na(timeout) && (timeout > 0L))
        curl::handle_setopt(h,
                            connecttimeout = timeout,
                            timeout = timeout)
    if(startsWith(u, "https://github.com") &&
       nzchar(a <- Sys.getenv("GITHUB_PAT", "")))
        curl::handle_setheaders(h, "Authorization" = paste("token", a))
    g <- tryCatch(curl::curl_fetch_memory(u, handle = h),
                  error = identity)
    if(inherits(g, "error"))
        -1L
    else
        g$status_code
}
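
## Illustrative (not run; needs the curl package and network access):
##   .curl_GET_status("https://www.r-project.org")   # 200 if reachable
##   .curl_GET_status("https://example.invalid")     # -1 on a curl error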
