R/plugins_get.R

Defines functions warn_crossref_tdm got_link_ft crossref_ft transtech_ft cob_ft cambridge_ft aip_ft instinvestfil_ft amersocclinoncol_ft amersocmicrobiol_ft jama_ft microbiology_ft roysoc_ft pnas_ft aaas_ft ieee_ft roysocchem_ft informa_ft scientificsocieties_ft wiley_ft sciencedirect_ft elsevier_ft biorxiv_ft arxiv_ft copernicus_ft pensoft_ft frontiersin_ft peerj_ft elife_ft bmc_ft entrez_ft plos_wrapper plapply plugin_get_generator error_df http_mssg get_ft check_file tcat ft_error ft_object null_list make_key pprint_cache type_in construct_paths

# get plugins --------------------------------------

## utilities
construct_paths <- function(co, x, type){
  if (!co$cache) {
    list(backend = NULL, path = "session", data = x)
  } else {
    list(
      backend = co$backend,
      cache_path = co$path,
      path = x,
      data = NULL
    )
  }
}

type_in <- function(type, types, name) {
  if (!type %in% types) {
    stop(sprintf("'type' for %s must be %s", name,
      paste0(sprintf("'%s'", types), collapse=", ")))
  }
}

pprint_cache <- function(x) list(backend = NULL, path = x, data = NULL)

make_key <- function(id, type) {
  hash <- doi_normalize(id)
  file_ext <- get_file_ext(type)
  file.path(cache_options_get()$path, paste0(hash, file_ext))
}

null_list <- function(opts) {
  list(found = NULL, dois = NULL, data = NULL, opts = opts, 
    errors = data.frame(NULL))
}

ft_object <- function(path, id, type) {
  list(path = path, id = id, type = type, error = NULL)
}

ft_error <- function(error, id) {
  list(path = NULL, id = id, type = NULL, error = error)
}

tcat <- function(...) {
  tryCatch(..., error = function(e) e, warning = function(w) w)
}

check_file <- function(w) {
  if (!file.exists(w)) return(FALSE) # doesn't exist
  if (!grepl("\\.xml", w)) return(FALSE) # not an xml file
  if (length(readLines(w)) == 0) return(FALSE) # nothing in the file
  xml <- tryCatch(xml2::read_xml(w), error = function(e) e)
  if (inherits(xml, c("error", "warning"))) return(FALSE) # unknown read errors
  tried <- tryCatch(xml2::xml_find_all(xml, "//ce:*"), 
    error = function(e) e, warning = function(w) w)
  if (inherits(tried, c("error", "warning"))) return(FALSE) # xpath errors
  length(tried) == 0
}

get_ft <- function(x, type, url, path, headers = list(), ...) {
  els_retain_non_ft <- as.logical(Sys.getenv("ELSEVIER_RETAIN_NON_FT", FALSE))
  cli <- crul::HttpClient$new(
    url = url, 
    opts = c(list(followlocation = 1), ...),
    headers = headers
  )
  #cat(paste0("within get_ft: ", cli$url), sep="\n")
  res <- tryCatch(cli$get(disk = path), 
    error = function(e) e,
    warning = function(w) w)
  #cat(class(res)[1L], sep = "\n")

  # if an error cleanup by deleting the file
  ## do elsevier check first
  elsevier_check <- (
    grepl("elsevier", res$url, ignore.case = TRUE) && 
    check_file(path) &&
    !els_retain_non_ft
  )
  if (
    inherits(res, c("error", "warning")) ||  ## an error from tryCatch
    res$status_code > 201 || ## HTTP status code indicates an error
    !grepl(type, res$response_headers[['content-type']]) || ## content type does not match
    inherits(
      tryCatch(
        switch(type, 
          xml = xml2::read_xml(res$content), 
          pdf = pdftools::pdf_info(res$content)), 
      error=function(e) e), "error") || ## invalid file, somehow gave 200 code
    elsevier_check ## got Elsevier abstract, but that's it
  ) {
    unlink(path)
    mssg <- if (inherits(res, c("error", "warning"))) {
      # tryCatch message
      res$message 
    } else if (
      !grepl(type, res$response_headers[['content-type']]) && 
      !is.null(res$response_headers[['content-type']])
    ) {
      # content type mismatch
      ct <- res$response_headers[['content-type']]
      sprintf("type was supposed to be `%s`, but was `%s`", type, ct)
    } else if (elsevier_check) {
      sprintf("elsevier: got abstract only; likely do not have access")
    } else {
      # if all else fails just give a HTTP status code message back
      http_mssg(res)
    }
    if ("url" %in% names(res)) {
      if (grepl("wiley", res$url, ignore.case = TRUE)) {
        mssg <- paste(mssg, res$response_headers$status, sep = " & ")
      }
    }
    warning("you may not have access to ", x, 
      "\n or an error occurred", 
      "\n or the downloaded file was invalid", 
      call. = FALSE)
    return(ft_error(mssg, x))
  }
  # if success return object
  return(ft_object(res$content, x, type))
}
http_mssg <- function(x) {
  b <- x$status_http()
  sprintf("(%s) %s", b$status_code, b$message)
}

# x = dat
error_df <- function(x) {
  tmp <- lapply(x$path, "[[", "error")
  data.frame(
    id = names(tmp), 
    error = unlist(Map(function(z) if (is.null(z)) NA_character_ else z, unname(tmp))), 
    stringsAsFactors = FALSE)
}

## plugin generator
plugin_get_generator <- function(srce, fun) {
  function(sources, ids, opts, type, url_pattern = NULL, 
    progress = FALSE, ...) {

    if (any(grepl("plos", sources))) {
      ids <- grep("annotation", ids, value = TRUE, invert = TRUE)
    }
    
    # do request
    ids <- stats::na.omit(ids)
    callopts <- list(...)
    if (any(grepl(eval(srce), sources))) {
      if (!any(grepl("arxiv", sources))) check_dois(ids)
      if (any(grepl("plos", sources))) {
        opts$doi <- ids
      } else if (any(sources %in% c("entrez"))) {
        opts$dois <- ids
      } else {
        opts$dois <- ids
      }

      opts <- c(opts, callopts)
      opts$type <- type
      opts$progress <- progress
      if (!is.null(url_pattern)) opts$url_pattern <- url_pattern
      out <- do.call(fun, opts)
      
      # deals with case where no results
      if (length(out) == 0) return(null_list(opts))

      dat <- if (any(sources %in% c("arxiv", "biorxiv"))) {
        pprint_cache(out)
      } else {
        construct_paths(cache_options_get(), out, type)
      }
      list(found = length(ft_compact(lapply(out, "[[", "path"))), dois = names(out), 
        data = dat, opts = opts, errors = error_df(dat))
    } else {
      null_list(opts)
    }
  }
}

## make plugins
plugin_get_plos <- plugin_get_generator("plos", plos_wrapper)
plugin_get_bmc <- plugin_get_generator("bmc", bmc_ft)
plugin_get_elife <- plugin_get_generator("elife", elife_ft)
plugin_get_peerj <- plugin_get_generator("peerj", peerj_ft)
plugin_get_frontiersin <- plugin_get_generator("frontiersin", frontiersin_ft)
plugin_get_pensoft <- plugin_get_generator("pensoft", pensoft_ft)
plugin_get_copernicus <- plugin_get_generator("copernicus", copernicus_ft)
plugin_get_entrez <- plugin_get_generator("entrez", entrez_ft)
plugin_get_biorxiv <- plugin_get_generator("biorxiv", biorxiv_ft)
plugin_get_arxiv <- plugin_get_generator("arxiv", arxiv_ft)
plugin_get_elsevier <- plugin_get_generator("elsevier", elsevier_ft)
plugin_get_sciencedirect <- plugin_get_generator("sciencedirect", sciencedirect_ft)
plugin_get_wiley <- plugin_get_generator("wiley", wiley_ft)
plugin_get_scientificsocieties <- plugin_get_generator("scientificsocieties", scientificsocieties_ft)
plugin_get_informa <- plugin_get_generator("informa", informa_ft)
plugin_get_crossref <- plugin_get_generator("crossref", crossref_ft)
plugin_get_links <- plugin_get_generator("generic", got_link_ft)
plugin_get_royalsocchem <- plugin_get_generator("royalsocchem", roysocchem_ft)
plugin_get_ieee <- plugin_get_generator("ieee", ieee_ft)
plugin_get_aaas <- plugin_get_generator("aaas", aaas_ft)
plugin_get_pnas <- plugin_get_generator("pnas", pnas_ft)
plugin_get_microbiology <- plugin_get_generator("microbiology", microbiology_ft)
plugin_get_jama <- plugin_get_generator("jama", jama_ft)
plugin_get_amersocmicrobiol <- plugin_get_generator("amersocmicrobiol", amersocmicrobiol_ft)
plugin_get_amersocclinoncol <- plugin_get_generator("amersocclinoncol", amersocclinoncol_ft)
plugin_get_instinvestfil <- plugin_get_generator("instinvestfil", instinvestfil_ft)
plugin_get_aip <- plugin_get_generator("aip", aip_ft)
plugin_get_cambridge <- plugin_get_generator("cambridge", cambridge_ft)
plugin_get_cob <- plugin_get_generator("cob", cob_ft)
plugin_get_roysoc <- plugin_get_generator("roysoc", roysoc_ft)
plugin_get_transtech <- plugin_get_generator("transtech", transtech_ft)

# lapply replacement with progress bar: actual a for loop internally
plapply <- function(x, FUN, type = NULL, progress = FALSE, ...) {
  if (progress) {
    pb <- utils::txtProgressBar(min = 0, max = length(x), 
      initial = 0, style = 3)
    on.exit(close(pb))
  }
  out <- vector(mode = "list", length = length(x))
  for (i in seq_along(x)) {
    if (progress) utils::setTxtProgressBar(pb, i)
    out[[i]] <- FUN(x[[i]], type = type, progress = progress, ...)
  }
  out <- stats::setNames(out, x)
  return(out)
}

## getters - could stand to make closure for the below as well, FIXME
# plos
plos_wrapper <- function(dois, type, progress = FALSE, ...) {
  plos_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(ftd_doi(x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    url <- res[grepl(type, res$content_type), ]$url
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, plos_fun, type, progress, ...)
}

# Entrez - wrapper around rentrez::entrez_search/rentrez::entrez_fetch
# type: only xml
entrez_ft <- function(dois, type = "xml", progress = FALSE, ...) {
  ids <- stats::na.omit(dois)
  db <- "pmc"
  if (length(ids) > 50) {
    chunk_size <- 50
    ids_chunked <- split(ids, ceiling(seq_along(ids)/chunk_size))
    out <- list()
    for (i in seq_along(ids_chunked)) {
      out[[i]] <- rentrez::entrez_search(
        db = db,
        term = paste0(sprintf('%s[doi]', ids_chunked[[i]]), collapse = "|")
      )
    }
    res <- list(ids = unlist(lapply(out, function(z) z$ids)))
  } else {
    res <- rentrez::entrez_search(
      db = db,
      term = paste0(sprintf('%s[doi]', ids), collapse = "|")
    )
  }
  
  if (length(res$ids) == 0) {
    db <- 'pubmed'
    if (length(ids) > 50) {
      chunk_size <- 50
      ids_chunked <- split(ids, ceiling(seq_along(ids)/chunk_size))
      out <- list()
      for (i in seq_along(ids_chunked)) {
        out[[i]] <- rentrez::entrez_search(
          db = db,
          term = paste0(sprintf('%s[doi]', ids_chunked[[i]]), collapse = "|")
        )
      }
      res <- list(ids = unlist(lapply(out, function(z) z$ids)))
    } else {
      res <- rentrez::entrez_search(
        db = db,
        term = paste0(sprintf('%s[doi]', ids), collapse = "|")
      )
    }
  }

  if (length(res$ids) == 0) return(NULL)
  ent_fun <- function(z, progress, db, ...) {
    path <- make_key(z, 'xml')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, z, 'xml'))
    }
    # have to keep this httr usage
    invisible(
      rentrez::entrez_fetch(db = db, id = z, rettype = "xml", 
        config = c(httr_write_disk(path, cache_options_get()$overwrite), ...)
      )
    )
    ft_object(path, z, 'xml')
  }
  plapply(res$ids, ent_fun, progress = progress, db = db, ...)
}

# type: xml only presumably?
bmc_ft <- function(dois, type = "xml", progress = FALSE, ...) {
  res <- rentrez::entrez_search(
    db = "pubmed", term = paste0(sprintf('%s[doi]', dois), collapse = "|"))
  if (length(res$ids) == 0) return(NULL)
  bmc_fun <- function(z, type, progress = FALSE, ...) {
    path <- make_key(z, 'xml')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, z, 'xml'))
    }
    # have to keep this httr usage
    invisible(rentrez::entrez_fetch(db = 'pubmed', id = z, 
      rettype = "xml", config = httr_write_disk(path, cache_options_get()$overwrite)))
    ft_object(path, z, 'xml')
  }
  plapply(res$ids, bmc_fun, type, progress, ...)
}

# type: xml and pdf
elife_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "elife")
  elife_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    lk <- tcat(ft_cr_links(x))
    if (inherits(lk, "error")) return(ft_error(lk$message, x))
    lk <- tcat(Filter(function(x) grepl(paste0("\\.", type), x), lk)[[1]][[1]])
    get_ft(x, type, lk, path, list(), ...)
  }
  plapply(dois, elife_fun, type, progress, ...)
}

# type: xml and pdf
peerj_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "peerj")
  peerj_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(ftd_doi(x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    url <- res[grepl(type, res$content_type), ]$url
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, peerj_fun, type, progress, ...)
}

# type: xml and pdf
frontiersin_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "frontiersin")
  fronteiersin_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(ftd_doi(x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    url <- res[grepl(type, res$content_type), ]$url
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, fronteiersin_fun, type, progress, ...)
}

# type: xml and pdf
pensoft_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "pensoft")
  pensoft_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(ftd_doi(x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    url <- res[grepl(type, res$content_type), ]$url
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, pensoft_fun, type, progress, ...)
}

# type: xml and pdf
copernicus_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "copernicus")
  copernicus_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(crul::HttpClient$new(url = paste0("https://doi.org/", x))$head())
    if (inherits(res, c("error", "warning")) || !res$success()) {
      mssg <- if (inherits(res, c("error", "warning"))) res$message else http_mssg(res)
      return(ft_error(mssg, x))
    }
    url <- paste0(res$url, sub("10.5194/", "", x), ".", type)
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, copernicus_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
arxiv_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  arxiv_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    url <- sprintf("http://arxiv.org/pdf/%s.pdf", x)
    get_ft(x, 'pdf', url, path, ...)
  }
  plapply(dois, arxiv_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
biorxiv_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  biorxiv_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x, 'pdf', url, path, ...)
  }
  plapply(dois, biorxiv_fun, type, progress, ...)
}

# type: plain and xml
elsevier_ft <- function(dois, type, progress = FALSE, retain_non_ft = FALSE, ...) {
  warn_crossref_tdm()
  assert(retain_non_ft, "logical")
  type_in(type, c('plain', 'xml', 'pdf'), "elsevier")
  elsevier_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(rcrossref::cr_works(dois = x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    res <- res$data$link[[1]]
    url <- res[res$content.type == paste0("text/", type), "URL"][[1]]
    if (is.null(url) || length(url) == 0) {
      mssg <- "has no link available"
      warning(x, " with type '", type, "' ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    header <- list(
      # See https://dev.elsevier.com/tecdoc_text_mining.html
      `X-ELS-APIKey` = Sys.getenv("ELSEVIER_TDM_KEY"),
      Accept = paste0(switch(type, xml = "text/", plain = "text/"), type)
    )
    get_ft(x, type, url, path, header, ...)
  }
  Sys.setenv(ELSEVIER_RETAIN_NON_FT = retain_non_ft)
  on.exit(Sys.unsetenv("ELSEVIER_RETAIN_NON_FT"), add = TRUE)
  plapply(dois, elsevier_fun, type, progress, ...)
}

# type: plain and xml
sciencedirect_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('plain', 'xml'), "sciencedirect")
  sciencedirect_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    
    # See https://dev.elsevier.com/tecdoc_text_mining.html
    header <- list(
      `X-ELS-APIKey` = Sys.getenv("ELSEVIER_TDM_KEY"),
      Accept = paste0(switch(type, xml = "text/", plain = "text/"), type)
    )
    
    # We specifically ask for the full text. Thus, we get an error if not available
    # instead of silently just an abstract or some other metadata.
    url <- paste0("https://api.elsevier.com/content/article/doi/", x, "?view=FULL")
    http <- crul::HttpClient$new(
      url = url,
      headers = header,
      opts = list(...)
    )
    res <- tcat(http$head())
    if (inherits(res, c("error", "warning")) || !res$success()) {
      if (inherits(res, c("error", "warning"))) {
        mssg <- res$message 
      } else {
        # Elsevier gives an extra message in case of errors
        if (is.null(res$response_headers$`x-els-status`)) {
          elsevier_status <- ""
        } else {
          elsevier_status <- paste(":", res$response_headers$`x-els-status`)
        }
        mssg <- paste0(http_mssg(res), elsevier_status)
      }
      return(ft_error(mssg, x))
    }
    
    get_ft(x, type, url, path, header, ...)
  }
  plapply(dois, sciencedirect_fun, type, progress, ...)
}

# type: pdf and xml
wiley_ft <- function(dois, type, progress = FALSE, ...) {
  warn_crossref_tdm()
  type_in(type, c('pdf', 'xml'), "wiley")
  wiley_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    if (type == "pdf") {
      url <- paste0("https://api.wiley.com/onlinelibrary/tdm/v1/articles/",
        sub("/", "%2F", x))
    } else {
      url <- paste0("https://onlinelibrary.wiley.com/doi/full-xml/",
        sub("/", "%2F", x))
    }
    header <- list(
      # See https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining
      `Wiley-TDM-Client-Token` = Sys.getenv("WILEY_TDM_KEY"),
      Accept = paste0(switch(type, xml = "application/", pdf = "application/"), type)
    )
    suppressWarnings(get_ft(x, type, url, path, header, ...))
  }
  plapply(dois, wiley_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
scientificsocieties_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  scs_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tryCatch(ft_cr_links(x), error = function(e) e, 
      warning = function(w) w)
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    if (is.null(lk) || length(lk) == 0) {
      mssg <- "has no link available"
      warning(x, " ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    get_ft(x, 'pdf', lk[[1]][[1]], path, ...)
  }
  plapply(dois, scs_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
informa_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  informa_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tryCatch(ft_cr_links(x), error = function(e) e, warning = function(w) w)
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    if (is.null(lk) || length(lk) == 0) {
      mssg <- "has no link available"
      warning(x, " ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    get_ft(x, 'pdf', lk[[1]][[1]], path, ...)
  }
  plapply(dois, informa_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
roysocchem_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  roysocchem_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, roysocchem_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
ieee_ft <- function(dois, type = "pdf",progress = FALSE, ...) {
  ieee_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tryCatch(ft_cr_links(x), error = function(e) e, warning = function(w) w)
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    if (is.null(lk) || length(lk) == 0) {
      mssg <- "has no link available"
      warning(x, " ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    get_ft(x = x, type = 'pdf', url = lk[[1]][[1]], path = path, ...)
  }
  plapply(dois, ieee_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
aaas_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  aaas_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, aaas_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
pnas_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  pnas_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, pnas_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
roysoc_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  roysoc_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, roysoc_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
microbiology_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  microbiology_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, microbiology_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
jama_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  jama_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, jama_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
amersocmicrobiol_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  amersocmicrobiol_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, amersocmicrobiol_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
amersocclinoncol_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  amersocclinoncol_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, amersocclinoncol_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
instinvestfil_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  instinvestfil_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tryCatch(ft_cr_links(x), error = function(e) e, warning = function(w) w)
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    if (is.null(lk) || length(lk) == 0) {
      mssg <- "has no link available"
      warning(x, " ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    url = sub('view', 'download', lk[[1]][[1]])
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, instinvestfil_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
aip_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  aip_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, aip_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
cambridge_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  cambridge_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    res <- tcat(crul::HttpClient$new(url = paste0("https://doi.org/", x),
      opts=list(followlocation=1))$get())
    if (inherits(res, c("error", "warning")) || !res$success()) {
      mssg <- if (inherits(res, c("error", "warning"))) res$message else http_mssg(res)
      return(ft_error(mssg, x))
    }
    html <- xml2::read_html(res$parse("UTF-8"))
    node <- xml2::xml_find_first(html, "//meta[@name=\"citation_pdf_url\"]")
    url <- xml2::xml_attr(node, "content")
    if (all(is.na(url))) {
      mssg <- "no pdf link found or you may not have access to the article"
      return(ft_error(mssg, x))
    }
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, cambridge_fun, type, progress, ...)
}

# company_of_biologists
# type: only pdf (type parameter is ignored)
cob_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  cob_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, cob_fun, type, progress, ...)
}

# Trans Tech Publications
# type: only pdf (type parameter is ignored)
transtech_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  tt_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    # ftdoi_get(member = names(dfsplit)[i])
    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, tt_fun, type, progress, ...)
}

# special Crossref plugin to try any DOI
crossref_ft <- function(dois, type, progress = FALSE, ...) {
  crossref_fun <- function(x, type, progress, ...) {
    lks <- tcat(ft_cr_links(x))
    if (inherits(lks, c("error", "warning")) || inherits(lks, "warning")) {
      mssg <- lks$message
      if (inherits(lks, "error")) warning(x, " has no full text links available", call. = FALSE)
      if (inherits(lks, "warning")) warning(x, " not found", call. = FALSE)
      return(ft_error(mssg, x))
    }
    if (any(names(lks) == "unspecified")) {
      for (i in seq_along(lks)) {
        if (names(lks)[i] == "unspecified") {
          extype <- strextract(lks[[i]][[1]], "xml|pdf")
          if (length(extype) != 0) names(lks)[i] <- extype
        }
      }
    }
    lk <- lks[[type]]
    types <- c('xml', 'pdf')
    if (is.null(lk)) {
      lk <- lks[[ types[!types %in% type] ]]
      type <- types[!types %in% type]
      if (is.null(lk)) return(ft_error("no link found from Crossref", x))
    }

    # check if we already have it
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }

    # update path
    path <- sub("[A-Za-z]+$", type, path)

    # fetch it
    get_ft(x, type, lk[[1]], path, ...)
  }
  plapply(dois, crossref_fun, type, progress, ...)
}

# special plugin when link already in hand
got_link_ft <- function(dois, type, url_pattern, progress = FALSE, ...) {
  link_fun <- function(x, type, progress, ...) {
    # try for the type passed, if not found, try for another
    types <- c('xml', 'pdf')
    pat <- url_pattern[[type]]
    if (is.null(pat)) {
      pat <- url_pattern[[ types[!types %in% type] ]]
      type <- types[!types %in% type]
      if (is.null(pat)) return(ft_error("no appropriate link found", x))
    }
    url <- sprintf(pat, x)
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, link_fun, type, progress, ...)
}

warn_crossref_tdm <- function(x) {
  crtdm <- 'CROSSREF_TDM'
  if (nzchar(Sys.getenv(crtdm)))
    warning(sprintf(
      "`%s` env var found: see 'Authentication' section of ?fulltext",
      crtdm), call. = FALSE)
}

Try the fulltext package in your browser

Any scripts or data that you put into this service are public.

fulltext documentation built on June 12, 2021, 9:06 a.m.