R/plugins_get.R
In fulltext: Full Text of 'Scholarly' Articles Across Many Data Sources

# get plugins --------------------------------------

## utilities
construct_paths <- function(co, x, type){
  if (!co$cache) {
    list(backend = NULL, path = "session", data = x)
  } else {
    list(
      backend = co$backend,
      cache_path = co$path,
      path = x,
      data = NULL
    )
  }
}

type_in <- function(type, types, name) {
  if (!type %in% types) {
    stop(sprintf("'type' for %s must be %s", name,
      paste0(sprintf("'%s'", types), collapse=", ")))
  }
}

pprint_cache <- function(x) list(backend = NULL, path = x, data = NULL)

make_key <- function(id, type) {
  hash <- doi_normalize(id)
  file_ext <- get_file_ext(type)
  file.path(cache_options_get()$path, paste0(hash, file_ext))
}

null_list <- function(opts) {
  list(found = NULL, dois = NULL, data = NULL, opts = opts, 
    errors = data.frame(NULL))
}

ft_object <- function(path, id, type) {
  list(path = path, id = id, type = type, error = NULL)
}

ft_error <- function(error, id) {
  list(path = NULL, id = id, type = NULL, error = error)
}

tcat <- function(...) {
  tryCatch(..., error = function(e) e, warning = function(w) w)
}

check_file <- function(w) {
  if (!file.exists(w)) return(FALSE) # doesn't exist
  if (!grepl("\\.xml", w)) return(FALSE) # not an xml file
  if (length(readLines(w)) == 0) return(FALSE) # nothing in the file
  xml <- tryCatch(xml2::read_xml(w), error = function(e) e)
  if (inherits(xml, c("error", "warning"))) return(FALSE) # unknown read errors
  tried <- tryCatch(xml2::xml_find_all(xml, "//ce:*"), 
    error = function(e) e, warning = function(w) w)
  if (inherits(tried, c("error", "warning"))) return(FALSE) # xpath errors
  length(tried) == 0
}

get_ft <- function(x, type, url, path, headers = list(), ...) {
  els_retain_non_ft <- as.logical(Sys.getenv("ELSEVIER_RETAIN_NON_FT", FALSE))
  cli <- crul::HttpClient$new(
    url = url, 
    opts = c(list(followlocation = 1), ...),
    headers = headers
  )
  #cat(paste0("within get_ft: ", cli$url), sep="\n")
  res <- tryCatch(cli$get(disk = path), 
    error = function(e) e,
    warning = function(w) w)
  #cat(class(res)[1L], sep = "\n")

  # if an error cleanup by deleting the file
  ## do elsevier check first
  elsevier_check <- (
    grepl("elsevier", res$url, ignore.case = TRUE) && 
    check_file(path) &&
    !els_retain_non_ft
  )
  if (
    inherits(res, c("error", "warning")) ||  ## an error from tryCatch
    res$status_code > 201 || ## HTTP status code indicates an error
    !grepl(type, res$response_headers[['content-type']]) || ## content type does not match
    inherits(
      tryCatch(
        switch(type, 
          xml = xml2::read_xml(res$content), 
          pdf = pdftools::pdf_info(res$content)), 
      error=function(e) e), "error") || ## invalid file, somehow gave 200 code
    elsevier_check ## got Elsevier abstract, but that's it
  ) {
    unlink(path)
    mssg <- if (inherits(res, c("error", "warning"))) {
      # tryCatch message
      res$message 
    } else if (
      !grepl(type, res$response_headers[['content-type']]) && 
      !is.null(res$response_headers[['content-type']])
    ) {
      # content type mismatch
      ct <- res$response_headers[['content-type']]
      sprintf("type was supposed to be `%s`, but was `%s`", type, ct)
    } else if (elsevier_check) {
      sprintf("elsevier: got abstract only; likely do not have access")
    } else {
      # if all else fails just give a HTTP status code message back
      http_mssg(res)
    }
    if ("url" %in% names(res)) {
      if (grepl("wiley", res$url, ignore.case = TRUE)) {
        mssg <- paste(mssg, res$response_headers$status, sep = " & ")
      }
    }
    warning("you may not have access to ", x, 
      "\n or an error occurred", 
      "\n or the downloaded file was invalid", 
      call. = FALSE)
    return(ft_error(mssg, x))
  }
  # if success return object
  return(ft_object(res$content, x, type))
}
http_mssg <- function(x) {
  b <- x$status_http()
  sprintf("(%s) %s", b$status_code, b$message)
}

# x = dat
error_df <- function(x) {
  tmp <- lapply(x$path, "[[", "error")
  data.frame(
    id = names(tmp), 
    error = unlist(Map(function(z) if (is.null(z)) NA_character_ else z, unname(tmp))), 
    stringsAsFactors = FALSE)
}

## plugin generator
plugin_get_generator <- function(srce, fun) {
  function(sources, ids, opts, type, url_pattern = NULL, 
    progress = FALSE, ...) {

    if (any(grepl("plos", sources))) {
      ids <- grep("annotation", ids, value = TRUE, invert = TRUE)
    }
    
    # do request
    ids <- stats::na.omit(ids)
    callopts <- list(...)
    if (any(grepl(eval(srce), sources))) {
      if (!any(grepl("arxiv", sources))) check_dois(ids)
      if (any(grepl("plos", sources))) {
        opts$doi <- ids
      } else if (any(sources %in% c("entrez"))) {
        opts$dois <- ids
      } else {
        opts$dois <- ids
      }

      opts <- c(opts, callopts)
      opts$type <- type
      opts$progress <- progress
      if (!is.null(url_pattern)) opts$url_pattern <- url_pattern
      out <- do.call(fun, opts)
      
      # deals with case where no results
      if (length(out) == 0) return(null_list(opts))

      dat <- if (any(sources %in% c("arxiv", "biorxiv"))) {
        pprint_cache(out)
      } else {
        construct_paths(cache_options_get(), out, type)
      }
      list(found = length(ft_compact(lapply(out, "[[", "path"))), dois = names(out), 
        data = dat, opts = opts, errors = error_df(dat))
    } else {
      null_list(opts)
    }
  }
}

## make plugins
plugin_get_plos <- plugin_get_generator("plos", plos_wrapper)
plugin_get_bmc <- plugin_get_generator("bmc", bmc_ft)
plugin_get_elife <- plugin_get_generator("elife", elife_ft)
plugin_get_peerj <- plugin_get_generator("peerj", peerj_ft)
plugin_get_frontiersin <- plugin_get_generator("frontiersin", frontiersin_ft)
plugin_get_pensoft <- plugin_get_generator("pensoft", pensoft_ft)
plugin_get_copernicus <- plugin_get_generator("copernicus", copernicus_ft)
plugin_get_entrez <- plugin_get_generator("entrez", entrez_ft)
plugin_get_biorxiv <- plugin_get_generator("biorxiv", biorxiv_ft)
plugin_get_arxiv <- plugin_get_generator("arxiv", arxiv_ft)
plugin_get_elsevier <- plugin_get_generator("elsevier", elsevier_ft)
plugin_get_sciencedirect <- plugin_get_generator("sciencedirect", sciencedirect_ft)
plugin_get_wiley <- plugin_get_generator("wiley", wiley_ft)
plugin_get_scientificsocieties <- plugin_get_generator("scientificsocieties", scientificsocieties_ft)
plugin_get_informa <- plugin_get_generator("informa", informa_ft)
plugin_get_crossref <- plugin_get_generator("crossref", crossref_ft)
plugin_get_links <- plugin_get_generator("generic", got_link_ft)
plugin_get_royalsocchem <- plugin_get_generator("royalsocchem", roysocchem_ft)
plugin_get_ieee <- plugin_get_generator("ieee", ieee_ft)
plugin_get_aaas <- plugin_get_generator("aaas", aaas_ft)
plugin_get_pnas <- plugin_get_generator("pnas", pnas_ft)
plugin_get_microbiology <- plugin_get_generator("microbiology", microbiology_ft)
plugin_get_jama <- plugin_get_generator("jama", jama_ft)
plugin_get_amersocmicrobiol <- plugin_get_generator("amersocmicrobiol", amersocmicrobiol_ft)
plugin_get_amersocclinoncol <- plugin_get_generator("amersocclinoncol", amersocclinoncol_ft)
plugin_get_instinvestfil <- plugin_get_generator("instinvestfil", instinvestfil_ft)
plugin_get_aip <- plugin_get_generator("aip", aip_ft)
plugin_get_cambridge <- plugin_get_generator("cambridge", cambridge_ft)
plugin_get_cob <- plugin_get_generator("cob", cob_ft)
plugin_get_roysoc <- plugin_get_generator("roysoc", roysoc_ft)
plugin_get_transtech <- plugin_get_generator("transtech", transtech_ft)

# lapply replacement with progress bar: actual a for loop internally
plapply <- function(x, FUN, type = NULL, progress = FALSE, ...) {
  if (progress) {
    pb <- utils::txtProgressBar(min = 0, max = length(x), 
      initial = 0, style = 3)
    on.exit(close(pb))
  }
  out <- vector(mode = "list", length = length(x))
  for (i in seq_along(x)) {
    if (progress) utils::setTxtProgressBar(pb, i)
    out[[i]] <- FUN(x[[i]], type = type, progress = progress, ...)
  }
  out <- stats::setNames(out, x)
  return(out)
}

## getters - could stand to make closure for the below as well, FIXME
# plos
plos_wrapper <- function(dois, type, progress = FALSE, ...) {
  plos_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(ftd_doi(x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    url <- res[grepl(type, res$content_type), ]$url
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, plos_fun, type, progress, ...)
}

# Entrez - wrapper around rentrez::entrez_search/rentrez::entrez_fetch
# type: only xml
entrez_ft <- function(dois, type = "xml", progress = FALSE, ...) {
  ids <- stats::na.omit(dois)
  db <- "pmc"
  if (length(ids) > 50) {
    chunk_size <- 50
    ids_chunked <- split(ids, ceiling(seq_along(ids)/chunk_size))
    out <- list()
    for (i in seq_along(ids_chunked)) {
      out[[i]] <- rentrez::entrez_search(
        db = db,
        term = paste0(sprintf('%s[doi]', ids_chunked[[i]]), collapse = "|")
      )
    }
    res <- list(ids = unlist(lapply(out, function(z) z$ids)))
  } else {
    res <- rentrez::entrez_search(
      db = db,
      term = paste0(sprintf('%s[doi]', ids), collapse = "|")
    )
  }
  
  if (length(res$ids) == 0) {
    db <- 'pubmed'
    if (length(ids) > 50) {
      chunk_size <- 50
      ids_chunked <- split(ids, ceiling(seq_along(ids)/chunk_size))
      out <- list()
      for (i in seq_along(ids_chunked)) {
        out[[i]] <- rentrez::entrez_search(
          db = db,
          term = paste0(sprintf('%s[doi]', ids_chunked[[i]]), collapse = "|")
        )
      }
      res <- list(ids = unlist(lapply(out, function(z) z$ids)))
    } else {
      res <- rentrez::entrez_search(
        db = db,
        term = paste0(sprintf('%s[doi]', ids), collapse = "|")
      )
    }
  }

  if (length(res$ids) == 0) return(NULL)
  ent_fun <- function(z, progress, db, ...) {
    path <- make_key(z, 'xml')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, z, 'xml'))
    }
    # have to keep this httr usage
    invisible(
      rentrez::entrez_fetch(db = db, id = z, rettype = "xml", 
        config = c(httr_write_disk(path, cache_options_get()$overwrite), ...)
      )
    )
    ft_object(path, z, 'xml')
  }
  plapply(res$ids, ent_fun, progress = progress, db = db, ...)
}

# type: xml only presumably?
bmc_ft <- function(dois, type = "xml", progress = FALSE, ...) {
  res <- rentrez::entrez_search(
    db = "pubmed", term = paste0(sprintf('%s[doi]', dois), collapse = "|"))
  if (length(res$ids) == 0) return(NULL)
  bmc_fun <- function(z, type, progress = FALSE, ...) {
    path <- make_key(z, 'xml')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, z, 'xml'))
    }
    # have to keep this httr usage
    invisible(rentrez::entrez_fetch(db = 'pubmed', id = z, 
      rettype = "xml", config = httr_write_disk(path, cache_options_get()$overwrite)))
    ft_object(path, z, 'xml')
  }
  plapply(res$ids, bmc_fun, type, progress, ...)
}

# type: xml and pdf
elife_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "elife")
  elife_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    lk <- tcat(ft_cr_links(x))
    if (inherits(lk, "error")) return(ft_error(lk$message, x))
    lk <- tcat(Filter(function(x) grepl(paste0("\\.", type), x), lk)[[1]][[1]])
    get_ft(x, type, lk, path, list(), ...)
  }
  plapply(dois, elife_fun, type, progress, ...)
}

# type: xml and pdf
peerj_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "peerj")
  peerj_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(ftd_doi(x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    url <- res[grepl(type, res$content_type), ]$url
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, peerj_fun, type, progress, ...)
}

# type: xml and pdf
frontiersin_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "frontiersin")
  fronteiersin_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(ftd_doi(x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    url <- res[grepl(type, res$content_type), ]$url
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, fronteiersin_fun, type, progress, ...)
}

# type: xml and pdf
pensoft_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "pensoft")
  pensoft_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(ftd_doi(x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    url <- res[grepl(type, res$content_type), ]$url
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, pensoft_fun, type, progress, ...)
}

# type: xml and pdf
copernicus_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('pdf', 'xml'), "copernicus")
  copernicus_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(crul::HttpClient$new(url = paste0("https://doi.org/", x))$head())
    if (inherits(res, c("error", "warning")) || !res$success()) {
      mssg <- if (inherits(res, c("error", "warning"))) res$message else http_mssg(res)
      return(ft_error(mssg, x))
    }
    url <- paste0(res$url, sub("10.5194/", "", x), ".", type)
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, copernicus_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
arxiv_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  arxiv_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    url <- sprintf("http://arxiv.org/pdf/%s.pdf", x)
    get_ft(x, 'pdf', url, path, ...)
  }
  plapply(dois, arxiv_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
biorxiv_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  biorxiv_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x, 'pdf', url, path, ...)
  }
  plapply(dois, biorxiv_fun, type, progress, ...)
}

# type: plain and xml
elsevier_ft <- function(dois, type, progress = FALSE, retain_non_ft = FALSE, ...) {
  warn_crossref_tdm()
  assert(retain_non_ft, "logical")
  type_in(type, c('plain', 'xml', 'pdf'), "elsevier")
  elsevier_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    res <- tcat(rcrossref::cr_works(dois = x))
    if (inherits(res, c("error", "warning"))) return(ft_error(res$message, x))
    res <- res$data$link[[1]]
    url <- res[res$content.type == paste0("text/", type), "URL"][[1]]
    if (is.null(url) || length(url) == 0) {
      mssg <- "has no link available"
      warning(x, " with type '", type, "' ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    header <- list(
      # See https://dev.elsevier.com/tecdoc_text_mining.html
      `X-ELS-APIKey` = Sys.getenv("ELSEVIER_TDM_KEY"),
      Accept = paste0(switch(type, xml = "text/", plain = "text/"), type)
    )
    get_ft(x, type, url, path, header, ...)
  }
  Sys.setenv(ELSEVIER_RETAIN_NON_FT = retain_non_ft)
  on.exit(Sys.unsetenv("ELSEVIER_RETAIN_NON_FT"), add = TRUE)
  plapply(dois, elsevier_fun, type, progress, ...)
}

# type: plain and xml
sciencedirect_ft <- function(dois, type, progress = FALSE, ...) {
  type_in(type, c('plain', 'xml'), "sciencedirect")
  sciencedirect_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    
    # See https://dev.elsevier.com/tecdoc_text_mining.html
    header <- list(
      `X-ELS-APIKey` = Sys.getenv("ELSEVIER_TDM_KEY"),
      Accept = paste0(switch(type, xml = "text/", plain = "text/"), type)
    )
    
    # We specifically ask for the full text. Thus, we get an error if not available
    # instead of silently just an abstract or some other metadata.
    url <- paste0("https://api.elsevier.com/content/article/doi/", x, "?view=FULL")
    http <- crul::HttpClient$new(
      url = url,
      headers = header,
      opts = list(...)
    )
    res <- tcat(http$head())
    if (inherits(res, c("error", "warning")) || !res$success()) {
      if (inherits(res, c("error", "warning"))) {
        mssg <- res$message 
      } else {
        # Elsevier gives an extra message in case of errors
        if (is.null(res$response_headers$`x-els-status`)) {
          elsevier_status <- ""
        } else {
          elsevier_status <- paste(":", res$response_headers$`x-els-status`)
        }
        mssg <- paste0(http_mssg(res), elsevier_status)
      }
      return(ft_error(mssg, x))
    }
    
    get_ft(x, type, url, path, header, ...)
  }
  plapply(dois, sciencedirect_fun, type, progress, ...)
}

# type: pdf and xml
wiley_ft <- function(dois, type, progress = FALSE, ...) {
  warn_crossref_tdm()
  type_in(type, c('pdf', 'xml'), "wiley")
  wiley_fun <- function(x, type, progress, ...) {
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    if (type == "pdf") {
      url <- paste0("https://api.wiley.com/onlinelibrary/tdm/v1/articles/",
        sub("/", "%2F", x))
    } else {
      url <- paste0("https://onlinelibrary.wiley.com/doi/full-xml/",
        sub("/", "%2F", x))
    }
    header <- list(
      # See https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining
      `Wiley-TDM-Client-Token` = Sys.getenv("WILEY_TDM_KEY"),
      Accept = paste0(switch(type, xml = "application/", pdf = "application/"), type)
    )
    suppressWarnings(get_ft(x, type, url, path, header, ...))
  }
  plapply(dois, wiley_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
scientificsocieties_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  scs_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tryCatch(ft_cr_links(x), error = function(e) e, 
      warning = function(w) w)
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    if (is.null(lk) || length(lk) == 0) {
      mssg <- "has no link available"
      warning(x, " ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    get_ft(x, 'pdf', lk[[1]][[1]], path, ...)
  }
  plapply(dois, scs_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
informa_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  informa_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tryCatch(ft_cr_links(x), error = function(e) e, warning = function(w) w)
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    if (is.null(lk) || length(lk) == 0) {
      mssg <- "has no link available"
      warning(x, " ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    get_ft(x, 'pdf', lk[[1]][[1]], path, ...)
  }
  plapply(dois, informa_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
roysocchem_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  roysocchem_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, roysocchem_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
ieee_ft <- function(dois, type = "pdf",progress = FALSE, ...) {
  ieee_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tryCatch(ft_cr_links(x), error = function(e) e, warning = function(w) w)
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    if (is.null(lk) || length(lk) == 0) {
      mssg <- "has no link available"
      warning(x, " ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    get_ft(x = x, type = 'pdf', url = lk[[1]][[1]], path = path, ...)
  }
  plapply(dois, ieee_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
aaas_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  aaas_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, aaas_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
pnas_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  pnas_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, pnas_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
roysoc_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  roysoc_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, roysoc_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
microbiology_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  microbiology_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, microbiology_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
jama_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  jama_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl('pdf', lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, jama_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
amersocmicrobiol_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  amersocmicrobiol_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, amersocmicrobiol_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
amersocclinoncol_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  amersocclinoncol_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, amersocclinoncol_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
instinvestfil_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  instinvestfil_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tryCatch(ft_cr_links(x), error = function(e) e, warning = function(w) w)
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    if (is.null(lk) || length(lk) == 0) {
      mssg <- "has no link available"
      warning(x, " ", mssg, call. = FALSE)
      return(ft_error(mssg, x))
    }
    url = sub('view', 'download', lk[[1]][[1]])
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, instinvestfil_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
aip_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  aip_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }

    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, aip_fun, type, progress, ...)
}

# type: only pdf (type parameter is ignored)
cambridge_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  cambridge_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    res <- tcat(crul::HttpClient$new(url = paste0("https://doi.org/", x),
      opts=list(followlocation=1))$get())
    if (inherits(res, c("error", "warning")) || !res$success()) {
      mssg <- if (inherits(res, c("error", "warning"))) res$message else http_mssg(res)
      return(ft_error(mssg, x))
    }
    html <- xml2::read_html(res$parse("UTF-8"))
    node <- xml2::xml_find_first(html, "//meta[@name=\"citation_pdf_url\"]")
    url <- xml2::xml_attr(node, "content")
    if (all(is.na(url))) {
      mssg <- "no pdf link found or you may not have access to the article"
      return(ft_error(mssg, x))
    }
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, cambridge_fun, type, progress, ...)
}

# company_of_biologists
# type: only pdf (type parameter is ignored)
cob_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  cob_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, cob_fun, type, progress, ...)
}

# Trans Tech Publications
# type: only pdf (type parameter is ignored)
transtech_ft <- function(dois, type = "pdf", progress = FALSE, ...) {
  tt_fun <- function(x, type, progress, ...) {
    path <- make_key(x, 'pdf')
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, 'pdf'))
    }
    # ftdoi_get(member = names(dfsplit)[i])
    lk <- tcat(ftd_doi(x))
    if (inherits(lk, c("error", "warning"))) return(ft_error(lk$message, x))
    url <- lk[grepl("pdf", lk$content_type), ]$url
    get_ft(x = x, type = 'pdf', url = url, path = path, ...)
  }
  plapply(dois, tt_fun, type, progress, ...)
}

# special Crossref plugin to try any DOI
crossref_ft <- function(dois, type, progress = FALSE, ...) {
  crossref_fun <- function(x, type, progress, ...) {
    lks <- tcat(ft_cr_links(x))
    if (inherits(lks, c("error", "warning")) || inherits(lks, "warning")) {
      mssg <- lks$message
      if (inherits(lks, "error")) warning(x, " has no full text links available", call. = FALSE)
      if (inherits(lks, "warning")) warning(x, " not found", call. = FALSE)
      return(ft_error(mssg, x))
    }
    if (any(names(lks) == "unspecified")) {
      for (i in seq_along(lks)) {
        if (names(lks)[i] == "unspecified") {
          extype <- strextract(lks[[i]][[1]], "xml|pdf")
          if (length(extype) != 0) names(lks)[i] <- extype
        }
      }
    }
    lk <- lks[[type]]
    types <- c('xml', 'pdf')
    if (is.null(lk)) {
      lk <- lks[[ types[!types %in% type] ]]
      type <- types[!types %in% type]
      if (is.null(lk)) return(ft_error("no link found from Crossref", x))
    }

    # check if we already have it
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }

    # update path
    path <- sub("[A-Za-z]+$", type, path)

    # fetch it
    get_ft(x, type, lk[[1]], path, ...)
  }
  plapply(dois, crossref_fun, type, progress, ...)
}

# special plugin when link already in hand
got_link_ft <- function(dois, type, url_pattern, progress = FALSE, ...) {
  link_fun <- function(x, type, progress, ...) {
    # try for the type passed, if not found, try for another
    types <- c('xml', 'pdf')
    pat <- url_pattern[[type]]
    if (is.null(pat)) {
      pat <- url_pattern[[ types[!types %in% type] ]]
      type <- types[!types %in% type]
      if (is.null(pat)) return(ft_error("no appropriate link found", x))
    }
    url <- sprintf(pat, x)
    path <- make_key(x, type)
    if (file.exists(path) && !cache_options_get()$overwrite) {
      if (!progress) message(paste0("path exists: ", path))
      return(ft_object(path, x, type))
    }
    get_ft(x, type, url, path, ...)
  }
  plapply(dois, link_fun, type, progress, ...)
}

warn_crossref_tdm <- function(x) {
  crtdm <- 'CROSSREF_TDM'
  if (nzchar(Sys.getenv(crtdm)))
    warning(sprintf(
      "`%s` env var found: see 'Authentication' section of ?fulltext",
      crtdm), call. = FALSE)
}
Any scripts or data that you put into this service are public.
fulltext documentation built on June 12, 2021, 9:06 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
fulltext
Full Text of 'Scholarly' Articles Across Many Data Sources

R/plugins_get.R
In fulltext: Full Text of 'Scholarly' Articles Across Many Data Sources

Try the fulltext package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

fulltext Full Text of 'Scholarly' Articles Across Many Data Sources

R/plugins_get.R In fulltext: Full Text of 'Scholarly' Articles Across Many Data Sources

Try the fulltext package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

fulltext
Full Text of 'Scholarly' Articles Across Many Data Sources

R/plugins_get.R
In fulltext: Full Text of 'Scholarly' Articles Across Many Data Sources