R/article_pdf_download.R

#' Batch download articles from BibTeX files
#'
#' @param infilepath  (character) path to the folder containing the input .bib files
#' @param outfilepath (character) path to the folder where outputs are written; defaults to `infilepath`
#' @param colandr     (character) path to a file that provides titles to match; designed to be the output of Colandr
#' @param cond        (character) screening status used to filter the Colandr output (default "included")
#'
#' @return data frame containing download information
#'
#' @importFrom magrittr %>%
#'
#' @export
#' @examples \dontrun{ article_pdf_download(infilepath = "/data/isi_searches", outfilepath = "data")}
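#' # With a Colandr export (hypothetical path) to keep only titles screened as "included":
#' # article_pdf_download(infilepath = "/data/isi_searches", outfilepath = "data",
#' #                      colandr = "data/colandr_export.csv", cond = "included")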

article_pdf_download <- function(infilepath, outfilepath = infilepath, colandr = NULL, cond = "included") {
  # ===============================
  # CONSTANTS
  # ===============================
  # Create the main output directory
  output_dir <- file.path(outfilepath, 'output')

  # PDF subdirectory
  pdf_output_dir <- file.path(output_dir, 'pdfs')

  # Non-PDF files subdirectory
  nopdf_output_dir <- file.path(output_dir, 'non-pdfs')


  # ===============================
  # MAIN
  # ===============================
  # Read .bib files
  df_citations <- bibfile_reader(infilepath)
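  # (bibfile_reader() is a package helper defined elsewhere in BibScan; the code below
  # assumes it returns a single data frame with Web of Science-style field tags such as
  # AU (authors), PY (year), SO (journal) and DI (DOI).)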

  # Join DOIs from the .bib records to the Colandr output (https://www.colandrcommunity.com)
  if (!is.null(colandr)) {
    # Read the screened list exported from Colandr
    papers <- readr::read_csv(file.path(colandr))

    # Match titles from Colandr to DOIs from the .bib records
    matched <- title_to_doi(papers, df_citations, cond)

  } else {
    matched <- df_citations
  }

  ## STEP 1: ORGANIZE LINKS
  message('===============================\nORGANIZING LINKS\n===============================')
  # Build a human-readable reference name (first author, year, journal) and keep the DOI
  my_df <- tibble::tibble(Name = paste(gsub(";.*$", "", matched$AU), matched$PY, matched$SO),
                          DOI = matched$DI)

  # Create tibble that reports information to the user
  report <- my_df

  # Report the percentage of references that have a DOI
  perc <- nrow(dplyr::filter(my_df, !is.na(DOI))) / nrow(my_df)
  message(round(perc * 100, digits = 1), "% of references contained a DOI")
  rm(perc)

  # Flag whether each reference has a DOI
  report$DOI_exists <- !is.na(report$DOI)

  # Drop references without a DOI
  my_df <- dplyr::filter(my_df, !is.na(DOI))

  # Collect links
  my_df$links <- sapply(my_df$DOI, crminer::crm_links)
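  # (crm_links() queries Crossref for the full-text links registered for each DOI and
  # returns an empty list when none are available.)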

  # Report the percentage of DOIs that returned at least one link
  perc <- 1 - (sum(lengths(my_df$links) == 0) / nrow(my_df))
  message(round(perc * 100, digits = 1), "% of references with a DOI returned a URL link")
  rm(perc)

  # Record in the report which references did not return a URL
  my_df$URL_found <- lengths(my_df$links) > 0
  report <- dplyr::left_join(report, my_df, by = c("Name", "DOI")) %>%
    dplyr::select(Name, DOI, DOI_exists, URL_found)
  my_df <- dplyr::select(my_df, Name, DOI, links)

  # Remove references with no URL
  my_df <- my_df[lengths(my_df$links) > 0, ]

  # Elsevier links require a separate download process, so we distinguish them here
  my_df <- elsevier_tagger(my_df, "links")
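  # (elsevier_tagger() is a package helper defined elsewhere in BibScan; presumably it flags
  # links served through the Elsevier API so the Elsevier-specific route can be used for them.)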

  ## STEP 2: DOWNLOAD PDFS FROM LINKS
  message('===============================\nDOWNLOADING PDFS FROM LINKS\n===============================')

  ## Download the PDFs
  # Point the crminer download cache at the output location
  crminer::crm_cache$cache_path_set(path = "", type = "function() dirname(outfilepath)", prefix = basename(outfilepath))

  # Initialize the column that stores the downloaded filename
  my_df$downloaded_file <- as.character(NA)

  # Start from an empty cache so that new files can be attributed to each reference
  crminer::crm_cache$delete_all()
  nb_pdfs <- length(crminer::crm_cache$list())
  old_cache <- crminer::crm_cache$list()

  # Download the PDFs
  for (i in seq_len(nrow(my_df))) {
    message(sprintf("Number of papers downloaded: %i", nb_pdfs))
    tryCatch(crminer::crm_text(my_df$links[[i]], type = "pdf", cache = FALSE, overwrite_unspecified = TRUE),
             error = function(e) {
               # Links of type 'html' or 'plain' are not handled because they almost never provide
               # a PDF download; for Elsevier, only XML links are handled
               message(sprintf("There was a problem downloading this link %s", my_df$links[[i]]))
             },
             finally = message(sprintf("\nThe reference %s has been processed \n", my_df$Name[[i]]))
             )
    # Keep track of the PDF names
    if (length(crminer::crm_cache$list()) > nb_pdfs) {
      nb_pdfs <- length(crminer::crm_cache$list())
      last_paper <- setdiff(crminer::crm_cache$list(), old_cache)
      my_df$downloaded_file[i] <- last_paper
    } else {
      my_df$downloaded_file[i] <- NA
    }
    old_cache <- crminer::crm_cache$list()
  }
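  # At this point my_df$downloaded_file holds the cache path of each retrieved file,
  # or NA when nothing new appeared in the cache for that reference.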


  message('===============================\nPDFS DOWNLOADED\n===============================')

  ## STEP 3: POST-PROCESSING
  # distinguish real pdf files from other files (mainly html webpages)

  # Create the output directories if they do not already exist
  dir.create(output_dir, showWarnings = FALSE)
  dir.create(nopdf_output_dir, showWarnings = FALSE)

  my_df$downloaded <- as.logical(NA)
  my_df$is_pdf <- as.logical(NA)

  for (i in seq_len(nrow(my_df))) {
    if (!is.na(my_df$downloaded_file[i]) && file.exists(my_df$downloaded_file[i])) {
      my_df$downloaded[i] <- TRUE
      my_df$is_pdf[i] <- is_binary(my_df$downloaded_file[i])
    } else {
      my_df$downloaded[i] <- FALSE
      my_df$is_pdf[i] <- FALSE
    }
  }
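  # (is_binary() is a package helper defined elsewhere in BibScan that flags whether a cached
  # file is a true binary PDF rather than an HTML landing page; an illustrative sketch is
  # given at the end of this file.)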

  # Extract some statistics (counts in parentheses are from one of the authors' runs)
  download_success <- sum(my_df$downloaded, na.rm = TRUE)                               # links that produced a downloaded file (4604 of 5759 links)
  unique_files <- length(unique(my_df$downloaded_file[my_df$downloaded]))               # unique downloaded files (4539 of 4604)
  unique_pdfs <- length(unique(my_df$downloaded_file[my_df$downloaded & my_df$is_pdf])) # unique binary files, i.e. PDFs (4057 of 4539)
  message(sprintf("Of the %i acquired links, %i PDFs were successfully downloaded", nrow(my_df), unique_pdfs))

  # Paths of downloaded files that are not PDFs, kept for inspection (482 in the authors' run)
  non_pdf_paths <- unique(my_df$downloaded_file[my_df$downloaded & !my_df$is_pdf])

  if (length(non_pdf_paths) > 0) {
    ## Move the non-pdf files to a specific directory
    # Create the destination list
    html_paths <- file.path(
      nopdf_output_dir,
      paste0(basename(tools::file_path_sans_ext(non_pdf_paths)),
             ".html")
    )
    # Move the files
    file.rename(from = non_pdf_paths, to = html_paths)
  }

  # Write details of the download process to a CSV file
  summary_path <- file.path(output_dir, 'summary.csv')
  utils::write.csv(dplyr::select(my_df, -links), file = summary_path, row.names = FALSE)

  message('\n Details of the PDF retrieval process have been stored in ', summary_path, '\n')

  return(my_df)
}
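
# ---------------------------------------------------------------------------
# Illustrative sketch only (not the package's actual helper): is_binary() is
# defined elsewhere in BibScan and its real implementation may differ. A
# minimal stand-in could check for the "%PDF" magic bytes at the start of the
# downloaded file, which is usually enough to separate true PDFs from HTML
# landing pages.
is_binary_sketch <- function(path) {
  # Compare the first four bytes of the file to the PDF signature "%PDF"
  header <- readBin(path, what = "raw", n = 4L)
  identical(header, charToRaw("%PDF"))
}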