R/searchScopus.R


#' @title searchScopus
#'
#' @description This function accepts a Scopus search query and an Elsevier
#'     API key and returns the DOIs, Scopus IDs and metadata of all search
#'     results as a data frame for further processing.
#'
#' @author Creator of the scicloud workflow: Henrik von Wehrden,
#'   \email{henrik.von_wehrden@@leuphana.de} \cr
#'   
#'   Code by: Matthias Nachtmann,
#'   \email{matthias.nachtmann@@stud.leuphana.de},
#'   Lisa Gotzian, \email{lisa.gotzian@@stud.leuphana.de},
#'   Prabesh Dhakal, \email{prabesh.dhakal@@stud.leuphana.de} \cr
#'    
#'   First version of scicloud: Matthias Nachtmann,
#'   \email{matthias.nachtmann@@stud.leuphana.de}
#'
#' @param searchString The search query to send to the server. See the
#'     Scopus API (\url{https://dev.elsevier.com/}) for details on the query
#'     syntax.
#' @param maxResults The maximum number of search results to retrieve.
#'     Scopus usually does not provide more than 5000 results.
#' @param countIncrement The number of results per GET request. Private users
#'     can't exceed 25 per request; inside a subscribed IP range you can use
#'     the maximum of 200 per request. The function sends
#'     \code{ceiling(maxResults / countIncrement)} GET requests, e.g. 8
#'     requests for \code{maxResults = 160} and \code{countIncrement = 20}.
#'     Note that the weekly quota for requests is 20,000.
#' @param myAPIKey Your private Elsevier API key for communicating with the
#'     API. You can request one at \url{https://dev.elsevier.com/}.
#' @param saveToWd A logical indicating whether to save the output of the
#'     function to the working directory. This is especially useful for later
#'     analysis steps. The saved file can be read back in with
#'     \code{\link[base]{readRDS}}.
#' @family scicloud functions
#' @return A data frame containing the DOI numbers and Scopus IDs of the search
#'     results, as well as paper metadata such as authors and publication year.
#' @export
#' @examples
#' \dontrun{
#' myAPIKey <- "1234567890ABCDEF"
#' DOInumbers <- searchScopus("TITLE-ABS-KEY(sustainability) AND PUBYEAR > 2009",
#'   myAPIKey,
#'   maxResults = 160, countIncrement = 20
#' )
#' View(DOInumbers)
#' 
#' ## Run the analysis with the acquired dataframe using scicloud
#' # Create a tfidf wordlist
#' scicloudList <- createScicloudList(scopusList = DOInumbers, myAPIKey = myAPIKey)
#'
#' # Run the analysis with a specified number of clusters
#' scicloudAnalysis <- runAnalysis(scicloudList = scicloudList, numberOfClusters = 4)
#'
#' # Generate a summary of the analysis
#' scicloudSpecs <- inspectScicloud(scicloudAnalysis)
#' }
#'
searchScopus <- function(searchString,
                         myAPIKey,
                         maxResults = 500,
                         countIncrement = 200,
                         saveToWd = FALSE) {
  #### PHASE I: GET THE DOIs and SCOPUS IDs OF THE SEARCH RESULT ####

  # percent-encode the search string
  searchString <- utils::URLencode(searchString)

  # initialize an empty request results object
  searchEntries <- NULL

  # initialize start parameter for API call
  start <- 0


  # a function that returns a custom error message
  errorMessage <- function(cond) {
    message(
      paste0(
        "Error while retrieving the title - DOI: ",
        searchResults[i, "DOI"],
        ", Scopus-ID: ",
        searchResults[i, "Scopus-ID"]
      )
    )
    message("Here's the original error message:")
    message(cond)
    # Choose a return value in case of error
    return(NA)
  }
  # a function that returns a custom warning message
  warningMessage <- function(cond) {
    message(
      paste0(
        "Warning while retrieving the title - DOI: ",
        searchResults[i, "DOI"],
        ", Scopus-ID: ",
        searchResults[i, "Scopus-ID"]
      )
    )
    message("Here's the original warning message:")
    message(cond)
    # Choose a return value in case of warning
    return(NULL)
  }

  # determine how many GET requests are needed, since a single
  # request returns at most countIncrement results
  numberOfRequests <- ceiling(maxResults / countIncrement)

  if (maxResults > 5000 || numberOfRequests > 20000) {
    stop("Scopus does not provide more than 5,000 search results or 20,000 requests.")
  }

  for (i in 1:numberOfRequests) {
    # on the last request, only ask for the number of results still missing
    if (i == numberOfRequests) {
      countIncrement <- maxResults - (numberOfRequests - 1) * countIncrement
    }

    # create the url that is sent as a GET request
    URL <-
      paste0(
        "http://api.elsevier.com/content/search/scopus?query=",
        searchString,
        "&count=",
        countIncrement,
        "&start=",
        start
      )

    # response from the server (stored in JSON format)
    serverResponse <-
      httr::GET(
        URL,
        httr::add_headers(`X-ELS-APIKey` = myAPIKey, Accept = "application/json")
      )
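
    # a minimal robustness check one could add here (sketch): abort early if
    # the request failed, e.g. due to an invalid API key or an exceeded quota
    # httr::stop_for_status(serverResponse)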

    # store the JSON content from the response
    responseContent <- httr::content(serverResponse)

    # store JSON content in entry vector
    searchEntries <-
      c(searchEntries, responseContent$`search-results`$entry)

    # advance the start index for the next page of results (capped at maxResults)
    start <- min(start + countIncrement, maxResults)


    # API allows 9 requests per second
    if (i %% 9 == 0) {
      Sys.sleep(1)
    }
  }

  cat(
    "\nRemaining quota:",
    serverResponse$headers$`x-ratelimit-remaining`,
    "requests within the next 7 days.\n"
  )
  Sys.sleep(1)

  # initialize a matrix that stores the results (more efficient this way)
  searchResults <-
    matrix(NA, nrow = maxResults, ncol = 20)
  # assign column header names to the matrix where we store the results
  colnames(searchResults) <-
    c(
      "Title",
      "Year",
      "Month",
      "Day",
      "Authors",
      "Journal",
      "Volume",
      "Issue",
      "Pages",
      "CitedBy",
      "CitationPerYear",
      "DOI",
      "Scopus-ID",
      "Publisher",
      "Affiliation",
      "Affiliation-City",
      "Affiliation-Country",
      "FileName",
      "Abstract",
      "FullText"
    )

  # let users know what is happening at this stage (assisted with a progress bar later)
  cat("\nAccessing DOIs and Scopus IDs of the search result...\n")

  # attempt to grab the contents that we are interested in
  # (only iterate over as many entries as the server actually returned)
  for (i in seq_len(min(maxResults, length(searchEntries)))) {
    # grab the DOI
    resultDOI <- tryCatch(
      {
        searchEntries[[i]]$`prism:doi`
      },
      error = errorMessage,
      warning = warningMessage
    )

    # Grab the scopus ID
    resultScopusID <- tryCatch(
      {
        sub("SCOPUS_ID:", "", searchEntries[[i]]$`dc:identifier`)
      },
      error = errorMessage,
      warning = warningMessage
    )

    # store DOI and Scopus ID
    searchResults[i, "DOI"] <-
      if (length(resultDOI) > 0) {
        resultDOI
      } else {
        NA
      }
    searchResults[i, "Scopus-ID"] <-
      if (length(resultScopusID) > 0) {
        resultScopusID
      } else {
        NA
      }
  }
  # check for redundant entries
  searchResults <- unique(searchResults)
  # assign unique ID to the rows to avoid any collisions along the way
  searchResults <- cbind(searchResults, ID = 1:nrow(searchResults))

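  #### PHASE II: RETRIEVE THE FULL METADATA FOR EACH RECORD ####
  # getScopusMetaData() (defined elsewhere in scicloud) is assumed to fill in
  # the remaining metadata columns (title, authors, abstract, ...) via the
  # Scopus API, using the DOIs / Scopus IDs collected above.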
  DOInumbersMetaData <- getScopusMetaData(searchResults, myAPIKey = myAPIKey)
  # drop entries without an abstract
  DOInumbersMetaData <- DOInumbersMetaData[!is.na(DOInumbersMetaData[, "Abstract"]), ]
  
  # save the metaDOInumbers data frame as an R object file in the working
  # directory and the global environment
  if (saveToWd) {
    save_data(DOInumbersMetaData, "metaDOInumbers")
  }
 
  return(DOInumbersMetaData)
}