# dslr: Converting Dimensions DSL API outputs to dataframes

#'Sends iterative query requests to the Dimensions API and returns JSON-style list object of results.
#'Takes arguments from dim_request and runs an iterative version of that request to obtain all possible
#'records up to the first 50k.
#'Important Note: Queries should not contain "limit" or "skip" as these are calculated dynamically
#'based on function arguments.
#'
#'@param dim_token Is the token object created by dim_login in the Global environment
#'@param endpoint Is a string containing the Dimensions API endpoint URL.
#'@param query Text string of the DSL query. Please consult API Documentation for how to write DSL queries.
#'@param skip Specifies the number of records to offset (0 by default, max 50000).
#'@param limit Is the maximum returned documents (0 by default, max 50000)
#'@param pause Is the time dim_request waits in between requests. Default is 1.5s.
#'@param force Is a logical argument to determine whether or not to continue the data pull despite errors that may
#'arise during iterations. By default it is set to FALSE. In situations where the query results are > 50k,
#'force will prompt the first 50k results to be returned.
#'@param logs Is a logical which determines whether or not text logs are exported to the package directory/logs folder
#'in the case of query errors. By default set to FALSE.
#'@param req Is the first request generated by dim_request and used to scope out the query.
#'@return A JSON-style nested list object
#'

iterative_request <- function(dim_token,
                              endpoint = 'https://app.dimensions.ai/api/dsl/v2',
                              query,
                              skip,
                              limit,
                              pause,
                              force,
                              logs,
                              req){

  # Get the data source (e.g. publications, grants, etc.)
  dsrc <- strsplit(query, ' ')[[1]][2]

  #First check of total_count -- check maxlimit to see if total_count should reflect the count returned from
  #the API or the arg value
  total_count <- httr::content(req)$`_stats`$total_count
  warnings <- httr::content(req)$`_warnings`

  #If no limit specified, total count > 50k and force is FALSE, stop and call attention to this.
  #Or if limit > 50000 and force == FALSE
  if((limit == 0 & total_count > 50000 & force == FALSE)|(limit > 50000 & force == FALSE)|(skip > 50000 & force == FALSE)){
    warning('Records requested go beyond the 50k limit. Please review your request and see if there are ways to alter your query to within these limits or set force = TRUE to get the first 50k records. If you must have > 50k records, please contact Dimensions support for additional options.')
    stop()} #Close if

  #If total_count is zero notify user.
  else if(total_count == 0){

    resp <- 'No results. Please check your query.'

    # If warnings included in zero result, output to logs, notify user.
    if(!is.null(warnings)){
      capture.output(warnings, file = paste0('./dsl_warnings_', Sys.time(),'.txt'))
      resp <- paste(resp, 'Warnings detected -- please check the dsl_warnings output file in the logs directory of the package folder.')
    }

    warning(resp)
    stop()
  }

  #Otherwise, iterate through the request
  else {

    #If force is TRUE and limit > 50000, use 50000, otherwise use limit
    limit <- ifelse(force == TRUE & limit > 50000, 50000, limit)

    #Iterative querying
    s <- ifelse(skip > 0 & skip < 50000, skip,
                ifelse(skip > 50000, 50000, skip))

    #Calculate total records
    tr = ifelse(limit == 0, total_count, min(limit, total_count))

    #Calculate difference between total_count and skip
    #If difference is 0 then d = 1, otherwise use tr - s
    d <- ifelse(s > tr, tr, tr - s)

    #Get chunksize based on tr
    chunksize = get_chunksize(tr)

    #Check d relative to limit and set iter_lim accordingly.
    #If limit > 0 and chunksize, use the chunksize
    #Otherwise, if limit = 0, use chunksize, all else use limit.
    iter_lim <- ifelse(limit > 0 & limit > chunksize, chunksize,
                       ifelse(limit == 0, chunksize, limit))

    #Calculate number of iterations
    iterations <- ceiling(d/iter_lim)

    #Start a list to append iterative results to
    results <- list()

    #Indicate start of iterations
    message(paste('Iterating request over', iterations, 'iteration(s) of up to', iter_lim, 'records each.'))

    ##Create list object to store queries that fail for troubleshooting
    error_logs = list()

    #Iterate through searches
    for(i in 1:iterations){

      error_logs[[i]] <- list()

      iterquery = paste(query, 'limit', iter_lim, 'skip', s, sep = ' ')

      iterreq <- httr::POST(endpoint,
                            httr::add_headers(Authorization = paste("JWT", dim_token)),
                            body = iterquery,
                            encode = 'json')

      # If an iteration fails and the error is known, alert user, write error logs, stop.
      if(iterreq$status_code != 200 & iterreq$status_code %in% names(error_messages)){
        warning(error_messages[paste(iterreq$status_code)])

        if(logs == TRUE){
          error_logs[[i]][['query']] <- iterquery
          error_logs[[i]][['response']] <- iterreq
          capture.output(error_logs, file = paste0('./error_logs_', Sys.time(),'.txt'))}

        stop()
      } #Close if

      # If error is unrecognized, alert user, write to logs, stop.
      else if(iterreq$status_code != 200 & !(iterreq$status_code %in% names(error_messages))){
        warning(paste('Error on iteration', i, '. Please contact Dimensions Support and provide the following error message:', httr::content(iterreq)))
        if(logs == TRUE){
          error_logs[[i]][['query']] <- iterquery
          error_logs[[i]][['response']] <- iterreq
          capture.output(error_logs, file = paste0('./error_logs_', Sys.time(),'.txt'))}

        stop()
      }# Close elseif

      # If no errors, parse JSON into list format
      else {
        iterdata <- jsonlite::fromJSON(httr::content(iterreq, 'text', encoding = 'UTF-8'), flatten = TRUE)
      }# Close else

      # Allow the API to breathe
      Sys.sleep(pause)

      #If an iteration returns status 429 (too many requests), sleep for 30s, try again.
      if(iterreq$status_code == 429){
        message('Too many requests. Taking 30 second break then will resume.')
        Sys.sleep(30); s = s} #Close if

      #If an iteration returns errors...
      else if('errors' %in% names(iterdata) & iterreq$status_code != 429){
        #specifically related to queried data being too large...then chunk it out
        if(grepl('The response generated by your query is too large', iterdata$errors$query$details[[1]])){

          message(paste('Iteration', i, 'too large. Chunking...', sep = ' '))

          #Find smallest necessary limit
          sub_lim <- lim_shrink(query, lim, s, endpoint, dim_token)

          #Iterate through sub_lim chunks to fill in the larger iteration
          for(l in 1:(lim%/%sub_lim)){

            error_logs[[i]][[l]] <- list()

            iterquery = paste(query, 'limit', sub_lim, 'skip', s, sep = ' ')

            iterreq <- httr::POST(endpoint,
                                  httr::add_headers(Authorization = paste("JWT", dim_token)),
                                  body = iterquery,
                                  encode = 'json')

            # If an sub-iteration fails and the error is known, alert user, write error logs, stop.
            if(req$status_code != 200 & req$status_code %in% names(error_messages)){
              warning(error_messages[paste(req$status_code)])

              if(logs == TRUE){
                error_logs[[i]][[l]][['query']] <- iterquery
                error_logs[[i]][[l]][['response']] <- iterreq
                capture.output(error_logs, file = paste0('./error_logs_', Sys.time(),'.txt'))}

              stop()
            } #Close if

            # If error is unrecognized, alert user, write to logs, stop.
            else if(req$status_code != 200 & !(req$status_code %in% names(error_messages))){
              warning(paste('Error on iteration', i, '-', l, '. Please contact Dimensions Support and provide the following error message:', httr::content(iterreq)))

              if(logs == TRUE){
                error_logs[[i]][[l]][['query']] <- iterquery
                error_logs[[i]][[l]][['response']] <- iterreq
                capture.output(error_logs, file = paste0('./error_logs_', Sys.time(),'.txt'))}

              stop()
            }# Close elseif

            # If no errors, parse JSON into list format
            else {
              iterdata <- jsonlite::fromJSON(httr::content(iterreq, 'text', encoding = 'UTF-8'), flatten = TRUE)
            }# Close else

            results <- results[[i]][[l]] = iterdata[[dsrc]]

            s = s + sub_lim

          } #Close for-loop

        }#Close if

        #related to something else, but if force is TRUE, then skip iteration and issue warning.
        else if(grepl('The result of this query was too large and had to be truncated',
                      iterdata$`_warnings`[[1]]) == TRUE){

        }
        else if(grepl('The response generated by your query is too large',
                      iterdata$errors$query$details[[1]]) == FALSE & force == TRUE){
          warning(paste('Error in iteration', i, 'results', s, 'through', s + lim, 'not processed.', sep = ' '))
          warning(iterdata[['errors']][['query']][['details']][[1]])

          s <- s + iterlim

          if(s + iter_lim > tr){
            iter_lim <- abs(tr - s)
          }

        } #Close else if

        #If any iteration has errors and force is FALSE, return results thus far and provide warnings.
        else if(force == FALSE){
          warning(paste('Failure at iteration ', i, 'on query:', iterquery, '. Results incomplete. To skip this iteration try again with force = TRUE.'))
          warning(iterdata[['errors']][['query']][['details']][[1]])
          return(results)
        }#Close else-if

      }#Close if errors

      #If no errors, append results, increase skip by iter_lim and continue to next iteration.
      else{

        results[[i]] = iterdata[[dsrc]]
        message(paste0('Iteration ', i, ' of ', iterations, ' completed.'))

        s <- s + iter_lim

        # If s is now greater than tr, set iter_lim to |tr - s|.
        if(s + iter_lim > tr){
          iter_lim <- abs(tr - s)
        }# close if

      } #close else

    } #Close for-loop (iterate searches)

    rtn = list(data = results, item = dsrc, query = query, total_count = total_count)

    return(rtn)

  } #Close else (iterating request)
} # Close function

#'Handles function requests. Determines the function being used and then handles the resulting API
#'results accordingly.
#'
#'@param req Is the first request generated by dim_request.
#'@param query Is a properly formatted text string of the DSL query.
#'
#'@return A list object of the result(s)
#'

function_request <- function(req, query){

  #If request contains any of these function calls, take first item in response list
  if(grepl('classify|extract_concepts|extract_grants', strsplit(query, ' ')[[1]][1])){
    rtn = jsonlite::fromJSON(httr::content(req, 'text', encoding = 'UTF-8'), simplifyVector = TRUE)[[1]]
  } #Close if

  # Otherwise, if request is extract_affiliations, unpack those results instead
  else if (grepl('extract_affiliations', strsplit(query, ' ')[[1]][1])){

    resp = jsonlite::fromJSON(httr::content(req, 'text', encoding = 'UTF-8'), flatten = TRUE)

    rtn = data.frame(m.affiliation_part = character(),
                     institute.id = character(),
                     institute.name = character(),
                     institute.city = character(),
                     institute.state = character(),
                     institute.country = character(),
                     metadata.requires_manual_review = logical())

    for(m in resp[['results']][['matches']]){
      for(i in m[['institutes']]){

        if(length(i) == 0){
          blank <- data.frame(m.affiliation_part = m$affiliation_part,
                              institute.id = NA,
                              institute.name = NA,
                              institute.city = NA,
                              institute.state = NA,
                              institute.country = NA,
                              metadata.requires_manual_review = FALSE)
          rtn <- bind_rows(rtn, blank)
        } # Close if

        else if(length(i) > 0){

          tmp <- bind_cols(data.frame(m$affiliation_part), data.frame(i))
          rtn <- bind_rows(rtn, tmp)

        } #Close else if
      } #Close for loop
    } #Close for loop

    return(rtn)

  } #Close else if
}#Close function
# Source: cheneypinata/dslr, file R/request_handlers.R
# (documentation built on Jan. 6, 2022, 11:27 p.m.).
# dslr: Converting Dimensions DSL API outputs to dataframes.
# Defines functions: function_request, iterative_request.