R/query_aat.R

Defines functions query_aat

Documented in query_aat

# Private helpers for query_aat() ----

# Zero-row data frame with the column layout returned by query_aat().
aat_empty_results <- function() {
  data.frame(
    term = character(0),
    aat_term = character(0),
    aat_id = character(0),
    aat_note = character(0),
    aat_parents = character(0),
    stringsAsFactors = FALSE
  )
}

# Escape backslashes and double quotes so `x` can be embedded inside a
# double-quoted SPARQL string literal without terminating it early.
aat_escape_literal <- function(x) {
  x <- gsub("\\", "\\\\", x, fixed = TRUE)
  gsub("\"", "\\\"", x, fixed = TRUE)
}

# Stop unless `value` is a single, non-missing, non-empty value.
# `arg` is the argument name used in the error message.
aat_require_value <- function(value, arg) {
  if (is.null(value) || length(value) == 0L) {
    stop(arg, " can not be empty", call. = FALSE)
  }
  if (length(value) > 1L) {
    stop(arg, " must be a single value", call. = FALSE)
  }
  if (is.na(value) || value == "") {
    stop(arg, " can not be empty", call. = FALSE)
  }
  invisible(value)
}

# Convert the `results$bindings` element of a flattened SPARQL JSON response
# into a data frame with one row per binding. Variables that were not bound in
# any row have no column in `bindings`; they are filled with NA instead of
# being dropped.
aat_bindings_to_df <- function(term, bindings) {
  n_rows <- if (is.data.frame(bindings)) nrow(bindings) else 0L
  if (n_rows == 0L) {
    return(aat_empty_results())
  }

  column <- function(name) {
    values <- bindings[[name]]
    if (is.null(values)) {
      rep(NA_character_, n_rows)
    } else {
      as.character(values)
    }
  }

  subject_ids <- sub(
    "http://vocab.getty.edu/aat/", "", column("Subject.value"),
    fixed = TRUE
  )

  data.frame(
    term = rep(as.character(term), n_rows),
    aat_term = column("Term.value"),
    aat_id = paste0("aat:", subject_ids),
    aat_note = column("ScopeNote.value"),
    aat_parents = column("Parents.value"),
    stringsAsFactors = FALSE
  )
}

# Run one SPARQL query against the Getty endpoint.
#
# `query_template` must contain a single `%s`, which is replaced by the
# escaped `search_string`. `term` is the user's original string and is only
# copied into the `term` column of the result.
#
# Returns NULL when the request or the JSON parsing fails, otherwise a data
# frame (possibly with zero rows).
aat_fetch <- function(query_template, search_string, term) {
  getty_url <- "http://vocab.getty.edu/sparql.json?query="

  query <- stringr::str_squish(
    sprintf(query_template, aat_escape_literal(search_string))
  )

  # The query is the value of a URL parameter, so reserved characters such as
  # '&', '#' and '+' must be percent-encoded too. `repeated = TRUE` forces
  # encoding even when the text happens to contain a "%XX"-like sequence.
  query_encoded <- utils::URLencode(query, reserved = TRUE, repeated = TRUE)

  json <- try(
    jsonlite::fromJSON(paste0(getty_url, query_encoded), flatten = TRUE),
    silent = TRUE
  )
  if (inherits(json, "try-error")) {
    return(NULL)
  }

  aat_bindings_to_df(term, json$results$bindings)
}

# Row-bind the data frames produced by aat_fetch(), skipping failed (NULL)
# requests. Always returns a data frame with the five result columns.
aat_bind <- function(frames) {
  frames <- Filter(Negate(is.null), frames)
  combined <- do.call(rbind, c(list(aat_empty_results()), frames))
  rownames(combined) <- NULL
  combined
}

# Extract lower-cased keywords (English stopwords, punctuation and numbers
# removed) from a string, a character vector or a list of strings. Keywords
# are de-duplicated within each element. Anything else, including the default
# logical NA, yields no keywords.
aat_keywords <- function(keywords_string) {
  if (!is.list(keywords_string) && !is.character(keywords_string)) {
    return(character(0))
  }

  tokenize_one <- function(entry) {
    if (!is.character(entry)) {
      return(character(0))
    }
    entry <- entry[!is.na(entry)]
    if (length(entry) == 0L) {
      return(character(0))
    }
    tokens <- tokenizers::tokenize_words(
      entry,
      lowercase = TRUE,
      stopwords = stopwords::stopwords("en"),
      strip_punct = TRUE,
      strip_numeric = TRUE,
      simplify = FALSE
    )
    tokens <- as.character(unlist(tokens, use.names = FALSE))
    unique(tokens[!is.na(tokens) & nzchar(tokens)])
  }

  as.character(
    unlist(lapply(as.list(keywords_string), tokenize_one), use.names = FALSE)
  )
}

#' Query the Getty AAT SPARQL Service
#'
#' Function to query the Getty Art & Architecture Thesaurus (AAT) through the
#' public SPARQL endpoint at \code{vocab.getty.edu}.
#'
#' The query types are:
#' \describe{
#'   \item{direct}{Exact match of \code{term} against the preferred label,
#'     tagged as English. If that request fails, the same query is retried
#'     once without the language tag.}
#'   \item{anylabel}{Exact match of \code{term} (apostrophes and \code{'s}
#'     removed) against any English \code{rdfs:label}.}
#'   \item{fulltext}{Full text search for \code{term} (apostrophes and
#'     \code{'s} removed) within AAT concepts, followed by one additional
#'     search of the form \code{"term AND keyword"} for every keyword
#'     extracted from \code{keywords_string}.}
#'   \item{widesearch}{Like \code{fulltext}, but apostrophes and \code{'s}
#'     are replaced by the wildcard \code{*} and the query is not restricted
#'     to the AAT scheme.}
#' }
#'
#' Failed requests are skipped silently; if every request fails the result
#' has zero rows.
#'
#' @return A dataframe with one row per match and the columns \code{term}
#' (the string provided), \code{aat_term} (the matched term from AAT),
#' \code{aat_id} (the id of the term, prefixed with 'aat:'), \code{aat_note}
#' (Notes of the term), \code{aat_parents} (parent categories of the term).
#' Values the service did not return are \code{NA}.
#'
#' @param term String to check against the AAT
#' @param query_type Which query to use, one of: 'direct', 'anylabel', 'fulltext', or 'widesearch'
#' @param keywords_string String or list to extract keywords from to try to get a better match (optional). Only used by the 'fulltext' and 'widesearch' queries.
#'
#' @examples
#' \dontrun{
#' query_aat(term = "chairs", query_type = "direct")
#' query_aat(
#'   term = "bowl", query_type = "fulltext",
#'   keywords_string = "ceramic bowl with blue glaze"
#' )
#' }
#'
#' @export
#' @importFrom stringr str_replace_all
#' @importFrom jsonlite fromJSON
#' @importFrom stringr str_squish
#' @importFrom tokenizers tokenize_words
#' @importFrom stopwords stopwords
#' @importFrom utils URLencode
#'
query_aat <- function(term = NA, query_type = NA, keywords_string = NA) {

  # Argument checks ----
  aat_require_value(term, "term")
  aat_require_value(query_type, "query_type")

  query_type_options <- c("direct", "anylabel", "fulltext", "widesearch")
  if (!query_type %in% query_type_options) {
    stop("invalid query_type", call. = FALSE)
  }

  # Query templates ----
  # Each template has one "%s" placeholder for the search string. Whitespace
  # is collapsed by aat_fetch() before the query is sent.
  direct_template <- function(lang_tag) {
    paste0(
      "select ?Subject ?Term ?ScopeNote ?Parents {
         ?Subject gvp:prefLabelGVP/xl:literalForm \"%s\"", lang_tag, ";
         skos:inScheme aat:;
         gvp:prefLabelGVP [xl:literalForm ?Term].
         optional {
           ?Subject skos:scopeNote [dct:language gvp_lang:en; rdf:value ?ScopeNote]
         }
         optional {
           ?Subject gvp:parentString ?Parents
         }
       }"
    )
  }

  anylabel_template <- "select ?Subject ?Term ?ScopeNote ?Parents {
       ?Subject rdfs:label \"%s\"@en;
       skos:inScheme aat:;
       gvp:prefLabelGVP [xl:literalForm ?Term];
       optional {
         ?Subject gvp:parentString ?Parents
       }
       optional {
         ?Subject skos:scopeNote [dct:language gvp_lang:en; rdf:value ?ScopeNote]
       }
     }"

  fulltext_template <- "select ?Subject ?Term ?Parents ?ScopeNote {
       ?Subject a skos:Concept; luc:term \"%s\";
       skos:inScheme aat: ;
       gvp:prefLabelGVP [xl:literalForm ?Term].
       optional {
         ?Subject gvp:parentString ?Parents
       }
       optional {
         ?Subject skos:scopeNote [dct:language gvp_lang:en; rdf:value ?ScopeNote]
       }
     }"

  widesearch_template <- "select ?Subject ?Term ?Parents ?ScopeNote ?Type (coalesce(?Type1,?Type2) as ?ExtraType) {
       ?Subject luc:term \"%s\"; a ?typ.
       ?typ rdfs:subClassOf gvp:Subject; rdfs:label ?Type.
       filter (?typ != gvp:Subject)
       optional {?Subject gvp:placeTypePreferred [gvp:prefLabelGVP [xl:literalForm ?Type1]]}
       optional {?Subject gvp:agentTypePreferred [gvp:prefLabelGVP [xl:literalForm ?Type2]]}
       optional {?Subject gvp:prefLabelGVP [xl:literalForm ?Term]}
       optional {?Subject gvp:parentStringAbbrev ?Parents}
       optional {?Subject skos:scopeNote [dct:language gvp_lang:en; rdf:value ?ScopeNote]}}"

  # Search for the cleaned term on its own, then once per keyword as
  # "<term> AND <keyword>". Rows are returned in that order.
  search_with_keywords <- function(template, term_clean) {
    search_strings <- c(
      term_clean,
      paste0(
        rep(term_clean, length(keywords)),
        rep(" AND ", length(keywords)),
        keywords
      )
    )
    lapply(search_strings, function(search_string) {
      aat_fetch(template, search_string, term)
    })
  }

  # Run the selected query ----
  if (query_type == "direct") {
    # Explicit English first.
    hits <- aat_fetch(direct_template("@en"), term, term)
    if (is.null(hits)) {
      # The untagged literal is only tried when the first request failed,
      # not when it simply matched nothing.
      # NOTE(review): confirm whether an empty result should also trigger
      # this retry.
      hits <- aat_fetch(direct_template(""), term, term)
    }
    frames <- list(hits)
  } else if (query_type == "anylabel") {
    term_clean <- stringr::str_replace_all(term, "'s", "")
    term_clean <- stringr::str_replace_all(term_clean, "'", "")
    frames <- list(aat_fetch(anylabel_template, term_clean, term))
  } else if (query_type == "fulltext") {
    keywords <- aat_keywords(keywords_string)
    term_clean <- stringr::str_replace_all(term, "'s", "")
    term_clean <- stringr::str_replace_all(term_clean, "'", "")
    frames <- search_with_keywords(fulltext_template, term_clean)
  } else {
    # widesearch: possessives and apostrophes become wildcards.
    keywords <- aat_keywords(keywords_string)
    term_clean <- stringr::str_replace_all(term, "'s", "*")
    term_clean <- stringr::str_replace_all(term_clean, "'", "*")
    frames <- search_with_keywords(widesearch_template, term_clean)
  }

  # Done, return dataframe with results
  aat_bind(frames)
}
Smithsonian/collexScrubber documentation built on July 18, 2019, 5:38 p.m.