R/jbt.R

#'
#'
#' access jbt models cached data
#'
#'

jbt <- new.env(parent = .GlobalEnv)

with(jbt, {

  models <- list()

  .INITIALIZED <- F
  .init <- function(reinitialize = F) {
    if(!.INITIALIZED || reinitialize){
      models_available <- .jbt_models_available()
      models <<- lapply(names(models_available), .getmodel)
      names(models) <<- names(models_available)
      .INITIALIZED <- T
    }
  }

  .sim_urlpattern='http://ltmaggie.informatik.uni-hamburg.de/jobimviz/ws/api/${model}/jo/similar/${term}%23${pos}?numberOfEntries=1000&format=json'
  .sim_urlpattern_nopos='http://ltmaggie.informatik.uni-hamburg.de/jobimviz/ws/api/${model}/jo/similar/${term}?numberOfEntries=1000&format=json'
  .sense_urlpattern='http://ltmaggie.informatik.uni-hamburg.de/jobimviz/ws/api/${model}/jo/senses/${term}%23${pos}&format=json&sensetype=CW'
  .sense_urlpattern_nopos='http://ltmaggie.informatik.uni-hamburg.de/jobimviz/ws/api/${model}/jo/senses/${term}&format=json&sensetype=CW'
  .sense_fine_urlpattern='http://ltmaggie.informatik.uni-hamburg.de/jobimviz/ws/api/${model}/jo/senses/${term}%23${pos}?format=json&sensetype=CW-finer'
  .sense_fine_urlpattern_nopos='http://ltmaggie.informatik.uni-hamburg.de/jobimviz/ws/api/${model}/jo/senses/${term}?format=json&sensetype=CW-finer'

  .jbt_models_available <- function() list(
    en_jbt_trigram = list(lang = 'en', desc='ngram', apiname = 'trigram', sensemodel = F, finersensemodel = F, POS = F),
    en_jbt_stanfordNew = list(lang = 'en', desc='parse', apiname = 'stanfordNew', sensemodel = T, finersensemodel = T, POS = T),
    #en_jbt_stanfordContext = list(lang = 'en', desc='parse', apiname = 'stanfordContext', sensemodel = F, finersensemodel = F, POS = T),
    #en_jbt_wikipediaStanford = list(lang = 'en', desc='parse', apiname = 'wikipediaStanford', sensemodel = F, finersensemodel = F, POS = T),
    #en_jbt_medlineTrigram = list(lang = 'en', desc='ngram', apiname = 'medlineTrigram', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_medlineParsed = list(lang = 'en', desc='ngram', apiname = 'medlineParsed', sensemodel = F, finersensemodel = F, POS = T),
    #en_jbt_medlineTrigramMwe = list(lang = 'en', desc='ngram', apiname = 'medlineTrigramMwe', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_superfamilyBigram = list(lang = 'en', desc='ngram', apiname = 'superfamilyBigram', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_superfamilyTrigram = list(lang = 'en', desc='ngram', apiname = 'superfamilyTrigram', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_pfamBigram = list(lang = 'en', desc='ngram', apiname = 'pfamBigram', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_pfamTrigram = list(lang = 'en', desc='ngram', apiname = 'pfamTrigram', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_reviewsTrigram = list(lang = 'en', desc='ngram', apiname = 'reviewsTrigram', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_twitter2012Bigram = list(lang = 'en', desc='ngram', apiname = 'twitter2012Bigram', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_google = list(lang = 'en', desc='ngram', apiname = 'google', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_stanford = list(lang = 'en', desc='parse', apiname = 'stanford', sensemodel = F, finersensemodel = F, POS = T),
    #en_jbt_google1520 = list(lang = 'en', desc='ngram', apiname = 'google1520', sensemodel = F, finersensemodel = F, POS = F),
    #en_jbt_wikipediaTrigram = list(lang = 'en', desc='ngram', apiname = 'wikipediaTrigram', sensemodel = F, finersensemodel = F, POS = F),
    de_jbt_germanTrigram = list(lang = 'de', desc='ngram', apiname = 'germanTrigram', sensemodel = F, finersensemodel = F, POS = F),
    de_jbt_germanParsedLemma = list(lang = 'de', desc='parse', apiname = 'germanParsedLemma', sensemodel = F, finersensemodel = F, POS = T),
    #de_jbt_germanTrigramMwe = list(lang = 'de', desc='jbt_mwe', apiname = 'germanTrigramMwe', sensemodel = F, finersensemodel = F, POS = F),
    #de_jbt_germanEducrawl = list(lang = 'de', desc='parse', apiname = 'germanEducrawl', sensemodel = F, finersensemodel = F, POS = T),
    #de_jbt_twitterDETrigram = list(lang = 'de', desc='ngram', apiname = 'twitterDETrigram', sensemodel = F, finersensemodel = F, POS = F),
    es_jbt_spanishTrigram = list(lang = 'es', desc='ngram', apiname = 'spanishTrigram', sensemodel = F, finersensemodel = F, POS = F),
    fr_jbt_frenchTrigram = list(lang = 'fr', desc='ngram', apiname = 'frenchTrigram', sensemodel = F, finersensemodel = F, POS = F),
    ar_jbt_arabicTrigram = list(lang = 'ar', desc='ngram', apiname = 'arabicTrigram', sensemodel = F, finersensemodel = F, POS = F),
    bn_jbt_bengaliBigram = list(lang = 'bn', desc='ngram', apiname = 'bengaliBigram', sensemodel = F, finersensemodel = F, POS = F),
    he_jbt_hebrewTrigram = list(lang = 'he', desc='ngram', apiname = 'hebrewTrigram', sensemodel = F, finersensemodel = F, POS = F),
    hi_jbt_hindiTrigram = list(lang = 'hi', desc='ngram', apiname = 'hindiTrigram', sensemodel = F, finersensemodel = F, POS = F),
    hi_jbt_hindiBigram = list(lang = 'hi', desc='ngram', apiname = 'hindiBigram', sensemodel = F, finersensemodel = F, POS = F),
    nl_jbt_dutchTrigram = list(lang = 'nl', desc='ngram', apiname = 'dutchTrigram', sensemodel = F, finersensemodel = F, POS = F),
    ru_jbt_russianTrigram = list(lang = 'ru', desc='ngram', apiname = 'russianTrigram', sensemodel = F, finersensemodel = F, POS = F),
    sv_jbt_swedishTrigramMwe = list(lang = 'sv', desc='ngram', apiname = 'swedishTrigramMwe', sensemodel = F, finersensemodel = F, POS = F),
    tr_jbt_turkishTrigram = list(lang = 'tr', desc='ngram', apiname = 'turkishTrigram', sensemodel = F, finersensemodel = F, POS = F)
  )

  .get_jbt_url = function(pattern, model, term, pos)
    return(stringr::str_interp(pattern))

  .modelnames_for_lang <- function(lang) {
    matching_models <- grep(paste0('^',lang,'_'), names(.jbt_models_available()), value=T)
    return(matching_models)
  }

  .get_best_modelname_for_lang <- function(lang) {
    matching_models <- .modelnames_for_lang(lang)
    if(length(matching_models) > 0){
      return(matching_models[[1]])
    }
    return(NULL)
  }

  #'
  #' Helper function to convert POS tags to JBT POS tags
  #'
  .convertToJbtPOS <- function(POS) {
    POS_ <- toupper(POS)
    switch(
      POS_,
      A = 'JJ',
      J = 'JJ',
      N = 'NN',
      V = 'VB',
      {
        #warning(sprintf('unkown POS %s', POS))
        POS
      }
    )
  }

  .cleanJbtTerm <- function(...){
    if(length(...)[[1]] > 1) # gsub can handle list and vectors, but not lists of lists, ...
      sapply(..., .cleanJbtTerm)
    else
      # clean both, isas (':') or jbt terms ('#'), clear empty spaces
      gsub('^\\s+', '',  # whitespae at end
           gsub('\\s+$', '', # whitespae at front
                gsub('#.*', '', # jbt term: foo#NN
                     gsub(':.*','', ...) # isa term: foo:4711
                )))
  }

  .get_json_from_url = function(url){
    tryCatch(
      expr = {
        # get from api
       util$message(sprintf('querying  \'%s\'.', url))
        # try to fetch and read json document
        js_doc <- jsonlite::fromJSON(txt = url)
        return(js_doc)
      },
      error = function(err) {
        util$message(sprintf('ERROR retrieving \'%s\': %s', url, err))
        return(NA)
      }
    )
  }

  #'
  #'
  #'
  .get_JBT_similarities <- function(term, POS = NA, jbt_modelname) {
    if(is.na(POS)) POS <- 'N'
    model <- models[[jbt_modelname]]()
    if(model$POS)
      jbtPOS <- .convertToJbtPOS(POS)
    else
      jbtPOS <- ''

    # get from temp dir if existent
    fname <- cache$get_filename(term, jbtPOS, dirname = cache$data_temp_dir(), prefix = paste0('jbtsimapi__', jbt_modelname, '__'))
    js_doc <- cache$load(filename = fname, computefun = function() {
      pattern <- if(model$POS) .sim_urlpattern else .sim_urlpattern_nopos
      url <- .get_jbt_url(pattern, model$apiname, term, jbtPOS)
      .get_json_from_url(url)
    })

    if (is.list(js_doc)) {
      if (length(js_doc$results) > 0) {
        sim <- data.frame(list(jbtterm=js_doc$results$key, score=js_doc$results$score), row.names = NULL)
        sim['term'] <- .cleanJbtTerm(sim$jbtterm)
        return(sim)
      }
    }
    return(setNames(data.frame(matrix(ncol = 2, nrow = 0)), c('jbtterm', 'score'))
)
  }

  #'
  #'
  #'
  .get_JBT_senses <- function(term, POS = NA, finer=T, isas = F, jbt_modelname) {
    if(is.na(POS)) POS <- 'N'
    model <- models[[jbt_modelname]]()
    if(model$POS)
      jbtPOS <- .convertToJbtPOS(POS)
    else
      jbtPOS <- ''

    fname <- cache$get_filename(term, jbtPOS, dirname = cache$data_temp_dir(), prefix = paste0('jbtsenseapi', if(finer) 'finer' else '' ,'__', jbt_modelname, '__'))
    json_doc <- cache$load(fname, function() {
      if(finer){
        pattern <- if(model$POS) .sense_fine_urlpattern else .sense_fine_urlpattern_nopos
        url <- .get_jbt_url(pattern, model$apiname, term, jbtPOS)
      }else{
        pattern <- if(model$POS) .sense_urlpattern else .sense_urlpattern_nopos
        url <- .get_jbt_url(pattern, model$apiname, term, jbtPOS)
      }
      .get_json_from_url(url)
    })

    if (is.list(json_doc)) {
      if (length(json_doc$result) > 0){
        if (isas){
          return(Filter(function(l) length(l) > 0, json_doc$result$isas))
        }
        # else
        return(Filter(function(l) length(l) > 0, json_doc$result$senses))
      }
    }
    # else
    return(list())
  }

  .getmodel <- function(jbtmodelname){
    jbtmodel <- .jbt_models_available()[[jbtmodelname]]
    jbtmodel$name <- jbtmodelname
    jbtmodel$sim <- function(term, POS = NA) .get_JBT_similarities(term, POS, jbtmodelname)
    jbtmodel$senses <- function(term, POS = NA, finer=T, isas = F) .get_JBT_senses(term, POS, finer, isas, jbtmodelname)
    return(function() jbtmodel)
  }

}) # end with(...)
uhh-lt/senseasim documentation built on March 2, 2020, 9:04 p.m.