R/linguistic_analysis.R

key <- "d64d9507f0b74acca33d9786a33fbc34"

analyze_linguistics <- function(text, analyzer = c("Tokens", "Constituency_Tree", "POS_Tags"), language = "en") {
  #Removing punctuation from text
  text <- gsub("[[:punct:]]", "", text)

  #Setting basic variables
  url <- "https://westus.api.cognitive.microsoft.com/linguistics/v1.0/analyze"

  #Setting key value, will throw error if env var not set
  key <- get_ling_analytics_key()

  #Checking for valid analyzer and throwing error if not valid
  if(!match_kind(analyzer)) {
    stop("Supplied analyzer is not supported. Pass in one of: POS_Tags, Constituency_Tree, Tokens",call. = FALSE)
  }

  #Checking for valid language and throwing error if not valid
  if(!match_language(language)) {
    stop("Supplied language is not supported. Pass in one of: en,es,fr,pt",call. = FALSE)
  }

  #Getting the id of the analyzer requested
  analyzers <- get_analyzers()
  analyzer_ids <- get_analyzer_id(analyzer, analyzers)

  #Coverting the request body to JSON
  request_body_json <- jsonlite::toJSON(list(language = jsonlite::unbox(language), analyzerIds = analyzer_ids, text = jsonlite::unbox(text)), auto_unbox = FALSE)

  #Sending request
  result <- httr::POST(url,
                       body = request_body_json,
                       httr::add_headers(.headers = c("Content-Type" = "application/json",
                                                      "Ocp-Apim-Subscription-Key" = key)))

  # #Throwing error if request does not return json
   if (httr::http_type(result) != "application/json") {
     stop("API did not return json", call. = FALSE)
   }

  #Converting JSON to R-usable object
  output <- httr::content(result)

  #Throwing error if status code != 200
  if (httr::status_code(result) != 200) {
    stop(
      paste(
        "API request failed.\nStatus code:",
        output$statusCode, "\nMessage:",
        output$message
      ),
      call. = FALSE
    )
  }

  #Coverting output to a dataframe

  pos = FALSE
  con = FALSE
  tok = FALSE

  if ("pos_tags" %in% tolower(analyzer))
  {
    count <- 1
    if(get_analyzer_kind(output[[1]][1],analyzers) == "POS_Tags") {
      count <- 1
    } else if (get_analyzer_kind(output[[2]][1],analyzers) == "POS_Tags") {
      count <- 2
    } else if (get_analyzer_kind(output[[3]][1],analyzers) == "POS_Tags") {
      count <- 3
    }
    n <- NROW(output[[count]]$result[[1]])
    p <- data.frame(matrix(unlist(output[[count]]$result[[1]]), nrow = n, byrow = T))
    words <- strsplit(text, split = " ")
    p$Text <- as.list(words[[1]])
    names(p) <- c("POS", "Text")
    p$POS_long <- lapply(as.character(p$POS), find_POS)
    p <- p[,c(2,1,3)]

    pos = TRUE
  }
  if ("constituency_tree" %in% tolower(analyzer))
  {
    count <- 1
    if(get_analyzer_kind(output[[1]][1],analyzers) == "Constituency_Tree") {
      count <- 1
    } else if (get_analyzer_kind(output[[2]][1],analyzers) == "Constituency_Tree") {
      count <- 2
    } else if (get_analyzer_kind(output[[3]][1],analyzers) == "Constituency_Tree") {
      count <- 3
    }
    n <- NROW(output[[count]]$result[[1]])
    c <- output[[count]]$result[[1]]
    con = TRUE
  }
  if ("tokens" %in% tolower(analyzer))
  {
    count <- 1
    if(get_analyzer_kind(output[[1]][1],analyzers) == "Tokens") {
      count <- 1
    } else if (get_analyzer_kind(output[[2]][1],analyzers) == "Tokens") {
      count <- 2
    } else if (get_analyzer_kind(output[[3]][1],analyzers) == "Tokens") {
      count <- 3
    }
    n <- NROW(output[[count]]$result[[1]]$Tokens)
    t <- data.frame(matrix(unlist(output[[count]]$result[[1]]$Tokens), nrow = n, byrow = T))
    names(t) <- c("Length", "NormalizedToken", "Offset", "RawToken")
    tok = TRUE
  }

  if (pos && con && tok) {
    out <- list(POS_Tags = p, Constituency_Tree = c, Tokens = t)
  } else if (pos && con) {
    out <- list(POS_Tags = p, Constituency_Tree = c)
  } else if (pos && tok) {
    out <- list(POS_Tags = p, Tokens = t)
  } else if (con && tok) {
    out <- list(Constituency_Tree = c, Tokens = t)
  } else if (pos) {
    out <- p
  } else if (con) {
    out <- c
  } else if (tok) {
    out <- t
  }

  out
}

get_analyzers <- function() {
  #Setting basic variables
  url <- "https://westus.api.cognitive.microsoft.com/linguistics/v1.0/analyzers"

  #Setting key value, will throw error if env var not set
  key <- get_ling_analytics_key()

  #Sending request
  result <- httr::GET(url, httr::add_headers(.headers = c("Ocp-Apim-Subscription-Key" = key)))

  #Throwing error if request does not return json
  if (httr::http_type(result) != "application/json") {
    stop("API did not return json", call. = FALSE)
  }

  #Converting JSON to R-usable object
  output <- httr::content(result)

  #Throwing error if status code != 200
  if (httr::status_code(result) != 200) {
    stop(
      paste(
        "API request failed.\nStatus code:",
        output$statusCode, "\nMessage:",
        output$message
      ),
      call. = FALSE
    )
  }

  n <- NROW(output)
  #Coverting output to a dataframe
  output <- data.frame(matrix(unlist(output), nrow = n, byrow = T))
  names(output) <- c("Id", "Language", "Kind", "Specification", "Implementation")
  output
}

get_ling_analytics_key <- function() {
  key <- Sys.getenv('KEY_LA')
  if (identical(key, "")) {
    stop("Please set env var KEY_LA to your key for the linguistic analysis APIS. Key provided by Microsoft (https://www.microsoft.com/cognitive-services)",
         call. = FALSE)
  }

  key
}

match_kind <- function(kind) {
  kind <- tolower(kind)
  valid_kind <- c("pos_tags", "constituency_tree", "tokens")

  for (k in kind) {
    if (!(k %in% valid_kind)) {
      return(FALSE)
    }
  }
  TRUE
}

get_analyzer_id <- function(kind, list) {
  num <- 0
  analyzers <- c()
  for (x in list$Kind) {
    num <- num + 1;
    for (k in kind) {
      if (x == k) {
        analyzers <- c(analyzers, as.character(list$Id[num]))
      }
    }
  }

  analyzers
}

get_analyzer_kind <- function(id, analyzers) {
  num <- 0
  for (a in analyzers$Id) {
    num <- num + 1;
    if (a == as.character(id)) {
      return(analyzers$Kind[num])
    }
  }
}

match_language <- function(lang) {
  lang <- tolower(lang)
  valid_lang <- c("en", "es", "fr", "pt")

  if (lang %in% valid_lang) {
    return(TRUE)
  }
  else {
    return(FALSE)
  }
}

find_POS <- function(abr) {
  abr <- tolower(abr)
  switch(abr,
         cc = "conjunction, coordinating",
         cd = "numeral, cardinal",
         dt = "determiner",
         ex = "existential there",
         fw = "foreign word",
         'in' = "preposition or subordinating conjunction",
         jj = "adjective or numeral, ordinal",
         jjr = "adjective, compatative",
         jjs = "adjective, superlative",
         ls = "list item marker",
         md = "modal auxiliary",
         nn = "noun, common, singular or mass",
         nnp = "noun, proper, singular",
         nnps = "noun, proper, plural",
         nns = "noun, common, plural",
         pdt = "pre-determiner",
         pos = "genitive marker",
         prp = "pronoun, personal",
         'prp$' = "pronoun, possessive",
         rb = "adverb",
         rbr = "adverb, comparative",
         rbs = "adverb, superlative",
         rp = "particle",
         sym = "symbol",
         to = 'to" as preposition or infinitive marker',
         uh = "interjection",
         vb = "verb, base form",
         vbd = "verb, past tense",
         vbg = "verb, present participle or gerund",
         vbn = "verb, past participle",
         vbp = "verb, present tense, not 3rd person singular",
         vbz = "verb, present tense, 3rd person singular",
         wdt = "WH-determiner",
         wp = "WH-pronoun",
         'wp$' = "WH-pronoun, possessive",
         wrb = "Wh-adverb")
}
dereklegenzoff/cognitiver documentation built on May 5, 2019, 3:49 a.m.