key <- "d64d9507f0b74acca33d9786a33fbc34"
analyze_linguistics <- function(text, analyzer = c("Tokens", "Constituency_Tree", "POS_Tags"), language = "en") {
#Removing punctuation from text
text <- gsub("[[:punct:]]", "", text)
#Setting basic variables
url <- "https://westus.api.cognitive.microsoft.com/linguistics/v1.0/analyze"
#Setting key value, will throw error if env var not set
key <- get_ling_analytics_key()
#Checking for valid analyzer and throwing error if not valid
if(!match_kind(analyzer)) {
stop("Supplied analyzer is not supported. Pass in one of: POS_Tags, Constituency_Tree, Tokens",call. = FALSE)
}
#Checking for valid language and throwing error if not valid
if(!match_language(language)) {
stop("Supplied language is not supported. Pass in one of: en,es,fr,pt",call. = FALSE)
}
#Getting the id of the analyzer requested
analyzers <- get_analyzers()
analyzer_ids <- get_analyzer_id(analyzer, analyzers)
#Coverting the request body to JSON
request_body_json <- jsonlite::toJSON(list(language = jsonlite::unbox(language), analyzerIds = analyzer_ids, text = jsonlite::unbox(text)), auto_unbox = FALSE)
#Sending request
result <- httr::POST(url,
body = request_body_json,
httr::add_headers(.headers = c("Content-Type" = "application/json",
"Ocp-Apim-Subscription-Key" = key)))
# #Throwing error if request does not return json
if (httr::http_type(result) != "application/json") {
stop("API did not return json", call. = FALSE)
}
#Converting JSON to R-usable object
output <- httr::content(result)
#Throwing error if status code != 200
if (httr::status_code(result) != 200) {
stop(
paste(
"API request failed.\nStatus code:",
output$statusCode, "\nMessage:",
output$message
),
call. = FALSE
)
}
#Coverting output to a dataframe
pos = FALSE
con = FALSE
tok = FALSE
if ("pos_tags" %in% tolower(analyzer))
{
count <- 1
if(get_analyzer_kind(output[[1]][1],analyzers) == "POS_Tags") {
count <- 1
} else if (get_analyzer_kind(output[[2]][1],analyzers) == "POS_Tags") {
count <- 2
} else if (get_analyzer_kind(output[[3]][1],analyzers) == "POS_Tags") {
count <- 3
}
n <- NROW(output[[count]]$result[[1]])
p <- data.frame(matrix(unlist(output[[count]]$result[[1]]), nrow = n, byrow = T))
words <- strsplit(text, split = " ")
p$Text <- as.list(words[[1]])
names(p) <- c("POS", "Text")
p$POS_long <- lapply(as.character(p$POS), find_POS)
p <- p[,c(2,1,3)]
pos = TRUE
}
if ("constituency_tree" %in% tolower(analyzer))
{
count <- 1
if(get_analyzer_kind(output[[1]][1],analyzers) == "Constituency_Tree") {
count <- 1
} else if (get_analyzer_kind(output[[2]][1],analyzers) == "Constituency_Tree") {
count <- 2
} else if (get_analyzer_kind(output[[3]][1],analyzers) == "Constituency_Tree") {
count <- 3
}
n <- NROW(output[[count]]$result[[1]])
c <- output[[count]]$result[[1]]
con = TRUE
}
if ("tokens" %in% tolower(analyzer))
{
count <- 1
if(get_analyzer_kind(output[[1]][1],analyzers) == "Tokens") {
count <- 1
} else if (get_analyzer_kind(output[[2]][1],analyzers) == "Tokens") {
count <- 2
} else if (get_analyzer_kind(output[[3]][1],analyzers) == "Tokens") {
count <- 3
}
n <- NROW(output[[count]]$result[[1]]$Tokens)
t <- data.frame(matrix(unlist(output[[count]]$result[[1]]$Tokens), nrow = n, byrow = T))
names(t) <- c("Length", "NormalizedToken", "Offset", "RawToken")
tok = TRUE
}
if (pos && con && tok) {
out <- list(POS_Tags = p, Constituency_Tree = c, Tokens = t)
} else if (pos && con) {
out <- list(POS_Tags = p, Constituency_Tree = c)
} else if (pos && tok) {
out <- list(POS_Tags = p, Tokens = t)
} else if (con && tok) {
out <- list(Constituency_Tree = c, Tokens = t)
} else if (pos) {
out <- p
} else if (con) {
out <- c
} else if (tok) {
out <- t
}
out
}
get_analyzers <- function() {
#Setting basic variables
url <- "https://westus.api.cognitive.microsoft.com/linguistics/v1.0/analyzers"
#Setting key value, will throw error if env var not set
key <- get_ling_analytics_key()
#Sending request
result <- httr::GET(url, httr::add_headers(.headers = c("Ocp-Apim-Subscription-Key" = key)))
#Throwing error if request does not return json
if (httr::http_type(result) != "application/json") {
stop("API did not return json", call. = FALSE)
}
#Converting JSON to R-usable object
output <- httr::content(result)
#Throwing error if status code != 200
if (httr::status_code(result) != 200) {
stop(
paste(
"API request failed.\nStatus code:",
output$statusCode, "\nMessage:",
output$message
),
call. = FALSE
)
}
n <- NROW(output)
#Coverting output to a dataframe
output <- data.frame(matrix(unlist(output), nrow = n, byrow = T))
names(output) <- c("Id", "Language", "Kind", "Specification", "Implementation")
output
}
get_ling_analytics_key <- function() {
key <- Sys.getenv('KEY_LA')
if (identical(key, "")) {
stop("Please set env var KEY_LA to your key for the linguistic analysis APIS. Key provided by Microsoft (https://www.microsoft.com/cognitive-services)",
call. = FALSE)
}
key
}
match_kind <- function(kind) {
kind <- tolower(kind)
valid_kind <- c("pos_tags", "constituency_tree", "tokens")
for (k in kind) {
if (!(k %in% valid_kind)) {
return(FALSE)
}
}
TRUE
}
get_analyzer_id <- function(kind, list) {
num <- 0
analyzers <- c()
for (x in list$Kind) {
num <- num + 1;
for (k in kind) {
if (x == k) {
analyzers <- c(analyzers, as.character(list$Id[num]))
}
}
}
analyzers
}
get_analyzer_kind <- function(id, analyzers) {
num <- 0
for (a in analyzers$Id) {
num <- num + 1;
if (a == as.character(id)) {
return(analyzers$Kind[num])
}
}
}
match_language <- function(lang) {
lang <- tolower(lang)
valid_lang <- c("en", "es", "fr", "pt")
if (lang %in% valid_lang) {
return(TRUE)
}
else {
return(FALSE)
}
}
find_POS <- function(abr) {
abr <- tolower(abr)
switch(abr,
cc = "conjunction, coordinating",
cd = "numeral, cardinal",
dt = "determiner",
ex = "existential there",
fw = "foreign word",
'in' = "preposition or subordinating conjunction",
jj = "adjective or numeral, ordinal",
jjr = "adjective, compatative",
jjs = "adjective, superlative",
ls = "list item marker",
md = "modal auxiliary",
nn = "noun, common, singular or mass",
nnp = "noun, proper, singular",
nnps = "noun, proper, plural",
nns = "noun, common, plural",
pdt = "pre-determiner",
pos = "genitive marker",
prp = "pronoun, personal",
'prp$' = "pronoun, possessive",
rb = "adverb",
rbr = "adverb, comparative",
rbs = "adverb, superlative",
rp = "particle",
sym = "symbol",
to = 'to" as preposition or infinitive marker',
uh = "interjection",
vb = "verb, base form",
vbd = "verb, past tense",
vbg = "verb, present participle or gerund",
vbn = "verb, past participle",
vbp = "verb, present tense, not 3rd person singular",
vbz = "verb, present tense, 3rd person singular",
wdt = "WH-determiner",
wp = "WH-pronoun",
'wp$' = "WH-pronoun, possessive",
wrb = "Wh-adverb")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.