This notebook is adapted from John Snow Labs workshop Jupyter/Python tutorial "1. Spark NLP Basics" (https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/1.SparkNLP_Basics.ipynb)

1. Start Spark Session

library(purrr, warn.conflicts = FALSE)
library(sparklyr, warn.conflicts = FALSE)
library(sparknlp, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)

# Spark version to run; override with the SPARK_VERSION environment variable.
version <- Sys.getenv("SPARK_VERSION", unset = "2.4.5")

config <- sparklyr::spark_config()

# Enable verbose sparklyr diagnostics so readers of the notebook can see what
# the translation layer is doing under the hood.
options(
  sparklyr.sanitize.column.names.verbose = TRUE,
  sparklyr.verbose = TRUE,
  sparklyr.na.omit.verbose = TRUE,
  sparklyr.na.action.verbose = TRUE
)

# Connect to a local Spark instance.
sc <- sparklyr::spark_connect(master = "local", version = version, config = config)

cat("Apache Spark version: ", sc$home_version, "\n")
cat("Spark NLP version: ", nlp_version(), "\n") # fixed: missing trailing newline

2. Using Pretrained Pipelines

https://github.com/JohnSnowLabs/spark-nlp-models

# Sample document used throughout the notebook. The misspellings ("persn",
# "intersting", "brothrs") are DELIBERATE: they give the spell-checking and
# normalization stages below something to correct. Do not "fix" this text.
testDoc <- "Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brothrs. However they don't support each other that much.
Lucas Nogal Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!"

Explain Document ML

Stages

# Download the pretrained "explain_document_ml" pipeline and time a single
# annotate() call over the sample document. NOTE: `pipeline` is reused further
# down in the notebook, so its name must not change.
pipeline <- nlp_pretrained_pipeline(sc, "explain_document_ml", lang = "en")
system.time(
  result <- nlp_annotate(pipeline, testDoc)
)

# Names of the annotator outputs produced by the pipeline
names(result)

# Raw sentence and token annotations
unlist(result$sentence)
unlist(result$token)

# Each token with its part-of-speech tag
data.frame(token = unlist(result$token),
           pos = unlist(result$pos))

# Lemmas, stems and spell-corrected forms, side by side per token
data.frame(token = unlist(result$token),
           lemmas = unlist(result$lemmas),
           stems = unlist(result$stems),
           spell = unlist(result$spell))

Explain Document DL

Stages

# The deep-learning variant adds named-entity recognition on top of the
# stages in explain_document_ml. `pipeline_dl` is reused later for
# nlp_annotate_full(), so its name must stay.
pipeline_dl <- nlp_pretrained_pipeline(sc, "explain_document_dl", lang = "en")
system.time(
  result <- nlp_annotate(pipeline_dl, testDoc)
)

names(result)

# Entities detected in the document
unlist(result$entities)

# One row per token: NER label, spelling correction, POS tag, lemma and stem.
# Note the DL pipeline uses singular output names (lemma/stem) where the ML
# pipeline used plural ones (lemmas/stems).
data.frame(token = unlist(result$token),
           ner_label = unlist(result$ner),
           spell_corrected = unlist(result$checked),
           POS = unlist(result$pos),
           lemmas = unlist(result$lemma),
           stems = unlist(result$stem))

Spell Checker

# Standalone spell-checking pipeline: compare original tokens against their
# corrected forms.
spelling_pipeline <- nlp_pretrained_pipeline(sc, "check_spelling", lang = "en")
result <- nlp_annotate(spelling_pipeline, testDoc)

names(result)

data.frame(token = unlist(result$token),
           checked = unlist(result$checked))

Parsing a list of texts

# nlp_annotate() also accepts a character vector; each element is processed as
# a separate document. The misspellings ("pioner", "wrate", "aircrast") are
# intentional spell-check fodder.
testDoc_list <- c(
  "French author who helped pioner the science-fiction genre.",
  "Verne wrate about space, air, and underwater travel before navigable aircrast",
  "Practical submarines were invented, and before any means of space travel had been devised."
)

testDoc_list

result_list <- nlp_annotate(pipeline, testDoc_list)

# One result set per input document
length(result_list)
result_list[[1]]

Using fullAnnotate to get more details

text <- "Peter Parker is a nice guy and lives in New York"

# nlp_annotate_full() keeps the complete annotation structure (result string,
# begin/end character offsets, metadata) instead of just the result strings.
detailed_result <- nlp_annotate_full(pipeline_dl, text)
jsonlite::toJSON(detailed_result, force = TRUE, auto_unbox = TRUE)

detailed_result$entities

# Entity chunk text and the entity label carried in the annotation metadata
chunks <- purrr::map_chr(detailed_result$entities, "result")
entities <- purrr::map_chr(detailed_result$entities, function(x) x$metadata$entity)

df <- data.frame(chunks = chunks, entities = entities)
df

# Token-level detail. begin/end are integer offsets, so extract them with
# map_int(): map_chr() on a non-character field errors in purrr >= 1.0
# (it no longer silently coerces). purrr:: is used consistently throughout
# to match the entity extraction above.
sent_ids <- purrr::map_chr(detailed_result$token, function(x) x$metadata$sentence)
tokens <- purrr::map_chr(detailed_result$token, "result")
starts <- purrr::map_int(detailed_result$token, "begin")
ends <- purrr::map_int(detailed_result$token, "end")
pos <- purrr::map_chr(detailed_result$pos, "result")
ner <- purrr::map_chr(detailed_result$ner, "result")

df <- data.frame(sent_id = sent_ids, token = tokens, start = starts,
                 end = ends, pos = pos, ner = ner)
df

Use pretrained match_chunk Pipeline for Individual Noun Phrase

Stages

Pipeline: The pipeline uses the regex <DT>?<JJ>*<NN>+, which states that whenever the chunker finds an optional determiner (DT) followed by any number of adjectives (JJ) and then one or more nouns (NN), a Noun Phrase (NP) chunk should be formed.

pipeline <- nlp_pretrained_pipeline(sc, "match_chunks", lang = "en")

# A sentence containing a single noun phrase
result <- nlp_annotate(pipeline, "The book has many chapters")
map(result, unlist)
result$chunk

# A sentence containing multiple noun phrases
result <- nlp_annotate(pipeline, "the little yellow dog barked at the cat")
map(result, unlist)
unlist(result$chunk)

Extract exact dates from referential date phrases

# Pipeline that resolves relative date expressions ("yesterday", "next week")
# against the current date.
pipeline <- nlp_pretrained_pipeline(sc, "match_datetime", lang = "en")

date_text <- "I saw him yesterday and he told me that he will visit us next week"

result <- nlp_annotate(pipeline, date_text)
map(result, unlist)

# Full annotations expose the matched spans and resolved dates
full_result <- nlp_annotate_full(pipeline, date_text)
jsonlite::toJSON(full_result, force = TRUE, auto_unbox = TRUE)

Sentiment Analysis

# Pretrained sentiment pipeline: produces a sentiment label per sentence.
pipeline <- nlp_pretrained_pipeline(sc, "analyze_sentiment", lang = "en")

sentiment_result <- nlp_annotate(
  pipeline,
  "The movie I watched today was not a good one"
)
unlist(sentiment_result$sentiment)


r-spark/sparknlp documentation built on Oct. 15, 2022, 10:50 a.m.