This notebook is adapted from John Snow Labs workshop Jupyter/Python tutorial "1. Spark NLP Basics" (https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/1.SparkNLP_Basics.ipynb)

1. Start Spark Session

library(purrr, warn.conflicts = FALSE)
library(sparklyr, warn.conflicts = FALSE)
library(sparknlp, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)

# Spark version to run; override with the SPARK_VERSION environment variable.
version <- Sys.getenv("SPARK_VERSION", unset = "2.4.5")

config <- sparklyr::spark_config()

# Enable verbose sparklyr diagnostics so readers of the notebook can see what
# the translation layer is doing under the hood.
options(
  sparklyr.sanitize.column.names.verbose = TRUE,
  sparklyr.verbose = TRUE,
  sparklyr.na.omit.verbose = TRUE,
  sparklyr.na.action.verbose = TRUE
)

# Connect to a local Spark instance.
sc <- sparklyr::spark_connect(master = "local", version = version, config = config)

cat("Apache Spark version: ", sc$home_version, "\n")
cat("Spark NLP version: ", nlp_version(), "\n") # fixed: missing trailing newline

2. Using Pretrained Pipelines

https://github.com/JohnSnowLabs/spark-nlp-models

# Sample document used throughout the notebook. The misspellings ("persn",
# "intersting", "brothrs") are DELIBERATE: they give the spell-checking and
# normalization stages below something to correct. Do not "fix" this text.
testDoc <- "Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brothrs. However they don't support each other that much.
Lucas Nogal Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!"

Explain Document ML

Stages

# Download the pretrained "explain_document_ml" pipeline and time a single
# annotate() call over the sample document. NOTE: `pipeline` is reused further
# down in the notebook, so its name must not change.
pipeline <- nlp_pretrained_pipeline(sc, "explain_document_ml", lang = "en")
system.time(
  result <- nlp_annotate(pipeline, testDoc)
)

# Names of the annotator outputs produced by the pipeline
names(result)

# Raw sentence and token annotations
unlist(result$sentence)
unlist(result$token)

# Each token with its part-of-speech tag
data.frame(token = unlist(result$token),
           pos = unlist(result$pos))

# Lemmas, stems and spell-corrected forms, side by side per token
data.frame(token = unlist(result$token),
           lemmas = unlist(result$lemmas),
           stems = unlist(result$stems),
           spell = unlist(result$spell))

Explain Document DL

Stages

# The deep-learning variant adds named-entity recognition on top of the
# stages in explain_document_ml. `pipeline_dl` is reused later for
# nlp_annotate_full(), so its name must stay.
pipeline_dl <- nlp_pretrained_pipeline(sc, "explain_document_dl", lang = "en")
system.time(
  result <- nlp_annotate(pipeline_dl, testDoc)
)

names(result)

# Entities detected in the document
unlist(result$entities)

# One row per token: NER label, spelling correction, POS tag, lemma and stem.
# Note the DL pipeline uses singular output names (lemma/stem) where the ML
# pipeline used plural ones (lemmas/stems).
data.frame(token = unlist(result$token),
           ner_label = unlist(result$ner),
           spell_corrected = unlist(result$checked),
           POS = unlist(result$pos),
           lemmas = unlist(result$lemma),
           stems = unlist(result$stem))

Spell Checker

# Standalone spell-checking pipeline: compare original tokens against their
# corrected forms.
spelling_pipeline <- nlp_pretrained_pipeline(sc, "check_spelling", lang = "en")
result <- nlp_annotate(spelling_pipeline, testDoc)

names(result)

data.frame(token = unlist(result$token),
           checked = unlist(result$checked))

Parsing a list of texts

# nlp_annotate() also accepts a character vector; each element is processed as
# a separate document. The misspellings ("pioner", "wrate", "aircrast") are
# intentional spell-check fodder.
testDoc_list <- c(
  "French author who helped pioner the science-fiction genre.",
  "Verne wrate about space, air, and underwater travel before navigable aircrast",
  "Practical submarines were invented, and before any means of space travel had been devised."
)

testDoc_list

result_list <- nlp_annotate(pipeline, testDoc_list)

# One result set per input document
length(result_list)
result_list[[1]]

Using fullAnnotate to get more details

text <- "Peter Parker is a nice guy and lives in New York"

# nlp_annotate_full() keeps the complete annotation structure (result string,
# begin/end character offsets, metadata) instead of just the result strings.
detailed_result <- nlp_annotate_full(pipeline_dl, text)
jsonlite::toJSON(detailed_result, force = TRUE, auto_unbox = TRUE)

detailed_result$entities

# Entity chunk text and the entity label carried in the annotation metadata
chunks <- purrr::map_chr(detailed_result$entities, "result")
entities <- purrr::map_chr(detailed_result$entities, function(x) x$metadata$entity)

df <- data.frame(chunks = chunks, entities = entities)
df

# Token-level detail. begin/end are integer offsets, so extract them with
# map_int(): map_chr() on a non-character field errors in purrr >= 1.0
# (it no longer silently coerces). purrr:: is used consistently throughout
# to match the entity extraction above.
sent_ids <- purrr::map_chr(detailed_result$token, function(x) x$metadata$sentence)
tokens <- purrr::map_chr(detailed_result$token, "result")
starts <- purrr::map_int(detailed_result$token, "begin")
ends <- purrr::map_int(detailed_result$token, "end")
pos <- purrr::map_chr(detailed_result$pos, "result")
ner <- purrr::map_chr(detailed_result$ner, "result")

df <- data.frame(sent_id = sent_ids, token = tokens, start = starts,
                 end = ends, pos = pos, ner = ner)
df

Use pretrained match_chunk Pipeline for Individual Noun Phrase

Stages

Pipeline: The pipeline uses the regex <DT>?<JJ>*<NN>+, which states that whenever the chunker finds an optional determiner (DT) followed by any number of adjectives (JJ) and then one or more nouns (NN), a Noun Phrase (NP) chunk should be formed.

pipeline <- nlp_pretrained_pipeline(sc, "match_chunks", lang = "en")

# A sentence containing a single noun phrase
result <- nlp_annotate(pipeline, "The book has many chapters")
map(result, unlist)
result$chunk

# A sentence containing multiple noun phrases
result <- nlp_annotate(pipeline, "the little yellow dog barked at the cat")
map(result, unlist)
unlist(result$chunk)

Extract exact dates from referential date phrases

# Pipeline that resolves relative date expressions ("yesterday", "next week")
# against the current date.
pipeline <- nlp_pretrained_pipeline(sc, "match_datetime", lang = "en")

date_text <- "I saw him yesterday and he told me that he will visit us next week"

result <- nlp_annotate(pipeline, date_text)
map(result, unlist)

# Full annotations expose the matched spans and resolved dates
full_result <- nlp_annotate_full(pipeline, date_text)
jsonlite::toJSON(full_result, force = TRUE, auto_unbox = TRUE)

Sentiment Analysis

# Pretrained sentiment pipeline: produces a sentiment label per sentence.
pipeline <- nlp_pretrained_pipeline(sc, "analyze_sentiment", lang = "en")

sentiment_result <- nlp_annotate(
  pipeline,
  "The movie I watched today was not a good one"
)
unlist(sentiment_result$sentiment)


r-spark/sparknlp documentation built on Oct. 15, 2022, 10:50 a.m.