This notebook is adapted from the John Snow Labs workshop Jupyter/Python tutorial "3.SparkNLP_Pretrained_Models" (https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/3.SparkNLP_Pretrained_Models.ipynb).

Spark NLP offers pre-trained models in five languages (English, French, German, Italian, Russian). All you need to do is load a pre-trained model by name (it is downloaded and cached on disk the first time) and configure its parameters for your use case and dataset. There is no need to train a new model from scratch: the pre-trained, state-of-the-art models can be applied directly to your own data with transform().

The official documentation describes in detail which algorithms and datasets were used to train each of these models:

https://github.com/JohnSnowLabs/spark-nlp-models

LemmatizerModel

news_category_test <- pins::pin("https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_test.csv")
library(purrr, warn.conflicts = FALSE)
library(sparklyr, warn.conflicts = FALSE)
library(sparknlp, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)

version <- Sys.getenv("SPARK_VERSION", unset = "2.4.5")

config <- sparklyr::spark_config()
config$`sparklyr.shell.driver-memory` <- "8g"

options(sparklyr.sanitize.column.names.verbose = TRUE)
options(sparklyr.verbose = TRUE)
options(sparklyr.na.omit.verbose = TRUE)
options(sparklyr.na.action.verbose = TRUE)

sc <- sparklyr::spark_connect(master = "local", version = version, config = config)

cat("Apache Spark version: ", sc$home_version, "\n")
cat("Spark NLP version: ", nlp_version())
news_df <- spark_read_csv(sc, news_category_test) %>% 
  rename(text = description)

head(news_df %>% mutate(text = substr(text, 1, 50)), 20)
lemmatizer <- nlp_lemmatizer_pretrained(sc, name = "lemma_antbnc", input_cols = c("token"), output_col = "lemma")
file.info(list.files("~/cache_pretrained", full.names = TRUE)) %>% 
  select(mode, uname, grname, mtime)
documentAssembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document")
tokenizer <- nlp_tokenizer(sc, input_cols = c("document"), output_col = "token")
stemmer <- nlp_stemmer(sc, input_cols = c("token"), output_col = "stem")

nlpPipeline <- ml_pipeline(documentAssembler, tokenizer, stemmer, lemmatizer)

# pretrained annotators need no training data, so we fit the pipeline on an empty data frame
empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)
result <- ml_transform(pipelineModel, news_df)

result %>% 
  mutate(text = substr(text, 1, 50),
         document = substr(to_json(document), 1, 30),
         token = substr(to_json(token), 1, 30),
         stem = substr(to_json(stem), 1, 30),
         lemma = substr(to_json(lemma), 1, 30)) %>% 
  head(5)
result %>% 
  mutate(token_result = substr(to_json(token.result), 1, 50), lemma_result = substr(to_json(lemma.result), 1, 50)) %>% 
  select(token_result, lemma_result) %>% 
  head(5)
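Note the difference between the two normalizations above: the stemmer truncates tokens with heuristic rules (e.g. "studies" -> "studi"), while the lemmatizer maps each token to its dictionary form ("studies" -> "study").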

PerceptronModel (POS - Part-of-speech tags)

pos <- nlp_perceptron_pretrained(sc, name = "pos_anc", input_cols = c("document", "token"), output_col = "pos")
file.info(list.files("~/cache_pretrained", full.names = TRUE)) %>% 
  select(mode, uname, grname, mtime)
nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           stemmer,
                           lemmatizer,
                           pos)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)
result <- ml_transform(pipelineModel, news_df)

result %>% 
  mutate(text = substr(text, 1, 50),
         document = substr(to_json(document), 1, 30),
         token = substr(to_json(token), 1, 30),
         stem = substr(to_json(stem), 1, 30),
         lemma = substr(to_json(lemma), 1, 30),
         pos = substr(to_json(pos), 1, 30)) %>% 
  head(5)
result %>%
  mutate(token_result = substr(to_json(token.result), 1, 50),
         pos_result = substr(to_json(pos.result), 1, 50)) %>% 
  select(token_result, pos_result) %>% 
  head(5)
# applying this pipeline to the top 100 rows and then collecting to a local data frame

result <- ml_transform(pipelineModel, head(news_df, 100))

result_df <- result %>% 
  mutate(cols = explode(arrays_zip(token.result, token.begin, token.end, stem.result, lemma.result, pos.result))) %>%
  select(cols) %>% 
  sdf_separate_column(column = "cols", into = c("token", "begin", "end", "stem", "lemma", "pos")) %>% 
  select(-cols) %>%
  collect()

head(result_df, 10)
# the same pipeline as a LightPipeline, which annotates a plain string locally
# and returns a named list keyed by the output columns

light_model <- nlp_light_pipeline(pipelineModel)

light_result <- nlp_annotate(light_model, 'Unions representing workers at Turner   Newall say they are disappointed after talks with stricken parent firm Federal Mogul.')

data.frame(token = unlist(light_result$token),
           stem = unlist(light_result$stem),
           lemma = unlist(light_result$lemma),
           pos = unlist(light_result$pos))
# applying POS chunker to find a custom pattern

chunker <- nlp_chunker(sc, input_cols = c("document", "pos"), output_col = "chunk", regex_parsers = c("<NNP>+", "<DT>?<JJ>*<NN>"))

# NNP: Proper Noun
# NN: Common Noun
# DT: Determiner (e.g. "the")
# JJ: Adjective

nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           stemmer,
                           lemmatizer,
                           pos,
                           chunker)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)
result <- ml_transform(pipelineModel, head(news_df, 100))

result %>% 
  mutate(text = substr(text, 1, 50),
         document = substr(to_json(document), 1, 30),
         token = substr(to_json(token), 1, 30),
         stem = substr(to_json(stem), 1, 30),
         lemma = substr(to_json(lemma), 1, 30),
         pos = substr(to_json(pos), 1, 30),
         chunk = substr(to_json(chunk), 1, 30)) %>% 
  head(5)
result_df <- result %>% 
  mutate(cols = explode(arrays_zip(chunk.result, chunk.begin, chunk.end))) %>% 
  select(cols) %>% 
  sdf_separate_column(column = "cols", into = c("chunk", "begin", "end")) %>% 
  select(-cols) %>% 
  collect()

head(result_df, 10)
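To see concretely what these patterns match, here is a minimal sketch reusing the LightPipeline pattern from the previous section (the sentence is illustrative, not from the dataset):

light_model <- nlp_light_pipeline(pipelineModel)

light_result <- nlp_annotate(light_model, "The little yellow dog barked at John Snow.")

# "The little yellow dog" should match <DT>?<JJ>*<NN>; "John Snow" should match <NNP>+
unlist(light_result$chunk)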

Dependency Parser

dep_parser <- nlp_dependency_parser_pretrained(sc, name = "dependency_conllu", 
                                               input_cols = c("document", "pos", "token"), output_col = "dependency")
typed_dep_parser <- nlp_typed_dependency_parser_pretrained(sc, name = "dependency_typed_conllu",
                                                           input_cols = c("token", "pos", "dependency"), output_col = "dependency_type")
nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           stemmer,
                           lemmatizer,
                           pos,
                           dep_parser,
                           typed_dep_parser)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)
result <- ml_transform(pipelineModel, head(news_df, 100))

result_df <- result %>% 
  mutate(cols = explode(arrays_zip(token.result, token.begin, token.end, dependency.result, dependency_type.result))) %>% 
  select(cols) %>% 
  sdf_separate_column(column = "cols", into = c("token", "begin", "end", "dependency", "dependency_type")) %>% 
  select(-cols) %>% 
  collect()

head(result_df, 10)
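The same output for a single sentence, as a sketch using the LightPipeline pattern shown earlier:

light_model <- nlp_light_pipeline(pipelineModel)

light_result <- nlp_annotate(light_model, "Unions representing workers say they are disappointed.")

data.frame(token = unlist(light_result$token),
           dependency = unlist(light_result$dependency),
           dependency_type = unlist(light_result$dependency_type))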

SpellChecker

spell_checker <- nlp_norvig_spell_checker_pretrained(sc, name = "spellcheck_norvig", input_cols = c("token"), output_col = "corrected")
nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           stemmer,
                           lemmatizer,
                           pos,
                           spell_checker)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)
result <- ml_transform(pipelineModel, head(news_df, 100))
result_df <- result %>% 
  mutate(cols = explode(arrays_zip(token.result, corrected.result, stem.result, lemma.result, pos.result))) %>% 
  select(cols) %>% 
  sdf_separate_column(column = "cols", into = c("token", "corrected", "stem", "lemma", "pos")) %>% 
  select(-cols) %>% 
  collect()

head(result_df, 10)
# same in LightPipeline

light_model <- nlp_light_pipeline(pipelineModel)

light_result <- nlp_annotate(light_model, "The patint has pain and headache")

data.frame(token = unlist(light_result$token), corrected = unlist(light_result$corrected))

Word Embeddings (GloVe)

glove_embeddings <- nlp_word_embeddings_pretrained(sc, name = "glove_100d", 
                                                   input_cols = c("document", "token"), output_col = "embeddings")
nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           glove_embeddings)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)
result <- ml_transform(pipelineModel, head(news_df, 10))
result %>% mutate(embeddings = to_json(embeddings.embeddings)) %>% 
  select(embeddings) %>% 
  head(1)
result_df <- result %>% 
  mutate(cols = explode(arrays_zip(token.result, embeddings.embeddings))) %>% 
  select(cols) %>% 
  sdf_separate_column(column = "cols", into = c("token", "embeddings")) %>% 
  select(-cols)

head(result_df, 10)
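As a quick sanity check, each token vector produced by glove_100d should have 100 dimensions. A sketch, assuming the array column collects to a list of numeric vectors:

first_row <- result_df %>% head(1) %>% collect()
length(first_row$embeddings[[1]])  # expected: 100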

ELMo Embeddings

elmo_embeddings <- nlp_elmo_embeddings_pretrained(sc, name = "elmo", input_cols = c("document", "token"), output_col = "embeddings")
nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           elmo_embeddings)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)

result <- ml_transform(pipelineModel, head(news_df, 10))

result_df <- result %>% 
  mutate(cols = explode(arrays_zip(token.result, embeddings.embeddings))) %>% 
  select(cols) %>% 
  sdf_separate_column(column = "cols", into = c("token", "elmo_embeddings")) %>% 
  select(-cols)

head(result_df, 10)

BERT Embeddings

bert_embeddings <- nlp_bert_embeddings_pretrained(sc, name = "bert_base_uncased", input_cols = c("document", "token"), output_col = "embeddings")
nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           bert_embeddings)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)

result <- ml_transform(pipelineModel, head(news_df, 10))

result_df <- result %>% 
  mutate(cols = explode(arrays_zip(token.result, embeddings.embeddings))) %>% 
  select(cols) %>% 
  sdf_separate_column(column = "cols", into = c("token", "bert_embeddings")) %>% 
  select(-cols)

head(result_df, 10)

UniversalSentenceEncoder

use_embeddings <- nlp_univ_sent_encoder_pretrained(sc, name = "tfhub_use", input_cols = c("document"), output_col = "sentence_embeddings")
nlpPipeline <- ml_pipeline(documentAssembler,
                           use_embeddings)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)

result <- ml_transform(pipelineModel, head(news_df, 10))

result_df <- result %>% 
  mutate(cols = explode(arrays_zip(document.result, sentence_embeddings.embeddings))) %>% 
  select(cols) %>% 
  sdf_separate_column(column = "cols", into = c("document", "USE_embeddings")) %>% 
  select(-cols)

head(result_df, 10)

Loading models from local disk

list.files("~/cache_pretrained/", full.names = TRUE)
glove_embeddings <- ml_load(sc, "/home/davek/cache_pretrained/glove_100d_en_2.4.0_2.4_1579690104032")

nlp_set_input_cols(glove_embeddings, c("document", "token"))
nlp_set_output_col(glove_embeddings, "glove_embeddings")
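Once its input and output columns are set, the locally loaded model drops into a pipeline exactly like its pretrained counterpart. A minimal sketch:

nlpPipeline <- ml_pipeline(documentAssembler, tokenizer, glove_embeddings)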

Using your own word embeddings in Spark NLP

custom_embeddings <- nlp_word_embeddings(sc, input_cols = c("sentence", "token"), output_col = "my_embeddings",
                                         storage_path = "PubMed-shuffle-win-2.bin", storage_path_format = "BINARY",
                                         dimension = 200)
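The file given as storage_path (here word2vec binaries trained on PubMed) must exist locally. The annotator is then used like any other embeddings stage; a sketch, assuming nlp_sentence_detector() supplies the sentence column this annotator expects:

sentence_detector <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence")

nlpPipeline <- ml_pipeline(documentAssembler,
                           sentence_detector,
                           tokenizer,
                           custom_embeddings)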

Getting Sentence Embeddings from GloVe, ELMo and BERT

glove_embeddings <- nlp_word_embeddings_pretrained(sc, name = "glove_100d", input_cols = c("document", "token"), output_col = "embeddings")

embeddingSentence <- nlp_sentence_embeddings(sc, input_cols = c("document", "embeddings"), 
                                             output_col = "sentence_embeddings", pooling_strategy = "AVERAGE")

nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           glove_embeddings,
                           embeddingSentence)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)

result <- ml_transform(pipelineModel, head(news_df, 10))

result_df <- result %>% 
  mutate(cols = explode(arrays_zip(document.result, sentence_embeddings.embeddings))) %>% 
  select(cols) %>% 
  sdf_separate_column(column = "cols", into = c("document", "sentence_embeddings")) %>% 
  select(-cols)

head(result_df, 10)
# cosine distance between two vectors: 1 minus the cosine similarity
cosine_dist <- function(x, y) {
  1 - (x %*% y / sqrt((x %*% x) * (y %*% y)))
}

embeddings <- result_df %>%
  select(sentence_embeddings) %>% 
  head(2) %>% 
  collect()

v1 <- embeddings$sentence_embeddings[[1]]
v2 <- embeddings$sentence_embeddings[[2]]

# cosine similarity between the first two sentences
1 - cosine_dist(v1, v2)[1,1]

# comparing the first sentence with itself gives a similarity of exactly 1
v2 <- embeddings$sentence_embeddings[[1]]

1 - cosine_dist(v1, v2)[1,1]

NerDL Model

Public NER (CoNLL 2003)

Entities

PERSON, LOCATION, ORGANIZATION, MISC

public_ner <- nlp_ner_dl_pretrained(sc, name = "ner_dl", input_cols = c("document", "token", "embeddings"), output_col = "ner")
documentAssembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document")

tokenizer <- nlp_tokenizer(sc, input_cols = c("document"), output_col = "token")

# the ner_dl model was trained with glove_100d, so we use the same embeddings in the pipeline
glove_embeddings <- nlp_word_embeddings_pretrained(sc, name = "glove_100d", 
                                                   input_cols = c("document", "token"), output_col = "embeddings")

nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           glove_embeddings,
                           public_ner)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)
result <- ml_transform(pipelineModel, head(news_df, 10))

result_df <- result %>% 
  mutate(cols = explode(arrays_zip(token.result, ner.result))) %>% 
  sdf_separate_column(column = "cols", into = c("tokens", "ner_label")) %>% 
  select(tokens, ner_label)

head(result_df, 50)

NerDL OntoNotes 100D

Entities

'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART'

onto_ner <- nlp_ner_dl_pretrained(sc, name = "onto_100", input_cols = c("document", "token", "embeddings"), output_col = "ner")

nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           glove_embeddings,
                           onto_ner)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)
result <- ml_transform(pipelineModel, head(news_df, 10))

result_df <- result %>% 
  mutate(cols = explode(arrays_zip(token.result, ner.result))) %>% 
  sdf_separate_column(column = "cols", into = c("tokens", "ner_label")) %>% 
  select(tokens, ner_label)

head(result_df, 50)

Getting the NER chunks with NER Converter

ner_converter <- nlp_ner_converter(sc, input_cols = c("document", "token", "ner"), output_col = "ner_chunk")

nlpPipeline <- ml_pipeline(documentAssembler,
                           tokenizer,
                           glove_embeddings,
                           onto_ner,
                           ner_converter)

empty_df <- sdf_copy_to(sc, data.frame(text = ""))

pipelineModel <- ml_fit(nlpPipeline, empty_df)

result <- ml_transform(pipelineModel, head(news_df, 10))
result_df <- result %>% 
  mutate(cols = explode(arrays_zip(ner_chunk.result, ner_chunk.metadata))) %>% 
  sdf_separate_column(column = "cols", into = c("chunk", "ner_label")) %>% 
  mutate(ner_label = ner_label.entity) %>% 
  select(chunk, ner_label)

head(result_df, 50)
# fullAnnotate in LightPipeline returns the complete annotation objects (results plus metadata)

light_model <- nlp_light_pipeline(pipelineModel)

light_result <- nlp_annotate_full(light_model, 'Unions representing workers at Turner Newall say they are disappointed after talks with stricken parent firm Federal Mogul and Mike Fitzpatrick in Canada.')

chunks <- map_chr(light_result$ner_chunk, "result")
entities <- map_chr(light_result$ner_chunk, function(x) x$metadata$entity)

data.frame(chunks = chunks, entities = entities)
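fullAnnotate also carries the character offsets of each chunk; a sketch, assuming each annotation exposes begin and end fields alongside result and metadata:

begins <- map_int(light_result$ner_chunk, "begin")
ends <- map_int(light_result$ner_chunk, "end")

data.frame(chunks = chunks, entities = entities, begin = begins, end = ends)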

