This notebook is adapted from the John Snow Labs workshop Jupyter/Python tutorial "1. Spark NLP Basics" (https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/1.SparkNLP_Basics.ipynb).
# Setup ----
# Attach the packages used throughout the notebook.
library(purrr, warn.conflicts = FALSE)
library(sparklyr, warn.conflicts = FALSE)
library(sparknlp, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)

# The Spark version is read from the SPARK_VERSION environment variable and
# falls back to "2.4.5" when that variable is unset.
version <- Sys.getenv("SPARK_VERSION", unset = "2.4.5")

config <- sparklyr::spark_config()

# Switch on sparklyr's verbose options before connecting.
options(sparklyr.sanitize.column.names.verbose = TRUE)
options(sparklyr.verbose = TRUE)
options(sparklyr.na.omit.verbose = TRUE)
options(sparklyr.na.action.verbose = TRUE)

# Open a local Spark connection; `sc` is used by every cell below.
sc <- sparklyr::spark_connect(
  master = "local",
  version = version,
  config = config
)

# Report the versions in use.
cat("Apache Spark version: ", sc$home_version, "\n")
cat("Spark NLP version: ", nlp_version())
https://github.com/JohnSnowLabs/spark-nlp-models
# Explain Document ML ----
# Sample text used by the pipelines below; note that it contains misspelled
# words such as "persn", "intersting" and "brothrs".
testDoc <- "Peter is a very good persn. My life in Russia is very intersting. John and Peter are brothrs. However they don't support each other that much. Lucas Nogal Dunbercker is no longer happy. He has a good car though. Europe is very culture rich. There are huge churches! and big houses!"

# Load the pretrained "explain_document_ml" pipeline (English).
pipeline <- nlp_pretrained_pipeline(
  sc,
  "explain_document_ml",
  lang = "en"
)

# Annotate the sample text and report how long the call takes.
system.time({
  result <- nlp_annotate(pipeline, testDoc)
})

# Names of the annotation outputs returned by the pipeline.
names(result)

# Sentences and tokens as flat vectors.
unlist(result$sentence)
unlist(result$token)

# Token / part-of-speech table.
data.frame(
  token = unlist(result$token),
  pos = unlist(result$pos)
)

# Token table with lemmas, stems and the spell-checker output.
data.frame(
  token = unlist(result$token),
  lemmas = unlist(result$lemmas),
  stems = unlist(result$stems),
  spell = unlist(result$spell)
)
# Explain Document DL ----
# Load the pretrained "explain_document_dl" pipeline (English).
pipeline_dl <- nlp_pretrained_pipeline(
  sc,
  "explain_document_dl",
  lang = "en"
)

# Annotate the sample text with this pipeline, timing the call.
system.time({
  result <- nlp_annotate(pipeline_dl, testDoc)
})

# Names of the annotation outputs returned by the pipeline.
names(result)

# Entity chunks as a flat vector.
unlist(result$entities)

# One row per token, combining the per-token outputs of the pipeline.
data.frame(
  token = unlist(result$token),
  ner_label = unlist(result$ner),
  spell_corrected = unlist(result$checked),
  POS = unlist(result$pos),
  lemmas = unlist(result$lemma),
  stems = unlist(result$stem)
)
# Spell checker pipeline ----
# Load the pretrained "check_spelling" pipeline (English).
spell_checker <- nlp_pretrained_pipeline(sc, "check_spelling", lang = "en")

# Annotate the sample text and list the outputs that were returned.
result <- nlp_annotate(spell_checker, testDoc)
names(result)

# Original tokens next to the spell checker's output.
data.frame(
  token = unlist(result$token),
  checked = unlist(result$checked)
)
# Annotating several documents at once ----
# A character vector of three short texts (they contain misspellings such as
# "pioner", "wrate" and "aircrast").
testDoc_list <- c(
  "French author who helped pioner the science-fiction genre.",
  "Verne wrate about space, air, and underwater travel before navigable aircrast",
  "Practical submarines were invented, and before any means of space travel had been devised."
)
testDoc_list

# Annotate the whole vector in one call and show the length of the result.
result_list <- nlp_annotate(pipeline, testDoc_list)
length(result_list)

# Inspect the first element of the result.
result_list[[1]]
# Full annotations ----
text <- "Peter Parker is a nice guy and lives in New York"

# nlp_annotate_full() output is accessed below through the `result`, `begin`,
# `end` and `metadata` fields of each annotation.
detailed_result <- nlp_annotate_full(pipeline_dl, text)

# Show the full structure as JSON.
jsonlite::toJSON(detailed_result, force = TRUE, auto_unbox = TRUE)

detailed_result$entities

# Entity chunks with their entity label taken from the annotation metadata.
chunks <- purrr::map_chr(detailed_result$entities, "result")
entities <- purrr::map_chr(
  detailed_result$entities,
  function(x) x$metadata$entity
)
df <- data.frame(chunks = chunks, entities = entities)
df

# Token-level table: sentence id, token text, character offsets, POS and NER.
sent_ids <- purrr::map_chr(
  detailed_result$token,
  function(x) x$metadata$sentence
)
tokens <- purrr::map_chr(detailed_result$token, "result")
# Offsets are extracted with map_int() and an explicit as.integer() instead of
# relying on map_chr() to coerce numbers to character (deprecated since
# purrr 1.0.0); this also keeps the start/end columns numeric.
starts <- purrr::map_int(detailed_result$token, function(x) as.integer(x$begin))
ends <- purrr::map_int(detailed_result$token, function(x) as.integer(x$end))
pos <- purrr::map_chr(detailed_result$pos, "result")
ner <- purrr::map_chr(detailed_result$ner, "result")
df <- data.frame(
  sent_id = sent_ids,
  token = tokens,
  start = starts,
  end = ends,
  pos = pos,
  ner = ner
)
df
Pipeline:
The pipeline uses the regex `<DT>?<JJ>*<NN>+`,
which states that whenever the chunker finds an optional determiner (DT) followed by any number of adjectives (JJ) and then one or more nouns (NN), a noun phrase (NP) chunk should be formed.
# Chunk matching ----
# Load the pretrained "match_chunks" pipeline (English); this reassigns
# `pipeline`.
pipeline <- nlp_pretrained_pipeline(
  sc,
  "match_chunks",
  lang = "en"
)

# single noun phrase
result <- nlp_annotate(pipeline, "The book has many chapters")
purrr::map(result, unlist)
result$chunk

# Multiple noun phrases
result <- nlp_annotate(pipeline, "the little yellow dog barked at the cat")
purrr::map(result, unlist)
unlist(result$chunk)
# Date/time matching ----
# Load the pretrained "match_datetime" pipeline (English); this reassigns
# `pipeline`.
pipeline <- nlp_pretrained_pipeline(sc, "match_datetime", lang = "en")

# The same sentence is annotated twice below, so keep it in one place.
datetime_text <-
  "I saw him yesterday and he told me that he will visit us next week"

# Plain annotation, flattened per output.
result <- nlp_annotate(pipeline, datetime_text)
map(result, unlist)

# Full annotation of the same sentence, shown as JSON.
full_result <- nlp_annotate_full(pipeline, datetime_text)
jsonlite::toJSON(full_result, force = TRUE, auto_unbox = TRUE)
# Sentiment analysis ----
# Load the pretrained "analyze_sentiment" pipeline (English); this reassigns
# `pipeline`.
pipeline <- nlp_pretrained_pipeline(sc, "analyze_sentiment", lang = "en")

# Annotate one sentence and show the sentiment output as a flat vector.
result <- nlp_annotate(pipeline, "The movie I watched today was not a good one")
unlist(result$sentiment)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.