This notebook is adapted from John Snow Labs workshop Jupyter/Python tutorial "extractor.ipynb" (https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/text-matcher-pipeline/extractor.ipynb)
In the following example, we walk through our straightforward TextMatcher annotator.
This annotator will take a list of sentences from a text file and look them up in the given target dataset.
This annotator is an Annotator Model and hence does not require training.
library(sparklyr) library(sparknlp) library(dplyr)
# Connect to a local Spark cluster. The Spark version is taken from the
# SPARK_VERSION environment variable, defaulting to 2.4.0.
spark_version <- Sys.getenv("SPARK_VERSION", unset = "2.4.0")

config <- sparklyr::spark_config()
# Uncomment to raise the driver memory if the default is not enough:
# config$`sparklyr.shell.driver-memory` <- "8g"

# Turn on verbose sparklyr diagnostics for this walkthrough.
options(sparklyr.sanitize.column.names.verbose = TRUE)
options(sparklyr.verbose = TRUE)
options(sparklyr.na.omit.verbose = TRUE)
options(sparklyr.na.action.verbose = TRUE)

sc <- sparklyr::spark_connect(
  master = "local[*]",
  version = spark_version,
  config = config
)
# Build the NLP pipeline:
#   raw text -> document -> sentences / tokens -> matched entities ->
#   finished (plain string) output.
document_assembler <- nlp_document_assembler(
  sc,
  input_col = "text",
  output_col = "document"
)

sentence_detector <- nlp_sentence_detector(
  sc,
  input_cols = c("document"),
  output_col = "sentence"
)

tokenizer <- nlp_tokenizer(
  sc,
  input_cols = c("document"),
  output_col = "token"
)

# Looks up the phrases listed in entities.txt within the token and
# sentence annotations; no training is required for this annotator.
extractor <- nlp_text_matcher(
  sc,
  input_cols = c("token", "sentence"),
  output_col = "entities",
  path = "entities.txt"
)

# Strip annotation structure and metadata, leaving plain matched strings.
finisher <- nlp_finisher(
  sc,
  input_cols = "entities",
  include_metadata = FALSE,
  clean_annotations = TRUE
)

pipeline <- ml_pipeline(
  document_assembler,
  sentence_detector,
  tokenizer,
  extractor,
  finisher
)
# Download and unpack the example sentiment dataset into a temp directory.
tdir <- tempdir()
zip_path <- file.path(tdir, "sentiment.parquet.zip")

# mode = "wb" is required when downloading binary files: without it,
# download.file() on Windows applies text-mode newline translation and
# corrupts the zip archive.
download.file(
  "https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment.parquet.zip",
  destfile = zip_path,
  mode = "wb"
)
unzip(zip_path, exdir = tdir)
# Register the parquet data with Spark and keep only the first 1000 rows
# for this example, then preview the first 20.
sentiment_tbl <- spark_read_parquet(
  sc,
  "sentiment",
  file.path(tdir, "sentiment.parquet")
)
data <- sentiment_tbl %>% head(1000)
head(data, n = 20)
# Fit the pipeline on the data. The TextMatcher is rule-based (it reads
# its phrases from entities.txt), so fitting is fast.
print("Start fitting")
model <- ml_fit(pipeline, data)
print("Fitting is ended")
# Apply the fitted model, then keep only rows where at least one entity
# was matched (finished_entities is a non-empty array).
extracted <- ml_transform(model, data)
extracted
extracted %>% filter(size(finished_entities) != 0)
# Persist the fitted pipeline model to disk. overwrite = TRUE lets this
# notebook be re-run without erroring because "./extractor.model" already
# exists from a previous run.
ml_save(model, "./extractor.model", overwrite = TRUE)
# Reload the saved model and confirm it yields the same matched entities.
same_model <- ml_load(sc, "./extractor.model")
same_model %>%
  ml_transform(data) %>%
  filter(size(finished_entities) != 0)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.