# Chunk defaults used when knitting this README.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

# Alternative: point reticulate at a specific Python binary instead.
# reticulate::use_python("/usr/bin/python3.6")

# Use the project-local virtual environment; error if it is unavailable.
reticulate::use_virtualenv("./env", required = TRUE)
Python's Natural Language Toolkit for R.
First install the package.
# Requires the remotes package; uncomment the next line if it is missing.
# install.packages("remotes")

# Install nltk4r from its GitHub repository.
remotes::install_github(repo = "news-r/nltk4r")
You are advised to make use of a virtual environment.
# Path of the virtual environment; replace with a location of your choice.
my_env <- "./env"

# Create the virtual environment with `python3 -m venv` (tested on unix).
# Arguments are passed as a vector and the path is shell-quoted so that a
# location containing spaces still works.
venv_args <- c("-m", "venv", shQuote(my_env))
status <- system2("python3", venv_args)

# Stop early if Python could not create the environment, rather than
# failing later with a less obvious error.
if (status != 0L) {
  stop("Failed to create virtual environment at ", my_env, call. = FALSE)
}

# Force reticulate to use this environment.
reticulate::use_virtualenv(my_env, required = TRUE)

# Install NLTK into the environment.
nltk4r::install_nltk(my_env)
Then download the necessary datasets.
# Download the NLTK datasets; "all" presumably requests every available
# dataset (confirm against the nltk4r documentation).
nltk4r::download_datasets("all")
Now you're set: load the library and get started.
This basic example tokenizes a sentence, tags parts of speech, identifies named entities, and generates text:
library(nltk4r)

# Sample sentence (from Wikipedia).
sentence <- paste(
  c(
    "R is a programming language and free software environment",
    "for statistical computing and graphics supported by the R Foundation",
    "for Statistical Computing."
  ),
  collapse = " "
)

# Split the sentence into tokens and print them.
tokens <- word_tokenize(sentence)
tokens

# Parts of speech, returned as a tidy R data structure.
pos_tag(tokens, to_r = TRUE)

# Parts of speech again, without `to_r`, to pass on to `ne_chunk()`.
tagged <- pos_tag(tokens)

# Identify named entities.
entities <- ne_chunk(tagged)

# Convert the tokens to a text object.
text_obj <- nltk_text(tokens)

# Generate text from it.
text_obj$generate()
Classify gender based on the last letter of a name using a naive Bayes classifier, an example from the NLTK book.
# Load the first-names data; the code below relies on its `name` and
# `gender` columns. It is stored under its own name so that the
# `first_names()` function is not shadowed.
name_data <- first_names(to_r = TRUE)

# Build one feature list per name, holding the last letter of that name.
gender_feature <- function(nms) {
  last_letters <- substr(nms, nchar(nms), nchar(nms))
  purrr::map(last_letters, function(x) {
    list(last_letter = x)
  })
}

features <- gender_feature(name_data$name)

# Pair each feature list with its gender label.
feature_set <- purrr::map2(features, name_data$gender, function(g, l) {
  list(g, l)
})

# Randomly split into train and test sets, each observation having an equal
# chance of landing in either. A single vectorised draw avoids growing lists
# inside a loop and also copes with an empty feature set, where
# `1:length(x)` would iterate over c(1, 0).
in_train <- sample(c(TRUE, FALSE), length(feature_set), replace = TRUE)
train <- feature_set[in_train]
test <- feature_set[!in_train]

# Train the naive Bayes classifier on the training set.
classifier <- train_bayes_classifier(train)

# Classify two names that are not necessarily in the data.
classifier$classify(gender_feature("Neo")[[1]])
classifier$classify(gender_feature("Sara")[[1]])

# Evaluate on the held-out test set.
classify_accuracy(classifier, test)

# Show the five most informative features.
classifier$show_most_informative_features(5L)
The last letter of the name is not the best feature we can extract; the example below uses longer name suffixes instead.
# Load the first-names data; the code below relies on its `name` and
# `gender` columns. It is stored under its own name so that the
# `first_names()` function is not shadowed.
name_data <- first_names(to_r = TRUE)

# Build one feature list per name from two suffixes: the last three
# characters (`suffix1`) and the last four characters (`suffix2`). For
# shorter names `substr()` clamps the start to 1, giving the whole name.
gender_feature <- function(nms) {
  name_length <- nchar(nms)
  suffix1 <- substr(nms, name_length - 2, name_length)
  suffix2 <- substr(nms, name_length - 3, name_length)
  purrr::map2(suffix1, suffix2, function(x, y) {
    list(suffix1 = x, suffix2 = y)
  })
}

features <- gender_feature(name_data$name)

# Pair each feature list with its gender label.
feature_set <- purrr::map2(features, name_data$gender, function(g, l) {
  list(g, l)
})

# Randomly split into train and test sets, each observation having an equal
# chance of landing in either. A single vectorised draw avoids growing lists
# inside a loop and also copes with an empty feature set, where
# `1:length(x)` would iterate over c(1, 0).
in_train <- sample(c(TRUE, FALSE), length(feature_set), replace = TRUE)
train <- feature_set[in_train]
test <- feature_set[!in_train]

# Train the naive Bayes classifier on the training set.
classifier <- train_bayes_classifier(train)

# Classify two example names.
classifier$classify(gender_feature("Katheryn")[[1]])
classifier$classify(gender_feature("Mitch")[[1]])

# Evaluate on the held-out test set.
classify_accuracy(classifier, test)

# Show the five most informative features.
classifier$show_most_informative_features(5L)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.