In news-r/nltk: Integration of the Python Natural Language Toolkit Library

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

# reticulate::use_python("/usr/bin/python3.6")
reticulate::use_virtualenv("./env", required = TRUE)

Natural Language Toolkit for R

Python's Natural Language Toolkit for R.

Installation

First install the package.

# install.packages("remotes")
remotes::install_github("news-r/nltk4r")

You are advised to make use of a virtual environment.

# replace with path of your choice
my_env <- "./env"

# create a virtual environment (tested on unix)
args <- paste("-m venv", my_env)
system2("python3", args) 

# force reticulate to use env
reticulate::use_virtualenv(my_env, required = TRUE)

# install gensim & scikit-learn in environment
nltk4r::install_nltk(my_env)

Then download the necessary datasets.

nltk4r::download_datasets("all")

Now you're set, you can import the library and get started.

Examples

This is a basic example which shows you how to solve a common problem:

library(nltk4r)

# from Wikipedia
str <- paste(
  "R is a programming language and free software environment",
  "for statistical computing and graphics supported by the R Foundation",
  "for Statistical Computing."
)

# tokenize
(tokens <- word_tokenize(str))

# Parts of speech
pos_tag(tokens, to_r = TRUE) # titdy R data structure
pos <- pos_tag(tokens)

# Identify named entity
chunks <- ne_chunk(pos)

# convert to text
txt <- nltk_text(tokens)

# generate
txt$generate()

Gender Classifier

Classify gender based on last letter in name using naive bayes classifier, from the book

Using last letter

# load data
first_names <- first_names(to_r = TRUE) 

# extract last letter as feature
gender_feature <- function(nms){
  nms <- substr(nms, nchar(nms), nchar(nms))
  purrr::map(nms, function(x){
    list(
      last_letter = x
    )
  })
}

features <- gender_feature(first_names$name)
feature_set <- purrr::map2(features, first_names$gender, function(g, l){
  list(
    g, l
  )
})

# split train test
train <- list()
test <- list()
for(i in 1:length(feature_set)){
  draw <- sample(1:2, 1)
  if(draw == 1)
    train <- append(train, list(feature_set[[i]]))
  else
    test <- append(test, list(feature_set[[i]]))
}

classifier <- train_bayes_classifier(train)
classifier$classify(gender_feature("Neo")[[1]])
classifier$classify(gender_feature("Sara")[[1]])

classify_accuracy(classifier, test)
classifier$show_most_informative_features(5L)

Using suffixes

The last letter of the name is not the best feature we can extract.

# load data
first_names <- first_names(to_r = TRUE) 

# extract last letter as feature
gender_feature <- function(nms){
  suffix1 <- substr(nms, nchar(nms)-2, nchar(nms))
  suffix2 <- substr(nms, nchar(nms)-3, nchar(nms))
  purrr::map2(suffix1, suffix2, function(x, y){
    list(
      suffix1 = x,
      suffix2 = y
    )
  })
}

features <- gender_feature(first_names$name)
feature_set <- purrr::map2(features, first_names$gender, function(g, l){
  list(
    g, l
  )
})

# split train test
train <- list()
test <- list()
for(i in 1:length(feature_set)){
  draw <- sample(1:2, 1)
  if(draw == 1)
    train <- append(train, list(feature_set[[i]]))
  else
    test <- append(test, list(feature_set[[i]]))
}

classifier <- train_bayes_classifier(train)
classifier$classify(gender_feature("Katheryn")[[1]])
classifier$classify(gender_feature("Mitch")[[1]])

classify_accuracy(classifier, test)
classifier$show_most_informative_features(5L)