# Default knitr chunk options applied to every code chunk in this document.
knitr::opts_chunk$set(
  # Merge each chunk's source and its printed output into a single block.
  collapse = TRUE,
  # Do not show warnings or messages emitted by the chunks.
  warning = FALSE,
  message = FALSE,
  # Prefix every line of printed output with "#>".
  comment = "#>",
  # Write generated figures under man/figures/ with a "README-" file prefix.
  fig.path = "man/figures/README-"
)

scdv

An R package implementing the SCDV (Sparse Composite Document Vectors) algorithm.

Travis-CI Build Status

Installation

# The CRAN release is not available yet; once it is, install it with:
# install.packages("scdv")

# Until then, install the development version from GitHub
# (this needs the devtools package):
# install.packages("devtools")
devtools::install_github("teramonagi/scdv")

Example

Get sample data and pre-process it

library(scdv)

# Download two example documents from Project Gutenberg
# (http://www.gutenberg.org/wiki/Main_Page).
urls <- c(
  "http://www.gutenberg.org/files/98/98-0.txt",
  "http://www.gutenberg.org/files/1342/1342-0.txt"
)
# Fail fast on an HTTP error instead of silently tokenizing an error page,
# and always return the body as text rather than relying on httr's
# MIME-type based auto-parsing.
# NOTE(review): the files are assumed to be UTF-8 encoded -- confirm.
x <- purrr::map(urls, function(url) {
  response <- httr::stop_for_status(httr::GET(url))
  httr::content(response, as = "text", encoding = "UTF-8")
})

# Pre-process each document: split it into word tokens and drop English
# stop words. The stop-word list is built once and reused for every document.
en_stopwords <- stopwords::stopwords("en")
doc <- purrr::map(x, function(text) {
  tokenizers::tokenize_words(text, stopwords = en_stopwords)[[1]]
})
# Preview the first ten tokens of the first document; head() does not pad
# with NA when a document has fewer than ten tokens.
head(doc[[1]], 10)

Calculate SCDV (Sparse Composite Document Vectors)

# Hyper-parameters: the number of clusters (k) and the size of the
# word2vec embedding (dimension)
k <- 10
dimension <- 30
# Build the Sparse Composite Document Vectors for the tokenized documents
dv <- scdv::scdv(
  doc,
  k,
  dimension,
  word2vec_args = list(show_by = 25)
)

Calculate word embeddings with word2vec and visualize them

# Compute the word2vec embedding of the tokenized documents
wv <- scdv::word2vec(
  doc,
  dimension,
  args = list(show_by = 25)
)
# Draw 10 random rows of the embedding and plot them
sampled_rows <- sample(nrow(wv), size = 10)
scdv::visualize(wv[sampled_rows, ])
# The document vectors can be visualized in the same way:
#scdv::visualize(dv)


teramonagi/scdv documentation built on June 1, 2019, 3:58 a.m.