inst/doc/vignette.R

## ----create_tmp_dirs----------------------------------------------------------
registry_tmp <- fs::path(tempdir(), "registry")
data_dir_tmp <- fs::path(tempdir(), "data_dir")

if (!file.exists(registry_tmp)){
  dir.create (registry_tmp)
} else {
  file.remove(list.files(registry_tmp, full.names = TRUE))
}
if (!file.exists(data_dir_tmp)) dir.create(data_dir_tmp)

## ----load_pkgs----------------------------------------------------------------
library(cwbtools)
library(data.table)

## ----run_tidytext, include = FALSE, echo = FALSE------------------------------
run_tidytext_example <- requireNamespace("tidytext") && requireNamespace("janeaustenr")

## ----message = FALSE, results = FALSE-----------------------------------------
austen_data_dir_tmp <- fs::path(data_dir_tmp, "austen")

if (!file.exists(austen_data_dir_tmp)) dir.create(austen_data_dir_tmp)
file.remove(list.files(austen_data_dir_tmp, full.names = TRUE))

## -----------------------------------------------------------------------------
Austen <- CorpusData$new()

## ----eval = run_tidytext_example----------------------------------------------
tbl <- tidytext::unnest_tokens(
  janeaustenr::austen_books(),
  word, text, to_lower = FALSE
)

Austen$tokenstream <- as.data.table(tbl)

## ----eval = run_tidytext_example----------------------------------------------
Austen$tokenstream[, stem := SnowballC::wordStem(Austen$tokenstream[["word"]], language = "english")]

## ----eval = run_tidytext_example----------------------------------------------
Austen$tokenstream[, cpos := 0L:(nrow(tbl) - 1L)]

## ----eval = run_tidytext_example----------------------------------------------
cpos_max_min <- function(x)
  list(cpos_left = min(x[["cpos"]]), cpos_right = max(x[["cpos"]]))
Austen$metadata <- Austen$tokenstream[, cpos_max_min(.SD), by = book]
Austen$metadata[, book := as.character(book)]
setcolorder(Austen$metadata, c("cpos_left", "cpos_right", "book"))

head(Austen$tokenstream)

## ----eval = run_tidytext_example----------------------------------------------
Austen$tokenstream[, book := NULL]
setcolorder(Austen$tokenstream, c("cpos", "word", "stem"))
Austen$tokenstream

## ----message = FALSE, eval = run_tidytext_example-----------------------------
Austen$encode(
   corpus = "AUSTEN",
   encoding = "utf8",
   p_attributes = c("word", "stem"),
   s_attributes = "book",
   registry_dir = registry_tmp,
   data_dir = austen_data_dir_tmp,
   method = "R",
   compress = FALSE,
   reload = TRUE
)

## ----eval = run_tidytext_example----------------------------------------------
ids <- RcppCWB::cl_str2id(
  corpus = "AUSTEN",
  p_attribute = "word",
  str = "pride",
  registry = registry_tmp
)

cpos <- RcppCWB::cl_id2cpos(
  corpus = "AUSTEN",
  p_attribute = "word",
  id = ids,
  registry = registry_tmp
)

length(cpos)

## ----get_unga_teifiles--------------------------------------------------------
teidir <- system.file(package = "cwbtools", "xml", "UNGA")
teifiles <- list.files(teidir, full.names = TRUE)
list.files(teidir)

## ----unga_create_dirs, message = FALSE,  results = FALSE----------------------
unga_data_dir_tmp <- fs::path(data_dir_tmp, "unga")
if (!file.exists(unga_data_dir_tmp)) dir.create(unga_data_dir_tmp)
file.remove(list.files(unga_data_dir_tmp, full.names = TRUE))

## ----unga_instantiate_cd------------------------------------------------------
UNGA <- CorpusData$new()
UNGA

## ----basetable, eval = TRUE---------------------------------------------------
metadata <- c(
  doc_lp = "//legislativePeriod",
  doc_session = "//titleStmt/sessionNo",
  doc_date = "//publicationStmt/date",
  doc_url = "//sourceDesc/url",
  doc_src = "//sourceDesc/filetype"
)
UNGA$import_xml(filenames = teifiles, meta = metadata)
UNGA

## ----cleaning, eval = TRUE----------------------------------------------------
to_keep <- which(is.na(UNGA$metadata[["speaker"]]))
UNGA$chunktable <- UNGA$chunktable[to_keep]
UNGA$metadata <- UNGA$metadata[to_keep][, speaker := NULL]

## ----dissect, eval = TRUE, message = FALSE, results = FALSE-------------------
UNGA$tokenize(lowercase = FALSE, strip_punct = FALSE)
UNGA

## -----------------------------------------------------------------------------
UNGA$tokenstream

## ----message = FALSE----------------------------------------------------------
UNGA$encode(
  registry_dir = registry_tmp,
  data_dir = unga_data_dir_tmp,
  corpus = "UNGA",
  encoding = "utf8",
  method = "R",
  p_attributes = "word",
  s_attributes = c(
    "doc_lp", "doc_session", "doc_date",
    "sp_who", "sp_state", "sp_role"
  ),
  compress = FALSE
)

## -----------------------------------------------------------------------------
id_peace <- RcppCWB::cl_str2id(
  corpus = "UNGA",
  p_attribute = "word",
  str = "peace",
  registry = registry_tmp
)
cpos_peace <- RcppCWB::cl_id2cpos(
  corpus = "UNGA",
  p_attribute = "word",
  id = id_peace,
  registry = registry_tmp
)

tab <- data.frame(
  i = unlist(lapply(1:length(cpos_peace), function(x) rep(x, times = 11))),
  cpos = unlist(lapply(cpos_peace, function(x) (x - 5):(x + 5)))
  )
tab[["id"]] <- RcppCWB::cl_cpos2id(
  corpus = "UNGA", p_attribute = "word",
  cpos = tab[["cpos"]], registry = registry_tmp
)
tab[["str"]] <- RcppCWB::cl_id2str(
  corpus = "UNGA", p_attribute = "word",
  id = tab[["id"]], registry = registry_tmp
)

peace_context <- split(tab[["str"]], as.factor(tab[["i"]]))
peace_context <- unname(sapply(peace_context, function(x) paste(x, collapse = " ")))
head(peace_context)

## ----message = FALSE, results = FALSE-----------------------------------------
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters.tm <- VCorpus(
  DirSource(reut21578),
  list(reader = readReut21578XMLasPlain)
)

## ----eval = requireNamespace("tidytext", quietly = TRUE)----------------------
library(tidytext)
reuters.tbl <- tidy(reuters.tm)
reuters.tbl

## ----eval = requireNamespace("tidytext", quietly = TRUE)----------------------
topics <- sapply(reuters.tbl[["topics_cat"]], paste, collapse = "|")
places <- sapply(reuters.tbl[["places"]], paste, collapse = "|")

reuters.tbl[["topics"]] <- topics
reuters.tbl[["places"]] <- places

## ----results = FALSE, eval = requireNamespace("tidytext", quietly = TRUE)-----
Reuters <- CorpusData$new()
reuters_data_dir_tmp <- fs::path(data_dir_tmp, "reuters")
if (!file.exists(reuters_data_dir_tmp)) dir.create(reuters_data_dir_tmp)
file.remove(list.files(reuters_data_dir_tmp, full.names = TRUE))

## ----eval = requireNamespace("tidytext", quietly = TRUE)----------------------
Reuters$chunktable <- data.table(reuters.tbl[, c("id", "text")])
Reuters$metadata <- data.table(reuters.tbl[,c("id", "topics", "places")])
Reuters

## ----message = FALSE, results = FALSE, eval = requireNamespace("tidytext", quietly = TRUE)----
Reuters$tokenize()

## ----eval = requireNamespace("tidytext", quietly = TRUE)----------------------
Reuters$tokenstream

## ----message = FALSE, eval = requireNamespace("tidytext", quietly = TRUE)-----
Reuters$encode(
   corpus = "REUTERS",
   encoding = "utf8",
   p_attributes = "word",
   s_attributes = c("topics", "places"),
   registry_dir = registry_tmp,
   data_dir = reuters_data_dir_tmp,
   method = "R",
   compress = FALSE
)

## ----eval = requireNamespace("tidytext", quietly = TRUE)----------------------
ids <- RcppCWB::cl_str2id(
  corpus = "REUTERS",
  p_attribute = "word",
  str = "oil",
  registry = registry_tmp
)

cpos <- RcppCWB::cl_id2cpos(
  corpus = "REUTERS",
  p_attribute = "word",
  id = ids,
  registry = registry_tmp
)

length(cpos)

## ----eval = requireNamespace("tidytext", quietly = TRUE)----------------------
reuters_size <- RcppCWB::attribute_size(
  corpus = "REUTERS",
  registry = registry_tmp,
  attribute = "word",
  attribute_type = "p"
)

ids <- RcppCWB::cl_cpos2id(
  corpus = "REUTERS",
  registry = registry_tmp,
  p_attribute = "word",
  cpos = 0L:(reuters_size - 1L)
)

token_stream <- RcppCWB::cl_id2str(
  corpus = "REUTERS",
  registry = registry_tmp,
  p_attribute = "word",
  id = ids
)

## ----eval = requireNamespace("SnowballC") && requireNamespace("tidytext")-----
stemmed <- SnowballC::wordStem(token_stream, language = "en")

## ----eval = requireNamespace("SnowballC") && requireNamespace("tidytext")-----
p_attribute_encode(
  token_stream = stemmed,
  p_attribute = "stem",
  encoding = "utf8",
  corpus = "REUTERS",
  registry_dir = registry_tmp,
  data_dir = reuters_data_dir_tmp,
  method = "R",
  verbose = TRUE,
  quietly = TRUE,
  compress = FALSE
)

## -----------------------------------------------------------------------------
unlink(registry_tmp, recursive = TRUE)
unlink(data_dir_tmp, recursive = TRUE)

Try the cwbtools package in your browser

Any scripts or data that you put into this service are public.

cwbtools documentation built on May 29, 2024, 2:45 a.m.