CorpusData | R Documentation |
Manage Corpus Data and Encode CWB Corpus.
Manage Corpus Data and Encode CWB Corpus.
chunktable
A data.table
with column "id" (unique values),
columns with metadata, and a column with text chunks.
tokenstream
A data.table
with a column "cpos" (corpus position), and columns
with positional attributes, such as "word", "lemma", "pos", "stem".
metadata
A data.table
with a column "id", to link data with chunks/tokenstream,
columns with document-level metadata, and the columns "cpos_left" and "cpos_right", which can
be generated using method $add_corpus_positions()
.
sentences
A data.table
.
named_entities
A data.table
.
new()
Initialize a new instance of class CorpusData
.
CorpusData$new()
A class CorpusData
object.
print()
Print summary of CorpusData
object.
CorpusData$print()
tokenize()
Simple tokenization of text in chunktable.
CorpusData$tokenize(..., verbose = TRUE, progress = TRUE)
...
Arguments that are passed into tokenizers::tokenize_words()
.
verbose
Logical, whether to be verbose.
progress
Logical, whether to show progress bar.
import_xml()
CorpusData$import_xml( filenames, body = "//body", meta = NULL, mc = NULL, progress = TRUE )
filenames
A character vector with the paths of the XML files to import.
body
An xpath expression defining the body of the xml document.
meta
A named character vector with xpath expressions.
mc
A numeric/integer value, number of cores to use.
progress
Logical, whether to show progress bar.
Import XML files.
The CorpusData
object is returned invisibly.
add_corpus_positions()
Add column cpos
to tokenstream and columns cpos_left
and
cpos_right
to metadata.
CorpusData$add_corpus_positions(verbose = TRUE)
verbose
Logical, whether to be verbose.
purge()
Remove patterns from the chunktable that are known to cause problems. This is done most efficiently at the chunktable level of data preparation, as the character vector to handle is much shorter than after tokenization/annotation has been performed.
CorpusData$purge( replacements = list(c("^\\s*<.*?>\\s*$", ""), c("’", "'")) )
replacements
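A list of length-two character vectors, each consisting of a pattern to match (first element) and the string to replace it with (second element).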
encode()
Encode corpus. If the corpus already exists, it will be removed.
CorpusData$encode( corpus, p_attributes = "word", s_attributes = NULL, encoding, registry_dir = Sys.getenv("CORPUS_REGISTRY"), data_dir = NULL, method = c("R", "CWB"), verbose = TRUE, compress = FALSE )
corpus
The name of the CWB corpus.
p_attributes
Positional attributes.
s_attributes
Columns that will be encoded as structural attributes.
encoding
Encoding/charset of the CWB corpus.
registry_dir
Corpus registry, the directory where registry files are stored.
data_dir
Directory where to create directory for indexed corpus files.
method
Either "R" or "CWB".
verbose
Logical, whether to be verbose.
compress
Logical, whether to compress corpus.
clone()
The objects of this class are cloneable with this method.
CorpusData$clone(deep = FALSE)
deep
Whether to make a deep clone.
library(RcppCWB)
library(data.table)
# This example uses the "R" method to write the corpus data to disk. There is
# also a method "CWB" that relies on the CWB tools to generate the indexed
# corpus. The CWB can be downloaded and installed within the package by calling
# cwb_install().

# Set up a temporary registry so that the REUTERS data shipped with the RcppCWB
# package can be used.
rcppcwb_registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
registry_tmp <- fs::path(tempdir(), "registry")
if (!dir.exists(registry_tmp)) {
  dir.create(registry_tmp)
}

# Parse the packaged registry file, point its home directory at the indexed
# corpus inside the installed RcppCWB package, and write the adjusted registry
# file to the temporary registry.
reuters_home <- system.file(
  package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "reuters"
)
reuters_registry <- registry_file_parse("REUTERS", registry_dir = rcppcwb_registry)
reuters_registry[["home"]] <- reuters_home
registry_file_write(reuters_registry, corpus = "REUTERS", registry_dir = registry_tmp)
# Decode the structural attribute 'places' from the indexed corpus files of the
# REUTERS corpus shipped with RcppCWB (method "R", i.e. without CWB tools).
s_attrs_places <- RcppCWB::s_attribute_decode(
  corpus = "REUTERS",
  data_dir = system.file(package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "reuters"),
  s_attribute = "places", method = "R"
)
# Add a running id per row. seq_len() yields an empty vector for a zero-row
# table, whereas 1L:nrow(x) would yield c(1, 0).
s_attrs_places[["id"]] <- seq_len(nrow(s_attrs_places))
# Rename the decoded "value" column after the attribute it holds.
setnames(s_attrs_places, old = "value", new = "places")
# Decode the positional attribute 'word' for every region in s_attrs_places.
#
# The first two columns of s_attrs_places are used as the left and right corpus
# position of a region (presumably "cpos_left" and "cpos_right" -- confirm
# against the return value of RcppCWB::s_attribute_decode()).
#
# Map() is used instead of apply(): apply() coerces the table to a character
# matrix (because of the "places" column) and may simplify its result to a
# matrix when all regions have the same length. Map() always returns a list
# with one character vector of tokens per region.
tokens <- Map(
  function(cpos_left, cpos_right) {
    ids <- cl_cpos2id(
      corpus = "REUTERS", cpos = cpos_left:cpos_right,
      p_attribute = "word", registry = registry_tmp
    )
    cl_id2str(corpus = "REUTERS", id = ids, p_attribute = "word", registry = registry_tmp)
  },
  s_attrs_places[[1L]],
  s_attrs_places[[2L]]
)
# One row per token; "id" is the index of the region the token belongs to and
# links the tokenstream with the rows of s_attrs_places.
tokenstream <- rbindlist(
  lapply(
    seq_along(tokens),
    function(i) data.table(id = i, word = tokens[[i]])
  )
)
# Corpus positions are zero-based. seq_len() keeps this correct for an empty
# tokenstream, where 0L:(n - 1L) would yield c(0, -1).
tokenstream[["cpos"]] <- seq_len(nrow(tokenstream)) - 1L
# Create the CorpusData object and fill in the tokenstream and the metadata
# (see vignette for further explanation).
CD <- CorpusData$new()
CD$tokenstream <- as.data.table(tokenstream)
CD$metadata <- as.data.table(s_attrs_places)
# Remove the temporary registry, whose home dir still points to the RcppCWB
# data dir, to prevent that data from being deleted.
file.remove(fs::path(registry_tmp, "reuters"))
# unlink() removes directories on all platforms; file.remove() only does so for
# empty directories on most Unix platforms and fails on Windows.
unlink(registry_tmp, recursive = TRUE)
# Create temporary directories: a registry directory and a data directory for
# the indexed corpus files. (The previously computed but unused, normalized
# `tmpdir` has been dropped; both paths are derived from tempdir() directly.)
registry_tmp <- fs::path(tempdir(), "registry")
data_dir_tmp <- fs::path(tempdir(), "data_dir")
if (!dir.exists(registry_tmp)) dir.create(registry_tmp)
if (!dir.exists(data_dir_tmp)) dir.create(data_dir_tmp)

# Encode the tokenstream ("word") and the metadata column "places" as the CWB
# corpus REUTERS, using the pure-R method.
CD$encode(
  corpus = "REUTERS", encoding = "utf8",
  p_attributes = "word", s_attributes = "places",
  registry_dir = registry_tmp, data_dir = data_dir_tmp,
  method = "R"
)

# Write a registry file for the new corpus that points to the temporary data
# directory.
reg <- registry_data(name = "REUTERS", id = "REUTERS", home = data_dir_tmp, p_attributes = "word")
registry_file_write(data = reg, corpus = "REUTERS", registry_dir = registry_tmp)

# See whether it works: look up the token ids for corpus positions 0 to 4049.
cl_cpos2id(corpus = "REUTERS", p_attribute = "word", cpos = 0L:4049L, registry = registry_tmp)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.