cwb_makeall | R Documentation |
Wrappers for the CWB tools (cwb-makeall, cwb-huffcode, cwb-compress-rdx).
Unlike the 'original' command line tools, these wrappers
will always perform a specific indexing/compression step on one positional
attribute, and produce all components.
cwb_makeall(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
quietly = FALSE
)
cwb_huffcode(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
quietly = FALSE,
delete = TRUE
)
cwb_compress_rdx(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
quietly = FALSE,
delete = TRUE
)
cwb_encode(
corpus,
registry = Sys.getenv("CORPUS_REGISTRY"),
data_dir,
vrt_dir,
encoding = "utf8",
p_attributes = c("word", "pos", "lemma"),
s_attributes,
skip_blank_lines = TRUE,
strip_whitespace = TRUE,
xml = TRUE,
quietly = FALSE,
verbose = FALSE
)
corpus |
Name of a CWB corpus (upper case). |
p_attribute |
Name of p-attribute. |
registry |
Path to the registry directory, defaults to the value of the environment variable CORPUS_REGISTRY. |
quietly |
A logical value (defaults to FALSE). |
delete |
A logical value (defaults to TRUE). |
data_dir |
The data directory where |
vrt_dir |
Directory with input corpus files (verticalised format / file
ending *.vrt). Tilde expansion is performed on vrt_dir. |
encoding |
The encoding of the files to be encoded. Needs to be an
encoding supported by CWB, see |
p_attributes |
Positional attributes (p-attributes) to be declared. |
s_attributes |
A |
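skip_blank_lines |
A logical value (defaults to TRUE). |
strip_whitespace |
A logical value (defaults to TRUE). |
xml |
A logical value (defaults to TRUE). |
verbose |
A logical value (defaults to FALSE). |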
# The package includes an 'unfinished' corpus of debates in the UN General
# Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it is
# not compressed.
#
# The first step in the following example is to copy the raw
# corpus to a temporary place.
home_dir <- system.file(
  package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "unga"
)
tmp_data_dir <- file.path(tempdir(), "indexed_corpora")
tmp_unga_dir <- file.path(tmp_data_dir, "unga2")
if (!dir.exists(tmp_data_dir)) dir.create(tmp_data_dir)
if (!dir.exists(tmp_unga_dir)) {
  dir.create(tmp_unga_dir)
} else {
  # A previous run left the directory behind: start from an empty one.
  file.remove(list.files(tmp_unga_dir, full.names = TRUE))
}

# Write a registry file for the copy: HOME points to the temporary data
# directory and the corpus ID is changed to "unga2".
regfile <- readLines(
  system.file(package = "RcppCWB", "extdata", "cwb", "registry", "unga")
)
regfile[grep("^HOME", regfile)] <- sprintf('HOME "%s"', tmp_unga_dir)
regfile[grep("^ID", regfile)] <- "ID unga2"
writeLines(text = regfile, con = file.path(get_tmp_registry(), "unga2"))

# file.copy() is vectorised over `from`; invisible() keeps the example output
# free of the returned logical vector.
invisible(
  file.copy(from = list.files(home_dir, full.names = TRUE), to = tmp_unga_dir)
)

# perform cwb_makeall (equivalent to cwb-makeall command line utility)
cwb_makeall(
  corpus = "UNGA2",
  p_attribute = "word",
  registry = get_tmp_registry()
)
cl_load_corpus("UNGA2", registry = get_tmp_registry())
cqp_load_corpus("UNGA2", registry = get_tmp_registry())

# see whether it works: decode corpus positions 0 to 83 and paste the tokens
# together, removing the whitespace before full stops and commas
ids_sentence_1 <- cl_cpos2id(
  corpus = "UNGA2", p_attribute = "word", registry = get_tmp_registry(),
  cpos = 0:83
)
tokens_sentence_1 <- cl_id2str(
  corpus = "UNGA2", p_attribute = "word",
  registry = get_tmp_registry(), id = ids_sentence_1
)
sentence <- gsub(
  "\\s+([\\.,])", "\\1",
  paste(tokens_sentence_1, collapse = " ")
)

# perform cwb_huffcode (equivalent to cwb-huffcode command line utility)
cwb_huffcode(
  corpus = "UNGA2",
  p_attribute = "word",
  registry = get_tmp_registry()
)

# perform cwb_compress_rdx (equivalent to cwb-compress-rdx command line
# utility)
cwb_compress_rdx(
  corpus = "UNGA2",
  p_attribute = "word",
  registry = get_tmp_registry()
)
# Encode the VRT files shipped with the package as corpus "BTMIN", writing
# the data files to a temporary directory.
data_dir <- file.path(tempdir(), "bt_data_dir")
# Guard the call so that re-running the example in one session does not warn.
if (!dir.exists(data_dir)) dir.create(data_dir)

cwb_encode(
  corpus = "BTMIN",
  registry = Sys.getenv("CORPUS_REGISTRY"),
  vrt_dir = system.file(package = "RcppCWB", "extdata", "vrt"),
  data_dir = data_dir,
  p_attributes = c("word", "pos", "lemma"),
  # One list element per s-attribute; `p` is passed an empty character vector.
  s_attributes = list(
    plenary_protocol = c(
      "lp", "protocol_no", "date", "year", "birthday", "version",
      "url", "filetype"
    ),
    speaker = c(
      "id", "type", "lp", "protocol_no", "date", "year", "ai_no", "ai_id",
      "ai_type", "who", "name", "parliamentary_group", "party", "role"
    ),
    p = character()
  )
)

# Clean up. unlink() only deletes a directory when recursive = TRUE; without
# it the call does nothing and the data directory would be left behind.
unlink(data_dir, recursive = TRUE)
unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.