#' @rdname encode-method
setGeneric("encode", function(.Object, ...) standardGeneric("encode"))
#' Encode CWB Corpus.
#'
#' @param .Object a data.frame to encode
#' @param name name of the (new) CWB corpus
#' @param corpus the name of the CWB corpus
#' @param pAttributes columns of .Object with tokens (such as word/pos/lemma)
#' @param sAttribute a single s-attribute
#' @param sAttributes columns of .Object that will be encoded as structural attributes
#' @param registry path to the corpus registry
#' @param indexedCorpusDir directory where to create directory for indexed corpus files
#' @param verbose logical, whether to be verbose
#' @param ... further parameters
#' @examples
#' \dontrun{
#' library(tm)
#' library(tibble)
#' library(tidytext)
#' library(plyr)
#' reut21578 <- system.file("texts", "crude", package = "tm")
#' reuters.tm <- VCorpus(DirSource(reut21578), list(reader = readReut21578XMLasPlain))
#' reuters.tibble <- tidy(reuters.tm)
#' # reuters.tibble[["topics_cat"]] <- sapply(
#' reuters.tibble[["topics_cat"]],
#' function(x) paste(x, collapse = "|")
#' )
#' reuters.tibble[["places"]] <- sapply(
#' reuters.tibble[["places"]],
#' function(x) paste(x, collapse = "|")
#' )
#' reuters.tidy <- unnest_tokens(
#' reuters.tibble, output = "word", input = "text", to_lower = FALSE
#' )
#' encode(reuters.tidy, name = "reuters", sAttributes = c("language", "places"))
#' }
#' @rdname encode-method
#' @exportMethod encode
setMethod("encode", "data.frame", function(
.Object, name, pAttributes = "word", sAttributes = NULL,
registry = Sys.getenv("CORPUS_REGISTRY"),
indexedCorpusDir = NULL,
verbose = TRUE
){
if (require("plyr", quietly = TRUE)){
# generate indexedCorpusDir if not provided explicitly
if (is.null(indexedCorpusDir)){
pathElements <- strsplit(registry, "/")[[1]]
pathElements <- pathElements[which(pathElements != "")]
pathElements <- pathElements[1:(length(pathElements) - 1)]
registrySuperDir <- paste("/", paste(pathElements, collapse = "/"), sep = "")
targetDir <- grep("index", list.files(registrySuperDir), value = TRUE)
indexedCorpusDir <- file.path(registrySuperDir, targetDir, name)
if (verbose){
cat("No directory for indexed corpus provided - suggesting to use: ", indexedCorpusDir)
feedback <- readline(prompt = "Y/N\n")
if (feedback != "Y") return(NULL)
}
}
if (!dir.exists(indexedCorpusDir)) dir.create(indexedCorpusDir)
# starting basically - pAttribute 'word'
if (verbose) message("... indexing s-attribute word")
vrtTmpFile <- tempfile()
cat(.Object[["word"]], file = vrtTmpFile, sep = "\n")
cwbEncodeCmd <- paste(c(
"cwb-encode",
"-d", indexedCorpusDir, # directory with indexed corpus files
"-f", vrtTmpFile, # vrt file
"-R", file.path(registry, name) # filename of registry file
), collapse = " ")
system(cwbEncodeCmd)
# add other pAttributes than 'word'
if (length(pAttributes > 1)){
for (newAttribute in pAttributes[which(pAttributes != "word")]){
if (verbose) message("... indexing p-attribute ", newAttribute)
cwbEncodeCmd <- paste(c(
"cwb-encode",
"-d", indexedCorpusDir, # directory with indexed corpus files
"-f", vrtTmpFile, # vrt file
"-R", file.path(registry, name), # filename of registry file
"-p", "-", "-P", newAttribute
), collapse = " ")
system(cwbEncodeCmd)
}
}
# call cwb-make
if (verbose) message("... calling cwb-make")
cwbMakeCmd <- paste(c("cwb-make", "-V", name), collapse = " ")
system(cwbMakeCmd)
.Object[["cpos"]] <- c(0:(nrow(.Object) - 1))
# generate tab with begin and end corpus positions
newTab <- plyr::ddply(
.Object,
.variables = "id",
.fun = function(x){
c(
sapply(sAttributes, function(col) unique(x[[col]])[1]),
cpos_left = x[["cpos"]][1],
cpos_right = x[["cpos"]][length(x[["cpos"]])]
)
})
for (sAttr in sAttributes){
if (verbose) message("... indexing s-attribute ", sAttr)
sAttributeTmpFile <- tempfile()
outputMatrix <- matrix(
data = c(
as.character(newTab[["cpos_left"]]),
as.character(newTab[["cpos_right"]]),
as.character(newTab[[sAttr]])
),
ncol = 3
)
outputVector <- paste(apply(outputMatrix, 1, function(x) paste(x, collapse = "\t")), collapse = "\n")
outputVector <- paste(outputVector, "\n", sep = "")
cat(outputVector, file = sAttributeTmpFile)
cwbEncodeCmd <- paste(c(
"cwb-s-encode",
"-d", indexedCorpusDir, # directory with indexed corpus files
"-f", sAttributeTmpFile,
"-V", paste("text", sAttr, sep = "_")
), collapse = " ")
system(cwbEncodeCmd)
if (verbose) message("... adding new s-attribute to registry file")
regeditCmd <- paste(c("cwb-regedit", name, ":add", ":s", paste("text", sAttr, sep = "_")), collapse = " ")
system(regeditCmd)
}
} else {
warning("package 'plyr' required but not available")
}
})
#' @rdname encode-method
setMethod("encode", "data.table", function(.Object, corpus, sAttribute){
stopifnot(ncol(.Object) == 3)
.Object[[1]] <- as.character(.Object[[1]])
.Object[[2]] <- as.character(.Object[[2]])
lines <- apply(.Object, 1, function(x) paste(x, collapse = "\t"))
lines <- paste(lines, "\n", sep = "")
tmp_file <- tempfile()
cat(lines, file = tmp_file)
cmd <- c(
"cwb-s-encode",
"-d", parseRegistry(corpus)[["HOME"]],
"-f", tmp_file,
"-V", sAttribute
)
system(paste(cmd, collapse = " "))
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.