#' Parallel text processing and annotation -- the latter via `udpipe`.
#'
#' Cuts `tif` into batches and, on a cluster of worker processes, runs every
#' batch through `text2df::tif2sentence()`, `text2df::tif2token()`, optionally
#' `text2df::token2mwe()`, and finally `text2df::token2annotation()`.
#'
#' @name tif2parallel
#' @param tif A dataframe with (at least) the columns `doc_id` and `text`
#' @param mwe A character vector, forwarded as given to
#'   `text2df::token2mwe()`; `NULL` (the default) skips that step
#' @param output_dir A file path, optional -- when supplied, each annotated
#'   batch is saved to its own randomly named `.rds` file instead of being
#'   returned. The file name is appended directly to `output_dir`, so the path
#'   should end with a path separator
#' @param model_dir A file path -- the directory that holds the udpipe model
#' @param model A character string -- udmodel name, i.e. the model's file name
#'   inside `model_dir`
#' @param tagger A character string, passed on to
#'   `text2df::token2annotation()`
#' @param parser A character string, passed on to
#'   `text2df::token2annotation()`
#' @param cores An integer -- number of worker processes to start
#' @param per A positive number -- `tif` is cut into roughly `cores * per`
#'   batches
#' @return A list with one element per batch: the result of
#'   `text2df::token2annotation()` for that batch, or `NULL` when `output_dir`
#'   is supplied. An empty `tif` yields an empty list.
#'
#' @export
#' @rdname tif2parallel
#'
#'
tif2parallel <- function(tif,
                         mwe = NULL,
                         output_dir = NULL, ## '/home/jtimm/Desktop/t2p/',
                         model_dir,
                         model = 'english-ewt-ud-2.5-191206.udpipe',
                         tagger = 'default',
                         parser = 'none',
                         cores = 6,
                         per = 1){

  stopifnot(is.data.frame(tif),
            all(c('doc_id', 'text') %in% names(tif)),
            cores >= 1,
            per > 0)

  n_docs <- nrow(tif)

  # Nothing to annotate: skip loading the model and starting workers.
  if (n_docs == 0L) {
    return(list())
  }

  # Assign consecutive rows to batches. Multiplying before dividing keeps the
  # last row's id at exactly cores * per instead of risking a stray extra
  # batch from floating-point round-off.
  batch_id <- ceiling(as.numeric(seq_len(n_docs)) * cores * per / n_docs)
  batches <- split(tif[, c('doc_id', 'text')], batch_id)

  # The model is located by name relative to model_dir. The workers are
  # started while model_dir is still the working directory; the caller's
  # directory is restored on exit, whether or not an error occurs.
  old_wd <- setwd(model_dir)
  on.exit(setwd(old_wd), add = TRUE)
  udmodel <- udpipe::udpipe_load_model(model)
  # NOTE(review): the model handle is serialized to the workers -- confirm
  # that text2df::token2annotation()/udpipe can use (or reload) it there.

  clust <- parallel::makeCluster(cores)
  # Shut the workers down even if a batch fails.
  on.exit(parallel::stopCluster(clust), add = TRUE)

  # Everything a worker needs is passed as an explicit argument, so nothing
  # has to be exported to the workers' global environments.
  pbapply::pblapply(X = batches,
                    FUN = .tif2parallel_batch,
                    mwe1 = mwe,
                    od = output_dir,
                    t0 = tagger,
                    p0 = parser,
                    mod = udmodel,
                    cl = clust)
}

# Annotate one batch; this is the function executed on the worker processes.
# It lives at top level rather than inside tif2parallel() so that sending it
# to a worker does not also serialize tif2parallel()'s frame (the full `tif`
# and all batches).
#
# x    -- one batch: a data frame with doc_id and text
# mwe1 -- multiword expressions for text2df::token2mwe(), or NULL to skip
# od   -- output path prefix, or NULL to return the annotation
# t0   -- tagger setting for text2df::token2annotation()
# p0   -- parser setting for text2df::token2annotation()
# mod  -- the loaded udpipe model
#
# Returns the annotated batch, or NULL (invisibly) after saving it under `od`.
.tif2parallel_batch <- function(x, mwe1, od, t0, p0, mod){

  x <- text2df::tif2sentence(x)
  x <- text2df::tif2token(x)

  if (!is.null(mwe1)) {
    x <- text2df::token2mwe(tok = x, mwe = mwe1)
  }

  x <- text2df::token2annotation(tok = x,
                                 model = mod,
                                 tagger = t0,
                                 parser = p0)

  if (is.null(od)) {
    return(x)
  }

  # Random file name: three capital letters followed by a number in 1..9999.
  fn <- paste0(paste0(sample(LETTERS, 3, TRUE), collapse = ''),
               sample(9999, 1, TRUE))
  saveRDS(x, paste0(od, fn, '.rds'))
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.