Nothing
#' @include S4classes.R
NULL
#' Split corpus or partition into speeches.
#'
#' Split entire corpus or a partition into speeches. The heuristic is to split
#' the corpus/partition into partitions on day-to-day basis first, using the
#' s-attribute provided by `s_attribute_date`. These subcorpora are then
#' splitted into speeches by speaker name, using s-attribute `s_attribute_name`.
#' If there is a gap larger than the number of tokens supplied by argument
#' `gap`, contributions of a speaker are assumed to be two seperate speeches.
#'
#' @param .Object A `partition`, or length-one `character` vector indicating a
#' CWB corpus.
#' @param s_attribute_date A length-one `character` vector, the s-attribute that
#' provides the dates of sessions.
#' @param s_attribute_name A length-one `character` vector, the s-attribute that
#' provides the names of speakers.
#' @param gap An `integer` value, the number of tokens between strucs
#' assumed to make the difference whether a speech has been interrupted (by an
#' interjection or question), or whether to assume seperate speeches.
#' @param mc Whether to use multicore, defaults to `FALSE`. If `progress` is
#' `TRUE`, argument `mc` is passed into `pblapply` as argument `cl`. If
#' `progress` is `FALSE`, `mc` is passed into `mclapply()` as argument
#' `mc.cores`.
#' @param verbose A `logical` value, defaults to `TRUE`.
#' @param progress A `logical` value, whether to show progress bar.
#' @param ... Further arguments.
#' @return A `partition_bundle`, the names of the objects in the bundle are
#' the speaker name, the date of the speech and an index for the number of the
#' speech on a given day, concatenated by underscores.
#' @name as.speeches
#' @exportMethod as.speeches
#' @rdname as.speeches
#' @importFrom cli cli_progress_step cli_progress_done
#' @examples
#' \dontrun{
#' use("polmineR")
#' speeches <- as.speeches(
#' "GERMAPARLMINI",
#' s_attribute_date = "date", s_attribute_name = "speaker"
#' )
#' speeches_count <- count(speeches, p_attribute = "word")
#' tdm <- as.TermDocumentMatrix(speeches_count, col = "count")
#'
#' bt <- partition("GERMAPARLMINI", date = "2009-10-27")
#' speeches <- as.speeches(
#' bt,
#' s_attribute_name = "speaker",
#' s_attribute_date = "date"
#' )
#' summary(speeches)
#' }
setGeneric("as.speeches", function(.Object, ...) standardGeneric("as.speeches"))
#' @exportMethod as.speeches
#' @rdname as.speeches
setMethod("as.speeches", "partition", function(
.Object,
s_attribute_date = grep("date", s_attributes(.Object), value = TRUE),
s_attribute_name = grep("name", s_attributes(.Object), value = TRUE),
gap = 500, mc = FALSE, verbose = TRUE, progress = TRUE
){
stopifnot(
is.character(s_attribute_date),
length(s_attribute_date) == 1L,
is.character(s_attribute_name),
length(s_attribute_name) == 1L
)
is_sibling <- s_attr_is_sibling(
x = s_attribute_date, y = s_attribute_name,
corpus = .Object@corpus, registry = .Object@registry_dir
)
if (is_sibling){
xml <- "flat"
} else {
speaker_is_descendent <- s_attr_is_descendent(
x = s_attribute_name, y = s_attribute_date,
corpus = .Object@corpus, registry = .Object@registry_dir
)
if (isFALSE(speaker_is_descendent)){
stop(
"s-attribute 's_attribute_name' is not a sibling of 's_attribute_date'",
" but is not a descendent then as would be expected"
)
}
xml <- "nested"
}
# as a first step, create partitions by date
n_dates <- length(s_attributes(.Object, s_attribute_date))
if (verbose) cli_alert_info("distinct dates detected: {.val {n_dates}}")
if (n_dates > 1L){
if (verbose) cli_progress_step("subsetting by date")
# The subcorpus may have been generated by a subset on s_attribute date,
# using a regular expression - which will result in ugly names. Remove it.
while(s_attribute_date %in% names(.Object@s_attributes))
.Object@s_attributes[[s_attribute_date]] <- NULL
partition_bundle_dates <- partition_bundle(
.Object,
s_attribute = s_attribute_date
)
if (verbose) cli_progress_done()
} else {
# if .Object originates from a subcorpus object, the s-attributes slot
# is potentially not filled. Populate it to get proper obj names
if (!s_attribute_date %in% names(.Object@s_attributes)){
date_values <- s_attributes(.Object, s_attribute = s_attribute_date)
.Object@s_attributes[[s_attribute_date]] <- date_values
}
partition_bundle_dates <- list(.Object)
}
if (verbose) cli_progress_step("generating speeches")
.split_by_speakers <- function(partition_date, ...){
nested <- lapply(
s_attributes(partition_date, s_attribute_name),
function(speaker_name){
p <- partition(
partition_date,
def = setNames(list(speaker_name), s_attribute_name),
verbose = FALSE,
xml = xml
)
split(p, gap = gap, verbose = FALSE)
}
)
unlist(lapply(1L:length(nested), function(i) nested[[i]]@objects))
}
speaker_list_nested <- blapply(
x = partition_bundle_dates, f = .split_by_speakers,
s_attribute_name = s_attribute_name, gap = gap,
mc = mc, progress = progress
)
speaker_list <- do.call(c, unlist(speaker_list_nested, recursive = FALSE))
if (verbose) cli_progress_step("generating names")
partition_names <- sapply(
speaker_list,
function(x){
paste(
x@s_attributes[[s_attribute_name]],
x@s_attributes[[s_attribute_date]],
gsub("^.*_(\\d+)$", "\\1", x@name),
sep = "_", collapse = "_"
)
}
)
for (i in 1L:length(speaker_list))
name(speaker_list[[i]]) <- partition_names[i]
# at this stage, the list may contain partitions of size 0 - to be dropped
empty_partitions <- which(sapply(speaker_list, size) == 0L)
if (length(empty_partitions) > 0L)
for (i in rev(empty_partitions)) speaker_list[[i]] <- NULL
# the resulting list may be totally unordered - reorder now
if (progress) cli_progress_step("reordering partitions")
speaker_list_ordered <- lapply(
order(sapply(speaker_list, function(x) x@cpos[1,1])),
function(i) speaker_list[[i]]
)
corpus <- if (is.character(.Object)) .Object else .Object@corpus
properties <- corpus_properties(
corpus = .Object@corpus,
registry = .Object@registry_dir
)
if ("type" %in% properties){
type <- corpus_property(.Object@corpus, .Object@registry_dir, "type")
if (type == "plpr"){
if (verbose) cli_progress_step("coercing partitions to plpr_partitions")
speaker_list_ordered <- lapply(
speaker_list_ordered,
function(x) as(x, "plpr_partition")
)
if (verbose) cli_progress_done()
}
}
as.bundle(speaker_list_ordered)
})
#' @importFrom cli cli_progress_bar cli_progress_update
#' @importFrom RcppCWB s_attr_relationship
#' @exportMethod as.speeches
#' @rdname as.speeches
setMethod("as.speeches", "subcorpus", function(
.Object,
s_attribute_date = grep("date", s_attributes(.Object), value = TRUE),
s_attribute_name = grep("name", s_attributes(.Object), value = TRUE),
gap = 500, mc = FALSE, verbose = TRUE, progress = TRUE
){
stopifnot(
is.character(s_attribute_date),
length(s_attribute_date) == 1L,
is.character(s_attribute_name),
length(s_attribute_name) == 1L
)
relation <- s_attr_relationship(
x = s_attribute_date,
y = s_attribute_name,
corpus = .Object@corpus,
registry = .Object@registry_dir
)
xml <- if (relation == 0L) "flat" else "nested"
# as a first step, create subcorpora by date
n_dates <- length(s_attributes(.Object, s_attribute_date))
if (verbose) cli_alert_info("distinct dates: {.val {n_dates}}")
if (n_dates > 1L){
if (verbose) cli_progress_step("subsetting by date")
# The subcorpus may have been generated by a subset on s_attribute date,
# using a regular expression - which will result in ugly names. Remove it.
while(s_attribute_date %in% names(.Object@s_attributes))
.Object@s_attributes[[s_attribute_date]] <- NULL
sc_by_date <- split(.Object, s_attribute = s_attribute_date)
if (verbose) cli_progress_done()
} else {
sc_by_date <- list(.Object)
}
if (verbose) cli_alert_info("generating speeches")
.fn <- function(by_date)
lapply(
split(by_date, s_attribute = s_attribute_name, verbose = FALSE)@objects,
function(by_name) split(
as(by_name, "partition"), gap = gap, verbose = TRUE
)@objects
)
nested <- if (progress) pblapply(sc_by_date, .fn) else lapply(sc_by_date, .fn)
speaker_list <- do.call(c, unlist(nested, recursive = FALSE))
if (verbose) cli_progress_step("generating names")
obj_names <- sapply(
speaker_list,
function(x){
name <- s_attributes(x, s_attribute = s_attribute_name)
date <- s_attributes(x, s_attribute = s_attribute_date)
paste(
name, date, gsub("^.*_(\\d+)$", "\\1", x@name),
sep = "_", collapse = "_"
)
}
)
for (i in 1L:length(speaker_list)) name(speaker_list[[i]]) <- obj_names[i]
# at this stage, the list may contain partitions of size 0 - discard
empty <- which(sapply(speaker_list, size) == 0L)
if (length(empty) > 0L) for (i in rev(empty)) speaker_list[[i]] <- NULL
# the resulting list may be totally unordered - reorder now
if (progress) cli_progress_step("reorder objects")
speaker_list_ordered <- lapply(
order(sapply(speaker_list, function(x) x@cpos[1,1])),
function(i) speaker_list[[i]]
)
corpus <- if (is.character(.Object)) .Object else .Object@corpus
properties <- corpus_properties(
corpus = .Object@corpus,
registry = .Object@registry_dir
)
if ("type" %in% properties){
type <- corpus_property(.Object@corpus, .Object@registry_dir, "type")
if (type == "plpr"){
if (verbose) cli_progress_step("coercing partitions to plpr_subcorpus")
speaker_list_ordered <- lapply(
speaker_list_ordered,
function(x) as(as(x, "subcorpus"), "plpr_subcorpus")
)
if (verbose) cli_progress_done()
}
}
as.bundle(speaker_list_ordered)
})
#' @param subset A `logical` expression evaluated in a temporary `data.table`
#' with columns 'speaker' and 'date' to define a subset of the entire corpus
#' to be turned into speeches. Usually faster than applying `as.speeches()` on
#' a `partition` or `subcorpus`.
#' @rdname as.speeches
#' @examples
#' \dontrun{
#' #' sp <- corpus("GERMAPARLMINI") %>%
#' as.speeches(s_attribute_name = "speaker", s_attribute_date = "date")
#'
#' sp <- corpus("GERMAPARLMINI") %>%
#' as.speeches(
#' s_attribute_name = "speaker",
#' s_attribute_date = "date",
#' subset = {date == as.Date("2009-11-11")},
#' progress = FALSE
#' )
#'
#' sp <- corpus("GERMAPARLMINI") %>%
#' as.speeches(
#' s_attribute_name = "speaker",
#' s_attribute_date = "date",
#' subset = {date == "2009-11-10" & grepl("Merkel", speaker)},
#' progress = FALSE
#' )
#' }
#'
#' @importFrom RcppCWB s_attr_regions s_attr_is_sibling s_attr_is_descendent
setMethod("as.speeches", "corpus", function(
.Object,
s_attribute_date = grep("date", s_attributes(.Object), value = TRUE),
s_attribute_name = grep("name", s_attributes(.Object), value = TRUE),
gap = 500, subset,
mc = FALSE, verbose = TRUE, progress = TRUE
){
speakers <- s_attributes(.Object, s_attribute = s_attribute_name, unique = FALSE)
regions <- s_attr_regions(
corpus = .Object@corpus, s_attr = s_attribute_name,
registry = .Object@registry_dir, data_dir = .Object@data_dir
)
strucs <- 0L:(nrow(regions) - 1L)
sibling <- s_attr_is_sibling(
x = s_attribute_date, y = s_attribute_name,
corpus = .Object@corpus, registry = .Object@registry_dir
)
if (isTRUE(sibling)){
dates <- s_attributes(.Object, s_attribute = s_attribute_date, unique = FALSE)
} else {
# This additional check is not strictly necessary - but we are on the safe side
descendent <- s_attr_is_descendent(
x = s_attribute_name, y = s_attribute_date,
corpus = .Object@corpus, registry = .Object@registry_dir
)
if (isFALSE(descendent))
stop(
"Unknown scenario: s_attribute_name and s_attribute_date are not ",
"siblings, but s_attribute_name is not a descendent of s_attribute_date"
)
s <- cpos2struc(.Object, s_attr = s_attribute_date, cpos = regions[,1L])
dates <- struc2str(.Object, s_attr = s_attribute_date, struc = s)
}
if (!missing(subset)){
expr <- substitute(subset)
dt <- data.table(speaker = speakers, date = as.Date(dates))
keep <- eval(expr, envir = dt)
if (!any(keep)){
warning("subsetting corpus data removes all regions - returning NULL")
return(NULL)
}
speakers <- speakers[keep]
strucs <- strucs[keep]
dates <- dates[keep]
regions <- regions[keep,]
}
chunks_cpos <- split(x = regions, f = speakers)
chunks_dates <- split(x = dates, f = speakers)
chunks_strucs <- split(x = strucs, f = speakers)
new_class <- if (length(.Object@type) == 0L) "subcorpus" else paste(.Object@type, "subcorpus", sep = "_")
prototype <- as(.Object, new_class)
.iter_fn <- function(i){
mx <- matrix(data = chunks_cpos[[i]], byrow = FALSE, ncol = 2L)
# if we have a matrix with only one region (i.e. one row), no need for further splitting,
# we return a subcorpus immediately
if (nrow(mx) == 1L){
y <- prototype
y@strucs = chunks_strucs[[i]]
y@cpos = mx
y@s_attributes = setNames(
list(chunks_dates[[i]], names(chunks_cpos)[i]),
nm = c(s_attribute_date, s_attribute_name)
)
y@xml = "flat"
y@s_attribute_strucs = s_attribute_name
y@name = sprintf("%s_%s_%d", names(chunks_cpos)[[i]], chunks_dates[[i]], 1L)
y@size = mx[,2] - mx[,1] + 1L
return(list(y))
}
distance <- mx[,1L][2L:nrow(mx)] - mx[,2L][1L:(nrow(mx) - 1L)]
beginning <- c(TRUE, ifelse(distance > gap, TRUE, FALSE))
# new speech begins also if date of region is not identical with date of
# preceding region
beginning <- ifelse(
c(
TRUE,
chunks_dates[[i]][2L:length(chunks_dates[[i]])] == chunks_dates[[i]][1L:(length(chunks_dates[[i]]) - 1L)]
),
beginning,
TRUE
)
razor <- cumsum(beginning)
vec_dates <- chunks_dates[[i]][beginning]
# The speech_no vector indicates the number of the speech at a date for the
# speeches referred to with the vec_dates vector. These lines have seen some
# revisions to make the procedure robust. Resorting to the for loop is able
# to handle situations robustly when dates are not increasing throughout.
unique_dates <- unique(vec_dates)
speech_no_aux <- setNames(
rep(1L, times = length(unique_dates)),
nm = unique_dates
)
speech_no <- rep(NA, times = length(vec_dates))
for (k in seq_along(vec_dates)){
speech_no[k] <- speech_no_aux[[vec_dates[k]]]
speech_no_aux[[vec_dates[k]]] <- speech_no_aux[[vec_dates[k]]] + 1L
}
li_cpos <- split(mx, f = razor)
li_strucs <- split(chunks_strucs[[i]], f = razor)
lapply(
seq_along(li_cpos),
function(j){
y <- prototype
y@strucs = li_strucs[[j]]
y@cpos = matrix(data = li_cpos[[j]], byrow = FALSE, ncol = 2L)
y@s_attributes = setNames(
list(vec_dates[[j]], names(chunks_cpos)[i]),
nm = c(s_attribute_date, s_attribute_name)
)
y@xml = "flat"
y@s_attribute_strucs = s_attribute_name
y@name = sprintf("%s_%s_%d", names(chunks_cpos)[i], vec_dates[[j]], speech_no[[j]])
y@size = sum(y@cpos[,2] - y@cpos[,1] + 1L)
y
}
)
}
y <- if (progress){
pblapply(seq_along(chunks_cpos), .iter_fn, cl = mc)
} else {
if (isFALSE(mc)){
lapply(seq_along(chunks_cpos), .iter_fn)
} else {
mclapply(seq_along(chunks_cpos), .iter_fn, mc.cores = mc)
}
}
retval <- as(.Object, "subcorpus_bundle")
retval@xml <- if (sibling) "flat" else "nested"
retval@objects <- unlist(y, recursive = FALSE)
names(retval@objects) <- sapply(retval@objects, name)
retval
})
#' @rdname as.speeches
setMethod("as.speeches", "character", function(
.Object,
s_attribute_date = grep("date", s_attributes(.Object), value = TRUE),
s_attribute_name = grep("name", s_attributes(.Object), value = TRUE),
gap = 500, mc = FALSE, verbose = TRUE, progress = TRUE
){
as.speeches(
.Object = corpus(.Object),
s_attribute_date = s_attribute_date,
s_attribute_name = s_attribute_name,
gap = gap,
mc = mc,
verbose = verbose,
progress = progress
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.