R/as.speeches.R
In polmineR: Verbs and Nouns for Corpus Analysis

#' @include S4classes.R
NULL

#' Split corpus or partition into speeches.
#' 
#' Split entire corpus or a partition into speeches. The heuristic is to split
#' the corpus/partition into partitions on day-to-day basis first, using the
#' s-attribute provided by `s_attribute_date`. These subcorpora are then
#' splitted into speeches by speaker name, using s-attribute `s_attribute_name`.
#' If there is a gap larger than the number of tokens supplied by argument
#' `gap`, contributions of a speaker are assumed to be two seperate speeches.
#' 
#' @param .Object A `partition`, or length-one `character` vector indicating a
#'   CWB corpus.
#' @param s_attribute_date A length-one `character` vector, the s-attribute that
#'   provides the dates of sessions.
#' @param s_attribute_name A length-one `character` vector, the s-attribute that
#'   provides the names of speakers.
#' @param gap An `integer` value, the number of tokens between strucs
#'   assumed to make the difference whether a speech has been interrupted (by an
#'   interjection or question), or whether to assume seperate speeches.
#' @param mc Whether to use multicore, defaults to `FALSE`. If `progress` is
#'   `TRUE`, argument `mc` is passed into `pblapply` as argument `cl`. If
#'   `progress` is `FALSE`, `mc` is passed into `mclapply()` as argument
#'   `mc.cores`.
#' @param verbose A `logical` value, defaults to `TRUE`.
#' @param progress A `logical` value, whether to show progress bar.
#' @param ... Further arguments.
#' @return A `partition_bundle`, the names of the objects in the bundle are
#'   the speaker name, the date of the speech and an index for the number of the
#'   speech on a given day, concatenated by underscores.
#' @name as.speeches
#' @exportMethod as.speeches
#' @rdname as.speeches
#' @importFrom cli cli_progress_step cli_progress_done
#' @examples 
#' \dontrun{
#' use("polmineR")
#' speeches <- as.speeches(
#'   "GERMAPARLMINI",
#'   s_attribute_date = "date", s_attribute_name = "speaker"
#' )
#' speeches_count <- count(speeches, p_attribute = "word")
#' tdm <- as.TermDocumentMatrix(speeches_count, col = "count")
#' 
#' bt <- partition("GERMAPARLMINI", date = "2009-10-27")
#' speeches <- as.speeches(
#'   bt, 
#'   s_attribute_name = "speaker",
#'   s_attribute_date = "date"
#' )
#' summary(speeches)
#' }
setGeneric("as.speeches", function(.Object, ...) standardGeneric("as.speeches"))

#' @exportMethod as.speeches
#' @rdname as.speeches
setMethod("as.speeches", "partition", function(
  .Object,
  s_attribute_date = grep("date", s_attributes(.Object), value = TRUE),
  s_attribute_name = grep("name", s_attributes(.Object), value = TRUE),
  gap = 500, mc = FALSE, verbose = TRUE, progress = TRUE
){
  
  stopifnot(
    is.character(s_attribute_date),
    length(s_attribute_date) == 1L,
    
    is.character(s_attribute_name),
    length(s_attribute_name) == 1L
  )
  
  is_sibling <- s_attr_is_sibling(
    x = s_attribute_date, y = s_attribute_name,
    corpus = .Object@corpus, registry = .Object@registry_dir
  )
  if (is_sibling){
    xml <- "flat"
  } else {
    speaker_is_descendent <- s_attr_is_descendent(
      x = s_attribute_name, y = s_attribute_date, 
      corpus = .Object@corpus, registry = .Object@registry_dir
    )
    if (isFALSE(speaker_is_descendent)){
      stop(
        "s-attribute 's_attribute_name' is not a sibling of 's_attribute_date'",
        " but is not a descendent then as would be expected"
      )
    }
    xml <- "nested"
  }
  
  # as a first step, create partitions by date
  n_dates <- length(s_attributes(.Object, s_attribute_date))
  if (verbose) cli_alert_info("distinct dates detected: {.val {n_dates}}")
  if (n_dates > 1L){
    if (verbose) cli_progress_step("subsetting by date")
    # The subcorpus may have been generated by a subset on s_attribute date,
    # using a regular expression - which will result in ugly names. Remove it.
    while(s_attribute_date %in% names(.Object@s_attributes))
      .Object@s_attributes[[s_attribute_date]] <- NULL
    
    partition_bundle_dates <- partition_bundle(
      .Object,
      s_attribute = s_attribute_date
    )
    if (verbose) cli_progress_done()
  } else {
    # if .Object originates from a subcorpus object, the s-attributes slot
    # is potentially not filled. Populate it to get proper obj names
    if (!s_attribute_date %in% names(.Object@s_attributes)){
      date_values <- s_attributes(.Object, s_attribute = s_attribute_date)
      .Object@s_attributes[[s_attribute_date]] <- date_values
    }
    partition_bundle_dates <- list(.Object)
  }
  
  if (verbose) cli_progress_step("generating speeches")
  .split_by_speakers <- function(partition_date, ...){
    nested <- lapply(
      s_attributes(partition_date, s_attribute_name),
      function(speaker_name){
        p <- partition(
          partition_date,
          def = setNames(list(speaker_name), s_attribute_name),
          verbose = FALSE,
          xml = xml
        )
        split(p, gap = gap, verbose = FALSE)
      }
    )
    unlist(lapply(1L:length(nested), function(i) nested[[i]]@objects))
  }
  speaker_list_nested <- blapply(
    x = partition_bundle_dates, f = .split_by_speakers,
    s_attribute_name = s_attribute_name, gap = gap,
    mc = mc, progress = progress
  )
  speaker_list <- do.call(c, unlist(speaker_list_nested, recursive = FALSE))
  if (verbose) cli_progress_step("generating names")
  partition_names <- sapply(
    speaker_list,
    function(x){
      paste(
        x@s_attributes[[s_attribute_name]],
        x@s_attributes[[s_attribute_date]],
        gsub("^.*_(\\d+)$", "\\1", x@name),
        sep = "_", collapse = "_"
      )
    }
  )
  for (i in 1L:length(speaker_list))
    name(speaker_list[[i]]) <- partition_names[i]
  
  # at this stage, the list may contain partitions of size 0 - to be dropped
  empty_partitions <- which(sapply(speaker_list, size) == 0L)
  if (length(empty_partitions) > 0L)
    for (i in rev(empty_partitions)) speaker_list[[i]] <- NULL

  # the resulting list may be totally unordered - reorder now
  if (progress) cli_progress_step("reordering partitions")
  speaker_list_ordered <- lapply(
    order(sapply(speaker_list, function(x) x@cpos[1,1])),
    function(i) speaker_list[[i]]
  )
  corpus <- if (is.character(.Object)) .Object else .Object@corpus
  properties <- corpus_properties(
    corpus = .Object@corpus,
    registry = .Object@registry_dir
  )
  if ("type" %in% properties){
    type <- corpus_property(.Object@corpus, .Object@registry_dir, "type")
    if (type == "plpr"){
      if (verbose) cli_progress_step("coercing partitions to plpr_partitions")
      speaker_list_ordered <- lapply(
        speaker_list_ordered,
        function(x) as(x, "plpr_partition")
      )
      if (verbose) cli_progress_done()
    }
  }
  
  as.bundle(speaker_list_ordered)
})


#' @importFrom cli cli_progress_bar cli_progress_update
#' @importFrom RcppCWB s_attr_relationship
#' @exportMethod as.speeches
#' @rdname as.speeches
setMethod("as.speeches", "subcorpus", function(
  .Object,
  s_attribute_date = grep("date", s_attributes(.Object), value = TRUE),
  s_attribute_name = grep("name", s_attributes(.Object), value = TRUE),
  gap = 500, mc = FALSE, verbose = TRUE, progress = TRUE
){
  stopifnot(
    is.character(s_attribute_date),
    length(s_attribute_date) == 1L,
    
    is.character(s_attribute_name),
    length(s_attribute_name) == 1L
  )
  
  relation <- s_attr_relationship(
    x = s_attribute_date,
    y = s_attribute_name,
    corpus = .Object@corpus,
    registry = .Object@registry_dir
  )
  
  xml <- if (relation == 0L) "flat" else "nested"

  # as a first step, create subcorpora by date
  n_dates <- length(s_attributes(.Object, s_attribute_date))
  if (verbose) cli_alert_info("distinct dates: {.val {n_dates}}")
  if (n_dates > 1L){
    if (verbose) cli_progress_step("subsetting by date")
    # The subcorpus may have been generated by a subset on s_attribute date,
    # using a regular expression - which will result in ugly names. Remove it.
    while(s_attribute_date %in% names(.Object@s_attributes))
      .Object@s_attributes[[s_attribute_date]] <- NULL
    
    sc_by_date <- split(.Object, s_attribute = s_attribute_date)
    if (verbose) cli_progress_done()
  } else {
    sc_by_date <- list(.Object)
  }
  
  if (verbose) cli_alert_info("generating speeches")
  .fn <- function(by_date)
    lapply(
      split(by_date, s_attribute = s_attribute_name, verbose = FALSE)@objects,
      function(by_name) split(
        as(by_name, "partition"), gap = gap, verbose = TRUE
      )@objects
    )

  nested <- if (progress) pblapply(sc_by_date, .fn) else lapply(sc_by_date, .fn)
  speaker_list <- do.call(c, unlist(nested, recursive = FALSE))
  
  if (verbose) cli_progress_step("generating names")
  obj_names <- sapply(
    speaker_list,
    function(x){
      name <- s_attributes(x, s_attribute = s_attribute_name)
      date <- s_attributes(x, s_attribute = s_attribute_date)
      paste(
        name, date, gsub("^.*_(\\d+)$", "\\1", x@name),
        sep = "_", collapse = "_"
      )
    }
  )
  for (i in 1L:length(speaker_list)) name(speaker_list[[i]]) <- obj_names[i]
  
  # at this stage, the list may contain partitions of size 0 - discard
  empty <- which(sapply(speaker_list, size) == 0L)
  if (length(empty) > 0L) for (i in rev(empty)) speaker_list[[i]] <- NULL
  
  # the resulting list may be totally unordered - reorder now
  if (progress) cli_progress_step("reorder objects")
  speaker_list_ordered <- lapply(
    order(sapply(speaker_list, function(x) x@cpos[1,1])),
    function(i) speaker_list[[i]]
  )
  corpus <- if (is.character(.Object)) .Object else .Object@corpus
  properties <- corpus_properties(
    corpus = .Object@corpus,
    registry = .Object@registry_dir
  )
  if ("type" %in% properties){
    type <- corpus_property(.Object@corpus, .Object@registry_dir, "type")
    if (type == "plpr"){
      if (verbose) cli_progress_step("coercing partitions to plpr_subcorpus")
      speaker_list_ordered <- lapply(
        speaker_list_ordered,
        function(x) as(as(x, "subcorpus"), "plpr_subcorpus")
      )
      if (verbose) cli_progress_done()
    }
  }
  
  as.bundle(speaker_list_ordered)
})


#' @param subset A `logical` expression evaluated in a temporary `data.table`
#'   with columns 'speaker' and 'date' to define a subset of the entire corpus
#'   to be turned into speeches. Usually faster than applying `as.speeches()` on
#'   a `partition` or `subcorpus`.
#' @rdname as.speeches
#' @examples
#' \dontrun{
#' #' sp <- corpus("GERMAPARLMINI") %>%
#'   as.speeches(s_attribute_name = "speaker", s_attribute_date = "date")
#' 
#' sp <- corpus("GERMAPARLMINI") %>%
#'   as.speeches(
#'     s_attribute_name = "speaker",
#'     s_attribute_date = "date",
#'     subset = {date == as.Date("2009-11-11")},
#'     progress = FALSE
#'   )
#'   
#' sp <- corpus("GERMAPARLMINI") %>%
#'   as.speeches(
#'     s_attribute_name = "speaker",
#'     s_attribute_date = "date",
#'     subset = {date == "2009-11-10" & grepl("Merkel", speaker)},
#'     progress = FALSE
#'   )
#' }
#' 
#' @importFrom RcppCWB s_attr_regions s_attr_is_sibling s_attr_is_descendent
setMethod("as.speeches", "corpus", function( 
  .Object,
  s_attribute_date = grep("date", s_attributes(.Object), value = TRUE),
  s_attribute_name = grep("name", s_attributes(.Object), value = TRUE),
  gap = 500, subset,
  mc = FALSE, verbose = TRUE, progress = TRUE
){
  
  speakers <- s_attributes(.Object, s_attribute = s_attribute_name, unique = FALSE)
  regions <- s_attr_regions(
    corpus = .Object@corpus, s_attr = s_attribute_name,
    registry = .Object@registry_dir, data_dir = .Object@data_dir
  )
  strucs <- 0L:(nrow(regions) - 1L)
  
  
  sibling <- s_attr_is_sibling(
    x = s_attribute_date, y = s_attribute_name,
    corpus = .Object@corpus, registry = .Object@registry_dir
  )

  if (isTRUE(sibling)){
    dates <- s_attributes(.Object, s_attribute = s_attribute_date, unique = FALSE)
  } else {
    # This additional check is not strictly necessary - but we are on the safe side
    descendent <- s_attr_is_descendent(
      x = s_attribute_name, y = s_attribute_date,
      corpus = .Object@corpus, registry = .Object@registry_dir
    )
    if (isFALSE(descendent))
      stop(
        "Unknown scenario: s_attribute_name and s_attribute_date are not ",
        "siblings, but s_attribute_name is not a descendent of s_attribute_date"
      )
    s <- cpos2struc(.Object, s_attr = s_attribute_date, cpos = regions[,1L])
    dates <- struc2str(.Object, s_attr = s_attribute_date, struc = s)
  }
  
  if (!missing(subset)){
    expr <- substitute(subset)
    dt <- data.table(speaker = speakers, date = as.Date(dates))
    keep <- eval(expr, envir = dt)
    
    if (!any(keep)){
      warning("subsetting corpus data removes all regions - returning NULL")
      return(NULL)
    }
    
    speakers <- speakers[keep]
    strucs <- strucs[keep]
    dates <- dates[keep]
    regions <- regions[keep,]
  }

  chunks_cpos <- split(x = regions, f = speakers)
  chunks_dates <- split(x = dates, f = speakers)
  chunks_strucs <- split(x = strucs, f = speakers)

  new_class <- if (length(.Object@type) == 0L) "subcorpus" else paste(.Object@type, "subcorpus", sep = "_")
  prototype <- as(.Object, new_class)

  .iter_fn <- function(i){
    mx <- matrix(data = chunks_cpos[[i]], byrow = FALSE, ncol = 2L)
    
    # if we have a matrix with only one region (i.e. one row), no need for further splitting,
    # we return a subcorpus immediately
    if (nrow(mx) == 1L){
      y <- prototype
      
      y@strucs = chunks_strucs[[i]]
      y@cpos = mx
      y@s_attributes = setNames(
        list(chunks_dates[[i]], names(chunks_cpos)[i]),
        nm = c(s_attribute_date, s_attribute_name)
      )
      y@xml = "flat"
      y@s_attribute_strucs = s_attribute_name
      y@name = sprintf("%s_%s_%d", names(chunks_cpos)[[i]], chunks_dates[[i]], 1L)
      y@size = mx[,2] - mx[,1] + 1L
      
      return(list(y))
    }
    
    distance <- mx[,1L][2L:nrow(mx)] - mx[,2L][1L:(nrow(mx) - 1L)]
    beginning <- c(TRUE, ifelse(distance > gap, TRUE, FALSE))
    
    # new speech begins also if date of region is not identical with date of 
    # preceding region
    beginning <- ifelse(
      c(
        TRUE,
        chunks_dates[[i]][2L:length(chunks_dates[[i]])] == chunks_dates[[i]][1L:(length(chunks_dates[[i]]) - 1L)]
      ),
      beginning,
      TRUE
    )
    razor <- cumsum(beginning)
    vec_dates <- chunks_dates[[i]][beginning]
    
    # The speech_no vector indicates the number of the speech at a date for the
    # speeches referred to with the vec_dates vector. These lines have seen some
    # revisions to make the procedure robust. Resorting to the for loop is able
    # to handle situations robustly when dates are not increasing throughout.
    unique_dates <- unique(vec_dates)
    speech_no_aux <- setNames(
      rep(1L, times = length(unique_dates)),
      nm = unique_dates
    )
    speech_no <- rep(NA, times = length(vec_dates))
    for (k in seq_along(vec_dates)){
      speech_no[k] <- speech_no_aux[[vec_dates[k]]]
      speech_no_aux[[vec_dates[k]]] <- speech_no_aux[[vec_dates[k]]] + 1L
    }
    
    li_cpos <- split(mx, f = razor)
    li_strucs <- split(chunks_strucs[[i]], f = razor)
    lapply(
      seq_along(li_cpos),
      function(j){
        y <- prototype
        
        y@strucs = li_strucs[[j]]
        y@cpos = matrix(data = li_cpos[[j]], byrow = FALSE, ncol = 2L)
        y@s_attributes = setNames(
          list(vec_dates[[j]], names(chunks_cpos)[i]),
          nm = c(s_attribute_date, s_attribute_name)
        )
        y@xml = "flat"
        y@s_attribute_strucs = s_attribute_name
        y@name = sprintf("%s_%s_%d", names(chunks_cpos)[i], vec_dates[[j]], speech_no[[j]])
        y@size = sum(y@cpos[,2] - y@cpos[,1] + 1L)
        
        y
      }
    )
  }
  
  y <- if (progress){
    pblapply(seq_along(chunks_cpos), .iter_fn, cl = mc)
  } else {
    if (isFALSE(mc)){
      lapply(seq_along(chunks_cpos), .iter_fn)
    } else {
      mclapply(seq_along(chunks_cpos), .iter_fn, mc.cores = mc)
    }
  }
  
  retval <- as(.Object, "subcorpus_bundle")
  retval@xml <- if (sibling) "flat" else "nested"
  retval@objects <- unlist(y, recursive = FALSE)
  names(retval@objects) <- sapply(retval@objects, name)
  
  retval
})



#' @rdname as.speeches
setMethod("as.speeches", "character", function( 
  .Object,
  s_attribute_date = grep("date", s_attributes(.Object), value = TRUE),
  s_attribute_name = grep("name", s_attributes(.Object), value = TRUE),
  gap = 500, mc = FALSE, verbose = TRUE, progress = TRUE
){
  as.speeches(
    .Object = corpus(.Object),
    s_attribute_date = s_attribute_date,
    s_attribute_name = s_attribute_name,
    gap = gap,
    mc = mc,
    verbose = verbose,
    progress = progress
  )
})