R/split_match_regex_to_transcript.R
In textshape: Tools for Reshaping Text

#' #' Split Text by Regex Into a Transcript
#' #'
#' #' A wrapper for \code{\link[textshape]{split_match_regex}} and
#' #' \pkg{textreadr}'s \code{as_transript} to detect person variable, split the
#' #' text into turns of talk, and convert to a data.frame with \code{person} and
#' #' \code{dialogue} variables.  There is a bit of cleansing that is closer to
#' #' \code{as_transript} than \code{\link[textshape]{split_transcript}}.
#' #'
#' #' @param x A vector with split points.
#' #' @param person.regex A vector of places (elements) to split on or a regular
#' #' expression if \code{regex} argument is \code{TRUE}.
#' #' @param col.names  A character vector specifying the column names of the
#' #' transcript columns.
#' #' @param dash A character string to replace the en and em dashes special
#' #' characters (default is to remove).
#' #' @param ellipsis A character string to replace the ellipsis special
#' #' characters.
#' #' @param quote2bracket logical. If \code{TRUE} replaces curly quotes with
#' #' curly braces (default is \code{FALSE}).  If \code{FALSE} curly quotes are
#' #' removed.
#' #' @param rm.empty.rows logical.  If \code{TRUE}
#' #' \code{\link[textreadr]{read_transcript}}  attempts to remove empty rows.
#' #' @param skip Integer; the number of lines of the data file to skip before
#' #' beginning to read data.
#' #' @param \ldots ignored.
#' #' @return Returns a data.frame of dialogue and people.
#' #' @export
#' split_match_regex_to_transcript <- function (x, person.regex = "^[A-Z]{3,}",
#'     col.names = c("Person", "Dialogue"), dash = "", ellipsis = "...",
#'     quote2bracket = FALSE, rm.empty.rows = TRUE, skip = 0, ...) {
#'
#'     text2transcript(
#'         combine_list(
#'             split_match(x, split = person.regex, include = TRUE, regex = TRUE)
#'         ),
#'         person.regex = person.regex,
#'         col.names = col.names,
#'         dash = dash,
#'         ellipsis = ellipsis,
#'         quote2bracket = quote2bracket,
#'         rm.empty.rows = rm.empty.rows,
#'         skip = skip,
#'         ...
#'     )
#'
#' }
#'
#'
#' combine_list <- function (x, fix.punctuation = TRUE, ...) {
#'
#'     if (!is.list(x)) x <- list(x)
#'     x <- unlist(lapply(x, paste, collapse = " "))
#'     if (isTRUE(fix.punctuation)) {
#'         x <- gsub("(\\s+(?=[,.?!;:%-]))|((?<=[$-])\\s+)", "", x, perl = TRUE)
#'     }
#'     unname(x)
#'
#' }
#'
#'
#'
#' text2transcript <- function(text, person.regex = NULL,
#'     col.names = c("Person", "Dialogue"), text.var = NULL,
#'     merge.broke.tot = TRUE, header = FALSE, dash = "", ellipsis = "...",
#'     quote2bracket = FALSE, rm.empty.rows = TRUE, na = "", skip = 0, ...) {
#'
#'     sep <- ":"
#'     text <- unlist(strsplit(text, "\n"))
#'     text <- paste(
#'         gsub(
#'             paste0('(', person.regex, ')'),
#'             "\\1SEP_PLACE_HOLDER",
#'             text,
#'             perl = TRUE
#'         ),
#'         collapse = "\n"
#'     )
#'
#'     text <- gsub(":", "SYMBOL_PLACE_HOLDER", text)
#'     text <- gsub("SEP_PLACE_HOLDER", ":", text, fixed = TRUE)
#'
#'     ## Use read.table to split read the text as a table
#'     x <- utils::read.table(
#'         text=text,
#'         header = header,
#'         sep = sep,
#'         skip=skip,
#'         quote = ""
#'     )
#'
#'     x[[2]] <- gsub("SYMBOL_PLACE_HOLDER", ":", x[[2]], fixed = TRUE)
#'
#'     if (!is.null(text.var) & !is.numeric(text.var)) {
#'         text.var <- which(colnames(x) == text.var)
#'     } else {
#'         text.col <- function(dataframe) {
#'             dial <- function(x) {
#'                 if(is.factor(x) | is.character(x)) {
#'                     n <- max(nchar(as.character(x)), na.rm = TRUE)
#'                 } else {
#'                     n <- NA
#'                 }
#'             }
#'             which.max(unlist(lapply(dataframe, dial)))
#'         }
#'         text.var <- text.col(x)
#'     }
#'
#'     x[[text.var]] <- trimws(
#'         iconv(as.character(x[[text.var]]), "", "ASCII", "byte")
#'     )
#'
#'     if (is.logical(quote2bracket)) {
#'         if (quote2bracket) {
#'             rbrac <- "}"
#'             lbrac <- "{"
#'         } else {
#'             lbrac <- rbrac <- ""
#'         }
#'     } else {
#'             rbrac <- quote2bracket[2]
#'             lbrac <- quote2bracket[1]
#'     }
#'
#'     ser <- c("<e2><80><9c>", "<e2><80><9d>", "<e2><80><98>", "<e2><80><99>",
#'     	"<e2><80><9b>", "<ef><bc><87>", "<e2><80><a6>", "<e2><80><93>",
#'     	"<e2><80><94>", "<c3><a1>", "<c3><a9>", "<c2><bd>")
#'
#'     reps <- c(lbrac, rbrac, "'", "'", "'", "'", ellipsis, dash, dash, "a", "e",
#'         "half")
#'
#'     Encoding(x[[text.var]]) <-"latin1"
#'     x[[text.var]] <- clean(.mgsub(ser, reps, x[[text.var]]))
#'     if(rm.empty.rows) {
#'         x <- rm_empty_row(x)
#'     }
#'     if (!is.null(col.names)) {
#'         colnames(x) <- col.names
#'     }
#'
#'     x <- as.data.frame(x, stringsAsFactors = FALSE)
#'
#'     if (merge.broke.tot) {
#'         x <- combine_tot(x)
#'     }
#'     x <- rm_na_row(x, rm.empty.rows)
#'     class(x) <- c("textreadr", "data.frame")
#'     x
#' }
#'
#' clean <- function (text.var) {
#'     gsub("\\s+", " ", gsub("\\\\r|\\\\n|\\n|\\\\t", " ", text.var))
#' }
#'
#' rm_na_row <- function(x, remove = TRUE) {
#'     if (!remove) return(x)
#'     x[rowSums(is.na(x)) != ncol(x), ]
#' }
#'
#' rm_empty_row <- function(dataframe) {
#'     x <- paste2(dataframe, sep = "")
#'     x <- gsub("\\s+", "", x)
#'     ind <- x != ""
#'     return(dataframe[ind, , drop = FALSE])
#' }
#'
#' #Helper function used in read.transcript
#' #' @importFrom data.table :=
#' combine_tot <- function(x){
#'     person <- NULL
#'     nms <- colnames(x)
#'     colnames(x) <- c('person', 'z')
#'     x <- data.table::data.table(x)
#'
#'     exp <- parse(text='list(text = paste(z, collapse = " "))')[[1]]
#'     out <- x[, eval(exp),
#'         by = list(person, 'new' = data.table::rleid(person))][,
#'         'new' := NULL][]
#'     data.table::setnames(out, nms)
#'     out
#' }
#'