#' #' Split Text by Regex Into a Transcript
#' #'
#' #' A wrapper for \code{\link[textshape]{split_match_regex}} and
#' #' \pkg{textreadr}'s \code{as_transript} to detect person variable, split the
#' #' text into turns of talk, and convert to a data.frame with \code{person} and
#' #' \code{dialogue} variables. There is a bit of cleansing that is closer to
#' #' \code{as_transript} than \code{\link[textshape]{split_transcript}}.
#' #'
#' #' @param x A vector with split points.
#' #' @param person.regex A vector of places (elements) to split on or a regular
#' #' expression if \code{regex} argument is \code{TRUE}.
#' #' @param col.names A character vector specifying the column names of the
#' #' transcript columns.
#' #' @param dash A character string to replace the en and em dashes special
#' #' characters (default is to remove).
#' #' @param ellipsis A character string to replace the ellipsis special
#' #' characters.
#' #' @param quote2bracket logical. If \code{TRUE} replaces curly quotes with
#' #' curly braces (default is \code{FALSE}). If \code{FALSE} curly quotes are
#' #' removed.
#' #' @param rm.empty.rows logical. If \code{TRUE}
#' #' \code{\link[textreadr]{read_transcript}} attempts to remove empty rows.
#' #' @param skip Integer; the number of lines of the data file to skip before
#' #' beginning to read data.
#' #' @param \ldots ignored.
#' #' @return Returns a data.frame of dialogue and people.
#' #' @export
#' split_match_regex_to_transcript <- function (x, person.regex = "^[A-Z]{3,}",
#' col.names = c("Person", "Dialogue"), dash = "", ellipsis = "...",
#' quote2bracket = FALSE, rm.empty.rows = TRUE, skip = 0, ...) {
#'
#' text2transcript(
#' combine_list(
#' split_match(x, split = person.regex, include = TRUE, regex = TRUE)
#' ),
#' person.regex = person.regex,
#' col.names = col.names,
#' dash = dash,
#' ellipsis = ellipsis,
#' quote2bracket = quote2bracket,
#' rm.empty.rows = rm.empty.rows,
#' skip = skip,
#' ...
#' )
#'
#' }
#'
#'
#' combine_list <- function (x, fix.punctuation = TRUE, ...) {
#'
#' if (!is.list(x)) x <- list(x)
#' x <- unlist(lapply(x, paste, collapse = " "))
#' if (isTRUE(fix.punctuation)) {
#' x <- gsub("(\\s+(?=[,.?!;:%-]))|((?<=[$-])\\s+)", "", x, perl = TRUE)
#' }
#' unname(x)
#'
#' }
#'
#'
#'
#' text2transcript <- function(text, person.regex = NULL,
#' col.names = c("Person", "Dialogue"), text.var = NULL,
#' merge.broke.tot = TRUE, header = FALSE, dash = "", ellipsis = "...",
#' quote2bracket = FALSE, rm.empty.rows = TRUE, na = "", skip = 0, ...) {
#'
#' sep <- ":"
#' text <- unlist(strsplit(text, "\n"))
#' text <- paste(
#' gsub(
#' paste0('(', person.regex, ')'),
#' "\\1SEP_PLACE_HOLDER",
#' text,
#' perl = TRUE
#' ),
#' collapse = "\n"
#' )
#'
#' text <- gsub(":", "SYMBOL_PLACE_HOLDER", text)
#' text <- gsub("SEP_PLACE_HOLDER", ":", text, fixed = TRUE)
#'
#' ## Use read.table to split read the text as a table
#' x <- utils::read.table(
#' text=text,
#' header = header,
#' sep = sep,
#' skip=skip,
#' quote = ""
#' )
#'
#' x[[2]] <- gsub("SYMBOL_PLACE_HOLDER", ":", x[[2]], fixed = TRUE)
#'
#' if (!is.null(text.var) & !is.numeric(text.var)) {
#' text.var <- which(colnames(x) == text.var)
#' } else {
#' text.col <- function(dataframe) {
#' dial <- function(x) {
#' if(is.factor(x) | is.character(x)) {
#' n <- max(nchar(as.character(x)), na.rm = TRUE)
#' } else {
#' n <- NA
#' }
#' }
#' which.max(unlist(lapply(dataframe, dial)))
#' }
#' text.var <- text.col(x)
#' }
#'
#' x[[text.var]] <- trimws(
#' iconv(as.character(x[[text.var]]), "", "ASCII", "byte")
#' )
#'
#' if (is.logical(quote2bracket)) {
#' if (quote2bracket) {
#' rbrac <- "}"
#' lbrac <- "{"
#' } else {
#' lbrac <- rbrac <- ""
#' }
#' } else {
#' rbrac <- quote2bracket[2]
#' lbrac <- quote2bracket[1]
#' }
#'
#' ser <- c("<e2><80><9c>", "<e2><80><9d>", "<e2><80><98>", "<e2><80><99>",
#' "<e2><80><9b>", "<ef><bc><87>", "<e2><80><a6>", "<e2><80><93>",
#' "<e2><80><94>", "<c3><a1>", "<c3><a9>", "<c2><bd>")
#'
#' reps <- c(lbrac, rbrac, "'", "'", "'", "'", ellipsis, dash, dash, "a", "e",
#' "half")
#'
#' Encoding(x[[text.var]]) <-"latin1"
#' x[[text.var]] <- clean(.mgsub(ser, reps, x[[text.var]]))
#' if(rm.empty.rows) {
#' x <- rm_empty_row(x)
#' }
#' if (!is.null(col.names)) {
#' colnames(x) <- col.names
#' }
#'
#' x <- as.data.frame(x, stringsAsFactors = FALSE)
#'
#' if (merge.broke.tot) {
#' x <- combine_tot(x)
#' }
#' x <- rm_na_row(x, rm.empty.rows)
#' class(x) <- c("textreadr", "data.frame")
#' x
#' }
#'
#' clean <- function (text.var) {
#' gsub("\\s+", " ", gsub("\\\\r|\\\\n|\\n|\\\\t", " ", text.var))
#' }
#'
#' rm_na_row <- function(x, remove = TRUE) {
#' if (!remove) return(x)
#' x[rowSums(is.na(x)) != ncol(x), ]
#' }
#'
#' rm_empty_row <- function(dataframe) {
#' x <- paste2(dataframe, sep = "")
#' x <- gsub("\\s+", "", x)
#' ind <- x != ""
#' return(dataframe[ind, , drop = FALSE])
#' }
#'
#' #Helper function used in read.transcript
#' #' @importFrom data.table :=
#' combine_tot <- function(x){
#' person <- NULL
#' nms <- colnames(x)
#' colnames(x) <- c('person', 'z')
#' x <- data.table::data.table(x)
#'
#' exp <- parse(text='list(text = paste(z, collapse = " "))')[[1]]
#' out <- x[, eval(exp),
#' by = list(person, 'new' = data.table::rleid(person))][,
#' 'new' := NULL][]
#' data.table::setnames(out, nms)
#' out
#' }
#'
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.