R/prepConversations.R

#' Prep transcripts for SITS model
#' 
#' Takes a path to a corpus of raw conversation transcripts, preprocesses the text, and
#' writes to disk the files needed to run the SITS model.
#'
#' @param rawCorpusPath Path to folder containing raw text files.  Files need to have a .txt extension, with each row holding tab-separated values of (1) row number, (2) speaker identifier, and (3) turn text.
#' @param sitsCorpusPath Path to folder in which to store prepared SITS corpus.
#' @param corpusName Desired name of corpus.
#' @param overwrite Logical indicating whether to overwrite existing files at sitsCorpusPath.
#' @param returnData Logical indicating whether to return a data frame of transcript information, including the preprocessed text.
#' @param ... Other arguments to stm::textProcessor() and stm::prepDocuments() for text preprocessing.
#' 
#' @details This function has two purposes.  First, it preprocesses the raw transcripts using two functions from
#' the stm package.  See the help files and examples in the stm package for more information on the basic
#' text cleaning operations these functions perform.  Their default arguments are used unless others are supplied through the ... argument.
#' 
#' The second purpose of this function is to format and write the transcript information into the six files needed by the SITS model.  These files are stored
#' in the path provided by the sitsCorpusPath argument and are named using the corpusName argument.
#' More information on the formatting required by the SITS model is available at https://github.com/vietansegan/sits.
#' 
#' @return Optionally returns the preprocessed transcripts as a data frame.
#' 
#' @seealso \link[stm]{textProcessor}, \link[stm]{prepDocuments}
#' 
#' @references 
#' 
#' Nguyen, Viet-An, Jordan Boyd-Graber and Philip Resnik. 2012. SITS: A hierarchical nonparametric model using speaker identity for topic segmentation in multiparty conversations. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Long Papers-Volume 1. Association for Computational Linguistics pp. 78–87.
#'  
#' Nguyen, Viet-An, Jordan Boyd-Graber, Philip Resnik, Deborah A Cai, Jennifer E Midberry and Yuanxin Wang. 2014. “Modeling topic control to detect influence in conversations using nonparametric topic models.” Machine Learning 95(3):381–421.   
#'
#' Nguyen, Viet-An. 2014. “Speaker Identity for Topic Segmentation (SITS).” https://github.com/vietansegan/sits.
#' 
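#' @examples
#' ## A minimal usage sketch; the folder names and corpus name are hypothetical and
#' ## the raw folder must contain transcripts formatted as described above.
#' \dontrun{
#' convos <- prepConversations(rawCorpusPath = "raw_transcripts",
#'                             sitsCorpusPath = "sits_corpus",
#'                             corpusName = "myCorpus",
#'                             returnData = TRUE)
#' }
#' 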
#' @export
prepConversations <- function(rawCorpusPath, sitsCorpusPath, corpusName, overwrite = TRUE, returnData = FALSE, ...){

    rawCorpusPath <- paste0(normalizePath(rawCorpusPath, mustWork = TRUE), "/")
    sitsCorpusPath <- paste0(normalizePath(sitsCorpusPath, mustWork = FALSE), "/")
    path <- paste0(sitsCorpusPath, corpusName)
    
    ## create folder if it doesn't exist
    ## if it does exist, delete old files (or stop if overwrite = FALSE)
    if(!file.exists(sitsCorpusPath)){
        dir.create(sitsCorpusPath)
    }else{
        for(ext in c(".txt", ".shows", ".authors", ".whois", ".voc", ".words")){
            old_file <- paste0(path, ext)
            if(file.exists(old_file)){
                if(overwrite) file.remove(old_file)
                else stop(paste("File", old_file, "exists.  Set `overwrite = TRUE` to overwrite it."))
            }
        }
    }

    ## read in transcripts ##
    file_names <- list.files(path = rawCorpusPath, pattern = "\\.txt$", full.names = TRUE)

    convo_df <- plyr::adply(.data = file_names,
                            .margins = 1,
                            .fun = utils::read.table,
                            sep = "\t",
                            stringsAsFactors = FALSE,
                            quote = NULL)
    
    if(ncol(convo_df) != 4){
        stop("Error in processing transcripts.  Transcripts should be tab separated text files with row number, speaker identifier, and turn text in each row.")
    }

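    ## plyr::adply prepends the file index as the first column; it serves as the conversation id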
    colnames(convo_df) <- c("convo_id", "turn", "speaker", "turn_text")


    ## cleaning the text
    docs_temp <- stm::textProcessor(documents = convo_df$turn_text, ...)
    docs_done <- stm::prepDocuments(documents = docs_temp$documents, vocab = docs_temp$vocab, ...)

    ## word id: 0-indexed
    ## prepDocuments fixes vocab indexing, but it also drops turns whose text became empty;
    ## put those back in as blank entries
    removed <- which(! 1:nrow(convo_df) %in% as.numeric(names(docs_done$documents)) )

    convo_df$word_ids <- NA
    convo_df$word_ids[removed] <- ""
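    ## each prepared document is a 2-row matrix: row 1 holds 1-indexed vocab ids,
    ## row 2 holds counts; expand each id by its count and shift to 0-indexing for SITS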
    word_ids <- lapply(docs_done$documents, function(d){
        paste(unlist(apply(d, 2, function(c) rep(c[1], c[2])-1)), collapse = " ")
    })

    convo_df$processed_text <- NA
    convo_df$processed_text[removed] <- 0
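    ## reconstruct the cleaned text of each turn by looking up every token in the vocabulary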
    processed_text <- unlist(lapply(docs_done$documents, function(d){
        paste(unlist(apply(d, 2, function(c) docs_done$vocab[rep(c[1], c[2])])), collapse = " ")
        }))
    
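    ## total number of tokens in each turn (sum of the count row)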
    convo_df$n_words <- NA
    convo_df$n_words[removed] <- 0
    n_words <- unlist(lapply(docs_done$documents, function(d) sum(d[2,])))
    
    ## fill in processed values, skipping any turns that were dropped as empty
    if(length(removed) > 0){
        convo_df$word_ids[-removed] <- word_ids
        convo_df$processed_text[-removed] <- processed_text
        convo_df$n_words[-removed] <- n_words
    }else{
        convo_df$word_ids <- word_ids
        convo_df$processed_text <- processed_text
        convo_df$n_words <- n_words
    }

    ## convo id: 0-indexed
    convo_df$convo_id <- (as.numeric(convo_df$convo_id) - 1)

    ## turn id: 0-indexed
    convo_df$turn_id <- unlist(tapply(convo_df$convo_id, convo_df$convo_id, function(x) 0:(length(x)-1)))

    ## speaker id: 0-indexed
    speaker_df <- data.frame("speaker" = unique(convo_df$speaker),
                             "speaker_id" = 0:(length(unique(convo_df$speaker))-1))

    convo_df <- plyr::join(convo_df, speaker_df, by = "speaker")


    ## write SITS files
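    ## .shows: the 0-indexed conversation id of each turn, one per line,
    ## with a blank line after each conversation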
    utils::write.table(x = unlist(tapply(convo_df$convo_id, convo_df$convo_id, append, "")),
                file = paste0(path, ".shows"),
                quote = FALSE,
                sep = "\n",
                row.names = FALSE,
                col.names = FALSE)

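    ## .authors: the 0-indexed speaker id of each turn, one per line,
    ## with a -1 appended after each conversation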
    utils::write.table(x = unlist(tapply(convo_df$speaker_id, convo_df$convo_id, append, -1)),
                file = paste0(path, ".authors"),
                quote = FALSE,
                sep = "\t",
                row.names = FALSE,
                col.names = FALSE)

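    ## .whois: speaker names, one per line, in speaker id order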
    utils::write.table(x = unique(convo_df[,c("speaker")]),
                file = paste0(path, ".whois"),
                quote = FALSE,
                sep = "\t",
                row.names = FALSE,
                col.names = FALSE)

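    ## .voc: the vocabulary, one word per line; the 0-indexed word ids refer to positions in this file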
    utils::write.table(x = docs_done$vocab,
                file = paste0(path, ".voc"),
                quote = FALSE,
                sep = "\n",
                row.names = FALSE,
                col.names = FALSE)
    
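    ## .txt: the raw row number, speaker, and processed turn text, tab separated, one turn per line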
    utils::write.table(x = trimws(apply(convo_df[,c("turn", "speaker", "processed_text")], 1, paste, collapse = "\t")),
                       file = paste0(path, ".txt"),
                       quote = FALSE,
                       sep = "\n",
                       row.names = FALSE,
                       col.names = FALSE)

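    ## .words: two header lines (number of conversations, then number of turn lines plus
    ## blank separators), followed by one line per turn holding the word count and its
    ## 0-indexed word ids, with a blank line after the last turn of each conversation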
    print_turn <- function(i){
        cat(convo_df$n_words[i])
        cat("\t")
        cat(unlist(convo_df$word_ids[i]))
        cat("\n")
        if((nrow(convo_df) == i) || (convo_df$convo_id[i] != convo_df$convo_id[i+1])){
            cat("\n")
        }
    }

    sink(paste0(path, ".words"), append = TRUE)
    header <- c(length(unique(convo_df$convo_id)), nrow(convo_df)+length(unique(convo_df$convo_id)))
    cat(header, sep = "\n")
    plyr::l_ply(1:nrow(convo_df), print_turn)
    sink()
    
    ## convert the placeholder values used for empty turns back to NA for returned data
    convo_df$processed_text[which(convo_df$processed_text == 0)] <- NA
    convo_df$word_ids[which(convo_df$word_ids == "")] <- NA
    
    if(returnData) return(convo_df)
}