sitsr: R wrapper for SITS

Documented in readSits

#' Reads locally saved model results
#' 
#' Reads locally saved model output from runSits function and returns parameter estimates of interest.
#'
#' @param model List of model specifications output from runSits function.
#'
#' @return List of estimated model parameters and useful summaries of the data including
#' 
#' \item{agenda_setting}{Last iteration of the agenda-setting measure for each speaker}
#' \item{topic_shifting}{Last iteration of topic shift indicator for each speaking turn}
#' \item{logbeta}{K x V matrix of the log of the word probabilities for each topic}
#' \item{wordcounts}{Vector of word counts}
#' 
#' 
#' @seealso \link{runSits}, \link{prepConversations}
#' 
#' @references 
#' 
#' Nguyen, Viet-An, Jordan Boyd-Graber and Philip Resnik. 2012. SITS: A hierarchical nonparametric model using speaker identity for topic segmentation in multiparty conversations. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Long Papers-Volume 1. Association for Computational Linguistics pp. 78–87.
#'  
#' Nguyen, Viet-An, Jordan Boyd-Graber, Philip Resnik, Deborah A Cai, Jennifer E Midberry and Yuanxin Wang. 2014. “Modeling topic control to detect influence in conversations using nonparametric topic models.” Machine Learning 95(3):381–421.   
#'
#' Nguyen, Viet-An. 2014. “Speaker Identity for Topic Segmentation (SITS).” https://github.com/vietansegan/sits.
#' 
#'
#' @export
readSits <- function(model){
    
    sitsCorpusPath <- paste0(normalizePath(model$sitsCorpusPath, mustWork = TRUE), "/")

    ## reading in data ##
    ## NOTE: changing everything to be 1-indexed ##

    data_path <- paste0(sitsCorpusPath, model$corpusName)

    ## vector of speaker per turn, by 1-index
    speaker <- data.table::fread(paste0(data_path, ".authors"), header = FALSE)
    colnames(speaker) <- c("speaker_id")
    separators <- which(speaker$speaker_id == -1)
    speaker <- speaker[-separators,]
    speaker <- speaker + 1

    ## vector of convo_ids, by 1-index
    convos <- data.table::fread(paste0(data_path, ".shows"), blank.lines.skip = TRUE, header = FALSE)
    colnames(convos) <- c("convo_id")
    convos <- convos + 1

    # transcripts <- data.table::fread(paste0(data_path, ".text"), blank.lines.skip = TRUE)
    # colnames(transcripts) <- c("turn_id", "author", "text")

    ## vector of unique voc words, alphabetical order
    voc <- data.table::fread(paste0(data_path, ".voc"), header = FALSE)
    colnames(voc) <- c("voc")

    ## dataframe relating names of speaker to unique 1-index id
    speaker_ids <- data.table::fread(paste0(data_path, ".whois"), header = FALSE)
    speaker_ids <- data.frame(speaker_ids$V1)
    colnames(speaker_ids) <- c("speaker_name")
    speaker_ids$speaker_id <- 1:nrow(speaker_ids)

    ## list of words spoken in each turn, 1-indexed
    ## will match up with voc
    words_ind <- readLines(paste0(data_path, ".words"))
    words_ind <- strsplit(words_ind, "\t")
    words_ind <- lapply(words_ind, function(x) x[2])
    words_ind <- words_ind[-c(1,2)]
    words_ind <- words_ind[-separators]
    words_ind <- lapply(words_ind, function(i) as.numeric(strsplit(i, split = " ")[[1]]))
    words_ind <- lapply(words_ind, function(x) x+1)


    ## reading in results ##
    results_path <- paste0(model$outputPath, model$modelFolder)

    ## last agenda setting
    as <- data.table::fread(paste0(results_path, "pi.txt"))
    colnames(as) <- c("speaker_id", "score")
    as[,1] <- speaker_ids$speaker_id
    as$speaker_name <- speaker_ids$speaker_name
    as <- as[,c("speaker_id", "speaker_name", "score")] #reorder cols

    ## last shift
    shifts <- data.table::fread(paste0(results_path, "shift_asgn.txt"))
    colnames(shifts) <- c("shift")
    separators <- which(shifts$shift == -1)
    shifts <- shifts[-separators,]
    shifts <- cbind(convos, speaker, shifts)
    colnames(shifts) <- c("convo_id", "speaker_id", "shift")

    shifts <- plyr::join(x = shifts, y = speaker_ids, by = "speaker_id")
    shifts <- shifts[,c("convo_id", "speaker_id", "speaker_name", "shift")]

    ## last topic assignments
    z <- readLines(paste0(results_path, "topic_asgn.txt"))
    z <- strsplit(z, "\t")

    words_by_turn <- unlist(lapply(z, function(x) as.numeric(x[1])))
    topic_asgn <- lapply(z, function(x) x[2])

    topic_asgn <- topic_asgn[-separators]
    topic_asgn <- lapply(topic_asgn, function(i) as.numeric(strsplit(i, split = " ")[[1]]))
    topic_asgn <- lapply(topic_asgn, function(i) i+1)

    ## counting how many times word w was assigned to T1, T2, ... TK
    word_topic <- lapply(1:length(topic_asgn), function(i) cbind(words_ind[[i]], topic_asgn[[i]]))
    word_topic <- plyr::ldply(word_topic)
    colnames(word_topic) <- c("word_ind", "topic_ind")

    counts <- plyr::ldply(.data = sort(unique(word_topic$word_ind)),
                    .fun = function(x, K){
                        t <- table(word_topic$topic_ind[word_topic$word_ind == x])
                        c <- rep(0, K)
                        c[as.numeric(names(t))] <- t
                        return(c)
                    },
                    K = model$K)
    colnames(counts) <- paste0("topic", 1:model$K)
    rownames(counts) <- voc$voc


    ## Label topics
    ## log of the word probabilities for each topic
    ## vocab rows by K columns
    est_beta <- apply(counts, 2, function(x, alpha) (x + alpha)/(sum(x) + nrow(counts)*alpha), alpha = model$alpha)
    est_logbeta <- apply(est_beta, 1, log)

    
    
    ## returning results ##

    return(list("agenda_setting" = as,
                "topic_shifting" = shifts,
                "logbeta" = est_logbeta,
                "wordcounts" = rowSums(counts)))

}