#' Reads locally saved model results
#'
#' Reads locally saved model output from runSits function and returns parameter estimates of interest.
#'
#' @param model List of model specifications output from runSits function.
#'
#' @return List of estimated model parameters and useful summaries of the data including
#'
#' \item{agenda_setting}{Last iteration of the agenda-setting measure for each speaker}
#' \item{topic_shifting}{Last iteration of topic shift indicator for each speaking turn}
#' \item{logbeta}{K x V matrix of the log of the word probabilities for each topic}
#' \item{wordcounts}{Vector of word counts}
#'
#'
#' @seealso \link{runSits}, \link{prepConversations}
#'
#' @references
#'
#' Nguyen, Viet-An, Jordan Boyd-Graber and Philip Resnik. 2012. SITS: A hierarchical nonparametric model using speaker identity for topic segmentation in multiparty conversations. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Long Papers-Volume 1. Association for Computational Linguistics pp. 78–87.
#'
#' Nguyen, Viet-An, Jordan Boyd-Graber, Philip Resnik, Deborah A Cai, Jennifer E Midberry and Yuanxin Wang. 2014. “Modeling topic control to detect influence in conversations using nonparametric topic models.” Machine Learning 95(3):381–421.
#'
#' Nguyen, Viet-An. 2014. “Speaker Identity for Topic Segmentation (SITS).” https://github.com/vietansegan/sits.
#'
#'
#' @export
readSits <- function(model){
sitsCorpusPath <- paste0(normalizePath(model$sitsCorpusPath, mustWork = TRUE), "/")
## reading in data ##
## NOTE: changing everything to be 1-indexed ##
data_path <- paste0(sitsCorpusPath, model$corpusName)
## vector of speaker per turn, by 1-index
speaker <- data.table::fread(paste0(data_path, ".authors"), header = FALSE)
colnames(speaker) <- c("speaker_id")
separators <- which(speaker$speaker_id == -1)
speaker <- speaker[-separators,]
speaker <- speaker + 1
## vector of convo_ids, by 1-index
convos <- data.table::fread(paste0(data_path, ".shows"), blank.lines.skip = TRUE, header = FALSE)
colnames(convos) <- c("convo_id")
convos <- convos + 1
# transcripts <- data.table::fread(paste0(data_path, ".text"), blank.lines.skip = TRUE)
# colnames(transcripts) <- c("turn_id", "author", "text")
## vector of unique voc words, alphabetical order
voc <- data.table::fread(paste0(data_path, ".voc"), header = FALSE)
colnames(voc) <- c("voc")
## dataframe relating names of speaker to unique 1-index id
speaker_ids <- data.table::fread(paste0(data_path, ".whois"), header = FALSE)
speaker_ids <- data.frame(speaker_ids$V1)
colnames(speaker_ids) <- c("speaker_name")
speaker_ids$speaker_id <- 1:nrow(speaker_ids)
## list of words spoken in each turn, 1-indexed
## will match up with voc
words_ind <- readLines(paste0(data_path, ".words"))
words_ind <- strsplit(words_ind, "\t")
words_ind <- lapply(words_ind, function(x) x[2])
words_ind <- words_ind[-c(1,2)]
words_ind <- words_ind[-separators]
words_ind <- lapply(words_ind, function(i) as.numeric(strsplit(i, split = " ")[[1]]))
words_ind <- lapply(words_ind, function(x) x+1)
## reading in results ##
results_path <- paste0(model$outputPath, model$modelFolder)
## last agenda setting
as <- data.table::fread(paste0(results_path, "pi.txt"))
colnames(as) <- c("speaker_id", "score")
as[,1] <- speaker_ids$speaker_id
as$speaker_name <- speaker_ids$speaker_name
as <- as[,c("speaker_id", "speaker_name", "score")] #reorder cols
## last shift
shifts <- data.table::fread(paste0(results_path, "shift_asgn.txt"))
colnames(shifts) <- c("shift")
separators <- which(shifts$shift == -1)
shifts <- shifts[-separators,]
shifts <- cbind(convos, speaker, shifts)
colnames(shifts) <- c("convo_id", "speaker_id", "shift")
shifts <- plyr::join(x = shifts, y = speaker_ids, by = "speaker_id")
shifts <- shifts[,c("convo_id", "speaker_id", "speaker_name", "shift")]
## last topic assignments
z <- readLines(paste0(results_path, "topic_asgn.txt"))
z <- strsplit(z, "\t")
words_by_turn <- unlist(lapply(z, function(x) as.numeric(x[1])))
topic_asgn <- lapply(z, function(x) x[2])
topic_asgn <- topic_asgn[-separators]
topic_asgn <- lapply(topic_asgn, function(i) as.numeric(strsplit(i, split = " ")[[1]]))
topic_asgn <- lapply(topic_asgn, function(i) i+1)
## counting how many times word w was assigned to T1, T2, ... TK
word_topic <- lapply(1:length(topic_asgn), function(i) cbind(words_ind[[i]], topic_asgn[[i]]))
word_topic <- plyr::ldply(word_topic)
colnames(word_topic) <- c("word_ind", "topic_ind")
counts <- plyr::ldply(.data = sort(unique(word_topic$word_ind)),
.fun = function(x, K){
t <- table(word_topic$topic_ind[word_topic$word_ind == x])
c <- rep(0, K)
c[as.numeric(names(t))] <- t
return(c)
},
K = model$K)
colnames(counts) <- paste0("topic", 1:model$K)
rownames(counts) <- voc$voc
## Label topics
## log of the word probabilities for each topic
## vocab rows by K columns
est_beta <- apply(counts, 2, function(x, alpha) (x + alpha)/(sum(x) + nrow(counts)*alpha), alpha = model$alpha)
est_logbeta <- apply(est_beta, 1, log)
## returning results ##
return(list("agenda_setting" = as,
"topic_shifting" = shifts,
"logbeta" = est_logbeta,
"wordcounts" = rowSums(counts)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.