R/runSits.R

Defines functions runSits

Documented in runSits

#' Runs SITS model
#'
#' Runs a locally saved SITS model by sending shell commands to a new RStudio
#' terminal through the rstudioapi package. The function changes into
#' \code{sitsPath}, rebuilds SITS with \code{ant compile}, \code{ant clean-build}
#' and \code{ant jar}, and then launches \code{segmentation.TopicSegmentation}
#' with the supplied settings. The shell writes the exit status of the java
#' command to a temporary file; the function blocks until that file appears.
#'
#' Only Bash terminals are supported. Must be called from within RStudio.
#'
#' @param sitsPath Path to locally saved SITS package. Must exist.
#' @param corpusName Name of corpus (specified in prepConversations function)
#' @param sitsCorpusPath Path to prepared SITS corpus (specified in prepConversations function). Must exist.
#' @param outputPath Desired path of SITS model output. Created (recursively) if it does not exist.
#' @param K Number of topics
#' @param alpha Hyperparameter for Dirichlet prior on document topic proportions
#' @param beta Hyperparameter for Dirichlet prior on topic word probabilities
#' @param gam Hyperparameter for Beta prior on speaker topic shift behavior
#' @param burnIn Number of burn in iterations
#' @param maxIter Maximum number of iterations
#' @param sampleLag Number of iterations to lag sample collection
#'
#' @return List indicating user provided model specifications and the path where SITS results were saved locally.
#'   Elements: \code{sitsPath}, \code{sitsCorpusPath}, \code{corpusName}, \code{outputPath}, \code{model},
#'   \code{K}, \code{alpha}, \code{beta}, \code{gam}, \code{burnIn}, \code{maxIter}, \code{sampleLag} and
#'   \code{modelFolder} (the result folder name constructed from the settings). The three paths are
#'   normalized and end in a slash.
#'
#'   An error is raised if any model setting is not of length one, if a terminal cannot be created,
#'   if the terminal shell is not Bash, if the terminal stops running before the java command
#'   finishes, or if the java command exits with a non-zero status.
#'
#' @references 
#' 
#' Nguyen, Viet-An, Jordan Boyd-Graber and Philip Resnik. 2012. SITS: A hierarchical nonparametric model using speaker identity for topic segmentation in multiparty conversations. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Long Papers-Volume 1. Association for Computational Linguistics pp. 78–87.
#'  
#' Nguyen, Viet-An, Jordan Boyd-Graber, Philip Resnik, Deborah A Cai, Jennifer E Midberry and Yuanxin Wang. 2014. “Modeling topic control to detect influence in conversations using nonparametric topic models.” Machine Learning 95(3):381–421.   
#'
#' Nguyen, Viet-An. 2014. “Speaker Identity for Topic Segmentation (SITS).” https://github.com/vietansegan/sits.
#' 
#'
#' @export
runSits <- function(sitsPath,
                    corpusName,
                    sitsCorpusPath,
                    outputPath,
                    K,
                    alpha,
                    beta,
                    gam,
                    burnIn,
                    maxIter,
                    sampleLag){

    model <- "param"

    ## every setting is pasted into one command line; a vector argument would
    ## silently be recycled by paste() into several commands
    settings <- list(corpusName = corpusName, K = K, alpha = alpha, beta = beta,
                     gam = gam, burnIn = burnIn, maxIter = maxIter,
                     sampleLag = sampleLag)
    notScalar <- names(settings)[lengths(settings) != 1L]
    if(length(notScalar) > 0){
        stop("Arguments must have length 1: ",
             paste(notScalar, collapse = ", "), call. = FALSE)
    }

    sitsPath <- paste0(normalizePath(sitsPath, mustWork = TRUE), "/")
    sitsCorpusPath <- paste0(normalizePath(sitsCorpusPath, mustWork = TRUE), "/")

    ## create the output directory BEFORE normalizing: a relative path that
    ## does not exist yet is returned unchanged by normalizePath(), and java
    ## (started from sitsPath) would then resolve it against a different
    ## directory than the one created here
    if(!dir.exists(outputPath)) dir.create(outputPath, recursive = TRUE)
    outputPath <- paste0(normalizePath(outputPath, mustWork = TRUE), "/")

    ## paste() renders large doubles in scientific notation (100000 becomes
    ## "1e+05"); the count settings are written out in fixed notation instead
    fmtCount <- function(x) format(x, scientific = FALSE, trim = TRUE)
    KStr <- fmtCount(K)
    burnInStr <- fmtCount(burnIn)
    maxIterStr <- fmtCount(maxIter)
    sampleLagStr <- fmtCount(sampleLag)

    ## paths and corpus name are shell-quoted so that spaces survive
    ## NOTE(review): the "//" tokens are passed to java as literal arguments;
    ## presumably they were meant as line continuations -- confirm before
    ## removing them
    model_java <- paste("java -cp 'dist/sits.jar:lib/*' segmentation.TopicSegmentation //",
                        paste("--dataset", shQuote(corpusName), "//"),
                        paste("--input", shQuote(sitsCorpusPath), "//"),
                        paste("--output", shQuote(outputPath), "//"),
                        paste("--model", model, "-v //"),
                        paste("--K", KStr, "//"),
                        paste("--alpha", alpha, "//"),
                        paste("--beta", beta, "//"),
                        paste("--gamma", gam, "//"),
                        paste("--burnIn", burnInStr, "//"),
                        paste("--maxIter", maxIterStr, "//"),
                        paste("--sampleLag", sampleLagStr, "//"))


    ## open terminal in rstudio
    term <- rstudioapi::terminalCreate()
    if(is.null(term)){
        stop("Unable to create an RStudio terminal", call. = FALSE)
    }

    ## wait for it to start
    while(!rstudioapi::terminalRunning(term)) {
        Sys.sleep(0.1)
    }

    if(rstudioapi::terminalContext(term)$shell != "Bash"){
        ## do not leave the freshly created terminal behind
        rstudioapi::terminalKill(term)
        stop("Not written for other shells")
    }

    ## the shell writes the exit status of the java command to this file,
    ## which doubles as the "finished" signal
    statusFile <- tempfile("sitsExitStatus_")
    on.exit(unlink(statusFile), add = TRUE)

    ## navigate to sits package
    rstudioapi::terminalSend(term, "cd \n")
    rstudioapi::terminalSend(term, paste("cd", shQuote(sitsPath), "\n"))

    ## build sits package
    ## TODO: implement some sort of if condition to see if these steps need to be taken
    rstudioapi::terminalSend(term, "ant compile \n")
    rstudioapi::terminalSend(term, "ant clean-build \n")
    rstudioapi::terminalSend(term, "ant jar \n")

    ## run model; ";" (not "&&") so the status is recorded on failure too
    rstudioapi::terminalSend(term, paste0(model_java, " ; echo $? > ",
                                          shQuote(statusFile), "\n"))

    ## block until the status file has content, giving up if the terminal
    ## goes away before the command finishes
    repeat{
        if(file.exists(statusFile)){
            exitStatus <- readLines(statusFile, warn = FALSE)
            if(length(exitStatus) > 0) break
        }
        if(!rstudioapi::terminalRunning(term)){
            stop("The terminal stopped running before SITS finished",
                 call. = FALSE)
        }
        Sys.sleep(0.1)
    }

    if(!identical(trimws(exitStatus[1]), "0")){
        ## the terminal is deliberately left open so its output can be read
        stop("SITS exited with status ", exitStatus[1],
             "; see the terminal output for details", call. = FALSE)
    }
    rstudioapi::terminalKill(term)


    folder <- paste0("RANDOM_asm_B-", burnInStr, "_M-", maxIterStr, "_L-", sampleLagStr, "_a-", alpha,
                     "_b-", beta, "_g-", gam, "_K-", KStr, "_opt-false/")

    message(paste0("Model succesfully run.  Model output saved locally here: \n",
                   "\t", outputPath, folder, "\n",
                   "Use readSits() to read results into R."))

    return(list("sitsPath" = sitsPath,
                "sitsCorpusPath" = sitsCorpusPath,
                "corpusName" = corpusName,
                "outputPath" = outputPath,
                "model" = model,
                "K" = K,
                "alpha" = alpha,
                "beta" = beta,
                "gam" = gam,
                "burnIn" = burnIn,
                "maxIter" = maxIter,
                "sampleLag" = sampleLag,
                "modelFolder" = folder))
}
erossiter/sitsr documentation built on May 23, 2019, 7:34 a.m.