R/conversions.R

Defines functions as.tokens.dfm as.JST.default as.JST.multiChains as.JST.sentopicmodel as.JST as.rJST.default as.rJST.multiChains as.rJST.sentopicmodel as.rJST as.LDA.default as.LDA.multiChains as.LDA.sentopicmodel as.sentopicmodel.default as.sentopicmodel.multiChains as.sentopicmodel.sentopicmodel as.sentopicmodel.JST as.sentopicmodel.rJST as.sentopicmodel.LDA as.sentopicmodel_defaults as.sentopicmodel LDAvis as.LDA.keyATM_output as.LDA_lda as.LDA.textmodel_lda as.LDA.LDA_VEM as.LDA.LDA_Gibbs as.LDA.STM as.LDA

Documented in as.JST as.LDA as.LDA.keyATM_output as.LDA_lda as.LDA.LDA_Gibbs as.LDA.LDA_VEM as.LDA.STM as.LDA.textmodel_lda as.rJST as.sentopicmodel as.tokens.dfm LDAvis

# From other packages -----------------------------------------------------

#' Conversions from other packages to LDA
#'
#' @description These functions converts estimated models from other topic
#'   modeling packages to the format used by \pkg{sentopics}.
#'
#' @param x an estimated topic model from \pkg{stm}, \pkg{topicmodels} or
#'   \pkg{seededlda}.
#' @param list the list containing an estimated model from \pkg{lda}.
#' @param docs for some objects, the documents used to initialize the model.
#' @param alpha for \pkg{lda} models, the document-topic mixture hyperparameter.
#'   If missing, the hyperparameter will be set to `50/K`.
#' @param eta for \pkg{lda} models, the topic-word mixture hyperparameter. Other
#'   packages refer to this hyperparameter as *beta*. If missing, the
#'   hyperparameter will be set to `0.01`.
#' @param ... arguments passed to other methods.
#'
#' @details Some models do not store the topic assignment of each word (for
#'   example, estimated through variational inference). For these, the
#'   conversion is limited and some functionalities of \pkg{sentopics} will be
#'   disabled. The list of affected functions is subject to change and currently
#'   includes \code{\link[=fit.sentopicmodel]{fit()}}, [mergeTopics()] and [rJST.LDA()].
#'
#'   Since models from the \pkg{lda} package are simply lists of outputs, the
#'   function `as.LDA_lda()` is not related to the other methods and should be
#'   applied directly on lists containing a model.
#'
#' @return A S3 list of class `LDA`, as if it was created and estimated using
#'   [LDA()] and \code{\link[=fit.sentopicmodel]{fit()}}.
#'
#' @rdname as.LDA
#' @export
#' @examples
#' \donttest{
#' ## stm
#' library("stm")
#' stm <- stm(poliblog5k.docs, poliblog5k.voc, K=25,
#'            prevalence=~rating, data=poliblog5k.meta,
#'            max.em.its=2, init.type="Spectral")
#' as.LDA(stm, docs = poliblog5k.docs)
#'
#' ## lda
#' library("lda")
#' data("cora.documents")
#' data("cora.vocab")
#' lda <- lda.collapsed.gibbs.sampler(cora.documents,
#'                                    5, ## Num clusters
#'                                    cora.vocab,
#'                                    100, ## Num iterations
#'                                    0.1,
#'                                    0.1)
#' LDA <- as.LDA_lda(lda, docs = cora.documents, alpha = .1, eta = .1)
#'
#' ## topicmodels
#' data("AssociatedPress", package = "topicmodels")
#' lda <- topicmodels::LDA(AssociatedPress[1:20,],
#'                         control = list(alpha = 0.1), k = 2)
#' LDA <- as.LDA(lda, docs = AssociatedPress[1:20,])
#'
#' ## seededlda
#' library("seededlda")
#' lda <- textmodel_lda(dfm(ECB_press_conferences_tokens),
#'                      k = 6, max_iter = 100)
#' LDA <- as.LDA(lda)
#'
#' ## keyATM
#' library("keyATM")
#' data(keyATM_data_bills, package = "keyATM")
#' keyATM_docs <- keyATM_read(keyATM_data_bills$doc_dfm)
#' out <- keyATM(docs = keyATM_docs, model = "base",
#'               no_keyword_topics = 5,
#'               keywords = keyATM_data_bills$keywords)
#' LDA <- as.LDA(out, docs = keyATM_docs)
#' }
as.LDA <- function(x, ...) {
  UseMethod("as.LDA")
}
#' @rdname as.LDA
#' @export
as.LDA.STM <- function(x, docs, ...) {
  ## TODO: better identification of priors
  ## mu + sigma for logistic-normal => translate to? using eta instead?
  ## prior for beta? 0?

  K <- x$settings$dim$K

  doc.length <- as.integer(unlist(lapply(docs, function(x) sum(x[2, ]))))
  theta <- x$theta
  theta_prior <- rep(0, ncol(theta))

  zd <- rebuild_L1d_from_posterior(doc.length, theta, theta_prior)
  zd <- t(zd)

  ## adjust zd to integer
  diff <- doc.length - as.integer(colSums(round(zd)))
  for (i in which(diff != 0)) {
    while (diff[i] != 0L) {
      possible <- zd[, i] > (-diff[i] - .5)
      idx <- which.min( (zd[possible, i] - round(zd[possible, i])*diff[i] ))
      zd[which(possible)[idx], i] <- zd[which(possible)[idx], i] + 1*sign(diff[i])
      diff[i] <- as.integer(doc.length[i] - sum(round(zd[, i])))
    }
  }
  zd <- round(zd)
  storage.mode(zd) <- "integer"
  stopifnot(isTRUE(all.equal(colSums(zd), doc.length)))


  phi <- t(exp(x$beta$logbeta[[1]]))
  beta <- t(phi)
  beta[] <- .Machine$double.eps

  ## adjust zw to integer
  zw <- rebuild_zw_from_posterior2(zd, phi, beta)
  diff <- x$settings$dim$wcounts$x - as.integer(colSums(round(zw)))
  for (i in which(diff != 0)) {
    while (diff[i] != 0L) {
      possible <- (zw[, i] > (-diff[i] - .5))
      idx <- which.min( (zw[possible, i] - round(zw[possible, i])*diff[i] ))
      zw[which(possible)[idx], i] <- zw[which(possible)[idx], i] + 1*sign(diff[i])
      diff[i] <- as.integer(x$settings$dim$wcounts$x[i] - sum(round(zw[, i])))
    }
  }
  zw <- round(zw)
  diff2 <- as.integer(rowSums(zd) - rowSums(round(zw)))
  ## quickly adjust second dimension
  while (any(diff2 < 0)) {
    count <- 1L;
    while (TRUE) {
      non_zero <- length(zw[diff2 < 0][zw[diff2 < 0] > count])
      deduct <- ceiling(abs(min(diff2))/non_zero)
      if (deduct <= count) break
      else count <- count + 1
    }
    zw[diff2 < 0][zw[diff2 < 0] > count] <- zw[diff2 < 0][zw[diff2 < 0] > count] - deduct
    diff2 <- as.integer(rowSums(zd) - rowSums(round(zw)))
  }
  diff <- x$settings$dim$wcounts$x - as.integer(colSums(round(zw)))
  zw <- zw + stats::r2dtable(1, diff2, diff)[[1]]
  storage.mode(zw) <- "integer"
  stopifnot(isTRUE(all.equal(colSums(zw), x$settings$dim$wcounts$x)))
  stopifnot(isTRUE(all.equal(rowSums(zw), rowSums(zd))))


  ## Attempt to recreate ZA (not working)
  # za <- rebuild_za_ARRAY(zd, zw)
  # za <- lapply(seq_along(docs), function(d) {
  #   do.call(c,
  #           lapply(seq_along(x$vocab), function(v) {
  #             rep(1:K, times = za[v, , d])
  #           }))
  # })
  #
  # which(x$vocab=="price")
  # t(za[786,,]) |> colSums()
  # zw[, 786]
  #
  # dw <- unname(convert(quanteda::dfm(tokens), "matrix")[, types(tokens)])
  # storage.mode(dw) <- "integer"

  ## tokens
  build_tokens <- get("build_tokens", envir = getNamespace("quanteda"))
  make_docvars <- get("make_docvars", envir = getNamespace("quanteda"))
  tokens <- build_tokens(
    x = lapply(docs, function(doc) rep(doc[1, ], times = doc[2, ])),
    types = x$vocab,
    docvars = make_docvars(length(docs)))
  stopifnot(isTRUE(all.equal(unname(lengths(cleanPadding(tokens))),
                             doc.length)))

  vocabulary <- makeVocabulary(tokens, NULL, 1L)

  LDA <- structure(list(
    tokens = vocabulary$toks,
    vocabulary = vocabulary$vocabulary,
    K = K,
    alpha = as.matrix(rep(0, K)),
    beta = beta,
    it = x$convergence$its,
    zd = zd,
    zw = zw,
    theta = theta,
    phi = phi,
    logLikelihood = x$convergence$bound
  ), class = c("LDA", "sentopicmodel"), reversed = TRUE, Sdim = "L2",
  approx = TRUE)
  LDA <- as.LDA(reorder_sentopicmodel(LDA))

  LDA
}


#' @rdname as.LDA
#' @export
as.LDA.LDA_Gibbs <- function(x, docs, ...) {
  K <- x@k
  alpha <- x@alpha
  beta <- x@beta
  beta[] <- x@control@delta

  tokens <- quanteda::as.tokens(quanteda::as.dfm(docs))
  vocabulary <- makeVocabulary(tokens, NULL, 1L)

  za <- split(x@z, rep(seq_len(length(tokens)),
                       times = lengths(cleanPadding(tokens))))

  LDA <- structure(list(
    tokens = vocabulary$toks,
    vocabulary = vocabulary$vocabulary,
    K = K,
    alpha = as.matrix(rep(alpha, K)),
    beta = beta,
    it = x@iter,
    theta = x@gamma,
    phi = exp(t(x@beta)),
    za = za,
    logLikelihood = NULL
  ), class = c("LDA", "sentopicmodel"), reversed = TRUE, Sdim = "L2")
  LDA <- fit(LDA, 0, displayProgress = FALSE)

  LDA
}

#' @rdname as.LDA
#' @export
as.LDA.LDA_VEM <- function(x, docs, ...) {
  K <- x@k
  alpha <- x@alpha
  alpha <- rep(alpha, K)

  dfm <- quanteda::as.dfm(docs)
  tokens <- quanteda::as.tokens(dfm)
  dfm <- as.matrix(dfm)
  vocabulary <- makeVocabulary(tokens, NULL, 1L)

  doc.length <- lengths(tokens, use.names = FALSE)
  zd <- rebuild_L1d_from_posterior(lengths(tokens), x@gamma, alpha)
  zd <- t(zd)

  ## adjust zd to integer
  diff <- doc.length - as.integer(colSums(round(zd)))
  for (i in which(diff != 0)) {
    while (diff[i] != 0L) {
      possible <- zd[, i] > (-diff[i] - .5)
      idx <- which.min( (zd[possible, i] - round(zd[possible, i])*diff[i] ))
      zd[which(possible)[idx], i] <- zd[which(possible)[idx], i] + 1*sign(diff[i])
      diff[i] <- as.integer(doc.length[i] - sum(round(zd[, i])))
    }
  }
  zd <- round(zd)
  storage.mode(zd) <- "integer"
  stopifnot(isTRUE(all.equal(colSums(zd), doc.length)))

  phi <- t(exp(x@beta))
  beta <- x@beta
  beta[] <- .Machine$double.eps

  ## adjust zw to integer
  zw <- rebuild_zw_from_posterior2(zd, phi, beta)
  diff <- as.integer(colSums(dfm) - colSums(round(zw)))
  for (i in which(diff != 0)) {
    while (diff[i] != 0L) {
      possible <- (zw[, i] > (-diff[i] - .5))
      idx <- which.min( (zw[possible, i] - round(zw[possible, i])*diff[i] ))
      zw[which(possible)[idx], i] <- zw[which(possible)[idx], i] + 1*sign(diff[i])
      diff[i] <- as.integer(sum(dfm[, i]) - sum(round(zw[, i])))
    }
  }
  zw <- round(zw)
  diff2 <- as.integer(rowSums(zd) - rowSums(round(zw)))
  ## quickly adjust second dimension
  while (any(diff2 < 0)) {
    count <- 1L;
    while (TRUE) {
      non_zero <- length(zw[diff2 < 0][zw[diff2 < 0] > count])
      deduct <- ceiling(abs(min(diff2))/non_zero)
      if (deduct <= count) break
      else count <- count + 1
    }
    zw[diff2 < 0][zw[diff2 < 0] > count] <- zw[diff2 < 0][zw[diff2 < 0] > count] - deduct
    diff2 <- as.integer(rowSums(zd) - rowSums(round(zw)))
  }
  diff <- as.integer(colSums(dfm)) - as.integer(colSums(round(zw)))
  zw <- zw + stats::r2dtable(1, diff2, diff)[[1]]
  storage.mode(zw) <- "integer"
  stopifnot(isTRUE(all.equal(colSums(zw), unname(colSums(dfm)))))
  stopifnot(isTRUE(all.equal(rowSums(zw), rowSums(zd))))

  LDA <- structure(list(
    tokens = vocabulary$toks,
    vocabulary = vocabulary$vocabulary,
    K = K,
    alpha = as.matrix(alpha),
    beta = beta,
    it = x@iter,
    zd = zd,
    zw = zw,
    theta = x@gamma,
    phi = phi,
    logLikelihood = x@logLiks
  ), class = c("LDA", "sentopicmodel"), reversed = TRUE, Sdim = "L2",
  approx = TRUE)
  LDA <- as.LDA(reorder_sentopicmodel(LDA))

  LDA
}

#' @rdname as.LDA
#' @export
as.LDA.textmodel_lda <- function(x, ...) {

  version <- x$version

  if (!(version >= "1.2.0")) stop("Conversion is not allowed from models created using a version of `seededlda` below 1.2.0")

  labels <- colnames(x$theta)

  beta <- x$phi
  beta[] <- x$beta
  alpha <- x$alpha

  tokens <- as.tokens(x$data)
  vocabulary <- makeVocabulary(tokens, NULL, 1L)

  zd <- rebuild_L1d_from_posterior(lengths(cleanPadding(tokens)), x$theta, alpha)
  zd <- t(zd)
  zd <- round(zd)
  storage.mode(zd) <- "integer"

  zw <- t(as.matrix(x$words))
  zw <- round(zw)
  storage.mode(zw) <- "integer"

  LDA <- structure(list(
    tokens = vocabulary$toks,
    vocabulary = vocabulary$vocabulary,
    K = x$k,
    alpha = as.matrix(x$alpha),
    beta = beta,
    it = x$last_iter,
    theta = x$theta,
    phi = t(x$phi),
    zd = zd,
    zw = zw,
    logLikelihood = NULL
  ), class = c("LDA", "sentopicmodel"), reversed = TRUE, Sdim = "L2",
  approx = TRUE, labels = list(L1 = colnames(x$theta)))
  LDA <- as.LDA(reorder_sentopicmodel(LDA))

  LDA
}

#' @rdname as.LDA
#' @export
# @usage as.LDA.lda(list, docs, alpha, eta)
as.LDA_lda <- function(list, docs, alpha, eta) {
  zw <- list$topics
  zd <- list$document_sums
  vocab <- colnames(zw)

  K <- nrow(zw)

  build_tokens <- get("build_tokens", envir = getNamespace("quanteda"))
  make_docvars <- get("make_docvars", envir = getNamespace("quanteda"))
  tokens <- build_tokens(
    x = lapply(docs, function(doc) rep(doc[1, ] + 1L, times = doc[2, ])),
    types = colnames(zw),
    docvars = make_docvars(length(docs)))

  stopifnot(isTRUE(all.equal(unname(lengths(cleanPadding(tokens))),
                             colSums(zd))))

  za <- mapply(function(doc, assignments) rep(assignments + 1L, times = doc[2, ]),
               docs, list$assignments)

  vocabulary <- makeVocabulary(tokens, NULL, 1L)

  if (missing(alpha)) alpha <- 50/K
  if (missing(eta)) eta <- 0.01
  beta <- zw
  beta[] <- eta

  LDA <- structure(list(
    tokens = vocabulary$toks,
    vocabulary = vocabulary$vocabulary,
    K = K,
    alpha = as.matrix(rep(alpha, K)),
    beta = beta,
    it = 1,
    theta = t(zd),
    phi = t(zw),
    za = za,
    logLikelihood = NULL
  ), class = c("LDA", "sentopicmodel"), reversed = TRUE, Sdim = "L2")
  LDA <- fit(LDA, 0, displayProgress = FALSE)

  LDA
}



#' @rdname as.LDA
#' @export
as.LDA.keyATM_output <- function(x, docs, ...) {
  K = x$keyword_k + x$no_keyword_topics


  labels <- colnames(x$theta)


  alpha <- as.matrix(rep(utils::tail(x$values_iter$alpha_iter$alpha, 1), K))


  tokens <- quanteda::as.tokens(docs$W_raw)
  vocabulary <- makeVocabulary(tokens, NULL, 1L)

  # Align keyATM phi to the new vocabulary order
  reorder <- order(match(x$vocab,vocabulary$vocabulary$word))
  x$vocab <- x$vocab[reorder]
  stopifnot(identical(
    vocabulary$vocabulary$word,
    x$vocab
  ))
  x$phi <- x$phi[, reorder]
  beta <- x$phi
  beta[] <- x$priors$beta
  stopifnot(identical(
    order(match(x$vocab,vocabulary$vocabulary$word)),
    seq(x$V)
  ))

  zd <- rebuild_L1d_from_posterior(lengths(cleanPadding(tokens)), x$theta, alpha)
  zd <- unname(t(zd))
  doc.length <- lengths(tokens, use.names = FALSE)

  ## adjust zd to integer
  diff <- doc.length - as.integer(colSums(round(zd)))
  for (i in which(diff != 0)) {
    while (diff[i] != 0L) {
      possible <- zd[, i] > (-diff[i] - .5)
      idx <- which.min( (zd[possible, i] - round(zd[possible, i])*diff[i] ))
      zd[which(possible)[idx], i] <- zd[which(possible)[idx], i] + 1*sign(diff[i])
      diff[i] <- as.integer(doc.length[i] - sum(round(zd[, i])))
    }
  }
  zd <- round(zd)
  diff2 <- as.integer(x$topic_counts - rowSums(zd))
  ## quickly adjust second dimension
  while (any(diff2 < 0)) {
    count <- 1L;
    while (TRUE) {
      non_zero <- length(zd[diff2 < 0][zd[diff2 < 0] > count])
      deduct <- ceiling(abs(min(diff2))/non_zero)
      if (deduct <= count) break
      else count <- count + 1
    }
    zd[diff2 < 0][zd[diff2 < 0] > count] <- zd[diff2 < 0][zd[diff2 < 0] > count] - deduct
    diff2 <- as.integer(x$topic_counts - rowSums(zd))
  }
  diff <- doc.length - as.integer(colSums(round(zd)))
  zd <- zd + stats::r2dtable(1, diff2, diff)[[1]]
  storage.mode(zd) <- "integer"
  stopifnot(isTRUE(all.equal(colSums(zd), doc.length)))
  stopifnot(isTRUE(all.equal(rowSums(zd), x$topic_counts)))



  phi <- t(x$phi)

  ## adjust zw to integer
  dfm <- quanteda::dfm(tokens)
  dfm <- dfm[, vocabulary$vocabulary$word] # Ensure correct order
  zw <- rebuild_zw_from_posterior2(zd, phi, beta)
  diff <- as.integer(quanteda::colSums(dfm) - colSums(round(zw)))
  for (i in which(diff != 0)) {
    while (diff[i] != 0L) {
      possible <- (zw[, i] > (-diff[i] - .5))
      idx <- which.min( (zw[possible, i] - round(zw[possible, i])*diff[i] ))
      zw[which(possible)[idx], i] <- zw[which(possible)[idx], i] + 1*sign(diff[i])
      diff[i] <- as.integer(sum(dfm[, i]) - sum(round(zw[, i])))
    }
  }
  zw <- round(zw)
  diff2 <- as.integer(rowSums(zd) - rowSums(round(zw)))
  ## quickly adjust second dimension
  while (any(diff2 < 0)) {
    count <- 1L;
    while (TRUE) {
      non_zero <- length(zw[diff2 < 0][zw[diff2 < 0] > count])
      deduct <- ceiling(abs(min(diff2))/non_zero)
      if (deduct <= count) break
      else count <- count + 1
    }
    zw[diff2 < 0][zw[diff2 < 0] > count] <- zw[diff2 < 0][zw[diff2 < 0] > count] - deduct
    diff2 <- as.integer(rowSums(zd) - rowSums(round(zw)))
  }
  diff <- as.integer(quanteda::colSums(dfm)) - as.integer(colSums(round(zw)))
  zw <- zw + stats::r2dtable(1, diff2, diff)[[1]]
  storage.mode(zw) <- "integer"
  stopifnot(isTRUE(all.equal(colSums(zw), unname(quanteda::colSums(dfm)))))
  stopifnot(isTRUE(all.equal(rowSums(zw), rowSums(zd))))

  LDA <- structure(list(
    tokens = vocabulary$toks,
    vocabulary = vocabulary$vocabulary,
    K = K,
    alpha = alpha,
    beta = beta,
    it = x$options$iterations,
    theta = x$theta,
    phi = phi,
    zd = zd,
    zw = zw,
    logLikelihood = NULL
  ), class = c("LDA", "sentopicmodel"), reversed = TRUE, Sdim = "L2",
  approx = TRUE, labels = list(L1 = colnames(x$theta)))
  LDA <- as.LDA(reorder_sentopicmodel(LDA))

  LDA
}



# To LDAvis ---------------------------------------------------------------


#' Visualize a LDA model using \pkg{LDAvis}
#'
#' This function call \pkg{LDAvis} to create a dynamic visualization of an
#' estimated topic model.
#'
#' @param x an `LDA` model
#' @param ... further arguments passed on to [LDAvis::createJSON()] and
#'   [LDAvis::serVis()].
#'
#' @details The CRAN release of \pkg{LDAvis} does not support UTF-8 characters
#'   and automatically reorder topics. To solve these two issues, please install
#'   the development version of \pkg{LDAvis} from github
#'   (`devtools::install_github("cpsievert/LDAvis")`).
#'
#' @return Nothing, called for its side effects.
#' @export
#'
#' @seealso [plot.sentopicmodel()]
#'
#' @examples
#' lda <- LDA(ECB_press_conferences_tokens)
#' lda <- fit(lda, 100)
#' LDAvis(lda)
LDAvis <- function(x, ...) {
  mis <- missingSuggets(c("LDAvis", "servr"))
  if (length(mis) > 0) stop("Suggested packages are missing for the LDAvis function.\n",
                            "Please install first the following packages: ",
                            paste0(mis, collapse = ", "),".\n",
                            "Install command: install.packages(",
                            paste0("'", mis, "'", collapse = ", "),")" )

  stopifnot(inherits(x, c("LDA", "sentopicmodel")))
  zw <- rebuild_zw(as.sentopicmodel(x))
  zd <- rebuild_zd(as.sentopicmodel(x))
  json <- LDAvis::createJSON(
    phi = t(x$phi),
    theta = x$theta,
    doc.length = colSums(zd),
    vocab = x$vocabulary$word,
    term.frequency = colSums(zw),
    reorder.topics = FALSE,
    ...)
  LDAvis::serVis(json, encoding = "UTF-8", ...)
}


# To sentopicmodel --------------------------------------------------------

#' @name sentopics-conversions
#' @rdname sentopics-conversions
#' @keywords internal
#'
#' @title Internal conversions between **sentopics** models.
#'
#' @param x A **sentopics** model
#'
#' @return A **sentopics** model of the relevant class
#' @export
as.sentopicmodel <- function(x) {
  UseMethod("as.sentopicmodel")
}
as.sentopicmodel_defaults <- function(x) {
  ### perhaps create too many objects
  ### especially in multiChains, recreate things that are stored in base
  if (is.null(x$L2)) x$L2 <- 1
  if (is.null(x$L2prior)) x$L2prior <- x$L1prior
  if (is.null(x$L2post) & x$it > 0) {
    L2post <- array(1, dim = c(1, x$L1, length(x$tokens)))
    dimnames(L2post) = c(list(L2 = levels(x$vocabulary$lexicon)),
                              dimnames(x$L1post)[2],
                              dimnames(x$L1post)[1])
    x$L2post <- L2post
  }
  if (is.null(x$L1cycle)) x$L1cycle <- 0
  if (is.null(x$L2cycle)) x$L2cycle <- 0
  if (is.null(x$logLikelihood)) {
    logLikelihood <- 0
    attr(logLikelihood, "components") <- list(
      logLikelihoodW = 0,
      logLikelihoodL1 = 0,
      logLikelihoodL2 = 0
    )
    x$logLikelihood <- logLikelihood
  }
  x
}
#' @export
as.sentopicmodel.LDA <- function(x) {
  rename <- stats::setNames(names(x), names(x))
  translate <- stats::setNames(
    c("L1", "L2", "L1prior", "L2prior", "L1post", "L2post", "L1cycle", "L2cycle", "logLikelihoodL1", "logLikelihoodL2"),
    c("K", "S", "alpha", "gamma", "theta", "pi", "alphaCycle", "gammaCycle", "logLikelihoodK", "logLikelihoodS")
  )
  if (x$it > 0) {
    if (length(dim(x$phi)) < 3) {
      names <- dimnames(x$phi)
      names[3] <- list(sent = levels(x$vocabulary$lexicon))
      x$phi <- array(x$phi,
                     dim = c(nrow(x$phi), 1, ncol(x$phi)),
                     dimnames = names[c(1,3,2)])
    }
  }
  rename <- replace(rename, names(translate), translate)[rename]
  names(x) <- rename
  class(x) <- setdiff(class(x), "LDA")
  as.sentopicmodel_defaults(x)
}
#' @export
as.sentopicmodel.rJST <- function(x) {
  rename <- stats::setNames(names(x), names(x))
  translate <- stats::setNames(
    c("L1", "L2", "L1prior", "L2prior", "L1post", "L2post", "L1cycle", "L2cycle", "logLikelihoodL1", "logLikelihoodL2"),
    c("K", "S", "alpha", "gamma", "theta", "pi", "alphaCycle", "gammaCycle", "logLikelihoodK", "logLikelihoodS")
  )
  rename <- replace(rename, names(translate), translate)[rename]
  names(x) <- rename
  class(x) <- setdiff(class(x), "rJST")
  as.sentopicmodel_defaults(x)
}
#' @export
as.sentopicmodel.JST <- function(x) {
  rename <- stats::setNames(names(x), names(x))
  translate <- stats::setNames(
    c("L1", "L2", "L1prior", "L2prior", "L1post", "L2post", "L1cycle", "L2cycle", "logLikelihoodL1", "logLikelihoodL2"),
    c("S", "K", "gamma", "alpha", "pi", "theta", "gammaCycle", "alphaCycle", "logLikelihoodS", "logLikelihoodK")
  )
  rename <- replace(rename, names(translate), translate)[rename]
  names(x) <- rename
  class(x) <- setdiff(class(x), "JST")

  as.sentopicmodel_defaults(x)
}
#' @export
as.sentopicmodel.sentopicmodel <- function(x) {
  x
}
#' @export
as.sentopicmodel.multiChains <- function(x) {
  attr(x, "containedClass") <- "sentopicmodel"
  x
}
#' @export
as.sentopicmodel.default <- function(x) {
  stop("Unexpected object passed to as.sentopicmodel")
}

# From sentopicmodel ------------------------------------------------------

#' @export
as.LDA.sentopicmodel <- function(x, ...) {
  rename <- stats::setNames(names(x), names(x))
  translate <- stats::setNames(
    c("K", "S", "alpha", "gamma", "theta", "pi", "alphaCycle", "gammaCycle", "logLikelihoodK", "logLikelihoodS"),
    c("L1", "L2", "L1prior", "L2prior", "L1post", "L2post", "L1cycle", "L2cycle", "logLikelihoodL1", "logLikelihoodL2")
  )
  # correct phi theta pi structure and names
  if (x$it > 0) {
    # dimnames(x$theta) <- list(doc_id = names(x$tokens), topic = paste0("topic", 1:x$K))
    dimnames(x$L1post) <- list(doc_id = names(x$tokens), topic = create_labels(x, "LDA", flat = FALSE)[["L1"]])
    dimnames(x$phi) <- list(word = x$vocabulary$word, sent = levels(x$vocabulary$lexicon), topic = create_labels(x, "LDA", flat = FALSE)[["L1"]])
    x$phi <- x$phi[ ,1, ]
  }
  rename <- replace(rename, names(translate), translate)[rename]
  names(x) <- rename
  # x$phi <- x$phi[,,] ## this drops un-needed sentiment dimension ### Need to account for labels..
  x$S <- x$gamma <- x$pi <- x$alphaCycle <- x$gammaCycle <- x$logLikelihoodS <- NULL
  class(x) <- union("LDA", class(x))
  x
}
#' @export
as.LDA.multiChains <- function(x, ...) {
  # atrs <- attributes(x)
  # x <- lapply(unclass(x), as.LDA)
  # attributes(x) <- atrs
  attr(x, "containedClass") <- "LDA"
  x
}
#' @export
as.LDA.default <- function(x, ...) {
  stop("Unexpected object passed to as.LDA")
}

#' @rdname sentopics-conversions
#' @export
as.rJST <- function(x) {
  UseMethod("as.rJST")
}
#' @export
as.rJST.sentopicmodel <- function(x) {
  rename <- stats::setNames(names(x), names(x))
  translate <- stats::setNames(
    c("K", "S", "alpha", "gamma", "theta", "pi", "alphaCycle", "gammaCycle", "logLikelihoodK", "logLikelihoodS"),
    c("L1", "L2", "L1prior", "L2prior", "L1post", "L2post", "L1cycle", "L2cycle", "logLikelihoodL1", "logLikelihoodL2")
  )
  # correct phi theta pi structure and names
  if (x$it > 0) {
    dimnames(x$L1post) <- list(doc_id = names(x$tokens), topic = create_labels(x, "rJST", flat = FALSE)[["L1"]])
    dimnames(x$L2post) <- list(sent = create_labels(x, "rJST", flat = FALSE)[["L2"]], topic = create_labels(x, "rJST", flat = FALSE)[["L1"]], doc_id = names(x$tokens))
    dimnames(x$phi) <- list(word = x$vocabulary$word, sent = create_labels(x, "rJST", flat = FALSE)[["L2"]], topic = create_labels(x, "rJST", flat = FALSE)[["L1"]])
  }
  rename <- replace(rename, names(translate), translate)[rename]
  names(x) <- rename
  class(x) <- union("rJST", class(x))
  x
}
#' @export
as.rJST.multiChains <- function(x) {
  # atrs <- attributes(x)
  # x <- lapply(unclass(x), as.rJST)
  # attributes(x) <- atrs
  attr(x, "containedClass") <- "rJST"
  x
}
#' @export
as.rJST.default <- function(x) {
  stop("Unexpected object passed to as.rJST")
}

#' @rdname sentopics-conversions
#' @export
as.JST <- function(x) {
  UseMethod("as.JST")
}
#' @export
as.JST.sentopicmodel <- function(x) {
  rename <- stats::setNames(names(x), names(x))
  translate <- stats::setNames(
    c("S", "K", "gamma", "alpha", "pi", "theta", "gammaCycle", "alphaCycle", "logLikelihoodS", "logLikelihoodK"),
    c("L1", "L2", "L1prior", "L2prior", "L1post", "L2post", "L1cycle", "L2cycle", "logLikelihoodL1", "logLikelihoodL2")
  )
  # correct phi theta pi structure and names
  if (x$it > 0) {
    dimnames(x$L1post) <- list(doc_id = names(x$tokens), sent = levels(x$vocabulary$lexicon))
    dimnames(x$L2post) <- list(topic = create_labels(x, "JST", flat = FALSE)[["L2"]], sent = create_labels(x, "JST", flat = FALSE)[["L1"]], doc_id = names(x$tokens))
    dimnames(x$phi) <- list(word = x$vocabulary$word, topic = create_labels(x, "JST", flat = FALSE)[["L2"]], sent = create_labels(x, "JST", flat = FALSE)[["L1"]])
  }
  rename <- replace(rename, names(translate), translate)[rename]
  names(x) <- rename
  class(x) <- union("JST", class(x))
  x
}
#' @export
as.JST.multiChains <- function(x) {
  # atrs <- attributes(x)
  # x <- lapply(unclass(x), as.JST)

  # ## rename base in attributes
  # rename <- stats::setNames(names(atrs$base), names(atrs$base))
  # translate <- stats::setNames(
  #   c("S", "K", "gamma", "alpha", "pi", "theta", "gammaCycle", "alphaCycle", "logLikelihoodS", "logLikelihoodK"),
  #   c("L1", "L2", "L1prior", "L2prior", "L1post", "L2post", "L1cycle", "L2cycle", "logLikelihoodL1", "logLikelihoodL2")
  # )
  # rename <- replace(rename, names(translate), translate)[rename]
  # names(atrs$base) <- rename

  # attributes(x) <- atrs
  attr(x, "containedClass") <- "JST"
  x
}
#' @export
as.JST.default <- function(x) {
  stop("Unexpected object passed to as.JST")
}

# Others ------------------------------------------------------------------

#' @importFrom quanteda as.tokens
#' @export
quanteda::as.tokens

#' Convert back a dfm to a tokens object
#'
#' @param x [quanteda::dfm] to be coerced
#' @param concatenator only used for consistency with the generic
#' @param tokens optionally, the tokens from which the dfm was created.
#'   Providing the initial tokens will ensure that the word order will be
#'   respected in the coerced object.
#' @param ignore_list a character vector of words that should not be removed
#'   from the initial tokens object. Useful to avoid removing some lexicon word
#'   following the usage of [quanteda::dfm_trim()].
#' @param padding if `TRUE`, leaves an empty string where the removed tokens
#'   previously existed. The use of padding is encouraged to improve the
#'   behavior of the coherence metrics (see [coherence()]) that rely on word
#'   positions.
#' @param case_insensitive only used when the `tokens` argument is provided.
#'   Default to `FALSE`. This function removes words in the initial [tokens]
#'   based on the remaining features in the [quanteda::dfm] object. This check is
#'   case-sensitive by default, and can be relaxed by setting this argument to
#'   `TRUE`.
#' @param ... unused
#'
#' @return a quanteda [quanteda::tokens] object.
#' @seealso [quanteda::as.tokens()] [quanteda::dfm()]
#' @export
#' @examples
#' library("quanteda")
#' dfm <- dfm(ECB_press_conferences_tokens, tolower = FALSE)
#' dfm <- dfm_trim(dfm, min_termfreq = 200)
#' as.tokens(dfm)
#' as.tokens(dfm, tokens = ECB_press_conferences_tokens)
#' as.tokens(dfm, tokens = ECB_press_conferences_tokens, padding = FALSE)
as.tokens.dfm <- function(x, concatenator = NULL, tokens = NULL, ignore_list = NULL, case_insensitive = FALSE, padding = TRUE, ...) {
  if (!is.null(tokens)) {
    if (!is.null(ignore_list)) keep <- c(dimnames(x)$features, ignore_list) else
      keep <- dimnames(x)$features
    ntypes <- length(dimnames(x)$features)
    ## force removal of padding in the dfm object
    if (!padding) keep <- setdiff(keep, "")
    res <- quanteda::as.tokens(
      quanteda::tokens_select(tokens,
                              keep,
                              selection = "keep",
                              case_insensitive = case_insensitive,
                              padding = padding
      )
    )
    if (length(quanteda::types(res)) + 1 < ntypes) warning("The returned tokens object has less types than the number of column in the dfm input. This could indicate that:\n\t1. The input dfm was lowercase, unlike the provided tokens object.\n\t2. The dfm does not originate from the tokens object.\nTo solve the first problem, consider using the argument `case_insensitive = TRUE` or using the function `quanteda::tokens_tolower()`")
    res
  } else {
    # quanteda::as.tokens(apply(quanteda::dfm_remove(x, ""), 1, function(x) rep(names(x), times = x)))

    if (min(x) < 0) stop("Dfm input should not contain negative values")


    #faster
    # tmp <- quanteda::convert(quanteda::t(x), to = "tripletlist")
    # word <- rep(tmp$document, times = tmp$frequency)
    # ff <- factor(rep(tmp$feature, times = tmp$frequency), rownames(x))
    # res <- as.tokens(split(word, ff))
    # attr(res, "docvars") <- attr(x, "docvars")
    # res

    # R CMD check
    feature <- frequency <- document <- NULL

    vocab <- colnames(x)
    docs <- rownames(x)
    colnames(x) <- seq_len(ncol(x))
    quanteda::docnames(x) <- seq_len(nrow(x))
    tmp <- quanteda::convert(x, to = "tripletlist")
    tmp <- lapply(tmp, as.integer)
    data.table::setDT(tmp)

    missing <- setdiff(seq_len(nrow(x)), unique(tmp$document))

    if (length(missing) > 0) {
      tmp <- rbind(
        tmp,
        data.table::data.table(
          document = missing,
          feature = 1L,
          frequency = 0L
        ))
    }

    toks <- tmp[, list(toks = list(rep(feature, times = frequency))), by = document]
    toks <- toks[order(document)]

    build_tokens <- get("build_tokens", envir = getNamespace("quanteda"))
    make_docvars <- get("make_docvars", envir = getNamespace("quanteda"))

    res <- build_tokens(
      x = toks$toks,
      types = vocab,
      docvars = make_docvars(nrow(toks), docname = docs)
    )

    attr(res, "docvars") <- attr(x, "docvars")
    res
  }
}

Try the sentopics package in your browser

Any scripts or data that you put into this service are public.

sentopics documentation built on Sept. 20, 2024, 5:06 p.m.