R/generators.R
In sentopics: Tools for Joint Sentiment and Topic Analysis of Textual Data

generateVocab <- function(nTopics = 5, nSentiments = 3, nWords = 5, nCommonWords = 2, betaDirichlet = 0, hierarchy = c("rJST", "JST")) {
  if (betaDirichlet <= 0) FLAG <- TRUE else FLAG <- FALSE

  hierarchy <- match.arg(hierarchy)
  if (hierarchy == "rJST") reversed <- TRUE else reversed <- FALSE

  vocab <- vector()
  for (i in 1:nTopics) {
    topicVocab <- rep(paste0("topic", i), nWords * nSentiments)
    for (j in 1:nSentiments) {
      # print((i-1)*nWords*nSentiments+(1:nWords)+(nWords*(j-1)))
      topicVocab[(1:nWords) + (nWords * (j - 1))] <- paste0(topicVocab[(1:nWords) + (nWords * (j - 1))], "sent", j, "w", 1:nWords)
    }
    vocab <- c(vocab, topicVocab)
  }
  vocab

  if (nCommonWords > 0) {
    topicVocab <- as.vector(sapply(1:nTopics, function(x) paste0("topic", x, "w", 1:nCommonWords)))

    vocab <- c(vocab, topicVocab)

    topicVocab <- as.vector(sapply(1:nSentiments, function(x) paste0("sentiment", x, "w", 1:nCommonWords)))
    vocab <- c(vocab, topicVocab)

  }

  betaa <- array(0, dim = c(length(vocab), nTopics, nSentiments))
  for (i in 1:nTopics) {
    for (j in 1:nSentiments) {
      # print(((j-1)*nWords):(j*nWords))
      # print(((i-1)*nWords*nSentiments) + ((j-1)*nWords+1):(j*nWords))
      if (FLAG) {
        betaa[ c( ((i - 1) * nWords * nSentiments) + ((j - 1) * nWords + 1):(j * nWords),
                  seq(nTopics * nSentiments * nWords + 1 + (i - 1) * nCommonWords, length.out = nCommonWords),
                  seq(nTopics * nSentiments * nWords + nTopics * nCommonWords + 1 + (j - 1) * nCommonWords, length.out = nCommonWords))
               , i, j] <- rep(1/(nWords + nCommonWords * 2), nWords + nCommonWords * 2)
      } else {
        betaa[ c( ((i - 1) * nWords * nSentiments) + ((j - 1) * nWords + 1):(j * nWords),
                  seq(nTopics * nSentiments * nWords + 1 + (i - 1) * nCommonWords, length.out = nCommonWords),
                  seq(nTopics * nSentiments * nWords + nTopics * nCommonWords + 1 + (j - 1) * nCommonWords, length.out = nCommonWords))
               , i, j] <- rdirichlet(1, rep(betaDirichlet, nWords + nCommonWords * 2))
      }
    }
    # betaa[(1:(nWords*nSentiments))+(nWords*nSentiments*(i-1)),i,] <- MCMCpack::rdirichlet(1,rep(1,nWords*nSentiments))
  }

  # nTopics*nSentiments

  dimnames(betaa) <- list(word = vocab, topics = paste0("topic", 1:nTopics), sent = paste0("sent", 1:nSentiments))
  colSums(betaa)

  rowSums(betaa, dims = 2) ## topic wise
  rowSums(aperm(betaa, c(1, 3, 2)), dims = 2) ## sentiment wise

  if (reversed) betaa <- aperm(betaa, c(1,3,2))

  betaa
}

generateDocuments <- function(vocab, nDocs = 100, L1prior = 1, L2prior = 5, nWords = 400, nClass = 1) {
  if (length(L1prior) == 1) {
    L1priorMatrix <- matrix(0, dim(vocab)[3], nClass)
    for (c in 1:nClass) {
      L1priorMatrix[, c] <- rep(L1prior, dim(vocab)[3])
    }
  } else if (length(L1prior) == nClass) {
    L1priorMatrix <- matrix(0, dim(vocab)[3], nClass)
    for (c in 1:nClass) {
      L1priorMatrix[, c] <- rep(L1prior[c], dim(vocab)[3])
    }
  } else if (length(L1prior) == nClass * dim(vocab)[3]) {
    L1priorMatrix <- L1prior
  } else {
    stop("Please provide a valid L1prior: either an unique value, an unique value per class, or a complete matrix T x C")
  }

  if (length(L2prior) == 1) {
    L2priorArray <- array(0, dim = c(dim(vocab)[3:2], nClass))
    for (c in 1:nClass) {
      L2priorArray[, , c] <- rep(L2prior, dim(vocab)[3] * dim(vocab)[2])
    }
  } else if (length(L2prior) == nClass) {
    L2priorArray <- array(0, dim = c(dim(vocab)[3:2], nClass))
    for (c in 1:nClass) {
      L2priorArray[, , c] <- rep(L2prior[c], dim(vocab)[3] * dim(vocab)[2])
    }
  } else if (length(L2prior) == nClass * dim(vocab)[3] * dim(vocab)[2]) {
    L2priorArray <- array(L2prior, dim = c(dim(vocab)[3:2], nClass))
  } else {
    stop("Please provide a valid L2prior: either an unique value, an unique value per class, or a complete array T x S x C")
  }

  docs <- vector(mode = "list", nDocs)
  docLengths <- ceiling(stats::rnorm(nDocs, nWords, sd = nWords/5)) ## Vary document length
  docLengths[docLengths < 1] <- 1
  classVector <- rep(0, nDocs)
  pi <- matrix(0, dim(vocab)[3], dim(vocab)[2])
  S <- dim(vocab)[2]
  V <- dimnames(vocab)[[1]]
  for (i in 1:nDocs) {
    c <- sample(1:nClass, 1)
    classVector[i] <- c

    theta <- rdirichlet(1, L1priorMatrix[, c])
    pi <- t(sapply(1:dim(vocab)[3], function(x) rdirichlet(1, L2priorArray[x, , c])))
    dim(pi) <- c(dim(vocab)[3:2]) ## otherwise error when S = 1
    # pi <- MCMCpack::rdirichlet(length(theta),L2prior)    ### error generation
    docs[[i]] <- vector("integer", docLengths[i])
    z <- sample.int(dim(vocab)[3], docLengths[i], prob = theta, replace = TRUE)
    for (j in 1:docLengths[i]) {
      # z <- sample(1:dim(vocab)[2], 1, prob = theta)
      zj <- z[j]
      l <- sample.int(S, 1L, prob = pi[zj, ], replace = TRUE)
      docs[[i]][j] <- sample(V, 1L, prob = vocab[, l, zj], replace = TRUE)
    }
  }
  docs <- quanteda::as.tokens(docs)
  # docs <- sapply(docs, paste0, collapse = " ")
  # Encoding(docs) <- "UTF-8"
  quanteda::docvars(docs, "class") <- classVector
  docs
}


generatePartialLexicon <- function(toks, Sindex = c(1,3)) {

  ## CMD check
  word <- value <- NULL
  
  vocab <- data.table::data.table(word = c(attr(toks, "types")))
  lex <- vocab[grep(paste0("sentiment[",paste0(Sindex, collapse = ""),"]"), word)]
  lex$value <- 1
  lex[grep("sentiment1", word), "value" := -1][]
  if (2 %in% Sindex & 3 %in% Sindex) lex[grep("sentiment2", word), "value" := 0][]

  quanteda::dictionary(list("negative" = lex[value == -1, word],
                            "positive" = lex[value == 1, word]))
}


distToGenerative <- function(x, generativeVocabulary, plot = FALSE, full = FALSE, method = "invariantEuclidean") {
  if (!inherits(x, "multiChains")) stop("Only valid for multiChains objects.")
  if (nrow(x[[1]]$vocabulary) < nrow(generativeVocabulary)) stop("Not all words of the generative vocabulary are present in the generated text.")

  x[[length(x) + 1]] <- x[[1]]
  x[[length(x)]]$phi <- generativeVocabulary[order(match(rownames(generativeVocabulary),x$vocabulary$word)),,,drop = FALSE]

  attr(x, "nChains") <- length(x)
  d <- chainsDistances(x, method, FUN_aggregate = "max")
  if (plot) {
    coord <- stats::cmdscale(stats::as.dist(d))
    plot(coord[, 1], coord[, 2], type = "n", xlab = "Coordinate 1", ylab = "Coordinate 2", main = paste0("Distance to true vocabulary"))
    graphics::text(coord[-attr(x, "nChains"), 1], coord[-attr(x, "nChains"), 2], rownames(coord), cex = 0.8)

    graphics::points(coord[attr(x, "nChains"), 1], coord[attr(x, "nChains"), 2], cex = 2, pch = 1, lwd = 4, col = "gray55")
    graphics::abline(v = coord[attr(x, "nChains"), 1], h = coord[attr(x, "nChains"), 2], col = "gray75")
  } else {
    if (full) {
      d
    } else {
      d[nrow(d), -nrow(d)]
    }
  }
}

#' Generate vocabulary from list of words
#'
#' @param list list of words. First list level determine the topic. Second list
#'   level indicate the sentiment
#'
#' @return an array usable in `generateDocuments`
#'
#' @examples
#' sentopics:::vocabFromList(unname(list(
#' topic1 = list(sent1 = c("tasty", "delicious"), sent2 = c("bad", "smelly")),
#' topic2 = list(sent1 = c("surprising", "amazing"), sent2 = c("boring", "annoying")),
#' topic3 = list(sent1 = "fearless", sent2 = "coward")
#' )))
#' sentopics:::vocabFromList(lapply(1:3, function(i) c(paste0("topic",i,"w1"))))
#' @keywords internal
#' @noRd
vocabFromList <- function(list) {

  index <- unique(unlist(list, use.names = FALSE))
  n <- length(index)
  S <- sum(sapply(list, is.list))
  if (any(sapply(list, is.list))) {
    S <- max(sapply(list, function(x) {
      ifelse(is.list(x), length(x), 1)
    }))
  } else S = 0
  if (S == 0) S <- 1
  K <- length(list)

  vocab <- array(0, dim = c(n, S, K))
  rownames(vocab) <- index
  if (S == 1) {
    for (i in seq_along(list)) {
      match <- index %in% list[[i]]
      vocab[match, 1, i] <- 1/sum(match)
    }
  } else {
    list <- unlist(list, recursive = FALSE, use.names = FALSE)
    dim(list) <- c(S, K)
    for (i in 1:K) {
      for (j in 1:S) {
        match <- index %in% list[[j, i]]
        vocab[match, j, i] <- 1/sum(match)
      }
    }
  }

  dimnames(vocab) <- list(word = index,
                          sent = paste0("sent", 1:S),
                          topics = paste0("topic", 1:K))

  vocab
}

Any scripts or data that you put into this service are public.

sentopics documentation built on Sept. 20, 2024, 5:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

sentopics
Tools for Joint Sentiment and Topic Analysis of Textual Data

R/generators.R
In sentopics: Tools for Joint Sentiment and Topic Analysis of Textual Data

Defines functions vocabFromList distToGenerative generatePartialLexicon generateDocuments generateVocab

Try the sentopics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sentopics Tools for Joint Sentiment and Topic Analysis of Textual Data

R/generators.R In sentopics: Tools for Joint Sentiment and Topic Analysis of Textual Data

Defines functions vocabFromList distToGenerative generatePartialLexicon generateDocuments generateVocab

Try the sentopics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sentopics
Tools for Joint Sentiment and Topic Analysis of Textual Data

R/generators.R
In sentopics: Tools for Joint Sentiment and Topic Analysis of Textual Data