R/generators.R

Defines functions vocabFromList distToGenerative generatePartialLexicon generateDocuments generateVocab

generateVocab <- function(nTopics = 5, nSentiments = 3, nWords = 5, nCommonWords = 2, betaDirichlet = 0, hierarchy = c("rJST", "JST")) {
  if (betaDirichlet <= 0) FLAG <- TRUE else FLAG <- FALSE

  hierarchy <- match.arg(hierarchy)
  if (hierarchy == "rJST") reversed <- TRUE else reversed <- FALSE

  vocab <- vector()
  for (i in 1:nTopics) {
    topicVocab <- rep(paste0("topic", i), nWords * nSentiments)
    for (j in 1:nSentiments) {
      # print((i-1)*nWords*nSentiments+(1:nWords)+(nWords*(j-1)))
      topicVocab[(1:nWords) + (nWords * (j - 1))] <- paste0(topicVocab[(1:nWords) + (nWords * (j - 1))], "sent", j, "w", 1:nWords)
    }
    vocab <- c(vocab, topicVocab)
  }
  vocab

  if (nCommonWords > 0) {
    topicVocab <- as.vector(sapply(1:nTopics, function(x) paste0("topic", x, "w", 1:nCommonWords)))

    vocab <- c(vocab, topicVocab)

    topicVocab <- as.vector(sapply(1:nSentiments, function(x) paste0("sentiment", x, "w", 1:nCommonWords)))
    vocab <- c(vocab, topicVocab)

  }

  betaa <- array(0, dim = c(length(vocab), nTopics, nSentiments))
  for (i in 1:nTopics) {
    for (j in 1:nSentiments) {
      # print(((j-1)*nWords):(j*nWords))
      # print(((i-1)*nWords*nSentiments) + ((j-1)*nWords+1):(j*nWords))
      if (FLAG) {
        betaa[ c( ((i - 1) * nWords * nSentiments) + ((j - 1) * nWords + 1):(j * nWords),
                  seq(nTopics * nSentiments * nWords + 1 + (i - 1) * nCommonWords, length.out = nCommonWords),
                  seq(nTopics * nSentiments * nWords + nTopics * nCommonWords + 1 + (j - 1) * nCommonWords, length.out = nCommonWords))
               , i, j] <- rep(1/(nWords + nCommonWords * 2), nWords + nCommonWords * 2)
      } else {
        betaa[ c( ((i - 1) * nWords * nSentiments) + ((j - 1) * nWords + 1):(j * nWords),
                  seq(nTopics * nSentiments * nWords + 1 + (i - 1) * nCommonWords, length.out = nCommonWords),
                  seq(nTopics * nSentiments * nWords + nTopics * nCommonWords + 1 + (j - 1) * nCommonWords, length.out = nCommonWords))
               , i, j] <- rdirichlet(1, rep(betaDirichlet, nWords + nCommonWords * 2))
      }
    }
    # betaa[(1:(nWords*nSentiments))+(nWords*nSentiments*(i-1)),i,] <- MCMCpack::rdirichlet(1,rep(1,nWords*nSentiments))
  }

  # nTopics*nSentiments

  dimnames(betaa) <- list(word = vocab, topics = paste0("topic", 1:nTopics), sent = paste0("sent", 1:nSentiments))
  colSums(betaa)

  rowSums(betaa, dims = 2) ## topic wise
  rowSums(aperm(betaa, c(1, 3, 2)), dims = 2) ## sentiment wise

  if (reversed) betaa <- aperm(betaa, c(1,3,2))

  betaa
}

generateDocuments <- function(vocab, nDocs = 100, L1prior = 1, L2prior = 5, nWords = 400, nClass = 1) {
  if (length(L1prior) == 1) {
    L1priorMatrix <- matrix(0, dim(vocab)[3], nClass)
    for (c in 1:nClass) {
      L1priorMatrix[, c] <- rep(L1prior, dim(vocab)[3])
    }
  } else if (length(L1prior) == nClass) {
    L1priorMatrix <- matrix(0, dim(vocab)[3], nClass)
    for (c in 1:nClass) {
      L1priorMatrix[, c] <- rep(L1prior[c], dim(vocab)[3])
    }
  } else if (length(L1prior) == nClass * dim(vocab)[3]) {
    L1priorMatrix <- L1prior
  } else {
    stop("Please provide a valid L1prior: either an unique value, an unique value per class, or a complete matrix T x C")
  }

  if (length(L2prior) == 1) {
    L2priorArray <- array(0, dim = c(dim(vocab)[3:2], nClass))
    for (c in 1:nClass) {
      L2priorArray[, , c] <- rep(L2prior, dim(vocab)[3] * dim(vocab)[2])
    }
  } else if (length(L2prior) == nClass) {
    L2priorArray <- array(0, dim = c(dim(vocab)[3:2], nClass))
    for (c in 1:nClass) {
      L2priorArray[, , c] <- rep(L2prior[c], dim(vocab)[3] * dim(vocab)[2])
    }
  } else if (length(L2prior) == nClass * dim(vocab)[3] * dim(vocab)[2]) {
    L2priorArray <- array(L2prior, dim = c(dim(vocab)[3:2], nClass))
  } else {
    stop("Please provide a valid L2prior: either an unique value, an unique value per class, or a complete array T x S x C")
  }

  docs <- vector(mode = "list", nDocs)
  docLengths <- ceiling(stats::rnorm(nDocs, nWords, sd = nWords/5)) ## Vary document length
  docLengths[docLengths < 1] <- 1
  classVector <- rep(0, nDocs)
  pi <- matrix(0, dim(vocab)[3], dim(vocab)[2])
  S <- dim(vocab)[2]
  V <- dimnames(vocab)[[1]]
  for (i in 1:nDocs) {
    c <- sample(1:nClass, 1)
    classVector[i] <- c

    theta <- rdirichlet(1, L1priorMatrix[, c])
    pi <- t(sapply(1:dim(vocab)[3], function(x) rdirichlet(1, L2priorArray[x, , c])))
    dim(pi) <- c(dim(vocab)[3:2]) ## otherwise error when S = 1
    # pi <- MCMCpack::rdirichlet(length(theta),L2prior)    ### error generation
    docs[[i]] <- vector("integer", docLengths[i])
    z <- sample.int(dim(vocab)[3], docLengths[i], prob = theta, replace = TRUE)
    for (j in 1:docLengths[i]) {
      # z <- sample(1:dim(vocab)[2], 1, prob = theta)
      zj <- z[j]
      l <- sample.int(S, 1L, prob = pi[zj, ], replace = TRUE)
      docs[[i]][j] <- sample(V, 1L, prob = vocab[, l, zj], replace = TRUE)
    }
  }
  docs <- quanteda::as.tokens(docs)
  # docs <- sapply(docs, paste0, collapse = " ")
  # Encoding(docs) <- "UTF-8"
  quanteda::docvars(docs, "class") <- classVector
  docs
}


generatePartialLexicon <- function(toks, Sindex = c(1,3)) {

  ## CMD check
  word <- value <- NULL
  
  vocab <- data.table::data.table(word = c(attr(toks, "types")))
  lex <- vocab[grep(paste0("sentiment[",paste0(Sindex, collapse = ""),"]"), word)]
  lex$value <- 1
  lex[grep("sentiment1", word), "value" := -1][]
  if (2 %in% Sindex & 3 %in% Sindex) lex[grep("sentiment2", word), "value" := 0][]

  quanteda::dictionary(list("negative" = lex[value == -1, word],
                            "positive" = lex[value == 1, word]))
}


distToGenerative <- function(x, generativeVocabulary, plot = FALSE, full = FALSE, method = "invariantEuclidean") {
  if (!inherits(x, "multiChains")) stop("Only valid for multiChains objects.")
  if (nrow(x[[1]]$vocabulary) < nrow(generativeVocabulary)) stop("Not all words of the generative vocabulary are present in the generated text.")

  x[[length(x) + 1]] <- x[[1]]
  x[[length(x)]]$phi <- generativeVocabulary[order(match(rownames(generativeVocabulary),x$vocabulary$word)),,,drop = FALSE]

  attr(x, "nChains") <- length(x)
  d <- chainsDistances(x, method, FUN_aggregate = "max")
  if (plot) {
    coord <- stats::cmdscale(stats::as.dist(d))
    plot(coord[, 1], coord[, 2], type = "n", xlab = "Coordinate 1", ylab = "Coordinate 2", main = paste0("Distance to true vocabulary"))
    graphics::text(coord[-attr(x, "nChains"), 1], coord[-attr(x, "nChains"), 2], rownames(coord), cex = 0.8)

    graphics::points(coord[attr(x, "nChains"), 1], coord[attr(x, "nChains"), 2], cex = 2, pch = 1, lwd = 4, col = "gray55")
    graphics::abline(v = coord[attr(x, "nChains"), 1], h = coord[attr(x, "nChains"), 2], col = "gray75")
  } else {
    if (full) {
      d
    } else {
      d[nrow(d), -nrow(d)]
    }
  }
}

#' Generate vocabulary from list of words
#'
#' @param list list of words. First list level determine the topic. Second list
#'   level indicate the sentiment
#'
#' @return an array usable in `generateDocuments`
#'
#' @examples
#' sentopics:::vocabFromList(unname(list(
#' topic1 = list(sent1 = c("tasty", "delicious"), sent2 = c("bad", "smelly")),
#' topic2 = list(sent1 = c("surprising", "amazing"), sent2 = c("boring", "annoying")),
#' topic3 = list(sent1 = "fearless", sent2 = "coward")
#' )))
#' sentopics:::vocabFromList(lapply(1:3, function(i) c(paste0("topic",i,"w1"))))
#' @keywords internal
#' @noRd
vocabFromList <- function(list) {

  index <- unique(unlist(list, use.names = FALSE))
  n <- length(index)
  S <- sum(sapply(list, is.list))
  if (any(sapply(list, is.list))) {
    S <- max(sapply(list, function(x) {
      ifelse(is.list(x), length(x), 1)
    }))
  } else S = 0
  if (S == 0) S <- 1
  K <- length(list)

  vocab <- array(0, dim = c(n, S, K))
  rownames(vocab) <- index
  if (S == 1) {
    for (i in seq_along(list)) {
      match <- index %in% list[[i]]
      vocab[match, 1, i] <- 1/sum(match)
    }
  } else {
    list <- unlist(list, recursive = FALSE, use.names = FALSE)
    dim(list) <- c(S, K)
    for (i in 1:K) {
      for (j in 1:S) {
        match <- index %in% list[[j, i]]
        vocab[match, j, i] <- 1/sum(match)
      }
    }
  }

  dimnames(vocab) <- list(word = index,
                          sent = paste0("sent", 1:S),
                          topics = paste0("topic", 1:K))

  vocab
}

Try the sentopics package in your browser

Any scripts or data that you put into this service are public.

sentopics documentation built on May 31, 2023, 8:26 p.m.