R/tidy-methods.R

Defines functions augment.tidylda tidy.matrix tidy.tidylda glance.tidylda

Documented in augment.tidylda glance.tidylda tidy.matrix tidy.tidylda

# necessary to pass R CMD check with tidy select
utils::globalVariables(c("topic", "lambda", "idx1"))



#' Glance method for \code{tidylda} objects
#' @description
#'   \code{glance} constructs a single-row summary "glance" of a \code{tidylda}
#'   topic model.
#' @param x an object of class \code{tidylda}
#' @param ... other arguments passed to methods,currently not used
#' @return
#'   \code{glance} returns a one-row \code{\link[tibble]{tibble}} with the
#'   following columns:
#'
#'   \code{num_topics}: the number of topics in the model
#'   \code{num_documents}: the number of documents used for fitting
#'   \code{num_tokens}: the number of tokens covered by the model
#'   \code{iterations}: number of total Gibbs iterations run
#'   \code{burnin}: number of burn-in Gibbs iterations run
#' @examples
#' \donttest{
#' dtm <- nih_sample_dtm
#'
#' lda <- tidylda(data = dtm, k = 10, iterations = 100, burnin = 75)
#'
#' glance(lda)
#' }
#' @export
glance.tidylda <- function(x, ...) {
  
  # get some objects to return
  if (inherits(x$call, "call")) {
    call_params <- as.list(x$call)
    
    if (!"burnin" %in% names(call_params)) {
      call_params$burnin <- NA
    }
  } else {
    call_params <- list(
      iterations = NA,
      burnin = NA
    )
  }
  
  out <- data.frame(
    num_topics = nrow(x$beta),
    num_documents = nrow(x$theta),
    num_tokens = ncol(x$beta),
    iterations = call_params$iterations,
    burnin = call_params$burnin,
    stringsAsFactors = FALSE
  )
  
  tibble::as_tibble(out)
}

#' Tidy a matrix from a \code{tidylda} topic model
#' @description
#' Tidy the result of a \code{tidylda} topic model
#' @param x an object of class \code{tidylda} or an individual \code{beta}, 
#'   \code{theta}, or \code{lambda} matrix.
#' @param matrix the matrix to tidy; one of \code{'beta'}, \code{'theta'}, or
#'   \code{'lambda'}
#' @param log do you want to have the result on a log scale? Defaults to \code{FALSE}
#' @param ... other arguments passed to methods,currently not used
#' @return
#'   Returns a \code{\link[tibble]{tibble}}.
#'
#'   If \code{matrix = "beta"} then the result is a table of one row per topic
#'   and token with the following columns: \code{topic}, \code{token}, \code{beta}
#'
#'   If \code{matrix = "theta"} then the result is a table of one row per document
#'   and topic with the following columns: \code{document}, \code{topic}, \code{theta}
#'
#'   If \code{matrix = "lambda"} then the result is a table of one row per topic
#'   and token with the following columns: \code{topic}, \code{token}, \code{lambda}
#' @note
#'   If \code{log = TRUE} then "log_" will be appended to the name of the third
#'   column of the resulting table. e.g "\code{beta}" becomes "\code{log_beta}".
#' @examples
#' \donttest{
#' dtm <- nih_sample_dtm
#'
#' lda <- tidylda(data = dtm, k = 10, iterations = 100, burnin = 75)
#'
#' tidy_beta <- tidy(lda, matrix = "beta")
#'
#' tidy_theta <- tidy(lda, matrix = "theta")
#'
#' tidy_lambda <- tidy(lda, matrix = "lambda")
#' }
#' @export
tidy.tidylda <- function(x, matrix, log = FALSE, ...) {
  
  if (!inherits(matrix, "character") |
      !sum(c("beta", "theta", "lambda") %in% matrix) >= 1) {
    stop("matrix should be one of c('beta', 'theta', 'lambda')")
  }
  
  if (matrix == "beta") {
    out <- tidy.matrix(x = x$beta, matrix = matrix, log = log)
  } else if (matrix == "lambda") {
    out <- tidy.matrix(x = x$lambda, matrix = matrix, log = log)
  } else {
    out <- tidy.matrix(x = x$theta, matrix = matrix, log = log)
  }
  
  out
}

#' Tidy an individual matrix. Useful for predictions and called from tidy.tidylda
#' @describeIn tidy.tidylda Tidy an individual matrix.
#'   Useful for predictions and called from tidy.tidylda
#' @export
tidy.matrix <- function(x, matrix, log = FALSE, ...) {
  
  # check inputs
  if (!inherits(matrix, "character") |
      !sum(c("beta", "theta", "lambda") %in% matrix) >= 1) {
    stop("matrix should be one of c('beta', 'theta', 'lambda')")
  }
  
  if (!is.logical(log)) {
    stop("log must be logical.")
  }
  
  out <- as.data.frame(x, stringsAsFactors = FALSE)
  
  out$names_col <- rownames(x)
  
  out <- tidyr::pivot_longer(
    data = out, cols = setdiff(colnames(out), "names_col"),
    names_to = "index", values_to = "value"
  )
  
  if (matrix == "beta") {
    
    colnames(out) <- c("topic", "token", "beta")
    
    out$topic <- as.numeric(out$topic)
    
  } else if (matrix == "lambda") {
    
    colnames(out) <- c("topic", "token", "lambda")
    
    out$topic <- as.numeric(out$topic)
    
  } else { # meanse matrix  == theta
    
    colnames(out) <- c("document", "topic", "theta")
    
    out$topic <- as.numeric(stringr::str_replace_all(out$topic, "^X", ""))
    
  }
  
  if (log) {
    out[[3]] <- log(out[[3]])
    
    names(out)[3] <- paste0("log_", names(out)[3])
  }
  
  out
  
}


#' Augment method for \code{tidylda} objects
#' @description
#'   \code{augment} appends observation level model outputs.
#' @param x an object of class \code{tidylda}
#' @param data a tidy tibble containing one row per original document-token pair, 
#'   such as is returned by \link[tidytext]{tdm_tidiers} with column names
#'   c("document", "term") at a minimum.
#' @param type one of either "class" or "prob"
#' @param document_col character specifying the name of the column that
#'   corresponds to document IDs. Defaults to \code{"document"}.
#' @param term_col character specifying the name of the column that
#'   corresponds to term/token IDs. Defaults to \code{"term"}.
#' @param ... other arguments passed to methods,currently not used
#' @return
#'   \code{augment} returns a tidy tibble containing one row per document-token
#'   pair, with one or more columns appended, depending on the value of \code{type}.
#'   
#'   If \code{type = 'prob'}, then one column per topic is appended. Its value
#'   is P(topic | document, token).
#'   
#'   If \code{type = 'class'}, then the most-probable topic for each document-token
#'   pair is returned. If multiple topics are equally probable, then the topic
#'   with the smallest index is returned by default.
#' @details 
#'   The key statistic for \code{augment} is P(topic | document, token) =
#'   P(topic | token) * P(token | document). P(topic | token) are the entries
#'   of the 'lambda' matrix in the \code{\link[tidylda]{tidylda}} object passed
#'   with \code{x}. P(token | document) is taken to be the frequency of each
#'   token normalized within each document.
#' @export
augment.tidylda <- function(
  x,
  data,
  type = c("class", "prob"),
  document_col = "document",
  term_col = "term",
  ...
) {
  
  # check inputs
  if (sum(c("class", "prob") %in% type[1]) < 1) {
    stop("type must be one of 'class' or 'prob'")
  }
  
  # If they didn't pass a data.frame or if that data.frame doesn't have the right
  # columns, then try to make a dtm , get its relative frequencies, and re-create
  # a tibble with document-token pairs
  if (! inherits(data, "data.frame")) {
    
    dtm <- try(
      convert_dtm(data),
      silent = TRUE
    )
    
    if (inherits(dtm, "try-error")) { # if that fails exit
      stop(
        "data argument must either be a data.frame or tibble with 'document' and 'term' columns ",
        "or coercible to a dgCMatrix. Supported classes are ",
        "c('Matrix', 'matrix', 'simple_triplet_matrix', 'dfm', 'DocumentTermMatrix'), ",
        "However, I see class(data) = ", class(data)
      )
    } 
    
    # get each row to sum to one
    dtm <- dtm / Matrix::rowSums(dtm)
    
    # cast as a triplet matrix
    data <- tidy_dgcmatrix(dtm)
    
    # fix column names
    document_col <- "document"
    
    term_col <- "term"
    
    colnames_data <- c(document_col, term_col)
    
    
  }else {
    if (! all(c(document_col, term_col) %in% colnames(data))) {
      stop(
        "data is a data.frame but both ",
        document_col,
        " and ",
        term_col, 
        " aren't in colnames(data)"
      )
    }
    
    # get column names of data to return later
    colnames_data <- colnames(data)
    
    # if a tidy tibble, need to get fraction of words in each document
    tmp <- 
      data %>% 
      dplyr::group_by(.data[[document_col]], .data[[term_col]]) %>%
      dplyr::summarise(n = dplyr::n()) %>%
      dplyr::mutate(count = .data$n / sum(.data$n))
    
    data <- dplyr::left_join(
      tmp,
      data
    )
    
  }
  
  tidy_lambda <- tidy(x, "lambda")
  
  tidy_lambda <- tidyr::pivot_wider(
    tidy_lambda, 
    names_from = topic, 
    values_from = lambda
  )

  result <- dplyr::right_join(
    tidy_lambda,
    data,
    by = c("token" = term_col)
  )
  
  colnames(result)[colnames(result) == "token"] <- term_col
  
  topic_names <- colnames(x$theta)
  
  result[, topic_names] <- 
    result[, topic_names] * result$count
  
  # return class or probs based on user input
  if (type[1] == "class") {
    tmp <- apply(result[, topic_names], 1,function(y) which.max(y)[1])
    
    result$topic <- tmp
    
    result <- result[, c(colnames_data, "topic")]
  } else {
    result <- result[, c(colnames_data, topic_names)]
  }
  
  tibble::as_tibble(result)  
}

Try the tidylda package in your browser

Any scripts or data that you put into this service are public.

tidylda documentation built on July 26, 2023, 5:34 p.m.