R/sparse_casters.R

Defines functions cast_dfm cast_dtm cast_tdm cast_sparse

Documented in cast_dfm cast_dtm cast_sparse cast_tdm

#' Create a sparse matrix from row names, column names, and values
#' in a table.
#'
#' This function supports non-standard evaluation through the tidyeval framework.
#'
#' @param data A tbl
#' @param row Column name to use as row names in sparse matrix, as string or symbol
#' @param column Column name to use as column names in sparse matrix, as string or symbol
#' @param value Column name to use as sparse matrix values (default 1) as string or symbol
#' @param ... Extra arguments to pass on to [sparseMatrix()]
#'
#' @return A sparse Matrix object, with one row for each unique value in
#' the `row` column, one column for each unique value in the `column`
#' column, and with as many non-zero values as there are rows in `data`.
#'
#' @details Note that cast_sparse ignores groups in a grouped tbl_df. The arguments
#' `row`, `column`, and `value` are passed by expression and support
#' [quasiquotation][rlang::quasiquotation]; you can unquote strings and symbols.
#'
#' @examples
#'
#' dat <- data.frame(a = c("row1", "row1", "row2", "row2", "row2"),
#'                   b = c("col1", "col2", "col1", "col3", "col4"),
#'                   val = 1:5)
#'
#' cast_sparse(dat, a, b)
#'
#' cast_sparse(dat, a, b, val)
#'
#' @import Matrix
#' @export

cast_sparse <- function(data, row, column, value, ...) {
  row_col <- quo_name(enquo(row))
  column_col <- quo_name(enquo(column))
  value_col <- enquo(value)
  if (quo_is_missing(value_col)) {
    value_col <- 1
  }
  data <- ungroup(data)
  data <- distinct(data, !!sym(row_col), !!sym(column_col), .keep_all = TRUE)
  row_names <- data[[row_col]]
  col_names <- data[[column_col]]
  if (is.numeric(value_col)) {
    values <- value_col
  } else {
    value_col <- quo_name(value_col)
    values <- data[[value_col]]
  }

  # if it's a factor, preserve ordering
  if (is.factor(row_names)) {
    row_u <- levels(row_names)
    i <- as.integer(row_names)
  } else {
    row_u <- unique(row_names)
    i <- match(row_names, row_u)
  }

  if (is.factor(col_names)) {
    col_u <- levels(col_names)
    j <- as.integer(col_names)
  } else {
    col_u <- unique(col_names)
    j <- match(col_names, col_u)
  }

  ret <- Matrix::sparseMatrix(
    i = i, j = j, x = values,
    dimnames = list(row_u, col_u), ...
  )

  ret
}

#' Casting a data frame to
#' a DocumentTermMatrix, TermDocumentMatrix, or dfm
#'
#' This turns a "tidy" one-term-per-document-per-row data frame into a
#' DocumentTermMatrix or TermDocumentMatrix from the tm package, or a
#' dfm from the quanteda package. These functions support non-standard
#' evaluation through the tidyeval framework. Groups are ignored.
#'
#' @param data Table with one-term-per-document-per-row
#' @param term Column containing terms as string or symbol
#' @param document Column containing document IDs as string or symbol
#' @param value Column containing values as string or symbol
#' @param weighting The weighting function for the DTM/TDM
#' (default is term-frequency, effectively unweighted)
#' @param ... Extra arguments passed on to
#' [sparseMatrix()]
#'
#' @details The arguments `term`, `document`, and `value`
#' are passed by expression and support [quasiquotation][rlang::quasiquotation];
#' you can unquote strings and symbols.
#'
#' @rdname document_term_casters
#' @export
cast_tdm <- function(data, term, document, value,
                     weighting = tm::weightTf, ...) {
  term <- quo_name(enquo(term))
  document <- quo_name(enquo(document))
  value <- quo_name(enquo(value))
  m <- cast_sparse(data, !!term, !!document, !!value, ...)
  tm::as.TermDocumentMatrix(m, weighting = weighting)
}

#' @rdname document_term_casters
#' @export
cast_dtm <- function(data, document, term, value,
                     weighting = tm::weightTf, ...) {
  document <- quo_name(enquo(document))
  term <- quo_name(enquo(term))
  value <- quo_name(enquo(value))
  m <- cast_sparse(data, !!document, !!term, !!value, ...)
  tm::as.DocumentTermMatrix(m, weighting = weighting)
}

#' @rdname document_term_casters
#' @export
cast_dfm <- function(data, document, term, value, ...) {
  document <- quo_name(enquo(document))
  term <- quo_name(enquo(term))
  value <- quo_name(enquo(value))
  m <- cast_sparse(data, !!document, !!term, !!value, ...)
  quanteda::as.dfm(m)
}

Try the tidytext package in your browser

Any scripts or data that you put into this service are public.

tidytext documentation built on Jan. 8, 2023, 1:12 a.m.