R/corpus.R

Defines functions transform_symbols.corpus print.corpus decode.corpus encode.corpus is.coded.corpus `type<-.corpus` type.corpus `metadata<-.corpus` metadata.corpus num_elements.corpus num_sequences.corpus c.corpus `[[<-.corpus` `[<-.corpus` `[.corpus` as.list.corpus is.corpus corpus

Documented in corpus decode.corpus encode.corpus is.coded.corpus is.corpus num_elements.corpus num_sequences.corpus transform_symbols.corpus

#' Corpus
#'
#' Creates \code{corpus} objects,
#' which comprise ordered sets of (possibly coded) vectors,
#' each describing a sequence of discrete symbols.
#' @param x List of sequences, where each sequence is an object of class
#' \code{vec} or \code{coded_vec}.
#' @param type (Character scalar) Symbol type.
#' @param metadata (List) List of metadata information.
#' @return An object of class \code{corpus}.
#' @rdname corpus
#' @export
corpus <- function(x, type, metadata = list()) {
  checkmate::qassert(x, "l")
  checkmate::qassert(metadata, "l")
  checkmate::qassert(type, "S1")
  cl <- if (all(purrr::map_lgl(x, is.coded_vec)))
    "coded_vec" else if (all(purrr::map_lgl(x, is.vec)))
      "vec" else stop("every element of <x> must be an object of class ",
                      "'coded_vec' or 'vec'")
  if (!all(type == purrr::map_chr(x, function(y) type(y))))
    stop("not all sequences were of type '", type, "'")
  class(x) <- c("corpus", "list")
  type(x) <- type
  metadata(x) <- metadata
  attr(x, "coded") <- cl == "coded_vec"
  x
}

#' @rdname corpus
#' @export
is.corpus <- function(x) {
  is(x, "corpus")
}

#' @export
as.list.corpus <- function(x, ...) {
  attributes(x) <- NULL
  x
}

#' @export
`[.corpus` <- function(x, i) {
  corpus(as.list(x)[i], type = type(x), metadata = metadata(x))
}

#' @export
`[<-.corpus` <- function(x, i, value) {
  # We perform some sanity checks before allowing the assignment
  value <- corpus(x = as.list(value), type = type(x))
  if (!(is.coded(x) == is.coded(value)))
    stop("old corpus and new value must either be both uncoded or both coded")
  value <- as.list(value)
  NextMethod("[<-.corpus")
}

#' @export
`[[<-.corpus` <- function(x, i, value) {
  x[i] <- list(value)
  x
}

#' @export
c.corpus <- function(...) {
  x <- list(...)
  types <- unique(purrr::map_chr(x, type))
  if (length(types) > 1L) stop("cannot combine corpora of different types")
  type <- types
  corpus(do.call(c, lapply(x, as.list)), type = type)
}

#' @rdname num_sequences
#' @export
num_sequences.corpus <- function(x) length(x)

#' @rdname num_elements
#' @export
num_elements.corpus <- function(x) {
  sum(vapply(x, num_elements, integer(1)))
}

#' @export
metadata.corpus <- function(x) attr(x, "metadata")

#' @export
`metadata<-.corpus` <- function(x, value) {
  attr(x, "metadata") <- value
  x
}

#' @export
type.corpus <- function(x) {
  attr(x, "type")
}

`type<-.corpus` <- function(x, value) {
  attr(x, "type") <- value
  x
}

#' @rdname is.coded
#' @export
is.coded.corpus <- function(x) attr(x, "coded")

#' @rdname encode
#' @export
encode.corpus <- function(x) {
  if (is.coded(x)) {
    x
  } else {
    meta <- metadata(x)
    purrr::map(x, encode) %>%
      corpus(type = type(x), metadata = metadata(x))
  }
}


#' @rdname encode
#' @export
decode.corpus <- function(x) {
  if (is.coded(x)) {
    meta <- metadata(x)
    purrr::map(x, decode) %>%
      corpus(type = type(x), metadata = metadata(x))
  } else {
    x
  }
}


#' @export
print.corpus <- function(x, ...) {
  n <- num_sequences(x)
  N <- num_elements(x)
  cat("\nA corpus of", n , ngettext(n, "sequence", "sequences"), "\n")
  cat("  total size =", N, ngettext(N, "symbol", "symbols"), "\n")
  cat("  symbol type = '", type(x), "'\n", sep = "")
  cat("  coded =", tolower(is.coded(x)), "\n")
  if (length(metadata(x)) > 0L) cat(" (Metadata available)", "\n")
  cat("\n")
}

#' @param progress (Scalar character) Progress bar type, to be passed to
#' \code{\link[plyr]{llply}}.
#' @rdname transform_symbols
#' @export
transform_symbols.corpus <- function(x, f, type,
                                     progress = if (interactive()) "text" else "none",
                                     ...) {
  stopifnot(is.function(f))
  checkmate::qassert(type, "S1")
  corpus(
    plyr::llply(x, transform_symbols, f, type = type, ..., .progress = progress),
    type = type,
    metadata = metadata(x)
  )
}
pmcharrison/hrep documentation built on Feb. 18, 2024, 2:33 a.m.