R/region_matrix.R

Defines functions ranges_to_cpos region_matrix_context region_matrix_to_count_matrix region_matrix_to_ids get_region_matrix

Documented in get_region_matrix ranges_to_cpos region_matrix_context region_matrix_to_count_matrix region_matrix_to_ids

#' Get Matrix with Regions for Strucs.
#' 
#' The return value is an integer matrix with the left and right corpus positions
#' of the strucs in columns one and two, respectively.
#' 
#' @param corpus a CWB corpus
#' @param s_attribute a structural attribute
#' @param strucs strucs
#' @param registry the registry directory
#' @rdname get_region_matrix
#' @export get_region_matrix
#' @return A matrix with integer values indicating left and right corpus positions
#' (columns 1 and 2, respectively).
#' @examples 
#' y <- get_region_matrix(
#'   corpus = "REUTERS", s_attribute = "id",
#'   strucs = 0L:5L, registry = get_tmp_registry()
#'   )
get_region_matrix <- function(corpus, s_attribute, strucs, registry = Sys.getenv("CORPUS_REGISTRY")){
  check_registry(registry)
  check_corpus(corpus, registry)
  check_strucs(corpus = corpus, s_attribute = s_attribute, strucs = strucs, registry = registry)
  .get_region_matrix(corpus = corpus, s_attribute = s_attribute, strucs = strucs, registry = registry)
}


#' Get IDs and Counts for Region Matrices.
#' 
#' @param corpus a CWB corpus
#' @param p_attribute a positional attribute
#' @param registry registry directory
#' @param matrix a regions matrix
#' @rdname region_matrix_ops
#' @name region_matrix_ops
#' @export region_matrix_to_ids
#' @examples
#' # Scenario 1: Get full text for a subcorpus defined by regions
#' m <- get_region_matrix(
#'   corpus = "REUTERS", s_attribute = "places",
#'   strucs = 4L:5L, registry = get_tmp_registry()
#'   )
#' ids <- region_matrix_to_ids(
#'   corpus = "REUTERS", p_attribute = "word",
#'   registry = get_tmp_registry(), matrix = m
#'   )
#' tokenstream <- cl_id2str(
#'   corpus = "REUTERS", p_attribute = "word",
#'   registry = get_tmp_registry(), id = ids
#'   )
#' txt <- paste(tokenstream, collapse = " ")
#' txt
#' 
#' # Scenario 2: Get data.frame with counts for region matrix
#' y <- region_matrix_to_count_matrix(
#'   corpus = "REUTERS", p_attribute = "word",
#'   registry = get_tmp_registry(), matrix = m
#'   )
#' df <- as.data.frame(y)
#' colnames(df) <- c("token_id", "count")
#' df[["token"]] <- cl_id2str(
#'   "REUTERS", p_attribute = "word",
#'   registry = get_tmp_registry(), id = df[["token_id"]]
#'   )
#' df[order(df[["count"]], decreasing = TRUE),]
#' head(df)
region_matrix_to_ids <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), matrix){
  check_registry(registry)
  check_corpus(corpus, registry)
  check_p_attribute(p_attribute = p_attribute, corpus = corpus, registry = registry)
  check_region_matrix(region_matrix = matrix)
  .region_matrix_to_ids(corpus = corpus, p_attribute = p_attribute, registry = registry, matrix = matrix)
}


#' @rdname region_matrix_ops
#' @export region_matrix_to_count_matrix
region_matrix_to_count_matrix <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), matrix){
  check_registry(registry)
  check_corpus(corpus, registry)
  check_p_attribute(p_attribute = p_attribute, corpus = corpus, registry = registry)
  stopifnot(is.matrix(matrix))
  .region_matrix_to_count_matrix(corpus = corpus, p_attribute = p_attribute, registry = registry, matrix = matrix)
}

#' @rdname region_matrix_ops
#' @param s_attribute If not `NULL`, a structural attribute (length-one
#'   `character` vector), typically indicating a sentence ("s").
#' @param boundary Structural attribute (length-one `character` vector) that
#'   serves as a boundary and that shall not be transgressed.
#' @param left An `integer` value, number of strucs to move to the left.
#' @param right An `integer` value, number of strucs to move to the right.
#' @export
region_matrix_context <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY"), matrix, p_attribute, s_attribute, boundary, left, right){
  
  if (!is.null(s_attribute)){
    check_s_attribute(s_attribute = s_attribute, corpus = corpus, registry = registry)
  }
  
  if (!is.null(boundary)){
    check_s_attribute(s_attribute = boundary, corpus = corpus, registry = registry)
  }

  .region_matrix_context(
    corpus = corpus,
    registry = registry,
    region_matrix = matrix,
    p_attribute = p_attribute,
    s_attribute = s_attribute,
    boundary = boundary,
    left = as.integer(left),
    right = as.integer(right)
  )
}

#' @details `ranges_to_cpos()` will turn a `matrix` of ranges into an `integer` 
#'   vector with the individual corpus positions covered by the ranges.
#' @rdname region_matrix_ops
#' @param ranges A two-column integer `matrix` of ranges (left and right corpus
#'   positions in first and second column, respectively).
#' @export
ranges_to_cpos <- function(ranges){
  stopifnot(
    is.integer(ranges),
    is.matrix(ranges),
    all(ranges[,2] >= ranges[,1])
  )
  if (nrow(ranges) == 0L) return(integer())
  if (ncol(ranges) != 2L){
    warning(
      "ranges_to_cpos() requires two-column integer matrix as input ",
      sprintf("but input matrix has %d columns!", ncol(ranges))
    )
  }
  .ranges_to_cpos(ranges)
}

Try the RcppCWB package in your browser

Any scripts or data that you put into this service are public.

RcppCWB documentation built on July 9, 2023, 7:40 p.m.