# R/RcppExports.R
#
# Defines functions count_kmers binary_enc
#
# Documented in binary_enc count_kmers

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Binary-encode a character vector against a set of keys
#'
#' Builds a binary matrix for categorical data by encoding \code{x} against
#' the supplied \code{keys}.
#'
#' Each column of the result stands for one entry of \code{keys}. The columns
#' keep the order of \code{keys}, and the keys themselves can be read back from
#' the \code{colnames} of the matrix. The entries of \code{keys} must be
#' unique. \code{fit_logistic_signature} relies on this function.
#'
#' @param x The vector to be encoded.
#' @param keys The values to encode into; must not contain duplicates.
#' @param as_equal If \code{TRUE}, a key is compared with each string for
#'   equality. If \code{FALSE} (the default), the key is looked for as a
#'   substring of each element of \code{x}.
#' @examples
#'  binary_enc(c('foo', 'bar', 'baz', 'bazinga'), c('foo', 'baz'))
#'  binary_enc(c('foo', 'bar', 'baz', 'bazinga'), c('foo', 'baz'), as_equal=TRUE)
binary_enc <- function(x, keys, as_equal = FALSE) {
  # Thin wrapper: all work happens in the compiled routine registered by Rcpp.
  .Call(
    "_contextendR_binary_enc",
    x,
    keys,
    as_equal,
    PACKAGE = "contextendR"
  )
}

#' Count kmers
#'
#' Counts how many sequences contain a given kmer, rather than how many times
#' the kmer occurs overall. For example, given the sequences
#' \code{c("AAAAAA", "AGCAGC", "TTAAGG")}, the kmer "AAA" is detected only
#' once: it occurs several times within the first sequence, but the two other
#' sequences do not contain any "AAA" kmer, so the function returns a count
#' of 1 for it.
#'
#' @param sequences A vector of strings (DNA sequences) of length `n`.
#' @param kmer_size Integer. Size of the kmers to use for the analysis.
#' @export
count_kmers <- function(sequences, kmer_size) {
  # Thin wrapper: all work happens in the compiled routine registered by Rcpp.
  .Call(
    "_contextendR_count_kmers",
    sequences,
    kmer_size,
    PACKAGE = "contextendR"
  )
}
# lindberg-m/contextendR documentation built on Jan. 8, 2022, 3:16 a.m.