R/RcppExports.R

Defines functions unsqueezed_index_to_str squeezed_index_to_str unsqueezed_mers squeezed_mers genome_to_libsvm kmers reverse_complement

Documented in genome_to_libsvm kmers reverse_complement squeezed_index_to_str squeezed_mers unsqueezed_index_to_str unsqueezed_mers

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Reverse complement of a DNA string
#'
#' Thin R wrapper: forwards `dna` unchanged to the compiled routine
#' registered as `_MIC_reverse_complement`.
#'
#' @param dna DNA string
#' @return reverse complement of the DNA string
#' @export
#' @examples
#' reverse_complement("ATCG")
reverse_complement <- function(dna) {
  .Call(
    `_MIC_reverse_complement`,
    dna
  )
}

#' Generates genome kmers
#'
#' Thin R wrapper: forwards every argument, in declaration order, to the
#' compiled routine registered as `_MIC_kmers`.
#'
#' @param x genome in string format
#' @param k kmer length
#' @param simplify if `TRUE`, returns a numeric vector of kmer counts
#' without the associated kmer strings. This saves memory, but should
#' always be combined with `anchor = TRUE`.
#' @param canonical only record canonical kmers
#' (i.e., the lexicographically smaller of a kmer and its reverse complement)
#' @param squeeze remove non-canonical kmers
#' @param anchor include unobserved kmers (with counts of 0).
#' This is useful when generating a dense matrix in which the kmers of
#' different genomes align.
#' @param clean_up only include valid bases (ACTG) in kmer counts
#' (excludes non-coding results such as N)
#' @param key_as_int return the kmer index (as "kmer_index")
#' rather than the full kmer string. Useful for index-coded data structures
#' such as libsvm.
#' @param starting_index the starting index; only used if
#' `key_as_int = TRUE`.
#' @return list of kmer values, either as a list of a single vector
#' (if `simplify = TRUE`), or as a named list containing "kmer_string" and
#' "kmer_value".
#' @export
#' @examples
#' kmers("ATCGCAGT")
kmers <- function(x,
                  k = 3L,
                  simplify = FALSE,
                  canonical = TRUE,
                  squeeze = FALSE,
                  anchor = TRUE,
                  clean_up = TRUE,
                  key_as_int = FALSE,
                  starting_index = 1L) {
  # Arguments are passed positionally; the order must match the signature
  # of the compiled entry point.
  .Call(
    `_MIC_kmers`,
    x, k, simplify, canonical, squeeze,
    anchor, clean_up, key_as_int, starting_index
  )
}

#' Converts a genome to kmers stored in libsvm format on disk
#'
#' @param x genome in string format
#' @param target_path path to store libsvm file (.txt)
#' @param label libsvm label
#' @param k kmer length
#' @param canonical only record canonical kmers
#' (i.e., the lexicographically smaller of a kmer and its reverse complement)
#' @param squeeze remove non-canonical kmers
#' @return boolean indicating success
#' @description
#' Converts a single genome to a libsvm file containing kmer counts.
#' The libsvm format will be as follows:
#'
#' \preformatted{
#'   label 1:count 2:count 3:count ...
#' }
#' The label is optional and defaults to 0. The kmer counts are indexed by
#' the kmer index, which is the lexicographically sorted index of the kmer.
#' Libsvm is a sparse format.
#'
#' This R function is a thin wrapper: it forwards its arguments, in
#' declaration order, to the compiled routine registered as
#' `_MIC_genome_to_libsvm`.
#'
#' @seealso
#' For multiple genomes in a directory, processed in parallel, see
#' [genomes_to_kmer_libsvm()]
#'
#' For more details on libsvm format, see
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}
#'
#' @export
#' @examples
#' temp_libsvm_path <- tempfile(fileext = ".txt")
#' genome_to_libsvm("ATCGCAGT", temp_libsvm_path)
#' readLines(temp_libsvm_path)
genome_to_libsvm <- function(x,
                             target_path,
                             label = as.character(c("0")),
                             k = 3L,
                             canonical = TRUE,
                             squeeze = FALSE) {
  # Arguments are passed positionally; the order must match the signature
  # of the compiled entry point.
  .Call(
    `_MIC_genome_to_libsvm`,
    x, target_path, label,
    k, canonical, squeeze
  )
}

#' Generates all permutations of squeezed kmers
#'
#' Thin R wrapper: forwards `k` to the compiled routine registered as
#' `_MIC_squeezed_mers`.
#'
#' @param k kmer length
#' @return vector of squeezed kmers
#' @export
#' @examples
#' squeezed_mers(3)
squeezed_mers <- function(k = 3L) {
  .Call(
    `_MIC_squeezed_mers`,
    k
  )
}

#' Generates all permutations of unsqueezed kmers
#'
#' Thin R wrapper: forwards `k` to the compiled routine registered as
#' `_MIC_unsqueezed_mers`.
#'
#' @param k kmer length
#' @return vector of unsqueezed kmers
#' @export
#' @examples
#' unsqueezed_mers(3)
unsqueezed_mers <- function(k = 3L) {
  .Call(
    `_MIC_unsqueezed_mers`,
    k
  )
}

#' Get the string form of squeezed kmers from their indices
#'
#' Thin R wrapper: forwards `x`, `k` and `starting_index`, in that order,
#' to the compiled routine registered as `_MIC_squeezed_index_to_str`.
#'
#' @param x integer vector of kmer indices
#' @param k kmer length
#' @param starting_index starting index (libsvm is usually indexed
#' starting at 1)
#' @return vector of squeezed kmer strings
#' @export
#' @examples
#' squeezed_index_to_str(2, k = 3)
squeezed_index_to_str <- function(x,
                                  k,
                                  starting_index = 1L) {
  .Call(
    `_MIC_squeezed_index_to_str`,
    x, k, starting_index
  )
}

#' Get the string form of unsqueezed kmers from their indices
#'
#' Thin R wrapper: forwards `x`, `k` and `starting_index`, in that order,
#' to the compiled routine registered as `_MIC_unsqueezed_index_to_str`.
#'
#' @param x integer vector of kmer indices
#' @param k kmer length
#' @param starting_index starting index (libsvm is usually indexed
#' starting at 1)
#' @return vector of unsqueezed kmer strings
#' @export
#' @examples
#' unsqueezed_index_to_str(2, k = 3)
unsqueezed_index_to_str <- function(x,
                                    k,
                                    starting_index = 1L) {
  .Call(
    `_MIC_unsqueezed_index_to_str`,
    x, k, starting_index
  )
}

Try the MIC package in your browser

Any scripts or data that you put into this service are public.

MIC documentation built on April 12, 2025, 2:26 a.m.