#' Returns the vocabulary of a character string
#'
#' Use this function with a single character string as input.
#'
#' @param char A character string of text (a length-one character vector).
#' @param verbose TRUE/FALSE, whether to report the vocabulary via \code{message()}.
#' @examples
#' data(crispr_sample)
#' getVocabulary(crispr_sample)
#' getVocabulary("abcd")
#' @export
getVocabulary <- function(char, verbose = FALSE) {
  stopifnot(!is.null(char))
  stopifnot(nchar(char) > 0)
  vocabulary <- sort(unique(tokenizers::tokenize_characters(
    stringr::str_c(stringr::str_to_lower(char), collapse = "\n"),
    strip_non_alphanum = FALSE, simplify = TRUE)))
  if (verbose)
    message("The vocabulary: ", paste(vocabulary, collapse = ", "))
  return(vocabulary)
}
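# Illustrative sketch of getVocabulary() on a toy DNA string. The input and expected
# output are assumptions for demonstration only; the block is wrapped in `if (FALSE)`
# so it is not evaluated when this file is sourced.
if (FALSE) {
  vocab <- getVocabulary("ACGTACGT")
  vocab  # "a" "c" "g" "t" (lower-cased, unique, sorted characters)
  getVocabulary("ACGTacgt", verbose = TRUE)
}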
#' Preprocess string to semi-redundant one-hot vectors
#'
#' @description
#' Outputs a semi-redundant set of samples from an input character string.
#' The string is collapsed, tokenized, and vectorized (one-hot encoded).
#' Use this function with a single character string as input. For example,
#' if the input text is "ABCDEFGHI" and maxlen is 5, the generated chunks are:
#' X(1): ABCDE and Y(1): F;
#' X(2): BCDEF and Y(2): G;
#' X(3): CDEFG and Y(3): H;
#' X(4): DEFGH and Y(4): I
#'
#' @param char A character string of text (a length-one character vector).
#' @param maxlen Length of the semi-redundant sequences.
#' @param vocabulary Character vector containing the vocabulary of the input.
#' If no vocabulary is supplied, it is generated from \code{char}.
#' @param verbose TRUE/FALSE, whether to report progress via messages.
#' @export
preprocessSemiRedundant <- function(char,
                                    maxlen = 250,
                                    vocabulary = c("l", "p", "a", "c", "g", "t"),
                                    verbose = FALSE) {
  stopifnot(!is.null(char))
  stopifnot(nchar(char) > 0)
  stopifnot(maxlen > 0)
  # Load, collapse, and tokenize text ("ACGT" -> "a" "c" "g" "t")
  text <- tokenizers::tokenize_characters(
    stringr::str_c(stringr::str_to_lower(char), collapse = "\n"),
    strip_non_alphanum = FALSE, simplify = TRUE)
  # Generate the vocabulary from the input char with getVocabulary() if none was supplied
  if (missing(vocabulary)) {
    if (verbose)
      message("Finding the vocabulary ...")
    vocabulary <- getVocabulary(char)
  }
  if (verbose)
    message("Vocabulary size: ", length(vocabulary))
  # Cut the text into semi-redundant sequences of maxlen characters
  if (verbose)
    message("Generation of semi-redundant sequences ...")
  dataset <- purrr::map(seq(1, length(text) - maxlen, by = 1),
                        ~ list(sentence = text[.x:(.x + maxlen - 1)],
                               next_char = text[.x + maxlen]))
  dataset <- purrr::transpose(dataset)
  x <- array(0, dim = c(length(dataset$sentence), maxlen, length(vocabulary)))
  y <- array(0, dim = c(length(dataset$sentence), length(vocabulary)))
  # Vectorization
  if (verbose)
    message("Vectorization ...")
  if (verbose)
    pb <- utils::txtProgressBar(min = 0,
                                max = length(dataset$sentence),
                                style = 3)
  for (i in seq_along(dataset$sentence)) {
    if (verbose)
      utils::setTxtProgressBar(pb, i)
    # one-hot encoding of the i-th subsequence (maxlen x vocabulary matrix)
    x[i, , ] <- sapply(vocabulary, function(voc_char) {
      as.integer(voc_char == dataset$sentence[[i]])
    })
    # target (next character in the sequence)
    y[i, ] <- as.integer(vocabulary == dataset$next_char[[i]])
  }
  if (verbose)
    close(pb)
  results <- list("X" = x, "Y" = y)
  return(results)
}
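# Illustrative sketch of the semi-redundant chunking described above, using an assumed
# toy 9-character sequence and maxlen = 5; wrapped in `if (FALSE)` so it is not
# evaluated when this file is sourced.
if (FALSE) {
  demo <- preprocessSemiRedundant("ACGTACGTA", maxlen = 5,
                                  vocabulary = c("a", "c", "g", "t"))
  dim(demo$X)  # 4 samples x 5 characters x 4 vocabulary entries
  dim(demo$Y)  # 4 samples x 4 vocabulary entries (one-hot of the next character)
}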
#' Wrapper for the preprocessSemiRedundant() function
#'
#' @description
#' Calls \code{\link{preprocessSemiRedundant}} on the genomic contents of one
#' FASTA file. Multiple entries are concatenated, separated by the character "p" and flanked by "l".
#' @param path Path to the FASTA file.
#' @param maxlen Length of the semi-redundant sequences.
#' @param vocabulary Character vector containing the vocabulary of the input.
#' If no vocabulary is supplied, it is generated from the input.
#' @param verbose TRUE/FALSE, whether to report progress via messages.
#' @export
preprocessFasta <- function(path,
                            maxlen = 250,
                            vocabulary = c("l", "p", "a", "c", "g", "t"),
                            verbose = FALSE) {
  # read the FASTA file and concatenate the entries ("p" separates entries, "l" flanks the sequence)
  fasta.file <- Biostrings::readDNAStringSet(path)
  seq <- paste0("l", paste(fasta.file, collapse = "p"), "l")
  if (verbose)
    message("Preprocessing the data ...")
  seq.processed <- preprocessSemiRedundant(char = seq, maxlen = maxlen,
                                           vocabulary = vocabulary,
                                           verbose = verbose)
  return(seq.processed)
}
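# Illustrative sketch of preprocessFasta(); "example.fasta" is a placeholder path, not a
# file shipped with the package, and the block is wrapped in `if (FALSE)` so it is not
# evaluated when this file is sourced.
if (FALSE) {
  processed <- preprocessFasta("example.fasta", maxlen = 250)
  dim(processed$X)  # number of samples x 250 x 6 (default vocabulary "l", "p", "a", "c", "g", "t")
}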
#' One-hot encodes an integer sequence
#'
#' \code{sequenceToArray} Helper function for \code{\link{fastaFileGenerator}}; returns the one-hot encoding of a sequence.
#'
#' @param sequence Sequence of integers.
#' @param maxlen Length of one sample.
#' @param vocabulary Set of characters to encode.
#' @param startInd Start positions of samples in \code{sequence}.
#' @export
sequenceToArray <- function(sequence, maxlen, vocabulary, startInd){
  stopifnot(length(sequence) > maxlen)
  startInd <- startInd - startInd[1] + 1
  numberOfSamples <- length(startInd)
  # every row in z one-hot encodes one character in sequence, out-of-vocabulary is the zero vector
  z <- keras::to_categorical(sequence, num_classes = length(vocabulary) + 1)[ , -1]
  x <- array(0, dim = c(numberOfSamples, maxlen, length(vocabulary)))
  for (i in seq_len(numberOfSamples)){
    start <- startInd[i]
    x[i, , ] <- z[start:(start + maxlen - 1), ]
  }
  # targets: one-hot encoding of the character following each sample
  y <- z[startInd + maxlen, ]
  list(x, y)
}
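# Illustrative sketch of sequenceToArray() on a short integer-encoded sequence (assuming
# the coding 1 = "a", 2 = "c", 3 = "g", 4 = "t"); requires a working keras installation,
# so the block is wrapped in `if (FALSE)` and is not evaluated when this file is sourced.
if (FALSE) {
  enc <- sequenceToArray(sequence = c(1, 2, 3, 4, 1, 2, 3, 4),
                         maxlen = 4,
                         vocabulary = c("a", "c", "g", "t"),
                         startInd = c(1, 3))
  dim(enc[[1]])  # 2 x 4 x 4: two samples of length 4, one-hot over 4 characters
  dim(enc[[2]])  # 2 x 4: one-hot targets (the character following each sample)
}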
#' One-hot encodes an integer sequence for label classification
#'
#' \code{sequenceToArrayLabel} Helper function for \code{\link{fastaLabelGenerator}}; returns the one-hot encoding of a sequence,
#' taking samples from the specified positions.
#'
#' @param sequence Sequence of integers.
#' @param maxlen Length of the predictor sequence.
#' @param vocabulary Set of characters to encode.
#' @param startInd Start positions of samples in \code{sequence}.
#' @export
sequenceToArrayLabel <- function(sequence, maxlen, vocabulary, startInd){
  stopifnot(length(sequence) > (maxlen - 1))
  startInd <- startInd - startInd[1] + 1
  numberOfSamples <- length(startInd)
  # every row in z one-hot encodes one character in sequence, out-of-vocabulary is the zero vector
  z <- keras::to_categorical(matrix(sequence, ncol = 1), num_classes = length(vocabulary) + 1)[ , -1]
  z <- matrix(z, ncol = length(vocabulary))
  x <- array(0, dim = c(numberOfSamples, maxlen, length(vocabulary)))
  for (i in seq_len(numberOfSamples)){
    start <- startInd[i]
    x[i, , ] <- z[start:(start + maxlen - 1), ]
  }
  return(x)
}
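# Illustrative sketch of sequenceToArrayLabel(); like sequenceToArray() it needs a working
# keras installation, so the block is wrapped in `if (FALSE)` and is not evaluated when
# this file is sourced.
if (FALSE) {
  x_demo <- sequenceToArrayLabel(sequence = c(1, 2, 3, 4, 1, 2),
                                 maxlen = 3,
                                 vocabulary = c("a", "c", "g", "t"),
                                 startInd = c(1, 4))
  dim(x_demo)  # 2 x 3 x 4: two predictor samples, no targets are returned
}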
#' Computes start positions of samples
#'
#' Helper function for \code{\link{fastaLabelGenerator}} and \code{\link{fastaFileGenerator}}. Computes the positions in the sequence from which samples can be extracted.
#'
#' @param seq_vector Vector of character sequences.
#' @param length_vector Lengths of the sequences in \code{seq_vector}.
#' @param maxlen Length of one predictor sequence.
#' @param step Distance between samples from one entry in \code{seq_vector}.
#' @param train_mode Either "lm" for language model or "label" for label classification. Language models need one more character
#' (the target) per sample.
#' @export
getStartInd <- function(seq_vector, length_vector, maxlen, step, train_mode = "label"){
  stopifnot(train_mode %in% c("lm", "label"))
  if (length(length_vector) > 1){
    # position at which each entry starts in the concatenated sequence
    startNewEntry <- cumsum(c(1, length_vector[-length(length_vector)]))
    # sample start positions for all entries except the last one
    if (train_mode == "label"){
      indexVector <- purrr::map(1:(length(length_vector) - 1),
                                ~seq(startNewEntry[.x], startNewEntry[.x + 1] - maxlen, by = step))
    } else {
      indexVector <- purrr::map(1:(length(length_vector) - 1),
                                ~seq(startNewEntry[.x], startNewEntry[.x + 1] - maxlen - 1, by = step))
    }
    indexVector <- unlist(indexVector)
    # append start positions for the last entry if it is long enough
    last_seq <- length(seq_vector)
    if (!(startNewEntry[last_seq] > (sum(length_vector) - maxlen + 1))) {
      if (train_mode == "label"){
        indexVector <- c(indexVector, seq(startNewEntry[last_seq], sum(length_vector) - maxlen + 1, by = step))
      } else {
        indexVector <- c(indexVector, seq(startNewEntry[last_seq], sum(length_vector) - maxlen, by = step))
      }
    }
    return(indexVector)
  } else {
    if (train_mode == "label"){
      indexVector <- seq(1, length_vector - maxlen + 1, by = step)
    } else {
      indexVector <- seq(1, length_vector - maxlen, by = step)
    }
    return(indexVector)
  }
}
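# Illustrative sketch of getStartInd() for two assumed toy sequences of length 10 and 8
# with maxlen = 5 and step = 1; wrapped in `if (FALSE)` so it is not evaluated when this
# file is sourced.
if (FALSE) {
  seqs <- c(strrep("a", 10), strrep("c", 8))
  getStartInd(seq_vector = seqs, length_vector = c(10, 8),
              maxlen = 5, step = 1, train_mode = "label")
  # 1 2 3 4 5 6 11 12 13 14 (second entry starts at position 11 of the concatenated sequence)
  getStartInd(seq_vector = seqs, length_vector = c(10, 8),
              maxlen = 5, step = 1, train_mode = "lm")
  # 1 2 3 4 5 11 12 13 ("lm" needs one extra character per sample for the target)
}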