#' Custom generator for fasta/fastq files
#'
#' @description
#' \code{fastaFileGenerator} Iterates over folder containing .fasta/.fastq files and produces one-hot-encoding of predictor sequences
#' and target variables.
#'
#' @param corpus.dir Input directory where .fasta files are located or path to single file ending with .fasta or .fastq
#' (as specified in format argument).
#' @param format File format, either fasta or fastq.
#' @param batch.size Number of batches.
#' @param maxlen Length of predictor sequence.
#' @param max_iter Stop after max_iter number of iterations failed to produce a new batch.
#' @param vocabulary Vector of allowed characters, character outside vocabulary get encoded as 0-vector.
#' @param randomFiles Logical, whether to go through files randomly or sequential.
#' @param step How often to take a sample.
#' @param showWarnings Logical, give warning if character outside vocabulary appears
#' @param seed Sets seed for set.seed function, for reproducible results when using \code{randomFiles} or \code{shuffleFastaEntries}
#' @param shuffleFastaEntries Logical, shuffle entries in every fasta file before connecting them to sequence.
#' @param verbose Whether to show message.
#' @param numberOfFiles Use only specified number of files, ignored if greater than number of files in corpus.dir.
#' @param fileLog Write name of files to csv file if path is specified.
#' @param reverseComplements Logical, half of batch contains sequences and other its reverse complements. Reverse complement
#' is given by reversed order of sequence and switching A/T and C/G. \code{batch.size} argument has to be even, otherwise 1 will be added
#' to \code{batch.size}
#' @return A list of length 2. First element is a 3-dimensional tensor with dimensions (batch.size, maxlen, length(vocabulary)), encoding
#' the predictor sequences. Second element is a matrix with dimensions (batch.size, length(vocabulary)), encoding the targets.
#' @export
fastaFileGenerator <- function(corpus.dir,
format = "fasta",
batch.size = 256,
maxlen = 250,
max_iter = 2000,
vocabulary = c("a", "c", "g", "t"),
verbose = FALSE,
randomFiles = FALSE,
step = 1,
showWarnings = FALSE,
seed = 1234,
shuffleFastaEntries = FALSE,
numberOfFiles = NULL,
fileLog = NULL,
reverseComplements = FALSE){
if ((batch.size %% 2 != 0) & reverseComplements){
message <- paste("Batch size has to be even for reverse complements. Setting batch size to", batch.size + 1)
message(message)
batch.size <- batch.size + 1
}
if (reverseComplements){
# halve batch.size, other half gets filled with reverse complements
batch.size <- batch.size/2
}
tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), vocabulary)
start_index_list <- vector("list")
file_index <- 1
num_samples <- 0
start_index <- 1
# single file
if (endsWith(corpus.dir, paste0(".", format))) {
fasta.file <- Biostrings::readDNAStringSet(corpus.dir, format = format)
num_files <- 1
fasta.files <- corpus.dir
} else {
fasta.files <- list.files(
path = xfun::normalize_path(corpus.dir),
pattern = paste0("\\.", format, "$"),
full.names = TRUE)
num_files <- length(fasta.files)
set.seed(seed)
if (randomFiles) fasta.files <- sample(fasta.files, replace = FALSE)
}
# regular expression for chars outside vocabulary
pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
# sequence vector collects strings until one batch can be created
sequence_list <- vector("list")
sequence_list_index <- 1
fasta.file <- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
if (shuffleFastaEntries){
fasta.file <- sample(fasta.file)
}
seq_vector <- stringr::str_to_lower(paste(fasta.file))
length_vector <- Biostrings::width(fasta.file)
# pad short sequences with zeros
short_seq_index <- which(length_vector < (maxlen + 1))
for (i in short_seq_index){
seq_vector[i] <- paste0(paste(rep("0", (maxlen + 1) - length_vector[i]),collapse = ""), seq_vector[i])
length_vector[i] <- maxlen + 1
}
nucSeq <- paste(seq_vector, collapse = "")
# start positions of samples
start_indices <- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "lm")
nucSeq <- stringr::str_to_lower(nucSeq)
nucSeq <- keras::texts_to_sequences(tokenizer, nucSeq)[[1]] - 1
# use subset of files
if (!is.null(numberOfFiles) && (numberOfFiles < length(fasta.files))){
fasta.files <- fasta.files[1:numberOfFiles]
num_files <- length(fasta.files)
}
# log file
if (!is.null(fileLog)){
if (!endsWith(fileLog, ".csv")) fileLog <- paste0(fileLog, ".csv")
write.table(x = filePath, file = fileLog)
}
# test for chars outside vocabulary
if (showWarnings){
charsOutsideVoc <- stringr::str_detect(stringr::str_to_lower(nucSeq), pattern)
if (charsOutsideVoc) warning("file ", filePath, " contains characters outside vocabulary")
}
if (verbose) message("Initializing ...")
function() {
iter <- 1
# loop until enough samples collected
while(num_samples < batch.size){
# loop through sub-sequences/files until sequence of suitable length is found
while((start_index > length(start_indices)) | length(start_indices) == 0){
# go to next file
file_index <<- file_index + 1
start_index <<- 1
if (file_index > length(fasta.files)) file_index <<- 1
filePath <<- fasta.files[[file_index]]
fasta.file <<- Biostrings::readDNAStringSet(filePath)
# log file
if (!is.null(fileLog)){
write.table(x = filePath, file = fileLog, append = TRUE)
}
# test for chars outside vocabulary
if (showWarnings){
charsOutsideVoc <- stringr::str_detect(stringr::str_to_lower(nucSeq), pattern)
if (charsOutsideVoc) warning("file ", filePath, " contains characters outside vocabulary")
}
if (shuffleFastaEntries){
fasta.file <<- sample(fasta.file)
}
seq_vector <<- stringr::str_to_lower(paste(fasta.file))
length_vector <<- Biostrings::width(fasta.file)
# pad short sequences with zeros
short_seq_index <<- which(length_vector < (maxlen + 1))
for (i in short_seq_index){
seq_vector[i] <<- paste0(paste(rep("0", (maxlen + 1) - length_vector[i]),collapse = ""), seq_vector[i])
length_vector[i] <<- maxlen + 1
}
nucSeq <<- paste(seq_vector, collapse = "")
# start positions of samples
start_indices <<- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "lm")
nucSeq <<- stringr::str_to_lower(nucSeq)
nucSeq <<- keras::texts_to_sequences(tokenizer, nucSeq)[[1]] - 1
if(iter > max_iter){
stop('exceeded max_iter value, try reducing maxlen parameter')
break
}
iter <- iter + 1
}
# go as far as possible in sequence or stop when enough samples are collected
remainingSamples <- batch.size - num_samples
end_index <- min(length(start_indices), start_index + remainingSamples - 1)
subsetStartIndices <- start_indices[start_index:end_index]
sequence_list[[sequence_list_index]] <- nucSeq[subsetStartIndices[1]:(subsetStartIndices[length(subsetStartIndices)] + maxlen)]
start_index_list[[sequence_list_index]] <- subsetStartIndices
sequence_list_index <<- sequence_list_index + 1
num_new_samples <- end_index - start_index + 1
num_samples <- num_samples + num_new_samples
start_index <<- end_index + 1
}
# one hot encode strings collected in sequence_list and connect arrays
array_list <- purrr::map(1:length(sequence_list), ~sequenceToArray(sequence_list[[.x]],
maxlen = maxlen, vocabulary = vocabulary,
startInd = start_index_list[[.x]]))
# create samples from reverse complements
if (reverseComplements){
a <- which(stringr::str_to_lower(vocabulary) == "a")
c <- which(stringr::str_to_lower(vocabulary) == "c")
g <- which(stringr::str_to_lower(vocabulary) == "g")
t <- which(stringr::str_to_lower(vocabulary) == "t")
# reverse order and switch a/t and c/g
for (i in length(sequence_list)){
sequence_list[[i]] <- rev(sequence_list[[i]])
sequence_list[[i]] <- plyr::mapvalues(sequence_list[[i]], c(a,c,g,t), c(t,g,c,a), warn_missing = FALSE)
}
array_list_rev_comp <- purrr::map(1:length(sequence_list), ~sequenceToArray(sequence_list[[.x]],
maxlen = maxlen, vocabulary = vocabulary,
startInd = rev(length(sequence_list[[.x]]) - start_index_list[[.x]] - maxlen + 1)))
array_list <- c(array_list, array_list_rev_comp)
}
x <- array_list[[1]][[1]]
y <- array_list[[1]][[2]]
if (length(array_list) > 1){
for (i in 2:length(array_list)){
x <- abind::abind(x, array_list[[i]][[1]], along = 1)
y <- rbind(y, array_list[[i]][[2]])
}
}
# coerce y type to matrix
if (dim(x)[1] == 1){
dim(y) <- c(1, length(vocabulary))
}
# empty sequence_list for next batch
start_index_list <<- vector("list")
sequence_list <<- vector("list")
sequence_list_index <<- 1
num_samples <<- 0
return(list(X = x, Y = y))
}
}
#' Custom generator for fasta files and label targets
#'
#' @description
#' \code{fastaLabelGenerator} Iterates over folder containing .fasta files and produces one-hot-encoding of predictor sequences
#' and target variables. Targets will be read from fasta headers.
#'
#' @param corpus.dir Input directory where .fasta files are located or path to single file ending with .fasta or .fastq
#' (as specified in format argument).
#' @param format File format, either fasta or fastq.
#' @param batch.size Number of batches.
#' @param maxlen Length of predictor sequence.
#' @param max_iter Stop after max_iter number of iterations failed to produce a new batch.
#' @param vocabulary Vector of allowed characters, character outside vocabulary get encoded as 0-vector.
#' @param randomFiles Logical, whether to go through files randomly or sequential.
#' @param step How often to take a sample.
#' @param showWarnings Logical, give warning if character outside vocabulary appears.
#' @param seed Sets seed for set.seed function, for reproducible results when using \code{randomFiles} or \code{shuffleFastaEntries}
#' @param shuffleFastaEntries Logical, shuffle fasta entries.
#' @param verbose Whether to show message.
#' @param numberOfFiles Use only specified number of files, ignored if greater than number of files in corpus.dir.
#' @param fileLog Write name of files to csv file if path is specified.
#' @param labelVocabulary Character vector of possible targets. Targets outside \code{labelVocabulary} will get discarded.
#' @param reverseComplements Logical, half of batch contains sequences and other its reverse complements. Reverse complement
#' is given by reversed order of sequence and switching A/T and C/G. \code{batch.size} argument has to be even, otherwise 1 will be added
#' to \code{batch.size}
#' @return A list of length 2. First element is a 3-dimensional tensor with dimensions (batch.size, maxlen, length(vocabulary)), encoding
#' the predictor sequences. Second element is a matrix with dimensions (batch.size, length(vocabulary)), encoding the targets.
#' @export
fastaLabelGenerator <- function(corpus.dir,
format = "fasta",
batch.size = 256,
maxlen = 250,
max_iter = 10000,
vocabulary = c("a", "c", "g", "t"),
verbose = FALSE,
randomFiles = FALSE,
step = 1,
showWarnings = FALSE,
seed = 1234,
shuffleFastaEntries = FALSE,
numberOfFiles = NULL,
fileLog = NULL,
labelVocabulary = c("x", "y", "z"),
reverseComplements = TRUE){
if ((batch.size %% 2 != 0) & reverseComplements){
message <- paste("Batch size has to be even for reverse complements. Setting batch size to", batch.size + 1)
message(message)
batch.size <- batch.size + 1
}
if (reverseComplements){
# halve batch.size, other half gets filled with reverse complements
batch.size <- batch.size/2
}
vocabulary <- stringr::str_to_lower(vocabulary)
labelVocabulary <- stringr::str_to_lower(labelVocabulary)
start_index_list <- vector("list")
file_index <- 1
num_samples <- 0
start_index <- 1
tokenizer_pred <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), vocabulary)
tokenizer_target <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = FALSE, lower = TRUE, filters = "\t\n"),
labelVocabulary)
# single file
if (endsWith(corpus.dir, paste0(".", format))) {
num_files <- 1
fasta.files <- corpus.dir
} else {
fasta.files <- list.files(
path = xfun::normalize_path(corpus.dir),
pattern = paste0("\\.", format, "$"),
full.names = TRUE)
num_files <- length(fasta.files)
}
set.seed(seed)
if (randomFiles) fasta.files <- sample(fasta.files, replace = FALSE)
# pre-load the first file
fasta.file <- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
if (shuffleFastaEntries){
fasta.file <- sample(fasta.file)
}
seq_vector <- stringr::str_to_lower(paste(fasta.file))
length_vector <- Biostrings::width(fasta.file)
label_vector <- trimws(stringr::str_to_lower(names(fasta.file)))
# regular expression for chars outside vocabulary
pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
# sequence vector collects strings until one batch can be created
sequence_list <- vector("list")
target_list <- vector("list")
sequence_list_index <- 1
# filter out samples with labels outside labelVocabulary
label_filter <- label_vector %in% labelVocabulary
seq_vector <- seq_vector[label_filter]
length_vector <- length_vector[label_filter]
label_vector <- label_vector[label_filter]
# pad short sequences with zeros
short_seq_index <- which(length_vector < maxlen)
for (i in short_seq_index){
seq_vector[i] <- paste0(paste(rep("0", maxlen - length_vector[i]),collapse = ""), seq_vector[i])
length_vector[i] <- maxlen
}
nucSeq <- paste(seq_vector, collapse = "")
start_indices <- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "label")
startNewEntry <- cumsum(c(1, length_vector[-length(length_vector)]))
nucSeq <- keras::texts_to_sequences(tokenizer_pred, nucSeq)[[1]] - 1
# use subset of files
if (!is.null(numberOfFiles) && (numberOfFiles < length(fasta.files))){
fasta.files <- fasta.files[1:numberOfFiles]
num_files <- length(fasta.files)
}
# log file
if (!is.null(fileLog)){
if (!endsWith(fileLog, ".csv")) fileLog <- paste0(fileLog, ".csv")
write.table(x = fasta.files[file_index], file = fileLog, col.names = FALSE)
}
# test for chars outside vocabulary
if (showWarnings){
charsOutsideVoc <- stringr::str_detect(stringr::str_to_lower(nucSeq), pattern)
if (charsOutsideVoc) warning("file ", filePath, " contains characters outside vocabulary")
}
if (verbose) message("Initializing ...")
function() {
iter <- 1
# loop until enough samples collected
while(num_samples < batch.size){
# loop through sub-sequences/files until sequence of suitable length is found
while((start_index > length(start_indices)) | length(start_indices) == 0){
# go to next file
file_index <<- file_index + 1
start_index <<- 1
if (file_index > length(fasta.files)) file_index <<- 1
fasta.file <<- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
if (shuffleFastaEntries){
fasta.file <<- sample(fasta.file)
}
seq_vector <<- stringr::str_to_lower(paste(fasta.file))
length_vector <<- Biostrings::width(fasta.file)
label_vector <<- trimws(stringr::str_to_lower(names(fasta.file)))
# log file
if (!is.null(fileLog)){
write.table(x = fasta.files[file_index], file = fileLog, append = TRUE, col.names = FALSE)
}
# test for chars outside vocabulary
if (showWarnings){
charsOutsideVoc <- stringr::str_detect(stringr::str_to_lower(nucSeq), pattern)
if (charsOutsideVoc) warning("file ", filePath, " contains characters outside vocabulary")
}
# filter out samples with labels outside labelVocabulary
label_filter <<- label_vector %in% labelVocabulary
if (all(!label_filter)) break # no sample in current file
seq_vector <<- seq_vector[label_filter]
length_vector <<- length_vector[label_filter]
label_vector <<- label_vector[label_filter]
# pad short sequences with zeros
short_seq_index <<- which(length_vector < maxlen)
for (i in short_seq_index){
seq_vector[i] <<- paste0(paste(rep("0", maxlen - length_vector[i]),collapse = ""), seq_vector[i])
length_vector[i] <<- maxlen
}
nucSeq <<- paste(seq_vector, collapse = "")
start_indices <<- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "label")
startNewEntry <<- cumsum(c(1, length_vector[-length(length_vector)]))
nucSeq <<- keras::texts_to_sequences(tokenizer_pred, nucSeq)[[1]] - 1
if(iter > max_iter){
stop('exceeded max_iter value, try reducing maxlen parameter')
break
}
iter <- iter + 1
}
# go as far as possible in sequence or stop when enough samples are collected
remainingSamples <- batch.size - num_samples
end_index <- min(length(start_indices), start_index + remainingSamples - 1)
subsetStartIndices <- start_indices[start_index:end_index]
sequence_list[[sequence_list_index]] <- nucSeq[subsetStartIndices[1] : (subsetStartIndices[length(subsetStartIndices)] + maxlen - 1)]
# collect targets
target_list[[sequence_list_index]] <- as.character(cut(subsetStartIndices, breaks = c(startNewEntry, length(nucSeq)),
labels = label_vector, include.lowest = TRUE, right = FALSE))
start_index_list[[sequence_list_index]] <- subsetStartIndices
sequence_list_index <<- sequence_list_index + 1
num_new_samples <- end_index - start_index + 1
num_samples <- num_samples + num_new_samples
start_index <<- end_index + 1
}
# one hot encode strings collected in sequence_list and connect arrays
array_x_list <- purrr::map(1:length(sequence_list), ~sequenceToArrayLabel(sequence_list[[.x]],
maxlen = maxlen, vocabulary = vocabulary,
startInd = start_index_list[[.x]]))
if (reverseComplements){
a <- which(stringr::str_to_lower(vocabulary) == "a")
c <- which(stringr::str_to_lower(vocabulary) == "c")
g <- which(stringr::str_to_lower(vocabulary) == "g")
t <- which(stringr::str_to_lower(vocabulary) == "t")
for (i in 1:length(array_x_list)){
x <- array_x_list[[i]]
x_rev_comp <- array(0, dim = dim(x))
# switch A/T and C/G
x_rev_comp[ , , a] <- x[ , , t]
x_rev_comp[ , , c] <- x[ , , g]
x_rev_comp[ , , g] <- x[ , , c]
x_rev_comp[ , , t] <- x[ , , a]
x_rev_comp[ , , -c(a,c,g,t)] <- x[ , , -c(a,c,g,t)]
reverse <- (dim(x)[2]):1
x_rev_comp <- array(x_rev_comp[ , reverse, ], dim = dim(x)) # reverse order
array_x_list[[i]] <- abind::abind(x, x_rev_comp, along = 1)
}
}
# one hot encode targets
target_int <- unlist(keras::texts_to_sequences(tokenizer_target, unlist(target_list))) - 1
y <- keras::to_categorical(target_int, num_classes = length(labelVocabulary))
if (reverseComplements) y <- rbind(y, y)
x <- array_x_list[[1]]
if (length(array_x_list) > 1){
for (i in 2:length(array_x_list)){
x <- abind::abind(x, array_x_list[[i]], along = 1)
}
}
# coerce y type to matrix
if (dim(x)[1] == 1){
dim(y) <- c(1, length(labelVocabulary))
}
# empty sequence_list for next batch
start_index_list <<- vector("list")
sequence_list <<- vector("list")
target_list <<- vector("list")
sequence_list_index <<- 1
num_samples <<- 0
return(list(X = x, Y = y))
}
}
#' Custom generator for fasta files
#'
#' @description
#' \code{labelByFolderGenerator} Iterates over folder containing .fasta files and produces one-hot-encoding of predictor sequences
#' and target variables. Files in \code{corpus.dir} should all belong to one class.
#'
#' @param corpus.dir Input directory where .fasta files are located or path to single file ending with .fasta or .fastq
#' (as specified in format argument).
#' @param format File format, either fasta or fastq.
#' @param batch.size Number of batches.
#' @param maxlen Length of predictor sequence.
#' @param max_iter Stop after max_iter number of iterations failed to produce a new batch.
#' @param vocabulary Vector of allowed characters, character outside vocabulary get encoded as 0-vector.
#' @param randomFiles Logical, whether to go through files randomly or sequential.
#' @param step How often to take a sample.
#' @param showWarnings Logical, give warning if character outside vocabulary appears
#' @param seed Sets seed for set.seed function, for reproducible results when using \code{randomFiles} or \code{shuffleFastaEntries}
#' @param shuffleFastaEntries Logical, shuffle fasta entries.
#' @param verbose Whether to show message.
#' @param numberOfFiles Use only specified number of files, ignored if greater than number of files in corpus.dir.
#' @param fileLog Write name of files to csv file if path is specified.
#' @param numTargets Number of columns of target matrix.
#' @param onesColumn Which column of target matrix contains ones
#' @param reverseComplements Logical, half of batch contains sequences and other its reverse complements. Reverse complement
#' is given by reversed order of sequence and switching A/T and C/G. \code{batch.size} argument has to be even, otherwise 1 will be added
#' to \code{batch.size}
#' @return A list of length 2. First element is a 3-dimensional tensor with dimensions (batch.size, maxlen, length(vocabulary)), encoding
#' the predictor sequences. Second element is a matrix with dimensions (batch.size, length(vocabulary)), encoding the targets.
#' @export
labelByFolderGenerator <- function(corpus.dir,
format = "fasta",
batch.size = 256,
maxlen = 250,
max_iter = 10000,
vocabulary = c("a", "c", "g", "t"),
verbose = FALSE,
randomFiles = FALSE,
step = 1,
showWarnings = FALSE,
seed = 1234,
shuffleFastaEntries = FALSE,
numberOfFiles = NULL,
fileLog = NULL,
reverseComplements = TRUE,
numTargets,
onesColumn){
stopifnot(onesColumn <= numTargets)
if ((batch.size %% 2 != 0) & reverseComplements){
message <- paste("Batch size has to be even for reverse complements. Setting batch size to", batch.size + 1)
message(message)
batch.size <- batch.size + 1
}
if (reverseComplements){
# halve batch.size, other half gets filled with reverse complements
batch.size <- batch.size/2
}
vocabulary <- stringr::str_to_lower(vocabulary)
start_index_list <- vector("list")
file_index <- 1
num_samples <- 0
start_index <- 1
tokenizer_pred <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), vocabulary)
# single file
if (endsWith(corpus.dir, paste0(".", format))) {
num_files <- 1
fasta.files <- corpus.dir
} else {
fasta.files <- list.files(
path = xfun::normalize_path(corpus.dir),
pattern = paste0("\\.", format, "$"),
full.names = TRUE)
num_files <- length(fasta.files)
set.seed(seed)
if (randomFiles) fasta.files <- sample(fasta.files, replace = FALSE)
}
fasta.file <- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
if (shuffleFastaEntries){
fasta.file <- sample(fasta.file)
}
seq_vector <- stringr::str_to_lower(paste(fasta.file))
length_vector <- Biostrings::width(fasta.file)
# regular expression for chars outside vocabulary
pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
# sequence vector collects strings until one batch can be created
sequence_list <- vector("list")
target_list <- vector("list")
sequence_list_index <- 1
# pad short sequences with zeros
short_seq_index <- which(length_vector < maxlen)
for (i in short_seq_index){
seq_vector[i] <- paste0(paste(rep("0", maxlen - length_vector[i]),collapse = ""), seq_vector[i])
length_vector[i] <- maxlen
}
nucSeq <- paste(seq_vector, collapse = "")
start_indices <- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step)
startNewEntry <- cumsum(c(1, length_vector[-length(length_vector)]))
nucSeq <- keras::texts_to_sequences(tokenizer_pred, nucSeq)[[1]] - 1
# use subset of files
if (!is.null(numberOfFiles) && (numberOfFiles < length(fasta.files))){
fasta.files <- fasta.files[1:numberOfFiles]
num_files <- length(fasta.files)
}
# log file
if (!is.null(fileLog)){
if (!endsWith(fileLog, ".csv")) fileLog <- paste0(fileLog, ".csv")
write.table(x = fasta.files[file_index], file = fileLog, col.names = FALSE, append = TRUE)
}
# test for chars outside vocabulary
if (showWarnings){
charsOutsideVoc <- stringr::str_detect(stringr::str_to_lower(nucSeq), pattern)
if (charsOutsideVoc) warning("file ", filePath, " contains characters outside vocabulary")
}
if (verbose) message("Initializing ...")
function() {
iter <- 1
# loop until enough samples collected
while(num_samples < batch.size){
# loop through sub-sequences/files until sequence of suitable length is found
while((start_index > length(start_indices)) | length(start_indices) == 0){
# go to next file
file_index <<- file_index + 1
start_index <<- 1
if (file_index > length(fasta.files)) file_index <<- 1
fasta.file <<- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
if (shuffleFastaEntries){
fasta.file <<- sample(fasta.file)
}
seq_vector <<- stringr::str_to_lower(paste(fasta.file))
length_vector <<- Biostrings::width(fasta.file)
# log file
if (!is.null(fileLog)){
write.table(x = fasta.files[file_index], file = fileLog, append = TRUE, col.names = FALSE)
}
# test for chars outside vocabulary
if (showWarnings){
charsOutsideVoc <- stringr::str_detect(stringr::str_to_lower(nucSeq), pattern)
if (charsOutsideVoc) warning("file ", filePath, " contains characters outside vocabulary")
}
# pad short sequences with zeros
short_seq_index <<- which(length_vector < maxlen)
for (i in short_seq_index){
seq_vector[i] <<- paste0(paste(rep("0", maxlen - length_vector[i]),collapse = ""), seq_vector[i])
length_vector[i] <<- maxlen
}
nucSeq <<- paste(seq_vector, collapse = "")
start_indices <<- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step)
startNewEntry <<- cumsum(c(1, length_vector[-length(length_vector)]))
nucSeq <<- keras::texts_to_sequences(tokenizer_pred, nucSeq)[[1]] - 1
if(iter > max_iter){
stop('exceeded max_iter value, try reducing maxlen parameter')
break
}
iter <- iter + 1
}
# go as far as possible in sequence or stop when enough samples are collected
remainingSamples <- batch.size - num_samples
end_index <- min(length(start_indices), start_index + remainingSamples - 1)
subsetStartIndices <- start_indices[start_index:end_index]
sequence_list[[sequence_list_index]] <- nucSeq[subsetStartIndices[1] : (subsetStartIndices[length(subsetStartIndices)] + maxlen - 1)]
start_index_list[[sequence_list_index]] <- subsetStartIndices
sequence_list_index <<- sequence_list_index + 1
num_new_samples <- end_index - start_index + 1
num_samples <- num_samples + num_new_samples
start_index <<- end_index + 1
}
# one hot encode strings collected in sequence_list and connect arrays
array_x_list <- purrr::map(1:length(sequence_list), ~sequenceToArrayLabel(sequence_list[[.x]],
maxlen = maxlen, vocabulary = vocabulary,
startInd = start_index_list[[.x]]))
if (reverseComplements){
a <- which(stringr::str_to_lower(vocabulary) == "a")
c <- which(stringr::str_to_lower(vocabulary) == "c")
g <- which(stringr::str_to_lower(vocabulary) == "g")
t <- which(stringr::str_to_lower(vocabulary) == "t")
for (i in 1:length(array_x_list)){
x <- array_x_list[[i]]
x_rev_comp <- array(0, dim = dim(x))
# switch A/T and C/G
x_rev_comp[ , , a] <- x[ , , t]
x_rev_comp[ , , c] <- x[ , , g]
x_rev_comp[ , , g] <- x[ , , c]
x_rev_comp[ , , t] <- x[ , , a]
x_rev_comp[ , , -c(a,c,g,t)] <- x[ , , -c(a,c,g,t)]
reverse <- (dim(x)[2]):1
x_rev_comp <- array(x_rev_comp[ , reverse, ], dim = dim(x)) # reverse order
array_x_list[[i]] <- abind::abind(x, x_rev_comp, along = 1)
}
}
x <- array_x_list[[1]]
if (length(array_x_list) > 1){
for (i in 2:length(array_x_list)){
x <- abind::abind(x, array_x_list[[i]], along = 1)
}
}
# one hot encode targets
y <- matrix(0, ncol = numTargets, nrow = dim(x)[1])
y[ , onesColumn] <- 1
# coerce y type to matrix
if (dim(x)[1] == 1){
dim(y) <- c(1, length(numTargets))
}
# empty sequence_list for next batch
start_index_list <<- vector("list")
sequence_list <<- vector("list")
target_list <<- vector("list")
sequence_list_index <<- 1
num_samples <<- 0
return(list(X = x, Y = y))
}
}
#' Initializes generators defined by labelByFolderGenerator function
#'
#' \code{initializeGenerators} Initializes generators defined by \code{\link{{labelByFolderGenerator}} function. Targets get one-hot-encoded in order of directories.
#' Number of classes is given by length of directories.
#'
#' @param directories Vector of paths to folder containing fasta files. Files in one folder should belong to one class.
#' @param format File format.
#' @param batch.size Number of batches, will get rounded to be multiple of number of targets if necessary.
#' @param maxlen Length of predictor sequence.
#' @param max_iter Stop after max_iter number of iterations failed to produce a new batch.
#' @param vocabulary Vector of allowed characters, character outside vocabulary get encoded as 0-vector.
#' @param randomFiles Logical, whether to go through files randomly or sequential.
#' @param step How often to take a sample.
#' @param showWarnings Logical, give warning if character outside vocabulary appears.
#' @param seed Sets seed for set.seed function, for reproducible results when using \code{randomFiles} or \code{shuffleFastaEntries}
#' @param shuffleFastaEntries Logical, shuffle fasta entries.
#' @param verbose Whether to show message.
#' @param numberOfFiles Use only specified number of files, ignored if greater than number of files in \code{directories}.
#' @param fileLog Write name of files to csv file if path is specified.
#' @param reverseComplements Logical, half of batch contains sequences and other its reverse complements. Reverse complement
#' is given by reversed order of sequence and switching A/T and C/G. \code{batch.size} argument has to be even, otherwise 1 will be added
#' to \code{batch.size}
#' @param val Logical, call initialized generarator "genY" or "genValY" where Y is an integer between 1 and length of directories.
#' @export
initializeGenerators <- function(directories,
format = "fasta",
batch.size = 256,
maxlen = 250,
max_iter = 10000,
vocabulary = c("a", "c", "g", "t"),
verbose = FALSE,
randomFiles = FALSE,
step = 1,
showWarnings = FALSE,
seed = 1234,
shuffleFastaEntries = FALSE,
numberOfFiles = NULL,
fileLog = NULL,
reverseComplements = FALSE,
val = FALSE
){
# adjust batch.size
if (batch.size %% length(directories) != 0){
batchType <- ifelse(val, "validation", "training")
batch.size <- ceiling(batch.size/length(directories)) * length(directories)
if (!reverseComplements){
message(paste("Batch size needs to be multiple of number of targets. Setting", batchType, "batch size to", batch.size))
} else {
if ((batch.size/length(directories)) %% 2 == 0){
message(paste("Batch size needs to be multiple of number of targets. Setting", batchType, "batch size to", batch.size))
} else {
subBatchSize <- (batch.size/length(directories))
batch.size <- (subBatchSize + 1) * length(directories)
message(paste("Batch size needs to be multiple of number of targets and batch.size/(number of labels) needs to be even when reverseComlements is true.
Setting", batchType ,"batch size to", batch.size))
}
}
}
numTargets <- length(directories)
if (!val){
# create generator for every folder
for (i in 1:length(directories)){
numberedGen <- paste0("gen", as.character(i))
genAsText <- paste(numberedGen, "<<- labelByFolderGenerator(corpus.dir = directories[i],
format = format,
batch.size = batch.size/numTargets,
maxlen = maxlen,
max_iter = max_iter,
vocabulary = vocabulary,
verbose = verbose,
randomFiles = randomFiles,
step = step,
showWarnings = showWarnings,
seed = seed,
shuffleFastaEntries = shuffleFastaEntries,
numberOfFiles = numberOfFiles,
fileLog = fileLog,
reverseComplements = reverseComplements,
numTargets = numTargets,
onesColumn = i
)"
)
eval(parse(text = genAsText))
}
} else {
# create generator for every folder
for (i in 1:length(directories)){
# different names for validation generators
numberedGenVal <- paste0("genVal", as.character(i))
genAsTextVal <- paste(numberedGenVal, "<<- labelByFolderGenerator(corpus.dir = directories[i],
format = format,
batch.size = batch.size/numTargets,
maxlen = maxlen,
max_iter = max_iter,
vocabulary = vocabulary,
verbose = verbose,
randomFiles = randomFiles,
step = step,
showWarnings = showWarnings,
seed = seed,
shuffleFastaEntries = shuffleFastaEntries,
numberOfFiles = numberOfFiles,
fileLog = fileLog,
reverseComplements = reverseComplements,
numTargets = numTargets,
onesColumn = i
)"
)
eval(parse(text = genAsTextVal))
}
}
}
#' Generator wrapper
#'
#' Iterates over generators created by \code{\link{{initializeGenerators}}
#'
#' @param val Train or validation generator.
#' @param path Path
#' @export
labelByFolderGeneratorWrapper <- function(val, path){
if (!val){
function(){
directories <- path
# combine generators to create one batch
subBatch1 <<- eval(parse(text = paste0("gen", as.character("1"), "()")))
x1 <- subBatch1[[1]]
y1 <- subBatch1[[2]]
if (length(directories) > 1){
for (i in 2:length(directories)){
subBatch1 <<- eval(parse(text = paste0("gen", as.character(i), "()")))
x1 <- abind::abind(x1, subBatch1[[1]], along = 1)
y1 <- rbind(y1, subBatch1[[2]])
}
}
return(list(X = x1, Y = y1))
}
} else {
function(){
directories <- path
# combine generators to create one batch
subBatch2 <<- eval(parse(text = paste0("genVal", as.character("1"), "()")))
x2 <- subBatch2[[1]]
y2 <- subBatch2[[2]]
if (length(directories) > 1){
for (i in 2:length(directories)){
subBatch2 <<- eval(parse(text = paste0("genVal", as.character(i), "()")))
x2 <- abind::abind(x2, subBatch2[[1]], along = 1)
y2 <- rbind(y2, subBatch2[[2]])
}
}
return(list(X = x2, Y = y2))
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.