R/generators.R


#' Custom generator for fasta/fastq files
#'
#' @description
#' \code{fastaFileGenerator} iterates over a folder containing .fasta/.fastq files and produces one-hot encodings of predictor sequences 
#' and target variables.
#' 
#' @param corpus.dir Input directory where .fasta files are located or path to single file ending with .fasta or .fastq 
#' (as specified in format argument).
#' @param format File format, either fasta or fastq.
#' @param batch.size Number of samples per batch.
#' @param maxlen Length of predictor sequence.  
#' @param max_iter Stop if max_iter iterations fail to produce a new batch. 
#' @param vocabulary Vector of allowed characters; characters outside the vocabulary get encoded as zero-vectors.
#' @param randomFiles Logical, whether to go through files randomly or sequentially. 
#' @param step Distance between the start positions of two successive samples.
#' @param showWarnings Logical, give a warning if a character outside the vocabulary appears.
#' @param seed Seed for the set.seed function, for reproducible results when using \code{randomFiles} or \code{shuffleFastaEntries}.
#' @param shuffleFastaEntries Logical, shuffle entries in every fasta file before connecting them to one sequence.
#' @param verbose Whether to show messages. 
#' @param numberOfFiles Use only the specified number of files; ignored if greater than the number of files in corpus.dir. 
#' @param fileLog Write the name of each used file to a csv file if a path is specified.
#' @param reverseComplements Logical; if TRUE, half of the batch contains the original sequences and the other half their reverse 
#' complements. The reverse complement is obtained by reversing the sequence and swapping A/T and C/G. \code{batch.size} has to be even, 
#' otherwise 1 will be added to \code{batch.size}.
#' @return A list of length 2. First element is a 3-dimensional tensor with dimensions (batch.size, maxlen, length(vocabulary)), encoding 
#' the predictor sequences. Second element is a matrix with dimensions (batch.size, length(vocabulary)), encoding the targets.
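#' @examples
#' \dontrun{
#' # Minimal usage sketch (not run); "path/to/fasta" is a placeholder for a
#' # directory containing .fasta files.
#' gen <- fastaFileGenerator("path/to/fasta", batch.size = 8, maxlen = 30)
#' batch <- gen()
#' dim(batch$X) # (8, 30, 4): one-hot encoded predictor sequences
#' dim(batch$Y) # (8, 4): one-hot encoded next characters
#' }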
#' @export
fastaFileGenerator <- function(corpus.dir,
                               format = "fasta",
                               batch.size = 256,
                               maxlen = 250,
                               max_iter = 2000,
                               vocabulary = c("a", "c", "g", "t"),
                               verbose = FALSE,
                               randomFiles = FALSE,
                               step = 1, 
                               showWarnings = FALSE,
                               seed = 1234,
                               shuffleFastaEntries = FALSE,
                               numberOfFiles = NULL,
                               fileLog = NULL,
                               reverseComplements = FALSE){
  
  if ((batch.size %% 2 != 0) & reverseComplements){
    message(paste("Batch size has to be even for reverse complements. Setting batch size to", batch.size + 1))
    batch.size <- batch.size + 1
  }
  
  if (reverseComplements){
    # halve batch.size, other half gets filled with reverse complements
    batch.size <- batch.size/2
  }
  
  tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), vocabulary)
  start_index_list <- vector("list")
  file_index <- 1
  num_samples <- 0
  start_index <- 1
  
  # single file
  if (endsWith(corpus.dir, paste0(".", format))) {
    num_files <- 1 
    fasta.files <- corpus.dir   
  } else {
  } else {
    fasta.files <- list.files(
      path = xfun::normalize_path(corpus.dir),
      pattern = paste0("\\.", format, "$"),
      full.names = TRUE)
    num_files <- length(fasta.files)
    
    set.seed(seed)
    if (randomFiles) fasta.files <- sample(fasta.files, replace = FALSE)
  }
  
  # regular expression for chars outside vocabulary
  pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
  
  # sequence vector collects strings until one batch can be created   
  sequence_list <- vector("list")
  sequence_list_index <- 1
  
  fasta.file <- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
  if (shuffleFastaEntries){
    fasta.file <- sample(fasta.file)
  }
  seq_vector <- stringr::str_to_lower(paste(fasta.file))
  length_vector <- Biostrings::width(fasta.file)
  # pad short sequences with zeros
  short_seq_index <- which(length_vector < (maxlen + 1))
  for (i in short_seq_index){
    seq_vector[i] <- paste0(paste(rep("0", (maxlen + 1) - length_vector[i]),collapse = ""), seq_vector[i])
    length_vector[i] <- maxlen + 1
  }
  nucSeq <- paste(seq_vector, collapse = "")  
  
  # start positions of samples
  start_indices <- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "lm")
  
  nucSeq <- stringr::str_to_lower(nucSeq) 
  nucSeq <- keras::texts_to_sequences(tokenizer, nucSeq)[[1]] - 1
  
  # use subset of files
  if (!is.null(numberOfFiles) && (numberOfFiles < length(fasta.files))){
    fasta.files <- fasta.files[1:numberOfFiles]
    num_files <- length(fasta.files)
  }
  
  # log file
  if (!is.null(fileLog)){
    if (!endsWith(fileLog, ".csv")) fileLog <- paste0(fileLog, ".csv")
    write.table(x = fasta.files[file_index], file = fileLog, col.names = FALSE)
  }
  
  # test for chars outside vocabulary
  if (showWarnings){
    charsOutsideVoc <- any(stringr::str_detect(seq_vector, pattern))
    if (charsOutsideVoc) warning("file ", fasta.files[file_index], " contains characters outside vocabulary")
  }
  
  if (verbose) message("Initializing ...")
  
  function() {
    iter <- 1
    # loop until enough samples collected
    while(num_samples < batch.size){  
      # loop through sub-sequences/files until sequence of suitable length is found   
      while((start_index > length(start_indices)) | length(start_indices) == 0){
        
        # go to next file 
        file_index <<- file_index + 1
        start_index <<- 1
        if (file_index > length(fasta.files)) file_index <<- 1
        filePath <<- fasta.files[[file_index]]
        fasta.file <<- Biostrings::readDNAStringSet(filePath, format = format)
        # log file
        if (!is.null(fileLog)){
          write.table(x = filePath, file = fileLog, append = TRUE, col.names = FALSE)
        }
        # test for chars outside vocabulary
        if (showWarnings){
          charsOutsideVoc <- any(stringr::str_detect(stringr::str_to_lower(paste(fasta.file)), pattern))
          if (charsOutsideVoc) warning("file ", filePath, " contains characters outside vocabulary")
        }
        if (shuffleFastaEntries){
          fasta.file <<- sample(fasta.file)
        }
        seq_vector <<- stringr::str_to_lower(paste(fasta.file))
        length_vector <<- Biostrings::width(fasta.file)
        # pad short sequences with zeros
        short_seq_index <<- which(length_vector < (maxlen + 1))
        for (i in short_seq_index){
          seq_vector[i] <<- paste0(paste(rep("0", (maxlen + 1) - length_vector[i]),collapse = ""), seq_vector[i])
          length_vector[i] <<- maxlen + 1
        }
        nucSeq <<- paste(seq_vector, collapse = "")  
        
        # start positions of samples
        start_indices <<- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "lm")
        
        nucSeq <<- stringr::str_to_lower(nucSeq) 
        nucSeq <<- keras::texts_to_sequences(tokenizer, nucSeq)[[1]] - 1
        
        
        if (iter > max_iter){
          stop('exceeded max_iter value, try reducing maxlen parameter')
        }
        iter <- iter + 1
      }
      
      # go as far as possible in sequence or stop when enough samples are collected 
      remainingSamples <- batch.size - num_samples
      end_index <- min(length(start_indices), start_index + remainingSamples  - 1)
      
      subsetStartIndices <- start_indices[start_index:end_index]
      sequence_list[[sequence_list_index]] <- nucSeq[subsetStartIndices[1]:(subsetStartIndices[length(subsetStartIndices)] + maxlen)]
      start_index_list[[sequence_list_index]] <- subsetStartIndices
      sequence_list_index <<- sequence_list_index + 1
      
      num_new_samples <- end_index - start_index + 1   
      num_samples <- num_samples + num_new_samples 
      start_index <<- end_index + 1  
    }
    
    # one hot encode strings collected in sequence_list and connect arrays
    array_list <- purrr::map(1:length(sequence_list), ~sequenceToArray(sequence_list[[.x]],
                                                                       maxlen = maxlen, vocabulary = vocabulary,
                                                                       startInd =  start_index_list[[.x]]))
    
    # create samples from reverse complements 
    if (reverseComplements){
      a <- which(stringr::str_to_lower(vocabulary) == "a")
      c <- which(stringr::str_to_lower(vocabulary) == "c")
      g <- which(stringr::str_to_lower(vocabulary) == "g")
      t <- which(stringr::str_to_lower(vocabulary) == "t")
      # reverse order and switch a/t and c/g
      for (i in 1:length(sequence_list)){
        sequence_list[[i]] <- rev(sequence_list[[i]])
        sequence_list[[i]] <- plyr::mapvalues(sequence_list[[i]], c(a,c,g,t), c(t,g,c,a), warn_missing = FALSE)
      }
      
      array_list_rev_comp <- purrr::map(1:length(sequence_list), ~sequenceToArray(sequence_list[[.x]],
                                                                                  maxlen = maxlen, vocabulary = vocabulary,  
                                                                                  startInd = rev(length(sequence_list[[.x]]) - start_index_list[[.x]] - maxlen + 1)))
      array_list <- c(array_list, array_list_rev_comp)
    }
    
    x <- array_list[[1]][[1]]
    y <- array_list[[1]][[2]]
    if (length(array_list) > 1){
      for (i in 2:length(array_list)){
        x <- abind::abind(x, array_list[[i]][[1]], along = 1)
        y <- rbind(y, array_list[[i]][[2]])
      }
    }
    
    # coerce y type to matrix
    if (dim(x)[1] == 1){
      dim(y) <- c(1, length(vocabulary))
    }
    
    # empty sequence_list for next batch 
    start_index_list <<- vector("list")
    sequence_list <<- vector("list")
    sequence_list_index <<- 1
    num_samples <<- 0
    return(list(X = x, Y = y))
  }
}

#' Custom generator for fasta files and label targets
#' 
#' @description
#' \code{fastaLabelGenerator} iterates over a folder containing .fasta files and produces one-hot encodings of predictor sequences 
#' and target variables. Targets are read from the fasta headers.
#' 
#' @param corpus.dir Input directory where .fasta files are located or path to single file ending with .fasta or .fastq 
#' (as specified in format argument).
#' @param format File format, either fasta or fastq.
#' @param batch.size Number of samples per batch.
#' @param maxlen Length of predictor sequence.  
#' @param max_iter Stop if max_iter iterations fail to produce a new batch. 
#' @param vocabulary Vector of allowed characters; characters outside the vocabulary get encoded as zero-vectors.
#' @param randomFiles Logical, whether to go through files randomly or sequentially. 
#' @param step Distance between the start positions of two successive samples.
#' @param showWarnings Logical, give a warning if a character outside the vocabulary appears.
#' @param seed Seed for the set.seed function, for reproducible results when using \code{randomFiles} or \code{shuffleFastaEntries}.
#' @param shuffleFastaEntries Logical, shuffle fasta entries.
#' @param verbose Whether to show messages. 
#' @param numberOfFiles Use only the specified number of files; ignored if greater than the number of files in corpus.dir. 
#' @param fileLog Write the name of each used file to a csv file if a path is specified.
#' @param labelVocabulary Character vector of possible targets. Targets outside \code{labelVocabulary} will get discarded.
#' @param reverseComplements Logical; if TRUE, half of the batch contains the original sequences and the other half their reverse 
#' complements. The reverse complement is obtained by reversing the sequence and swapping A/T and C/G. \code{batch.size} has to be even, 
#' otherwise 1 will be added to \code{batch.size}.
#' @return A list of length 2. First element is a 3-dimensional tensor with dimensions (batch.size, maxlen, length(vocabulary)), encoding 
#' the predictor sequences. Second element is a matrix with dimensions (batch.size, length(labelVocabulary)), encoding the targets.
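#' @examples
#' \dontrun{
#' # Minimal usage sketch (not run); "path/to/fasta" is a placeholder and the
#' # fasta headers are assumed to be one of "x", "y", "z".
#' gen <- fastaLabelGenerator("path/to/fasta", batch.size = 8, maxlen = 30,
#'                            labelVocabulary = c("x", "y", "z"))
#' batch <- gen()
#' dim(batch$X) # (8, 30, 4): one-hot encoded predictor sequences
#' dim(batch$Y) # (8, 3): one column per label
#' }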
#' @export
fastaLabelGenerator <- function(corpus.dir,
                                format = "fasta",
                                batch.size = 256,
                                maxlen = 250,
                                max_iter = 10000,
                                vocabulary = c("a", "c", "g", "t"),
                                verbose = FALSE,
                                randomFiles = FALSE,
                                step = 1, 
                                showWarnings = FALSE,
                                seed = 1234,
                                shuffleFastaEntries = FALSE,
                                numberOfFiles = NULL,
                                fileLog = NULL,
                                labelVocabulary = c("x", "y", "z"),
                                reverseComplements = TRUE){
  
  if ((batch.size %% 2 != 0) & reverseComplements){
    message(paste("Batch size has to be even for reverse complements. Setting batch size to", batch.size + 1))
    batch.size <- batch.size + 1
  }
  
  if (reverseComplements){
    # halve batch.size, other half gets filled with reverse complements
    batch.size <- batch.size/2
  }
  
  vocabulary <- stringr::str_to_lower(vocabulary)
  labelVocabulary <- stringr::str_to_lower(labelVocabulary)
  start_index_list <- vector("list")
  file_index <- 1
  num_samples <- 0
  start_index <- 1
  tokenizer_pred <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), vocabulary) 
  tokenizer_target <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = FALSE, lower = TRUE, filters = "\t\n"),
                                                labelVocabulary) 
  
  # single file
  if (endsWith(corpus.dir, paste0(".", format))) {
    num_files <- 1 
    fasta.files <- corpus.dir   
  } else {
    fasta.files <- list.files(
      path = xfun::normalize_path(corpus.dir),
      pattern = paste0("\\.", format, "$"),
      full.names = TRUE)
    num_files <- length(fasta.files)
  }
  
  set.seed(seed)
  if (randomFiles) fasta.files <- sample(fasta.files, replace = FALSE)
  
  # pre-load the first file
  fasta.file <- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
  if (shuffleFastaEntries){
    fasta.file <- sample(fasta.file)
  }
  seq_vector <- stringr::str_to_lower(paste(fasta.file))
  length_vector <- Biostrings::width(fasta.file)
  label_vector <- trimws(stringr::str_to_lower(names(fasta.file)))
  
  
  # regular expression for chars outside vocabulary
  pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
  
  # sequence vector collects strings until one batch can be created   
  sequence_list <- vector("list")
  target_list <- vector("list")
  sequence_list_index <- 1
  
  # filter out samples with labels outside labelVocabulary
  label_filter <- label_vector %in% labelVocabulary
  seq_vector <- seq_vector[label_filter]
  length_vector <- length_vector[label_filter]
  label_vector <- label_vector[label_filter]
  
  # pad short sequences with zeros
  short_seq_index <- which(length_vector < maxlen)
  for (i in short_seq_index){
    seq_vector[i] <- paste0(paste(rep("0", maxlen - length_vector[i]),collapse = ""), seq_vector[i])
    length_vector[i] <- maxlen 
  }
  
  nucSeq <- paste(seq_vector, collapse = "")
  start_indices <- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "label")
  startNewEntry <- cumsum(c(1, length_vector[-length(length_vector)]))
  nucSeq <- keras::texts_to_sequences(tokenizer_pred, nucSeq)[[1]] - 1
  
  # use subset of files
  if (!is.null(numberOfFiles) && (numberOfFiles < length(fasta.files))){
    fasta.files <- fasta.files[1:numberOfFiles]
    num_files <- length(fasta.files)
  }
  
  # log file
  if (!is.null(fileLog)){
    if (!endsWith(fileLog, ".csv")) fileLog <- paste0(fileLog, ".csv")
    write.table(x = fasta.files[file_index], file = fileLog, col.names = FALSE)
  }
  
  # test for chars outside vocabulary
  if (showWarnings){
    charsOutsideVoc <- any(stringr::str_detect(seq_vector, pattern))
    if (charsOutsideVoc) warning("file ", fasta.files[file_index], " contains characters outside vocabulary")
  }
  
  if (verbose) message("Initializing ...")
  
  function() {
    iter <- 1
    # loop until enough samples collected
    while(num_samples < batch.size){  
      # loop through sub-sequences/files until sequence of suitable length is found   
      while((start_index > length(start_indices)) | length(start_indices) == 0){
        
        # go to next file 
        file_index <<- file_index + 1
        start_index <<- 1
        if (file_index > length(fasta.files)) file_index <<- 1
        fasta.file <<- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
        if (shuffleFastaEntries){
          fasta.file <<- sample(fasta.file)
        }
        seq_vector <<- stringr::str_to_lower(paste(fasta.file))
        length_vector <<- Biostrings::width(fasta.file)
        label_vector <<- trimws(stringr::str_to_lower(names(fasta.file)))
        
        # log file
        if (!is.null(fileLog)){
          write.table(x = fasta.files[file_index], file = fileLog, append = TRUE, col.names = FALSE)
        }
        # test for chars outside vocabulary
        if (showWarnings){
          charsOutsideVoc <- any(stringr::str_detect(seq_vector, pattern))
          if (charsOutsideVoc) warning("file ", fasta.files[file_index], " contains characters outside vocabulary")
        }
        
        # filter out samples with labels outside labelVocabulary
        label_filter <<- label_vector %in% labelVocabulary
        # no samples in current file: clear start_indices and go to the next file
        if (all(!label_filter)){
          start_indices <<- integer(0)
          iter <- iter + 1
          if (iter > max_iter) stop('exceeded max_iter value, try reducing maxlen parameter')
          next
        }
        seq_vector <<- seq_vector[label_filter]
        length_vector <<- length_vector[label_filter]
        label_vector <<- label_vector[label_filter]
        
        # pad short sequences with zeros
        short_seq_index <<- which(length_vector < maxlen)
        for (i in short_seq_index){
          seq_vector[i] <<- paste0(paste(rep("0", maxlen - length_vector[i]),collapse = ""), seq_vector[i])
          length_vector[i] <<- maxlen 
        }
        
        nucSeq <<- paste(seq_vector, collapse = "")
        start_indices <<- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "label")
        startNewEntry <<- cumsum(c(1, length_vector[-length(length_vector)]))
        nucSeq <<- keras::texts_to_sequences(tokenizer_pred, nucSeq)[[1]] - 1
        
        if (iter > max_iter){
          stop('exceeded max_iter value, try reducing maxlen parameter')
        }
        iter <- iter + 1
      }

      # go as far as possible in sequence or stop when enough samples are collected 
      remainingSamples <- batch.size - num_samples
      end_index <- min(length(start_indices), start_index + remainingSamples  - 1)
      
      subsetStartIndices <- start_indices[start_index:end_index]
      sequence_list[[sequence_list_index]] <- nucSeq[subsetStartIndices[1]:(subsetStartIndices[length(subsetStartIndices)] + maxlen - 1)]
      # collect targets 
      target_list[[sequence_list_index]] <- as.character(cut(subsetStartIndices, breaks = c(startNewEntry, length(nucSeq)),
                                                             labels = label_vector, include.lowest = TRUE, right = FALSE))
      start_index_list[[sequence_list_index]] <- subsetStartIndices
      sequence_list_index <<- sequence_list_index + 1
      num_new_samples <- end_index - start_index + 1   
      num_samples <- num_samples + num_new_samples 
      start_index <<- end_index + 1  
    }
    
    # one hot encode strings collected in sequence_list and connect arrays
    array_x_list <- purrr::map(1:length(sequence_list), ~sequenceToArrayLabel(sequence_list[[.x]],
                                                                              maxlen = maxlen, vocabulary = vocabulary, 
                                                                              startInd =  start_index_list[[.x]]))
    if (reverseComplements){
      a <- which(stringr::str_to_lower(vocabulary) == "a")
      c <- which(stringr::str_to_lower(vocabulary) == "c")
      g <- which(stringr::str_to_lower(vocabulary) == "g")
      t <- which(stringr::str_to_lower(vocabulary) == "t")
      for (i in 1:length(array_x_list)){
        x <- array_x_list[[i]]
        x_rev_comp <- array(0, dim = dim(x))
        # switch A/T and C/G
        x_rev_comp[ , , a] <- x[ , , t]
        x_rev_comp[ , , c] <- x[ , , g]
        x_rev_comp[ , , g] <- x[ , , c]
        x_rev_comp[ , , t] <- x[ , , a]
        x_rev_comp[ , , -c(a,c,g,t)] <- x[ , , -c(a,c,g,t)]
        reverse <- (dim(x)[2]):1
        x_rev_comp <- array(x_rev_comp[ , reverse, ], dim = dim(x)) # reverse order
        array_x_list[[i]] <- abind::abind(x, x_rev_comp, along = 1)
      }
    }
    
    # one hot encode targets
    target_int <- unlist(keras::texts_to_sequences(tokenizer_target, unlist(target_list))) - 1
    y  <- keras::to_categorical(target_int, num_classes = length(labelVocabulary))
    if (reverseComplements) y <- rbind(y, y)
    
    x <- array_x_list[[1]]
    
    if (length(array_x_list) > 1){
      for (i in 2:length(array_x_list)){
        x <- abind::abind(x, array_x_list[[i]], along = 1)
      }
    }
    
    # coerce y type to matrix
    if (dim(x)[1] == 1){
      dim(y) <- c(1, length(labelVocabulary))
    }
    
    # empty sequence_list for next batch 
    start_index_list <<- vector("list")
    sequence_list <<- vector("list")
    target_list <<- vector("list")
    sequence_list_index <<- 1
    num_samples <<- 0
    return(list(X = x, Y = y))
  }
}

#' Custom generator for fasta files
#'
#' @description
#' \code{labelByFolderGenerator} iterates over a folder containing .fasta files and produces one-hot encodings of predictor sequences 
#' and target variables. Files in \code{corpus.dir} should all belong to one class.
#' 
#' @param corpus.dir Input directory where .fasta files are located or path to single file ending with .fasta or .fastq 
#' (as specified in format argument).
#' @param format File format, either fasta or fastq.
#' @param batch.size Number of samples per batch.
#' @param maxlen Length of predictor sequence.  
#' @param max_iter Stop if max_iter iterations fail to produce a new batch. 
#' @param vocabulary Vector of allowed characters; characters outside the vocabulary get encoded as zero-vectors.
#' @param randomFiles Logical, whether to go through files randomly or sequentially. 
#' @param step Distance between the start positions of two successive samples.
#' @param showWarnings Logical, give a warning if a character outside the vocabulary appears.
#' @param seed Seed for the set.seed function, for reproducible results when using \code{randomFiles} or \code{shuffleFastaEntries}.
#' @param shuffleFastaEntries Logical, shuffle fasta entries.
#' @param verbose Whether to show messages. 
#' @param numberOfFiles Use only the specified number of files; ignored if greater than the number of files in corpus.dir. 
#' @param fileLog Write the name of each used file to a csv file if a path is specified.
#' @param numTargets Number of columns of the target matrix.  
#' @param onesColumn Which column of the target matrix contains the ones.
#' @param reverseComplements Logical; if TRUE, half of the batch contains the original sequences and the other half their reverse 
#' complements. The reverse complement is obtained by reversing the sequence and swapping A/T and C/G. \code{batch.size} has to be even, 
#' otherwise 1 will be added to \code{batch.size}.
#' @return A list of length 2. First element is a 3-dimensional tensor with dimensions (batch.size, maxlen, length(vocabulary)), encoding 
#' the predictor sequences. Second element is a matrix with dimensions (batch.size, numTargets), encoding the targets.
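#' @examples
#' \dontrun{
#' # Minimal usage sketch (not run); "path/to/class_1" is a placeholder for a
#' # folder whose files all belong to the first of three classes.
#' gen <- labelByFolderGenerator("path/to/class_1", batch.size = 8, maxlen = 30,
#'                               numTargets = 3, onesColumn = 1)
#' batch <- gen()
#' dim(batch$Y) # (8, 3): every row is c(1, 0, 0)
#' }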
#' @export
labelByFolderGenerator <- function(corpus.dir,
                                   format = "fasta",
                                   batch.size = 256,
                                   maxlen = 250,
                                   max_iter = 10000,
                                   vocabulary = c("a", "c", "g", "t"),
                                   verbose = FALSE,
                                   randomFiles = FALSE,
                                   step = 1, 
                                   showWarnings = FALSE,
                                   seed = 1234,
                                   shuffleFastaEntries = FALSE,
                                   numberOfFiles = NULL,
                                   fileLog = NULL,
                                   reverseComplements = TRUE,
                                   numTargets,
                                   onesColumn){
  
  stopifnot(onesColumn <= numTargets)
  
  if ((batch.size %% 2 != 0) & reverseComplements){
    message(paste("Batch size has to be even for reverse complements. Setting batch size to", batch.size + 1))
    batch.size <- batch.size + 1
  }
  
  if (reverseComplements){
    # halve batch.size, other half gets filled with reverse complements
    batch.size <- batch.size/2
  }
  
  vocabulary <- stringr::str_to_lower(vocabulary)
  start_index_list <- vector("list")
  file_index <- 1
  num_samples <- 0
  start_index <- 1
  tokenizer_pred <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), vocabulary) 
  
  # single file
  if (endsWith(corpus.dir, paste0(".", format))) {
    num_files <- 1 
    fasta.files <- corpus.dir   
  } else {
    fasta.files <- list.files(
      path = xfun::normalize_path(corpus.dir),
      pattern = paste0("\\.", format, "$"),
      full.names = TRUE)
    num_files <- length(fasta.files)
    
    set.seed(seed)
    if (randomFiles) fasta.files <- sample(fasta.files, replace = FALSE)
  }
  
  fasta.file <- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
  if (shuffleFastaEntries){
    fasta.file <- sample(fasta.file)
  }
  seq_vector <- stringr::str_to_lower(paste(fasta.file))
  length_vector <- Biostrings::width(fasta.file)
  
  # regular expression for chars outside vocabulary
  pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
  
  # sequence vector collects strings until one batch can be created   
  sequence_list <- vector("list")
  target_list <- vector("list")
  sequence_list_index <- 1
  
  # pad short sequences with zeros
  short_seq_index <- which(length_vector < maxlen)
  for (i in short_seq_index){
    seq_vector[i] <- paste0(paste(rep("0", maxlen - length_vector[i]),collapse = ""), seq_vector[i])
    length_vector[i] <- maxlen 
  }
  nucSeq <- paste(seq_vector, collapse = "")
  start_indices <- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "label")
  startNewEntry <- cumsum(c(1, length_vector[-length(length_vector)]))
  nucSeq <- keras::texts_to_sequences(tokenizer_pred, nucSeq)[[1]] - 1
  
  # use subset of files
  if (!is.null(numberOfFiles) && (numberOfFiles < length(fasta.files))){
    fasta.files <- fasta.files[1:numberOfFiles]
    num_files <- length(fasta.files)
  }
  
  # log file
  if (!is.null(fileLog)){
    if (!endsWith(fileLog, ".csv")) fileLog <- paste0(fileLog, ".csv")
    write.table(x = fasta.files[file_index], file = fileLog, col.names = FALSE, append = TRUE)
  }
  
  # test for chars outside vocabulary
  if (showWarnings){
    charsOutsideVoc <- any(stringr::str_detect(seq_vector, pattern))
    if (charsOutsideVoc) warning("file ", fasta.files[file_index], " contains characters outside vocabulary")
  }
  
  if (verbose) message("Initializing ...")
  
  function() {
    iter <- 1
    # loop until enough samples collected
    while(num_samples < batch.size){  
      # loop through sub-sequences/files until sequence of suitable length is found   
      while((start_index > length(start_indices)) | length(start_indices) == 0){
        
        # go to next file 
        file_index <<- file_index + 1
        start_index <<- 1
        if (file_index > length(fasta.files)) file_index <<- 1
        fasta.file <<- Biostrings::readDNAStringSet(fasta.files[file_index], format = format)
        if (shuffleFastaEntries){
          fasta.file <<- sample(fasta.file)
        }
        seq_vector <<- stringr::str_to_lower(paste(fasta.file))
        length_vector <<- Biostrings::width(fasta.file)
        
        # log file
        if (!is.null(fileLog)){
          write.table(x = fasta.files[file_index], file = fileLog, append = TRUE, col.names = FALSE)
        }
        # test for chars outside vocabulary
        if (showWarnings){
          charsOutsideVoc <- any(stringr::str_detect(seq_vector, pattern))
          if (charsOutsideVoc) warning("file ", fasta.files[file_index], " contains characters outside vocabulary")
        }
        
        # pad short sequences with zeros
        short_seq_index <<- which(length_vector < maxlen)
        for (i in short_seq_index){
          seq_vector[i] <<- paste0(paste(rep("0", maxlen - length_vector[i]),collapse = ""), seq_vector[i])
          length_vector[i] <<- maxlen 
        }
        nucSeq <<- paste(seq_vector, collapse = "")
        start_indices <<- getStartInd(seq_vector = seq_vector, length_vector = length_vector, maxlen = maxlen, step = step, train_mode = "label")
        startNewEntry <<- cumsum(c(1, length_vector[-length(length_vector)]))
        nucSeq <<- keras::texts_to_sequences(tokenizer_pred, nucSeq)[[1]] - 1
        
        if (iter > max_iter){
          stop('exceeded max_iter value, try reducing maxlen parameter')
        }
        iter <- iter + 1
      }
      
      # go as far as possible in sequence or stop when enough samples are collected 
      remainingSamples <- batch.size - num_samples
      end_index <- min(length(start_indices), start_index + remainingSamples  - 1)
      
      subsetStartIndices <- start_indices[start_index:end_index]
      sequence_list[[sequence_list_index]] <- nucSeq[subsetStartIndices[1] : (subsetStartIndices[length(subsetStartIndices)] + maxlen - 1)]
      start_index_list[[sequence_list_index]] <- subsetStartIndices
      sequence_list_index <<- sequence_list_index + 1
      
      num_new_samples <- end_index - start_index + 1   
      num_samples <- num_samples + num_new_samples 
      start_index <<- end_index + 1  
    }
    
    # one hot encode strings collected in sequence_list and connect arrays
    array_x_list <- purrr::map(1:length(sequence_list), ~sequenceToArrayLabel(sequence_list[[.x]],
                                                                              maxlen = maxlen, vocabulary = vocabulary, 
                                                                              startInd =  start_index_list[[.x]]))
    
    if (reverseComplements){
      a <- which(stringr::str_to_lower(vocabulary) == "a")
      c <- which(stringr::str_to_lower(vocabulary) == "c")
      g <- which(stringr::str_to_lower(vocabulary) == "g")
      t <- which(stringr::str_to_lower(vocabulary) == "t")
      for (i in 1:length(array_x_list)){
        x <- array_x_list[[i]]
        x_rev_comp <- array(0, dim = dim(x))
        # switch A/T and C/G
        x_rev_comp[ , , a] <- x[ , , t]
        x_rev_comp[ , , c] <- x[ , , g]
        x_rev_comp[ , , g] <- x[ , , c]
        x_rev_comp[ , , t] <- x[ , , a]
        x_rev_comp[ , , -c(a,c,g,t)] <- x[ , , -c(a,c,g,t)]
        reverse <- (dim(x)[2]):1
        x_rev_comp <- array(x_rev_comp[ , reverse, ], dim = dim(x)) # reverse order
        array_x_list[[i]] <- abind::abind(x, x_rev_comp, along = 1)
      }
    }
    
    x <- array_x_list[[1]]
    
    if (length(array_x_list) > 1){
      for (i in 2:length(array_x_list)){
        x <- abind::abind(x, array_x_list[[i]], along = 1)
      }
    }
    
    # one hot encode targets
    y  <- matrix(0, ncol = numTargets, nrow = dim(x)[1])
    y[ , onesColumn] <- 1
    
    # coerce y type to matrix
    if (dim(x)[1] == 1){
      dim(y) <- c(1, numTargets)
    }
    
    # empty sequence_list for next batch 
    start_index_list <<- vector("list")
    sequence_list <<- vector("list")
    target_list <<- vector("list")
    sequence_list_index <<- 1
    num_samples <<- 0
    return(list(X = x, Y = y))
  }
}

#' Initializes generators defined by labelByFolderGenerator function
#'
#' \code{initializeGenerators} initializes generators defined by the \code{\link{labelByFolderGenerator}} function. Targets get one-hot-encoded in the order of the directories.
#' The number of classes is given by the length of \code{directories}.
#' 
#' @param directories Vector of paths to folder containing fasta files. Files in one folder should belong to one class. 
#' @param format File format.
#' @param batch.size Number of samples per batch; will get rounded up to a multiple of the number of targets if necessary.
#' @param maxlen Length of predictor sequence.  
#' @param max_iter Stop if max_iter iterations fail to produce a new batch. 
#' @param vocabulary Vector of allowed characters; characters outside the vocabulary get encoded as zero-vectors.
#' @param randomFiles Logical, whether to go through files randomly or sequentially. 
#' @param step Distance between the start positions of two successive samples.
#' @param showWarnings Logical, give a warning if a character outside the vocabulary appears.
#' @param seed Seed for the set.seed function, for reproducible results when using \code{randomFiles} or \code{shuffleFastaEntries}.
#' @param shuffleFastaEntries Logical, shuffle fasta entries.
#' @param verbose Whether to show messages. 
#' @param numberOfFiles Use only the specified number of files; ignored if greater than the number of files in \code{directories}. 
#' @param fileLog Write the name of each used file to a csv file if a path is specified.
#' @param reverseComplements Logical; if TRUE, half of the batch contains the original sequences and the other half their reverse 
#' complements. The reverse complement is obtained by reversing the sequence and swapping A/T and C/G. \code{batch.size} has to be even, 
#' otherwise 1 will be added to \code{batch.size}.
#' @param val Logical; initialized generators are named "genY" if FALSE or "genValY" if TRUE, where Y is an integer between 1 and the length of \code{directories}.
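#' @examples
#' \dontrun{
#' # Minimal usage sketch (not run); placeholder paths, one folder per class.
#' # Creates generators gen1 and gen2 in the global environment.
#' initializeGenerators(directories = c("path/to/class_1", "path/to/class_2"),
#'                      batch.size = 8, maxlen = 30)
#' batch <- gen1() # sub-batch of 4 samples, all with target c(1, 0)
#' }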
#' @export
initializeGenerators <- function(directories,
                                 format = "fasta",
                                 batch.size = 256,
                                 maxlen = 250,
                                 max_iter = 10000,
                                 vocabulary = c("a", "c", "g", "t"),
                                 verbose = FALSE,
                                 randomFiles = FALSE,
                                 step = 1, 
                                 showWarnings = FALSE,
                                 seed = 1234,
                                 shuffleFastaEntries = FALSE,
                                 numberOfFiles = NULL,
                                 fileLog = NULL,
                                 reverseComplements = FALSE, 
                                 val = FALSE
){
  
  # adjust batch.size
  if (batch.size %% length(directories) != 0){
    batchType <- ifelse(val, "validation", "training")
    batch.size <- ceiling(batch.size/length(directories)) * length(directories)
    if (!reverseComplements){
      message(paste("Batch size needs to be multiple of number of targets. Setting", batchType, "batch size to", batch.size))
    } else {
      if ((batch.size/length(directories)) %% 2 == 0){
        message(paste("Batch size needs to be multiple of number of targets. Setting", batchType, "batch size to", batch.size))
      } else {
        subBatchSize <- (batch.size/length(directories))
        batch.size <- (subBatchSize + 1) * length(directories)  
        message(paste("Batch size needs to be multiple of number of targets and batch.size/(number of labels) needs to be even when reverseComlements is true.
                      Setting", batchType  ,"batch size to", batch.size))
      }
    }
  }
  
  numTargets <- length(directories)
  
  # create one generator per folder; validation generators are named
  # "genVal1", "genVal2", ... instead of "gen1", "gen2", ...
  prefix <- ifelse(val, "genVal", "gen")
  for (i in 1:length(directories)){
    generator <- labelByFolderGenerator(corpus.dir = directories[i],
                                        format = format,
                                        batch.size = batch.size/numTargets,
                                        maxlen = maxlen,
                                        max_iter = max_iter,
                                        vocabulary = vocabulary,
                                        verbose = verbose,
                                        randomFiles = randomFiles,
                                        step = step,
                                        showWarnings = showWarnings,
                                        seed = seed,
                                        shuffleFastaEntries = shuffleFastaEntries,
                                        numberOfFiles = numberOfFiles,
                                        fileLog = fileLog,
                                        reverseComplements = reverseComplements,
                                        numTargets = numTargets,
                                        onesColumn = i)
    assign(paste0(prefix, i), generator, envir = .GlobalEnv)
  }
}


#' Generator wrapper
#'
#' Iterates over the generators created by \code{\link{initializeGenerators}}.
#' 
#' @param val Logical; if TRUE, iterate over the validation generators ("genValY"), otherwise over the training generators ("genY").
#' @param path Vector of paths to the class folders, as passed to \code{\link{initializeGenerators}}; its length determines the number of sub-generators to combine.
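#' @examples
#' \dontrun{
#' # Minimal usage sketch (not run); assumes initializeGenerators() was called
#' # on the same directories first (placeholder paths).
#' dirs <- c("path/to/class_1", "path/to/class_2")
#' initializeGenerators(directories = dirs, batch.size = 8, maxlen = 30)
#' trainGen <- labelByFolderGeneratorWrapper(val = FALSE, path = dirs)
#' batch <- trainGen() # combined batch of 8 samples, 4 per class
#' }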
#' @export
labelByFolderGeneratorWrapper <- function(val, path){
  prefix <- ifelse(val, "genVal", "gen")
  function(){
    directories <- path
    # combine the sub-generators (one per class) to create one batch
    subBatch <- get(paste0(prefix, 1), envir = .GlobalEnv)()
    x <- subBatch[[1]]
    y <- subBatch[[2]]
    if (length(directories) > 1){
      for (i in 2:length(directories)){
        subBatch <- get(paste0(prefix, i), envir = .GlobalEnv)()
        x <- abind::abind(x, subBatch[[1]], along = 1)
        y <- rbind(y, subBatch[[2]])
      }
    }
    return(list(X = x, Y = y))
  }
}