#' Language model generator for fasta/fastq files
#'
#' @description Iterates over a folder containing fasta/fastq files and produces encodings of predictor sequences
#' and target variables. Takes a sequence of fixed size and uses one part of the sequence as input and the remaining part as target.
#'
#' @inheritParams train_model
#' @param path_corpus Input directory where fasta/fastq files are located, or path to a single file ending with fasta or fastq
#' (as specified in the `format` argument). Can also be a list of directories and/or files.
#' @param format File format, either `"fasta"` or `"fastq"`.
#' @param batch_size Number of samples in one batch.
#' @param maxlen Length of predictor sequence.
#' @param max_iter Stop after `max_iter` consecutive iterations have failed to produce a new batch.
#' @param shuffle_file_order Logical, whether to go through files randomly or sequentially.
#' @param step How often to take a sample: distance between the start positions of two consecutive samples.
#' @param seed Seed for the `set.seed` function, for reproducible results.
#' @param shuffle_input Whether to shuffle entries in every fasta/fastq file before extracting samples.
#' @param verbose Whether to show messages.
#' @param path_file_log If a path is specified, write the names of the processed files to a csv file at that location.
#' @param reverse_complement Boolean; for every new file, randomly decide whether to use the original sequence or its reverse complement.
#' @param ambiguous_nuc How to handle nucleotides outside vocabulary, either `"zero"`, `"discard"`, `"empirical"` or `"equal"`.
#' \itemize{
#' \item If `"zero"`, input gets encoded as zero vector.
#' \item If `"equal"`, input is repetition of `1/length(vocabulary)`.
#' \item If `"discard"`, samples containing nucleotides outside vocabulary get discarded.
#' \item If `"empirical"`, use nucleotide distribution of current file.
#' }
#' @param proportion_per_seq Numerical value between 0 and 1. Proportion of the sequence to take samples from (a random subsequence is used).
#' @param use_quality_score Whether to use fastq quality scores. If `TRUE`, the input is not one-hot encoded but corresponds to probabilities,
#' for example (0.97, 0.01, 0.01, 0.01) instead of (1, 0, 0, 0).
#' @param padding Whether to pad sequences too short for one sample with zeros.
#' @param added_label_path Path to file with additional input labels. Should be a csv file with one column named "file". Other columns should correspond to labels.
#' @param add_input_as_seq Boolean vector specifying for each entry in \code{added_label_path} whether rows from the csv file should be encoded as a sequence or used directly.
#' If a row in your csv file is a sequence, this should be `TRUE`. For example, you may want to add another sequence, say ACCGT. This would correspond to 1,2,2,3,4 in the
#' csv file (if vocabulary = c("A", "C", "G", "T")). If \code{add_input_as_seq} is `TRUE`, 1,2,2,3,4 gets one-hot encoded, so the added input is a 3D tensor. If \code{add_input_as_seq} is
#' `FALSE`, the network is fed the raw data (a 2D tensor).
#' @param skip_amb_nuc Threshold of ambiguous nucleotides to accept in a fasta entry; entries exceeding the threshold get discarded completely.
#' @param max_samples Maximum number of samples to use from one file. If not `NULL` and a file has more than \code{max_samples} samples, a random
#' subset of \code{max_samples} samples is chosen.
#' @param concat_seq Character string or `NULL`. If not `NULL`, all entries from a file get concatenated into one sequence, with the `concat_seq` string between them.
#' Example: if the first entry is AACC, the second entry is TTTG and `concat_seq = "ZZZ"`, the concatenated sequence is AACCZZZTTTG.
#' @param target_len Number of nucleotides to predict at once for the language model.
#' @param file_filter Vector of file names to use from path_corpus.
#' @param use_coverage Integer or `NULL`. If not `NULL`, use coverage as encoding rather than one-hot encoding and normalize.
#' Coverage information must be contained in fasta header: there must be a string `"cov_n"` in the header, where `n` is some integer.
#' @param proportion_entries Proportion of fasta entries to keep. For example, if fasta file has 50 entries and `proportion_entries = 0.1`,
#' will randomly select 5 entries.
#' @param sample_by_file_size Sample new files weighted by file size (bigger files are more likely to be chosen).
#' @param n_gram Integer; instead of encoding the target nucleotide-wise, combine `n` nucleotides at once. For example, for `n = 2`:
#' `"AA" -> (1, 0, ..., 0)`, `"AC" -> (0, 1, 0, ..., 0)`, `"TT" -> (0, ..., 0, 1)`, where the one-hot vectors have length `length(vocabulary)^n`.
#' @param add_noise `NULL` or list of arguments. If not `NULL`, the list must contain an argument \code{noise_type}, which can be `"normal"` or `"uniform"`;
#' optional arguments are `sd` or `mean` if `noise_type` is `"normal"` (default is `sd = 1` and `mean = 0`) or `min`, `max` if `noise_type` is `"uniform"`
#' (default is `min = 0`, `max = 1`).
#' @param return_int Whether to return integer encoding or one-hot encoding.
#' @param reshape_xy Can be a list of functions to apply to input and/or target. List elements (containing the reshape functions)
#'  must be called x for input or y for target, and each must have arguments called x and y. For example:
#'  `reshape_xy = list(x = function(x, y) {return(x+1)}, y = function(x, y) {return(x+y)})`.
#' For the rds generator, the functions need an additional argument called sw.
#' @rawNamespace import(data.table, except = c(first, last, between))
#' @importFrom magrittr %>%
#' @examplesIf reticulate::py_module_available("tensorflow")
#' # create dummy fasta files
#' path_input_1 <- tempfile()
#' dir.create(path_input_1)
#' create_dummy_data(file_path = path_input_1,
#'                   num_files = 2,
#'                   seq_length = 8,
#'                   num_seq = 1,
#'                   vocabulary = c("a", "c", "g", "t"))
#' 
#' gen <- generator_fasta_lm(path_corpus = path_input_1, batch_size = 2,
#'                           maxlen = 7)
#' z <- gen()
#' dim(z[[1]])
#' z[[2]]
#' 
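#' # A second, minimal sketch (reuses the dummy data from above and assumes the
#' # default output_format): predict two nucleotides at once via target_len = 2
#' gen_2 <- generator_fasta_lm(path_corpus = path_input_1, batch_size = 2,
#'                             maxlen = 5, target_len = 2)
#' z_2 <- gen_2()
#' dim(z_2[[1]])
#' 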
#' @returns A generator function.  
#' @export
generator_fasta_lm <- function(path_corpus,
                               format = "fasta",
                               batch_size = 256,
                               maxlen = 250,
                               max_iter = 10000,
                               vocabulary = c("a", "c", "g", "t"),
                               verbose = FALSE,
                               shuffle_file_order = FALSE,
                               step = 1,
                               seed = 1234,
                               shuffle_input = FALSE,
                               file_limit = NULL,
                               path_file_log = NULL,
                               reverse_complement = FALSE,
                               output_format = "target_right",
                               ambiguous_nuc = "zeros",
                               use_quality_score = FALSE,
                               proportion_per_seq = NULL,
                               padding = TRUE,
                               added_label_path = NULL,
                               add_input_as_seq = NULL,
                               skip_amb_nuc = NULL,
                               max_samples = NULL,
                               concat_seq = NULL,
                               target_len = 1,
                               file_filter = NULL,
                               use_coverage = NULL,
                               proportion_entries = NULL,
                               sample_by_file_size = FALSE,
                               n_gram = NULL,
                               n_gram_stride = 1,
                               add_noise = NULL,
                               return_int = FALSE,
                               reshape_xy = NULL) {
  
  
  ##TODO: add check for n-gram and option for stride
  # if (!is.null(n_gram) & !(any(n_gram_stride == c(n_gram, 1)))) {
  #   stop("When using language model with n_gram encoding, n_gram_stride must be 1 or equal to n_gram")
  # } 
  if (!is.null(n_gram)) {
    # maxlen_n_gram <- ceiling((maxlen - n_gram + 1)/n_gram_stride)
    # target_len_n_gram <- ceiling((target_len - n_gram + 1)/n_gram_stride) 
    if (!n_gram_stride == n_gram) {
      stop("When using train_type='lm' with n_gram encoding, n_gram_stride must be equal to n_gram.")
    }  
  } # else {
  #   maxlen_n_gram <- maxlen
  #   target_len_n_gram <- target_len 
  # }
  
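  # Validate reshape_xy: each supplied reshape function must accept arguments
  # named x and y.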
  if (!is.null(reshape_xy)) {
    reshape_xy_bool <- TRUE
    reshape_x_bool <- !is.null(reshape_xy$x)
    if (reshape_x_bool && !all(c('x', 'y') %in% names(formals(reshape_xy$x)))) {
      stop("function reshape_xy$x needs to have arguments named x and y")
    }
    reshape_y_bool <- !is.null(reshape_xy$y)
    if (reshape_y_bool && !all(c('x', 'y') %in% names(formals(reshape_xy$y)))) {
      stop("function reshape_xy$y needs to have arguments named x and y")
    }
  } else {
    reshape_xy_bool <- FALSE
  }
  
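  # Sample sequences of length maxlen + target_len from the wrapped
  # label-folder generator; slice_tensor_lm later splits each sample into
  # input (maxlen positions) and target (target_len positions).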
  total_seq_len <- maxlen + target_len
  gen <- generator_fasta_label_folder(path_corpus = path_corpus,
                                      format = format,
                                      batch_size = batch_size,
                                      maxlen = total_seq_len,
                                      max_iter = max_iter,
                                      vocabulary = vocabulary,
                                      shuffle_file_order = shuffle_file_order,
                                      step = step,
                                      seed = seed,
                                      shuffle_input = shuffle_input,
                                      file_limit = file_limit,
                                      path_file_log = path_file_log,
                                      reverse_complement = reverse_complement,
                                      reverse_complement_encoding = FALSE,
                                      num_targets = 1,
                                      ones_column = 1,
                                      ambiguous_nuc = ambiguous_nuc,
                                      proportion_per_seq = proportion_per_seq,
                                      read_data = FALSE,
                                      use_quality_score = use_quality_score,
                                      padding = padding,
                                      added_label_path = added_label_path,
                                      add_input_as_seq = add_input_as_seq,
                                      skip_amb_nuc = skip_amb_nuc,
                                      max_samples = max_samples,
                                      concat_seq = concat_seq,
                                      file_filter = file_filter,
                                      use_coverage = use_coverage,
                                      proportion_entries = proportion_entries,
                                      sample_by_file_size = sample_by_file_size,
                                      n_gram = n_gram,
                                      n_gram_stride = n_gram_stride,
                                      masked_lm = NULL,
                                      add_noise = add_noise,
                                      return_int = return_int)
  
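  # Return a closure; each call produces one batch of (x, y).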
  function() {
    
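    # Fetch the next batch from the wrapped generator and keep only its input
    # part; with added_label_path, the last list element holds the encoded
    # sequences and the preceding elements hold the additional label inputs.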
    if (is.null(added_label_path)) {
      xy <- gen()[[1]]
    } else {
      z <- gen()[[1]]
      added_input <- z[1:(length(z)-1)]
      xy <- z[[length(z)]]
    }
    
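    # Split each sampled sequence of length maxlen + target_len into input x
    # and target y, arranged according to output_format.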
    xy_list <- slice_tensor_lm(xy = xy,
                               output_format = output_format,
                               target_len = target_len,
                               n_gram = n_gram,
                               # maxlen_n_gram = maxlen_n_gram,
                               # target_len_n_gram = target_len_n_gram, 
                               n_gram_stride = n_gram_stride,
                               total_seq_len = total_seq_len,
                               return_int = return_int)
    
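    # Re-attach the additional label inputs so the model input is a list of
    # added inputs followed by the sequence tensor.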
    if (!is.null(added_label_path)) {
      xy_list <- list(append(added_input, list(xy_list$x)), xy_list$y)
    }
    
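    # Optionally apply the user-supplied reshape functions to x and/or y.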
    if (reshape_xy_bool) {
      xy_list <- f_reshape(x = xy_list$x, y = xy_list$y,
                           reshape_xy = reshape_xy,
                           reshape_x_bool = reshape_x_bool,
                           reshape_y_bool = reshape_y_bool,
                           reshape_sw_bool = FALSE, sw = NULL)
    } 
    
    return(xy_list)
    
  }
}