R/predict.R

#' Make prediction for nucleotide sequence or entries in fasta/fastq file
#'
#' @description Removes layers (optional) from pretrained model and calculates states of fasta/fastq file or nucleotide sequence.
#' Writes states to h5 or csv file (access content of h5 output with \code{\link{load_prediction}} function).
#' There are several options on how to process an input file:
#' \itemize{
#' \item If `"one_seq"`, computes prediction for sequence argument or fasta/fastq file.
#' Combines fasta entries in file to one sequence. This means predictor sequences can contain elements from more than one fasta entry.
#' \item If `"by_entry"`, will output a separate file for each fasta/fastq entry.
#' Names of output files are: `output_dir` + `filename` + "_nr_" + i + "." + `output_type`, where i is the number of the fasta entry.
#' \item If `"by_entry_one_file"`, will store prediction for all fasta entries in one h5 file.
#' \item If `"one_pred_per_entry"`, will make one prediction for each entry by either picking random sample for long sequences
#' or pad sequence for short sequences.
#' }
#' 
#' @inheritParams get_generator
#' @inheritParams train_model
#' @param layer_name Name of layer to get output from. If `NULL`, will use the last layer.
#' @param path_input Path to fasta file.
#' @param sequence Character string; if provided, `path_input` is ignored.
#' @param round_digits Number of decimal places.
#' @param mode Either `"lm"` for language model or `"label"` for label classification.
#' @param include_seq Whether to include input sequence in h5 file.
#' @param output_format Either `"one_seq"`, `"by_entry"`, `"by_entry_one_file"`, `"one_pred_per_entry"`.
#' @param output_type `"h5"` or `"csv"`. If `output_format` is `"by_entry_one_file"` or `"one_pred_per_entry"`, can only be `"h5"`.
#' @param return_states Whether to return predictions as a list. Only supported for `output_format = "one_seq"`.
#' @param padding Either `"none"`, `"maxlen"`, `"standard"` or `"self"`.
#' \itemize{
#' \item If `"none"`, apply no padding and skip sequences that are too short.
#' \item If `"maxlen"`, pad with `maxlen` zero vectors.
#' \item If `"standard"`, pad with zero vectors only if sequence is shorter than maxlen. Pads to minimum size required for one prediction.
#' \item If `"self"`, concatenate sequence with itself until sequence is long enough for one prediction.
#' Example: if sequence is "ACGT" and maxlen is 10, make prediction for "ACGTACGTAC". 
#' Only applied if sequence is shorter than maxlen.
#' }
#' @param verbose Boolean.
#' @param filename Filename to store states in. No file output if argument is `NULL`.
#' If `output_format = "by_entry"`, adds "_nr_" + i after name, where i is the entry number.
#' @param output_dir Directory for file output.
#' @param use_quality Whether to use quality scores.
#' @param quality_string String for encoding with quality scores (as used in fastq format).
#' @param lm_format Either `"target_right"`, `"target_middle_lstm"`, `"target_middle_cnn"` or `"wavenet"`.
#' @param ... Further arguments for sequence encoding with \code{\link{seq_encoding_label}}.
#' @examplesIf reticulate::py_module_available("tensorflow")
#' # make prediction for single sequence and write to h5 file
#' model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
#' vocabulary <- c("a", "c", "g", "t")
#' sequence <- paste(sample(vocabulary, 200, replace = TRUE), collapse = "")
#' output_file <- tempfile(fileext = ".h5")
#' predict_model(output_format = "one_seq", model = model, step = 10,
#'              sequence = sequence, filename = output_file, mode = "label")
#' 
#' # make prediction for fasta file with multiple entries, write output to separate h5 files
#' fasta_path <- tempfile(fileext = ".fasta")
#' create_dummy_data(file_path = fasta_path, num_files = 1,
#'                  num_seq = 5, seq_length = 100,
#'                  write_to_file_path = TRUE)
#' model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
#' output_dir <- tempfile()
#' dir.create(output_dir)
#' predict_model(output_format = "by_entry", model = model, step = 10, verbose = FALSE,
#'                output_dir = output_dir, mode = "label", path_input = fasta_path)
#' list.files(output_dir)
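#' 
#' # a minimal sketch reusing the model and fasta file from above:
#' # store predictions for all entries in a single h5 file and read them back
#' h5_entries <- tempfile(fileext = ".h5")
#' predict_model(output_format = "by_entry_one_file", model = model, step = 10,
#'               verbose = FALSE, filename = h5_entries, mode = "label",
#'               path_input = fasta_path)
#' pred_list <- load_prediction(h5_path = h5_entries)
#' names(pred_list)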
#' 
#' @returns If `return_states = TRUE` returns a list of model predictions and position of corresponding sequences.
#' If additionally `include_seq = TRUE`, list contains sequence strings.
#' If `return_states = FALSE` returns nothing, just writes output to file(s).  
#' @export
predict_model <- function(model, output_format = "one_seq", layer_name = NULL, sequence = NULL, path_input = NULL,
                          round_digits = NULL, filename = "states.h5", step = 1, vocabulary = c("a", "c", "g", "t"),
                          batch_size = 256, verbose = TRUE, return_states = FALSE, 
                          output_type = "h5", padding = "none", use_quality = FALSE, quality_string = NULL,
                          mode = "label", lm_format = "target_right", output_dir = NULL,
                          format = "fasta", include_seq = FALSE, reverse_complement_encoding = FALSE,
                          ambiguous_nuc = "zero", ...) {
  
  stopifnot(padding %in% c("standard", "self", "none", "maxlen"))
  stopifnot(output_format %in% c("one_seq", "by_entry", "by_entry_one_file", "one_pred_per_entry"))
  if (output_format %in% c("by_entry_one_file", "one_pred_per_entry") & output_type == "csv") {
    message("by_entry_one_file or one_pred_per_entry only implemented for h5 output.
            Setting output_type to h5")
    output_type <- "h5"
  }
  
  if (output_format == "one_seq") {
    output_list <- predict_model_one_seq(layer_name = layer_name, sequence = sequence, path_input = path_input,
                                         round_digits = round_digits, filename = filename, step = step, vocabulary = vocabulary,
                                         batch_size = batch_size, verbose = verbose, return_states = return_states, 
                                         padding = padding, quality_string = quality_string, use_quality = use_quality,
                                         output_type = output_type, model = model, mode = mode, lm_format = lm_format,
                                         format = format, include_seq = include_seq, ambiguous_nuc = ambiguous_nuc,
                                         reverse_complement_encoding = reverse_complement_encoding, ...)
    return(output_list)
  }
  
  if (output_format == "by_entry") {
    predict_model_by_entry(layer_name = layer_name, path_input = path_input,
                           round_digits = round_digits, filename = filename, step = step, 
                           # vocabulary = vocabulary, quality_string = quality_string, ambiguous_nuc = ambiguous_nuc,
                           batch_size = batch_size, verbose = verbose, use_quality = use_quality,
                           output_type = output_type, model = model, mode = mode, lm_format = lm_format,
                           output_dir = output_dir, format = format, include_seq = include_seq,
                           padding = padding, reverse_complement_encoding = reverse_complement_encoding, ...)
  }
  
  if (output_format == "by_entry_one_file") {
    predict_model_by_entry_one_file(layer_name = layer_name, path_input = path_input,
                                    round_digits = round_digits, filename = filename, step = step, 
                                    # vocabulary = vocabulary, quality_string = quality_string, ambiguous_nuc = ambiguous_nuc, 
                                    batch_size = batch_size, verbose = verbose, use_quality = use_quality,
                                    model = model, mode = mode, lm_format = lm_format, format = format,
                                    padding = padding, include_seq = include_seq, 
                                    reverse_complement_encoding = reverse_complement_encoding, ...)
  }
  
  if (output_format == "one_pred_per_entry") {
    if (mode == "lm") {
      stop("one_pred_per_entry only implemented for label classification")
    }
    predict_model_one_pred_per_entry(layer_name = layer_name, path_input = path_input,
                                     round_digits = round_digits, filename = filename, 
                                     batch_size = batch_size, verbose = verbose, model = model, format = format,
                                     # ambiguous_nuc = ambiguous_nuc, use_quality = use_quality, vocabulary = vocabulary,
                                     reverse_complement_encoding = reverse_complement_encoding, ...)
  }
  
}


#' Write output of specific model layer to h5 or csv file.
#'
#' Removes layers (optional) from pretrained model and calculates states of fasta file, writes states to h5/csv file.
#' Function combines fasta entries in file to one sequence. This means predictor sequences can contain elements from more than one fasta entry.
#' h5 file also contains sequence and positions of targets corresponding to states.
#'
#' @inheritParams generator_fasta_lm
#' @param layer_name Name of layer to get output from. If `NULL`, will use the last layer.
#' @param path_input Path to fasta file.
#' @param sequence Character string, ignores path_input if argument given.
#' @param round_digits Number of decimal places.
#' @param batch_size Number of samples to evaluate at once. Does not change output, only relevant for speed and memory.
#' @param step Frequency of sampling steps.
#' @param filename Filename to store states in. No file output if argument is `NULL`.
#' @param vocabulary Vector of allowed characters; characters outside the vocabulary get encoded as zero-vectors.
#' @param return_states Logical scalar, return states matrix.
#' @param ambiguous_nuc `"zero"` or `"equal"`. 
#' @param verbose Whether to print model before and after removing layers.
#' @param output_type Either `"h5"` or `"csv"`.
#' @param model A keras model. 
#' @param mode Either `"lm"` for language model or `"label"` for label classification.
#' @param format Either `"fasta"` or `"fastq"`.
#' @param include_seq Whether to include input sequence in h5 file.
#' @param ... Further arguments for sequence encoding with \code{\link{seq_encoding_label}}.
#' @noRd
predict_model_one_seq <- function(model, layer_name = NULL, sequence = NULL, path_input = NULL, round_digits = 2,
                                  filename = "states.h5", step = 1, vocabulary = c("a", "c", "g", "t"), batch_size = 256, verbose = TRUE,
                                  return_states = FALSE, target_len = 1, use_quality = FALSE, quality_string = NULL,
                                  output_type = "h5", mode = "lm", lm_format = "target_right",
                                  ambiguous_nuc = "zero", padding = "none", format = "fasta", output_dir = NULL,
                                  include_seq = TRUE, reverse_complement_encoding = FALSE, ...) {
  
  vocabulary <- stringr::str_to_lower(vocabulary)
  stopifnot(mode %in% c("lm", "label"))
  stopifnot(output_type %in% c("h5", "csv"))
  if (!is.null(quality_string)) use_quality <- TRUE
  # if (!is.null(quality_string) & !is.null(sequence)) {
  #   stopifnot(length(sequence) == length(quality_to_probability(quality_string)))
  # }
  file_output <- !is.null(filename)
  if (!file_output) {
    if (!return_states) stop("If filename is NULL, return_states must be TRUE; otherwise function produces no output.")
    filename <- tempfile(fileext = paste0(".", output_type))
  }
  stopifnot(batch_size > 0)
  stopifnot(!file.exists(filename))
  if (reverse_complement_encoding) {
    test_len <- length(vocabulary) != 4
    if (test_len || any(sort(stringr::str_to_lower(vocabulary)) != c("a", "c", "g", "t"))) {
      stop("reverse_complement_encoding only implemented for A,C,G,T vocabulary")
    }
  }
  
  # token for ambiguous nucleotides
  for (i in letters) {
    if (!(i %in% stringr::str_to_lower(vocabulary))) {
      amb_nuc_token <- i
      break
    }
  }
  
  tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), c(vocabulary, amb_nuc_token))
  
  if (is.null(layer_name)) {
    layer_name <- model$output_names
    if (verbose) message(paste("layer_name not specified. Using layer", layer_name))
  }
  
  if (!is.null(sequence) && (!missing(sequence) & sequence != "")) {
    nt_seq <- sequence %>% stringr::str_to_lower()
  } else {
    if (format == "fasta") {
      fasta.file <- microseq::readFasta(path_input)
    }
    if (format == "fastq") {
      fasta.file <- microseq::readFastq(path_input)
    }
    if (nrow(fasta.file) > 1 & verbose) {
      text_1 <- paste("Your file has", nrow(fasta.file), "entries. 'one_seq'  output_format will concatenate them to a single sequence.\n")
      text_2 <- "Use 'by_entry' or 'by_entry_one_file' output_format to evaluate them separately."
      message(paste0(text_1, text_2))
    }
    nt_seq <- paste(fasta.file$Sequence, collapse = "") %>% stringr::str_to_lower()
  }
  
  # tokenize ambiguous nt
  pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
  nt_seq <- stringr::str_replace_all(string = nt_seq, pattern = pattern, amb_nuc_token)
  
  if (use_quality & is.null(quality_string)) {
    quality_string <- paste(fasta.file$Quality, collapse = "")
  }
  
  # extract maxlen
  target_middle <- ifelse(mode == "lm" && (lm_format %in% c("target_middle_lstm", "target_middle_cnn")), TRUE, FALSE)
  if (!target_middle) {
    if (reverse_complement_encoding) {
      maxlen <- model$input[[1]]$shape[[2]]
    } else {
      maxlen <- model$input$shape[[2]]
    }
  } else {
    maxlen_1 <- model$input[[1]]$shape[[2]]
    maxlen_2 <- model$input[[2]]$shape[[2]]
    maxlen <- maxlen_1 + maxlen_2
  }
  
  total_seq_len <- ifelse(mode == "lm", maxlen + target_len, maxlen)
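  # e.g. for a language model with maxlen = 20 and target_len = 1,
  # each sample spans 21 nucleotides (20 predictors + 1 target)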
  
  # pad sequence
  unpadded_seq_len <- nchar(nt_seq)
  pad_len <- 0
  if (padding == "maxlen") {
    pad_len <- maxlen
  }
  if (padding == "standard" & (unpadded_seq_len < total_seq_len)) {
    pad_len <- total_seq_len - unpadded_seq_len
  }
  if (padding == "self"  & (unpadded_seq_len < total_seq_len)) {
    nt_seq <- strrep(nt_seq, ceiling(total_seq_len / unpadded_seq_len))
    nt_seq <- substr(nt_seq, 1, total_seq_len)
    if (use_quality) {
      quality_string <- strrep(quality_string, ceiling(total_seq_len / unpadded_seq_len))
      quality_string <- substr(quality_string, 1, total_seq_len) 
    }
  } else {
    nt_seq <- paste0(strrep("0", pad_len), nt_seq)
    if (use_quality) quality_string <- paste0(strrep("0", pad_len), quality_string)
  }
  
  if (nchar(nt_seq) < total_seq_len) {
    stop(paste0("Input sequence is shorter than required length (", total_seq_len, "). Use padding argument to pad sequence to bigger size."))
  }
  
  if (use_quality) {
    quality_vector <- quality_string %>% quality_to_probability()
  } else {
    quality_vector <- NULL
  }
  
  # start of samples
  start_indices <- seq(1, nchar(nt_seq) - total_seq_len + 1, by = step)
  num_samples <- length(start_indices)
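  # e.g. total_seq_len = 10, nchar(nt_seq) = 25, step = 5 gives
  # start_indices 1, 6, 11, 16; the last sample covers positions 16 to 25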
  
  check_layer_name(model, layer_name)
  model <- tensorflow::tf$keras$Model(model$input, model$get_layer(layer_name)$output)
  if (verbose) {
    cat("Computing output for model at layer", layer_name,  "\n")
    print(model)
  }
  
  # extract number of neurons in last layer
  if (length(model$output$shape$dims) == 3) {
    if (!("lstm" %in% stringr::str_to_lower(model$output_names))) {
      stop("Output dimension of layer is > 1, format not supported yet")
    }
    layer.size <- model$output$shape[[3]]
  } else {
    layer.size <- model$output$shape[[2]]
  }
  
  # tokenize sequence
  nt_seq <- stringr::str_to_lower(nt_seq)
  tokSeq <- keras::texts_to_sequences(tokenizer, nt_seq)[[1]] - 1
  
  # seq end position
  pos_arg <- start_indices + total_seq_len - pad_len - 1
  
  if (include_seq) {
    output_seq <- substr(nt_seq, pad_len + 1, nchar(nt_seq))
  }
  
  # create h5 file to store states
  if (output_type == "h5") {
    h5_file <- hdf5r::H5File$new(filename, mode = "w")
    h5_file[["multi_entries"]] <- FALSE
    h5_file[["sample_end_position"]] <- pos_arg
    if (include_seq) h5_file[["sequence"]] <- output_seq
  }
  
  number_batches <- ceiling(length(start_indices)/batch_size)
  pred_list <- vector("list", number_batches)
  
  # subset input for target middle
  if (mode == "lm" && lm_format %in% c("target_middle_lstm", "target_middle_cnn")) {
    index_x_1 <- 1:ceiling((total_seq_len - target_len)/2)
    index_x_2 <- (max(index_x_1) + target_len + 1) : total_seq_len
  } 
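  # e.g. total_seq_len = 21, target_len = 1 gives index_x_1 = 1:10 and
  # index_x_2 = 12:21; position 11 (the middle) is the prediction target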
  
  for (i in 1:number_batches) {
    
    index_start <- ((i - 1) * batch_size) + 1
    index_end <- min(c(num_samples + 1, index_start + batch_size)) - 1
    index <- index_start : index_end 
    
    x <- seq_encoding_label(sequence = tokSeq, 
                            maxlen = total_seq_len,
                            vocabulary = vocabulary,
                            start_ind = start_indices[index],
                            ambiguous_nuc = ambiguous_nuc,
                            tokenizer = NULL,
                            adjust_start_ind = FALSE,
                            quality_vector = quality_vector,
                            ...
    )
    
    if (mode == "lm" && lm_format == "target_middle_lstm") {
      x1 <- x[ , index_x_1, ]
      x2 <- x[ , index_x_2, ]
      
      if (length(index_x_1) == 1 | dim(x)[1] == 1) {
        x1 <- array(x1, dim = c(1, dim(x1)))
      }
      if (length(index_x_2) == 1 | dim(x)[1] == 1) {
        x2 <- array(x2, dim = c(1, dim(x2)))
      }
      
      x2 <- x2[ , dim(x2)[2]:1, ] # reverse order
      
      if (length(dim(x2)) == 2) {
        x2 <- array(x2, dim = c(1, dim(x2)))
      }
      
      x <- list(x1, x2)
    } 
    
    if (mode == "lm" && lm_format == "target_middle_cnn") {
      x <- x[ , c(index_x_1, index_x_2), ]
    } 
    
    if (reverse_complement_encoding) x <- list(x, reverse_complement_tensor(x))
    
    y <- stats::predict(model, x, verbose = 0)
    if (!is.null(round_digits)) y <- round(y, round_digits)
    pred_list[[i]] <- y
    
  }
  
  states <- do.call(rbind, pred_list)
  
  if (file_output) {
    if (output_type == "h5") {
      h5_file[["states"]] <- states
      h5_file$close_all()
    } else {
      col_names <- paste0("N", 1:ncol(states))
      colnames(states) <- col_names 
      utils::write.csv(x = states, file = filename, row.names = FALSE)
    }
  }
  
  if (return_states) {
    output_list <- list()
    output_list$states <- states
    output_list$sample_end_position <- pos_arg
    if (include_seq) output_list$sequence <- output_seq
    return(output_list)
  } else {
    return(NULL)
  }
}

#' Write states to h5 file
#'
#' @description Removes layers (optional) from pretrained model and calculates states of fasta file, writes a separate
#' h5 file for every fasta entry in fasta file. h5 files also contain the nucleotide sequence and positions of targets corresponding to states.
#' Names of output files are: `output_dir` + `filename` + "_nr_" + i + "." + `output_type`, where i is the number of the fasta entry.
#'
#' @param filename Filename to store states; function adds "_nr_" + i after name, where i is the entry number.
#' @param output_dir Path to folder, where to write output.
#' @noRd
predict_model_by_entry <- function(model, layer_name = NULL, path_input, round_digits = 2,
                                   filename = "states.h5", output_dir = NULL, step = 1, vocabulary = c("a", "c", "g", "t"),
                                   batch_size = 256, output_type = "h5", mode = "lm",
                                   lm_format = "target_right", format = "fasta", use_quality = FALSE,
                                   reverse_complement_encoding = FALSE, padding = "none", 
                                   verbose = FALSE, include_seq = FALSE, ambiguous_nuc = "zero", ...) {
  
  stopifnot(mode %in% c("lm", "label"))
  stopifnot(!is.null(filename))
  stopifnot(!is.null(output_dir))
  
  if (endsWith(filename, paste0(".", output_type))) {
    filename <- stringr::str_remove(filename, paste0(".", output_type, "$"))
    filename <- basename(filename)
  }
  
  # extract maxlen
  target_middle <- ifelse(mode == "lm" && (lm_format %in% c("target_middle_lstm", "target_middle_cnn")), TRUE, FALSE)
  if (!target_middle) {
    if (reverse_complement_encoding) {
      maxlen <- model$input[[1]]$shape[[2]]
    } else {
      maxlen <- model$input$shape[[2]]
    }
  } else {
    maxlen_1 <- model$input[[1]]$shape[[2]]
    maxlen_2 <- model$input[[2]]$shape[[2]]
    maxlen <- maxlen_1 + maxlen_2
  }
  
  # load fasta file
  if (format == "fasta") {
    fasta.file <- microseq::readFasta(path_input)
  }
  if (format == "fastq") {
    fasta.file <- microseq::readFastq(path_input)
  }
  df <- fasta.file[ , c("Sequence", "Header")]
  names(df) <- c("seq", "header")
  rownames(df) <- NULL
  num_skipped_seq <- 0
  
  for (i in 1:nrow(df)) {
    
    # skip entry if too short
    if ((nchar(df[i, "seq"]) < maxlen) & padding == "none") {
      num_skipped_seq <- num_skipped_seq + 1
      next
    } 
    
    if (use_quality) {
      quality_string <- fasta.file$Quality[i]
    } else {
      quality_string <- NULL
    }
    
    current_file <- paste0(output_dir, "/", filename, "_nr_", as.character(i), ".", output_type)
    
    predict_model_one_seq(layer_name = layer_name, sequence = df[i, "seq"],
                          round_digits = round_digits, path_input = path_input,
                          filename = current_file, quality_string = quality_string,
                          step = step, vocabulary = vocabulary, batch_size = batch_size,
                          verbose = ifelse(i > 1, FALSE, verbose), 
                          output_type = output_type, mode = mode,
                          lm_format = lm_format, model = model, include_seq = include_seq,
                          padding = padding, ambiguous_nuc = ambiguous_nuc, 
                          reverse_complement_encoding = reverse_complement_encoding, ...)
  }
  
  if (verbose & num_skipped_seq > 0) {
    message(paste0("Skipped ", num_skipped_seq,
                   ifelse(num_skipped_seq == 1, " entry", " entries"),
                   ". Use different padding option to evaluate all."))
  }  
  
}

#' Write states to h5 file
#'
#' @description Removes layers (optional) from pretrained model and calculates states of fasta file,
#' writes separate states matrix in one .h5 file for every fasta entry.
#' h5 file also contains the nucleotide sequences and positions of targets corresponding to states.
#' @noRd
predict_model_by_entry_one_file <- function(model, path_input, round_digits = 2, filename = "states.h5",
                                            step = 1,  vocabulary = c("a", "c", "g", "t"), batch_size = 256, layer_name = NULL,
                                            verbose = TRUE, mode = "lm", use_quality = FALSE,
                                            lm_format = "target_right", padding = "none",
                                            format = "fasta", include_seq = TRUE, reverse_complement_encoding = FALSE, ...) {
  
  vocabulary <- stringr::str_to_lower(vocabulary)
  stopifnot(mode %in% c("lm", "label"))
  
  target_middle <- ifelse(mode == "lm" && (lm_format %in% c("target_middle_lstm", "target_middle_cnn")), TRUE, FALSE)
  # extract maxlen
  if (!target_middle) {
    if (reverse_complement_encoding) {
      maxlen <- model$input[[1]]$shape[[2]]
    } else {
      maxlen <- model$input$shape[[2]]
    }
  } else {
    maxlen_1 <- model$input[[1]]$shape[[2]]
    maxlen_2 <- model$input[[2]]$shape[[2]]
    maxlen <- maxlen_1 + maxlen_2
  }
  
  # extract number of neurons in last layer
  if (length(model$output$shape$dims) == 3) {
    if (!("lstm" %in% stringr::str_to_lower(model$output_names))) {
      stop("Output dimension of layer is > 1, format not supported yet")
    }
    layer.size <- model$output$shape[[3]]
  } else {
    layer.size <- model$output$shape[[2]]
  }
  
  # load fasta file
  if (format == "fasta") {
    fasta.file <- microseq::readFasta(path_input)
  }
  if (format == "fastq") {
    fasta.file <- microseq::readFastq(path_input)
  }
  df <- fasta.file[ , c("Sequence", "Header")]
  names(df) <- c("seq", "header")
  rownames(df) <- NULL
  
  # check if header names are unique
  if (length(df$header) != length(unique(df$header))) {
    if (verbose) message("Header names are not unique, adding '_header_x' to names (x being the header number)")
    df$header <- paste0(df$header, "_header_", seq_along(df$header))
  }
  
  # create h5 file to store states
  
  h5_file <- hdf5r::H5File$new(filename, mode = "w")
  h5_file[["multi_entries"]] <- TRUE
  states.grp <- h5_file$create_group("states")
  sample_end_position.grp <- h5_file$create_group("sample_end_position")
  if (include_seq) seq.grp <- h5_file$create_group("sequence")
  
  num_skipped_seq <- 0
  
  for (i in 1:nrow(df)) {
    
    #seq_name <- df$header[i]
    seq_name <- paste0("entry_", i)
    temp_file <- tempfile(fileext = ".h5")
    
    # skip entry if too short
    if ((nchar(df[i, "seq"]) < maxlen) & padding == "none") {
      num_skipped_seq <- num_skipped_seq + 1
      next
    } 
    
    if (use_quality) {
      quality_string <- fasta.file$Quality[i]
    } else {
      quality_string <- NULL
    }
    
    output_list <- predict_model_one_seq(layer_name = layer_name, sequence = df$seq[i], path_input = path_input,
                                         round_digits = round_digits, filename = temp_file, step = step, vocabulary = vocabulary,
                                         batch_size = batch_size, return_states = TRUE, quality_string = quality_string,
                                         output_type = "h5", model = model, mode = mode, lm_format = lm_format,
                                         ambiguous_nuc = "zero", verbose = ifelse(i > 1, FALSE, verbose), 
                                         padding = padding, format = format, include_seq = include_seq,
                                         reverse_complement_encoding = reverse_complement_encoding, ...)
    
    states.grp[[seq_name]] <- output_list$states
    sample_end_position.grp[[seq_name]] <- output_list$sample_end_position
    
    if (include_seq) seq.grp[[seq_name]] <- output_list$sequence
  }
  
  if (verbose & num_skipped_seq > 0) {
    message(paste0("Skipped ", num_skipped_seq,
                   ifelse(num_skipped_seq == 1, " entry", " entries"),
                   ". Use different padding option to evaluate all."))
  }  
  
  h5_file$close_all()
}

#' Get states for label classification model.
#'
#' Computes output at specified model layer. Forces every fasta entry to have length maxlen, by either padding sequences
#' shorter than maxlen or taking a random subsequence for longer sequences.
#'
#' @inheritParams predict_model_one_seq
#' @noRd
predict_model_one_pred_per_entry <- function(model, layer_name = NULL, path_input, round_digits = 2, format = "fasta",
                                             ambiguous_nuc = "zero", filename = "states.h5", padding = padding,
                                             vocabulary = c("a", "c", "g", "t"), batch_size = 256, verbose = TRUE,
                                             return_states = FALSE, reverse_complement_encoding = FALSE, 
                                             include_seq = FALSE, use_quality = FALSE, ...) {
  
  vocabulary <- stringr::str_to_lower(vocabulary)
  file_type <- "h5"
  stopifnot(batch_size > 0)
  stopifnot(!file.exists(filename))
  # token for ambiguous nucleotides
  for (i in letters) {
    if (!(i %in% stringr::str_to_lower(vocabulary))) {
      amb_nuc_token <- i
      break
    }
  }
  tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "N"), vocabulary)
  
  if (is.null(layer_name)) {
    layer_name <- model$output_names
    if (verbose) message(paste("layer_name not specified. Using layer", layer_name))
  }

  # extract maxlen
  if (reverse_complement_encoding) {
    maxlen <- model$input[[1]]$shape[[2]]
  } else {
    maxlen <- model$input$shape[[2]]
  }
  
  if (format == "fasta") {
    fasta.file <- microseq::readFasta(path_input)
  }
  if (format == "fastq") {
    fasta.file <- microseq::readFastq(path_input)
  }
  
  num_samples <- nrow(fasta.file)
  
  nucSeq <- as.character(fasta.file$Sequence)
  seq_length <- nchar(fasta.file$Sequence)
  if (use_quality) {
    quality_string <- vector("character", length(nucSeq)) 
  } else {
    quality_string <- NULL
  }
  
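  # force every entry to length maxlen: e.g. with maxlen = 20, an entry of
  # length 50 yields one random 20-nt window, while an entry of length 8 is
  # left-padded with 12 "N" characters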
  for (i in 1:length(nucSeq)) {
    # take random subsequence
    if (seq_length[i] > maxlen) {
      start <- sample(1 : (seq_length[i] - maxlen + 1) , size = 1)
      nucSeq[i] <- substr(nucSeq[i], start = start, stop = start + maxlen - 1)
      if (use_quality) {
        quality_string[i] <- substr(fasta.file$Quality[i], start = start, stop = start + maxlen - 1)
      }  
    }
    # pad sequence
    if (seq_length[i] < maxlen) {
      nucSeq[i] <- paste0(paste(rep("N", maxlen - seq_length[i]), collapse = ""), nucSeq[i])
      if (use_quality) {
        quality_string[i] <- paste0(paste(rep("0", maxlen - seq_length[i]), collapse = ""), fasta.file$Quality[i])
      }  
    }
  }
  
  model <- tensorflow::tf$keras$Model(model$input, model$get_layer(layer_name)$output)
  if (verbose) {
    cat("Computing output for model at layer", layer_name,  "\n")
    print(model)
  } 
  
  # extract number of neurons in last layer
  if (length(model$output$shape$dims) == 3) {
    if (!("lstm" %in% stringr::str_to_lower(model$output_names))) {
      stop("Output dimension of layer is > 1, format not supported yet")
    }
    layer.size <- model$output$shape[[3]]
  } else {
    layer.size <- model$output$shape[[2]]
  }
  
  if (file_type == "h5") {
    # create h5 file to store states
    h5_file <- hdf5r::H5File$new(filename, mode = "w")
    if (!missing(path_input)) h5_file[["fasta_file"]] <- path_input
    
    h5_file[["header_names"]] <- fasta.file$Header
    if (include_seq) h5_file[["sequences"]] <- nucSeq
    h5_file[["states"]] <- array(0, dim = c(0, layer.size))
    h5_file[["multi_entries"]] <- FALSE
    writer <- h5_file[["states"]]
  }
  
  rm(fasta.file)
  #nucSeq <- paste(nucSeq, collapse = "") %>% stringr::str_to_lower()
  number_batches <- ceiling(num_samples/batch_size)
  if (verbose) cat("Evaluating", number_batches, ifelse(number_batches > 1, "batches", "batch"), "\n")
  row <- 1
  string_start_index <- 1 
  ten_percent_steps <- seq(number_batches/10, number_batches, length.out = 10)
  percentage_index <- 1
  
  if (number_batches > 1) {
    for (i in 1:(number_batches - 1)) {
      string_end_index <- string_start_index + batch_size - 1 
      char_seq <- nucSeq[string_start_index : string_end_index] %>% paste(collapse = "") 
      
      if (use_quality) {
        quality_string_subset <- quality_string[string_start_index : string_end_index] %>% paste(collapse = "") 
      } else {
        quality_string_subset <- NULL
      }
      
      if (i == 1) start_ind <- seq(1, nchar(char_seq), maxlen)
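      # each entry was forced to length maxlen above, so non-overlapping
      # windows of size maxlen yield exactly one sample per entry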
      one_hot_batch <- seq_encoding_label(sequence = NULL, maxlen = maxlen, vocabulary = vocabulary,
                                          start_ind = start_ind, ambiguous_nuc = ambiguous_nuc, 
                                          char_sequence = char_seq, quality_vector = quality_string_subset,
                                          tokenizer = tokenizer, adjust_start_ind = TRUE, ...) 
      if (reverse_complement_encoding) one_hot_batch <- list(one_hot_batch, reverse_complement_tensor(one_hot_batch))
      activations <- keras::predict_on_batch(model, one_hot_batch)
      writer[row : (row + batch_size - 1), ] <- activations
      row <- row + batch_size
      string_start_index <- string_end_index + 1
      
      if (verbose & (i > ten_percent_steps[percentage_index]) & percentage_index < 10) {
        cat("Progress: ", percentage_index * 10 ,"% \n")
        percentage_index <- percentage_index + 1
      }
      
    }
  }
  
  # last batch might be shorter
  char_seq <- nucSeq[string_start_index : length(nucSeq)] %>% paste(collapse = "") 
  if (use_quality) {
    quality_string_subset <- quality_string[string_start_index : length(nucSeq)] %>% paste(collapse = "") 
  } else {
    quality_string_subset <- NULL
  }
  one_hot_batch <- seq_encoding_label(sequence = NULL, maxlen = maxlen, vocabulary = vocabulary,
                                      start_ind = seq(1, nchar(char_seq), maxlen), ambiguous_nuc = ambiguous_nuc,
                                      char_sequence = char_seq, quality_vector = quality_string_subset,
                                      tokenizer = tokenizer, adjust_start_ind = TRUE, ...) 
  if (reverse_complement_encoding) one_hot_batch <- list(one_hot_batch, reverse_complement_tensor(one_hot_batch))
  activations <- keras::predict_on_batch(model, one_hot_batch)
  writer[row : num_samples, ] <- activations[1 : length(row:num_samples), ]
  
  if (verbose) cat("Progress: 100 % \n")
  
  if (return_states & (file_type == "h5")) states <- writer[ , ]
  if (file_type == "h5") h5_file$close_all()
  if (return_states) return(states)
}


#' Read states from h5 file
#'
#' Reads an h5 file created by the \code{\link{predict_model}} function.
#'
#' @param h5_path Path to h5 file.
#' @param rows Range of rows to read. If `NULL` read everything.
#' @param get_sample_position Return position of sample corresponding to state if `TRUE`.
#' @param get_seq Return nucleotide sequence if `TRUE`.
#' @param verbose Boolean.
#' @examplesIf reticulate::py_module_available("tensorflow")
#' # make prediction for single sequence and write to h5 file
#' model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
#' vocabulary <- c("a", "c", "g", "t")
#' sequence <- paste(sample(vocabulary, 200, replace = TRUE), collapse = "")
#' output_file <- tempfile(fileext = ".h5")
#' predict_model(output_format = "one_seq", model = model, step = 10,
#'               sequence = sequence, filename = output_file, mode = "label")
#' load_prediction(h5_path = output_file)
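#' 
#' # read only the first two samples and their sequence end positions
#' load_prediction(h5_path = output_file, rows = 1:2, get_sample_position = TRUE)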
#' 
#' @returns A list of data frames, containing model predictions and sequence positions.
#' @export
load_prediction <- function(h5_path, rows = NULL, verbose = FALSE,
                            get_sample_position = FALSE, get_seq = FALSE) {
  
  complete <- is.null(rows)
  h5_file <- hdf5r::H5File$new(h5_path, mode = "r")
  
  multi_entries <- ifelse(h5_file[["multi_entries"]][], TRUE, FALSE)
  if (!multi_entries) {
    number_entries <- 1
  } else {
    entry_names <- names(h5_file[["states"]])
    number_entries <- length(entry_names)
    output_list <- list()
  }
  
  train_mode <- "label"
  
  if (get_sample_position & !any(c("sample_end_position", "target_position") %in% names(h5_file))) {
    get_sample_position <- FALSE
    message("File does not contain target positions.")
  }
  
  if (!multi_entries) {
    
    read_states <- h5_file[["states"]]
    
    if (get_sample_position) {
      read_targetPos <- h5_file[["sample_end_position"]]
    }
    
    if (verbose) {
      cat("states matrix has", dim(read_states[ , ])[1], "rows and",  dim(read_states[ , ])[2], "columns \n")
    }
    if (complete) {
      states <- read_states[ , ]
      if (get_sample_position) {
        targetPos <- read_targetPos[ ]
      }
    } else {
      states <- read_states[rows, ]
      if (get_sample_position) {
        targetPos <- read_targetPos[rows]
      }
    }
    
    if (is.null(dim(states))) {
      states <- matrix(states, nrow = 1)
    }
    
    contains_seq <- FALSE
    if (get_seq) {
      if ("sequence" %in% names(h5_file)) {
        contains_seq <- TRUE
        sequence <- h5_file[["sequence"]][]
      } else {
        contains_seq <- FALSE
        message("File does not contain sequence.")
      }
    }
    
    h5_file$close_all()
    output_list <- list(states = states)
    if (get_sample_position) {
      if (train_mode == "label") {
        output_list$sample_end_position <- targetPos
      } else {
        output_list$target_position <- targetPos
      }
    }
    
    if (get_seq && contains_seq) {
      output_list$sequence <- sequence
    }
    
    return(output_list)
    
    # multi entries
  } else {
    
    if (verbose) {
      cat("file contains", number_entries, "entries \n")
    }
    
    if (get_sample_position) {
      target_name <- "sample_end_position"
    }
    
    if (get_seq & !("sequence" %in% names(h5_file))) {
      message("File does not contain sequence.")
      get_seq <- FALSE
    }
    
    for (i in 1:number_entries) {
      
      entry_name <- entry_names[i]
      states <- h5_file[["states"]][[entry_name]][ , ]
      if (is.null(dim(states))) {
        states <- matrix(states, nrow = 1)
      }
      
      if (get_seq) {
        sequence <- h5_file[["sequence"]][[entry_name]][ ]
      }
      
      if (get_sample_position) {
        targetPos <- h5_file[[target_name]][[entry_name]][ ]
      }
      
      if (!complete) {
        states <- states[rows, ]
        
        if (is.null(dim(states))) {
          states <- matrix(states, nrow = 1)
        }
        
        if (get_sample_position) {
          targetPos <- targetPos[rows]
        }
      }
      
      if (get_sample_position) {
        l <- list(states = states, sample_end_position = targetPos)
      } else {
        l <- list(states = states)
      }
      
      if (get_seq) {
        l[["sequence"]] <- sequence
      }
      output_list[[entry_name]] <- l
    }
    h5_file$close_all()
    names(output_list) <- entry_names
    return(output_list)
  }
}

#' Create summary of predictions 
#'
#' Create summary data frame for confidence predictions over one or several state files or a data frame.
#' Columns in the file or data frame should be confidence predictions for one class each,
#' i.e. each row should sum to 1 and have nonnegative entries.
#' Output data frame contains average confidence scores, max scores and percentage of votes for each class.
#'
#' @param states_path Folder containing state files or a single file with same ending as `file_type`.
#' @param label_names Names of predicted classes.
#' @param file_type `"h5"` or `"csv"`.
#' @param df A states data frame. Ignores `states_path` argument if not `NULL`.
#' @examples 
#' m <- c(0.9,  0.1, 0.2, 0.01,
#'        0.05, 0.7, 0.2, 0,
#'        0.05, 0.2, 0.6, 0.99) %>% matrix(ncol = 3)
#' 
#' label_names <- paste0("class_", 1:3)
#' df <- as.data.frame(m)
#' pred_summary <- summarize_states(label_names = label_names, df = df)
#' pred_summary
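#' 
#' # vote percentages come from the row-wise argmax:
#' apply(df, 1, which.max) # votes for class 1, 2, 3, 3 -> 25%, 25%, 50%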
#' 
#' @returns A data frame of predictions summaries.
#' @export
summarize_states <- function(states_path = NULL, label_names = NULL, file_type = "h5", df = NULL) {
  
  if (!is.null(df)) {
    states_path <- NULL
  }
  
  if (is.null(states_path)) {
    state_files <- 1
  } else {
    if (endsWith(states_path, file_type)) {
      state_files <- states_path
    } else {
      state_files <- list.files(states_path, full.names = TRUE)
    }
  }
  
  if (!is.null(label_names)) {
    num_labels <- length(label_names)
  }
  
  summary_list <- list()
  
  for (state_file in state_files) {
    
    if (is.null(df)) {
      if (file_type == "h5") {
        df <- load_prediction(h5_path = state_file, get_sample_position = FALSE, verbose = FALSE)
        df <- as.data.frame(df$states)
      }
      if (file_type == "csv") {
        df <- utils::read.csv(state_file)
        if (ncol(df) != num_labels) {
          df <- utils::read.csv2(state_file)
        }
      }
    }
    
    if (state_file == state_files[1] & is.null(label_names)) {
      label_names <- paste0("X_", 1:ncol(df))
      num_labels <- length(label_names)
    }
    
    stopifnot(ncol(df) == num_labels)
    
    names(df) <- c(label_names)
    
    mean_df <- data.frame(matrix(0, nrow = 1, ncol = num_labels))
    names(mean_df) <- paste0("mean_conf_", label_names)
    max_df <- data.frame(matrix(0, nrow = 1, ncol = num_labels))
    names(max_df) <- paste0("max_conf_", label_names)
    
    for (label in label_names) {
      mean_df[[paste0("mean_conf_", label)]] <-  mean(df[[label]])
      max_df[[paste0("max_conf_", label)]] <-  max(df[[label]])
    }
    
    vote_distribution <- apply(df[label_names], 1, which.max)
    vote_perc <- table(factor(vote_distribution, levels = 1:length(label_names)))/length(vote_distribution)
    votes_df <- data.frame(matrix(vote_perc, nrow = 1, ncol = num_labels))
    names(votes_df) <- paste0("vote_perc_", label_names)
    
    mean_prediction <- label_names[which.max(unlist(mean_df))]
    max_prediction <- label_names[which.max(unlist(max_df))]
    vote_prediction <- label_names[which.max(vote_perc)]
    
    if (is.null(states_path)) {
      file_name <- NA
    } else {
      file_name <- basename(state_file)
    }
    
    summary_list[[state_file]] <- data.frame(file_name, mean_df, max_df, votes_df,
                                             mean_prediction, max_prediction, vote_prediction,
                                             num_prediction = nrow(df))
    
    df <- NULL # reset so the next state file gets loaded
  }
  
  summary_df <- data.table::rbindlist(summary_list)
  return(summary_df)
}