#' Make prediction for nucleotide sequence or entries in fasta/fastq file
#'
#' @description Optionally removes layers from a pretrained model and calculates the states of a fasta/fastq file or of a nucleotide sequence.
#' Writes states to h5 or csv file (access content of h5 output with \code{\link{load_prediction}} function).
#' There are several options for how to process an input file, controlled by the `output_format` argument:
#' \itemize{
#' \item If `"one_seq"`, computes prediction for sequence argument or fasta/fastq file.
#' Combines all fasta entries in the file into one sequence. This means predictor sequences can contain elements from more than one fasta entry.
#' \item If `"by_entry"`, will output a separate file for each fasta/fastq entry.
#' Names of output files are: `output_dir` + `filename` + "_nr_" + i + "." + `output_type`, where i is the number of the fasta entry.
#' \item If `"by_entry_one_file"`, will store prediction for all fasta entries in one h5 file.
#' \item If `"one_pred_per_entry"`, will make one prediction for each entry by either picking random sample for long sequences
#' or pad sequence for short sequences.
#' }
#'
#' @inheritParams get_generator
#' @inheritParams train_model
#' @param layer_name Name of layer to get output from. If `NULL`, will use the last layer.
#' @param path_input Path to fasta file.
#' @param sequence Character string; if provided, the `path_input` argument is ignored.
#' @param round_digits Number of decimal places.
#' @param mode Either `"lm"` for language model or `"label"` for label classification.
#' @param include_seq Whether to include input sequence in h5 file.
#' @param output_format Either `"one_seq"`, `"by_entry"`, `"by_entry_one_file"`, `"one_pred_per_entry"`.
#' @param output_type `"h5"` or `"csv"`. If `output_format` is `"by_entry_one_file"` or `"one_pred_per_entry"`, only `"h5"` is supported.
#' @param return_states Whether to return predictions as a list. Only supported for `output_format = "one_seq"`.
#' @param padding Either `"none"`, `"maxlen"`, `"standard"` or `"self"`.
#' \itemize{
#' \item If `"none"`, apply no padding and skip sequences that are too short.
#' \item If `"maxlen"`, pad with maxlen number of zeros vectors.
#' \item If `"standard"`, pad with zero vectors only if sequence is shorter than maxlen. Pads to minimum size required for one prediction.
#' \item If `"self"`, concatenate sequence with itself until sequence is long enough for one prediction.
#' Example: if sequence is "ACGT" and maxlen is 10, make prediction for "ACGTACGTAC".
#' Only applied if sequence is shorter than maxlen.
#' }
#' @param verbose Boolean.
#' @param filename Filename to store states in. No file output if argument is `NULL`.
#' If `output_format = "by_entry"`, appends "_nr_" + i to the name, where i is the entry number.
#' @param output_dir Directory for file output.
#' @param use_quality Whether to use quality scores.
#' @param quality_string String for encoding with quality scores (as used in fastq format).
#' @param lm_format Either `"target_right"`, `"target_middle_lstm"`, `"target_middle_cnn"` or `"wavenet"`.
#' @param ... Further arguments for sequence encoding with \code{\link{seq_encoding_label}}.
#' @examplesIf reticulate::py_module_available("tensorflow")
#' # make prediction for single sequence and write to h5 file
#' model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
#' vocabulary <- c("a", "c", "g", "t")
#' sequence <- paste(sample(vocabulary, 200, replace = TRUE), collapse = "")
#' output_file <- tempfile(fileext = ".h5")
#' predict_model(output_format = "one_seq", model = model, step = 10,
#' sequence = sequence, filename = output_file, mode = "label")
#'
#' # make prediction for fasta file with multiple entries, write output to separate h5 files
#' fasta_path <- tempfile(fileext = ".fasta")
#' create_dummy_data(file_path = fasta_path, num_files = 1,
#' num_seq = 5, seq_length = 100,
#' write_to_file_path = TRUE)
#' model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
#' output_dir <- tempfile()
#' dir.create(output_dir)
#' predict_model(output_format = "by_entry", model = model, step = 10, verbose = FALSE,
#' output_dir = output_dir, mode = "label", path_input = fasta_path)
#' list.files(output_dir)
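#'
#' # return predictions as a list instead of writing them to a file
#' pred_list <- predict_model(output_format = "one_seq", model = model, step = 10,
#' sequence = sequence, filename = NULL, return_states = TRUE, mode = "label")
#' dim(pred_list$states)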
#'
#' @returns If `return_states = TRUE`, returns a list of model predictions and the positions of the corresponding input sequences.
#' If additionally `include_seq = TRUE`, list contains sequence strings.
#' If `return_states = FALSE` returns nothing, just writes output to file(s).
#' @export
predict_model <- function(model, output_format = "one_seq", layer_name = NULL, sequence = NULL, path_input = NULL,
round_digits = NULL, filename = "states.h5", step = 1, vocabulary = c("a", "c", "g", "t"),
batch_size = 256, verbose = TRUE, return_states = FALSE,
output_type = "h5", padding = "none", use_quality = FALSE, quality_string = NULL,
mode = "label", lm_format = "target_right", output_dir = NULL,
format = "fasta", include_seq = FALSE, reverse_complement_encoding = FALSE,
ambiguous_nuc = "zero", ...) {
stopifnot(padding %in% c("standard", "self", "none", "maxlen"))
stopifnot(output_format %in% c("one_seq", "by_entry", "by_entry_one_file", "one_pred_per_entry"))
if (output_format %in% c("by_entry_one_file", "one_pred_per_entry") && output_type == "csv") {
message("Output formats 'by_entry_one_file' and 'one_pred_per_entry' are only implemented for h5. Setting output_type to 'h5'.")
output_type <- "h5"
}
if (output_format == "one_seq") {
output_list <- predict_model_one_seq(layer_name = layer_name, sequence = sequence, path_input = path_input,
round_digits = round_digits, filename = filename, step = step, vocabulary = vocabulary,
batch_size = batch_size, verbose = verbose, return_states = return_states,
padding = padding, quality_string = quality_string, use_quality = use_quality,
output_type = output_type, model = model, mode = mode, lm_format = lm_format,
format = format, include_seq = include_seq, ambiguous_nuc = ambiguous_nuc,
reverse_complement_encoding = reverse_complement_encoding, ...)
return(output_list)
}
if (output_format == "by_entry") {
predict_model_by_entry(layer_name = layer_name, path_input = path_input,
round_digits = round_digits, filename = filename, step = step,
# vocabulary = vocabulary, quality_string = quality_string, ambiguous_nuc = ambiguous_nuc,
batch_size = batch_size, verbose = verbose, use_quality = use_quality,
output_type = output_type, model = model, mode = mode, lm_format = lm_format,
output_dir = output_dir, format = format, include_seq = include_seq,
padding = padding, reverse_complement_encoding = reverse_complement_encoding, ...)
}
if (output_format == "by_entry_one_file") {
predict_model_by_entry_one_file(layer_name = layer_name, path_input = path_input,
round_digits = round_digits, filename = filename, step = step,
# vocabulary = vocabulary, quality_string = quality_string, ambiguous_nuc = ambiguous_nuc,
batch_size = batch_size, verbose = verbose, use_quality = use_quality,
model = model, mode = mode, lm_format = lm_format, format = format,
padding = padding, include_seq = include_seq,
reverse_complement_encoding = reverse_complement_encoding, ...)
}
if (output_format == "one_pred_per_entry") {
if (mode == "lm") {
stop("one_pred_per_entry only implemented for label classification")
}
predict_model_one_pred_per_entry(layer_name = layer_name, path_input = path_input,
round_digits = round_digits, filename = filename,
batch_size = batch_size, verbose = verbose, model = model, format = format,
# ambiguous_nuc = ambiguous_nuc, use_quality = use_quality, vocabulary = vocabulary,
reverse_complement_encoding = reverse_complement_encoding, ...)
}
}
#' Write output of specific model layer to h5 or csv file.
#'
#' Removes layers (optional) from pretrained model and calculates states of fasta file, writes states to h5/csv file.
#' The function combines all fasta entries in the file into one sequence, so predictor sequences can contain elements from more than one fasta entry.
#' h5 file also contains sequence and positions of targets corresponding to states.
#'
#' @inheritParams generator_fasta_lm
#' @param layer_name Name of layer to get output from. If `NULL`, will use the last layer.
#' @param path_input Path to fasta file.
#' @param sequence Character string, ignores path_input if argument given.
#' @param round_digits Number of decimal places.
#' @param batch_size Number of samples to evaluate at once. Does not change output, only relevant for speed and memory.
#' @param step Frequency of sampling steps.
#' @param filename Filename to store states in. No file output if argument is `NULL`.
#' @param vocabulary Vector of allowed characters; characters outside the vocabulary get encoded as zero-vectors.
#' @param return_states Logical scalar, return states matrix.
#' @param ambiguous_nuc `"zero"` or `"equal"`.
#' @param verbose Whether to print model before and after removing layers.
#' @param output_type Either `"h5"` or `"csv"`.
#' @param model A keras model.
#' @param mode Either `"lm"` for language model or `"label"` for label classification.
#' @param format Either `"fasta"` or `"fastq"`.
#' @param include_seq Whether to include input sequence in h5 file.
#' @param ... Further arguments for sequence encoding with \code{\link{seq_encoding_label}}.
#' @noRd
predict_model_one_seq <- function(model, layer_name = NULL, sequence = NULL, path_input = NULL, round_digits = 2,
filename = "states.h5", step = 1, vocabulary = c("a", "c", "g", "t"), batch_size = 256, verbose = TRUE,
return_states = FALSE, target_len = 1, use_quality = FALSE, quality_string = NULL,
output_type = "h5", mode = "lm", lm_format = "target_right",
ambiguous_nuc = "zero", padding = "none", format = "fasta", output_dir = NULL,
include_seq = TRUE, reverse_complement_encoding = FALSE, ...) {
vocabulary <- stringr::str_to_lower(vocabulary)
stopifnot(mode %in% c("lm", "label"))
stopifnot(output_type %in% c("h5", "csv"))
if (!is.null(quality_string)) use_quality <- TRUE
# if (!is.null(quality_string) & !is.null(sequence)) {
# stopifnot(length(sequence) == length(quality_to_probability(quality_string)))
# }
file_output <- !is.null(filename)
if (!file_output) {
if (!return_states) stop("If filename is NULL, return_states must be TRUE; otherwise function produces no output.")
filename <- tempfile(fileext = paste0(".", output_type))
}
stopifnot(batch_size > 0)
stopifnot(!file.exists(filename))
if (reverse_complement_encoding) {
test_len <- length(vocabulary) != 4
if (test_len || any(sort(stringr::str_to_lower(vocabulary)) != c("a", "c", "g", "t"))) {
stop("reverse_complement_encoding only implemented for A,C,G,T vocabulary")
}
}
# token for ambiguous nucleotides
for (i in letters) {
if (!(i %in% stringr::str_to_lower(vocabulary))) {
amb_nuc_token <- i
break
}
}
tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), c(vocabulary, amb_nuc_token))
if (is.null(layer_name)) {
layer_name <- model$output_names
if (verbose) message(paste("layer_name not specified. Using layer", layer_name))
}
if (!is.null(sequence) && sequence != "") {
nt_seq <- sequence %>% stringr::str_to_lower()
} else {
if (format == "fasta") {
fasta.file <- microseq::readFasta(path_input)
}
if (format == "fastq") {
fasta.file <- microseq::readFastq(path_input)
}
if (nrow(fasta.file) > 1 && verbose) {
text_1 <- paste("Your file has", nrow(fasta.file), "entries. 'one_seq' output_format will concatenate them into a single sequence.\n")
text_2 <- "Use 'by_entry' or 'by_entry_one_file' output_format to evaluate them separately."
message(paste0(text_1, text_2))
}
nt_seq <- paste(fasta.file$Sequence, collapse = "") %>% stringr::str_to_lower()
}
# tokenize ambiguous nt
pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
nt_seq <- stringr::str_replace_all(string = nt_seq, pattern = pattern, amb_nuc_token)
if (use_quality & is.null(quality_string)) {
quality_string <- paste(fasta.file$Quality, collapse = "")
}
# extract maxlen
target_middle <- mode == "lm" && lm_format %in% c("target_middle_lstm", "target_middle_cnn")
if (!target_middle) {
if (reverse_complement_encoding) {
maxlen <- model$input[[1]]$shape[[2]]
} else {
maxlen <- model$input$shape[[2]]
}
} else {
maxlen_1 <- model$input[[1]]$shape[[2]]
maxlen_2 <- model$input[[2]]$shape[[2]]
maxlen <- maxlen_1 + maxlen_2
}
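# a language model sample needs maxlen input positions plus target_len target positions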
total_seq_len <- ifelse(mode == "lm", maxlen + target_len, maxlen)
# pad sequence
unpadded_seq_len <- nchar(nt_seq)
pad_len <- 0
if (padding == "maxlen") {
pad_len <- maxlen
}
if (padding == "standard" & (unpadded_seq_len < total_seq_len)) {
pad_len <- total_seq_len - unpadded_seq_len
}
if (padding == "self" & (unpadded_seq_len < total_seq_len)) {
nt_seq <- strrep(nt_seq, ceiling(total_seq_len / unpadded_seq_len))
nt_seq <- substr(nt_seq, 1, total_seq_len)
if (use_quality) {
quality_string <- strrep(quality_string, ceiling(total_seq_len / unpadded_seq_len))
quality_string <- substr(quality_string, 1, total_seq_len)
}
} else {
nt_seq <- paste0(strrep("0", pad_len), nt_seq)
if (use_quality) quality_string <- paste0(strrep("0", pad_len), quality_string)
}
if (nchar(nt_seq) < total_seq_len) {
stop(paste0("Input sequence is shorter than required length (", total_seq_len, "). Use padding argument to pad sequence to bigger size."))
}
if (use_quality) {
quality_vector <- quality_string %>% quality_to_probability()
} else {
quality_vector <- NULL
}
# start of samples
start_indices <- seq(1, nchar(nt_seq) - total_seq_len + 1, by = step)
num_samples <- length(start_indices)
check_layer_name(model, layer_name)
model <- tensorflow::tf$keras$Model(model$input, model$get_layer(layer_name)$output)
if (verbose) {
cat("Computing output for model at layer", layer_name, "\n")
print(model)
}
# extract number of neurons in last layer
if (length(model$output$shape$dims) == 3) {
if (!("lstm" %in% stringr::str_to_lower(model$output_names))) {
stop("Output dimension of layer is > 1, format not supported yet")
}
layer.size <- model$output$shape[[3]]
} else {
layer.size <- model$output$shape[[2]]
}
# tokenize sequence
nt_seq <- stringr::str_to_lower(nt_seq)
tokSeq <- keras::texts_to_sequences(tokenizer, nt_seq)[[1]] - 1
# seq end position
pos_arg <- start_indices + total_seq_len - pad_len - 1
if (include_seq) {
output_seq <- substr(nt_seq, pad_len + 1, nchar(nt_seq))
}
# create h5 file to store states
if (output_type == "h5") {
h5_file <- hdf5r::H5File$new(filename, mode = "w")
h5_file[["multi_entries"]] <- FALSE
h5_file[["sample_end_position"]] <- pos_arg
if (include_seq) h5_file[["sequence"]] <- output_seq
}
number_batches <- ceiling(length(start_indices)/batch_size)
pred_list <- vector("list", number_batches)
# subset input for target middle
if (mode == "lm" && lm_format %in% c("target_middle_lstm", "target_middle_cnn")) {
index_x_1 <- 1:ceiling((total_seq_len - target_len)/2)
index_x_2 <- (max(index_x_1) + target_len + 1) : total_seq_len
}
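# encode and predict batch-wise; the last batch may contain fewer samples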
for (i in 1:number_batches) {
index_start <- ((i - 1) * batch_size) + 1
index_end <- min(c(num_samples + 1, index_start + batch_size)) - 1
index <- index_start : index_end
x <- seq_encoding_label(sequence = tokSeq,
maxlen = total_seq_len,
vocabulary = vocabulary,
start_ind = start_indices[index],
ambiguous_nuc = ambiguous_nuc,
tokenizer = NULL,
adjust_start_ind = FALSE,
quality_vector = quality_vector,
...
)
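# for target-middle formats, split the encoded batch into the inputs before and after the target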
if (mode == "lm" && lm_format == "target_middle_lstm") {
x1 <- x[ , index_x_1, ]
x2 <- x[ , index_x_2, ]
if (length(index_x_1) == 1 | dim(x)[1] == 1) {
x1 <- array(x1, dim = c(1, dim(x1)))
}
if (length(index_x_2) == 1 | dim(x)[1] == 1) {
x2 <- array(x2, dim = c(1, dim(x2)))
}
x2 <- x2[ , dim(x2)[2]:1, ] # reverse order
if (length(dim(x2)) == 2) {
x2 <- array(x2, dim = c(1, dim(x2)))
}
x <- list(x1, x2)
}
if (mode == "lm" && lm_format == "target_middle_cnn") {
x <- x[ , c(index_x_1, index_x_2), ]
}
if (reverse_complement_encoding) x <- list(x, reverse_complement_tensor(x))
y <- stats::predict(model, x, verbose = 0)
if (!is.null(round_digits)) y <- round(y, round_digits)
pred_list[[i]] <- y
}
states <- do.call(rbind, pred_list)
if (file_output) {
if (output_type == "h5") {
h5_file[["states"]] <- states
h5_file$close_all()
} else {
col_names <- paste0("N", 1:ncol(states))
colnames(states) <- col_names
utils::write.csv(x = states, file = filename, row.names = FALSE)
}
}
if (return_states) {
output_list <- list()
output_list$states <- states
output_list$sample_end_position <- pos_arg
if (include_seq) output_list$sequence <- output_seq
return(output_list)
} else {
return(NULL)
}
}
#' Write states to h5 file
#'
#' @description Removes layers (optional) from pretrained model and calculates states of fasta file, writes a separate
#' h5 file for every fasta entry in fasta file. h5 files also contain the nucleotide sequence and positions of targets corresponding to states.
#' Names of output files are: `output_dir` + `filename` + "_nr_" + i + "." + `output_type`, where i is the number of the fasta entry.
#'
#' @param filename Filename to store states in; the function appends "_nr_" + i to the name, where i is the entry number.
#' @param output_dir Path to folder, where to write output.
#' @noRd
predict_model_by_entry <- function(model, layer_name = NULL, path_input, round_digits = 2,
filename = "states.h5", output_dir = NULL, step = 1, vocabulary = c("a", "c", "g", "t"),
batch_size = 256, output_type = "h5", mode = "lm",
lm_format = "target_right", format = "fasta", use_quality = FALSE,
reverse_complement_encoding = FALSE, padding = "none",
verbose = FALSE, include_seq = FALSE, ambiguous_nuc = "zero", ...) {
stopifnot(mode %in% c("lm", "label"))
stopifnot(!is.null(filename))
stopifnot(!is.null(output_dir))
if (endsWith(filename, paste0(".", output_type))) {
filename <- stringr::str_remove(filename, paste0("\\.", output_type, "$"))
}
filename <- basename(filename)
# extract maxlen
target_middle <- mode == "lm" && lm_format %in% c("target_middle_lstm", "target_middle_cnn")
if (!target_middle) {
if (reverse_complement_encoding) {
maxlen <- model$input[[1]]$shape[[2]]
} else {
maxlen <- model$input$shape[[2]]
}
} else {
maxlen_1 <- model$input[[1]]$shape[[2]]
maxlen_2 <- model$input[[2]]$shape[[2]]
maxlen <- maxlen_1 + maxlen_2
}
# load fasta file
if (format == "fasta") {
fasta.file <- microseq::readFasta(path_input)
}
if (format == "fastq") {
fasta.file <- microseq::readFastq(path_input)
}
df <- fasta.file[ , c("Sequence", "Header")]
names(df) <- c("seq", "header")
rownames(df) <- NULL
num_skipped_seq <- 0
for (i in 1:nrow(df)) {
# skip entry if too short
if ((nchar(df[i, "seq"]) < maxlen) & padding == "none") {
num_skipped_seq <- num_skipped_seq + 1
next
}
if (use_quality) {
quality_string <- fasta.file$Quality[i]
} else {
quality_string <- NULL
}
current_file <- paste0(output_dir, "/", filename, "_nr_", as.character(i), ".", output_type)
predict_model_one_seq(layer_name = layer_name, sequence = df[i, "seq"],
round_digits = round_digits, path_input = path_input,
filename = current_file, quality_string = quality_string,
step = step, vocabulary = vocabulary, batch_size = batch_size,
verbose = ifelse(i > 1, FALSE, verbose),
output_type = output_type, mode = mode,
lm_format = lm_format, model = model, include_seq = include_seq,
padding = padding, ambiguous_nuc = "zero",
reverse_complement_encoding = reverse_complement_encoding, ...)
}
if (verbose & num_skipped_seq > 0) {
message(paste0("Skipped ", num_skipped_seq,
ifelse(num_skipped_seq == 1, " entry", " entries"),
". Use different padding option to evaluate all."))
}
}
#' Write states to h5 file
#'
#' @description Removes layers (optional) from pretrained model and calculates states of fasta file,
#' writes a separate states matrix for every fasta entry into one h5 file.
#' The h5 file also contains the nucleotide sequences and positions of targets corresponding to states.
#' @noRd
predict_model_by_entry_one_file <- function(model, path_input, round_digits = 2, filename = "states.h5",
step = 1, vocabulary = c("a", "c", "g", "t"), batch_size = 256, layer_name = NULL,
verbose = TRUE, mode = "lm", use_quality = FALSE,
lm_format = "target_right", padding = "none",
format = "fasta", include_seq = TRUE, reverse_complement_encoding = FALSE, ...) {
vocabulary <- stringr::str_to_lower(vocabulary)
stopifnot(mode %in% c("lm", "label"))
target_middle <- mode == "lm" && lm_format %in% c("target_middle_lstm", "target_middle_cnn")
# extract maxlen
if (!target_middle) {
if (reverse_complement_encoding) {
maxlen <- model$input[[1]]$shape[[2]]
} else {
maxlen <- model$input$shape[[2]]
}
} else {
maxlen_1 <- model$input[[1]]$shape[[2]]
maxlen_2 <- model$input[[2]]$shape[[2]]
maxlen <- maxlen_1 + maxlen_2
}
# extract number of neurons in last layer
if (length(model$output$shape$dims) == 3) {
if (!("lstm" %in% stringr::str_to_lower(model$output_names))) {
stop("Output dimension of layer is > 1, format not supported yet")
}
layer.size <- model$output$shape[[3]]
} else {
layer.size <- model$output$shape[[2]]
}
# load fasta file
if (format == "fasta") {
fasta.file <- microseq::readFasta(path_input)
}
if (format == "fastq") {
fasta.file <- microseq::readFastq(path_input)
}
df <- fasta.file[ , c("Sequence", "Header")]
names(df) <- c("seq", "header")
rownames(df) <- NULL
if (verbose) {
# check if names are unique
if (length(df$header) != length(unique(df$header))) {
message("Header names are not unique, adding '_header_x' to names (x being the header number)")
df$header <- paste0(df$header, paste0("_header_", 1:length(df$header)))
}
}
# create h5 file to store states
h5_file <- hdf5r::H5File$new(filename, mode = "w")
h5_file[["multi_entries"]] <- TRUE
states.grp <- h5_file$create_group("states")
sample_end_position.grp <- h5_file$create_group("sample_end_position")
if (include_seq) seq.grp <- h5_file$create_group("sequence")
num_skipped_seq <- 0
for (i in 1:nrow(df)) {
#seq_name <- df$header[i]
seq_name <- paste0("entry_", i)
temp_file <- tempfile(fileext = ".h5")
# skip entry if too short
if ((nchar(df[i, "seq"]) < maxlen) & padding == "none") {
num_skipped_seq <- num_skipped_seq + 1
next
}
if (use_quality) {
quality_string <- fasta.file$Quality[i]
} else {
quality_string <- NULL
}
output_list <- predict_model_one_seq(layer_name = layer_name, sequence = df$seq[i], path_input = path_input,
round_digits = round_digits, filename = temp_file, step = step, vocabulary = vocabulary,
batch_size = batch_size, return_states = TRUE, quality_string = quality_string,
output_type = "h5", model = model, mode = mode, lm_format = lm_format,
ambiguous_nuc = "zero", verbose = ifelse(i > 1, FALSE, verbose),
padding = padding, format = format, include_seq = include_seq,
reverse_complement_encoding = reverse_complement_encoding, ...)
states.grp[[seq_name]] <- output_list$states
sample_end_position.grp[[seq_name]] <- output_list$sample_end_position
if (include_seq) seq.grp[[seq_name]] <- output_list$sequence
}
if (verbose & num_skipped_seq > 0) {
message(paste0("Skipped ", num_skipped_seq,
ifelse(num_skipped_seq == 1, " entry", " entries"),
". Use different padding option to evaluate all."))
}
h5_file$close_all()
}
#' Get states for label classification model.
#'
#' Computes output at specified model layer. Forces every fasta entry to have length maxlen by either padding sequences
#' shorter than maxlen or taking a random subsequence of longer sequences.
#'
#' @inheritParams predict_model_one_seq
#' @noRd
predict_model_one_pred_per_entry <- function(model, layer_name = NULL, path_input, round_digits = 2, format = "fasta",
ambiguous_nuc = "zero", filename = "states.h5", padding = padding,
vocabulary = c("a", "c", "g", "t"), batch_size = 256, verbose = TRUE,
return_states = FALSE, reverse_complement_encoding = FALSE,
include_seq = FALSE, use_quality = FALSE, ...) {
vocabulary <- stringr::str_to_lower(vocabulary)
file_type <- "h5"
stopifnot(batch_size > 0)
stopifnot(!file.exists(filename))
tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "N"), vocabulary)
if (is.null(layer_name)) {
layer_name <- model$output_names
if (verbose) message(paste("layer_name not specified. Using layer", layer_name))
}
# extract maxlen
if (reverse_complement_encoding) {
maxlen <- model$input[[1]]$shape[[2]]
} else {
maxlen <- model$input$shape[[2]]
}
if (format == "fasta") {
fasta.file <- microseq::readFasta(path_input)
}
if (format == "fastq") {
fasta.file <- microseq::readFastq(path_input)
}
num_samples <- nrow(fasta.file)
nucSeq <- as.character(fasta.file$Sequence)
seq_length <- nchar(fasta.file$Sequence)
if (use_quality) {
quality_string <- vector("character", length(nucSeq))
} else {
quality_string <- NULL
}
for (i in 1:length(nucSeq)) {
# take random subsequence
if (seq_length[i] > maxlen) {
start <- sample(1 : (seq_length[i] - maxlen + 1) , size = 1)
nucSeq[i] <- substr(nucSeq[i], start = start, stop = start + maxlen - 1)
if (use_quality) {
quality_string[i] <- substr(fasta.file$Quality[i], start = start, stop = start + maxlen - 1)
}
}
# pad sequence
if (seq_length[i] < maxlen) {
nucSeq[i] <- paste0(paste(rep("N", maxlen - seq_length[i]), collapse = ""), nucSeq[i])
if (use_quality) {
quality_string[i] <- paste0(paste(rep("0", maxlen - seq_length[i]), collapse = ""), fasta.file$Quality[i])
}
}
}
model <- tensorflow::tf$keras$Model(model$input, model$get_layer(layer_name)$output)
if (verbose) {
cat("Computing output for model at layer", layer_name, "\n")
print(model)
}
# extract number of neurons in last layer
if (length(model$output$shape$dims) == 3) {
if (!("lstm" %in% stringr::str_to_lower(model$output_names))) {
stop("Output dimension of layer is > 1, format not supported yet")
}
layer.size <- model$output$shape[[3]]
} else {
layer.size <- model$output$shape[[2]]
}
if (file_type == "h5") {
# create h5 file to store states
h5_file <- hdf5r::H5File$new(filename, mode = "w")
if (!missing(path_input)) h5_file[["fasta_file"]] <- path_input
h5_file[["header_names"]] <- fasta.file$Header
if (include_seq) h5_file[["sequences"]] <- nucSeq
h5_file[["states"]] <- array(0, dim = c(0, layer.size))
h5_file[["multi_entries"]] <- FALSE
writer <- h5_file[["states"]]
}
rm(fasta.file)
#nucSeq <- paste(nucSeq, collapse = "") %>% stringr::str_to_lower()
number_batches <- ceiling(num_samples/batch_size)
if (verbose) cat("Evaluating", number_batches, ifelse(number_batches > 1, "batches", "batch"), "\n")
row <- 1
string_start_index <- 1
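# thresholds for printing progress in 10% steps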
ten_percent_steps <- seq(number_batches/10, number_batches, length.out = 10)
percentage_index <- 1
if (number_batches > 1) {
for (i in 1:(number_batches - 1)) {
string_end_index <- string_start_index + batch_size - 1
char_seq <- nucSeq[string_start_index : string_end_index] %>% paste(collapse = "")
if (use_quality) {
quality_string_subset <- quality_string[string_start_index : string_end_index] %>% paste(collapse = "")
} else {
quality_string_subset <- NULL
}
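# entries don't overlap: each occupies exactly maxlen characters of char_seq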
if (i == 1) start_ind <- seq(1, nchar(char_seq), maxlen)
one_hot_batch <- seq_encoding_label(sequence = NULL, maxlen = maxlen, vocabulary = vocabulary,
start_ind = start_ind, ambiguous_nuc = ambiguous_nuc,
char_sequence = char_seq, quality_vector = quality_string_subset,
tokenizer = tokenizer, adjust_start_ind = TRUE, ...)
if (reverse_complement_encoding) one_hot_batch <- list(one_hot_batch, reverse_complement_tensor(one_hot_batch))
activations <- keras::predict_on_batch(model, one_hot_batch)
writer[row : (row + batch_size - 1), ] <- activations
row <- row + batch_size
string_start_index <- string_end_index + 1
if (verbose & (i > ten_percent_steps[percentage_index]) & percentage_index < 10) {
cat("Progress: ", percentage_index * 10 ,"% \n")
percentage_index <- percentage_index + 1
}
}
}
# last batch might be shorter
char_seq <- nucSeq[string_start_index : length(nucSeq)] %>% paste(collapse = "")
if (use_quality) {
quality_string_subset <- quality_string[string_start_index : length(nucSeq)] %>% paste(collapse = "")
} else {
quality_string_subset <- NULL
}
one_hot_batch <- seq_encoding_label(sequence = NULL, maxlen = maxlen, vocabulary = vocabulary,
start_ind = seq(1, nchar(char_seq), maxlen), ambiguous_nuc = ambiguous_nuc,
quality_vector = quality_string_subset, char_sequence = char_seq,
tokenizer = tokenizer, adjust_start_ind = TRUE, ...)
if (reverse_complement_encoding) one_hot_batch <- list(one_hot_batch, reverse_complement_tensor(one_hot_batch))
activations <- keras::predict_on_batch(model, one_hot_batch)
writer[row : num_samples, ] <- activations[1 : length(row:num_samples), ]
if (verbose) cat("Progress: 100 % \n")
if (return_states & (file_type == "h5")) states <- writer[ , ]
if (file_type == "h5") h5_file$close_all()
if (return_states) return(states)
}
#' Read states from h5 file
#'
#' Reads h5 file created by \code{\link{predict_model}} function.
#'
#' @param h5_path Path to h5 file.
#' @param rows Range of rows to read. If `NULL`, read everything.
#' @param get_sample_position Return position of sample corresponding to state if `TRUE`.
#' @param get_seq Return nucleotide sequence if `TRUE`.
#' @param verbose Boolean.
#' @examplesIf reticulate::py_module_available("tensorflow")
#' # make prediction for single sequence and write to h5 file
#' model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
#' vocabulary <- c("a", "c", "g", "t")
#' sequence <- paste(sample(vocabulary, 200, replace = TRUE), collapse = "")
#' output_file <- tempfile(fileext = ".h5")
#' predict_model(output_format = "one_seq", model = model, step = 10,
#' sequence = sequence, filename = output_file, mode = "label")
#' load_prediction(h5_path = output_file)
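#'
#' # read only the first two prediction rows and the corresponding sample positions
#' load_prediction(h5_path = output_file, rows = 1:2, get_sample_position = TRUE)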
#'
#' @returns A list of data frames, containing model predictions and sequence positions.
#' @export
load_prediction <- function(h5_path, rows = NULL, verbose = FALSE,
get_sample_position = FALSE, get_seq = FALSE) {
complete <- is.null(rows)
h5_file <- hdf5r::H5File$new(h5_path, mode = "r")
multi_entries <- ifelse(h5_file[["multi_entries"]][], TRUE, FALSE)
if (!multi_entries) {
number_entries <- 1
} else {
entry_names <- names(h5_file[["states"]])
number_entries <- length(entry_names)
output_list <- list()
}
train_mode <- "label"
if (get_sample_position & !any(c("sample_end_position", "target_position") %in% names(h5_file))) {
get_sample_position <- FALSE
message("File does not contain target positions.")
}
if (!multi_entries) {
read_states <- h5_file[["states"]]
if (get_sample_position) {
read_targetPos <- h5_file[["sample_end_position"]]
}
if (verbose) {
cat("states matrix has", dim(read_states[ , ])[1], "rows and", dim(read_states[ , ])[2], "columns \n")
}
if (complete) {
states <- read_states[ , ]
if (get_sample_position) {
targetPos <- read_targetPos[ ]
}
} else {
states <- read_states[rows, ]
if (get_sample_position) {
targetPos <- read_targetPos[rows]
}
}
if (is.null(dim(states))) {
states <- matrix(states, nrow = 1)
}
contains_seq <- FALSE
if (get_seq) {
if ("sequence" %in% names(h5_file)) {
contains_seq <- TRUE
sequence <- h5_file[["sequence"]][]
} else {
contains_seq <- FALSE
message("File does not contain sequence.")
}
}
h5_file$close_all()
output_list <- list(states = states)
if (get_sample_position) {
if (train_mode == "label") {
output_list$sample_end_position <- targetPos
} else {
output_list$target_position <- targetPos
}
}
if (get_seq && contains_seq) {
output_list$sequence <- sequence
}
return(output_list)
# multi entries
} else {
if (verbose) {
cat("file contains", number_entries, "entries \n")
}
if (get_sample_position) {
target_name <- "sample_end_position"
}
if (get_seq & !("sequence" %in% names(h5_file))) {
message("File does not contain sequence.")
get_seq <- FALSE
}
for (i in 1:number_entries) {
entry_name <- entry_names[i]
states <- h5_file[["states"]][[entry_name]][ , ]
if (is.null(dim(states))) {
states <- matrix(states, nrow = 1)
}
if (get_seq) {
sequence <- h5_file[["sequence"]][[entry_name]][ ]
}
if (get_sample_position) {
targetPos <- h5_file[[target_name]][[entry_name]][ ]
}
if (!complete) {
states <- states[rows, ]
if (is.null(dim(states))) {
states <- matrix(states, nrow = 1)
}
if (get_sample_position) {
targetPos <- targetPos[rows]
}
}
if (get_sample_position) {
l <- list(states = states, sample_end_position = targetPos)
} else {
l <- list(states = states)
}
if (get_seq) {
l[["sequence"]] <- sequence
}
output_list[[entry_name]] <- l
}
h5_file$close_all()
names(output_list) <- entry_names
return(output_list)
}
}
#' Create summary of predictions
#'
#' Create a summary data frame for confidence predictions over one or several state files or a data frame.
#' Columns in the file or data frame should be confidence predictions for one class,
#' i.e. each row should sum to 1 and have nonnegative entries.
#' The output data frame contains average confidence scores, the maximum score and the percentage of votes for each class.
#'
#' @param states_path Folder containing state files, or a single file with the same ending as `file_type`.
#' @param label_names Names of predicted classes.
#' @param file_type `"h5"` or `"csv"`.
#' @param df A states data frame. Ignores the `states_path` argument if not `NULL`.
#' @examples
#' m <- c(0.9, 0.1, 0.2, 0.01,
#' 0.05, 0.7, 0.2, 0,
#' 0.05, 0.2, 0.6, 0.99) %>% matrix(ncol = 3)
#'
#' label_names <- paste0("class_", 1:3)
#' df <- as.data.frame(m)
#' pred_summary <- summarize_states(label_names = label_names, df = df)
#' pred_summary
#'
#' @returns A data frame of predictions summaries.
#' @export
summarize_states <- function(states_path = NULL, label_names = NULL, file_type = "h5", df = NULL) {
df_supplied <- !is.null(df)
if (df_supplied) {
states_path <- NULL
}
if (is.null(states_path)) {
state_files <- 1
} else {
if (endsWith(states_path, file_type)) {
state_files <- states_path
} else {
state_files <- list.files(states_path, full.names = TRUE)
}
}
if (!is.null(label_names)) {
num_labels <- length(label_names)
}
summary_list <- list()
for (state_file in state_files) {
if (!df_supplied) {
if (file_type == "h5") {
df <- load_prediction(h5_path = state_file, get_sample_position = FALSE, verbose = FALSE)
df <- as.data.frame(df$states)
}
if (file_type == "csv") {
df <- utils::read.csv(state_file)
if (!is.null(label_names) && ncol(df) != num_labels) {
df <- utils::read.csv2(state_file)
}
}
}
if (state_file == state_files[1] & is.null(label_names)) {
label_names <- paste0("X_", 1:ncol(df))
num_labels <- length(label_names)
}
stopifnot(ncol(df) == num_labels)
names(df) <- c(label_names)
mean_df <- data.frame(matrix(0, nrow = 1, ncol = num_labels))
names(mean_df) <- paste0("mean_conf_", label_names)
max_df <- data.frame(matrix(0, nrow = 1, ncol = num_labels))
names(max_df) <- paste0("max_conf_", label_names)
for (label in label_names) {
mean_df[[paste0("mean_conf_", label)]] <- mean(df[[label]])
max_df[[paste0("max_conf_", label)]] <- max(df[[label]])
}
vote_distribution <- apply(df[label_names], 1, which.max)
vote_perc <- table(factor(vote_distribution, levels = 1:length(label_names)))/length(vote_distribution)
votes_df <- data.frame(matrix(vote_perc, nrow = 1, ncol = num_labels))
names(votes_df) <- paste0("vote_perc_", label_names)
mean_prediction <- label_names[which.max(unlist(mean_df))]
max_prediction <- label_names[which.max(unlist(max_df))]
vote_prediction <- label_names[which.max(vote_perc)]
if (is.null(states_path)) {
file_name <- NA
} else {
file_name <- basename(state_file)
}
summary_list[[state_file]] <- data.frame(file_name, mean_df, max_df, votes_df,
mean_prediction, max_prediction, vote_prediction,
num_prediction = nrow(df))
}
summary_df <- data.table::rbindlist(summary_list)
return(summary_df)
}