deepG: Deep Learning for Genome Sequence Data

Documented in n_gram_dist predict_with_n_gram

#' Get distribution of n-grams
#' 
#' Get distribution of next character given previous n nucleotides.
#' Every file is read, all (n-gram, next character) pairs at the start positions
#' returned by \code{get_start_ind} are counted, and the counts are summed over
#' all files before being normalized per n-gram.
#'
#' @inheritParams generator_fasta_lm
#' @param path_input Path to folder containing fasta files or single fasta file.
#' @param n Size of n gram.
#' @param vocabulary Vector of allowed characters, samples outside vocabulary get discarded.
#' @param file_sample If integer, size of random sample of files in \code{path_input}.
#' @param nuc_dist Nucleotide distribution. Currently not used by this function.
#' @examples
#' temp_dir <- tempfile()
#' dir.create(temp_dir)
#' create_dummy_data(file_path = temp_dir,
#'                   num_files = 3,
#'                   seq_length = 80,
#'                   vocabulary = c("A", "C", "G", "T"),
#'                   num_seq = 2)
#' 
#' m <- n_gram_dist(path_input = temp_dir,
#'                  n = 3,
#'                  step = 1,
#'                  nuc_dist = FALSE)
#' head(round(m, 2))
#' @returns A data frame with one row per possible n-gram (row names) and one
#' column per character in \code{vocabulary}. Each row holds the relative
#' frequency of the next character given that n-gram; rows of n-grams that
#' never occurred are all zero.
#' @export
n_gram_dist <- function(path_input,
                        n = 2,
                        vocabulary = c("A", "C", "G", "T"),
                        format = "fasta",
                        file_sample = NULL,
                        step = 1,
                        nuc_dist = FALSE) {
  
  stopifnot(is.numeric(n), length(n) == 1, n >= 1)
  if (!(format %in% c("fasta", "fastq"))) {
    stop("`format` must be \"fasta\" or \"fastq\".", call. = FALSE)
  }
  
  # a path ending in the format extension is treated as a single file,
  # anything else as a folder to be searched for such files
  if (endsWith(path_input, paste0(".", format))) {
    fasta_files <- path_input
  } else {
    fasta_files <- list.files(
      path = path_input,
      pattern = paste0("\\.", format, "$"),
      full.names = TRUE)
  }
  
  # take random subset of files
  if (!is.null(file_sample)) {
    fasta_files <- sample(fasta_files)[seq_len(min(file_sample, length(fasta_files)))]
  }
  num_files <- length(fasta_files)
  if (num_files == 0) {
    stop("No ", format, " files found in `path_input`.", call. = FALSE)
  }
  
  # all possible n-grams; the first position varies fastest, which is the
  # same order table() uses for the factor levels below
  label_grid <- expand.grid(rep(list(vocabulary), n), stringsAsFactors = FALSE)
  labels <- do.call(paste0, unname(as.list(label_grid)))
  
  # count table with one row per (n-gram, next character) combination
  freq_df <- data.frame(gram = rep(labels, times = length(vocabulary)),
                        targets = rep(vocabulary, each = length(labels)),
                        freq = 0)
  
  # matches any character that is not part of the vocabulary
  amb_pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
  
  for (i in seq_len(num_files)) {
    
    if (format == "fasta") {
      fasta_file <- microseq::readFasta(fasta_files[i])
    } else {
      fasta_file <- microseq::readFastq(fasta_files[i])
    }
    
    seq_vector <- fasta_file$Sequence
    start_ind <- get_start_ind(seq_vector = seq_vector,
                               length_vector = nchar(seq_vector),
                               maxlen = n, step = step, train_mode = "lm")
    
    # all entries of the file are concatenated; gram[k] covers positions
    # k, ..., k + n - 1 and targets[k] is the character at position k + n
    nuc_seq <- paste(seq_vector, collapse = "")
    split_seq <- strsplit(nuc_seq, "")[[1]]
    nuc_seq_length <- nchar(nuc_seq)
    num_grams <- nuc_seq_length - n
    gram <- split_seq[1:num_grams]
    for (offset in seq_len(n - 1)) {
      gram <- paste0(gram, split_seq[(1 + offset):(num_grams + offset)])
    }
    targets <- split_seq[(n + 1):nuc_seq_length]
    
    # remove sequences with overlapping fasta entries
    gram <- gram[start_ind]
    targets <- targets[start_ind]
    
    # remove sequences with ambiguous nucleotides
    ambiguous <- stringr::str_detect(gram, amb_pattern) |
      stringr::str_detect(targets, amb_pattern)
    keep <- which(!ambiguous)
    gram <- gram[keep]
    targets <- targets[keep]
    
    gram_df <- data.frame(gram = factor(gram, levels = labels),
                          targets = factor(targets, levels = vocabulary))
    table_df <- as.data.frame(table(gram_df))
    
    # counts may only be added if both tables list the combinations in the same order
    stopifnot(all(freq_df$gram == table_df$gram),
              all(freq_df$targets == table_df$targets))
    
    freq_df$freq <- freq_df$freq + table_df$Freq
  }
  
  df_to_distribution_matrix(freq_df, vocabulary = vocabulary)
}

# Turn a long table of (n-gram, next character) counts into a table of
# conditional distributions.
#
# freq_df:    data frame with exactly the columns "gram", "targets" and "freq".
#             The rows of every target character must list the n-grams in the
#             same order as the first block of rows.
# vocabulary: characters to use as columns of the result.
#
# Returns a data frame with one row per n-gram (row names) and one column per
# vocabulary character. Rows with at least one count are divided by their row
# sum; rows without any count stay all zero.
df_to_distribution_matrix <- function(freq_df, vocabulary = c("A", "C", "G", "T")) {
  
  stopifnot(identical(names(freq_df), c("gram", "targets", "freq")))
  
  # the first block of rows defines the n-gram order of the result
  num_grams <- nlevels(factor(freq_df$gram))
  gram_labels <- as.character(freq_df$gram[seq_len(num_grams)])
  
  counts <- matrix(0, nrow = num_grams, ncol = length(vocabulary),
                   dimnames = list(gram_labels, vocabulary))
  
  for (nuc in vocabulary) {
    nuc_rows <- which(freq_df$targets == nuc)
    # every target character needs one count per n-gram, in matching order
    stopifnot(length(nuc_rows) == num_grams,
              all(as.character(freq_df$gram[nuc_rows]) == gram_labels))
    counts[, nuc] <- freq_df$freq[nuc_rows]
  }
  
  # normalize only rows that were observed, to avoid division by zero
  row_totals <- rowSums(counts)
  observed <- which(row_totals != 0)
  if (length(observed) > 0) {
    counts[observed, ] <- counts[observed, , drop = FALSE] / row_totals[observed]
  }
  
  # always a data frame, also for a vocabulary with a single character
  as.data.frame(counts)
}

#' Predict the next nucleotide using n-gram
#'
#' Predict the next nucleotide using n-gram. For every n-gram the most frequent
#' next character according to \code{distribution_matrix} is used as prediction;
#' predictions are then compared with the true next characters in the input files.
#'
#' @inheritParams generator_fasta_lm
#' @param path_input Path to folder containing fasta files or single fasta file.
#' @param distribution_matrix A data frame containing frequency of next nucleotide given the previous n nucleotides (output of \code{\link{n_gram_dist}} function).
#' @param default_pred Either character from vocabulary or `"random"`. Will be used as prediction if certain n-gram did not appear before.
#' If `"random"` assign random prediction.
#' @param vocabulary Vector of allowed characters, samples outside vocabulary get discarded.
#' @param file_sample If integer, size of random sample of files in \code{path_input}.
#' @param return_data_frames Boolean, whether to return data frame with input, predictions, target position and true target.
#'
#' @examples
#' # create dummy fasta files
#' temp_dir <- tempfile()
#' dir.create(temp_dir)
#' create_dummy_data(file_path = temp_dir,
#'                   num_files = 3,
#'                   seq_length = 8,
#'                   vocabulary = c("A", "C", "G", "T"),
#'                   num_seq = 2)
#' 
#' m <- n_gram_dist(path_input = temp_dir,
#'                  n = 3,
#'                  step = 1,
#'                  nuc_dist = FALSE)
#' 
#' # use distribution matrix to make predictions for one file
#' predictions <- predict_with_n_gram(path_input = list.files(temp_dir, full.names = TRUE)[1], 
#'                                    distribution_matrix = m)
#' 
#' # show accuracy
#' predictions[[1]]
#' 
#' @returns List with one entry per evaluated file. Each entry is a list with
#' element \code{accuracy} (share of correct predictions) and, if
#' \code{return_data_frames = TRUE}, the data frame of n-grams, true targets,
#' target positions and predictions as first element.
#' @export
predict_with_n_gram <- function(path_input, distribution_matrix, default_pred = "random", vocabulary = c("A", "C", "G", "T"),
                                file_sample = NULL, format = "fasta", return_data_frames = FALSE, step = 1) {
  
  if (!(format %in% c("fasta", "fastq"))) {
    stop("`format` must be \"fasta\" or \"fastq\".", call. = FALSE)
  }
  
  # n-gram size is taken from the row names of the distribution matrix
  labels <- rownames(distribution_matrix)
  n <- nchar(labels[1])
  
  dist_values <- as.matrix(distribution_matrix)
  # column index of the most frequent next character for every n-gram
  pred_int <- apply(dist_values, 1, which.max)
  
  # n-grams that did not appear before have an all-zero ROW; which.max would
  # always pick the first column for them, so replace with the default prediction
  zero_rows <- which(rowSums(dist_values) == 0)
  if (default_pred == "random") {
    pred_int[zero_rows] <- sample(seq_along(vocabulary), length(zero_rows), replace = TRUE)
  } else if (length(zero_rows) > 0) {
    default_int <- match(default_pred, vocabulary)
    if (is.na(default_int)) {
      stop("`default_pred` must be \"random\" or a character from `vocabulary`.",
           call. = FALSE)
    }
    pred_int[zero_rows] <- default_int
  }
  
  # integer to nucleotide
  pred <- vocabulary[pred_int]
  model <- data.frame(gram = labels, pred = pred)
  
  # a path ending in the format extension is treated as a single file,
  # anything else as a folder to be searched for such files
  if (endsWith(path_input, paste0(".", format))) {
    fasta_files <- path_input
  } else {
    fasta_files <- list.files(
      path = path_input,
      pattern = paste0("\\.", format, "$"),
      full.names = TRUE)
  }
  
  # take random subset of files
  if (!is.null(file_sample)) {
    fasta_files <- sample(fasta_files)[seq_len(min(file_sample, length(fasta_files)))]
  }
  num_files <- length(fasta_files)
  if (num_files == 0) {
    stop("No ", format, " files found in `path_input`.", call. = FALSE)
  }
  
  pred_df_list <- vector("list", num_files)
  
  for (i in seq_len(num_files)) {
    
    if (format == "fasta") {
      fasta_file <- microseq::readFasta(fasta_files[i])
    } else {
      fasta_file <- microseq::readFastq(fasta_files[i])
    }
    
    seq_vector <- fasta_file$Sequence
    start_ind <- get_start_ind(seq_vector = seq_vector,
                               length_vector = nchar(seq_vector),
                               maxlen = n, step = step, train_mode = "lm")
    
    # all entries of the file are concatenated; gram[k] covers positions
    # k, ..., k + n - 1 and targets[k] is the character at position k + n
    nuc_seq <- paste(seq_vector, collapse = "")
    split_seq <- strsplit(nuc_seq, "")[[1]]
    nuc_seq_length <- nchar(nuc_seq)
    num_grams <- nuc_seq_length - n
    gram <- split_seq[1:num_grams]
    for (offset in seq_len(n - 1)) {
      gram <- paste0(gram, split_seq[(1 + offset):(num_grams + offset)])
    }
    targets <- split_seq[(n + 1):nuc_seq_length]
    
    # remove sequences with overlapping fasta entries
    gram <- gram[start_ind]
    targets <- targets[start_ind]
    # n-grams or targets outside the known levels become NA here
    gram_df <- data.frame(gram = factor(gram, levels = labels),
                          targets = factor(targets, levels = vocabulary),
                          target_pos = start_ind + n)
    
    # remove sequences with ambiguous nucleotides
    gram_df <- gram_df[stats::complete.cases(gram_df), ]
    
    pred_df <- dplyr::left_join(gram_df, model, by = "gram")
    names(pred_df)[2] <- "true"
    accuracy <- sum(pred_df$true == pred_df$pred) / nrow(pred_df)
    if (return_data_frames) {
      pred_df_list[[i]] <- list(pred_df, accuracy = accuracy)
    } else {
      pred_df_list[[i]] <- list(accuracy = accuracy)
    }
  }
  
  pred_df_list
}