R/preprocess.R

#' Returns the vocabulary from character string
#'
#' Use this function with a character string.
#'
#' @param char character string of length one containing the text
#' @param verbose TRUE/FALSE, whether to print the vocabulary
#' @examples 
#' data(crispr_sample)
#' getVocabulary(crispr_sample)
#' getVocabulary("abcd")
#' @export
getVocabulary <- function(char, verbose = FALSE) {
  
  stopifnot(!is.null(char))
  stopifnot(nchar(char) > 0)
  
  vocabulary <- sort(unique(tokenizers::tokenize_characters(
    stringr::str_c(stringr::str_to_lower(char),collapse = "\n"), strip_non_alphanum = FALSE, simplify = TRUE)))
  
  if (verbose)
    message("The vocabulary: ", paste(vocabulary, collapse = ", "))
  return(vocabulary)
}
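
# Illustrative usage sketch (not part of the package functions): the input is
# lower-cased before tokenization, so upper- and lower-case letters collapse
# into a single symbol.
example_vocab <- getVocabulary("ACGTacgt")
example_vocab  # c("a", "c", "g", "t")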

#' Preprocess string to semi-redundant one-hot vector
#'
#' @description
#' Outputs a semi-redundant set of samples from the input character string:
#' the string is collapsed, tokenized, and vectorized (one-hot encoded).
#' Use this function with a character string as input. For example,
#' if the input text is ABCDEFGHI and maxlen is 5, the generated chunks are:
#' X(1): ABCDE and Y(1): F;
#' X(2): BCDEF and Y(2): G;
#' X(3): CDEFG and Y(3): H;
#' X(4): DEFGH and Y(4): I
#' 
#' @param char character string of length one containing the input text
#' @param maxlen length of the semi-redundant sequences
#' @param vocabulary character vector containing the vocabulary of the input char.
#' If no vocabulary is supplied, it is generated from the input char
#' @param verbose TRUE/FALSE, whether to print progress messages
#' @export
preprocessSemiRedundant <- function(char,
                                    maxlen = 250,
                                    vocabulary = c("l", "p", "a", "c", "g", "t"),
                                    verbose = FALSE) {
  
  stopifnot(!is.null(char))
  stopifnot(nchar(char) > 0)
  stopifnot(maxlen > 0)
  
  # Load, collapse, and tokenize text ("ACGT" -> "a" "c" "g" "t")
  text <- tokenizers::tokenize_characters(
    stringr::str_c(stringr::str_to_lower(char), collapse = "\n"),
    strip_non_alphanum = FALSE, simplify = TRUE)
  
  # Generating vocabulary from input char with the function getVocabulary()
  if (missing(vocabulary)) {
    if (verbose)
      message("Finding the vocabulary ...")
    vocabulary <- getVocabulary(char)
  }
  
  if (verbose)
    message("Vocabulary size: ", length(vocabulary))
  # Cut the text in semi-redundant sequences of maxlen characters
  
  if (verbose)
    message("Generation of semi-redundant sequences ...")
  
  dataset <- purrr::map(seq(1, length(text) - maxlen, by = 1),
                        ~ list(sentence = text[.x:(.x + maxlen - 1)],
                               next_char = text[.x + maxlen]))
  dataset <- purrr::transpose(dataset)
  x <- array(0, dim = c(length(dataset$sentence), maxlen, length(vocabulary)))
  y <- array(0, dim = c(length(dataset$sentence), length(vocabulary)))
  # Vectorization
  
  if (verbose)
    message("Vectorization ...")
  if (verbose)
    pb <- utils::txtProgressBar(min = 0,
                                max = length(dataset$sentence),
                                style = 3)
  for (i in seq_along(dataset$sentence)) {
    if (verbose)
      utils::setTxtProgressBar(pb, i)
    # generate one-hot encoding for one subset
    x[i, , ] <- sapply(vocabulary, function(voc_char) {
      as.integer(voc_char == dataset$sentence[[i]])
    })
    # target (next nucleotide in sequence)
    y[i, ] <- as.integer(vocabulary == dataset$next_char[[i]])
  }
  
  results <- list("X" = x, "Y" = y)
  return(results)
}
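
# Minimal usage sketch: a 9-character DNA string with maxlen = 5 yields four
# semi-redundant samples; passing the vocabulary explicitly keeps the third
# array dimension fixed at its length.
demo <- preprocessSemiRedundant("ACGTACGTA", maxlen = 5,
                                vocabulary = c("a", "c", "g", "t"))
dim(demo$X)  # 4 x 5 x 4: samples x maxlen x vocabulary size
dim(demo$Y)  # 4 x 4: one one-hot encoded target character per sample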

#' Wrapper around the preprocessSemiRedundant() function
#' 
#' @description
#' Reads the genomic contents of one FASTA file and preprocesses it with
#' preprocessSemiRedundant(). Multiple entries are joined with the separator
#' character "p", and the combined sequence is flanked with "l".
#' @param path path to the FASTA file
#' @param maxlen length of the semi-redundant sequences
#' @param vocabulary character vector containing the vocabulary; defaults to
#' c("l", "p", "a", "c", "g", "t")
#' @param verbose TRUE/FALSE, whether to print progress messages
#' @export
preprocessFasta <- function(path,
                            maxlen = 250,
                            vocabulary = c("l", "p", "a", "c", "g", "t"),
                            verbose = FALSE) {
  
  
  # process corpus
  fasta.file <- Biostrings::readDNAStringSet(path)
  seq <- paste0("l", paste(fasta.file, collapse = "p"), "l")
  
  if(verbose)
    message("Preprocessing the data ...")
  
  seq.processed <-
    preprocessSemiRedundant(char = seq, maxlen = maxlen,
                            vocabulary = vocabulary, verbose = verbose)
  return(seq.processed)
}
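
# Minimal sketch: write a small two-entry FASTA file to a temporary location
# and preprocess it; the sequences below are made up for illustration.
tmp_fasta <- tempfile(fileext = ".fasta")
Biostrings::writeXStringSet(
  Biostrings::DNAStringSet(c(entry1 = "ACGTACGTACGT", entry2 = "TTGACCA")),
  filepath = tmp_fasta)
demo_fasta <- preprocessFasta(tmp_fasta, maxlen = 5)
dim(demo_fasta$X)  # 17 x 5 x 6: entries joined by "p", flanked by "l"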

#' One-hot-encodes integer sequence  
#' 
#' \code{sequenceToArray} is a helper function for \code{\link{fastaFileGenerator}}; it returns the one-hot encoding of a sequence.
#' 
#' @param sequence Sequence of integers. 
#' @param maxlen Length of one sample
#' @param vocabulary Set of characters to encode.   
#' @param startInd Start positions of samples in \code{sequence}.  
#' @export
sequenceToArray <- function(sequence, maxlen, vocabulary, startInd){
  stopifnot(length(sequence) > maxlen)
  startInd <- startInd - startInd[1] + 1
  numberOfSamples <- length(startInd)
  
  # every row in z one-hot encodes one character in sequence, oov is zero-vector
  z  <- keras::to_categorical(sequence, num_classes = length(vocabulary) + 1)[ , -1]
  x <- array(0, dim = c(numberOfSamples, maxlen, length(vocabulary)))
  for (i in 1:numberOfSamples){
    start <- startInd[i]
    x[i, , ] <- z[start : (start + maxlen - 1), ]
  }
  y <- z[startInd + maxlen, ]
  list(x, y)
}
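
# Minimal sketch: integers 1-4 stand for the vocabulary characters ("a", "c",
# "g", "t"), 0 would be out-of-vocabulary; start positions 1 and 3 give two
# overlapping samples of length 3 plus their one-hot encoded targets.
demo_seq <- c(1, 2, 3, 4, 1, 2, 3)  # "a c g t a c g"
demo_xy <- sequenceToArray(demo_seq, maxlen = 3,
                           vocabulary = c("a", "c", "g", "t"), startInd = c(1, 3))
dim(demo_xy[[1]])  # 2 x 3 x 4: predictor samples
demo_xy[[2]]       # 2 x 4: targets, here "t" and "c"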

#' One-hot-encodes integer sequence  
#' 
#' \code{sequenceToArrayLabel} is a helper function for \code{\link{fastaLabelGenerator}}; it returns the one-hot encoding of a sequence,
#' extracting samples from the specified start positions.
#'
#' @param sequence Sequence of integers.
#' @param maxlen Length of predictor sequence.  
#' @param vocabulary Set of characters to encode.    
#' @param startInd Start positions of samples in \code{sequence}.  
#' @export
sequenceToArrayLabel <- function(sequence, maxlen, vocabulary, startInd){
  
  stopifnot(length(sequence) > (maxlen - 1))
  startInd <- startInd - startInd[1] + 1
  numberOfSamples <- length(startInd)
  # every row in z one-hot encodes one character in sequence, oov is zero-vector
  z <- keras::to_categorical(matrix(sequence, ncol = 1), num_classes = length(vocabulary) + 1)[ , -1]
  z <- matrix(z, ncol = length(vocabulary)) 
  x <- array(0, dim = c(numberOfSamples, maxlen, length(vocabulary)))
  for (i in 1:numberOfSamples){
    start <- startInd[i]
    x[i, , ] <- z[start : (start + maxlen - 1), ]
  }
  return(x)
}
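
# Minimal sketch: same integer encoding as above, but only the predictor array
# is returned; the corresponding labels come from elsewhere (e.g. per file).
demo_x <- sequenceToArrayLabel(c(1, 2, 3, 4), maxlen = 2,
                               vocabulary = c("a", "c", "g", "t"), startInd = c(1, 3))
dim(demo_x)  # 2 x 2 x 4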

#' Computes start position of samples
#'
#' Helper function for \code{\link{fastaLabelGenerator}} and \code{\link{fastaFileGenerator}}. Computes the positions in the sequence from which samples can be extracted.
#' 
#' @param seq_vector Vector of character sequences.
#' @param length_vector Length of sequences in \code{seq_vector}.
#' @param maxlen Length of one predictor sequence.
#' @param step Distance between samples from one entry in \code{seq_vector}.   
#' @param train_mode Either "lm" for language model or "label" for label classification. A language model needs one additional
#' character (the target) per sample.
#' @export
getStartInd <- function(seq_vector, length_vector, maxlen, step, train_mode = "label"){
  stopifnot(train_mode == "lm" | train_mode == "label")
  if (length(length_vector) > 1){
    startNewEntry <- cumsum(c(1, length_vector[-length(length_vector)]))
    if (train_mode == "label"){
      indexVector <- purrr::map(1:(length(length_vector) - 1), ~seq(startNewEntry[.x], startNewEntry[.x + 1] - maxlen, by = step))
    } else {
      indexVector <- purrr::map(1:(length(length_vector) - 1), ~seq(startNewEntry[.x], startNewEntry[.x + 1] - maxlen - 1, by = step))
      
    }
    indexVector <- unlist(indexVector)
    last_seq <- length(seq_vector)
    if (!(startNewEntry[last_seq] > (sum(length_vector) - maxlen + 1))) {
      if (train_mode == "label"){
        indexVector <- c(indexVector, seq(startNewEntry[last_seq], sum(length_vector) - maxlen + 1, by = step))
      } else {
        indexVector <- c(indexVector, seq(startNewEntry[last_seq], sum(length_vector) - maxlen, by = step))
      }
    }
    return(indexVector)
  } else {
    if (train_mode == "label"){
      indexVector <- seq(1, length_vector - maxlen + 1, by = step)
    } else {
      indexVector <- seq(1, length_vector - maxlen, by = step)
    }
    return(indexVector)
  }
}
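
# Minimal sketch: two entries of length 10 and 8 with maxlen = 5 and step = 3.
# In "label" mode a sample may start wherever a full window of maxlen
# characters still fits inside its entry.
demo_entries <- c("ACGTACGTAC", "ACGTACGT")
getStartInd(seq_vector = demo_entries, length_vector = nchar(demo_entries),
            maxlen = 5, step = 3, train_mode = "label")
# 1 4 (within the first entry) 11 14 (within the second entry)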