R/fasta2DT.R

Defines functions fasta2DT

Documented in fasta2DT

#' FASTA file to data table
#'
#' Reads a FASTA file and converts it to a data table with one row per record.
#'
#' @param fastaFile Character: The path to the input FASTA file.
#'
#' @details A record starts at a line whose first character is \code{>}; the
#' rest of that line (without the leading \code{>}) becomes the locus name.
#' All following lines, up to the next header line, are concatenated without
#' a separator to form the sequence. Lines that appear before the first header
#' are ignored. A header with no sequence lines yields an empty string in
#' \code{$SEQ}. A file without any header lines yields a data table with zero
#' rows.
#'
#' @return A data table with the columns \code{$LOCUS} (the locus name) and
#' \code{$SEQ} (the sequence).
#'
#' @examples
#' # Create a link to raw external datasets in genomalicious
#' genomaliciousExtData <- paste0(find.package('genomalicious'), '/extdata')
#'
#' # Path to the demo FASTA file
#' fasta <- paste0(genomaliciousExtData, '/data_COI.fasta')
#'
#' # The FASTA file, printed as lines
#' readLines(fasta)
#'
#' # Import as data table
#' fasta2DT(fasta)
#'
#' @export
fasta2DT <- function(fastaFile){

  # BEGIN ............

  # --------------------------------------------+
  # Libraries and assertions
  # --------------------------------------------+
  # Fail loudly if data.table is unavailable; require() would only return
  # FALSE and the error would surface later in a less obvious place.
  if(!requireNamespace('data.table', quietly=TRUE)){
    stop("Package 'data.table' is required by fasta2DT().", call.=FALSE)
  }

  # --------------------------------------------+
  # Code
  # --------------------------------------------+
  # Read in the FASTA as lines
  fastaLines <- readLines(fastaFile)

  # Determine position of sequence names. Only a '>' in the first column
  # marks a header; a '>' elsewhere in a line does not start a new record.
  isHeader <- startsWith(fastaLines, '>')
  seqIDpos <- which(isHeader)

  # No records at all (e.g., an empty file): return an empty table
  if(length(seqIDpos)==0L){
    return(data.table::data.table(LOCUS=character(0), SEQ=character(0)))
  }

  # Assign every line to the record opened by the most recent header.
  # Lines before the first header have record index 0 and are dropped.
  recordID <- cumsum(isHeader)
  isSeqLine <- !isHeader & recordID > 0L

  # Get the bases for each sequence. Using a factor with one level per header
  # guarantees that records without sequence lines still get an entry
  # (an empty string) and that the order matches the headers.
  seqByRecord <- split(
    fastaLines[isSeqLine],
    factor(recordID[isSeqLine], levels=seq_along(seqIDpos))
  )
  seqBp <- vapply(seqByRecord, paste, character(1), collapse='', USE.NAMES=FALSE)

  # Adjust LOCUS names: strip only the leading '>' marker
  locusNames <- sub(pattern='^>', replacement='', x=fastaLines[seqIDpos])

  # Turn into a data table and return
  fastaDT <- data.table::data.table(LOCUS=locusNames, SEQ=seqBp)
  return(fastaDT)
}
j-a-thia/genomalicious documentation built on Oct. 19, 2024, 7:51 p.m.