#' FASTA file to data table
#'
#' Reads a FASTA file and converts to a data table. A record starts at a
#' header line (a line beginning with \code{>}); every line between that
#' header and the next one is concatenated into a single sequence string.
#' Lines that appear before the first header are ignored.
#'
#' @param fastaFile Character: The path to the input FASTA file.
#'
#' @return A data table with the columns \code{$LOCUS} (the locus name) and
#' \code{$SEQ} (the sequence). A header that is not followed by any sequence
#' lines has an empty string in \code{$SEQ}. A file with no header lines
#' returns a data table with zero rows.
#'
#' @examples
#' # Create a link to raw external datasets in genomalicious
#' genomaliciousExtData <- paste0(find.package('genomalicious'), '/extdata')
#'
#' # Path to the demo FASTA file
#' fasta <- paste0(genomaliciousExtData, '/data_COI.fasta')
#'
#' # The FASTA file, printed as lines
#' readLines(fasta)
#'
#' # Import as data table
#' fasta2DT(fasta)
#'
#' @export
fasta2DT <- function(fastaFile){
  # --------------------------------------------+
  # Libraries and assertions
  # --------------------------------------------+
  # Use data.table through its namespace: fail loudly if it is missing and
  # avoid attaching a package to the caller's search path as a side effect.
  if(!requireNamespace('data.table', quietly=TRUE)){
    stop('Package "data.table" is required by fasta2DT().', call.=FALSE)
  }

  # --------------------------------------------+
  # Code
  # --------------------------------------------+
  # Read in the FASTA as lines
  fastaLines <- readLines(fastaFile)

  # Header lines are those that START with '>'; a '>' elsewhere in a line
  # does not open a new record.
  seqIDpos <- which(startsWith(fastaLines, '>'))

  # First and last line of each record's sequence. A record runs up to the
  # line before the next header, or to the end of the file for the last one.
  seqStart <- seqIDpos + 1L
  seqEnd <- c(seqIDpos[-1L] - 1L, length(fastaLines))

  # Get the bases for each sequence
  seqBp <- vapply(seq_along(seqIDpos), function(i){
    # A header with no sequence lines has start > end. Guard against it,
    # because start:end would otherwise count downwards and pull the
    # header lines themselves into the sequence.
    if(seqStart[i] > seqEnd[i]){
      return('')
    }
    paste(fastaLines[seqStart[i]:seqEnd[i]], collapse='')
  }, character(1))

  # Strip only the leading '>' marker so the rest of the header is kept intact
  locusNames <- sub(pattern='^>', replacement='', x=fastaLines[seqIDpos])

  # Turn into a data table and return
  data.table::data.table(LOCUS=locusNames, SEQ=seqBp)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.