Nothing
#' @name readFasta
#' @title Read and write FASTA files
#' @aliases readFasta writeFasta
#'
#' @description Reads and writes biological sequences (DNA, RNA, protein) in the FASTA format.
#'
#' @usage readFasta(in.file)
#' writeFasta(fdta, out.file, width = 0)
#'
#' @param in.file url/directory/name of (gzipped) FASTA file to read.
#' @param fdta A \code{\link{tibble}} with sequence data, see \sQuote{Details} below.
#' @param out.file Name of (gzipped) FASTA file to create.
#' @param width Number of characters per line, or 0 for no linebreaks.
#'
#' @details These functions handle input/output of sequences in the commonly used FASTA format.
#' For every sequence it is presumed there is one Header-line starting with a \sQuote{>}. If
#' filenames (\code{in.file} or \code{out.file}) have the extension \code{.gz} they will automatically be
#' compressed/uncompressed.
#'
#' The sequences are stored in a \code{\link{tibble}}, opening up all the possibilities in R for
#' fast and easy manipulations. The content of the file is stored as two columns, \samp{Header}
#' and \samp{Sequence}. If other columns are added, these will be ignored by \code{\link{writeFasta}}.
#'
#' The default \code{width = 0} in \code{\link{writeFasta}} results in no linebreaks in the sequences
#' (one sequence per line).
#'
#' @return \code{\link{readFasta}} returns a \code{\link{tibble}} with the contents of the (gzipped) FASTA
#' file stored in two columns of text. The first, named \samp{Header}, contains
#' the headerlines and the second, named \samp{Sequence}, contains the sequences.
#'
#' \code{\link{writeFasta}} produces a (gzipped) FASTA file.
#'
#' @author Lars Snipen and Kristian Hovde Liland.
#'
#' @seealso \code{\link{readFastq}}.
#'
#' @examples
#' \dontrun{
#' # We need a FASTA-file to read, here is one example file:
#' fa.file <- file.path(file.path(path.package("microseq"),"extdata"),"small.ffn")
#'
#' # Read and write
#' fdta <- readFasta(fa.file)
#' ok <- writeFasta(fdta[4:5,], out.file = "delete_me.fasta")
#'
#' # Make use of dplyr to copy parts of the file to another file
#' readFasta(fa.file) %>%
#' filter(str_detect(Sequence, "TGA$")) %>%
#' writeFasta(out.file = "TGAstop.fasta", width = 80) -> ok
#' }
#'
#' @keywords sequence FASTA
#'
#' @importFrom tibble as_tibble
#' @importFrom stringr str_c
#' @importFrom dplyr filter %>%
#' @importFrom data.table fread fwrite
#'
#' @export readFasta
#' @export writeFasta
#'
readFasta <- function(in.file){
if(!is.character(in.file) | length(in.file) > 1) stop("The argument in.file must be a single text (a filename)")
if(file.exists(in.file)){
in.file <- normalizePath(in.file)
fread(in.file, header = F, sep = "\t", data.table = F, na.strings = "") %>%
filter(!is.na(.data$V1)) -> tbl
hh <- grep("^>", tbl[,1])
hhh <- c(hh, nrow(tbl) + 1)
tibble(Header = str_remove(tbl[hh,1], "^>"),
Sequence = sapply(1:length(hh), function(i){str_c(tbl[(hhh[i]+1):(hhh[i+1]-1),1], collapse = "")})) -> fdta
return(fdta)
} else {
stop("Cannot find ", in.file, ", please correct path and/or file name")
}
}
writeFasta <- function(fdta, out.file, width = 0){
if(!("Header" %in% colnames(fdta))){
stop("This is not a Fasta object, column Header is lacking\n")
}
if(!("Sequence" %in% colnames(fdta))){
stop("This is not a Fasta object, column Sequence is lacking\n")
}
out.file <- file.path(normalizePath(dirname(out.file)),
basename(out.file))
lst <- vector("list", 2 * nrow(fdta))
lst[seq(1, 2 * nrow(fdta), 2)] <- as.list(str_c(">", fdta$Header))
if(width > 0){
lst[seq(2, 2 * nrow(fdta), 2)] <- lapply(fdta$Sequence, chop, width)
} else {
lst[seq(2, 2 * nrow(fdta), 2)] <- as.list(fdta$Sequence)
}
fwrite(tibble(unlist(lst)), file = out.file, compress = "auto", quote = F, col.names = F)
}
chop <- function(sequence, width){
n <- str_length(sequence)
s1 <- seq(1, n, width)
s2 <- pmin(s1 + width - 1, n)
return(str_sub(sequence, s1, s2))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.