#' Read sequences from a file
#'
#' These functions read a sequence alignment file and return a named list of sequences.
#' Currently supported formats are `fasta` and `nexus`. Interleaved formats, where sequences are
#' defined in a repeating blocks, are not supported.
#'
#' The function `[read_sequences]` tries to guess the correct format from the file extension.
#' Alternatively, the correct format can be specified. Currently supported formats are
#' \emph{fasta} and \emph{nexus}.
#'
#' Functions `[read_fasta]` and `[read_nexus]` are then used to read the sequence alignment file.
#'
#' @param file with alignment
#' @param format requested format
#' @return named list of sequences
#'
#' @export
#'
#' @examples
#' seq1 = c("A" = "AAA", "B" = "BBB", "C" = "CCC")
#' fasta = tempfile(fileext = ".fasta")
#' writeLines(paste0(">", names(seq1), "\n", seq1), fasta)
#'
#' seq2 = read_sequences(fasta)
#' seq3 = read_fasta(fasta)
#' identical(seq1, seq2)
#' identical(seq1, seq3)
read_sequences = function(file, format=NULL){
supported_formats = c("fasta", "nexus")
if(!is.null(format)){
if(!format %in% supported_formats) stop("Unsupported format: ", format)
if(format == "fasta") return(read_fasta(file))
if(format == "nexus") return(read_nexus(file))
} else {
ext = tolower(tools::file_ext(file))
if(ext %in% c("fasta", "fst", "fas")) return(read_fasta(file))
if(ext %in% c("nexus", "nex")) return(read_nexus(file))
# else:
stop("Unrecognized extension: ", ext)
}
}
#' @rdname read_sequences
#' @export
read_fasta = function(file){
text = readLines(file)
starts = grep("^>", text)
stops = c(starts[-1]-1, length(text))
sequences = mapply(
function(start, stop, text){
seq = text[(start+1):stop]
seq = gsub("[[:blank:]]*", "", seq)
paste0(seq, collapse="")
},
starts, stops, MoreArgs=list(text)
)
names(sequences) = sub("^>", "", text[starts])
return(sequences)
}
#' @rdname read_sequences
#' @export
read_nexus = function(file){
text = readLines(file)
# Find start and end of the data block:
begin_data = grep("^begin data;", text, ignore.case=TRUE)
ends = grep("^end;", text, ignore.case=TRUE)
end_data = ends[which.min(abs(ends - begin_data))]
# Find the start and end of the data matrix
begin_matrix = grep("^matrix", ignore.case=TRUE, text[begin_data:end_data]) + begin_data - 1
end_matrix = grep("^;", text[begin_matrix:end_data])[1] + begin_matrix - 1
header = parse_nexus_header(text[(begin_data+1):(begin_matrix-1)])
sequences = parse_nexus_sequences(text[(begin_matrix+1):(end_matrix-1)])
return(sequences)
}
#' Internal helper functions for reading alignment
#'
#' These functions are used internally when alignemnt is being read and processed. They tend to
#' extract and proces sequences from particular part of text.
#'
#' @param text part of sequence file parsed by a helper function
#' @name read_sequences_helper
#'
#' @keywords internal
NULL
#' @rdname read_sequences_helper
parse_nexus_header = function(text){
text = sub(";$", "", text)
text = strsplit(text, split=" ", fixed=TRUE)
text = unlist(text)
text = grep("=", text, value=TRUE)
text = strsplit(text, split="=", fixed=TRUE)
header = lapply(text, getElement, 2)
names(header) = lapply(text, getElement, 1)
header
}
#' @rdname read_sequences_helper
parse_nexus_sequences = function(text){
text = strsplit(text, split="[[:blank:]]+")
sequences = sapply(text, getElement, 2)
names(sequences) = sapply(text, getElement, 1)
sequences
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.