Nothing
#' Perform Sequence Alignment Using MUMmer
#'
#' This function orchestrates the alignment of sequences in a specified
#' directory using MUMmer, a tool for aligning large DNA or protein sequences.
#' It can handle GenBank and FASTA file formats. and performs checks to ensure
#' necessary files are present.
#'
#' @param path The directory containing the sequence files.
#' @param cluster Optional vector of cluster names to consider for alignment. If
#' NULL, clusters are inferred from file names. The order of names determines
#' the alignment sequence.
#' @param maptype The type of mapping to perform; "many-to-many" or
#' "one-to-one". "many-to-many" allows for multiple matches between clusters,
#' "one-to-one" restricts alignments to unique matches between a pair.
#' @param seqtype The type of sequences, either "protein" or "nucleotide".
#' @param mummer_options Additional command line options for MUMmer. To see all
#' available options, you can run `nucmer --help` or `promer --help` in the
#' terminal depending on whether you are aligning nucleotide or protein
#' sequences.
#' @param filter_options Additional options for filtering MUMmer results. To
#' view all filtering options, run `delta-filter --help` in the terminal.
#' @param remove_files Logical indicating whether to remove intermediate files
#' generated during the process, defaults to TRUE.
#' @param output_dir Optional directory for output files; defaults to
#' \code{tempdir()}
#'
#' @return A data frame combining all alignment results, or NULL if errors occur
#' during processing.
#'
#' @examples
#' \dontrun{
#' # Basic alignment with default options
#' mummer_alignment(
#' path = "/path/to/sequences",
#' maptype = "many-to-many",
#' seqtype = "protein"
#' )
#'
#' # Alignment with specific MUMmer options
#' mummer_alignment(
#' path = "/path/to/sequences",
#' maptype = "one-to-one",
#' seqtype = "protein",
#' mummer_options = "--maxgap=500 --mincluster=100",
#' filter_options = "-i 90"
#' )
#' }
#' @references Kurtz S, Phillippy A, Delcher AL, Smoot M, Shumway M, Antonescu
#' C, Salzberg SL (2004). Versatile and open software for comparing large
#' genomes. Genome Biology, 5(R12).
#'
#'
#' @export
mummer_alignment <- function(
path,
cluster = NULL,
maptype = "many-to-many",
seqtype = "protein",
mummer_options = "",
filter_options = "",
remove_files = TRUE,
output_dir = tempdir()
){
if (!dir.exists(path)) {
stop("The specified directory does not exist. Please check the file path.")
}
# Ensure the output directory exists
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}
if (grepl(" ", path)) {
stop("MUMmer requires a directory path without spaces.")
}
if (grepl(" ", output_dir)) {
stop("MUMmer requires an output directory path without spaces.")
}
all_files <- list.files(path, full.names = TRUE, pattern = "\\.gbk$|\\.gb$|\\.fasta$")
# Move all files to the output dir if specified
if(output_dir != path){
sapply(all_files, function(x) file.copy(x, output_dir, overwrite = TRUE))
path <- output_dir
all_files <- list.files(path, full.names = TRUE, pattern = "\\.gbk$|\\.gb$|\\.fasta$")
}
if (length(all_files) == 0) {
stop("No files found in the specified directory. Check the path or file types.")
}
if(is.null(cluster)){
cluster <- unique(gsub("\\.gbk$|\\.gb$|\\.fasta$", "", basename(all_files)))
}
if (length(cluster) < 2) {
stop("At least two clusters must be provided.")
}
# Check if FASTA files and matching cluster names
cluster_pattern <- paste0("^(", paste(cluster, collapse = "|"), ")\\.fasta$")
fasta_files <- all_files[grep(cluster_pattern, basename(all_files))]
found_fasta_names <- gsub("\\.fasta$", "", basename(fasta_files))
missing_files <- cluster[!cluster %in% found_fasta_names]
gbk_files <- NULL
# Check for GenBank and convert to FASTA if missing
if(length(missing_files) > 0){
cluster_pattern <- paste0("^(", paste(missing_files, collapse = "|"), ")(\\.gbk|\\.gb)$")
gbk_files <- all_files[grep(cluster_pattern, basename(all_files))]
if (length(gbk_files) > 0) {
sapply(gbk_files, function(file) {
genbank_to_fasta(file)
})
fasta_files <- all_files[grep(cluster_pattern, basename(all_files))]
}
pattern <- paste0("^(", paste(cluster, collapse = "|"), ")\\.fasta$")
fasta_files <- list.files(path, pattern = pattern, full.names = TRUE)
found_clusters <- gsub("\\.fasta$", "", basename(fasta_files))
missing_files <- setdiff(cluster, found_clusters)
if(length(missing_files) > 0){
stop(paste("Sequence file(s) for:", paste(missing_files, collapse=", "), "missing."))
}
}
cluster_pairs <-
mapply(c, cluster[-length(cluster)], cluster[-1], SIMPLIFY = FALSE)
links <- lapply(cluster_pairs, function(pair) {
pattern_reference <- paste0(pair[1], "\\.fasta$")
pattern_query <- paste0(pair[2], "\\.fasta$")
reference_seq <- fasta_files[grep(pattern_reference, fasta_files)]
query_seq <- fasta_files[grep(pattern_query, fasta_files)]
tryCatch({
# Call the mummer_alignment function
mummer(
reference = reference_seq,
query = query_seq,
maptype = maptype,
seqtype = seqtype,
mummer_options = mummer_options,
filter_options = filter_options
)
coords_file <- sprintf("%s/%s_%s_%s_%s.coords",
path,
pair[1],
pair[2],
seqtype,
gsub("-","_", maptype)
)
if(seqtype == "nucleotide"){
coords <- parse_nucmer(coords_file, pair[1], pair[2])
} else if(seqtype =="protein"){
coords <- parse_procmer(coords_file, pair[1], pair[2])
}
if (remove_files) {
file.remove(list.files(path=path, pattern=sprintf("%s_%s", pair[1], pair[2]), full.names=TRUE))
}
return(coords)
}, error = function(e) {
warning(sprintf("Error in aligning %s vs %s: %s", pair[1], pair[2], e$message))
return(NULL)
})
})
links <- Filter(Negate(is.null), links)
links <- do.call(rbind, links)
if (remove_files) {
if (output_dir == tempdir()) {
files_to_remove <- all_files
files_to_remove <- files_to_remove[file.exists(files_to_remove)]
file.remove(files_to_remove)
} else if (!is.null(gbk_files) && length(gbk_files) > 0) {
fasta_files_to_remove <- sub("\\.gbk$|\\.gb$", ".fasta", gbk_files)
fasta_files_to_remove <- fasta_files_to_remove[file.exists(fasta_files_to_remove)]
file.remove(fasta_files_to_remove)
}
}
return(links)
}
#' @noRd
mummer <- function(reference, query, maptype = "many-to-many", seqtype = "protein", mummer_options = "", filter_options = "", output_dir = NULL) {
# Validate the sequence type
if (!seqtype %in% c("protein", "nucleotide")) {
stop("Invalid sequence type. Please choose either 'protein' or 'nucleotide'.")
}
# Determine the appropriate command based on the sequence type
command_type <- ifelse(seqtype == "protein", "promer", "nucmer")
# Check if MUMmer (either nucmer or promer) is installed
if (system(paste("command -v", command_type), ignore.stdout = TRUE, ignore.stderr = TRUE) != 0) {
stop(paste(command_type, "is not installed or not in the PATH."))
}
# Validate mapping type
if (!maptype %in% c("many-to-many", "one-to-one")) {
stop("Invalid mapping type. Please choose either 'many-to-many' or 'one-to-one'.")
}
maptype_flag <- ifelse(maptype == "one-to-one", "-1", "-m")
# Check if files exist
if (!file.exists(query)) {
stop("Query file does not exist: ", query)
}
if (!file.exists(reference)) {
stop("Reference file does not exist: ", reference)
}
# Get the directory of the query file
if(!is.null(output_dir)){
query_dir <- output_dir
} else {
query_dir <- dirname(query)
}
# Set prefix for output files to be in the same directory as the query file
prefix <- file.path(query_dir, paste0(
tools::file_path_sans_ext(basename(reference)),
"_",
tools::file_path_sans_ext(basename(query))
))
# Construct and run the MUMmer command
mummer_cmd <- sprintf(
"%s --prefix=%s_%s %s %s %s",
command_type,
prefix,
seqtype,
mummer_options,
reference,
query)
if (system(mummer_cmd) != 0) {
stop(sprintf("Failed to run %s.", command_type))
}
# Run delta-filter with the specified alignment type
delta_filter_cmd <- sprintf(
"delta-filter %s %s %s_%s.delta > %s_%s_%s.delta",
maptype_flag,
filter_options,
prefix,
seqtype,
prefix,
seqtype,
gsub("-", "_", maptype)
)
if (system(delta_filter_cmd) != 0) {
stop("Failed to run delta-filter.")
}
# Extract and format alignment coordinates
# Add -k flag if sequence type is 'protein'
show_coords_options <- if (seqtype == "protein") "-H -T -r -k" else "-H -T -r"
show_coords_cmd <- sprintf(
"show-coords %s %s_%s_%s.delta > %s_%s_%s.coords",
show_coords_options,
prefix,
seqtype,
gsub("-", "_", maptype),
prefix,
seqtype,
gsub("-", "_", maptype)
)
if (system(show_coords_cmd) != 0) {
stop("Failed to run show-coords.")
}
}
#' @noRd
parse_nucmer <- function(path, reference, query){
col_names <- c("start1", "end1", "start2", "end2", "length1", "length2", "identity", "tag1", "tag2")
if(file.exists(path)){
lines <- readLines(path)
# Check if lines is empty
if(length(lines) == 0){
# Return an empty data frame with the specified column names
data <- data.frame(matrix(ncol = length(col_names), nrow = 0))
colnames(data) <- col_names
} else {
# Proceed with reading the table if lines is not empty
data <- read.table(
text = lines,
header = FALSE,
sep = "\t",
fill = TRUE,
col.names = col_names
)
data$cluster1 <- reference
data$cluster2 <- query
}
} else {
stop("The specified path does not exist.")
}
return(data)
}
#' @noRd
parse_procmer <- function(path, reference, query){
col_names <- c("start1", "end1", "start2", "end2", "length1", "length2", "identity",
"similarity", "stop", "frame1", "frame2", "tag1", "tag2")
if(file.exists(path)){
lines <- readLines(path)
# Check if lines is empty
if(length(lines) == 0){
# Return an empty data frame with the specified column names
data <- data.frame(matrix(ncol = length(col_names), nrow = 0))
colnames(data) <- col_names
} else {
# Proceed with reading the table if lines is not empty
data <- read.table(
text = lines,
header = FALSE,
sep = "\t",
fill = TRUE,
col.names = col_names
)
data$cluster1 <- reference
data$cluster2 <- query
}
} else {
stop("The specified path does not exist.")
}
return(data)
}
#' Convert GenBank to FASTA Format
#'
#' This function reads a GenBank file, extracts the sequence and
#' writes it to a new file in FASTA format. It parses the DEFINITION and VERSION
#' for the header and sequences from the ORIGIN section.
#'
#' @param path Path to the GenBank file.
#' @param output_dir Optional path for the output FASTA file. If NULL, the
#' output is saved in the same directory as the input file with the same base
#' name but with a .fasta extension.
#'
#' @return None explicitly, but writes the FASTA formatted data to a file.
#'
#' @examples
#' \donttest{
#' # Path to the example GenBank file in the package
#' gbk_file <- system.file("extdata", "BGC0000001.gbk", package = "geneviewer")
#'
#' # Convert the GenBank file to FASTA format
#' genbank_to_fasta(gbk_file)
#' }
#'
#'
#' @importFrom tools file_path_sans_ext
#'
#' @export
genbank_to_fasta <- function(path, output_dir=NULL){
# Check if the file exists
if (!file.exists(path)) {
stop("File does not exist")
}
# Read all lines from the file
lines <- readLines(path)
# Extract the definition and version for the FASTA header
definition <- lines[grep("^DEFINITION", lines)]
definition <- sub("^DEFINITION\\s+", "", definition)
definition <- sub("[^a-zA-Z0-9]+$", "", definition)
version <- lines[grep("^VERSION", lines)]
version <- sub("^VERSION\\s+", "", version)
fasta_header <- sprintf(">%s %s", version, definition)
# Identify the start of the sequence block
origin_index <- grep("^ORIGIN", lines)
if (length(origin_index) == 0) {
stop("No sequence detected in GenBank file.")
}
end_index <- grep("^//", lines)
# Extract and clean all lines after "ORIGIN"
if (length(origin_index) > 0 && length(end_index) > 0 && origin_index < end_index) {
sequence_lines <- lines[(origin_index + 1):(end_index - 1)]
sequence <- paste(sequence_lines, collapse = "")
sequence <- gsub("\\s+|[0-9]+", "", sequence)
sequence <- toupper(sequence)
} else {
stop("The sequence block is not properly formatted or missing.")
}
# Combine the header and sequence into one FASTA format string
fasta_content <- sprintf("%s\n%s", fasta_header, sequence)
gbk_names <- tools::file_path_sans_ext(basename(path))
if (is.null(output_dir)) {
file_dir <- dirname(path)
fasta_file_path <- file.path(file_dir, sprintf("%s.fasta", gbk_names))
} else {
fasta_file_path <- file.path(output_dir, sprintf("%s.fasta", gbk_names))
}
# Write the FASTA content to the specified file path
writeLines(fasta_content, fasta_file_path)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.