#' @title Quality check graphics for DNA Barcodes
#'
#' @name fasta_organizer
#'
#' @description order species/accessions in a fasta file based in a phylogenetic tree.
#'
#' @param sequence_fasta_file_user A DNA barcodes file in fasta format.
#' @param sequence_nexus_file_user A DNA barcodes file in Nexus format.
#' @param seqinr_seqtype_user Genetic information class. Either DNA ("DNA") or Aminoacid ("AA") option.
#' @param organizer_user Order criterion. order based in a phylogenetic tree ("tree"). Either ascending order ("a_to_z") or descending order ("z_to_a").
#' @param tree_file_user A phylogentic tree file to be used as order criterion.
#' @param ape_root_tree_user Vector. a species names to root the phylogenetic tree.
#' @param ape_ladderize_user Logical. reorganizes the internal structure of the tree based in ladderize function in ape packages.
#' @param remove_taxa_if_match_pattern_user Vector. Species list to be eliminate on phylogenetic tree and DNA barcode file (fasta or nexus)
#' @param file_out_user Character. an optional name to be added to the output file.
#' @param out_directory_user Output directory.
#'
#' @return This function return an ordered fasta file and a phylogenetic tree based in functions parameters.
#'
#' @examples
#' COX1 <-fasta_organizer (sequence_fasta_file_user = NULL,
#' sequence_nexus_file_user = system.file("extdata","COI_aligned_end6.nexus",package="BarcoR"),
#' seqinr_seqtype_user = "DNA",
#' organizer_user = "tree",
#' tree_file_user = system.file("extdata","COI_aligned_end6_newick.tree",package="BarcoR"),
#' ape_root_tree_user = NULL,
#' ape_ladderize_user = TRUE,
#' remove_taxa_if_match_pattern_user = NULL,
#' file_out_user = "COI_ordered",
#' out_directory_user = NULL)
#'
#'
#'
#' @export
#'
#' @importFrom seqinr "read.fasta"
#' @importFrom ape "read.nexus.data" "read.tree" "write.tree" "root" "ladderize"
#' @importFrom utils "read.table" "write.table" "tail"
#'
#'
fasta_organizer <- function(sequence_fasta_file_user = NULL,
sequence_nexus_file_user = NULL,
seqinr_seqtype_user = c("DNA", "AA"),
organizer_user = c("tree", "a_to_z", "z_to_a"),
tree_file_user = NULL,
ape_root_tree_user = NULL,
ape_ladderize_user = TRUE,
remove_taxa_if_match_pattern_user = NULL,
file_out_user = NULL,
out_directory_user = NULL) {
# user input
sequence_fasta_file <- sequence_fasta_file_user
sequence_nexus_file <- sequence_nexus_file_user
seqinr_seqtype <- seqinr_seqtype_user
organizer <- organizer_user
tree_file <- tree_file_user
ape_root_tree <- ape_root_tree_user
ape_ladderize <- ape_ladderize_user
remove_if_match <- remove_taxa_if_match_pattern_user
file_out <- file_out_user
## outdir arguments
if (is.null(out_directory_user)) {
out_directory <- getwd()
} else {
setwd(out_directory_user)
out_directory <- getwd()
}
######## get sequence name
if (is.null(sequence_fasta_file)) { sequence_file <- sequence_nexus_file
} else { sequence_file <- sequence_fasta_file } # for fasta file inputs
sequence_file_for_naming_long <- sub(".*/", "", sequence_file)
sequence_file_for_naming <- gsub("(.*)[.].*","\\1",sequence_file_for_naming_long)
# names for out files
if (is.null(file_out)) {file_out <- sequence_file_for_naming}
############ coerce to fasta from nexus processing section
# sequence_fasta_file <- "~/Desktop/positive_selection_transcriptome/hyphy_on_TLRs/TLR_matrices/TLR1_test_nucleotide.fasta"
if (!is.null(sequence_fasta_file)) {
sequence_list <- seqinr::read.fasta(file = sequence_fasta_file,
seqtype = seqinr_seqtype,
set.attributes = FALSE,
apply.mask = FALSE)
}
# read fasta file
if (!is.null(sequence_nexus_file)) { sequence_list <- ape::read.nexus.data(sequence_nexus_file) }
# get taxa id and >
taxnames_original <- names(sequence_list)
taxnames <- paste0(">", taxnames_original)
# collapse sequence in list
sequence_raw <- lapply(sequence_list, paste, collapse = "")
# make sure all are uppercase for standarization
sequence_raw <- toupper(sequence_raw)
# create a data.frame to organize
all_sequences_df <- data.frame(species = taxnames_original,
fasta_names = taxnames,
sequences = sequence_raw,
stringsAsFactors = FALSE)
# organize
if (organizer == "a_to_z") {
all_sequences_df <- all_sequences_df[order(all_sequences_df$species),]
}
if (organizer == "z_to_a") {
all_sequences_df <- all_sequences_df[order(all_sequences_df$species, decreasing = TRUE),]
}
############ tree processing section
if (organizer == "tree") {
if (is.null(tree_file)) { stop("the 'organizer_user' requires a newick tree file") }
# read master tree
master_tree <- ape::read.tree(file = tree_file)
# create dummy character data.frame to allow tree treaming
taxnames_for_tree <- all_sequences_df$species
taxa_names_select <- data.frame(species = taxnames_for_tree, dummy_trait = 0, stringsAsFactors = FALSE)
row.names(taxa_names_select) <- taxa_names_select[, 1]
# name check to prune
name.check2 <- function(phy, data, data.names = NULL) {
if (is.null(data.names)) {
if (is.vector(data)) {
data.names <- names(data)
}
else {
data.names <- rownames(data)
}
}
t <- phy$tip.label
r1 <- t[is.na(match(t, data.names))]
r2 <- data.names[is.na(match(data.names, t))]
r <- list(sort(r1), sort(r2))
names(r) <- cbind("tree_not_data", "data_not_tree")
if (length(r1) == 0 && length(r2) == 0) {
return("OK")
}
else {
return(r)
}
}
name_check_reduce <- name.check2(master_tree, taxa_names_select)
# name of newick tree
name_on_start <- file_out
if (name_check_reduce[1] == "OK") {
reduced_tree <- master_tree
} else {
cat("\n ****** these taxa are not present in the tree or data file *****\n")
print(name_check_reduce)
reduced_tree <- ape::drop.tip(master_tree, name_check_reduce$tree_not_data)
}
# manipulate tree
if (!is.null(ape_root_tree)) {
# get list outgroups
tip_labels <- reduced_tree$tip.label
outgroup_names <- character()
for (i in 1:length(ape_root_tree)) {
# i <- 1
one_outgroup_names <- tip_labels[grep(ape_root_tree[i], tip_labels)]
outgroup_names <- c(outgroup_names, one_outgroup_names)
}
rm(i)
# organize by preference
cat("\n ***** possible taxa to root the tree: \n")
print(outgroup_names)
cat("\n")
# try to root
for (i in 1:length(outgroup_names)) {
# i <- 1
tree_try <- try(ape::root(reduced_tree, outgroup=outgroup_names[i]), silent = TRUE)
if (!class(tree_try) == "try-error") {
cat("\n ****** the tree was rooted using this taxa: ", outgroup_names[i], "\n")
reduced_tree <- tree_try
break
}
}
rm(tree_try)
}
# get order of tips
reduced_tree <- ape::ladderize(reduced_tree, right = ape_ladderize_user)
is_tip <- reduced_tree$edge[,2] <= length(reduced_tree$tip.label)
ordered_tips <- reduced_tree$edge[is_tip, 2]
# write.tree present -- make optional
ape::write.tree(reduced_tree, file = paste0(name_on_start, "_reduced_newick.tree"), append = FALSE, digits = 10, tree.names = FALSE)
# remove non_present taxa in tree
if (!class(name_check_reduce) == "list") { name_check_reduce <- list(data_not_tree = NA)}
sequences_with_tree <- all_sequences_df[!all_sequences_df[, 1] %in% name_check_reduce$data_not_tree, ]
sequences_with_tree <- subset(sequences_with_tree, select = !names(sequences_with_tree) %in% name_check_reduce$data_not_tree)
# order rows using tree structure
all_sequences_df <- sequences_with_tree[match(reduced_tree$tip.label[ordered_tips], sequences_with_tree$species), ]
}
# remove sequences if requested
if (!is.null(remove_if_match )) {
species_that_match <- all_sequences_df$species[grep(remove_if_match, all_sequences_df$species)]
cat("\n ***** these sequences will be eliminated **** \n")
print(species_that_match)
cat("\n")
all_sequences_df <- all_sequences_df[grep(remove_if_match,all_sequences_df$species, invert = TRUE), ]
}
rownames(all_sequences_df) <- NULL
################### create fasta output
## nbpar -- nucleotide
nbpar <- sapply(all_sequences_df$sequences, length)
sequence_fasta <- vector(length = length(all_sequences_df$sequences) + length(all_sequences_df$fasta_names))
id <- 1
for (i in seq(along = nbpar[-1])) id <- c(id, tail(id, 1) + nbpar[i] + 1)
sequence_fasta[id] <- all_sequences_df$fasta_names
sequence_fasta[-id] <- all_sequences_df$sequences
# write file fasta file
file_out_fasta_name <- paste0(file_out, "_nucleotide.fasta")
sequence_fasta_out <- cat(sequence_fasta, file = file_out_fasta_name, sep = "\n")
cat("\n ******* nucleotide sequences in fasta format have been written to this file - ", file_out_fasta_name, "\n")
# out dataframe
file_out_df_fasta_name <- paste0(file_out, "_nucleotide_df.txt")
utils::write.table(all_sequences_df, file = file_out_df_fasta_name, sep = "\t", row.names = FALSE)
############ processing section: DONE
setwd(out_directory)
return(all_sequences_df)
}
################################## END OF FUNCTION ###########################
####################################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.