BarcoR: Quality Check Graphics for DNA Barcodes

Documented in fasta_organizer

#' @title Quality check graphics for DNA Barcodes
#'
#' @name fasta_organizer
#'
#' @description order species/accessions in a fasta file based in a phylogenetic tree.
#'
#' @param sequence_fasta_file_user A DNA barcodes file in fasta format.
#' @param sequence_nexus_file_user A DNA barcodes file in Nexus format.
#' @param seqinr_seqtype_user Genetic information class. Either DNA ("DNA") or Aminoacid ("AA") option.
#' @param organizer_user Order criterion. order based in a phylogenetic tree ("tree"). Either ascending order ("a_to_z") or descending order ("z_to_a").
#' @param tree_file_user A phylogentic tree file to be used as order criterion.
#' @param ape_root_tree_user Vector. a species names to root the phylogenetic tree.
#' @param ape_ladderize_user Logical. reorganizes the internal structure of the tree based in ladderize function in ape packages.
#' @param remove_taxa_if_match_pattern_user Vector. Species list to be eliminate on phylogenetic tree and DNA barcode file (fasta or nexus)
#' @param file_out_user Character. an optional name to be added to the output file.
#' @param out_directory_user Output directory.
#'
#' @return This function return an ordered fasta file and a phylogenetic tree based in functions parameters.
#'
#'  @examples
#'  COX1 <-fasta_organizer (sequence_fasta_file_user = NULL,
#'                 sequence_nexus_file_user = system.file("extdata","COI_aligned_end6.nexus",package="BarcoR"),
#'                 seqinr_seqtype_user = "DNA",
#'                 organizer_user = "tree",
#'                 tree_file_user = system.file("extdata","COI_aligned_end6_newick.tree",package="BarcoR"),
#'                 ape_root_tree_user = NULL,
#'                 ape_ladderize_user = TRUE,
#'                 remove_taxa_if_match_pattern_user = NULL,
#'                 file_out_user = "COI_ordered",
#'                 out_directory_user = NULL)
#'
#'
#'
#' @export
#'
#' @importFrom seqinr "read.fasta"
#' @importFrom ape "read.nexus.data" "read.tree" "write.tree" "root" "ladderize"
#' @importFrom utils "read.table" "write.table" "tail"
#'
#'
fasta_organizer <- function(sequence_fasta_file_user = NULL,
                            sequence_nexus_file_user = NULL,
                            seqinr_seqtype_user = c("DNA", "AA"),
                            organizer_user = c("tree", "a_to_z", "z_to_a"),
                            tree_file_user = NULL,
                            ape_root_tree_user = NULL,
                            ape_ladderize_user = TRUE,
                            remove_taxa_if_match_pattern_user = NULL,
                            file_out_user = NULL,
                            out_directory_user = NULL) {





  # user input

  sequence_fasta_file <- sequence_fasta_file_user
  sequence_nexus_file <- sequence_nexus_file_user
  seqinr_seqtype <- seqinr_seqtype_user
  organizer <- organizer_user
  tree_file <- tree_file_user
  ape_root_tree <- ape_root_tree_user
  ape_ladderize <- ape_ladderize_user
  remove_if_match <- remove_taxa_if_match_pattern_user

  file_out <- file_out_user

  ## outdir arguments

  if (is.null(out_directory_user)) {
    out_directory <- getwd()
  } else {
    setwd(out_directory_user)
    out_directory <- getwd()
  }

  ########  get sequence name

  if (is.null(sequence_fasta_file)) { sequence_file <- sequence_nexus_file
  } else { sequence_file <- sequence_fasta_file } # for fasta file inputs

  sequence_file_for_naming_long <- sub(".*/", "", sequence_file)
  sequence_file_for_naming <- gsub("(.*)[.].*","\\1",sequence_file_for_naming_long)

  # names for out files

  if (is.null(file_out)) {file_out <- sequence_file_for_naming}


  ############ coerce to fasta from nexus processing section

  # sequence_fasta_file <- "~/Desktop/positive_selection_transcriptome/hyphy_on_TLRs/TLR_matrices/TLR1_test_nucleotide.fasta"

  if (!is.null(sequence_fasta_file)) {
    sequence_list <- seqinr::read.fasta(file = sequence_fasta_file,
                                        seqtype = seqinr_seqtype,
                                        set.attributes = FALSE,
                                        apply.mask = FALSE)
  }

  # read fasta file

  if (!is.null(sequence_nexus_file)) { sequence_list <- ape::read.nexus.data(sequence_nexus_file) }

  # get taxa id and >

  taxnames_original <- names(sequence_list)
  taxnames <- paste0(">", taxnames_original)

  # collapse sequence in list

  sequence_raw <- lapply(sequence_list, paste, collapse = "")

  # make sure all are uppercase for standarization
  sequence_raw <- toupper(sequence_raw)

  # create a data.frame to organize

  all_sequences_df <- data.frame(species = taxnames_original,
                                 fasta_names = taxnames,
                                 sequences = sequence_raw,
                                 stringsAsFactors =  FALSE)

  # organize

  if (organizer == "a_to_z") {
    all_sequences_df <- all_sequences_df[order(all_sequences_df$species),]
  }

  if (organizer == "z_to_a") {
    all_sequences_df <- all_sequences_df[order(all_sequences_df$species, decreasing = TRUE),]
  }


  ############ tree processing section


  if (organizer == "tree") {

    if (is.null(tree_file)) { stop("the 'organizer_user' requires a newick tree file") }

    # read master tree
    master_tree <- ape::read.tree(file = tree_file)

    # create dummy character data.frame to allow tree treaming

    taxnames_for_tree <- all_sequences_df$species
    taxa_names_select  <- data.frame(species = taxnames_for_tree, dummy_trait = 0, stringsAsFactors = FALSE)
    row.names(taxa_names_select) <- taxa_names_select[, 1]

    # name check to prune

    name.check2 <- function(phy, data, data.names = NULL) {
      if (is.null(data.names)) {
        if (is.vector(data)) {
          data.names <- names(data)
        }
        else {
          data.names <- rownames(data)
        }
      }
      t <- phy$tip.label
      r1 <- t[is.na(match(t, data.names))]
      r2 <- data.names[is.na(match(data.names, t))]
      r <- list(sort(r1), sort(r2))
      names(r) <- cbind("tree_not_data", "data_not_tree")
      if (length(r1) == 0 && length(r2) == 0) {
        return("OK")
      }
      else {
        return(r)
      }
    }

    name_check_reduce <- name.check2(master_tree, taxa_names_select)

    # name of newick tree

    name_on_start <- file_out


    if (name_check_reduce[1] == "OK") {
      reduced_tree <- master_tree
    } else {
      cat("\n    ****** these taxa are not present in the tree or data file    *****\n")
      print(name_check_reduce)
      reduced_tree <- ape::drop.tip(master_tree, name_check_reduce$tree_not_data)
    }


    # manipulate tree

    if (!is.null(ape_root_tree)) {

      # get list outgroups

      tip_labels <- reduced_tree$tip.label

      outgroup_names <- character()

      for (i in 1:length(ape_root_tree)) {
        # i <- 1
        one_outgroup_names <- tip_labels[grep(ape_root_tree[i], tip_labels)]
        outgroup_names <- c(outgroup_names, one_outgroup_names)
      }
      rm(i)

      # organize by preference

      cat("\n ***** possible taxa to root the tree: \n")
      print(outgroup_names)
      cat("\n")

      # try to root

      for (i in 1:length(outgroup_names)) {
        # i <- 1
        tree_try <- try(ape::root(reduced_tree, outgroup=outgroup_names[i]), silent = TRUE)
        if (!class(tree_try) == "try-error") {
          cat("\n ****** the tree was rooted using this taxa: ", outgroup_names[i], "\n")
          reduced_tree <- tree_try
          break
        }
      }
      rm(tree_try)

    }

    # get order of tips

    reduced_tree <- ape::ladderize(reduced_tree, right = ape_ladderize_user)
    is_tip <- reduced_tree$edge[,2] <= length(reduced_tree$tip.label)
    ordered_tips <- reduced_tree$edge[is_tip, 2]

    # write.tree present -- make optional

    ape::write.tree(reduced_tree, file = paste0(name_on_start, "_reduced_newick.tree"), append = FALSE, digits = 10, tree.names = FALSE)

    # remove non_present taxa in tree

    if (!class(name_check_reduce) == "list") { name_check_reduce <- list(data_not_tree = NA)}

    sequences_with_tree <- all_sequences_df[!all_sequences_df[, 1] %in% name_check_reduce$data_not_tree, ]
    sequences_with_tree <- subset(sequences_with_tree, select = !names(sequences_with_tree) %in% name_check_reduce$data_not_tree)

    # order rows using tree structure

    all_sequences_df <- sequences_with_tree[match(reduced_tree$tip.label[ordered_tips], sequences_with_tree$species), ]

  }

  # remove sequences if requested

  if (!is.null(remove_if_match )) {
    species_that_match <- all_sequences_df$species[grep(remove_if_match, all_sequences_df$species)]
    cat("\n ***** these sequences will be eliminated **** \n")
    print(species_that_match)
    cat("\n")
    all_sequences_df <- all_sequences_df[grep(remove_if_match,all_sequences_df$species, invert = TRUE), ]
  }

  rownames(all_sequences_df) <- NULL

  ################### create fasta output

  ## nbpar -- nucleotide

  nbpar <- sapply(all_sequences_df$sequences, length)
  sequence_fasta <- vector(length = length(all_sequences_df$sequences) + length(all_sequences_df$fasta_names))
  id <- 1
  for (i in seq(along = nbpar[-1])) id <- c(id, tail(id, 1) + nbpar[i] + 1)

  sequence_fasta[id] <- all_sequences_df$fasta_names
  sequence_fasta[-id] <- all_sequences_df$sequences

  # write file fasta file

  file_out_fasta_name <- paste0(file_out, "_nucleotide.fasta")
  sequence_fasta_out <- cat(sequence_fasta, file = file_out_fasta_name, sep = "\n")

  cat("\n ******* nucleotide sequences in fasta format have been written to this file - ", file_out_fasta_name, "\n")

  # out dataframe

  file_out_df_fasta_name <- paste0(file_out, "_nucleotide_df.txt")

  utils::write.table(all_sequences_df, file = file_out_df_fasta_name, sep = "\t", row.names = FALSE)

  ############ processing section: DONE

  setwd(out_directory)

  return(all_sequences_df)

}
##################################    END OF FUNCTION    ###########################
####################################################################################