R/aligned_sequence_matrix_characterizer.R

Defines functions aligned_sequence_matrix_characterizer

Documented in aligned_sequence_matrix_characterizer

#' @title Quality check graphics for DNA Barcodes
#'
#'@name aligned_sequence_matrix_characterizer
#'
#' @description Provide a quality summary of DNA barcodes
#'
#' @param sequence_fasta_file_user An aligment o DNA barcodes in fasta file.
#' @param seqinr_translate_logical_user Logical. If it is TRUE the nucleotides will translate into Aminoacids
#' @param seqinr_genetic_code_user Character. A selection of Genetic code ("standard", "vertebrate.mitochondrial", "yeast.mitochondrial", "protozoan.mitochondrial+mycoplasma", "invertebrate.mitochondrial", "ciliate+dasycladaceal", "echinoderm+flatworm.mitochondrial", "euplotid", "bacterial+plantplastid", "alternativeyeast", "ascidian.mitochondrial", "alternativeflatworm.mitochondrial", "blepharism", "chlorophycean.mitochondrial", "trematode.mitochondrial", "scenedesmus.mitochondrial", "thraustochytrium.mitochondria", "Pterobranchia.mitochondrial", "CandidateDivision.SR1+Gracilibacteria", "Pachysolen.tannophilus")
#' @param consensus_selection_format_value Character.compare with consensus based in ("global","threshold", "selected_taxa")
#' @param consensus_selection_threshold_value numeric. A threshold value to conserv in analysis.
#' @param consensus_selection_taxa_names_value Character. A list of taxa names to filter the sequence data.
#' @param consensus_nucleotide_gap_char_value A character indicating the gap value in DNA sequence data , for example "-".
#' @param consensus_aminoacid_gap_char_value A character indicating the gap value in AA data , for example "X".
#' @param file_out_user Character. an optional name to be added to the output file.
#' @param out_directory_user Output directory
#'
#'
#'

#'
#'
#' @return 4 files: list of ambiguities,codons gaps and quality of DNA barcodes in txt format
#' @examples
#'COX1_UVI_quality <- aligned_sequence_matrix_characterizer (sequence_fasta_file_user = system.file("extdata","COI_ordered_nucleotide2.fasta",package="BarcoR"),
#'                                         seqinr_translate_logical_user = TRUE,
#'                                         seqinr_genetic_code_user = 2,
#'                                         consensus_selection_format_value = "selected_taxa",
#'                                         consensus_selection_threshold_value = 1500,
#'                                         consensus_selection_taxa_names_value = c("Mannophryne_collaris_COX1_MT627188", "Mannophryne_trinitatis_COX1_JX564878",
#'                                         "Anomaloglossus_baeobatrachus_COX1_MK069969", "Anomaloglossus_leopardus_COX1_MK069970", "Anomaloglossus_blanci_COX1_MG264895",
#'                                         "Anomaloglossus_blanci_COX1_NC_037857", "Anomaloglossus_degranvillei_COX1_MG264892", "Anomaloglossus_dewynteri_COX1_MG264891",
#'                                         "Anomaloglossus_surinamensis_COX1_MG264890", "Rheobates_palmatus_COX1_MT627206", "Allobates_trilineatus_COX1_MT627185",
#'                                         "Allobates_sumtuosus_COX1_MT627177", "Allobates_amissibilis_COX1_MT627204", "Allobates_marchesianus_COX1_MT627180",
#'                                         "Allobates_melanolaemus_COX1_MT627203", "Allobates_paleovarzensis_COX1_MT627194", "Allobates_tinae_COX1_MT627190",
#'                                         "Allobates_granti_COX1_MT627176", "Allobates_chalcopis_COX1_MT627182", "Allobates_conspicuus_COX1_MT627186",
#'                                         "Allobates_insperatus_COX1_MT627193", "Allobates_gasconi_COX1_MT627191", "Allobates_tapajos_COX1_MT627175",
#'                                         "Allobates_caeruleodactylus_COX1_MT627199", "Allobates_grillisimilis_COX1_MT627200", "Allobates_crombiei_COX1_MT627174",
#'                                         "Allobates_goianus_COX1_MT627207", "Allobates_carajas_COX1_MT627183", "Allobates_masniger_COX1_MT627198",
#'                                         "Allobates_nunciatus_COX1_MT627196", "Allobates_flaviventris_COX1_MT627192", "Allobates_magnussoni_COX1_MT627173",
#'                                         "Allobates_femoralis_COX1_MT627179", "Allobates_talamancae_COX1_MT627205", "Allobates_undulatus_COX1_MT627181",
#'                                         "Allobates_olfersioides_COX1_MT627202", "Ameerega_hahneli_COX1_MT627178", "Ameerega_hahneli_COX1_MW042031",
#'                                         "Ameerega_cainarachi_COX1_SRX6044059", "Ameerega_rubriventris_COX1_SRX6044017", "Ameerega_braccata_COX1_SRX6044048",
#'                                         "Ameerega_trivittata_COX1_SRX6044007", "Ameerega_parvula_COX1_MW042032", "Ameerega_bilinguis_COX1_MW042030",
#'                                         "Ameerega_silverstonei_COX1_SRX6044019", "Leucostethus_bilsa_COX1_MW042038", "Leucostethus_fugax_COX1_MW042037",
#'                                         "Silverstoneia_flotator_COX1_MW042039", "Epipedobates_machalilla_COX1_MW042036", "Epipedobates_anthonyi_COX1_MW042033",
#'                                         "Epipedobates_boulengeri_COX1_MW042034", "Epipedobates_darwinwallacei_COX1_MW042035", "Hyloxalus_craspedoceps_COX1_SRX6044040",
#'                                         "Hyloxalus_yasuni_COX1_KT221612", "Hyloxalus_subpunctatus_COX1_KY962392", "Dendrobates_fantasticus_ERX4488168",
#'                                         "Dendrobates_reticulatus_COX1_SRX6044054", "Dendrobates_defleri_COX1_SRX6044045", "Dendrobates_sirensis_ERX4072904",
#'                                         "Dendrobates_imitator_ERX3197357", "Dendrobates_mysteriosus_COX1_SRX6044024", "Dendrobates_claudiae_COX1_SRX6044029",
#'                                         "Andinobates_minutus_COX1_SRX6044028", "Dendrobates_sylvaticus_COX1_SRX5894869", "Dendrobates_pumilio_COX1_SRX7846193",
#'                                         "Dendrobates_galactonotus_COX1_SRX6044050", "Dendrobates_castaneoticus_COX1_SRX6044052",
#'                                         "Dendrobates_quinquevittatus_COX1_SRX6044056", "Dendrobates_steyermarki_COX1_SRX6044055", "Dendrobates_auratus_COX1_SRX6761101",
#'                                         "Dendrobates_leucomelas_COX1_SRX6761106", "Dendrobates_tinctorius_COX1_SRX6761107"),
#'                                          consensus_nucleotide_gap_char_value = "-",
#'                                          consensus_aminoacid_gap_char_value = "X",
#'                                          file_out_user = "COI_UVI",
#'                                          out_directory_user = NULL)
#'
#'
#' @export
#'
#' @importFrom seqinr "read.fasta" "translate" "consensus"
#' @importFrom stringr "str_detect"
#' @importFrom ape "ladderize"
#'
#'
#'
#'
aligned_sequence_matrix_characterizer  <- function(sequence_fasta_file_user = NULL,
                                                   seqinr_translate_logical_user = TRUE,
                                                   seqinr_genetic_code_user = 1,
                                                   consensus_selection_format_value = c("global","threshold", "selected_taxa"),
                                                   consensus_selection_threshold_value = 1500,
                                                   consensus_selection_taxa_names_value = NULL,
                                                   consensus_nucleotide_gap_char_value = "-",
                                                   consensus_aminoacid_gap_char_value = "X",
                                                   file_out_user = NULL,
                                                   out_directory_user = NULL){




  # user input

  sequence_fasta_file <- sequence_fasta_file_user
  seqinr_translate <- seqinr_translate_logical_user
  seqinr_genetic_code <- seqinr_genetic_code_user

  consensus_format <- consensus_selection_format_value
  consensus_threshold <- consensus_selection_threshold_value
  consensus_taxa <- consensus_selection_taxa_names_value

  nucleotide_gap_char <- consensus_nucleotide_gap_char_value
  aminoacid_gap_char <- consensus_aminoacid_gap_char_value

  file_out <- file_out_user

  ## outdir arguments

  if(is.null(out_directory_user)) {
    out_directory <- getwd()
  } else {
    setwd(out_directory_user)
    out_directory <- getwd()
  }

  # genetic code info

  genetic_code_df <- data.frame(genetic_code_user = c(1:5,6,9:16,21:26),
                                NCBI_numcode = c("standard", "vertebrate.mitochondrial", "yeast.mitochondrial", "protozoan.mitochondrial+mycoplasma", "invertebrate.mitochondrial", "ciliate+dasycladaceal", "echinoderm+flatworm.mitochondrial", "euplotid", "bacterial+plantplastid", "alternativeyeast", "ascidian.mitochondrial", "alternativeflatworm.mitochondrial", "blepharism", "chlorophycean.mitochondrial", "trematode.mitochondrial", "scenedesmus.mitochondrial", "thraustochytrium.mitochondria",
                                                 "Pterobranchia.mitochondrial", "CandidateDivision.SR1+Gracilibacteria", "Pachysolen.tannophilus"),
                                stringsAsFactors = FALSE)

  print(genetic_code_df)

  # sequence_fasta_file <- "~/Desktop/positive_selection_transcriptome/hyphy_on_TLRs/TLR_matrices/TLR1_test_nucleotide.fasta"

  if(!is.null(sequence_fasta_file)) {
    sequence_list <- seqinr::read.fasta(file = sequence_fasta_file,
                                        seqtype = "DNA",
                                        set.attributes = TRUE,
                                        apply.mask = FALSE)
  }

  ################# processing sequences

  gaps_list <- list()
  ambs_list <- list()
  quality_list <- list()
  AA_list <- list()
  translation_list <- list()
  incomplete_list <- list()
  stop_cod_list <- list()

  cat("\n")

  for(i in 1:length(sequence_list)) {
    # i <- 672
    one_seq <- sequence_list[[i]]
    one_seq_ID <- attributes(one_seq)$name
    one_seq <- as.vector(one_seq)

    cat("***** processing this sequence -- ", one_seq_ID, "\n")

    ###### on nucleotide sequences

    # processing gaps
    gaps_no_gaps <- gsub("-","0",one_seq)
    gaps_no_gaps <- gsub("^(?![0]$).*", "1", gaps_no_gaps, perl=TRUE)
    gaps_no_gaps_df <- as.data.frame(t(gaps_no_gaps), stringsAsFactors = FALSE)
    gaps_no_gaps_df <- as.data.frame(sapply(gaps_no_gaps_df, as.numeric))
    gaps_no_gaps_df <- as.data.frame(t(gaps_no_gaps_df))
    rownames(gaps_no_gaps_df) <- NULL

    gaps_no_gaps_names <- paste0("G_", seq(1:ncol(gaps_no_gaps_df)))
    names(gaps_no_gaps_df) <- gaps_no_gaps_names

    nsites_gene <- ncol(gaps_no_gaps_df)
    nsites_no_gap <- rowSums(gaps_no_gaps_df)
    nsites_gap <- nsites_gene - nsites_no_gap

    # processing ambiguities
    nucleotide_ambiguities <- c("y", "r", "w", "s", "k", "m", "d", "v", "h", "b", "x", "n", "Y", "R", "W", "S", "K", "M", "D", "V", "H", "B", "X", "N", "?")
    nucleotide_ambiguities <- paste0(nucleotide_ambiguities, collapse = "|")
    amb_no_amb <- gsub("[y|r|w|s|k|m|d|v|h|b|x|n|Y|R|W|S|K|M|D|V|H|B|X|N|?]","1",one_seq)
    amb_no_amb <- gsub("^(?![1]$).*", "0", amb_no_amb, perl=TRUE)
    amb_no_amb_df <- as.data.frame(t(amb_no_amb), stringsAsFactors = FALSE)
    amb_no_amb_df <- as.data.frame(sapply(amb_no_amb_df, as.numeric))
    amb_no_amb_df <- as.data.frame(t(amb_no_amb_df))
    rownames(amb_no_amb_df) <- NULL
    nsites_amb <- rowSums(amb_no_amb_df)
    nsites_no_amb <- nsites_gene - nsites_amb

    amb_no_amb_names <- paste0("A_", seq(1:ncol(amb_no_amb_df)))
    names(amb_no_amb_df) <- amb_no_amb_names

    # reporting results 1: one_seq_quality_df

    one_seq_quality_df <- data.frame(one_seq_ID = one_seq_ID, nsites_no_gap = nsites_no_gap, nsites_gap = nsites_gap,
                                     nsites_no_amb = nsites_no_amb, nsites_amb = nsites_amb, stringsAsFactors = FALSE)


    # collecting

    gaps_list[[i]] <- gaps_no_gaps_df
    ambs_list[[i]] <- amb_no_amb_df


    ###########################  for nucleotides that can be translated to protein

    if(seqinr_translate) {

      ############# report incomplete codons

      one_seq_line <- paste0(one_seq, collapse="")

      # process sequence to codons

      where_to_split <- seq(from=3, to=nchar(one_seq_line), by=3)

      #to codons

      one_sequence_split <- mapply(substr, list(one_seq_line), c(0, where_to_split) + 1, c(where_to_split, nchar(one_seq_line)))
      codon_df <- as.data.frame(t(one_sequence_split), stringsAsFactors = FALSE)
      codon_df <- codon_df[,-ncol(codon_df)]
      codon_df <- as.data.frame(codon_df,stringsAsFactors = FALSE)
      codon_names <- paste0("codon_", seq(1:ncol(codon_df)))
      names(codon_df) <- codon_names

      ## incomplete_codons_to_gap

      variants_for_incomplete <- c("[:alpha:][:alpha:]-", "[:alpha:]-[:alpha:]", "-[:alpha:][:alpha:]",
                                   "[:alpha:]--", "-[:alpha:]-", "--[:alpha:]")


      # find elements that match incomplete codon -- not gap or not full codon

      incomp_codon_name_list <- list()
      incomp_codon_variant_list <- list()

      for(m in 1:ncol(codon_df)) {
        # m <- 64
        for(z in 1:length(variants_for_incomplete)) {

          incomplete_codon_col <- stringr::str_detect(codon_df[,m],variants_for_incomplete[z])

          if(any(incomplete_codon_col== TRUE)) {

            for(j in 1:length(incomplete_codon_col)) {
              # j <- 17
              if(incomplete_codon_col[j] == TRUE) {

                cat("***** this sequence -- ", one_seq_ID,
                    "-- has an incomplete condon -- ", names(codon_df)[m], " --  found : ", codon_df[j,m], "\n")
                incomp_codon_name_list[[m]] <- names(codon_df)[m]
                incomp_codon_variant_list[[m]] <- codon_df[j,m]
              }

            }
            rm(j)
          }
        }
        rm(z)
      }
      rm(m)

      ## remove NULL elements
      names_incomplete_codons_list <- Filter(Negate(is.null), incomp_codon_name_list)
      type_incomplete_codons_list <- Filter(Negate(is.null), incomp_codon_variant_list)

      ## summarizing

      nimcomplete_codons <- length(names_incomplete_codons_list)

      if(nimcomplete_codons > 0) {
        pos_incomplete_codons <- numeric()


        counter_NNg <- 0
        counter_NgN <- 0
        counter_gNN <- 0
        counter_Ngg <- 0
        counter_gNg <- 0
        counter_ggN <- 0

        for(m in 1:length(names_incomplete_codons_list)) {
          # m <- 1
          one_complete_codon <- names_incomplete_codons_list[[m]]
          one_complete_codon <- as.numeric(gsub("codon_", "", one_complete_codon))
          pos_incomplete_codons[m] <- one_complete_codon
          # type of incomplete
          replace_incomplete <- type_incomplete_codons_list[[m]]
          replace_incomplete <- gsub("[[:alpha:]]","N",replace_incomplete)
          replace_incomplete <- gsub("[-]","g",replace_incomplete)

          if(replace_incomplete == "NNg") {counter_NNg <- counter_NNg + 1}
          if(replace_incomplete == "NgN") {counter_NgN <- counter_NgN + 1}
          if(replace_incomplete == "gNN") {counter_gNN <- counter_gNN + 1}
          if(replace_incomplete == "Ngg") {counter_Ngg <- counter_Ngg + 1}
          if(replace_incomplete == "gNg") {counter_gNg <- counter_gNg + 1}
          if(replace_incomplete == "ggN") {counter_ggN <- counter_ggN + 1}
        }
        rm(m)
      } else {
        pos_incomplete_codons <- 0
        counter_NNg <- 0
        counter_NgN <- 0
        counter_gNN <- 0
        counter_Ngg <- 0
        counter_gNg <- 0
        counter_ggN <- 0
      }

      # reporting results 2: one_seq_quality_df

      one_seq_quality_df$nimcomplete_codons <- nimcomplete_codons
      one_seq_quality_df$counter_NNg <- counter_NNg
      one_seq_quality_df$counter_NgN <- counter_NgN
      one_seq_quality_df$counter_gNN <- counter_gNN
      one_seq_quality_df$counter_Ngg <- counter_Ngg
      one_seq_quality_df$counter_gNg <- counter_gNg
      one_seq_quality_df$counter_ggN <- counter_ggN

      ###### if translate count

      # translate

      one_sequence_protein_trans <- seqinr::translate(one_seq,
                                                      frame = 0,
                                                      sens = "F",
                                                      numcode = seqinr_genetic_code,
                                                      NAstring = "X",
                                                      ambiguous = FALSE)

      # collect translations

      translation_list[[i]] <- one_sequence_protein_trans

      # map X

      codons_total <- length(one_sequence_protein_trans)

      X_no_X <- gsub("X","0",one_sequence_protein_trans)
      X_no_X <- gsub("^(?![0]$).*", "1", X_no_X, perl=TRUE)
      X_no_X_df <- as.data.frame(t(X_no_X), stringsAsFactors = FALSE)
      X_no_X_df <- as.data.frame(sapply(X_no_X_df, as.numeric))
      X_no_X_df <- as.data.frame(t(X_no_X_df))
      rownames(X_no_X_df) <- NULL
      nsites_no_X <- rowSums(X_no_X_df)
      nsites_X <- codons_total - nsites_no_X

      X_no_X_names <- paste0("codon_", seq(1:ncol(X_no_X_df)))
      names(X_no_X_df) <- X_no_X_names

      one_seq_quality_df$ncodons_no_X <- nsites_no_X
      one_seq_quality_df$ncodons_X <- nsites_X

      # how many stop codons "*"

      ncodons <- length(one_sequence_protein_trans)
      ncodons_seq <- 1:ncodons

      stop_codon_loc <- ncodons_seq[stringr::str_detect(one_sequence_protein_trans,"[*]")]
      nstop_codons <- ifelse(length(stop_codon_loc) == 0, 0, length(stop_codon_loc))
      one_seq_quality_df$nstop_codons <- nstop_codons

      # report

      pos_incomplete_codons_vec <- paste0(pos_incomplete_codons, collapse = "_")
      one_seq_quality_df$pos_incomplete_codons <- pos_incomplete_codons_vec

      stop_codon_loc_vec <- paste0(stop_codon_loc, collapse = "_")
      one_seq_quality_df$pos_stop_codons <- stop_codon_loc_vec
      AA_list[[i]] <- X_no_X_df



    } # seqinr_translate

    # collecting quality

    quality_list[[i]] <- one_seq_quality_df

  }
  rm(i)

  ############################### END of loop processing sequences

  quality_df <- do.call(rbind, quality_list)
  gaps_df <- do.call(rbind, gaps_list)
  ambiguity_df <- do.call(rbind, ambs_list)

  gaps_df$seq_ID <- quality_df$one_seq_ID
  ambiguity_df$seq_ID <- quality_df$one_seq_ID


  ######################### compare with consensus

  ################## select consensus -- global

  if(consensus_format == "global") {
    fasta_alignment_con <- matrix(unlist(sequence_list), ncol = length(sequence_list[[1]]), byrow = TRUE)
    if(seqinr_translate) {
      translation_matrix_con <- matrix(unlist(translation_list), ncol = length(translation_list[[1]]), byrow = TRUE)
    }
  }

  ################## select consensus -- threshold

  if(consensus_format == "threshold") {
    bp_length <- NULL
    seq_name <- NULL
    sequences_indexer_df <- data.frame(order = as.numeric(rownames(quality_df)), bp_length = quality_df$nsites_no_gap, seq_name = quality_df$one_seq_ID, stringsAsFactors = FALSE)

    # filter with threshold
    indexed_of_seq_pass_threshold <- subset(sequences_indexer_df, bp_length >= consensus_threshold)

    cat("\n ****** sequences that pass the 'consensus_selection_threshold_value' for bp_length: ", consensus_threshold, "\n" )
    print(indexed_of_seq_pass_threshold)
    cat(" ****** sequences names\n\n" )
    print(indexed_of_seq_pass_threshold$seq_name)

    # subset list
    selected_list <- list()
    counter <- 0
    for(k in indexed_of_seq_pass_threshold$order) {
      counter <- counter + 1
      selected_list[[counter]] <- sequence_list[[k]]
    }
    rm(k)
    # create subset nucleotide alignment
    fasta_alignment_con <- matrix(unlist(selected_list), ncol = length(sequence_list[[1]]), byrow = TRUE)

    if(seqinr_translate) {

      # subset list
      selected_list <- list()
      counter <- 0
      for(k in indexed_of_seq_pass_threshold$order) {
        counter <- counter + 1
        selected_list[[counter]] <- translation_list[[k]]
      }
      rm(k)
      # create subset aminoacid alignment
      translation_matrix_con <- matrix(unlist(selected_list), ncol = length(translation_list[[1]]), byrow = TRUE)

    }
  }

  ################## select consensus -- "selected_taxa"

  if(consensus_format == "selected_taxa") {

    sequences_indexer_df <- data.frame(order = as.numeric(rownames(quality_df)), bp_length = quality_df$nsites_no_gap, seq_name = quality_df$one_seq_ID, stringsAsFactors = FALSE)

    # filter with consensus_taxa

    indexed_of_seq_pass_threshold <- subset(sequences_indexer_df, seq_name %in% consensus_taxa)

    cat("\n ****** sequences that pass the 'consensus_selection_threshold_value' for bp_length: ", consensus_threshold, "\n" )
    print(indexed_of_seq_pass_threshold)
    cat(" ****** sequences names\n\n" )
    print(indexed_of_seq_pass_threshold$seq_name)

    # subset list
    selected_list <- list()
    counter <- 0
    for(k in indexed_of_seq_pass_threshold$order) {
      counter <- counter + 1
      selected_list[[counter]] <- sequence_list[[k]]
    }
    rm(k)
    # create subset nucleotide alignment
    fasta_alignment_con <- matrix(unlist(selected_list), ncol = length(sequence_list[[1]]), byrow = TRUE)

    if(seqinr_translate) {

      # subset list
      selected_list <- list()
      counter <- 0
      for(k in indexed_of_seq_pass_threshold$order) {
        counter <- counter + 1
        selected_list[[counter]] <- translation_list[[k]]
      }
      rm(k)
      # create subset aminoacid alignment
      translation_matrix_con <- matrix(unlist(selected_list), ncol = length(translation_list[[1]]), byrow = TRUE)

    }
  }


  ##############  nucleotide

  nucleotide_consensus_WITH_gap <- seqinr::consensus(fasta_alignment_con, method = "majority")

  ### consensus with no gap

  matali <- as.matrix(fasta_alignment_con)
  majority2 <- function(x) names(which.max(table(x)))
  consensus_out <- list()

  for(m in 1:ncol(matali)) {
    # m <- 1
    one_column <- matali[,m]
    # remove "-"
    one_column <- one_column[!one_column == nucleotide_gap_char]
    res <- majority2(one_column)

    # if NULL then is a gap
    if(is.null(res)) { res <- nucleotide_gap_char}
    consensus_out[[m]] <- res
  }
  rm(m)
  nucleotide_consensus_NO_gap <- unlist(consensus_out)


  if(seqinr_translate) {

    aminoacid_consensus_WITH_gap <- seqinr::consensus(translation_matrix_con, method = "majority")

    ### consensus with no gap

    matali <- translation_matrix_con
    consensus_out <- list()

    for(m in 1:ncol(matali)) {
      # m <- 1
      one_column <- matali[,m]
      # remove "-"
      one_column <- one_column[!one_column == aminoacid_gap_char]
      res <- majority2(one_column)

      # if NULL then is a gap
      if(is.null(res)) { res <- aminoacid_gap_char}
      consensus_out[[m]] <- res
    }
    rm(m)
    aminoacid_consensus_NO_gap <- unlist(consensus_out)
  }

  ##################### loop to get distance

  nucleotide_full_consensus_WITH_gap_distance_col <- list()
  nucleotide_full_consensus_NO_gap_distance_col <- list()
  nucleotide_one_seq_consensus_WITH_gap_distance_col <- list()
  nucleotide_one_seq_consensus_NO_gap_distance_col <- list()
  nucleotide_one_seq_total_count_no_gap_col <- list()


  aminoacid_full_consensus_WITH_gap_distance_col <- list()
  aminoacid_full_consensus_NO_gap_distance_col <- list()
  aminoacid_one_seq_consensus_WITH_gap_distance_col <- list()
  aminoacid_one_seq_consensus_NO_gap_distance_col <- list()
  aminoacid_one_seq_total_count_no_gap_col <- list()

  cat("\n***** getting distances of seq: ")

  for(i in 1:length(sequence_list)) {

    #i <- 1

    cat(i,"..")

    ## get the distance and weighted distance

    one_seq_vector <- sequence_list[[i]]

    # compare_site_by_site

    nucleotide_consensus_WITH_gap_count <- ifelse(one_seq_vector == nucleotide_consensus_WITH_gap, 0, 1)
    nucleotide_full_consensus_WITH_gap_distance <- sum(nucleotide_consensus_WITH_gap_count)
    nucleotide_consensus_NO_gap_count <- ifelse(one_seq_vector == nucleotide_consensus_NO_gap, 0, 1)
    nucleotide_full_consensus_NO_gap_distance <- sum(nucleotide_consensus_NO_gap_count)

    #### assign gaps to consensus based in one_seq_vector

    # consensus_WITH_gap
    nucleotide_consensus_WITH_gap_to_one_seq <- nucleotide_consensus_WITH_gap
    nucleotide_consensus_WITH_gap_to_one_seq[one_seq_vector == nucleotide_gap_char] <- nucleotide_gap_char
    nucleotide_consensus_WITH_one_seq <- ifelse(one_seq_vector == nucleotide_consensus_WITH_gap_to_one_seq, 0, 1)
    nucleotide_one_seq_consensus_WITH_gap_distance <- sum(nucleotide_consensus_WITH_one_seq)

    # consensus_NO_gap
    nucleotide_consensus_gap_to_one_seq <- nucleotide_consensus_NO_gap
    nucleotide_consensus_gap_to_one_seq[one_seq_vector == nucleotide_gap_char] <- nucleotide_gap_char
    nucleotide_consensus_NO_gap_one_seq <- ifelse(one_seq_vector == nucleotide_consensus_gap_to_one_seq, 0, 1)
    nucleotide_one_seq_consensus_NO_gap_distance <- sum(nucleotide_consensus_NO_gap_one_seq)

    # howmany no gaps in one_seq_vector (i.e., actual nucleotides)

    nucleotide_no_gap_count <- ifelse(!one_seq_vector == nucleotide_gap_char, 1, 0)
    nucleotide_one_seq_total_count_no_gap <- sum(nucleotide_no_gap_count)

    # out data

    nucleotide_full_consensus_WITH_gap_distance_col[[i]] <- nucleotide_full_consensus_WITH_gap_distance
    nucleotide_full_consensus_NO_gap_distance_col[[i]] <- nucleotide_full_consensus_NO_gap_distance
    nucleotide_one_seq_consensus_WITH_gap_distance_col[[i]] <- nucleotide_one_seq_consensus_WITH_gap_distance
    nucleotide_one_seq_consensus_NO_gap_distance_col[[i]] <- nucleotide_one_seq_consensus_NO_gap_distance
    nucleotide_one_seq_total_count_no_gap_col[[i]] <- nucleotide_one_seq_total_count_no_gap

    ##############  codon -- protein

    if(seqinr_translate) {

      ## get the distance and weighted distance for codons

      # i <- 1
      one_seq_vector <- translation_list[[i]]

      # compare_site_by_site

      aminoacid_consensus_WITH_gap_count <- ifelse(one_seq_vector == aminoacid_consensus_WITH_gap, 0, 1)
      aminoacid_full_consensus_WITH_gap_distance <- sum(aminoacid_consensus_WITH_gap_count)
      aminoacid_consensus_NO_gap_count <- ifelse(one_seq_vector == aminoacid_consensus_NO_gap, 0, 1)
      aminoacid_full_consensus_NO_gap_distance <- sum(aminoacid_consensus_NO_gap_count)

      #### assign gaps to consensus based in one_seq_vector

      # consensus_WITH_gap
      aminoacid_consensus_WITH_gap_to_one_seq <- aminoacid_consensus_WITH_gap
      aminoacid_consensus_WITH_gap_to_one_seq[one_seq_vector == aminoacid_gap_char] <- aminoacid_gap_char
      aminoacid_consensus_WITH_one_seq <- ifelse(one_seq_vector == aminoacid_consensus_WITH_gap_to_one_seq, 0, 1)
      aminoacid_one_seq_consensus_WITH_gap_distance <- sum(aminoacid_consensus_WITH_one_seq)

      # consensus_NO_gap
      aminoacid_consensus_gap_to_one_seq <- aminoacid_consensus_NO_gap
      aminoacid_consensus_gap_to_one_seq[one_seq_vector == aminoacid_gap_char] <- aminoacid_gap_char
      aminoacid_consensus_NO_gap_one_seq <- ifelse(one_seq_vector == aminoacid_consensus_gap_to_one_seq, 0, 1)
      aminoacid_one_seq_consensus_NO_gap_distance <- sum(aminoacid_consensus_NO_gap_one_seq)

      # howmany no gaps in one_seq_vector (i.e., actual nucleotides)

      aminoacid_no_gap_count <- ifelse(!one_seq_vector == aminoacid_gap_char, 1, 0)
      aminoacid_one_seq_total_count_no_gap <- sum(aminoacid_no_gap_count)

      # out data

      aminoacid_full_consensus_WITH_gap_distance_col[[i]] <- aminoacid_full_consensus_WITH_gap_distance
      aminoacid_full_consensus_NO_gap_distance_col[[i]] <- aminoacid_full_consensus_NO_gap_distance
      aminoacid_one_seq_consensus_WITH_gap_distance_col[[i]] <- aminoacid_one_seq_consensus_WITH_gap_distance
      aminoacid_one_seq_consensus_NO_gap_distance_col[[i]] <- aminoacid_one_seq_consensus_NO_gap_distance
      aminoacid_one_seq_total_count_no_gap_col[[i]] <- aminoacid_one_seq_total_count_no_gap

    } # tranlate end

  } # distance loop end
  rm(i)

  cat("\n")

  # addd to quality_df

  quality_df$nucleotide_full_consensus_WITH_gap_distance <- unlist(nucleotide_full_consensus_WITH_gap_distance_col)
  quality_df$nucleotide_full_consensus_NO_gap_distance <- unlist(nucleotide_full_consensus_NO_gap_distance_col)
  quality_df$nucleotide_adj_lenght_seq_consensus_WITH_gap_distance <- unlist(nucleotide_one_seq_consensus_WITH_gap_distance_col)
  quality_df$nucleotide_adj_lenght_seq_consensus_NO_gap_distance <- unlist(nucleotide_one_seq_consensus_NO_gap_distance_col)
  quality_df$nucleotide_NO_gap_count <- unlist(nucleotide_one_seq_total_count_no_gap_col)
  quality_df$nucleotide_adj_lenght_seq_consensus_WITH_gap_percent_distance_per_total_length <- 100*(quality_df$nucleotide_adj_lenght_seq_consensus_WITH_gap_distance/quality_df$nucleotide_NO_gap_count)
  quality_df$nucleotide_adj_lenght_seq_consensus_NO_gap_percent_distance_per_total_length <- 100*(quality_df$nucleotide_adj_lenght_seq_consensus_NO_gap_distance/quality_df$nucleotide_NO_gap_count)


  if(seqinr_translate) {
    quality_df$aminoacid_full_consensus_WITH_gap_distance <- unlist(aminoacid_full_consensus_WITH_gap_distance_col)
    quality_df$aminoacid_full_consensus_NO_gap_distance <- unlist(aminoacid_full_consensus_NO_gap_distance_col)
    quality_df$aminoacid_adj_lenght_seq_consensus_WITH_gap_distance <- unlist(aminoacid_one_seq_consensus_WITH_gap_distance_col)
    quality_df$aminoacid_adj_lenght_seq_consensus_NO_gap_distance <- unlist(aminoacid_one_seq_consensus_NO_gap_distance_col)
    quality_df$aminoacid_NO_gap_count <- unlist(aminoacid_one_seq_total_count_no_gap_col)
    quality_df$aminoacid_adj_lenght_seq_consensus_WITH_gap_percent_distance_per_total_length <- 100*(quality_df$aminoacid_adj_lenght_seq_consensus_WITH_gap_distance/quality_df$aminoacid_NO_gap_count)
    quality_df$aminoacid_adj_lenght_seq_consensus_NO_gap_percent_distance_per_total_length <- 100*(quality_df$aminoacid_adj_lenght_seq_consensus_NO_gap_distance/quality_df$aminoacid_NO_gap_count)

  }

  # other indices

  quality_df$namb_percent_per_total_length <- 100*(quality_df$nsites_amb/quality_df$nsites_no_gap)

  ######## write files

  # names

  quality_name <- paste0(file_out, "_quality.txt")
  gaps_name <- paste0(file_out, "_gaps_df.txt")
  ambiguity_name <- paste0(file_out, "_ambiguity_df.txt")

  utils::write.table(quality_df, file = quality_name, sep = "\t", row.names = FALSE)
  utils::write.table(gaps_df, file = gaps_name, sep = "\t", row.names = FALSE)
  utils::write.table(ambiguity_df, file = ambiguity_name, sep = "\t", row.names = FALSE)

  ## if translate

  if(seqinr_translate) {
    AA_df <- do.call(rbind, AA_list)
    AA_df$seq_ID <- quality_df$one_seq_ID
    AA_name <- paste0(file_out, "_codons_df.txt")
    utils::write.table(AA_df, file = AA_name, sep = "\t", row.names = FALSE)
  }

  ## resturn

  if(seqinr_translate) {
    out <- list(gaps_df, ambiguity_df, AA_df, quality_df)
  } else {
    out <- list(gaps_df, ambiguity_df, quality_df)
  }
  return(out)

}
##########################################################################
carvajalc/BarcoR documentation built on Dec. 19, 2021, 1:54 p.m.