#' @title Quality check graphics for DNA Barcodes
#'
#'@name aligned_sequence_matrix_characterizer
#'
#' @description Provide a quality summary of DNA barcodes
#'
#' @param sequence_fasta_file_user An aligment o DNA barcodes in fasta file.
#' @param seqinr_translate_logical_user Logical. If it is TRUE the nucleotides will translate into Aminoacids
#' @param seqinr_genetic_code_user Character. A selection of Genetic code ("standard", "vertebrate.mitochondrial", "yeast.mitochondrial", "protozoan.mitochondrial+mycoplasma", "invertebrate.mitochondrial", "ciliate+dasycladaceal", "echinoderm+flatworm.mitochondrial", "euplotid", "bacterial+plantplastid", "alternativeyeast", "ascidian.mitochondrial", "alternativeflatworm.mitochondrial", "blepharism", "chlorophycean.mitochondrial", "trematode.mitochondrial", "scenedesmus.mitochondrial", "thraustochytrium.mitochondria", "Pterobranchia.mitochondrial", "CandidateDivision.SR1+Gracilibacteria", "Pachysolen.tannophilus")
#' @param consensus_selection_format_value Character.compare with consensus based in ("global","threshold", "selected_taxa")
#' @param consensus_selection_threshold_value numeric. A threshold value to conserv in analysis.
#' @param consensus_selection_taxa_names_value Character. A list of taxa names to filter the sequence data.
#' @param consensus_nucleotide_gap_char_value A character indicating the gap value in DNA sequence data , for example "-".
#' @param consensus_aminoacid_gap_char_value A character indicating the gap value in AA data , for example "X".
#' @param file_out_user Character. an optional name to be added to the output file.
#' @param out_directory_user Output directory
#'
#'
#'
#'
#'
#' @return 4 files: list of ambiguities,codons gaps and quality of DNA barcodes in txt format
#' @examples
#'COX1_UVI_quality <- aligned_sequence_matrix_characterizer (sequence_fasta_file_user = system.file("extdata","COI_ordered_nucleotide2.fasta",package="BarcoR"),
#' seqinr_translate_logical_user = TRUE,
#' seqinr_genetic_code_user = 2,
#' consensus_selection_format_value = "selected_taxa",
#' consensus_selection_threshold_value = 1500,
#' consensus_selection_taxa_names_value = c("Mannophryne_collaris_COX1_MT627188", "Mannophryne_trinitatis_COX1_JX564878",
#' "Anomaloglossus_baeobatrachus_COX1_MK069969", "Anomaloglossus_leopardus_COX1_MK069970", "Anomaloglossus_blanci_COX1_MG264895",
#' "Anomaloglossus_blanci_COX1_NC_037857", "Anomaloglossus_degranvillei_COX1_MG264892", "Anomaloglossus_dewynteri_COX1_MG264891",
#' "Anomaloglossus_surinamensis_COX1_MG264890", "Rheobates_palmatus_COX1_MT627206", "Allobates_trilineatus_COX1_MT627185",
#' "Allobates_sumtuosus_COX1_MT627177", "Allobates_amissibilis_COX1_MT627204", "Allobates_marchesianus_COX1_MT627180",
#' "Allobates_melanolaemus_COX1_MT627203", "Allobates_paleovarzensis_COX1_MT627194", "Allobates_tinae_COX1_MT627190",
#' "Allobates_granti_COX1_MT627176", "Allobates_chalcopis_COX1_MT627182", "Allobates_conspicuus_COX1_MT627186",
#' "Allobates_insperatus_COX1_MT627193", "Allobates_gasconi_COX1_MT627191", "Allobates_tapajos_COX1_MT627175",
#' "Allobates_caeruleodactylus_COX1_MT627199", "Allobates_grillisimilis_COX1_MT627200", "Allobates_crombiei_COX1_MT627174",
#' "Allobates_goianus_COX1_MT627207", "Allobates_carajas_COX1_MT627183", "Allobates_masniger_COX1_MT627198",
#' "Allobates_nunciatus_COX1_MT627196", "Allobates_flaviventris_COX1_MT627192", "Allobates_magnussoni_COX1_MT627173",
#' "Allobates_femoralis_COX1_MT627179", "Allobates_talamancae_COX1_MT627205", "Allobates_undulatus_COX1_MT627181",
#' "Allobates_olfersioides_COX1_MT627202", "Ameerega_hahneli_COX1_MT627178", "Ameerega_hahneli_COX1_MW042031",
#' "Ameerega_cainarachi_COX1_SRX6044059", "Ameerega_rubriventris_COX1_SRX6044017", "Ameerega_braccata_COX1_SRX6044048",
#' "Ameerega_trivittata_COX1_SRX6044007", "Ameerega_parvula_COX1_MW042032", "Ameerega_bilinguis_COX1_MW042030",
#' "Ameerega_silverstonei_COX1_SRX6044019", "Leucostethus_bilsa_COX1_MW042038", "Leucostethus_fugax_COX1_MW042037",
#' "Silverstoneia_flotator_COX1_MW042039", "Epipedobates_machalilla_COX1_MW042036", "Epipedobates_anthonyi_COX1_MW042033",
#' "Epipedobates_boulengeri_COX1_MW042034", "Epipedobates_darwinwallacei_COX1_MW042035", "Hyloxalus_craspedoceps_COX1_SRX6044040",
#' "Hyloxalus_yasuni_COX1_KT221612", "Hyloxalus_subpunctatus_COX1_KY962392", "Dendrobates_fantasticus_ERX4488168",
#' "Dendrobates_reticulatus_COX1_SRX6044054", "Dendrobates_defleri_COX1_SRX6044045", "Dendrobates_sirensis_ERX4072904",
#' "Dendrobates_imitator_ERX3197357", "Dendrobates_mysteriosus_COX1_SRX6044024", "Dendrobates_claudiae_COX1_SRX6044029",
#' "Andinobates_minutus_COX1_SRX6044028", "Dendrobates_sylvaticus_COX1_SRX5894869", "Dendrobates_pumilio_COX1_SRX7846193",
#' "Dendrobates_galactonotus_COX1_SRX6044050", "Dendrobates_castaneoticus_COX1_SRX6044052",
#' "Dendrobates_quinquevittatus_COX1_SRX6044056", "Dendrobates_steyermarki_COX1_SRX6044055", "Dendrobates_auratus_COX1_SRX6761101",
#' "Dendrobates_leucomelas_COX1_SRX6761106", "Dendrobates_tinctorius_COX1_SRX6761107"),
#' consensus_nucleotide_gap_char_value = "-",
#' consensus_aminoacid_gap_char_value = "X",
#' file_out_user = "COI_UVI",
#' out_directory_user = NULL)
#'
#'
#' @export
#'
#' @importFrom seqinr "read.fasta" "translate" "consensus"
#' @importFrom stringr "str_detect"
#' @importFrom ape "ladderize"
#'
#'
#'
#'
aligned_sequence_matrix_characterizer <- function(sequence_fasta_file_user = NULL,
seqinr_translate_logical_user = TRUE,
seqinr_genetic_code_user = 1,
consensus_selection_format_value = c("global","threshold", "selected_taxa"),
consensus_selection_threshold_value = 1500,
consensus_selection_taxa_names_value = NULL,
consensus_nucleotide_gap_char_value = "-",
consensus_aminoacid_gap_char_value = "X",
file_out_user = NULL,
out_directory_user = NULL){
# user input
sequence_fasta_file <- sequence_fasta_file_user
seqinr_translate <- seqinr_translate_logical_user
seqinr_genetic_code <- seqinr_genetic_code_user
consensus_format <- consensus_selection_format_value
consensus_threshold <- consensus_selection_threshold_value
consensus_taxa <- consensus_selection_taxa_names_value
nucleotide_gap_char <- consensus_nucleotide_gap_char_value
aminoacid_gap_char <- consensus_aminoacid_gap_char_value
file_out <- file_out_user
## outdir arguments
if(is.null(out_directory_user)) {
out_directory <- getwd()
} else {
setwd(out_directory_user)
out_directory <- getwd()
}
# genetic code info
genetic_code_df <- data.frame(genetic_code_user = c(1:5,6,9:16,21:26),
NCBI_numcode = c("standard", "vertebrate.mitochondrial", "yeast.mitochondrial", "protozoan.mitochondrial+mycoplasma", "invertebrate.mitochondrial", "ciliate+dasycladaceal", "echinoderm+flatworm.mitochondrial", "euplotid", "bacterial+plantplastid", "alternativeyeast", "ascidian.mitochondrial", "alternativeflatworm.mitochondrial", "blepharism", "chlorophycean.mitochondrial", "trematode.mitochondrial", "scenedesmus.mitochondrial", "thraustochytrium.mitochondria",
"Pterobranchia.mitochondrial", "CandidateDivision.SR1+Gracilibacteria", "Pachysolen.tannophilus"),
stringsAsFactors = FALSE)
print(genetic_code_df)
# sequence_fasta_file <- "~/Desktop/positive_selection_transcriptome/hyphy_on_TLRs/TLR_matrices/TLR1_test_nucleotide.fasta"
if(!is.null(sequence_fasta_file)) {
sequence_list <- seqinr::read.fasta(file = sequence_fasta_file,
seqtype = "DNA",
set.attributes = TRUE,
apply.mask = FALSE)
}
################# processing sequences
gaps_list <- list()
ambs_list <- list()
quality_list <- list()
AA_list <- list()
translation_list <- list()
incomplete_list <- list()
stop_cod_list <- list()
cat("\n")
for(i in 1:length(sequence_list)) {
# i <- 672
one_seq <- sequence_list[[i]]
one_seq_ID <- attributes(one_seq)$name
one_seq <- as.vector(one_seq)
cat("***** processing this sequence -- ", one_seq_ID, "\n")
###### on nucleotide sequences
# processing gaps
gaps_no_gaps <- gsub("-","0",one_seq)
gaps_no_gaps <- gsub("^(?![0]$).*", "1", gaps_no_gaps, perl=TRUE)
gaps_no_gaps_df <- as.data.frame(t(gaps_no_gaps), stringsAsFactors = FALSE)
gaps_no_gaps_df <- as.data.frame(sapply(gaps_no_gaps_df, as.numeric))
gaps_no_gaps_df <- as.data.frame(t(gaps_no_gaps_df))
rownames(gaps_no_gaps_df) <- NULL
gaps_no_gaps_names <- paste0("G_", seq(1:ncol(gaps_no_gaps_df)))
names(gaps_no_gaps_df) <- gaps_no_gaps_names
nsites_gene <- ncol(gaps_no_gaps_df)
nsites_no_gap <- rowSums(gaps_no_gaps_df)
nsites_gap <- nsites_gene - nsites_no_gap
# processing ambiguities
nucleotide_ambiguities <- c("y", "r", "w", "s", "k", "m", "d", "v", "h", "b", "x", "n", "Y", "R", "W", "S", "K", "M", "D", "V", "H", "B", "X", "N", "?")
nucleotide_ambiguities <- paste0(nucleotide_ambiguities, collapse = "|")
amb_no_amb <- gsub("[y|r|w|s|k|m|d|v|h|b|x|n|Y|R|W|S|K|M|D|V|H|B|X|N|?]","1",one_seq)
amb_no_amb <- gsub("^(?![1]$).*", "0", amb_no_amb, perl=TRUE)
amb_no_amb_df <- as.data.frame(t(amb_no_amb), stringsAsFactors = FALSE)
amb_no_amb_df <- as.data.frame(sapply(amb_no_amb_df, as.numeric))
amb_no_amb_df <- as.data.frame(t(amb_no_amb_df))
rownames(amb_no_amb_df) <- NULL
nsites_amb <- rowSums(amb_no_amb_df)
nsites_no_amb <- nsites_gene - nsites_amb
amb_no_amb_names <- paste0("A_", seq(1:ncol(amb_no_amb_df)))
names(amb_no_amb_df) <- amb_no_amb_names
# reporting results 1: one_seq_quality_df
one_seq_quality_df <- data.frame(one_seq_ID = one_seq_ID, nsites_no_gap = nsites_no_gap, nsites_gap = nsites_gap,
nsites_no_amb = nsites_no_amb, nsites_amb = nsites_amb, stringsAsFactors = FALSE)
# collecting
gaps_list[[i]] <- gaps_no_gaps_df
ambs_list[[i]] <- amb_no_amb_df
########################### for nucleotides that can be translated to protein
if(seqinr_translate) {
############# report incomplete codons
one_seq_line <- paste0(one_seq, collapse="")
# process sequence to codons
where_to_split <- seq(from=3, to=nchar(one_seq_line), by=3)
#to codons
one_sequence_split <- mapply(substr, list(one_seq_line), c(0, where_to_split) + 1, c(where_to_split, nchar(one_seq_line)))
codon_df <- as.data.frame(t(one_sequence_split), stringsAsFactors = FALSE)
codon_df <- codon_df[,-ncol(codon_df)]
codon_df <- as.data.frame(codon_df,stringsAsFactors = FALSE)
codon_names <- paste0("codon_", seq(1:ncol(codon_df)))
names(codon_df) <- codon_names
## incomplete_codons_to_gap
variants_for_incomplete <- c("[:alpha:][:alpha:]-", "[:alpha:]-[:alpha:]", "-[:alpha:][:alpha:]",
"[:alpha:]--", "-[:alpha:]-", "--[:alpha:]")
# find elements that match incomplete codon -- not gap or not full codon
incomp_codon_name_list <- list()
incomp_codon_variant_list <- list()
for(m in 1:ncol(codon_df)) {
# m <- 64
for(z in 1:length(variants_for_incomplete)) {
incomplete_codon_col <- stringr::str_detect(codon_df[,m],variants_for_incomplete[z])
if(any(incomplete_codon_col== TRUE)) {
for(j in 1:length(incomplete_codon_col)) {
# j <- 17
if(incomplete_codon_col[j] == TRUE) {
cat("***** this sequence -- ", one_seq_ID,
"-- has an incomplete condon -- ", names(codon_df)[m], " -- found : ", codon_df[j,m], "\n")
incomp_codon_name_list[[m]] <- names(codon_df)[m]
incomp_codon_variant_list[[m]] <- codon_df[j,m]
}
}
rm(j)
}
}
rm(z)
}
rm(m)
## remove NULL elements
names_incomplete_codons_list <- Filter(Negate(is.null), incomp_codon_name_list)
type_incomplete_codons_list <- Filter(Negate(is.null), incomp_codon_variant_list)
## summarizing
nimcomplete_codons <- length(names_incomplete_codons_list)
if(nimcomplete_codons > 0) {
pos_incomplete_codons <- numeric()
counter_NNg <- 0
counter_NgN <- 0
counter_gNN <- 0
counter_Ngg <- 0
counter_gNg <- 0
counter_ggN <- 0
for(m in 1:length(names_incomplete_codons_list)) {
# m <- 1
one_complete_codon <- names_incomplete_codons_list[[m]]
one_complete_codon <- as.numeric(gsub("codon_", "", one_complete_codon))
pos_incomplete_codons[m] <- one_complete_codon
# type of incomplete
replace_incomplete <- type_incomplete_codons_list[[m]]
replace_incomplete <- gsub("[[:alpha:]]","N",replace_incomplete)
replace_incomplete <- gsub("[-]","g",replace_incomplete)
if(replace_incomplete == "NNg") {counter_NNg <- counter_NNg + 1}
if(replace_incomplete == "NgN") {counter_NgN <- counter_NgN + 1}
if(replace_incomplete == "gNN") {counter_gNN <- counter_gNN + 1}
if(replace_incomplete == "Ngg") {counter_Ngg <- counter_Ngg + 1}
if(replace_incomplete == "gNg") {counter_gNg <- counter_gNg + 1}
if(replace_incomplete == "ggN") {counter_ggN <- counter_ggN + 1}
}
rm(m)
} else {
pos_incomplete_codons <- 0
counter_NNg <- 0
counter_NgN <- 0
counter_gNN <- 0
counter_Ngg <- 0
counter_gNg <- 0
counter_ggN <- 0
}
# reporting results 2: one_seq_quality_df
one_seq_quality_df$nimcomplete_codons <- nimcomplete_codons
one_seq_quality_df$counter_NNg <- counter_NNg
one_seq_quality_df$counter_NgN <- counter_NgN
one_seq_quality_df$counter_gNN <- counter_gNN
one_seq_quality_df$counter_Ngg <- counter_Ngg
one_seq_quality_df$counter_gNg <- counter_gNg
one_seq_quality_df$counter_ggN <- counter_ggN
###### if translate count
# translate
one_sequence_protein_trans <- seqinr::translate(one_seq,
frame = 0,
sens = "F",
numcode = seqinr_genetic_code,
NAstring = "X",
ambiguous = FALSE)
# collect translations
translation_list[[i]] <- one_sequence_protein_trans
# map X
codons_total <- length(one_sequence_protein_trans)
X_no_X <- gsub("X","0",one_sequence_protein_trans)
X_no_X <- gsub("^(?![0]$).*", "1", X_no_X, perl=TRUE)
X_no_X_df <- as.data.frame(t(X_no_X), stringsAsFactors = FALSE)
X_no_X_df <- as.data.frame(sapply(X_no_X_df, as.numeric))
X_no_X_df <- as.data.frame(t(X_no_X_df))
rownames(X_no_X_df) <- NULL
nsites_no_X <- rowSums(X_no_X_df)
nsites_X <- codons_total - nsites_no_X
X_no_X_names <- paste0("codon_", seq(1:ncol(X_no_X_df)))
names(X_no_X_df) <- X_no_X_names
one_seq_quality_df$ncodons_no_X <- nsites_no_X
one_seq_quality_df$ncodons_X <- nsites_X
# how many stop codons "*"
ncodons <- length(one_sequence_protein_trans)
ncodons_seq <- 1:ncodons
stop_codon_loc <- ncodons_seq[stringr::str_detect(one_sequence_protein_trans,"[*]")]
nstop_codons <- ifelse(length(stop_codon_loc) == 0, 0, length(stop_codon_loc))
one_seq_quality_df$nstop_codons <- nstop_codons
# report
pos_incomplete_codons_vec <- paste0(pos_incomplete_codons, collapse = "_")
one_seq_quality_df$pos_incomplete_codons <- pos_incomplete_codons_vec
stop_codon_loc_vec <- paste0(stop_codon_loc, collapse = "_")
one_seq_quality_df$pos_stop_codons <- stop_codon_loc_vec
AA_list[[i]] <- X_no_X_df
} # seqinr_translate
# collecting quality
quality_list[[i]] <- one_seq_quality_df
}
rm(i)
############################### END of loop processing sequences
quality_df <- do.call(rbind, quality_list)
gaps_df <- do.call(rbind, gaps_list)
ambiguity_df <- do.call(rbind, ambs_list)
gaps_df$seq_ID <- quality_df$one_seq_ID
ambiguity_df$seq_ID <- quality_df$one_seq_ID
######################### compare with consensus
################## select consensus -- global
if(consensus_format == "global") {
fasta_alignment_con <- matrix(unlist(sequence_list), ncol = length(sequence_list[[1]]), byrow = TRUE)
if(seqinr_translate) {
translation_matrix_con <- matrix(unlist(translation_list), ncol = length(translation_list[[1]]), byrow = TRUE)
}
}
################## select consensus -- threshold
if(consensus_format == "threshold") {
bp_length <- NULL
seq_name <- NULL
sequences_indexer_df <- data.frame(order = as.numeric(rownames(quality_df)), bp_length = quality_df$nsites_no_gap, seq_name = quality_df$one_seq_ID, stringsAsFactors = FALSE)
# filter with threshold
indexed_of_seq_pass_threshold <- subset(sequences_indexer_df, bp_length >= consensus_threshold)
cat("\n ****** sequences that pass the 'consensus_selection_threshold_value' for bp_length: ", consensus_threshold, "\n" )
print(indexed_of_seq_pass_threshold)
cat(" ****** sequences names\n\n" )
print(indexed_of_seq_pass_threshold$seq_name)
# subset list
selected_list <- list()
counter <- 0
for(k in indexed_of_seq_pass_threshold$order) {
counter <- counter + 1
selected_list[[counter]] <- sequence_list[[k]]
}
rm(k)
# create subset nucleotide alignment
fasta_alignment_con <- matrix(unlist(selected_list), ncol = length(sequence_list[[1]]), byrow = TRUE)
if(seqinr_translate) {
# subset list
selected_list <- list()
counter <- 0
for(k in indexed_of_seq_pass_threshold$order) {
counter <- counter + 1
selected_list[[counter]] <- translation_list[[k]]
}
rm(k)
# create subset aminoacid alignment
translation_matrix_con <- matrix(unlist(selected_list), ncol = length(translation_list[[1]]), byrow = TRUE)
}
}
################## select consensus -- "selected_taxa"
if(consensus_format == "selected_taxa") {
sequences_indexer_df <- data.frame(order = as.numeric(rownames(quality_df)), bp_length = quality_df$nsites_no_gap, seq_name = quality_df$one_seq_ID, stringsAsFactors = FALSE)
# filter with consensus_taxa
indexed_of_seq_pass_threshold <- subset(sequences_indexer_df, seq_name %in% consensus_taxa)
cat("\n ****** sequences that pass the 'consensus_selection_threshold_value' for bp_length: ", consensus_threshold, "\n" )
print(indexed_of_seq_pass_threshold)
cat(" ****** sequences names\n\n" )
print(indexed_of_seq_pass_threshold$seq_name)
# subset list
selected_list <- list()
counter <- 0
for(k in indexed_of_seq_pass_threshold$order) {
counter <- counter + 1
selected_list[[counter]] <- sequence_list[[k]]
}
rm(k)
# create subset nucleotide alignment
fasta_alignment_con <- matrix(unlist(selected_list), ncol = length(sequence_list[[1]]), byrow = TRUE)
if(seqinr_translate) {
# subset list
selected_list <- list()
counter <- 0
for(k in indexed_of_seq_pass_threshold$order) {
counter <- counter + 1
selected_list[[counter]] <- translation_list[[k]]
}
rm(k)
# create subset aminoacid alignment
translation_matrix_con <- matrix(unlist(selected_list), ncol = length(translation_list[[1]]), byrow = TRUE)
}
}
############## nucleotide
nucleotide_consensus_WITH_gap <- seqinr::consensus(fasta_alignment_con, method = "majority")
### consensus with no gap
matali <- as.matrix(fasta_alignment_con)
majority2 <- function(x) names(which.max(table(x)))
consensus_out <- list()
for(m in 1:ncol(matali)) {
# m <- 1
one_column <- matali[,m]
# remove "-"
one_column <- one_column[!one_column == nucleotide_gap_char]
res <- majority2(one_column)
# if NULL then is a gap
if(is.null(res)) { res <- nucleotide_gap_char}
consensus_out[[m]] <- res
}
rm(m)
nucleotide_consensus_NO_gap <- unlist(consensus_out)
if(seqinr_translate) {
aminoacid_consensus_WITH_gap <- seqinr::consensus(translation_matrix_con, method = "majority")
### consensus with no gap
matali <- translation_matrix_con
consensus_out <- list()
for(m in 1:ncol(matali)) {
# m <- 1
one_column <- matali[,m]
# remove "-"
one_column <- one_column[!one_column == aminoacid_gap_char]
res <- majority2(one_column)
# if NULL then is a gap
if(is.null(res)) { res <- aminoacid_gap_char}
consensus_out[[m]] <- res
}
rm(m)
aminoacid_consensus_NO_gap <- unlist(consensus_out)
}
##################### loop to get distance
nucleotide_full_consensus_WITH_gap_distance_col <- list()
nucleotide_full_consensus_NO_gap_distance_col <- list()
nucleotide_one_seq_consensus_WITH_gap_distance_col <- list()
nucleotide_one_seq_consensus_NO_gap_distance_col <- list()
nucleotide_one_seq_total_count_no_gap_col <- list()
aminoacid_full_consensus_WITH_gap_distance_col <- list()
aminoacid_full_consensus_NO_gap_distance_col <- list()
aminoacid_one_seq_consensus_WITH_gap_distance_col <- list()
aminoacid_one_seq_consensus_NO_gap_distance_col <- list()
aminoacid_one_seq_total_count_no_gap_col <- list()
cat("\n***** getting distances of seq: ")
for(i in 1:length(sequence_list)) {
#i <- 1
cat(i,"..")
## get the distance and weighted distance
one_seq_vector <- sequence_list[[i]]
# compare_site_by_site
nucleotide_consensus_WITH_gap_count <- ifelse(one_seq_vector == nucleotide_consensus_WITH_gap, 0, 1)
nucleotide_full_consensus_WITH_gap_distance <- sum(nucleotide_consensus_WITH_gap_count)
nucleotide_consensus_NO_gap_count <- ifelse(one_seq_vector == nucleotide_consensus_NO_gap, 0, 1)
nucleotide_full_consensus_NO_gap_distance <- sum(nucleotide_consensus_NO_gap_count)
#### assign gaps to consensus based in one_seq_vector
# consensus_WITH_gap
nucleotide_consensus_WITH_gap_to_one_seq <- nucleotide_consensus_WITH_gap
nucleotide_consensus_WITH_gap_to_one_seq[one_seq_vector == nucleotide_gap_char] <- nucleotide_gap_char
nucleotide_consensus_WITH_one_seq <- ifelse(one_seq_vector == nucleotide_consensus_WITH_gap_to_one_seq, 0, 1)
nucleotide_one_seq_consensus_WITH_gap_distance <- sum(nucleotide_consensus_WITH_one_seq)
# consensus_NO_gap
nucleotide_consensus_gap_to_one_seq <- nucleotide_consensus_NO_gap
nucleotide_consensus_gap_to_one_seq[one_seq_vector == nucleotide_gap_char] <- nucleotide_gap_char
nucleotide_consensus_NO_gap_one_seq <- ifelse(one_seq_vector == nucleotide_consensus_gap_to_one_seq, 0, 1)
nucleotide_one_seq_consensus_NO_gap_distance <- sum(nucleotide_consensus_NO_gap_one_seq)
# howmany no gaps in one_seq_vector (i.e., actual nucleotides)
nucleotide_no_gap_count <- ifelse(!one_seq_vector == nucleotide_gap_char, 1, 0)
nucleotide_one_seq_total_count_no_gap <- sum(nucleotide_no_gap_count)
# out data
nucleotide_full_consensus_WITH_gap_distance_col[[i]] <- nucleotide_full_consensus_WITH_gap_distance
nucleotide_full_consensus_NO_gap_distance_col[[i]] <- nucleotide_full_consensus_NO_gap_distance
nucleotide_one_seq_consensus_WITH_gap_distance_col[[i]] <- nucleotide_one_seq_consensus_WITH_gap_distance
nucleotide_one_seq_consensus_NO_gap_distance_col[[i]] <- nucleotide_one_seq_consensus_NO_gap_distance
nucleotide_one_seq_total_count_no_gap_col[[i]] <- nucleotide_one_seq_total_count_no_gap
############## codon -- protein
if(seqinr_translate) {
## get the distance and weighted distance for codons
# i <- 1
one_seq_vector <- translation_list[[i]]
# compare_site_by_site
aminoacid_consensus_WITH_gap_count <- ifelse(one_seq_vector == aminoacid_consensus_WITH_gap, 0, 1)
aminoacid_full_consensus_WITH_gap_distance <- sum(aminoacid_consensus_WITH_gap_count)
aminoacid_consensus_NO_gap_count <- ifelse(one_seq_vector == aminoacid_consensus_NO_gap, 0, 1)
aminoacid_full_consensus_NO_gap_distance <- sum(aminoacid_consensus_NO_gap_count)
#### assign gaps to consensus based in one_seq_vector
# consensus_WITH_gap
aminoacid_consensus_WITH_gap_to_one_seq <- aminoacid_consensus_WITH_gap
aminoacid_consensus_WITH_gap_to_one_seq[one_seq_vector == aminoacid_gap_char] <- aminoacid_gap_char
aminoacid_consensus_WITH_one_seq <- ifelse(one_seq_vector == aminoacid_consensus_WITH_gap_to_one_seq, 0, 1)
aminoacid_one_seq_consensus_WITH_gap_distance <- sum(aminoacid_consensus_WITH_one_seq)
# consensus_NO_gap
aminoacid_consensus_gap_to_one_seq <- aminoacid_consensus_NO_gap
aminoacid_consensus_gap_to_one_seq[one_seq_vector == aminoacid_gap_char] <- aminoacid_gap_char
aminoacid_consensus_NO_gap_one_seq <- ifelse(one_seq_vector == aminoacid_consensus_gap_to_one_seq, 0, 1)
aminoacid_one_seq_consensus_NO_gap_distance <- sum(aminoacid_consensus_NO_gap_one_seq)
# howmany no gaps in one_seq_vector (i.e., actual nucleotides)
aminoacid_no_gap_count <- ifelse(!one_seq_vector == aminoacid_gap_char, 1, 0)
aminoacid_one_seq_total_count_no_gap <- sum(aminoacid_no_gap_count)
# out data
aminoacid_full_consensus_WITH_gap_distance_col[[i]] <- aminoacid_full_consensus_WITH_gap_distance
aminoacid_full_consensus_NO_gap_distance_col[[i]] <- aminoacid_full_consensus_NO_gap_distance
aminoacid_one_seq_consensus_WITH_gap_distance_col[[i]] <- aminoacid_one_seq_consensus_WITH_gap_distance
aminoacid_one_seq_consensus_NO_gap_distance_col[[i]] <- aminoacid_one_seq_consensus_NO_gap_distance
aminoacid_one_seq_total_count_no_gap_col[[i]] <- aminoacid_one_seq_total_count_no_gap
} # tranlate end
} # distance loop end
rm(i)
cat("\n")
# addd to quality_df
quality_df$nucleotide_full_consensus_WITH_gap_distance <- unlist(nucleotide_full_consensus_WITH_gap_distance_col)
quality_df$nucleotide_full_consensus_NO_gap_distance <- unlist(nucleotide_full_consensus_NO_gap_distance_col)
quality_df$nucleotide_adj_lenght_seq_consensus_WITH_gap_distance <- unlist(nucleotide_one_seq_consensus_WITH_gap_distance_col)
quality_df$nucleotide_adj_lenght_seq_consensus_NO_gap_distance <- unlist(nucleotide_one_seq_consensus_NO_gap_distance_col)
quality_df$nucleotide_NO_gap_count <- unlist(nucleotide_one_seq_total_count_no_gap_col)
quality_df$nucleotide_adj_lenght_seq_consensus_WITH_gap_percent_distance_per_total_length <- 100*(quality_df$nucleotide_adj_lenght_seq_consensus_WITH_gap_distance/quality_df$nucleotide_NO_gap_count)
quality_df$nucleotide_adj_lenght_seq_consensus_NO_gap_percent_distance_per_total_length <- 100*(quality_df$nucleotide_adj_lenght_seq_consensus_NO_gap_distance/quality_df$nucleotide_NO_gap_count)
if(seqinr_translate) {
quality_df$aminoacid_full_consensus_WITH_gap_distance <- unlist(aminoacid_full_consensus_WITH_gap_distance_col)
quality_df$aminoacid_full_consensus_NO_gap_distance <- unlist(aminoacid_full_consensus_NO_gap_distance_col)
quality_df$aminoacid_adj_lenght_seq_consensus_WITH_gap_distance <- unlist(aminoacid_one_seq_consensus_WITH_gap_distance_col)
quality_df$aminoacid_adj_lenght_seq_consensus_NO_gap_distance <- unlist(aminoacid_one_seq_consensus_NO_gap_distance_col)
quality_df$aminoacid_NO_gap_count <- unlist(aminoacid_one_seq_total_count_no_gap_col)
quality_df$aminoacid_adj_lenght_seq_consensus_WITH_gap_percent_distance_per_total_length <- 100*(quality_df$aminoacid_adj_lenght_seq_consensus_WITH_gap_distance/quality_df$aminoacid_NO_gap_count)
quality_df$aminoacid_adj_lenght_seq_consensus_NO_gap_percent_distance_per_total_length <- 100*(quality_df$aminoacid_adj_lenght_seq_consensus_NO_gap_distance/quality_df$aminoacid_NO_gap_count)
}
# other indices
quality_df$namb_percent_per_total_length <- 100*(quality_df$nsites_amb/quality_df$nsites_no_gap)
######## write files
# names
quality_name <- paste0(file_out, "_quality.txt")
gaps_name <- paste0(file_out, "_gaps_df.txt")
ambiguity_name <- paste0(file_out, "_ambiguity_df.txt")
utils::write.table(quality_df, file = quality_name, sep = "\t", row.names = FALSE)
utils::write.table(gaps_df, file = gaps_name, sep = "\t", row.names = FALSE)
utils::write.table(ambiguity_df, file = ambiguity_name, sep = "\t", row.names = FALSE)
## if translate
if(seqinr_translate) {
AA_df <- do.call(rbind, AA_list)
AA_df$seq_ID <- quality_df$one_seq_ID
AA_name <- paste0(file_out, "_codons_df.txt")
utils::write.table(AA_df, file = AA_name, sep = "\t", row.names = FALSE)
}
## resturn
if(seqinr_translate) {
out <- list(gaps_df, ambiguity_df, AA_df, quality_df)
} else {
out <- list(gaps_df, ambiguity_df, quality_df)
}
return(out)
}
##########################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.