#' @title Quality check graphics for DNA Barcodes
#'
#' @name alignment_quality_stats_plotter
#'
#' @description Performs quality plots and quality stats
#'
#' @param metadata_file_name_value a data frame of metadata from DNA barcodes (year, sequencing technology).
#' @param ambiguity_file_name_value a data frame of ambiguity of DNA barcodes (see aligned_sequence_matrix_characterizer output)
#' @param codons_file_name_value a data frame of codons of DNA barcodes (see aligned_sequence_matrix_characterizer output)
#' @param gaps_file_name_value a data frame of gaps of DNA barcodes (see aligned_sequence_matrix_characterizer output)
#' @param quality_file_name_value a data frame of quality of DNA barcodes (see aligned_sequence_matrix_characterizer output)
#' @param common_variable_name_meger_value Character. Column name used to merge the metadata, ambiguity, codons, gaps and quality matrices.
#' @param stripplot_quality_var_names_value List. Column names to be included in stripplot (see quality file for complete list of column names)
#' @param stripplot_group_var_names_value Character. plot the whole variable ("GLOBAL") or split for technology ("technology") in violin plot
#' @param stripplot_indicator_var_names_value Character. variables to be included in a subset from the quality file ("one_seq_ID", "Date_submission_NCBI")
#' @param stripplot_mono_color_value points color in the stripplot.
#' @param stripplot_color_gradient_names_value vector. colors option to be used in a gradient stripplot example ("green","black")
#' @param stripplot_color_median_value Color option for th median value point in the stripplot
#' @param stripplot_alpha_value numeric. Alpha value for the stripplot (see ggplot2 options)
#' @param stripplot_location_n_value Numeric. position in y-axis of number of sequence (n) per technology.
#' @param stripplot_pdf_width_height_value Vector. the width and height for output graphic in pdf format.
#'
#' @param global_time_series_var_x_names_value Vector.Variable (year) to be included in a times-series plot as x-axis. Additionally, a category can be included to group the data.
#' @param global_time_series_var_y_con_names_value Vector. Variables to be included in a time-series plot as y-axis
#' @param global_time_series_var_y_dis_names_value Character. a criterion for grouping sequence under same condition example: "technology"
#' @param global_plot_time_series_x_var_name character. Variable to be included in time series analysis
#' @param global_plot_time_series_group_var_name character. variable name to group under same color in the plot.
#' @param global_plot_time_series_ggsci_or_user_color_names_value Vector. Color used for colorRampPalette.
#' @param global_plot_pdf_width_height_value Vector. the width and height for output graphic in pdf format.
#' @param global_plot_time_series_alpha_value numeric. Alpha value for the time-series (see ggplot2 options)
#'
#' @param per_site_plot_var_y_names_value option ("Date_submission_NCBI", "technology")
#' @param per_site_transform_gap_count_value transformation of count gap value. Example: "sqrt"
#' @param per_site_transform_codon_count_value transformation of count codon value. Example: "sqrt"
#' @param per_site_transform_ambiguity_count_value transformation of count ambiguity value. Example: "sqrt"
#' @param per_site_plot_color_gap_names_value Color option for count gaps plot.
#' @param per_site_plot_color_codon_names_value Color option for count codon plot.
#' @param per_site_plot_color_ambiguity_names_value Color option for count ambiguity plot.
#' @param per_site_add_label_every_n_position_value numeric. break value in the plot.
#' @param per_site_NA_color_value character. Color for NA values
#' @param per_site_plot_alpha_value numeric. Alpha value for the time-series (see ggplot2 options)
#' @param per_site_plot_pdf_width_height_value Vector. the width and height for output graphic in pdf format.
#' @param file_out_user optional prefix name for output files
#' @param out_directory_user output directory path
#'
#' @return A serie of graphics and files with quality stats
#'
#' @examples {
#' COX1_ver_1 <- alignment_quality_stats_plotter (metadata_file_name_value =
#' system.file("extdata","COI_UVI_seq_metadata.txt",package="BarcoR"),
#'ambiguity_file_name_value = system.file("extdata","COI_UVI_ambiguity_df.txt",
#'package="BarcoR"),
#'codons_file_name_value =
#'system.file("extdata","COI_UVI_codons_df.txt",package="BarcoR"),
#'gaps_file_name_value = system.file("extdata","COI_UVI_gaps_df.txt",package="BarcoR"),
#'quality_file_name_value = system.file("extdata","COI_UVI_quality.txt",package="BarcoR"),
#'common_variable_name_meger_value = "one_seq_ID",
#'stripplot_quality_var_names_value = c("nucleotide_adj_lenght_seq_consensus_NO_gap_percent_distance_per_total_length", "aminoacid_adj_lenght_seq_consensus_NO_gap_percent_distance_per_total_length", "namb_percent_per_total_length"),
#'stripplot_group_var_names_value = c("GLOBAL", "technology"),
#'stripplot_indicator_var_names_value = c("one_seq_ID", "Date_submission_NCBI"),
#'stripplot_mono_color_value = "gray25",
#'stripplot_color_gradient_names_value = NULL,
#'stripplot_color_median_value = "red",
#'stripplot_alpha_value = 0.6,
#'stripplot_location_n_value = 10,
#'stripplot_pdf_width_height_value = c(7,5),
#'global_time_series_var_x_names_value = c("Date_submission_NCBI", "technology"),
#'global_time_series_var_y_con_names_value = c("nsites_no_gap","nsites_amb","nimcomplete_codons","ncodons_no_X"),
#'global_time_series_var_y_dis_names_value = NULL,
#'global_plot_time_series_x_var_name = "Date_submission_NCBI",
#'global_plot_time_series_group_var_name = "technology",
#'global_plot_time_series_ggsci_or_user_color_names_value = c("#41B15A", "#289DAE", "#FB1F15"),
#'global_plot_time_series_alpha_value = 0.6,
#'global_plot_pdf_width_height_value = c(9,5),
#'per_site_plot_var_y_names_value = c("Date_submission_NCBI", "technology"),
#'per_site_transform_gap_count_value = "sqrt",
#'per_site_transform_codon_count_value = "sqrt",
#'per_site_transform_ambiguity_count_value = NULL,
#'per_site_plot_color_gap_names_value = c("viridis", "A"),
#'per_site_plot_color_codon_names_value = c("viridis", "C"),
#'per_site_plot_color_ambiguity_names_value = c("white","blue","red"),
#'per_site_add_label_every_n_position_value = 250,
#'per_site_NA_color_value = "gray25",
#'per_site_plot_alpha_value = 1,
#'per_site_plot_pdf_width_height_value = c(9,5),
#'file_out_user = "COX1_ver1",
#'out_directory_user = NULL)
#'}
#'
#' @export
#'
#' @importFrom seqinr "read.fasta" "translate" "consensus"
#' @importFrom stringr "str_detect"
#' @importFrom utils "read.table" "write.table" "globalVariables"
#' @importFrom dplyr "full_join" "count" "bind_rows" "mutate" "%>%"
#' @importFrom ggplot2 "ggplot" "ggtitle" "rel" "element_text" "scale_y_discrete" "scale_y_continuous" "geom_violin" "geom_jitter" "scale_colour_gradientn" "aes" "coord_flip" "stat_summary" "theme_classic" "geom_text" "geom_point" "xlab" "ylab" "labs" "scale_color_manual" "scale_x_continuous" "theme" "element_blank" "element_text" "element_rect"
#' @importFrom grDevices "pdf" "colorRampPalette" "dev.off"
#' @importFrom reshape2 "melt"
#' @importFrom ggsci "pal_startrek" "pal_lancet" "pal_npg"
#' @importFrom viridis "scale_fill_viridis"
#' @importFrom stats "sd" "median"
alignment_quality_stats_plotter <- function(metadata_file_name_value,
ambiguity_file_name_value,
codons_file_name_value,
gaps_file_name_value,
quality_file_name_value,
common_variable_name_meger_value,
stripplot_quality_var_names_value,
stripplot_group_var_names_value = "GLOBAL",
stripplot_indicator_var_names_value,
stripplot_mono_color_value = "gray27",
stripplot_color_gradient_names_value = NULL,
stripplot_color_median_value = "red",
stripplot_alpha_value = 0.6,
stripplot_location_n_value = 10,
stripplot_pdf_width_height_value = c(7,5),
global_time_series_var_x_names_value,
global_time_series_var_y_con_names_value,
global_time_series_var_y_dis_names_value = NULL,
global_plot_time_series_x_var_name,
global_plot_time_series_group_var_name = NULL,
global_plot_time_series_ggsci_or_user_color_names_value = c("#999999", "#E69F00", "#56B4E9"),
global_plot_pdf_width_height_value = c(7,5),
global_plot_time_series_alpha_value = 0.6,
per_site_plot_var_y_names_value,
per_site_transform_gap_count_value = "sqrt",
per_site_transform_codon_count_value ="sqrt",
per_site_transform_ambiguity_count_value = NULL,
per_site_plot_color_gap_names_value = c("viridis", "A"),
per_site_plot_color_codon_names_value = c("viridis", "C"),
per_site_plot_color_ambiguity_names_value = c("white","blue","red"),
per_site_add_label_every_n_position_value = 250,
per_site_NA_color_value = "white",
per_site_plot_alpha_value = 1,
per_site_plot_pdf_width_height_value = c(7,5),
file_out_user = NULL,
out_directory_user = NULL){
# user input
metadata_file_name <- metadata_file_name_value
ambiguity_file_name <- ambiguity_file_name_value
codons_file_name <- codons_file_name_value
gaps_file_name <- gaps_file_name_value
quality_file_name <- quality_file_name_value
common_variable_meger <- common_variable_name_meger_value
stripplot_quality_names <- stripplot_quality_var_names_value
stripplot_group_names <- stripplot_group_var_names_value
stripplot_indicator <- stripplot_indicator_var_names_value
stripplot_gradient <- stripplot_color_gradient_names_value
stripplot_alpha <- stripplot_alpha_value
stripplot_color <- stripplot_mono_color_value
stripplot_color_median <- stripplot_color_median_value
stripplot_location_n <- stripplot_location_n_value
strip_width <- stripplot_pdf_width_height_value[1]
strip_height <- stripplot_pdf_width_height_value[2]
global_TS_x_names <- global_time_series_var_x_names_value
global_TS_y_names <- global_time_series_var_y_con_names_value
global_TS_y_dis_names <- global_time_series_var_y_dis_names_value
global_TS_plot_x <- global_plot_time_series_x_var_name
global_TS_plot_group <- global_plot_time_series_group_var_name
global_TS_ggsci <- global_plot_time_series_ggsci_or_user_color_names_value
global_TS_alpha <- global_plot_time_series_alpha_value
global_width <- global_plot_pdf_width_height_value[1]
global_height <- global_plot_pdf_width_height_value[2]
per_site_x_names <- per_site_plot_var_y_names_value
transform_gap_count <- per_site_transform_gap_count_value
transform_codon_count <- per_site_transform_codon_count_value
transform_ambiguity_count <- per_site_transform_ambiguity_count_value
per_site_gap_color <- per_site_plot_color_gap_names_value
per_site_ambiguity_color <- per_site_plot_color_ambiguity_names_value
per_site_codon_color <- per_site_plot_color_codon_names_value
brakes_every <- per_site_add_label_every_n_position_value
per_site_NA_color <- per_site_NA_color_value
per_site_width <- per_site_plot_pdf_width_height_value[1]
per_site_height <- per_site_plot_pdf_width_height_value[2]
per_site_alpha <- per_site_plot_alpha_value
file_out <- file_out_user
## outdir arguments
if(is.null(out_directory_user)) {
out_directory <- getwd()
} else {
setwd(out_directory_user)
out_directory <- getwd()
}
# See viridis options
viridis_option <- data.frame(name_viridis = c("magma", "inferno", "plasma", "viridis", "cividis", "rocket", "mako", "turbo"),
options = c("A", "B", "C", "D", "E", "F", "G", "H"))
print(viridis_option)
# genetic code info
metadata_df <- utils::read.table(metadata_file_name, header=T,sep="\t",stringsAsFactors = FALSE)
ambiguity_df <- utils::read.table(ambiguity_file_name, header=T,sep="\t",stringsAsFactors = FALSE)
codons_df <- utils::read.table(codons_file_name, header=T,sep="\t",stringsAsFactors = FALSE)
gaps_df <- utils::read.table(gaps_file_name, header=T,sep="\t",stringsAsFactors = FALSE)
quality_df <- utils::read.table(quality_file_name, header=T,sep="\t",stringsAsFactors = FALSE)
# standardize name
names(quality_df)[names(quality_df) == common_variable_meger] <- common_variable_meger
names(quality_df)[names(quality_df) == "seq_ID"] <- common_variable_meger
names(ambiguity_df)[names(ambiguity_df) == common_variable_meger] <- common_variable_meger
names(ambiguity_df)[names(ambiguity_df) == "seq_ID"] <- common_variable_meger
names(codons_df)[names(codons_df) == common_variable_meger] <- common_variable_meger
names(codons_df)[names(codons_df) == "seq_ID"] <- common_variable_meger
names(gaps_df)[names(gaps_df) == common_variable_meger] <- common_variable_meger
names(gaps_df)[names(gaps_df) == "seq_ID"] <- common_variable_meger
# merge metadata_df and quality_df
metadata_quality_df <- dplyr::full_join(metadata_df, quality_df, by = common_variable_meger)
metadata_ambiguity_df <- dplyr::full_join(metadata_df, ambiguity_df, by = common_variable_meger)
metadata_codons_df <- dplyr::full_join(metadata_df, codons_df, by = common_variable_meger)
metadata_gaps_df <- dplyr::full_join(metadata_df, gaps_df, by = common_variable_meger)
#################### stripplot graphs
counter <- 0
list_stripcharts <- list()
for(i in 1:length(stripplot_quality_names)) {
# i <- 1
one_stripchart_df <- metadata_quality_df
one_stripchart_df$GLOBAL <- "GLOBAL"
one_stripchart_df <- subset(one_stripchart_df, select = c(stripplot_quality_names[i],stripplot_indicator, stripplot_group_names))
one_stripchart_x_name <- stripplot_quality_names[i]
names(one_stripchart_df)[1] <- "one_strip_var"
for(j in 1:length(stripplot_group_names)) {
counter <- counter + 1
# j <- 1
one_stripchart_df_1 <- subset(one_stripchart_df, select = c("one_strip_var", stripplot_group_names[j]))
names(one_stripchart_df_1) <- c("one_strip_var", "group")
# add number of observations
group <- NULL
n <- NULL
class <- one_stripchart_df_1 %>% dplyr::count(group) %>% dplyr::mutate(label = paste0("n = ", n))
if(is.null(stripplot_gradient)) {
one_strip_var <- NULL
one_stripchart <- ggplot2::ggplot(one_stripchart_df_1, ggplot2::aes(x= group, y = one_strip_var)) +
ggplot2::geom_violin(trim = FALSE, alpha = stripplot_alpha) +
ggplot2::geom_jitter(position = ggplot2::position_jitter(0), shape = 20, cex = 3, alpha = stripplot_alpha , color = stripplot_color) +
ggplot2::coord_flip() +
ggplot2::stat_summary(fun = median, geom = "point", shape = 18, size = 4, color = stripplot_color_median) +
ggplot2::theme_classic() + ggplot2::ylab(one_stripchart_x_name) +
ggplot2::geom_text(ggplot2::aes(y = stripplot_location_n, label = paste0("n = ", n)), data = class)
} else {
one_strip_var <- NULL
one_stripchart <- ggplot2::ggplot(one_stripchart_df_1, ggplot2::aes(x= group, y = one_strip_var)) +
ggplot2::geom_violin(trim = FALSE, alpha = stripplot_alpha) +
ggplot2::geom_jitter(position = ggplot2::position_jitter(0), shape = 20, cex = 3, alpha = stripplot_alpha , ggplot2::aes(color = one_strip_var)) +
ggplot2::scale_colour_gradientn(colours = stripplot_gradient) +
ggplot2::coord_flip() +
ggplot2::stat_summary(fun = median, geom = "point", shape = 18, size = 4, color = stripplot_color_median) +
ggplot2::theme_classic() + ggplot2::ylab(one_stripchart_x_name) + labs(color = "") +
geom_text(ggplot2::aes(y = stripplot_location_n, label = paste0("n = ", n)), data = class)
}
list_stripcharts[[counter]] <- one_stripchart
}
rm(j)
}
rm(i)
#################### global time series graphs
stats_TS_list <- list()
for(k in 1:length(global_TS_x_names)) {
# k <- 1
## continuous subset to groups
all_global_xy <- subset(metadata_quality_df, select = c(global_TS_x_names[k],global_TS_y_names))
all_global_xy_list <- split(all_global_xy, all_global_xy[,1])
# calculate by global_TS_x_names
collect_all_xy_list <- list()
for(i in 1:length(all_global_xy_list)) {
# i <- 1
one_global_xy <- all_global_xy_list[[i]]
one_x_ref <- unique(one_global_xy[,1])
one_x_ref_name <- names(one_global_xy)[1]
one_x_ref_N <- nrow(one_global_xy)
one_x_df <- data.frame(place_holder = one_x_ref, N_values = one_x_ref_N, stringsAsFactors = FALSE)
names(one_x_df) <- c(one_x_ref_name, "N_values")
collect_y_list <- list()
for(j in 2:ncol(one_global_xy)) {
# j <- 2
one_y_name <- names(one_global_xy)[j]
one_y_mean <- mean(one_global_xy[,j], na.rm = TRUE)
one_y_sd <- stats::sd(one_global_xy[,j], na.rm = TRUE)
one_y_max <- max(one_global_xy[,j], na.rm = TRUE)
one_y_min <- min(one_global_xy[,j], na.rm = TRUE)
one_y_names <- c("mean","sd","max","min")
one_y_df <- data.frame(one_y_mean, one_y_sd, one_y_max, one_y_min,stringsAsFactors = FALSE)
names(one_y_df) <- one_y_names
one_y_df$property <- one_y_name
collect_y_list[[j]] <- one_y_df
}
rm(j)
collect_y_list <- Filter(Negate(is.null), collect_y_list)
collect_y_df <- do.call(rbind, collect_y_list)
collect_y_df <- merge(one_x_df, collect_y_df)
collect_all_xy_list[[i]] <- collect_y_df
}
rm(i)
collect_all_xy_df <- do.call(rbind,collect_all_xy_list)
######## discrete subset to groups
if(!is.null(global_TS_y_dis_names)) {
all_global_dis_xy <- subset(metadata_quality_df, select = c(global_TS_x_names[k],global_TS_y_dis_names))
all_global_dis_xy_list <- split(all_global_dis_xy, all_global_dis_xy[,1])
# calculate by global_TS_x_names
collect_all_dis_xy_list <- list()
for(i in 1:length(all_global_dis_xy_list)) {
# i <- 1
one_global_xy <- all_global_dis_xy_list[[i]]
one_x_ref <- unique(one_global_xy[,1])
one_x_ref_name <- names(one_global_xy)[1]
one_x_ref_N <- nrow(one_global_xy)
one_x_df <- data.frame(place_holder = one_x_ref, N_values = one_x_ref_N, stringsAsFactors = FALSE)
names(one_x_df) <- c(one_x_ref_name, "N_values")
collect_dis_y_list <- list()
for(j in 2:ncol(one_global_xy)) {
# j <- 2
one_y_name <- names(one_global_xy)[j]
one_y_dis_df0 <- data.frame(unlist(table(one_global_xy[,j])))
one_y_dis_df <- t(one_y_dis_df0[,2:ncol(one_y_dis_df0)])
colnames(one_y_dis_df) <- one_y_dis_df0[,1]
one_y_dis_df <- data.frame(one_y_dis_df)
collect_dis_y_list[[j]] <- one_y_dis_df
}
rm(j)
collect_dis_y_list <- Filter(Negate(is.null), collect_dis_y_list)
collect_dis_y_df <- do.call(rbind, collect_dis_y_list)
collect_y_df <- cbind(one_x_df,collect_dis_y_df)
collect_y_df <- collect_y_df[,-2]
collect_all_dis_xy_list[[i]] <- collect_y_df
}
rm(i)
collect_all_dis_xy_df <- dplyr::bind_rows(collect_all_dis_xy_list)
collect_all_dis_xy_df[is.na(collect_all_dis_xy_df)] <- 0
collect_all_xy_df <- dplyr::full_join(collect_all_xy_df,collect_all_dis_xy_df, by = names(collect_all_xy_df)[1])
} # end discrete
stats_TS_list[[k]] <- collect_all_xy_df
}
rm(k)
#################### global scatterplots graphs
all_global_xy_plot_df <- subset(metadata_quality_df, select = c(global_TS_plot_x,global_TS_plot_group,global_TS_y_names))
all_global_xy_plots_list <- list()
for(i in 1:length(global_TS_y_names)) {
# i <- 1
x_TS_axis <- global_TS_plot_x
group_TS_axis <- global_TS_plot_group
one_y_TS_axis <- global_TS_y_names[i]
one_TS_df <- subset(metadata_quality_df, select = c(x_TS_axis,group_TS_axis,one_y_TS_axis))
if(!is.null(group_TS_axis)) {
names(one_TS_df) <- c("x_axis","group", "y_axis")
n_groups <- length(unique(one_TS_df$group))
if(global_TS_ggsci[1] == "uchicago") { color_groups <- ggsci::pal_ucscgb("default")(n_groups)
} else if(global_TS_ggsci[1] == "startrek") { color_groups <- ggsci::pal_startrek("uniform")(n_groups)
} else if(global_TS_ggsci[1] == "lancet") { color_groups <- ggsci::pal_lancet("lanonc")(n_groups)
} else if(global_TS_ggsci[1] == "npg") { color_groups <- ggsci::pal_npg("nrc")(n_groups)
} else {
color_groups <- grDevices::colorRampPalette(global_TS_ggsci)(n_groups)}
x_axis <- NULL
y_axis <- NULL
one_global_scatter <- ggplot2::ggplot(one_TS_df, ggplot2::aes(x= x_axis, y = y_axis)) +
ggplot2::geom_point(ggplot2::aes(color = group), alpha = global_TS_alpha, size = 3, shape = 20) +
ggplot2::theme_classic() + ggplot2::ylab(one_y_TS_axis) + ggplot2::xlab(x_TS_axis) +
ggplot2::labs(color = group_TS_axis) +
ggplot2::scale_color_manual(values=color_groups) +
ggplot2::scale_x_continuous(breaks = seq(min(one_TS_df$x_axis), max(one_TS_df$x_axis), by = 1)) +
ggplot2::theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
all_global_xy_plots_list[[i]] <- one_global_scatter
} else {
names(one_TS_df) <- c("x_axis", "y_axis")
n_groups <- 1
if(global_TS_ggsci[1] == "uchicago") { color_groups <- ggsci::pal_ucscgb("default")(n_groups)
} else if(global_TS_ggsci[1] == "startrek") { color_groups <- ggsci::pal_startrek("uniform")(n_groups)
} else if(global_TS_ggsci[1] == "lancet") { color_groups <- ggsci::pal_lancet("lanonc")(n_groups)
} else if(global_TS_ggsci[1] == "npg") { color_groups <- ggsci::pal_npg("nrc")(n_groups)
} else {
color_groups <- grDevices::colorRampPalette(global_TS_ggsci)(n_groups)}
x_axis <- NULL
y_axis <- NULL
one_global_scatter <- ggplot2::ggplot(one_TS_df, ggplot2::aes(x= x_axis, y = y_axis)) +
ggplot2::geom_point(alpha = global_TS_alpha, size = 3, shape = 20, color = color_groups) +
ggplot2::theme_classic() + ggplot2::ylab(one_y_TS_axis) + ggplot2::xlab(x_TS_axis) +
ggplot2::labs(color = group_TS_axis) +
ggplot2::scale_x_continuous(breaks = seq(min(one_TS_df$x_axis), max(one_TS_df$x_axis), by = 1)) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1))
all_global_xy_plots_list[[i]] <- one_global_scatter
}
}
rm(i)
#################### per site tileplot
collect_ambiguity_sum_end_list <- list()
collect_codon_sum_end_list <- list()
collect_gap_sum_end_list <- list()
collect_ambiguity_persite_plots_list <- list()
collect_gap_persite_plots_list <- list()
collect_codon_persite_plots_list <- list()
for(k in 1:length(per_site_x_names)) {
# k <- 2
all_ambiguity_names <- names(metadata_ambiguity_df)
all_ambiguity_names <- all_ambiguity_names[grep("A_",all_ambiguity_names)]
one_ambiguity_raw_df <- subset(metadata_ambiguity_df, select = c(per_site_x_names[k],all_ambiguity_names))
one_ambiguity_list <- split(one_ambiguity_raw_df, one_ambiguity_raw_df[,1])
all_codon_names <- names(metadata_codons_df)
all_codon_names <- all_codon_names[grep("codon_",all_codon_names)]
one_codon_raw_df <- subset(metadata_codons_df, select = c(per_site_x_names[k],all_codon_names))
one_codon_list <- split(one_codon_raw_df, one_codon_raw_df[,1])
all_gap_names <- names(metadata_gaps_df)
all_gap_names <- all_gap_names[grep("G_",all_gap_names)]
one_gap_raw_df <- subset(metadata_gaps_df, select = c(per_site_x_names[k],all_gap_names))
one_gap_list <- split(one_gap_raw_df, one_gap_raw_df[,1])
# calculate by tile_sum
collect_ambiguity_sum_list <- list()
collect_codon_ALL_sum_list <- list()
collect_gap_ALL_sum_list <- list()
cat("\n*** calculating sum per sites for -- \n")
for(i in 1:length(one_ambiguity_list)) {
# i <- 1
one_amb_xy <- one_ambiguity_list[[i]]
one_amb_x_ref <- unique(one_amb_xy[,1])
one_amb_x_ref_name <- names(one_amb_xy)[1]
cat(one_amb_x_ref, " -- ")
one_amb_x_ref_N <- nrow(one_amb_xy)
one_amb_x_df <- data.frame(place_holder = one_amb_x_ref, N_values = one_amb_x_ref_N, stringsAsFactors = FALSE)
names(one_amb_x_df) <- c(one_amb_x_ref_name, "N_values")
one_codon_xy <- one_codon_list[[i]]
one_codon_x_ref <- unique(one_codon_xy[,1])
one_codon_x_ref_name <- names(one_codon_xy)[1]
one_codon_x_ref_N <- nrow(one_codon_xy)
one_codon_x_df <- data.frame(place_holder = one_codon_x_ref, N_values = one_codon_x_ref_N, stringsAsFactors = FALSE)
names(one_codon_x_df) <- c(one_codon_x_ref_name, "N_values")
one_gap_xy <- one_gap_list[[i]]
one_gap_x_ref <- unique(one_gap_xy[,1])
one_gap_x_ref_name <- names(one_gap_xy)[1]
one_gap_x_ref_N <- nrow(one_gap_xy)
one_gap_x_df <- data.frame(place_holder = one_gap_x_ref, N_values = one_gap_x_ref_N, stringsAsFactors = FALSE)
names(one_gap_x_df) <- c(one_gap_x_ref_name, "N_values")
collect_amb_sum_list <- list()
collect_codon_sum_list <- list()
collect_gap_sum_list <- list()
for(j in 2:ncol(one_amb_xy)) {
# j <- 2
if(j%%100== 0) cat("*")
one_amb_y_name <- names(one_amb_xy)[j]
one_amb_y_sum <- sum(one_amb_xy[,j], na.rm = TRUE)
one_amb_y_df <- data.frame(one_amb_y_sum,stringsAsFactors = FALSE)
names(one_amb_y_df) <- one_amb_y_name
collect_amb_sum_list[[j]] <- one_amb_y_df
one_gap_y_name <- names(one_gap_xy)[j]
one_gap_y_sum <- sum(one_gap_xy[,j], na.rm = TRUE)
one_gap_y_df <- data.frame(one_gap_y_sum,stringsAsFactors = FALSE)
names(one_gap_y_df) <- one_gap_y_name
collect_gap_sum_list[[j]] <- one_gap_y_df
}
rm(j)
for(j in 2:ncol(one_codon_xy)) {
one_codon_y_name <- names(one_codon_xy)[j]
one_codon_y_sum <- sum(one_codon_xy[,j], na.rm = TRUE)
one_codon_y_df <- data.frame(one_codon_y_sum,stringsAsFactors = FALSE)
names(one_codon_y_df) <- one_codon_y_name
collect_codon_sum_list[[j]] <- one_codon_y_df
}
rm(j)
cat("\n")
collect_amb_sum_list <- Filter(Negate(is.null), collect_amb_sum_list)
collect_amb_y_df <- do.call(cbind, collect_amb_sum_list)
collect_amb_y_df <- merge(one_amb_x_df, collect_amb_y_df)
collect_ambiguity_sum_list[[i]] <- collect_amb_y_df
collect_codon_sum_list <- Filter(Negate(is.null), collect_codon_sum_list)
collect_codon_y_df <- do.call(cbind, collect_codon_sum_list)
collect_codon_y_df <- merge(one_codon_x_df, collect_codon_y_df)
collect_codon_ALL_sum_list[[i]] <- collect_codon_y_df
collect_gap_sum_list <- Filter(Negate(is.null), collect_gap_sum_list)
collect_gap_y_df <- do.call(cbind, collect_gap_sum_list)
collect_gap_y_df <- merge(one_gap_x_df, collect_gap_y_df)
collect_gap_ALL_sum_list[[i]] <- collect_gap_y_df
}
rm(i)
collect_ambiguity_sum_end_df <- do.call(rbind,collect_ambiguity_sum_list)
collect_codon_sum_end_df <- do.call(rbind,collect_codon_ALL_sum_list)
collect_gap_sum_end_df <- do.call(rbind,collect_gap_ALL_sum_list)
collect_ambiguity_sum_end_list[[k]] <- collect_ambiguity_sum_end_df
collect_codon_sum_end_list[[k]] <- collect_codon_sum_end_df
collect_gap_sum_end_list[[k]] <- collect_gap_sum_end_df
######################################### tile plots
########################### gap
# get x var
gap_x_var <- collect_gap_sum_end_df[,1]
# remove x
gap_y_df <- collect_gap_sum_end_df[,-1]
# remove counts
gap_y_df <- gap_y_df[,-1]
# get y melted
gap_persite_df <- reshape2::melt(gap_y_df)
names(gap_persite_df) <- c("ID_site","count")
# add x_var
gap_persite_df$x_var <- gap_x_var
# get legend labels
min_gap_count <- min(gap_persite_df$count)
max_gap_count <- max(gap_persite_df$count)
legend_gap_label <- "count"
legend_gap_breaks <- round(seq(min_gap_count,max_gap_count, length.out = 4), 0)
legend_gap_seq_label <- legend_gap_breaks
# transform "present" by sqrt
if(!is.null(transform_gap_count)) {
if(transform_gap_count == "sqrt") {
gap_persite_df$count <- sqrt(gap_persite_df$count)
min_gap_count <- min(gap_persite_df$count)
max_gap_count <- max(gap_persite_df$count)
legend_gap_label <- "SQRT-count"
legend_gap_breaks <- seq(min_gap_count,max_gap_count, length.out = 4)
legend_gap_seq_label <- round(legend_gap_breaks^2, 0)
}
if(transform_gap_count == "log10") {
gap_persite_df$count <- gap_persite_df$count + 1
gap_persite_df$count <- log10(gap_persite_df$count)
min_gap_count <- min(gap_persite_df$count)
max_gap_count <- max(gap_persite_df$count)
legend_gap_label <- "Log10-count"
legend_gap_breaks <- seq(min_gap_count,max_gap_count, length.out = 4)
legend_gap_seq_label <- round(10^legend_gap_breaks, 0)
}
}
# count sites
ncount_sites <- length(unique(gap_persite_df$ID_site))
ncount_sites_label <- rep(".",ncount_sites)
ncount_sites_label2 <- character()
for(mm in 1:length(ncount_sites_label)) {
ncount_sites_label2[mm] <- ifelse(mm %% brakes_every == 0, paste0(mm," --"), "")
}
ncount_sites_label2[1] <- "1 --"
gap_y_label <- names(collect_gap_sum_end_df)[1]
gap_x_label <- "Site"
## we can plot a heatmap with the expression values
ID_site <- NULL
x_var <- NULL
gap_tile_base <- ggplot2::ggplot(gap_persite_df,ggplot2::aes(x=ID_site,y=x_var))+
ggplot2::geom_tile(ggplot2::aes(fill = count), alpha = per_site_alpha)+
ggplot2::theme_classic(base_size = 11)+
ggplot2::ggtitle("Sequencing Count per Nucleotide Site") +
ggplot2::scale_x_discrete(labels= ncount_sites_label2, expand = c(0, 0)) +
ggplot2:: ylab(gap_y_label) + ggplot2::xlab(gap_x_label) +
ggplot2::theme(axis.text.y = element_text(size=ggplot2::rel(0.9)),
axis.text.x = ggplot2::element_text(size=ggplot2::rel(0.9),angle = 90, vjust = 0.5, hjust = 1.2),
axis.ticks.x = ggplot2::element_blank(),
legend.title = ggplot2::element_text(color = "black", size = 9),
legend.text = ggplot2::element_text(color = "black", size = 9),
panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
panel.background = ggplot2::element_rect(fill = per_site_NA_color, colour = per_site_NA_color, size = 0.5, linetype = "solid"))
if(!class(gap_persite_df$x_var) == "character") {
gap_tile_base <- gap_tile_base + ggplot2::scale_y_continuous(breaks = seq(min(gap_persite_df$x_var), max(gap_persite_df$x_var), by = 1), expand = c(0, 0))
} else {
gap_tile_base <- gap_tile_base + ggplot2::scale_y_discrete(expand = c(0, 0))
}
if(per_site_gap_color[1] == "viridis") {
gap_tile_base <- gap_tile_base + viridis::scale_fill_viridis(option=per_site_gap_color[2], name = legend_gap_label, breaks = legend_gap_breaks, labels = legend_gap_seq_label)
} else {
gap_tile_base <- gap_tile_base + ggplot2::scale_fill_gradientn( colours = per_site_gap_color, name = legend_gap_label, breaks = legend_gap_breaks, labels = legend_gap_seq_label)
}
collect_gap_persite_plots_list[[k]] <- gap_tile_base
########################### gap END
########################### codon
# get x var
codon_x_var <- collect_codon_sum_end_df[,1]
# remove x
codon_y_df <- collect_codon_sum_end_df[,-1]
# remove counts
codon_y_df <- codon_y_df[,-1]
# get y melted
codon_persite_df <- reshape2::melt(codon_y_df)
names(codon_persite_df) <- c("ID_site","count")
# add x_var
codon_persite_df$x_var <- codon_x_var
# get legend labels
min_codon_count <- min(codon_persite_df$count)
max_codon_count <- max(codon_persite_df$count)
legend_codon_label <- "count"
legend_codon_breaks <- round(seq(min_codon_count,max_codon_count, length.out = 4), 0)
legend_codon_seq_label <- legend_codon_breaks
# transform "present" by sqrt
if(!is.null(transform_codon_count)) {
if(transform_codon_count == "sqrt") {
codon_persite_df$count <- sqrt(codon_persite_df$count)
min_codon_count <- min(codon_persite_df$count)
max_codon_count <- max(codon_persite_df$count)
legend_codon_label <- "SQRT-count"
legend_codon_breaks <- seq(min_codon_count,max_codon_count, length.out = 4)
legend_codon_seq_label <- round(legend_codon_breaks^2, 0)
}
if(transform_codon_count == "log10") {
codon_persite_df$count <- codon_persite_df$count + 1
codon_persite_df$count <- log10(codon_persite_df$count)
min_codon_count <- min(codon_persite_df$count)
max_codon_count <- max(codon_persite_df$count)
legend_codon_label <- "Log10-count"
legend_codon_breaks <- seq(min_codon_count,max_codon_count, length.out = 4)
legend_codon_seq_label <- round(10^legend_codon_breaks, 0)
}
}
# count sites
ncount_sites <- length(unique(codon_persite_df$ID_site))
ncount_sites_label <- rep(".",ncount_sites)
ncount_sites_label2 <- character()
for(mm in 1:length(ncount_sites_label)) {
ncount_sites_label2[mm] <- ifelse(mm %% brakes_every == 0, paste0(mm," --"), "")
}
ncount_sites_label2[1] <- "1 --"
codon_y_label <- names(collect_codon_sum_end_df)[1]
codon_x_label <- "Site"
## we can plot a heatmap with the expression values
ID_site <- NULL
x_var <- NULL
codon_tile_base <- ggplot2::ggplot(codon_persite_df,ggplot2::aes(x=ID_site,y=x_var))+
ggplot2::geom_tile(ggplot2::aes(fill = count), alpha = per_site_alpha)+
ggplot2::theme_classic(base_size = 11)+
ggplot2::ggtitle("Sequencing Count per Codon Site") +
ggplot2::scale_x_discrete(labels= ncount_sites_label2, expand = c(0, 0)) +
ggplot2::ylab(codon_y_label) + ggplot2::xlab(codon_x_label) +
ggplot2::theme(axis.text.y = element_text(size=ggplot2::rel(0.9)),
axis.text.x = ggplot2::element_text(size=ggplot2::rel(0.9),angle = 90, vjust = 0.5, hjust = 1.2),
axis.ticks.x = ggplot2::element_blank(),
legend.title = ggplot2::element_text(color = "black", size = 9),
legend.text = ggplot2::element_text(color = "black", size = 9),
panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
panel.background = ggplot2::element_rect(fill = per_site_NA_color, colour = per_site_NA_color, size = 0.5, linetype = "solid"))
if(!class(codon_persite_df$x_var) == "character") {
codon_tile_base <- codon_tile_base + ggplot2::scale_y_continuous(breaks = seq(min(codon_persite_df$x_var), max(codon_persite_df$x_var), by = 1), expand = c(0, 0))
} else {
codon_tile_base <- codon_tile_base + ggplot2::scale_y_discrete(expand = c(0, 0))
}
if(per_site_codon_color[1] == "viridis") {
codon_tile_base <- codon_tile_base + viridis::scale_fill_viridis(option=per_site_codon_color[2], name = legend_codon_label, breaks = legend_codon_breaks, labels = legend_codon_seq_label)
} else {
codon_tile_base <- codon_tile_base + ggplot2::scale_fill_gradientn( colours = per_site_codon_color, name = legend_codon_label, breaks = legend_codon_breaks, labels = legend_codon_seq_label)
}
collect_codon_persite_plots_list[[k]] <- codon_tile_base
########################### codon END
########################### ambiguity
# get x var
ambiguity_x_var <- collect_ambiguity_sum_end_df[,1]
# remove x
ambiguity_y_df <- collect_ambiguity_sum_end_df[,-1]
# remove counts
ambiguity_y_df <- ambiguity_y_df[,-1]
# get y melted
ambiguity_persite_df <- melt(ambiguity_y_df)
names(ambiguity_persite_df) <- c("ID_site","count")
# add x_var
ambiguity_persite_df$x_var <- ambiguity_x_var
# get legend labels
min_ambiguity_count <- min(ambiguity_persite_df$count)
max_ambiguity_count <- max(ambiguity_persite_df$count)
legend_ambiguity_label <- "count"
legend_ambiguity_breaks <- round(seq(min_ambiguity_count,max_ambiguity_count, length.out = 4), 0)
legend_ambiguity_seq_label <- legend_ambiguity_breaks
# transform "present" by sqrt
if(!is.null(transform_ambiguity_count)) {
if(transform_ambiguity_count == "sqrt") {
ambiguity_persite_df$count <- sqrt(ambiguity_persite_df$count)
min_ambiguity_count <- min(ambiguity_persite_df$count)
max_ambiguity_count <- max(ambiguity_persite_df$count)
legend_ambiguity_label <- "SQRT-count"
legend_ambiguity_breaks <- seq(min_ambiguity_count,max_ambiguity_count, length.out = 4)
legend_ambiguity_seq_label <- round(legend_ambiguity_breaks^2, 0)
}
if(transform_ambiguity_count == "log10") {
ambiguity_persite_df$count <- ambiguity_persite_df$count + 1
ambiguity_persite_df$count <- log10(ambiguity_persite_df$count)
min_ambiguity_count <- min(ambiguity_persite_df$count)
max_ambiguity_count <- max(ambiguity_persite_df$count)
legend_ambiguity_label <- "Log10-count"
legend_ambiguity_breaks <- seq(min_ambiguity_count,max_ambiguity_count, length.out = 4)
legend_ambiguity_seq_label <- round(10^legend_ambiguity_breaks, 0)
}
}
# count sites
ncount_sites <- length(unique(ambiguity_persite_df$ID_site))
ncount_sites_label <- rep(".",ncount_sites)
ncount_sites_label2 <- character()
for(mm in 1:length(ncount_sites_label)) {
ncount_sites_label2[mm] <- ifelse(mm %% brakes_every == 0, paste0(mm," --"), "")
}
ncount_sites_label2[1] <- "1 --"
ambiguity_y_label <- names(collect_ambiguity_sum_end_df)[1]
ambiguity_x_label <- "Site"
## we can plot a heatmap with the expression values
ID_site <- NULL
x_var <- NULL
ambiguity_tile_base <- ggplot2::ggplot(ambiguity_persite_df,ggplot2::aes(x=ID_site,y=x_var))+
ggplot2::geom_tile(ggplot2::aes(fill = count), alpha = per_site_alpha)+
ggplot2::theme_classic(base_size = 11)+
ggplot2::ggtitle("Nucleotide Ambiguity per Site") +
ggplot2::scale_x_discrete(labels= ncount_sites_label2, expand = c(0, 0)) +
ggplot2::ylab(ambiguity_y_label) + ggplot2::xlab(ambiguity_x_label) +
ggplot2::theme(axis.text.y = element_text(size=ggplot2::rel(0.9)),
axis.text.x = ggplot2::element_text(size=ggplot2::rel(0.9),angle = 90, vjust = 0.5, hjust = 1.2),
axis.ticks.x = ggplot2::element_blank(),
legend.title = ggplot2::element_text(color = "black", size = 9),
legend.text = ggplot2::element_text(color = "black", size = 9),
panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
panel.background = ggplot2::element_rect(fill = per_site_NA_color, colour = per_site_NA_color, size = 0.5, linetype = "solid"))
if(!class(ambiguity_persite_df$x_var) == "character") {
ambiguity_tile_base <- ambiguity_tile_base + ggplot2::scale_y_continuous(breaks = seq(min(ambiguity_persite_df$x_var), max(ambiguity_persite_df$x_var), by = 1), expand = c(0, 0))
} else {
ambiguity_tile_base <- ambiguity_tile_base + ggplot2::scale_y_discrete(expand = c(0, 0))
}
if(per_site_ambiguity_color[1] == "viridis") {
ambiguity_tile_base <- ambiguity_tile_base + viridis::scale_fill_viridis(option=per_site_ambiguity_color[2], name = legend_ambiguity_label, breaks = legend_ambiguity_breaks, labels = legend_ambiguity_seq_label)
} else {
ambiguity_tile_base <- ambiguity_tile_base + ggplot2::scale_fill_gradientn( colours = per_site_ambiguity_color, name = legend_ambiguity_label, breaks = legend_ambiguity_breaks, labels = legend_ambiguity_seq_label)
}
collect_ambiguity_persite_plots_list[[k]] <- ambiguity_tile_base
########################### ambiguity END
} # END of k
############################## collect and save as PDFs
stripchart_name <- paste0(file_out,"_stripcharts.pdf")
global_plots_name <- paste0(file_out,"_global_plots.pdf")
gap_plots_name <- paste0(file_out,"_per_site_gap_plots.pdf")
codon_plots_name <- paste0(file_out,"_per_site_codon_plots.pdf")
ambiguity_plots_name <- paste0(file_out,"_per_site_ambiguity_plots.pdf")
grDevices::pdf(stripchart_name, width = strip_width, height = strip_height, onefile = TRUE)
for (i in seq(length(list_stripcharts))) { print(list_stripcharts)}; rm(i)
grDevices::dev.off()
grDevices::pdf(global_plots_name, width = global_width, height = global_height, onefile = TRUE)
for (i in seq(length(all_global_xy_plots_list))) { print(all_global_xy_plots_list)}; rm(i)
grDevices::dev.off()
grDevices::pdf(gap_plots_name, width = per_site_width, height = per_site_height, onefile = TRUE)
for (i in seq(length(collect_gap_persite_plots_list))) { print(collect_gap_persite_plots_list)}; rm(i)
grDevices::dev.off()
grDevices::pdf(codon_plots_name, width = per_site_width, height = per_site_height, onefile = TRUE)
for (i in seq(length(collect_codon_persite_plots_list))) { print(collect_codon_persite_plots_list)}; rm(i)
grDevices::dev.off()
grDevices::pdf(ambiguity_plots_name, width = per_site_width, height = per_site_height, onefile = TRUE)
for (i in seq(length(collect_ambiguity_persite_plots_list))) { print(collect_ambiguity_persite_plots_list)}; rm(i)
grDevices::dev.off()
## save as tables
###### TS names
for(i in 1:length(global_TS_x_names)) {
one_TS_name <- paste0(file_out, "_",global_TS_x_names[i], "_global_table.txt")
utils::write.table(stats_TS_list[i], file = one_TS_name, sep = "\t", row.names = FALSE)
}
rm(i)
###### persite names
for(i in 1:length(per_site_x_names)) {
one_ambiguity_name <- paste0(file_out, "_",per_site_x_names[i], "_ambiguity_table.txt")
one_codon_name <- paste0(file_out, "_",per_site_x_names[i], "_codon_table.txt")
one_gap_name <- paste0(file_out, "_",per_site_x_names[i], "_gap_table.txt")
utils::write.table(collect_ambiguity_sum_end_list[i], file = one_ambiguity_name, sep = "\t", row.names = FALSE)
utils::write.table(collect_codon_sum_end_list[i], file = one_codon_name, sep = "\t", row.names = FALSE)
utils::write.table(collect_gap_sum_end_list[i], file = one_gap_name, sep = "\t", row.names = FALSE)
}
rm(i)
# to return
list_of_plots <- list(plots = c(list_stripcharts,all_global_xy_plots_list, collect_gap_persite_plots_list, collect_codon_persite_plots_list, collect_ambiguity_persite_plots_list),
tables = c(stats_TS_list, collect_ambiguity_sum_end_list, collect_codon_sum_end_list, collect_gap_sum_end_list))
return(list_of_plots)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.