R/alignment_quality_stats_plotter.R

Defines functions alignment_quality_stats_plotter

Documented in alignment_quality_stats_plotter

#' @title Quality check graphics for DNA Barcodes
#'
#' @name alignment_quality_stats_plotter
#'
#' @description Performs quality plots and quality stats
#'
#' @param metadata_file_name_value a data frame of metadata from DNA barcodes (year, sequencing technology).
#' @param ambiguity_file_name_value a data frame of ambiguity of DNA barcodes (see aligned_sequence_matrix_characterizer output)
#' @param codons_file_name_value a data frame of codons of DNA barcodes (see aligned_sequence_matrix_characterizer output)
#' @param gaps_file_name_value a data frame of gaps of DNA barcodes (see aligned_sequence_matrix_characterizer output)
#' @param quality_file_name_value a data frame of quality of DNA barcodes (see aligned_sequence_matrix_characterizer output)
#' @param common_variable_name_meger_value Character. Column name used to merge the metadata, ambiguity, codons, gaps and quality matrices.

#' @param stripplot_quality_var_names_value List. Column names to be included in stripplot (see quality file for complete list of column names)
#' @param stripplot_group_var_names_value Character. plot the whole variable ("GLOBAL") or split for technology ("technology") in violin plot
#' @param stripplot_indicator_var_names_value Character. variables to be included in a subset from the quality file  ("one_seq_ID", "Date_submission_NCBI")
#' @param stripplot_mono_color_value points color in the stripplot.
#' @param stripplot_color_gradient_names_value vector. colors option to be used in a gradient stripplot example ("green","black")
#' @param stripplot_color_median_value Color option for th median value point in the stripplot
#' @param stripplot_alpha_value numeric. Alpha value for the stripplot (see ggplot2 options)
#' @param stripplot_location_n_value Numeric. position in y-axis of number of sequence (n) per technology.
#' @param stripplot_pdf_width_height_value Vector. the width and height for output graphic in pdf format.
#'
#' @param global_time_series_var_x_names_value Vector.Variable (year) to be included in a times-series plot as x-axis. Additionally, a category can be included to group the data.
#' @param global_time_series_var_y_con_names_value Vector. Variables to be included in a time-series plot as y-axis
#' @param global_time_series_var_y_dis_names_value Character. a criterion for grouping sequence under same condition example: "technology"
#' @param global_plot_time_series_x_var_name character. Variable to be included in time series analysis
#' @param global_plot_time_series_group_var_name character. variable name to group under same color in the plot.
#' @param global_plot_time_series_ggsci_or_user_color_names_value Vector. Color used for colorRampPalette.
#' @param global_plot_pdf_width_height_value Vector. the width and height for output graphic in pdf format.
#' @param global_plot_time_series_alpha_value numeric. Alpha value for the time-series (see ggplot2 options)
#'
#' @param per_site_plot_var_y_names_value option ("Date_submission_NCBI", "technology")
#' @param per_site_transform_gap_count_value transformation of count gap value. Example: "sqrt"
#' @param per_site_transform_codon_count_value transformation of count codon value. Example: "sqrt"
#' @param per_site_transform_ambiguity_count_value transformation of count ambiguity value. Example: "sqrt"
#' @param per_site_plot_color_gap_names_value Color option for count gaps plot.
#' @param per_site_plot_color_codon_names_value Color option for count codon plot.
#' @param per_site_plot_color_ambiguity_names_value Color option for count ambiguity plot.
#' @param per_site_add_label_every_n_position_value numeric. break value in the plot.
#' @param per_site_NA_color_value character. Color for NA values
#' @param per_site_plot_alpha_value numeric. Alpha value for the time-series (see ggplot2 options)
#' @param per_site_plot_pdf_width_height_value Vector. the width and height for output graphic in pdf format.
#' @param file_out_user optional prefix name for output files
#' @param out_directory_user output directory path
#'
#' @return A serie of graphics and files with quality stats
#'

#' @examples {
#' COX1_ver_1 <- alignment_quality_stats_plotter (metadata_file_name_value =
#' system.file("extdata","COI_UVI_seq_metadata.txt",package="BarcoR"),
#'ambiguity_file_name_value = system.file("extdata","COI_UVI_ambiguity_df.txt",
#'package="BarcoR"),
#'codons_file_name_value =
#'system.file("extdata","COI_UVI_codons_df.txt",package="BarcoR"),
#'gaps_file_name_value = system.file("extdata","COI_UVI_gaps_df.txt",package="BarcoR"),
#'quality_file_name_value = system.file("extdata","COI_UVI_quality.txt",package="BarcoR"),
#'common_variable_name_meger_value = "one_seq_ID",

#'stripplot_quality_var_names_value = c("nucleotide_adj_lenght_seq_consensus_NO_gap_percent_distance_per_total_length", "aminoacid_adj_lenght_seq_consensus_NO_gap_percent_distance_per_total_length", "namb_percent_per_total_length"),
#'stripplot_group_var_names_value = c("GLOBAL", "technology"),
#'stripplot_indicator_var_names_value = c("one_seq_ID", "Date_submission_NCBI"),
#'stripplot_mono_color_value = "gray25",
#'stripplot_color_gradient_names_value = NULL,
#'stripplot_color_median_value = "red",
#'stripplot_alpha_value = 0.6,
#'stripplot_location_n_value = 10,
#'stripplot_pdf_width_height_value = c(7,5),


#'global_time_series_var_x_names_value = c("Date_submission_NCBI", "technology"),
#'global_time_series_var_y_con_names_value = c("nsites_no_gap","nsites_amb","nimcomplete_codons","ncodons_no_X"),
#'global_time_series_var_y_dis_names_value = NULL,
#'global_plot_time_series_x_var_name = "Date_submission_NCBI",
#'global_plot_time_series_group_var_name = "technology",
#'global_plot_time_series_ggsci_or_user_color_names_value = c("#41B15A", "#289DAE", "#FB1F15"),
#'global_plot_time_series_alpha_value = 0.6,
#'global_plot_pdf_width_height_value = c(9,5),

#'per_site_plot_var_y_names_value = c("Date_submission_NCBI", "technology"),
#'per_site_transform_gap_count_value = "sqrt",
#'per_site_transform_codon_count_value = "sqrt",
#'per_site_transform_ambiguity_count_value = NULL,
#'per_site_plot_color_gap_names_value = c("viridis", "A"),
#'per_site_plot_color_codon_names_value = c("viridis", "C"),
#'per_site_plot_color_ambiguity_names_value = c("white","blue","red"),
#'per_site_add_label_every_n_position_value = 250,
#'per_site_NA_color_value = "gray25",
#'per_site_plot_alpha_value = 1,
#'per_site_plot_pdf_width_height_value = c(9,5),
#'file_out_user = "COX1_ver1",
#'out_directory_user = NULL)
#'}
#'
#' @export
#'
#' @importFrom seqinr "read.fasta" "translate" "consensus"
#' @importFrom stringr "str_detect"
#' @importFrom utils "read.table" "write.table" "globalVariables"
#' @importFrom dplyr "full_join" "count" "bind_rows" "mutate" "%>%"
#' @importFrom ggplot2 "ggplot" "ggtitle" "rel" "element_text" "scale_y_discrete" "scale_y_continuous" "geom_violin" "geom_jitter" "scale_colour_gradientn" "aes" "coord_flip" "stat_summary" "theme_classic" "geom_text" "geom_point" "xlab" "ylab" "labs" "scale_color_manual" "scale_x_continuous" "theme" "element_blank" "element_text" "element_rect"
#' @importFrom grDevices "pdf" "colorRampPalette" "dev.off"
#' @importFrom reshape2 "melt"
#' @importFrom ggsci "pal_startrek" "pal_lancet" "pal_npg"
#' @importFrom viridis "scale_fill_viridis"
#' @importFrom stats "sd" "median"



alignment_quality_stats_plotter <- function(metadata_file_name_value,
                                            ambiguity_file_name_value,
                                            codons_file_name_value,
                                            gaps_file_name_value,
                                            quality_file_name_value,
                                            common_variable_name_meger_value,
                                            stripplot_quality_var_names_value,
                                            stripplot_group_var_names_value = "GLOBAL",
                                            stripplot_indicator_var_names_value,
                                            stripplot_mono_color_value = "gray27",
                                            stripplot_color_gradient_names_value = NULL,
                                            stripplot_color_median_value = "red",
                                            stripplot_alpha_value = 0.6,
                                            stripplot_location_n_value = 10,
                                            stripplot_pdf_width_height_value = c(7,5),

                                            global_time_series_var_x_names_value,
                                            global_time_series_var_y_con_names_value,
                                            global_time_series_var_y_dis_names_value = NULL,
                                            global_plot_time_series_x_var_name,
                                            global_plot_time_series_group_var_name = NULL,
                                            global_plot_time_series_ggsci_or_user_color_names_value = c("#999999", "#E69F00", "#56B4E9"),
                                            global_plot_pdf_width_height_value = c(7,5),

                                            global_plot_time_series_alpha_value = 0.6,
                                            per_site_plot_var_y_names_value,
                                            per_site_transform_gap_count_value = "sqrt",
                                            per_site_transform_codon_count_value ="sqrt",
                                            per_site_transform_ambiguity_count_value = NULL,
                                            per_site_plot_color_gap_names_value = c("viridis", "A"),
                                            per_site_plot_color_codon_names_value = c("viridis", "C"),
                                            per_site_plot_color_ambiguity_names_value = c("white","blue","red"),
                                            per_site_add_label_every_n_position_value = 250,
                                            per_site_NA_color_value = "white",
                                            per_site_plot_alpha_value = 1,
                                            per_site_plot_pdf_width_height_value = c(7,5),

                                            file_out_user = NULL,
                                            out_directory_user = NULL){

  # user input

  metadata_file_name <- metadata_file_name_value
  ambiguity_file_name <- ambiguity_file_name_value
  codons_file_name <- codons_file_name_value
  gaps_file_name <- gaps_file_name_value
  quality_file_name <- quality_file_name_value

  common_variable_meger <- common_variable_name_meger_value

  stripplot_quality_names <- stripplot_quality_var_names_value
  stripplot_group_names <- stripplot_group_var_names_value
  stripplot_indicator <- stripplot_indicator_var_names_value
  stripplot_gradient <- stripplot_color_gradient_names_value
  stripplot_alpha <- stripplot_alpha_value
  stripplot_color <- stripplot_mono_color_value
  stripplot_color_median <- stripplot_color_median_value
  stripplot_location_n <- stripplot_location_n_value
  strip_width <- stripplot_pdf_width_height_value[1]
  strip_height <- stripplot_pdf_width_height_value[2]


  global_TS_x_names <- global_time_series_var_x_names_value
  global_TS_y_names <- global_time_series_var_y_con_names_value
  global_TS_y_dis_names <- global_time_series_var_y_dis_names_value
  global_TS_plot_x <- global_plot_time_series_x_var_name
  global_TS_plot_group <- global_plot_time_series_group_var_name
  global_TS_ggsci <- global_plot_time_series_ggsci_or_user_color_names_value
  global_TS_alpha <- global_plot_time_series_alpha_value
  global_width <- global_plot_pdf_width_height_value[1]
  global_height <- global_plot_pdf_width_height_value[2]

  per_site_x_names <- per_site_plot_var_y_names_value
  transform_gap_count <- per_site_transform_gap_count_value
  transform_codon_count <- per_site_transform_codon_count_value
  transform_ambiguity_count <- per_site_transform_ambiguity_count_value
  per_site_gap_color <- per_site_plot_color_gap_names_value
  per_site_ambiguity_color <- per_site_plot_color_ambiguity_names_value
  per_site_codon_color <- per_site_plot_color_codon_names_value
  brakes_every <- per_site_add_label_every_n_position_value
  per_site_NA_color <- per_site_NA_color_value
  per_site_width <- per_site_plot_pdf_width_height_value[1]
  per_site_height <- per_site_plot_pdf_width_height_value[2]
  per_site_alpha <- per_site_plot_alpha_value

  file_out <- file_out_user



  ## outdir arguments

  if(is.null(out_directory_user)) {
    out_directory <- getwd()
  } else {
    setwd(out_directory_user)
    out_directory <- getwd()
  }

  # See viridis options

  viridis_option <- data.frame(name_viridis = c("magma", "inferno", "plasma", "viridis", "cividis", "rocket", "mako", "turbo"),
                               options = c("A", "B", "C", "D", "E", "F", "G", "H"))
  print(viridis_option)



  # genetic code info

  metadata_df <- utils::read.table(metadata_file_name, header=T,sep="\t",stringsAsFactors = FALSE)
  ambiguity_df <- utils::read.table(ambiguity_file_name, header=T,sep="\t",stringsAsFactors = FALSE)
  codons_df <- utils::read.table(codons_file_name, header=T,sep="\t",stringsAsFactors = FALSE)
  gaps_df <- utils::read.table(gaps_file_name, header=T,sep="\t",stringsAsFactors = FALSE)
  quality_df <- utils::read.table(quality_file_name, header=T,sep="\t",stringsAsFactors = FALSE)

  # standardize name

  names(quality_df)[names(quality_df) == common_variable_meger] <- common_variable_meger
  names(quality_df)[names(quality_df) == "seq_ID"] <- common_variable_meger

  names(ambiguity_df)[names(ambiguity_df) == common_variable_meger] <- common_variable_meger
  names(ambiguity_df)[names(ambiguity_df) == "seq_ID"] <- common_variable_meger

  names(codons_df)[names(codons_df) == common_variable_meger] <- common_variable_meger
  names(codons_df)[names(codons_df) == "seq_ID"] <- common_variable_meger

  names(gaps_df)[names(gaps_df) == common_variable_meger] <- common_variable_meger
  names(gaps_df)[names(gaps_df) == "seq_ID"] <- common_variable_meger

  # merge metadata_df and quality_df

  metadata_quality_df <- dplyr::full_join(metadata_df, quality_df, by = common_variable_meger)
  metadata_ambiguity_df <- dplyr::full_join(metadata_df, ambiguity_df, by = common_variable_meger)
  metadata_codons_df <- dplyr::full_join(metadata_df, codons_df, by = common_variable_meger)
  metadata_gaps_df <- dplyr::full_join(metadata_df, gaps_df, by = common_variable_meger)

  ####################         stripplot graphs

  counter <- 0
  list_stripcharts <- list()

  for(i in 1:length(stripplot_quality_names)) {

    # i <- 1

    one_stripchart_df <- metadata_quality_df
    one_stripchart_df$GLOBAL <- "GLOBAL"

    one_stripchart_df <- subset(one_stripchart_df, select = c(stripplot_quality_names[i],stripplot_indicator, stripplot_group_names))
    one_stripchart_x_name <- stripplot_quality_names[i]

    names(one_stripchart_df)[1] <- "one_strip_var"

    for(j in 1:length(stripplot_group_names)) {

      counter <- counter + 1

      # j <- 1

      one_stripchart_df_1 <- subset(one_stripchart_df, select = c("one_strip_var", stripplot_group_names[j]))
      names(one_stripchart_df_1) <- c("one_strip_var", "group")

      # add number of observations
      group <- NULL
      n <- NULL
      class <- one_stripchart_df_1 %>% dplyr::count(group) %>% dplyr::mutate(label = paste0("n = ", n))

      if(is.null(stripplot_gradient)) {
        one_strip_var <- NULL
        one_stripchart <- ggplot2::ggplot(one_stripchart_df_1, ggplot2::aes(x= group, y = one_strip_var)) +
          ggplot2::geom_violin(trim = FALSE, alpha = stripplot_alpha) +
          ggplot2::geom_jitter(position = ggplot2::position_jitter(0), shape = 20, cex = 3, alpha = stripplot_alpha , color = stripplot_color) +
          ggplot2::coord_flip() +
          ggplot2::stat_summary(fun = median, geom = "point", shape = 18, size = 4, color = stripplot_color_median) +
          ggplot2::theme_classic() + ggplot2::ylab(one_stripchart_x_name) +
          ggplot2::geom_text(ggplot2::aes(y = stripplot_location_n, label = paste0("n = ", n)), data = class)


      } else {
        one_strip_var <- NULL
        one_stripchart <- ggplot2::ggplot(one_stripchart_df_1, ggplot2::aes(x= group, y = one_strip_var)) +
          ggplot2::geom_violin(trim = FALSE, alpha = stripplot_alpha) +
          ggplot2::geom_jitter(position = ggplot2::position_jitter(0), shape = 20, cex = 3, alpha = stripplot_alpha , ggplot2::aes(color = one_strip_var)) +
          ggplot2::scale_colour_gradientn(colours = stripplot_gradient) +
          ggplot2::coord_flip() +
          ggplot2::stat_summary(fun = median, geom = "point", shape = 18, size = 4, color = stripplot_color_median) +
          ggplot2::theme_classic() + ggplot2::ylab(one_stripchart_x_name) + labs(color = "") +
          geom_text(ggplot2::aes(y = stripplot_location_n, label = paste0("n = ", n)), data = class)

      }


      list_stripcharts[[counter]] <- one_stripchart

    }
    rm(j)
  }
  rm(i)


  ####################         global time series graphs

  stats_TS_list <- list()

  for(k in 1:length(global_TS_x_names)) {

    # k <- 1

    ## continuous subset to groups
    all_global_xy <- subset(metadata_quality_df, select = c(global_TS_x_names[k],global_TS_y_names))
    all_global_xy_list <- split(all_global_xy, all_global_xy[,1])

    # calculate by global_TS_x_names

    collect_all_xy_list <- list()

    for(i in 1:length(all_global_xy_list)) {
      # i <- 1
      one_global_xy <- all_global_xy_list[[i]]
      one_x_ref <- unique(one_global_xy[,1])
      one_x_ref_name <- names(one_global_xy)[1]
      one_x_ref_N <- nrow(one_global_xy)
      one_x_df <- data.frame(place_holder = one_x_ref, N_values = one_x_ref_N, stringsAsFactors = FALSE)
      names(one_x_df) <- c(one_x_ref_name,  "N_values")

      collect_y_list <- list()
      for(j in 2:ncol(one_global_xy)) {
        # j <- 2
        one_y_name <- names(one_global_xy)[j]
        one_y_mean <- mean(one_global_xy[,j], na.rm = TRUE)
        one_y_sd <- stats::sd(one_global_xy[,j], na.rm = TRUE)
        one_y_max <- max(one_global_xy[,j], na.rm = TRUE)
        one_y_min <- min(one_global_xy[,j], na.rm = TRUE)
        one_y_names <- c("mean","sd","max","min")
        one_y_df <- data.frame(one_y_mean, one_y_sd, one_y_max, one_y_min,stringsAsFactors = FALSE)
        names(one_y_df) <- one_y_names
        one_y_df$property <- one_y_name
        collect_y_list[[j]] <- one_y_df
      }
      rm(j)
      collect_y_list <- Filter(Negate(is.null), collect_y_list)
      collect_y_df <- do.call(rbind, collect_y_list)
      collect_y_df <- merge(one_x_df, collect_y_df)

      collect_all_xy_list[[i]] <- collect_y_df
    }
    rm(i)
    collect_all_xy_df <- do.call(rbind,collect_all_xy_list)

    ######## discrete subset to groups

    if(!is.null(global_TS_y_dis_names)) {

      all_global_dis_xy <- subset(metadata_quality_df, select = c(global_TS_x_names[k],global_TS_y_dis_names))
      all_global_dis_xy_list <- split(all_global_dis_xy, all_global_dis_xy[,1])

      # calculate by global_TS_x_names

      collect_all_dis_xy_list <- list()

      for(i in 1:length(all_global_dis_xy_list)) {
        # i <- 1
        one_global_xy <- all_global_dis_xy_list[[i]]
        one_x_ref <- unique(one_global_xy[,1])
        one_x_ref_name <- names(one_global_xy)[1]
        one_x_ref_N <- nrow(one_global_xy)
        one_x_df <- data.frame(place_holder = one_x_ref, N_values = one_x_ref_N, stringsAsFactors = FALSE)
        names(one_x_df) <- c(one_x_ref_name,  "N_values")

        collect_dis_y_list <- list()

        for(j in 2:ncol(one_global_xy)) {
          # j <- 2
          one_y_name <- names(one_global_xy)[j]
          one_y_dis_df0 <- data.frame(unlist(table(one_global_xy[,j])))
          one_y_dis_df <- t(one_y_dis_df0[,2:ncol(one_y_dis_df0)])
          colnames(one_y_dis_df) <- one_y_dis_df0[,1]
          one_y_dis_df <- data.frame(one_y_dis_df)
          collect_dis_y_list[[j]] <- one_y_dis_df
        }
        rm(j)

        collect_dis_y_list <- Filter(Negate(is.null), collect_dis_y_list)
        collect_dis_y_df <- do.call(rbind, collect_dis_y_list)
        collect_y_df <- cbind(one_x_df,collect_dis_y_df)
        collect_y_df <- collect_y_df[,-2]

        collect_all_dis_xy_list[[i]] <- collect_y_df
      }
      rm(i)
      collect_all_dis_xy_df <- dplyr::bind_rows(collect_all_dis_xy_list)
      collect_all_dis_xy_df[is.na(collect_all_dis_xy_df)] <- 0

      collect_all_xy_df <- dplyr::full_join(collect_all_xy_df,collect_all_dis_xy_df, by = names(collect_all_xy_df)[1])
    } # end discrete

    stats_TS_list[[k]] <- collect_all_xy_df

  }
  rm(k)


  ####################         global scatterplots graphs

  all_global_xy_plot_df <- subset(metadata_quality_df, select = c(global_TS_plot_x,global_TS_plot_group,global_TS_y_names))
  all_global_xy_plots_list <- list()

  for(i in 1:length(global_TS_y_names)) {

    # i <- 1

    x_TS_axis <- global_TS_plot_x
    group_TS_axis <- global_TS_plot_group
    one_y_TS_axis <- global_TS_y_names[i]

    one_TS_df <- subset(metadata_quality_df, select = c(x_TS_axis,group_TS_axis,one_y_TS_axis))

    if(!is.null(group_TS_axis)) {

      names(one_TS_df) <- c("x_axis","group", "y_axis")

      n_groups <- length(unique(one_TS_df$group))

      if(global_TS_ggsci[1] == "uchicago") { color_groups <- ggsci::pal_ucscgb("default")(n_groups)
      } else if(global_TS_ggsci[1] == "startrek") { color_groups <- ggsci::pal_startrek("uniform")(n_groups)
      } else if(global_TS_ggsci[1] == "lancet") { color_groups <- ggsci::pal_lancet("lanonc")(n_groups)
      } else if(global_TS_ggsci[1] == "npg") { color_groups <- ggsci::pal_npg("nrc")(n_groups)
      } else {
        color_groups <- grDevices::colorRampPalette(global_TS_ggsci)(n_groups)}
      x_axis <- NULL
      y_axis <- NULL
      one_global_scatter <- ggplot2::ggplot(one_TS_df, ggplot2::aes(x= x_axis, y = y_axis)) +
        ggplot2::geom_point(ggplot2::aes(color = group), alpha = global_TS_alpha, size = 3, shape = 20) +
        ggplot2::theme_classic() + ggplot2::ylab(one_y_TS_axis) + ggplot2::xlab(x_TS_axis) +
        ggplot2::labs(color = group_TS_axis) +
        ggplot2::scale_color_manual(values=color_groups) +
        ggplot2::scale_x_continuous(breaks = seq(min(one_TS_df$x_axis), max(one_TS_df$x_axis), by = 1)) +
        ggplot2::theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

      all_global_xy_plots_list[[i]] <- one_global_scatter

    } else {

      names(one_TS_df) <- c("x_axis", "y_axis")
      n_groups <- 1

      if(global_TS_ggsci[1] == "uchicago") { color_groups <- ggsci::pal_ucscgb("default")(n_groups)
      } else if(global_TS_ggsci[1] == "startrek") { color_groups <- ggsci::pal_startrek("uniform")(n_groups)
      } else if(global_TS_ggsci[1] == "lancet") { color_groups <- ggsci::pal_lancet("lanonc")(n_groups)
      } else if(global_TS_ggsci[1] == "npg") { color_groups <- ggsci::pal_npg("nrc")(n_groups)
      } else {
        color_groups <- grDevices::colorRampPalette(global_TS_ggsci)(n_groups)}

      x_axis <- NULL
      y_axis <- NULL
      one_global_scatter <- ggplot2::ggplot(one_TS_df, ggplot2::aes(x= x_axis, y = y_axis)) +
        ggplot2::geom_point(alpha = global_TS_alpha, size = 3, shape = 20, color = color_groups) +
        ggplot2::theme_classic() + ggplot2::ylab(one_y_TS_axis) + ggplot2::xlab(x_TS_axis) +
        ggplot2::labs(color = group_TS_axis) +
        ggplot2::scale_x_continuous(breaks = seq(min(one_TS_df$x_axis), max(one_TS_df$x_axis), by = 1)) +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1))

      all_global_xy_plots_list[[i]] <- one_global_scatter

    }
  }
  rm(i)

  ####################         per site tileplot

  collect_ambiguity_sum_end_list <- list()
  collect_codon_sum_end_list <- list()
  collect_gap_sum_end_list <- list()
  collect_ambiguity_persite_plots_list <- list()
  collect_gap_persite_plots_list <- list()
  collect_codon_persite_plots_list <- list()


  for(k in 1:length(per_site_x_names)) {
    # k <- 2
    all_ambiguity_names <- names(metadata_ambiguity_df)
    all_ambiguity_names <- all_ambiguity_names[grep("A_",all_ambiguity_names)]
    one_ambiguity_raw_df <- subset(metadata_ambiguity_df, select = c(per_site_x_names[k],all_ambiguity_names))
    one_ambiguity_list <- split(one_ambiguity_raw_df, one_ambiguity_raw_df[,1])

    all_codon_names <- names(metadata_codons_df)
    all_codon_names <- all_codon_names[grep("codon_",all_codon_names)]
    one_codon_raw_df <- subset(metadata_codons_df, select = c(per_site_x_names[k],all_codon_names))
    one_codon_list <- split(one_codon_raw_df, one_codon_raw_df[,1])

    all_gap_names <- names(metadata_gaps_df)
    all_gap_names <- all_gap_names[grep("G_",all_gap_names)]
    one_gap_raw_df <- subset(metadata_gaps_df, select = c(per_site_x_names[k],all_gap_names))
    one_gap_list <- split(one_gap_raw_df, one_gap_raw_df[,1])

    # calculate by tile_sum

    collect_ambiguity_sum_list <- list()
    collect_codon_ALL_sum_list <- list()
    collect_gap_ALL_sum_list <- list()

    cat("\n*** calculating sum per sites for -- \n")

    for(i in 1:length(one_ambiguity_list)) {
      # i <- 1
      one_amb_xy <- one_ambiguity_list[[i]]
      one_amb_x_ref <- unique(one_amb_xy[,1])
      one_amb_x_ref_name <- names(one_amb_xy)[1]
      cat(one_amb_x_ref, " -- ")
      one_amb_x_ref_N <- nrow(one_amb_xy)
      one_amb_x_df <- data.frame(place_holder = one_amb_x_ref, N_values = one_amb_x_ref_N, stringsAsFactors = FALSE)
      names(one_amb_x_df) <- c(one_amb_x_ref_name,  "N_values")

      one_codon_xy <- one_codon_list[[i]]
      one_codon_x_ref <- unique(one_codon_xy[,1])
      one_codon_x_ref_name <- names(one_codon_xy)[1]
      one_codon_x_ref_N <- nrow(one_codon_xy)
      one_codon_x_df <- data.frame(place_holder = one_codon_x_ref, N_values = one_codon_x_ref_N, stringsAsFactors = FALSE)
      names(one_codon_x_df) <- c(one_codon_x_ref_name,  "N_values")

      one_gap_xy <- one_gap_list[[i]]
      one_gap_x_ref <- unique(one_gap_xy[,1])
      one_gap_x_ref_name <- names(one_gap_xy)[1]
      one_gap_x_ref_N <- nrow(one_gap_xy)
      one_gap_x_df <- data.frame(place_holder = one_gap_x_ref, N_values = one_gap_x_ref_N, stringsAsFactors = FALSE)
      names(one_gap_x_df) <- c(one_gap_x_ref_name,  "N_values")

      collect_amb_sum_list <- list()
      collect_codon_sum_list <- list()
      collect_gap_sum_list <- list()

      for(j in 2:ncol(one_amb_xy)) {
        # j <- 2
        if(j%%100== 0) cat("*")
        one_amb_y_name <- names(one_amb_xy)[j]
        one_amb_y_sum <- sum(one_amb_xy[,j], na.rm = TRUE)
        one_amb_y_df <- data.frame(one_amb_y_sum,stringsAsFactors = FALSE)
        names(one_amb_y_df) <- one_amb_y_name
        collect_amb_sum_list[[j]] <- one_amb_y_df

        one_gap_y_name <- names(one_gap_xy)[j]
        one_gap_y_sum <- sum(one_gap_xy[,j], na.rm = TRUE)
        one_gap_y_df <- data.frame(one_gap_y_sum,stringsAsFactors = FALSE)
        names(one_gap_y_df) <- one_gap_y_name
        collect_gap_sum_list[[j]] <- one_gap_y_df
      }
      rm(j)

      for(j in 2:ncol(one_codon_xy)) {
        one_codon_y_name <- names(one_codon_xy)[j]
        one_codon_y_sum <- sum(one_codon_xy[,j], na.rm = TRUE)
        one_codon_y_df <- data.frame(one_codon_y_sum,stringsAsFactors = FALSE)
        names(one_codon_y_df) <- one_codon_y_name
        collect_codon_sum_list[[j]] <- one_codon_y_df
      }
      rm(j)

      cat("\n")
      collect_amb_sum_list <- Filter(Negate(is.null), collect_amb_sum_list)
      collect_amb_y_df <- do.call(cbind, collect_amb_sum_list)
      collect_amb_y_df <- merge(one_amb_x_df, collect_amb_y_df)
      collect_ambiguity_sum_list[[i]] <- collect_amb_y_df

      collect_codon_sum_list <- Filter(Negate(is.null), collect_codon_sum_list)
      collect_codon_y_df <- do.call(cbind, collect_codon_sum_list)
      collect_codon_y_df <- merge(one_codon_x_df, collect_codon_y_df)
      collect_codon_ALL_sum_list[[i]] <- collect_codon_y_df

      collect_gap_sum_list <- Filter(Negate(is.null), collect_gap_sum_list)
      collect_gap_y_df <- do.call(cbind, collect_gap_sum_list)
      collect_gap_y_df <- merge(one_gap_x_df, collect_gap_y_df)
      collect_gap_ALL_sum_list[[i]] <- collect_gap_y_df
    }
    rm(i)

    collect_ambiguity_sum_end_df <- do.call(rbind,collect_ambiguity_sum_list)
    collect_codon_sum_end_df <- do.call(rbind,collect_codon_ALL_sum_list)
    collect_gap_sum_end_df <- do.call(rbind,collect_gap_ALL_sum_list)


    collect_ambiguity_sum_end_list[[k]] <- collect_ambiguity_sum_end_df
    collect_codon_sum_end_list[[k]] <- collect_codon_sum_end_df
    collect_gap_sum_end_list[[k]] <- collect_gap_sum_end_df

    ######################################### tile plots
    ########################### gap

    # get x var
    gap_x_var <- collect_gap_sum_end_df[,1]

    # remove x
    gap_y_df <- collect_gap_sum_end_df[,-1]

    # remove counts
    gap_y_df <- gap_y_df[,-1]

    # get y melted
    gap_persite_df <- reshape2::melt(gap_y_df)
    names(gap_persite_df) <- c("ID_site","count")

    # add x_var
    gap_persite_df$x_var <- gap_x_var

    # get legend labels

    min_gap_count <- min(gap_persite_df$count)
    max_gap_count <- max(gap_persite_df$count)
    legend_gap_label <- "count"
    legend_gap_breaks <- round(seq(min_gap_count,max_gap_count, length.out = 4), 0)
    legend_gap_seq_label <- legend_gap_breaks

    # transform "present" by sqrt

    if(!is.null(transform_gap_count)) {

      if(transform_gap_count == "sqrt") {
        gap_persite_df$count <- sqrt(gap_persite_df$count)
        min_gap_count <- min(gap_persite_df$count)
        max_gap_count <- max(gap_persite_df$count)
        legend_gap_label <- "SQRT-count"
        legend_gap_breaks <- seq(min_gap_count,max_gap_count, length.out = 4)
        legend_gap_seq_label <- round(legend_gap_breaks^2, 0)
      }

      if(transform_gap_count == "log10") {
        gap_persite_df$count <- gap_persite_df$count + 1
        gap_persite_df$count <- log10(gap_persite_df$count)
        min_gap_count <- min(gap_persite_df$count)
        max_gap_count <- max(gap_persite_df$count)
        legend_gap_label <- "Log10-count"
        legend_gap_breaks <- seq(min_gap_count,max_gap_count, length.out = 4)
        legend_gap_seq_label <- round(10^legend_gap_breaks, 0)
      }
    }

    # count sites

    ncount_sites <- length(unique(gap_persite_df$ID_site))
    ncount_sites_label <- rep(".",ncount_sites)

    ncount_sites_label2 <- character()
    for(mm in 1:length(ncount_sites_label)) {
      ncount_sites_label2[mm] <- ifelse(mm %% brakes_every == 0, paste0(mm," --"), "")
    }

    ncount_sites_label2[1] <- "1 --"
    gap_y_label <- names(collect_gap_sum_end_df)[1]
    gap_x_label <- "Site"

    ## we can plot a heatmap with the expression values
    ID_site <- NULL
    x_var <- NULL
    gap_tile_base <- ggplot2::ggplot(gap_persite_df,ggplot2::aes(x=ID_site,y=x_var))+
      ggplot2::geom_tile(ggplot2::aes(fill = count), alpha = per_site_alpha)+
      ggplot2::theme_classic(base_size = 11)+
      ggplot2::ggtitle("Sequencing Count per Nucleotide Site") +
      ggplot2::scale_x_discrete(labels= ncount_sites_label2, expand = c(0, 0)) +
      ggplot2:: ylab(gap_y_label) + ggplot2::xlab(gap_x_label) +
      ggplot2::theme(axis.text.y = element_text(size=ggplot2::rel(0.9)),
            axis.text.x = ggplot2::element_text(size=ggplot2::rel(0.9),angle = 90, vjust = 0.5, hjust = 1.2),
            axis.ticks.x = ggplot2::element_blank(),
            legend.title = ggplot2::element_text(color = "black", size = 9),
            legend.text = ggplot2::element_text(color = "black", size = 9),
            panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
            panel.background = ggplot2::element_rect(fill = per_site_NA_color, colour = per_site_NA_color, size = 0.5, linetype = "solid"))

    if(!class(gap_persite_df$x_var) == "character") {
      gap_tile_base <- gap_tile_base + ggplot2::scale_y_continuous(breaks = seq(min(gap_persite_df$x_var), max(gap_persite_df$x_var), by = 1), expand = c(0, 0))
    } else {
      gap_tile_base <- gap_tile_base + ggplot2::scale_y_discrete(expand = c(0, 0))
    }
    if(per_site_gap_color[1] == "viridis") {
      gap_tile_base <- gap_tile_base + viridis::scale_fill_viridis(option=per_site_gap_color[2], name = legend_gap_label, breaks = legend_gap_breaks, labels = legend_gap_seq_label)
    } else {
      gap_tile_base <- gap_tile_base + ggplot2::scale_fill_gradientn( colours = per_site_gap_color, name = legend_gap_label, breaks = legend_gap_breaks, labels = legend_gap_seq_label)
    }

    collect_gap_persite_plots_list[[k]] <- gap_tile_base

    ########################### gap END

    ########################### codon

    # get x var
    codon_x_var <- collect_codon_sum_end_df[,1]

    # remove x
    codon_y_df <- collect_codon_sum_end_df[,-1]

    # remove counts
    codon_y_df <- codon_y_df[,-1]

    # get y melted
    codon_persite_df <- reshape2::melt(codon_y_df)
    names(codon_persite_df) <- c("ID_site","count")

    # add x_var
    codon_persite_df$x_var <- codon_x_var

    # get legend labels

    min_codon_count <- min(codon_persite_df$count)
    max_codon_count <- max(codon_persite_df$count)
    legend_codon_label <- "count"
    legend_codon_breaks <- round(seq(min_codon_count,max_codon_count, length.out = 4), 0)
    legend_codon_seq_label <- legend_codon_breaks

    # transform "present" by sqrt

    if(!is.null(transform_codon_count)) {

      if(transform_codon_count == "sqrt") {
        codon_persite_df$count <- sqrt(codon_persite_df$count)
        min_codon_count <- min(codon_persite_df$count)
        max_codon_count <- max(codon_persite_df$count)
        legend_codon_label <- "SQRT-count"
        legend_codon_breaks <- seq(min_codon_count,max_codon_count, length.out = 4)
        legend_codon_seq_label <- round(legend_codon_breaks^2, 0)
      }

      if(transform_codon_count == "log10") {
        codon_persite_df$count <- codon_persite_df$count + 1
        codon_persite_df$count <- log10(codon_persite_df$count)
        min_codon_count <- min(codon_persite_df$count)
        max_codon_count <- max(codon_persite_df$count)
        legend_codon_label <- "Log10-count"
        legend_codon_breaks <- seq(min_codon_count,max_codon_count, length.out = 4)
        legend_codon_seq_label <- round(10^legend_codon_breaks, 0)
      }
    }

    # count sites

    ncount_sites <- length(unique(codon_persite_df$ID_site))
    ncount_sites_label <- rep(".",ncount_sites)

    ncount_sites_label2 <- character()
    for(mm in 1:length(ncount_sites_label)) {
      ncount_sites_label2[mm] <- ifelse(mm %% brakes_every == 0, paste0(mm," --"), "")
    }

    ncount_sites_label2[1] <- "1 --"
    codon_y_label <- names(collect_codon_sum_end_df)[1]
    codon_x_label <- "Site"


    ## we can plot a heatmap with the expression values
    ID_site <- NULL
    x_var <- NULL
    codon_tile_base <- ggplot2::ggplot(codon_persite_df,ggplot2::aes(x=ID_site,y=x_var))+
      ggplot2::geom_tile(ggplot2::aes(fill = count), alpha = per_site_alpha)+
      ggplot2::theme_classic(base_size = 11)+
      ggplot2::ggtitle("Sequencing Count per Codon Site") +
      ggplot2::scale_x_discrete(labels= ncount_sites_label2, expand = c(0, 0)) +
      ggplot2::ylab(codon_y_label) + ggplot2::xlab(codon_x_label) +
      ggplot2::theme(axis.text.y = element_text(size=ggplot2::rel(0.9)),
            axis.text.x = ggplot2::element_text(size=ggplot2::rel(0.9),angle = 90, vjust = 0.5, hjust = 1.2),
            axis.ticks.x = ggplot2::element_blank(),
            legend.title = ggplot2::element_text(color = "black", size = 9),
            legend.text = ggplot2::element_text(color = "black", size = 9),
            panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
            panel.background = ggplot2::element_rect(fill = per_site_NA_color, colour = per_site_NA_color, size = 0.5, linetype = "solid"))

    if(!class(codon_persite_df$x_var) == "character") {
      codon_tile_base <- codon_tile_base + ggplot2::scale_y_continuous(breaks = seq(min(codon_persite_df$x_var), max(codon_persite_df$x_var), by = 1), expand = c(0, 0))
    } else {
      codon_tile_base <- codon_tile_base + ggplot2::scale_y_discrete(expand = c(0, 0))
    }

    if(per_site_codon_color[1] == "viridis") {
      codon_tile_base <- codon_tile_base + viridis::scale_fill_viridis(option=per_site_codon_color[2], name = legend_codon_label, breaks = legend_codon_breaks, labels = legend_codon_seq_label)
    } else {
      codon_tile_base <- codon_tile_base + ggplot2::scale_fill_gradientn( colours = per_site_codon_color, name = legend_codon_label, breaks = legend_codon_breaks, labels = legend_codon_seq_label)
    }

    collect_codon_persite_plots_list[[k]] <- codon_tile_base

    ########################### codon END


    ########################### ambiguity

    # get x var
    ambiguity_x_var <- collect_ambiguity_sum_end_df[,1]

    # remove x
    ambiguity_y_df <- collect_ambiguity_sum_end_df[,-1]

    # remove counts
    ambiguity_y_df <- ambiguity_y_df[,-1]

    # get y melted
    ambiguity_persite_df <- melt(ambiguity_y_df)
    names(ambiguity_persite_df) <- c("ID_site","count")

    # add x_var
    ambiguity_persite_df$x_var <- ambiguity_x_var

    # get legend labels

    min_ambiguity_count <- min(ambiguity_persite_df$count)
    max_ambiguity_count <- max(ambiguity_persite_df$count)
    legend_ambiguity_label <- "count"
    legend_ambiguity_breaks <- round(seq(min_ambiguity_count,max_ambiguity_count, length.out = 4), 0)
    legend_ambiguity_seq_label <- legend_ambiguity_breaks

    # transform "present" by sqrt

    if(!is.null(transform_ambiguity_count)) {

      if(transform_ambiguity_count == "sqrt") {
        ambiguity_persite_df$count <- sqrt(ambiguity_persite_df$count)
        min_ambiguity_count <- min(ambiguity_persite_df$count)
        max_ambiguity_count <- max(ambiguity_persite_df$count)
        legend_ambiguity_label <- "SQRT-count"
        legend_ambiguity_breaks <- seq(min_ambiguity_count,max_ambiguity_count, length.out = 4)
        legend_ambiguity_seq_label <- round(legend_ambiguity_breaks^2, 0)
      }

      if(transform_ambiguity_count == "log10") {
        ambiguity_persite_df$count <- ambiguity_persite_df$count + 1
        ambiguity_persite_df$count <- log10(ambiguity_persite_df$count)
        min_ambiguity_count <- min(ambiguity_persite_df$count)
        max_ambiguity_count <- max(ambiguity_persite_df$count)
        legend_ambiguity_label <- "Log10-count"
        legend_ambiguity_breaks <- seq(min_ambiguity_count,max_ambiguity_count, length.out = 4)
        legend_ambiguity_seq_label <- round(10^legend_ambiguity_breaks, 0)
      }
    }

    # count sites

    ncount_sites <- length(unique(ambiguity_persite_df$ID_site))
    ncount_sites_label <- rep(".",ncount_sites)

    ncount_sites_label2 <- character()
    for(mm in 1:length(ncount_sites_label)) {
      ncount_sites_label2[mm] <- ifelse(mm %% brakes_every == 0, paste0(mm," --"), "")
    }

    ncount_sites_label2[1] <- "1 --"
    ambiguity_y_label <- names(collect_ambiguity_sum_end_df)[1]
    ambiguity_x_label <- "Site"


    ## we can plot a heatmap with the expression values
    ID_site <- NULL
    x_var <- NULL
    ambiguity_tile_base <- ggplot2::ggplot(ambiguity_persite_df,ggplot2::aes(x=ID_site,y=x_var))+
      ggplot2::geom_tile(ggplot2::aes(fill = count), alpha = per_site_alpha)+
      ggplot2::theme_classic(base_size = 11)+
      ggplot2::ggtitle("Nucleotide Ambiguity per Site") +
      ggplot2::scale_x_discrete(labels= ncount_sites_label2, expand = c(0, 0)) +
      ggplot2::ylab(ambiguity_y_label) + ggplot2::xlab(ambiguity_x_label) +
      ggplot2::theme(axis.text.y = element_text(size=ggplot2::rel(0.9)),
            axis.text.x = ggplot2::element_text(size=ggplot2::rel(0.9),angle = 90, vjust = 0.5, hjust = 1.2),
            axis.ticks.x = ggplot2::element_blank(),
            legend.title = ggplot2::element_text(color = "black", size = 9),
            legend.text = ggplot2::element_text(color = "black", size = 9),
            panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
            panel.background = ggplot2::element_rect(fill = per_site_NA_color, colour = per_site_NA_color, size = 0.5, linetype = "solid"))


    if(!class(ambiguity_persite_df$x_var) == "character") {
      ambiguity_tile_base <- ambiguity_tile_base + ggplot2::scale_y_continuous(breaks = seq(min(ambiguity_persite_df$x_var), max(ambiguity_persite_df$x_var), by = 1), expand = c(0, 0))
    } else {
      ambiguity_tile_base <- ambiguity_tile_base + ggplot2::scale_y_discrete(expand = c(0, 0))
    }

    if(per_site_ambiguity_color[1] == "viridis") {
      ambiguity_tile_base <- ambiguity_tile_base + viridis::scale_fill_viridis(option=per_site_ambiguity_color[2], name = legend_ambiguity_label, breaks = legend_ambiguity_breaks, labels = legend_ambiguity_seq_label)
    } else {
      ambiguity_tile_base <- ambiguity_tile_base + ggplot2::scale_fill_gradientn( colours = per_site_ambiguity_color, name = legend_ambiguity_label, breaks = legend_ambiguity_breaks, labels = legend_ambiguity_seq_label)
    }

    collect_ambiguity_persite_plots_list[[k]] <- ambiguity_tile_base

    ########################### ambiguity END
  } # END of k


  ############################## collect and save as PDFs

  stripchart_name <- paste0(file_out,"_stripcharts.pdf")
  global_plots_name <- paste0(file_out,"_global_plots.pdf")
  gap_plots_name <- paste0(file_out,"_per_site_gap_plots.pdf")
  codon_plots_name <- paste0(file_out,"_per_site_codon_plots.pdf")
  ambiguity_plots_name <- paste0(file_out,"_per_site_ambiguity_plots.pdf")



  grDevices::pdf(stripchart_name, width = strip_width, height = strip_height, onefile = TRUE)
  for (i in seq(length(list_stripcharts))) { print(list_stripcharts)}; rm(i)
  grDevices::dev.off()

  grDevices::pdf(global_plots_name, width = global_width, height = global_height, onefile = TRUE)
  for (i in seq(length(all_global_xy_plots_list))) { print(all_global_xy_plots_list)}; rm(i)
  grDevices::dev.off()

  grDevices::pdf(gap_plots_name, width = per_site_width, height = per_site_height, onefile = TRUE)
  for (i in seq(length(collect_gap_persite_plots_list))) { print(collect_gap_persite_plots_list)}; rm(i)
  grDevices::dev.off()

  grDevices::pdf(codon_plots_name, width = per_site_width, height = per_site_height, onefile = TRUE)
  for (i in seq(length(collect_codon_persite_plots_list))) { print(collect_codon_persite_plots_list)}; rm(i)
  grDevices::dev.off()

  grDevices::pdf(ambiguity_plots_name, width = per_site_width, height = per_site_height, onefile = TRUE)
  for (i in seq(length(collect_ambiguity_persite_plots_list))) { print(collect_ambiguity_persite_plots_list)}; rm(i)
  grDevices::dev.off()


  ## save as tables

  ###### TS names

  for(i in 1:length(global_TS_x_names)) {

    one_TS_name <- paste0(file_out, "_",global_TS_x_names[i], "_global_table.txt")

    utils::write.table(stats_TS_list[i], file = one_TS_name, sep = "\t", row.names = FALSE)

  }
  rm(i)

  ###### persite names

  for(i in 1:length(per_site_x_names)) {

    one_ambiguity_name <- paste0(file_out, "_",per_site_x_names[i], "_ambiguity_table.txt")
    one_codon_name <- paste0(file_out, "_",per_site_x_names[i], "_codon_table.txt")
    one_gap_name <- paste0(file_out, "_",per_site_x_names[i], "_gap_table.txt")

    utils::write.table(collect_ambiguity_sum_end_list[i], file = one_ambiguity_name, sep = "\t", row.names = FALSE)
    utils::write.table(collect_codon_sum_end_list[i], file = one_codon_name, sep = "\t", row.names = FALSE)
    utils::write.table(collect_gap_sum_end_list[i], file = one_gap_name, sep = "\t", row.names = FALSE)

  }
  rm(i)

  # to return

  list_of_plots <- list(plots = c(list_stripcharts,all_global_xy_plots_list, collect_gap_persite_plots_list, collect_codon_persite_plots_list, collect_ambiguity_persite_plots_list),
                        tables = c(stats_TS_list, collect_ambiguity_sum_end_list, collect_codon_sum_end_list, collect_gap_sum_end_list))

  return(list_of_plots)

}
carvajalc/BarcoR documentation built on Dec. 19, 2021, 1:54 p.m.