R/cnv_summary.R

Defines functions cnv_summary_plot

Documented in cnv_summary_plot

#' Summary plots for CNVs
#'
#' Make summary graphs for CNVs. Types of plots produced include:
#' *  CNV counts by length and copy number
#' *  Length distribution by copy number
#' *  CNV counts by chromosome and copy number
#' *  CNV counts by individual
#'
#' @param clean_cnv a clean CNV file, as generated by the `clean_cnv` function
#' @param plot_sum_1 plot the first type of summary plot combination
#' @param plot_sum_2 plot the second type of summary plot combination
#' @param length_group a vector of lengths (in Mb) to classify CNV lengths for plotting. For example, the vector `c(0.05, 0.3,  0.6, 1)`, instructs the function to divide the CNV lengths into four groups: '<0.05Mb', '0.05 - 0.3Mb', '0.3-0.6Mb' and '>1Mb'. Up to 11 values can be provided.
#' @param folder set the name of the output folder
#' @param col_0 set colour for 0 copy of CNV
#' @param col_1 set colour for 1 copy of CNV
#' @param col_2 set colour for 2 copy of CNV (which might be ROH)
#' @param col_3 set colour for 3 copy of CNV
#' @param col_4 set colour for 4 copy of CNV
#' @param height_sum1 height of Summary Plot 1 (in cm)
#' @param width_sum1 width of Summary Plot 1 (in cm)
#' @param height_sum2 height of Summary Plot 2 (in cm)
#' @param width_sum2 width of Summary Plot 2 (in cm)
#'
#' @import ggplot2 dplyr reshape2 tidyr
#' @importFrom data.table fread fwrite
#' @importFrom scales unit_format
#'
#' @return Some summary pictures
#' @export cnv_summary_plot
#'
cnv_summary_plot <- function(clean_cnv, length_group = c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 2, 5), plot_sum_1 = TRUE, plot_sum_2 = TRUE, folder = 'cnv_summary_plot', col_0 = "hotpink",  col_1 = "turquoise", col_2 = "gray", col_3 = "tomato", col_4= "deepskyblue", height_sum1 = 26, width_sum1 = 20,height_sum2 = 20, width_sum2 = 27) {
  if (!file.exists(folder)){
    dir.create(folder)
  }

  if(typeof(clean_cnv) == "character"){
    cnv_input <- fread(file = clean_cnv, sep = "\t", header = TRUE)
  } else {
    cnv_input = clean_cnv
    cnv_input$Chr <- as.integer(cnv_input$Chr)
  }

  standard_input <- c("Sample_ID", "Chr", "Start", "End", "CNV_Value")

  if(!all(standard_input %in% colnames(cnv_input))){
    stop("The input file does not have enough columns.\nPlease includes at least five columns as belows:\nSample_ID, Chr, Start, End, CNV_Value")
  }

  CNV_Value <- c(0:4)
  if(!all(unique(cnv_input$CNV_Value) %in% CNV_Value)){
    stop("Summary plot can only working on 0 to 4 copies now.\n")
  }

  #check input data completeness
  if(!("Length" %in% colnames(cnv_input))){
    cat("The Input data file has no 'Length' column: it will be generated automatically.\n")
    cnv_input <- cnv_input %>%
                 mutate(Length = End - Start + 1)
  }

  #divide the length group of CNV
  cat("Deviding the length group...\n")
  len_int <- as.vector(length_group)
  len_int_bp <- len_int * 1000000 #convert to Mb
  cnv_input <- cnv_input %>%
               mutate(group = case_when(Length <= len_int_bp[1] ~ paste0("<", len_int[1], "Mb"),
                                        Length > len_int_bp[1] & Length <= len_int_bp[2] ~ paste0(len_int[1], "-", len_int[2], "Mb"),
                                        Length > len_int_bp[2] & Length <= len_int_bp[3] ~ paste0(len_int[2], "-", len_int[3], "Mb"),
                                        Length > len_int_bp[3] & Length <= len_int_bp[4] ~ paste0(len_int[3], "-", len_int[4], "Mb"),
                                        Length > len_int_bp[4] & Length <= len_int_bp[5] ~ paste0(len_int[4], "-", len_int[5], "Mb"),
                                        Length > len_int_bp[5] & Length <= len_int_bp[6] ~ paste0(len_int[5], "-", len_int[6], "Mb"),
                                        Length > len_int_bp[6] & Length <= len_int_bp[7] ~ paste0(len_int[6], "-", len_int[7], "Mb"),
                                        Length > len_int_bp[7] & Length <= len_int_bp[8] ~ paste0(len_int[7], "-", len_int[8], "Mb"),
                                        Length > len_int_bp[8] & Length <= len_int_bp[9] ~ paste0(len_int[8], "-", len_int[9], "Mb"),
                                        Length > len_int_bp[9] & Length <= len_int_bp[10] ~ paste0(len_int[9], "-", len_int[10], "Mb"),
                                        Length > len_int_bp[10] & Length <= len_int_bp[11] ~ paste0(len_int[10], "-", len_int[11], "Mb"),
                                        Length > len_int_bp[length(len_int_bp)] ~ paste0(len_int[length(len_int)], "Mb~")))


  #customize color
  color_copy <- c("0" = col_0,
                  "1" = col_1,
                  "2" = col_2,
                  "3" = col_3,
                  "4" = col_4)
  #plot CNV distribution
  chr_freq <- cnv_input %>%
              group_by(Chr) %>%
              count(CNV_Value, name = "Freq")

  max_chr <- max(cnv_input$Chr)

  cat("Plotting...\n")
  png(res = 350, filename = paste0(folder, "/f1_cnv_distribution.png"), height = 3500, width = 4000, bg = "transparent")

    adj_y <- max(cnv_input$Length)/max(chr_freq$Freq)

    cnv_distri <- ggplot(cnv_input) +
      geom_boxplot(aes(x = as.factor(Chr), y = Length/adj_y,  fill = as.factor(CNV_Value)), outlier.shape = NA) +
      scale_fill_manual(values = color_copy) +
      geom_line(aes(group = 1, x = as.factor(Chr), color = as.factor(CNV_Value)), stat = 'count') +
      scale_color_manual(values = color_copy) +
      geom_text(aes(x = as.factor(Chr), label = ..count..), stat = 'count') +
      scale_y_continuous(name = "Frequency of CNV (N)", sec.axis = sec_axis(~.*adj_y / 1000, name = "Length of CNV (Kb)")) +
      scale_x_discrete(breaks = seq(from = 1, to = max_chr, by = 1), drop = FALSE) + # drop = F, force appear the boxplot and line on Chrs has no CNVs
      theme_classic() +
      theme(legend.position = "top",
            strip.text.x = element_blank()) +
      labs(x = "Chromosome", fill = "CNV Value", color = "Frequency of CNV") +
      facet_wrap(~CNV_Value, ncol = 1, scales = "free")

  print(cnv_distri)
  dev.off()

  if(file.exists(paste0(folder, "/f1_cnv_distribution.png"))){
    cat(paste0("The CNV distribution plot has been saved in the '", folder, "' directory.\n"))
  } else {
    warning("Failed to produce the CNV distribution plot, please check your input data format and parameters!")
  }


  #fig.1a Length group Vs CNV Count
  cnv_count <- cnv_input %>%
               group_by(CNV_Value) %>%
               count(group) #summarize the number of CNVs on each state each length group
  #cnv_count$group <- factor(cnv_count$group, levels = c("1-50kb", "50-100kb", "100-200kb", "200-300kb",
  #                                                      "300-400kb", "400-500kb", "500-600kb",
  #                                                      "600-700kb", "700-1000kb", "1-2Mbp", "2-5Mbp"))

  png(filename = paste0(folder, "/f2_length_group.png"), res = 350, width = 3500, height = 2000)
  f1a_lengthgroup <- ggplot(cnv_count, aes(fill = as.factor(CNV_Value), x = group, y = n)) +
    scale_fill_manual(values = color_copy) +
    geom_bar(stat = "identity", position = position_dodge()) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 20, vjust = 0.8),
          axis.title.x = element_blank(),
          legend.key.size  = unit(0.5,"line"),
          legend.position = c(0.8, 0.9),
          legend.title = element_text(size = 6),
          legend.text  = element_text(size = 6),
          legend.direction = "horizontal") +
    labs(y = "Number of CNVs", fill = "CNV Value") # col change title of legend
  print(f1a_lengthgroup)
  dev.off()

  if(file.exists(paste0(folder, "/f2_length_group.png"))){
    cat(paste0("The CNV length group plot was saved in the '", folder, "' directory.\n"))
  } else {
    warning("Failed to produce the CNV length plot, please check your input data format and parameters!")
  }

  png(filename = paste0(folder, "/f3_lengthbox.png"), res = 350, width = 17, height = 14, units = "cm")
  #fig.1b CNV Vs Length
  f1b_lengthbox <- ggplot(cnv_input, aes(x = as.factor(CNV_Value), y = Length, color = as.factor(CNV_Value))) +
    geom_boxplot() +
    scale_color_manual(values = color_copy) +
    scale_y_continuous(labels = unit_format(unit = "" ,scale = 1e-3)) +
    theme_classic() +
    theme(legend.position = c(0.9, 0.9),
          legend.key.size  = unit(0.5,"line"),
          legend.title = element_text(size = 6),
          legend.text  = element_text(size = 6)) +
    labs(y = "Length (kb)", x = "Type of copy", col = "CNV Value")
  print(f1b_lengthbox)
  dev.off()
  if(file.exists(paste0(folder, "/f3_lengthbox.png"))){
    cat(paste0("The box plot of CNV lengths was saved in the '", folder, "' directory.\n"))
  } else {
    warning("Failed to produce the CNV length box plot, please check your input data format and parameters!")
  }

  #fig.1c the count of CNV number distribute on each Chr
  cnv_chr <- cnv_input %>% group_by(CNV_Value) %>% count(Chr)
  cnv_chr$Chr <- factor(cnv_chr$Chr, levels = c(1:max(cnv_chr$Chr)))
  png(filename = paste0(folder, "/f4_chr.png"), res = 350, width = 17, height = 14, units = "cm")
  f1c_chr<- ggplot(cnv_chr, aes(x = Chr, y = n, group = CNV_Value)) +
    geom_line(linetype = "dashed", aes(color = as.factor(CNV_Value))) +
    geom_point(aes(color = as.factor(CNV_Value))) +
    scale_color_manual(values = color_copy) +
    theme_classic() +
    theme(legend.position = c(0.9, 0.9),
          legend.key.size  = unit(0.5,"line"),
          legend.title = element_text(size = 6),
          legend.text  = element_text(size = 6)) +
    labs(y = "Number of CNV", x = "Chr", col = "CNV Value") # col change title of legend
  print(f1c_chr)
  dev.off()
  if(file.exists(paste0(folder, "/f4_chr.png"))){
    cat("Distribution of CNV type on chromosome plot was saved in working directory.\n")
  } else {
    cat("Task faild, please check your input data format and paramter used!\n")
  }

  #plot individual CNV number
  png(filename = paste0(folder, "/f5_individual.png"), res = 350, width = 15, height = 12, units = "cm")
  cnv_indiv <- cnv_input %>%
               group_by(Sample_ID) %>%
               count(Sample_ID, name = "n_CNV") %>%
               arrange(-n_CNV)
  if (length(unique(cnv_indiv$n_CNV)) > 41){
    f1d_indiv <- ggplot(cnv_indiv, aes(x = as.integer(n_CNV))) +
      geom_bar(stat = "count",fill = col_0, color = "black") +
      #geom_text(stat = "count", aes(label = ..count..), vjust = -0.12) +
      theme_classic() +
      theme(axis.text.x = element_text(angle = 90)) +
      #scale_x_continuous(breaks = seq(0, max(cnv_indiv$n_CNV), by = max(cnv_indiv$n_CNV)/10)) +
      labs(x = "Number of CNVs per Sample", y = "Number of Samples")
  } else {
    f1d_indiv <- ggplot(cnv_indiv, aes(x = as.factor(n_CNV))) +
      geom_bar(stat = "count",fill = col_0, color = "black") +
      #geom_text(stat = "count", aes(label = ..count..), vjust = -0.12) +
      theme_classic() +
      theme(axis.text.x = element_text(angle = 90)) +
      #scale_x_discrete(breaks = seq(0, max(cnv_indiv$n_CNV), by = max(cnv_indiv$n_CNV)/10)) +
      labs(x = "Number of CNVs per Sample", y = "Number of Samples")
  }


  print(f1d_indiv)
  dev.off()
  fwrite(cnv_indiv, file = paste0(folder, "/individual_cnv_count.txt"), sep = '\t', quote = FALSE)

  if(file.exists(paste0(folder, "/f5_individual.png"))){
    cat("The number of CNV in Individual plot was saved in working directory.\n")
  } else {
    cat("Task faild, please check your input data format and paramter used!\n")
  }


  if(!missing(plot_sum_1)) {
    a_d <- plot_grid(f1d_indiv, f1a_lengthgroup, labels = c("a", "b"))

    png(res = 350, filename = paste0(folder, "/cnv_summary_plot_1.png"), height = height_sum1, width = width_sum1, units = "cm", bg = "transparent")
    ad_e <- plot_grid(a_d, cnv_distri, ncol = 1, rel_heights = c(1,3.3), labels = c("", "c"))
    print(ad_e)
    dev.off()
    if(file.exists(paste0(folder, "/cnv_summary_plot_1.png"))){
      cat("cnv_summary_plot_1 was saved in working directory.\n")
    } else {
      cat("Task faild, please check your input data format and paramter used!\n")
    }
  }

  if(!missing(plot_sum_2)){
      # merge four figure in one
    png(filename = paste0(folder, "/cnv_summaray_plot_2.png"), res = 350, height = height_sum2, width = width_sum2, units = "cm", bg = "transparent")

    all <-  plot_grid(f1a_lengthgroup, f1b_lengthbox, f1c_chr, f1d_indiv, labels = c("a", "b","c","d"))
    print(all)
    dev.off()
      if(file.exists(paste0(folder, "/cnv_summaray_plot_2.png"))){
        cat("cnv_summaray_polt_2 was saved in working directory.\n")
        cat("Task done.\n")
      } else {
        cat("Task faild, please check your input data format and paramter used!\n")
      }
  }

  else {
    cat("Task done.")
  }
}
JH-Zhou/HandyCNV documentation built on Dec. 18, 2021, 12:25 a.m.