R/cnvr_plot.R

Defines functions cnvr_plot

Documented in cnvr_plot

#' Custom CNVR distribution map and plot all high frequency CNVR at once
#'
#' @param cnvr standard cnvr file was generated by 'call_cnvr' function
#' @param assembly UMD = "Bovine UMD 3.1", ARS = "BOvine ARS UCD 1.2", "hg38" = "Human hg38", "hg19" = "Human hg19", Sheep = "Oar_v4.0", Pig = "Sscrofa11.1", Chicken = "Chicken_galGal6", Horse = "EquCab3.0", Dog = "UMICH_Zoey_3.1"
#' @param overlap_cnvr the common CNVRs list between two CNVR results, which use to mark underline to indicate the common CNVRs, should only contain Chr, Start and End three columns, the standard file was generated by 'compare_cnvr' function
#' @param label_prop if 'TRUE', will display the proportion of CNVRs in each chromosome
#' @param left_prop_label adjust the coordinates of the label to move to the left, default value is 3
#' @param refgene reference gene list, use for plot gene
#' @param clean_cnv standard input file was generated by 'cnv_clean' function
#' @param sample_size integer, the total number of unique samples in the cnv result. combine with common_cnv_threshold to plot all CNVs which passed the threshold
#' @param common_cnv_threshold two decimal places, combine with sample_size to plot all CNVs passed the common threshold
#' @param legend_x the x coordinate of legend, relative to the maximum length of Chromosome, unit is 'Mb'
#' @param legend_y the y coordinate of legend
#' @param gain_col set color for type of gain CNVR
#' @param loss_col set color for type of loss CNVR
#' @param mixed_col set color for type of mixed CNVR
#' @param overlap_col set color for Overlapped CNVRs
#' @param chr_col set color of Chromosomes
#' @param folder set name of folder to save results
#' @param gene_font_size set the font size of gene in CNV annotation plot
#' @param width_1 number to set the width of final plot size, unit is 'cm'
#' @param height_1 number to set the height of final plot size, unit is 'cm'
#' @param col_gene set the color of Gene, only work in Gene annotated at CNV plot
#'
#' @import ggplot2 dplyr reshape2 tidyr
#'
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#' @importFrom graphics barplot legend par rect text
#'
#' @return A figure of CNVR distribution map and plot parameters.
#' If given clean_cnv file, will plot all CNVRs which are passed common threshold.
#' @export cnvr_plot
#'
cnvr_plot <- function(cnvr, assembly = "ARS", overlap_cnvr = NULL, label_prop = TRUE, left_prop_label = 3, legend_x = 127, legend_y = 30, clean_cnv = NULL, sample_size = NULL, common_cnv_threshold = 0.05, refgene = "ARS", gain_col = "red", loss_col = "green", mixed_col ="blue", overlap_col = "purple", chr_col = "black", folder ="cnvr_plot", gene_font_size = 2.2, width_1 = 22, height_1 = 14.5, col_gene = "gray") {
  if(!file.exists(paste0(folder))){
    dir.create(paste0(folder))
    cat(paste0("A new folder '", folder, "' was created in working directory.\n"))
  }
  if (is.null(clean_cnv)) {
    #Prepare parameters for CNVR distribution plot
    #prepare the Y axis for each chromosome
    if(typeof(cnvr) == "character"){
      cnvr_plot_part <- fread(file = cnvr, header = TRUE, sep = "\t")
    } else {
      cnvr_plot_part <- cnvr
      cnvr_plot_part$Chr <- as.integer(cnvr_plot_part$Chr)
    }

    chr <- data.frame("Chr" = seq(1,max(as.integer(cnvr_plot_part$Chr))))  #generate a chr order
    chr <- chr %>%
           mutate(chr_top = (as.integer(max(cnvr_plot_part$Chr)) + 1 - Chr)*3,
                  chr_bottom = chr_top - 1.5)   #our plot depend on x and y coordinate axis, the top one is chr 1 and bottom is chr max, interval of y axis is 3
                                                #bar plot depth is 1.5, this will using for plot rectangle for cnvr

    #chr$chr_top <- (max(cnvr_plot_part$Chr) + 1 -chr$Chr)*3 #our plot depend on x and y coordinate axis, the top one is chr 1 and bottom is chr29, interval of y axis is 3
    #chr$chr_bottom <- chr$chr_top - 1.5 #bar plot depth is 1.5, this will using for plot rectangle for cnvr
    #class(chr$Chr) #check the type
    chr$Chr = as.character(chr$Chr)#convert integer to character

    if (assembly == "UMD") {
      #UMD3.1 Chromosome Length
      chr_length <- c(51.505224, 46.312546, 45.407902, 51.681464, 42.90417, 62.71493, 52.530062,
                      61.435874, 71.599096, 72.042655, 64.057457,
                      66.004023,75.158596, 81.724687, 85.296676, 84.64839, 84.24035, 91.163125,
                      107.310763, 104.305016, 105.70825, 113.384836, 112.638659, 119.458736,
                      121.1914245, 120.829699, 121.430405, 137.060424, 158.337067)
      chr_name <- paste0("chr", seq(from = 29, to = 1, by = -1))

      chr_len <- data.frame("Chr" = seq(from = 29, to = 1, by = -1), "Length" = chr_length)

    } else if(assembly == "ARS"){
      # The length of X Chromosome is 139.009144 in ARS reference genome
      # ARS assembly
      chr_length <- c(51.098607, 45.94015, 45.612108, 51.992305, 42.350435,
                       62.317253, 52.498615, 60.773035, 69.862954,
                           71.974595, 63.449741, 65.820629, 73.167244, 81.013979,
                           85.00778, 82.403003, 83.472345, 87.216183, 106.982474,
                           103.308737, 105.454467, 113.31977, 110.682743, 117.80634,
                           120.089316, 120.000601, 121.005158, 136.231102, 158.53411)
      chr_name <- paste0("chr", seq(from = 29, to = 1, by = -1))

      chr_len <- data.frame("Chr" = seq(from = 29, to = 1, by = -1), "Length" = chr_length)
    } else if (assembly == "hg38"){  #hg38 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.39#/st Y = 57.264655, X=156.040895,
      chr_length <- c(51.857516, 46.709983, 64.444167, 58.617616, 80.373285,
                       83.836422, 92.211104, 102.439437, 108.136338, 114.364328, 133.275309, 135.186938,
                       133.797422, 138.688728, 145.138636, 159.345973, 170.805979, 181.630948, 190.424264,
                       198.450956, 242.508799, 249.698942)
      chr_name <- paste0("chr", seq(from = 22, to = 1, by = -1))

      chr_len <- data.frame("Chr" = seq(from = 22, to = 1, by = -1), "Length" = chr_length)
    }else if (assembly == "hg19"){  #hg19 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/#/st Y = 59.373566, X=155.270560,
      chr_length <- c(51.304566, 48.157577, 63.025520, 59.380841, 78.081510, 81.529607, 90.354753,
                       102.531392, 107.349540, 115.169878, 133.851895, 135.046619, 135.534747, 141.696573,
                       146.440111, 159.321559, 171.115067, 180.915260, 191.535534, 198.022430, 243.199373,
                       249.904550)
      chr_name <- paste0("chr", seq(from = 22, to = 1, by = -1))

      chr_len <- data.frame("Chr" = seq(from = 22, to = 1, by = -1), "Length" = chr_length)
    }else if (assembly == "Sheep"){  #Oar_v4.0 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000298735.2/#/st X=135.185801,
      chr_length <- c(44.047080, 45.223504, 41.976827, 62.282865, 50.780147, 49.987992, 51.049468,
                       60.445663, 68.494538, 72.251135, 71.693149, 80.783214, 62.568341, 82.951069,
                       79.028859, 62.170480, 86.377204, 94.583238, 90.615088, 100.009711, 116.888256,
                       107.836144, 119.216639, 223.996068, 248.966461, 275.406953)
      chr_name <- paste0("chr", seq(from = 26, to = 1, by = -1))

      chr_len <- data.frame("Chr" = seq(from = 26, to = 1, by = -1), "Length" = chr_length)
    } else if (assembly == "Pig"){  #Sscrofa11.1 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000003025.6/#/st X=135.185801,
      chr_length <- c(55.982971, 63.494081, 79.944280, 140.412725, 141.755446, 208.334590, 61.602749,
                       79.169978, 69.359453, 139.512083, 138.966237, 121.844099, 170.843587, 104.526007,
                       130.910915, 132.848913, 151.935994, 274.330532)
      chr_name <- paste0("chr", seq(from = 18, to = 1, by = -1))

      chr_len <- data.frame("Chr" = seq(from = 18, to = 1, by = -1), "Length" = chr_length)
    } else if (assembly == "Chicken"){  #Chicken_galGal6 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000002315.5/#/st W=7.488053, Z=82.545508
      chr_length <- c(7.821848, 7.25831, 6.185160, 1.818525, 5.118401, 8.080432, 6.055710,
                      3.982494, 6.491222, 6.202910, 5.460380, 6.844979, 13.897287, 10.324343,
                      11.373140, 10.762512, 2.994104, 13.064218, 16.220073, 19.168871, 20.387278,
                      20.200042, 21.119840, 24.181049, 30.219446, 36.742308, 36.428957, 59.809098,
                      91.315245, 110.854228, 149.682049, 197.677740)
      chr_name <- paste0("chr", c(33,32,31,30,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1))

      chr_len <- data.frame("Chr" = c(33,32,31,30,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1),
                            "Length" = chr_length)
    } else if (assembly == "Horse"){  #EquCab3.0 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_002863925.1/#/st X=128.206784,
      chr_length <- c(26.001039, 31.395959, 34.77612, 47.348498, 40.25469, 43.147642, 40.282968, 48.288683,
                      55.556184, 50.928189, 58.984458, 65.343332, 62.681739, 82.641348, 80.72243, 88.962352,
                      92.851403, 94.600235, 43.784481, 36.992759, 61.676917, 85.155674, 85.793548, 97.563019,
                      100.787686, 87.230776, 96.759418, 109.462549, 121.351753, 121.350024, 188.260577)
      chr_name <- paste0("chr", seq(from = 31, to = 1, by = -1))

      chr_len <- data.frame("Chr" = seq(from = 31, to = 1, by = -1), "Length" = chr_length)
    } else if (assembly == "Dog"){  #UMICH_Zoey_3.1 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_005444595.1/#/st X=122.739558,
      chr_length <- c(23.826045, 30.840041, 31.110614, 26.483662, 42.177406, 31.430699, 39.041421, 39.389883,
                      40.358978, 42.070705, 41.338898, 46.166038, 38.422312, 51.695226, 47.687389, 52.790355,
                      61.534057, 50.995378, 57.931936, 53.975570, 55.934595, 64.204256, 59.281881, 64.289916,
                      61.098034, 63.682632, 72.762459, 74.036318, 70.059352, 60.772386, 74.192687, 80.745686,
                      77.758088, 89.245444, 88.429730, 92.032900, 82.903020, 122.894117)
      chr_name <- paste0("chr", seq(from = 38, to = 1, by = -1))

      chr_len <- data.frame("Chr" = seq(from = 38, to = 1, by = -1), "Length" = chr_length)
    } else {
      #construct the border of each chromosome from input data
      complete_chr <- data.frame(Chr = seq(1, max(as.integer(cnvr_plot_part$Chr)), by = 1))
      chr_length <- cnvr_plot_part %>%
                    group_by(Chr) %>%
                    summarise(length = max(End) / 1000000) %>%
                    right_join(y = complete_chr, by = "Chr") %>%
                    replace(is.na(.), 0) %>%
                    arrange(-as.integer(Chr)) %>%
                    select(length) %>%
                    as.matrix() %>%
                    t() %>%
                    as.vector()
      chr_name <- paste0("chr", seq(from = max(as.integer(cnvr_plot_part$Chr)), to = 1, by = -1))

      chr_len <- data.frame("Chr" = seq(from = max(as.integer(cnvr_plot_part$Chr)), to = 1, by = -1), "Length" = chr_length)
    }

    #4.6.1 prepare CNVPartition plot input data-----
    cnvr_plot_part$start_left <- cnvr_plot_part$Start/1000000 #convert bp to Mbp
    cnvr_plot_part$end_right <- cnvr_plot_part$End/1000000 #convert bp to Mbp
    cnvr_plot_part$Chr <- as.character(cnvr_plot_part$Chr)
    cnvr_plot_part <- merge(cnvr_plot_part, chr, all.x = TRUE)

    #customize the number of CNVRs in map by sample size
    if(!is.null(sample_size)){
      cnvr_plot_part <- filter(cnvr_plot_part, n_Sample >= sample_size * common_cnv_threshold)
    }

    fwrite(cnvr_plot_part, file = paste0(folder, "/cnvr_plot.txt"), sep ="\t", quote = FALSE)

    #this file need reread if you start from here
    #cnvr_plot_part <- fread("CNVRplot/cnvr_plot_part.txt", sep ="\t", quote = FALSE)
    gain_part <- cnvr_plot_part[cnvr_plot_part$Type == "Gain", ]
    loss_part <- cnvr_plot_part[cnvr_plot_part$Type == "Loss", ]
    mixed_part <- cnvr_plot_part[cnvr_plot_part$Type == "Mixed", ]


    #4.6.2 start make cnvr plot for CNVPartition
    png(filename = paste0(folder, "/cnvr_plot.png"),
        res = 350,
        width = width_1, height = height_1, units = "cm", bg = "transparent")

    #setting the graphic parameter
    par(lab=c(max(cnvr_plot_part$Chr),max(cnvr_plot_part$Chr),3),las=1,yaxt="s",xaxt="s",mar=c(4,5,2,2))

    #draw a bar plot and setup each bar name
    bar <- barplot(chr_length, horiz=TRUE,width=1.5,space=1,xlim=c(0,max(chr_length)+6),
                   names.arg= chr_name,
                   col=c("white"), border = chr_col, xlab="Physical Position (Mbp)",cex.name=0.8)

    #read the gain type file, each file need prepare four columns as following order: start_left, end_right, chr_top and chr_bottom
    rect(xleft = gain_part$start_left, ybottom = gain_part$chr_bottom, xright = gain_part$end_right, ytop = gain_part$chr_top, col = gain_col, border = gain_col)

    #read the loss type file and draw a rectangle shape for each CNVR and color to green
    #loss <- cnvr_plot_part[cnvr_plot_part$Type == 'LOSS']
    rect(xleft = loss_part$start_left, ybottom = loss_part$chr_bottom, xright = loss_part$end_right, ytop = loss_part$chr_top, col = loss_col, border = loss_col)

    #read the mixed file and plot rectangle as red
    #mixed <- cnvr_plot_part[cnvr_plot_part$Type == 'mixed']
    rect(xleft = mixed_part$start_left, ybottom = mixed_part$chr_bottom, xright = mixed_part$end_right, ytop = mixed_part$chr_top, col = mixed_col, border = mixed_col)

    #condition. add summary label for each chromosome
    if(label_prop == "TRUE"){

      unique_chr <- data.frame("Chr" = unique(cnvr_plot_part$Chr))
      chr_len_max <- merge(x = unique_chr, y = chr_len, all.x = TRUE)

      chr_len_max$Chr <- as.character(chr_len_max$Chr)

      chr_label <- cnvr_plot_part %>%
                   group_by(Chr) %>%
                   summarise(total_len = sum(Length) / 1000000) %>%
                   left_join(chr, by = "Chr") %>%
                   left_join(chr_len_max, by = "Chr") %>%
                   mutate(Len_prop = paste0(round(total_len * 100 / Length, digits = 1), "%"))

      text(x = chr_label$Length + left_prop_label, y = chr_label$chr_bottom + 0.5, labels = chr_label$Len_prop, cex=0.5)
    }


    #prepare Y axis for CNVPartition overlap region, mark a under line on the overlap region
    if(!is.null(overlap_cnvr)){
      if(typeof(overlap_cnvr) == "character"){
        overlap <- fread(file = overlap_cnvr, header = TRUE, sep = "\t")
      } else {
        overlap <- overlap_cnvr
      }
      overlap$Chr <- as.character(overlap$Chr)
      overlap <- merge(x = overlap, y = chr, all.x = TRUE)

      #condition, check if overlap CNVRs in original CNVR list? Only plot which are overlapped to original lists
      cnvr_plot_part <- setDT(cnvr_plot_part)
      setkey(cnvr_plot_part, Chr, Start, End)
      overlap <- foverlaps(x = overlap, y = cnvr_plot_part, by.x = c("Chr", "Start", "End"), type = "any", nomatch = NULL)

      overlap$start_left <- overlap$Start / 1000000
      overlap$end_right <- overlap$End / 1000000
      overlap$chr_bottom <- overlap$chr_bottom - 0.5
      overlap$chr_top <- overlap$chr_top - 1.7
      overlap$Type <- "Overlap"
      rect(xleft = overlap$start_left, ybottom = overlap$chr_bottom, xright = overlap$end_right, ytop = overlap$chr_top, col = overlap_col, border = overlap_col)

      #add legend
      legend(legend_x, legend_y,c("Gain","Loss","Mixed", "Overlap"), pch = c(15, 15, 15, 15),
             col =c(gain_col, loss_col, mixed_col, overlap_col), bty = "n", cex=0.8)
    } else {
      #add legend
      legend(legend_x, legend_y,c("Gain","Loss","Mixed"), pch = c(15, 15, 15),
             col =c(gain_col, loss_col, mixed_col), bty = "n", cex=0.8)
    }

    #title(main = "CNVR Distribution on Population Level")

    dev.off()

    if (file.exists(paste0(folder, "/cnvr_plot.png"))) {
      cat("Task done, CNVR Distribution Plot saved in working directory.\n")
    } else {
      cat("Task faild, please check your CNVR input file carefully, the input cnvr file should be the CNVR results generated from call_cnvr function.\n")
    }

  }



  else {
    if(typeof(cnvr) == "character"){
      cnvr <- fread(file = cnvr, header = TRUE, sep = "\t")
    } else {
      cnvr <- cnvr
    }

    # high_freq <- cnvr[which(cnvr[, cnvr$Frequent >= sample_size * 0.05]), ] #call common CNVRs
    high_freq <- cnvr %>% filter(n_Sample >= sample_size * common_cnv_threshold)
    cat(paste0("There ", nrow(high_freq), " high frequency CNVR passed the customized threshold.\n"))
    for (i in 1:nrow(high_freq)) {
      print(paste0("Ploting CNVR ", i, "..." ))
      cnv_visual(clean_cnv = clean_cnv, chr_id = high_freq$Chr[i], start_position = high_freq$Start[i]/1000000, end_position = high_freq$End[i]/1000000, refgene = refgene, folder = folder, gene_font_size = gene_font_size, height_1 = height_1, width_1 = width_1, col_gene = col_gene)
    }
  }
}
JH-Zhou/HandyCNV documentation built on Dec. 18, 2021, 12:25 a.m.