#' Custom CNVR distribution map and plot all high frequency CNVR at once
#'
#' @param cnvr standard cnvr file was generated by 'call_cnvr' function
#' @param assembly UMD = "Bovine UMD 3.1", ARS = "BOvine ARS UCD 1.2", "hg38" = "Human hg38", "hg19" = "Human hg19", Sheep = "Oar_v4.0", Pig = "Sscrofa11.1", Chicken = "Chicken_galGal6", Horse = "EquCab3.0", Dog = "UMICH_Zoey_3.1"
#' @param overlap_cnvr the common CNVRs list between two CNVR results, which use to mark underline to indicate the common CNVRs, should only contain Chr, Start and End three columns, the standard file was generated by 'compare_cnvr' function
#' @param label_prop if 'TRUE', will display the proportion of CNVRs in each chromosome
#' @param left_prop_label adjust the coordinates of the label to move to the left, default value is 3
#' @param refgene reference gene list, use for plot gene
#' @param clean_cnv standard input file was generated by 'cnv_clean' function
#' @param sample_size integer, the total number of unique samples in the cnv result. combine with common_cnv_threshold to plot all CNVs which passed the threshold
#' @param common_cnv_threshold two decimal places, combine with sample_size to plot all CNVs passed the common threshold
#' @param legend_x the x coordinate of legend, relative to the maximum length of Chromosome, unit is 'Mb'
#' @param legend_y the y coordinate of legend
#' @param gain_col set color for type of gain CNVR
#' @param loss_col set color for type of loss CNVR
#' @param mixed_col set color for type of mixed CNVR
#' @param overlap_col set color for Overlapped CNVRs
#' @param chr_col set color of Chromosomes
#' @param folder set name of folder to save results
#' @param gene_font_size set the font size of gene in CNV annotation plot
#' @param width_1 number to set the width of final plot size, unit is 'cm'
#' @param height_1 number to set the height of final plot size, unit is 'cm'
#' @param col_gene set the color of Gene, only work in Gene annotated at CNV plot
#'
#' @import ggplot2 dplyr reshape2 tidyr
#'
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#' @importFrom graphics barplot legend par rect text
#'
#' @return A figure of CNVR distribution map and plot parameters.
#' If given clean_cnv file, will plot all CNVRs which are passed common threshold.
#' @export cnvr_plot
#'
cnvr_plot <- function(cnvr, assembly = "ARS", overlap_cnvr = NULL, label_prop = TRUE, left_prop_label = 3, legend_x = 127, legend_y = 30, clean_cnv = NULL, sample_size = NULL, common_cnv_threshold = 0.05, refgene = "ARS", gain_col = "red", loss_col = "green", mixed_col ="blue", overlap_col = "purple", chr_col = "black", folder ="cnvr_plot", gene_font_size = 2.2, width_1 = 22, height_1 = 14.5, col_gene = "gray") {
if(!file.exists(paste0(folder))){
dir.create(paste0(folder))
cat(paste0("A new folder '", folder, "' was created in working directory.\n"))
}
if (is.null(clean_cnv)) {
#Prepare parameters for CNVR distribution plot
#prepare the Y axis for each chromosome
if(typeof(cnvr) == "character"){
cnvr_plot_part <- fread(file = cnvr, header = TRUE, sep = "\t")
} else {
cnvr_plot_part <- cnvr
cnvr_plot_part$Chr <- as.integer(cnvr_plot_part$Chr)
}
chr <- data.frame("Chr" = seq(1,max(as.integer(cnvr_plot_part$Chr)))) #generate a chr order
chr <- chr %>%
mutate(chr_top = (as.integer(max(cnvr_plot_part$Chr)) + 1 - Chr)*3,
chr_bottom = chr_top - 1.5) #our plot depend on x and y coordinate axis, the top one is chr 1 and bottom is chr max, interval of y axis is 3
#bar plot depth is 1.5, this will using for plot rectangle for cnvr
#chr$chr_top <- (max(cnvr_plot_part$Chr) + 1 -chr$Chr)*3 #our plot depend on x and y coordinate axis, the top one is chr 1 and bottom is chr29, interval of y axis is 3
#chr$chr_bottom <- chr$chr_top - 1.5 #bar plot depth is 1.5, this will using for plot rectangle for cnvr
#class(chr$Chr) #check the type
chr$Chr = as.character(chr$Chr)#convert integer to character
if (assembly == "UMD") {
#UMD3.1 Chromosome Length
chr_length <- c(51.505224, 46.312546, 45.407902, 51.681464, 42.90417, 62.71493, 52.530062,
61.435874, 71.599096, 72.042655, 64.057457,
66.004023,75.158596, 81.724687, 85.296676, 84.64839, 84.24035, 91.163125,
107.310763, 104.305016, 105.70825, 113.384836, 112.638659, 119.458736,
121.1914245, 120.829699, 121.430405, 137.060424, 158.337067)
chr_name <- paste0("chr", seq(from = 29, to = 1, by = -1))
chr_len <- data.frame("Chr" = seq(from = 29, to = 1, by = -1), "Length" = chr_length)
} else if(assembly == "ARS"){
# The length of X Chromosome is 139.009144 in ARS reference genome
# ARS assembly
chr_length <- c(51.098607, 45.94015, 45.612108, 51.992305, 42.350435,
62.317253, 52.498615, 60.773035, 69.862954,
71.974595, 63.449741, 65.820629, 73.167244, 81.013979,
85.00778, 82.403003, 83.472345, 87.216183, 106.982474,
103.308737, 105.454467, 113.31977, 110.682743, 117.80634,
120.089316, 120.000601, 121.005158, 136.231102, 158.53411)
chr_name <- paste0("chr", seq(from = 29, to = 1, by = -1))
chr_len <- data.frame("Chr" = seq(from = 29, to = 1, by = -1), "Length" = chr_length)
} else if (assembly == "hg38"){ #hg38 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.39#/st Y = 57.264655, X=156.040895,
chr_length <- c(51.857516, 46.709983, 64.444167, 58.617616, 80.373285,
83.836422, 92.211104, 102.439437, 108.136338, 114.364328, 133.275309, 135.186938,
133.797422, 138.688728, 145.138636, 159.345973, 170.805979, 181.630948, 190.424264,
198.450956, 242.508799, 249.698942)
chr_name <- paste0("chr", seq(from = 22, to = 1, by = -1))
chr_len <- data.frame("Chr" = seq(from = 22, to = 1, by = -1), "Length" = chr_length)
}else if (assembly == "hg19"){ #hg19 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/#/st Y = 59.373566, X=155.270560,
chr_length <- c(51.304566, 48.157577, 63.025520, 59.380841, 78.081510, 81.529607, 90.354753,
102.531392, 107.349540, 115.169878, 133.851895, 135.046619, 135.534747, 141.696573,
146.440111, 159.321559, 171.115067, 180.915260, 191.535534, 198.022430, 243.199373,
249.904550)
chr_name <- paste0("chr", seq(from = 22, to = 1, by = -1))
chr_len <- data.frame("Chr" = seq(from = 22, to = 1, by = -1), "Length" = chr_length)
}else if (assembly == "Sheep"){ #Oar_v4.0 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000298735.2/#/st X=135.185801,
chr_length <- c(44.047080, 45.223504, 41.976827, 62.282865, 50.780147, 49.987992, 51.049468,
60.445663, 68.494538, 72.251135, 71.693149, 80.783214, 62.568341, 82.951069,
79.028859, 62.170480, 86.377204, 94.583238, 90.615088, 100.009711, 116.888256,
107.836144, 119.216639, 223.996068, 248.966461, 275.406953)
chr_name <- paste0("chr", seq(from = 26, to = 1, by = -1))
chr_len <- data.frame("Chr" = seq(from = 26, to = 1, by = -1), "Length" = chr_length)
} else if (assembly == "Pig"){ #Sscrofa11.1 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000003025.6/#/st X=135.185801,
chr_length <- c(55.982971, 63.494081, 79.944280, 140.412725, 141.755446, 208.334590, 61.602749,
79.169978, 69.359453, 139.512083, 138.966237, 121.844099, 170.843587, 104.526007,
130.910915, 132.848913, 151.935994, 274.330532)
chr_name <- paste0("chr", seq(from = 18, to = 1, by = -1))
chr_len <- data.frame("Chr" = seq(from = 18, to = 1, by = -1), "Length" = chr_length)
} else if (assembly == "Chicken"){ #Chicken_galGal6 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_000002315.5/#/st W=7.488053, Z=82.545508
chr_length <- c(7.821848, 7.25831, 6.185160, 1.818525, 5.118401, 8.080432, 6.055710,
3.982494, 6.491222, 6.202910, 5.460380, 6.844979, 13.897287, 10.324343,
11.373140, 10.762512, 2.994104, 13.064218, 16.220073, 19.168871, 20.387278,
20.200042, 21.119840, 24.181049, 30.219446, 36.742308, 36.428957, 59.809098,
91.315245, 110.854228, 149.682049, 197.677740)
chr_name <- paste0("chr", c(33,32,31,30,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1))
chr_len <- data.frame("Chr" = c(33,32,31,30,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1),
"Length" = chr_length)
} else if (assembly == "Horse"){ #EquCab3.0 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_002863925.1/#/st X=128.206784,
chr_length <- c(26.001039, 31.395959, 34.77612, 47.348498, 40.25469, 43.147642, 40.282968, 48.288683,
55.556184, 50.928189, 58.984458, 65.343332, 62.681739, 82.641348, 80.72243, 88.962352,
92.851403, 94.600235, 43.784481, 36.992759, 61.676917, 85.155674, 85.793548, 97.563019,
100.787686, 87.230776, 96.759418, 109.462549, 121.351753, 121.350024, 188.260577)
chr_name <- paste0("chr", seq(from = 31, to = 1, by = -1))
chr_len <- data.frame("Chr" = seq(from = 31, to = 1, by = -1), "Length" = chr_length)
} else if (assembly == "Dog"){ #UMICH_Zoey_3.1 cite from https://www.ncbi.nlm.nih.gov/assembly/GCF_005444595.1/#/st X=122.739558,
chr_length <- c(23.826045, 30.840041, 31.110614, 26.483662, 42.177406, 31.430699, 39.041421, 39.389883,
40.358978, 42.070705, 41.338898, 46.166038, 38.422312, 51.695226, 47.687389, 52.790355,
61.534057, 50.995378, 57.931936, 53.975570, 55.934595, 64.204256, 59.281881, 64.289916,
61.098034, 63.682632, 72.762459, 74.036318, 70.059352, 60.772386, 74.192687, 80.745686,
77.758088, 89.245444, 88.429730, 92.032900, 82.903020, 122.894117)
chr_name <- paste0("chr", seq(from = 38, to = 1, by = -1))
chr_len <- data.frame("Chr" = seq(from = 38, to = 1, by = -1), "Length" = chr_length)
} else {
#construct the border of each chromosome from input data
complete_chr <- data.frame(Chr = seq(1, max(as.integer(cnvr_plot_part$Chr)), by = 1))
chr_length <- cnvr_plot_part %>%
group_by(Chr) %>%
summarise(length = max(End) / 1000000) %>%
right_join(y = complete_chr, by = "Chr") %>%
replace(is.na(.), 0) %>%
arrange(-as.integer(Chr)) %>%
select(length) %>%
as.matrix() %>%
t() %>%
as.vector()
chr_name <- paste0("chr", seq(from = max(as.integer(cnvr_plot_part$Chr)), to = 1, by = -1))
chr_len <- data.frame("Chr" = seq(from = max(as.integer(cnvr_plot_part$Chr)), to = 1, by = -1), "Length" = chr_length)
}
#4.6.1 prepare CNVPartition plot input data-----
cnvr_plot_part$start_left <- cnvr_plot_part$Start/1000000 #convert bp to Mbp
cnvr_plot_part$end_right <- cnvr_plot_part$End/1000000 #convert bp to Mbp
cnvr_plot_part$Chr <- as.character(cnvr_plot_part$Chr)
cnvr_plot_part <- merge(cnvr_plot_part, chr, all.x = TRUE)
#customize the number of CNVRs in map by sample size
if(!is.null(sample_size)){
cnvr_plot_part <- filter(cnvr_plot_part, n_Sample >= sample_size * common_cnv_threshold)
}
fwrite(cnvr_plot_part, file = paste0(folder, "/cnvr_plot.txt"), sep ="\t", quote = FALSE)
#this file need reread if you start from here
#cnvr_plot_part <- fread("CNVRplot/cnvr_plot_part.txt", sep ="\t", quote = FALSE)
gain_part <- cnvr_plot_part[cnvr_plot_part$Type == "Gain", ]
loss_part <- cnvr_plot_part[cnvr_plot_part$Type == "Loss", ]
mixed_part <- cnvr_plot_part[cnvr_plot_part$Type == "Mixed", ]
#4.6.2 start make cnvr plot for CNVPartition
png(filename = paste0(folder, "/cnvr_plot.png"),
res = 350,
width = width_1, height = height_1, units = "cm", bg = "transparent")
#setting the graphic parameter
par(lab=c(max(cnvr_plot_part$Chr),max(cnvr_plot_part$Chr),3),las=1,yaxt="s",xaxt="s",mar=c(4,5,2,2))
#draw a bar plot and setup each bar name
bar <- barplot(chr_length, horiz=TRUE,width=1.5,space=1,xlim=c(0,max(chr_length)+6),
names.arg= chr_name,
col=c("white"), border = chr_col, xlab="Physical Position (Mbp)",cex.name=0.8)
#read the gain type file, each file need prepare four columns as following order: start_left, end_right, chr_top and chr_bottom
rect(xleft = gain_part$start_left, ybottom = gain_part$chr_bottom, xright = gain_part$end_right, ytop = gain_part$chr_top, col = gain_col, border = gain_col)
#read the loss type file and draw a rectangle shape for each CNVR and color to green
#loss <- cnvr_plot_part[cnvr_plot_part$Type == 'LOSS']
rect(xleft = loss_part$start_left, ybottom = loss_part$chr_bottom, xright = loss_part$end_right, ytop = loss_part$chr_top, col = loss_col, border = loss_col)
#read the mixed file and plot rectangle as red
#mixed <- cnvr_plot_part[cnvr_plot_part$Type == 'mixed']
rect(xleft = mixed_part$start_left, ybottom = mixed_part$chr_bottom, xright = mixed_part$end_right, ytop = mixed_part$chr_top, col = mixed_col, border = mixed_col)
#condition. add summary label for each chromosome
if(label_prop == "TRUE"){
unique_chr <- data.frame("Chr" = unique(cnvr_plot_part$Chr))
chr_len_max <- merge(x = unique_chr, y = chr_len, all.x = TRUE)
chr_len_max$Chr <- as.character(chr_len_max$Chr)
chr_label <- cnvr_plot_part %>%
group_by(Chr) %>%
summarise(total_len = sum(Length) / 1000000) %>%
left_join(chr, by = "Chr") %>%
left_join(chr_len_max, by = "Chr") %>%
mutate(Len_prop = paste0(round(total_len * 100 / Length, digits = 1), "%"))
text(x = chr_label$Length + left_prop_label, y = chr_label$chr_bottom + 0.5, labels = chr_label$Len_prop, cex=0.5)
}
#prepare Y axis for CNVPartition overlap region, mark a under line on the overlap region
if(!is.null(overlap_cnvr)){
if(typeof(overlap_cnvr) == "character"){
overlap <- fread(file = overlap_cnvr, header = TRUE, sep = "\t")
} else {
overlap <- overlap_cnvr
}
overlap$Chr <- as.character(overlap$Chr)
overlap <- merge(x = overlap, y = chr, all.x = TRUE)
#condition, check if overlap CNVRs in original CNVR list? Only plot which are overlapped to original lists
cnvr_plot_part <- setDT(cnvr_plot_part)
setkey(cnvr_plot_part, Chr, Start, End)
overlap <- foverlaps(x = overlap, y = cnvr_plot_part, by.x = c("Chr", "Start", "End"), type = "any", nomatch = NULL)
overlap$start_left <- overlap$Start / 1000000
overlap$end_right <- overlap$End / 1000000
overlap$chr_bottom <- overlap$chr_bottom - 0.5
overlap$chr_top <- overlap$chr_top - 1.7
overlap$Type <- "Overlap"
rect(xleft = overlap$start_left, ybottom = overlap$chr_bottom, xright = overlap$end_right, ytop = overlap$chr_top, col = overlap_col, border = overlap_col)
#add legend
legend(legend_x, legend_y,c("Gain","Loss","Mixed", "Overlap"), pch = c(15, 15, 15, 15),
col =c(gain_col, loss_col, mixed_col, overlap_col), bty = "n", cex=0.8)
} else {
#add legend
legend(legend_x, legend_y,c("Gain","Loss","Mixed"), pch = c(15, 15, 15),
col =c(gain_col, loss_col, mixed_col), bty = "n", cex=0.8)
}
#title(main = "CNVR Distribution on Population Level")
dev.off()
if (file.exists(paste0(folder, "/cnvr_plot.png"))) {
cat("Task done, CNVR Distribution Plot saved in working directory.\n")
} else {
cat("Task faild, please check your CNVR input file carefully, the input cnvr file should be the CNVR results generated from call_cnvr function.\n")
}
}
else {
if(typeof(cnvr) == "character"){
cnvr <- fread(file = cnvr, header = TRUE, sep = "\t")
} else {
cnvr <- cnvr
}
# high_freq <- cnvr[which(cnvr[, cnvr$Frequent >= sample_size * 0.05]), ] #call common CNVRs
high_freq <- cnvr %>% filter(n_Sample >= sample_size * common_cnv_threshold)
cat(paste0("There ", nrow(high_freq), " high frequency CNVR passed the customized threshold.\n"))
for (i in 1:nrow(high_freq)) {
print(paste0("Ploting CNVR ", i, "..." ))
cnv_visual(clean_cnv = clean_cnv, chr_id = high_freq$Chr[i], start_position = high_freq$Start[i]/1000000, end_position = high_freq$End[i]/1000000, refgene = refgene, folder = folder, gene_font_size = gene_font_size, height_1 = height_1, width_1 = width_1, col_gene = col_gene)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.