#' Plotting CNVR with all relevant information
#'
#' @param cnvr cnvr list, standard file generated by the call_cnvr function
#' @param cnv_annotation standard file generated by the call_gene function
#' @param intensity the signal intensity file contains LRR and BAF
#' @param map the map corresponding to the reference genome used to create the CNVR file, standard file generated by the convert_map function
#' @param prefix_bed the prefix of the BED, BIM and FAM files in Plink format
#' @param sample_size the total number of unique samples in the CNVR list
#' @param common_cnv_threshold two decimal places, combine with sample_size to set the common threshold
#' @param width_1 default value is 14, unit is 'cm', set the width of final the plot
#' @param height_1 default value is 30, unit is 'cm', set the height of final the plot
#' @param ld_heat If TRUE, will plot a heatmap of LD, otherwise will plot the classic inverted triangle plot. Plotting is faster with TRUE, and plotting the triange with FALSE may fail for large regions.
#' @param folder set the folder name to save results in
#' @param col_0 set colour for 0 copy of CNV
#' @param col_1 set colour for 1 copy of CNV
#' @param col_2 set colour for 2 copy of CNV (which might be ROH)
#' @param col_3 set colour for 3 copy of CNV
#' @param col_4 set colour for 4 copy of CNV
#' @param col_gene set colour of genes
#' @param gene_font_size set the font size of gene labels in CNV annotated plot
#'
#' @import dplyr ggrepel ggplot2 tidyr gaston cowplot
#' @importFrom scales unit_format
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#' @importFrom stringr str_detect
#' @importFrom ggplotify base2grob
#' @return cnvr plot with all CNVs, annotated genes, Log R Ratio, B Allele Frequency, Genotyping rate and LD...
#' @export plot_cnvr_panorama
#'
plot_cnvr_panorama <- function(cnvr, cnv_annotation, intensity = NULL, map = NULL, prefix_bed = NULL, ld_heat = TRUE, sample_size, common_cnv_threshold = 0.05, width_1 = 14, height_1 = 30, folder = "cnvr_panorama", col_0 = "hotpink", col_1 = "turquoise", col_2 = "NA", col_3 = "tomato", col_4= "deepskyblue", col_gene = "black", gene_font_size = 2.2) {
if(!file.exists(folder)){
dir.create(folder)
print(paste0("A new folder '", folder, "' was created in working directory."))
}
if(!(missing(cnvr))){
#1.Read the CNVR result----
cnvr <- fread(file = cnvr)
cnvr <- unite(cnvr, "title", names(cnvr[, c("Chr", "Start", "End", "Type")]), remove = FALSE) #generate a new columns name
high_freq <- cnvr %>%
filter(n_Sample >= sample_size * common_cnv_threshold) %>%
arrange(Length)
if(nrow(high_freq) == 0){
print("No CNVRs passed the high frequency threshold, please reset your common_cnv_threshold!")
} else {
print(paste0(nrow(high_freq), " CNVRs passed the common frequency threshold."))
}
}
if(!missing(cnv_annotation)){
#2.read cnv
cnv <- fread(file = cnv_annotation)
names(cnv)[names(cnv) == "CNV_Start"] <- "Start"
names(cnv)[names(cnv) == "CNV_End"] <- "End"
#merge cnv and cnvr
setkey(cnv, Chr, Start, End)
cnv_cnvr <- foverlaps(cnvr, cnv)
}
#3. Read SNP intensity file----
if(!(missing(intensity)) & !(missing(map))){
default_title <- c("SNP Name", "Sample ID", "B Allele Freq", "Log R Ratio")
intensity <- fread(file = intensity, skip = 9, header = TRUE)
intensity <- intensity %>%
select(c(default_title))
#4.read map plink format
map <-fread(file = map, header = FALSE)
names(map) <- c("Chr", "SNP Name", "MorganPos", "Position")
map$Chr <- suppressWarnings(as.character(map$Chr))
names(map)[names(map) == "Name"] <- 'SNP Name'
inten_chr <- merge(intensity, map, by = "SNP Name", all.x =TRUE, sort =F) #add location information to intensity file
}
if(!(missing(prefix_bed))){
#5. Read bed,bim and fam data from plink----
x <- read.bed.matrix(basename = prefix_bed, rds = NULL) #path and prefix
}
#zoom cnv, in order to match cnv value for each snp in Intensity
# chr_length_ars <- data.frame("Chr" = c(29:1), "Length" = c( 51.098607, 45.94015, 45.612108, 51.992305,
# 42.350435, 62.317253, 52.498615, 60.773035, 69.862954,
# 71.974595, 63.449741, 65.820629, 73.167244, 81.013979,
# 85.00778, 82.403003, 83.472345, 87.216183, 106.982474,
# 103.308737, 105.454467, 113.31977, 110.682743, 117.80634,
# 120.089316, 120.000601, 121.005158, 136.231102, 158.53411))
# chr_length_ars <- chr_length_ars[order(chr_length_ars$Chr),]
chr_length_ars <- cnv %>%
group_by(Chr) %>%
summarise(Length = max(End) / 1000000) %>%
arrange(as.numeric(Chr)) %>%
select(Chr, Length)
print("Starting to make plot...")
for (i in 1:nrow(high_freq)){
chr_id = high_freq$Chr[i]
start_position = high_freq$Start[i]
end_position = high_freq$End[i]
cnv_chr <- cnv[cnv$Chr == chr_id, ]
cnv_chr_zoom <- filter(cnv_chr, Start >= start_position - 1 & End <= end_position + 1)
names(cnv_chr_zoom)[names(cnv_chr_zoom) == "Sample_ID"] <- "Sample ID"
id_coord <- data.frame("Sample_ID" = sort(unique(cnv_chr_zoom$`Sample ID`))) #extract unique ID prepare coordinate
try(id_coord$Order <- seq(1, nrow(id_coord),1), silent = FALSE)
#id_coord$x <- chr_length_ars[chr_id, 2]
max_len = chr_length_ars %>%
filter(Chr == chr_id) %>%
select(Length)
id_coord$x <- max_len$Length
id_coord$y <- (id_coord$Order-1)*5 + 1
names(id_coord)[names(id_coord) == "Sample_ID"] <- "Sample ID"
cnv_chr_zoom <- merge(cnv_chr_zoom, id_coord, by = "Sample ID", all.x = TRUE, sort = FALSE) #prepare original data
cnv_chr_zoom$zoom_x <- end_position
cnv_chr_zoom$gene_order <- max(cnv_chr_zoom$Order) + 3
#gene_coord <- group_by(cnv_chr_zoom, name2) %>% slice(1) %>% drop_na(g_Start)# generate gene data
#gene_coord$CNV_Start <- gene_coord$g_Start
#gene_coord$Order <- gene_coord$gene_order
g_order <- max(cnv_chr_zoom$Order) + 3
g_order_1 <- max(cnv_chr_zoom$Order) + 6
g_order_2 <- max(cnv_chr_zoom$Order) + 9
g_order_3 <- max(cnv_chr_zoom$Order) + 12
g_order_4 <- max(cnv_chr_zoom$Order) + 15
gene_coord <- cnv_chr_zoom %>%
group_by(name2) %>%
drop_na(g_Start) %>%
arrange(g_Start) %>%
distinct(name2, .keep_all = T) %>%
filter(str_detect(name2, "[:alpha:]")) %>% #drop the gene without a certain name
#filter(!(name2 > 0)) %>%
ungroup() %>%
mutate(Order = case_when(length(Chr) <= 5 ~ rep_len(c(g_order, g_order_1),length.out = length(Chr)),
length(Chr) > 5 & length(name2) <= 15 ~ rep_len(c(g_order, g_order_1, g_order_2),length.out = length(Chr)),
length(Chr) > 15 & length(name2) <= 25 ~ rep_len(c(g_order, g_order_1, g_order_2, g_order_3),length.out = length(Chr)),
length(name2) > 25 ~ rep_len(c(g_order, g_order_1, g_order_2, g_order_3, g_order_4),length.out = length(name2))))
try(gene_coord$CNV_Value <- "5", silent = FALSE)
#zoom_name <- paste0("Chr", chr_id, "_",start_position,"-",end_position, "Mb", ".png")
id_number <- nrow(id_coord)
zoom_title <- paste0("Chr", chr_id, ":", round(start_position/1000000, 2),"-", round(end_position/1000000, 2), "Mb", " with ", id_number," Samples")
#png(res = 300, filename = zoom_name, width = 3500, height = 2000)
color_copy <- c("0" = col_0,
"1" = col_1,
"2" = col_2,
"3" = col_3,
"4" = col_4)
zoom_plot <- ggplot() +
geom_rect(data = cnv_chr_zoom, aes(xmin = Start/1000000, xmax = End/1000000, ymin = (Order-1)*5, ymax = (Order-1)*5 + 3, fill = as.factor(CNV_Value))) +
scale_fill_manual(values = color_copy) +
geom_rect(data = gene_coord, aes(xmin = g_Start/1000000, xmax = g_End/1000000, ymin = (Order-1)*5, ymax = (Order-1)*5 + 3), fill = col_gene) +
geom_text_repel(data = gene_coord, aes(x = g_Start/1000000, y = (Order-1)*5 + 4, label = name2), size = gene_font_size) +
geom_hline(yintercept = (max(cnv_chr_zoom$Order) + 2)*5 - 2, linetype = "dashed") +
scale_x_continuous(limits = c(start_position/1000000, end_position/1000000, by = 0.25)) +
#scale_x_continuous(breaks = seq(start_position, end_position, by = 250000), labels = unit_format(unit = "", scale = 1e-6)) +
#geom_text(aes(zoom_x, y, label = Sample_ID), size = 2.5) +
#scale_color_manual(values = c("#F8766D", "#A3A500", "#00B0F6", "#E76BF3", "black")) +
theme_bw() +
theme(legend.key.size = unit(0.5,"line"),
legend.title = element_text(size = 6),
legend.text = element_text(size = 6),
legend.margin=margin(0, 0, 0, 0),
#legend.position = c(0.95, -0.087),
) +
{if(missing(intensity)) theme(legend.position = "right")} +
{if(!missing(intensity)) theme(legend.position = "top")} +
#scale_y_continuous(labels = NULL) +
#scale_x_continuous(breaks = seq(round(start_position,2), round(end_position,2), by = 0.2)) +
labs(x = "Physical Position (Mb)", y ="Individual", title = zoom_title, fill = "Copy")
if(!(missing(intensity)) & !(missing(map))){
#5.6 Plot BAF, LRR, LF MAF----
#extract intensity for individual which with cnv in cnvr, so start and end position from cnvr
sub_inten <- inten_chr[which(inten_chr$Chr == chr_id),]
sub_inten <- sub_inten[order(sub_inten$Position), ]
sub_inten <- filter(sub_inten, Position >= start_position & Position <= end_position)
sub_inten <- sub_inten[which(sub_inten$`Sample ID` %in% cnv_chr_zoom$`Sample ID`), ]
cnv_chr_zoom.byId = split(cnv_chr_zoom, cnv_chr_zoom$`Sample ID`)
typeF = function(i) {
if ( nrow(sub_inten) == 0 ) {
return(2)
}
id = sub_inten[i,'Sample ID'][[1]] ## get the ID in data_1 for row i
pos = sub_inten[i,'Position'] ## get the Position in data_1 for row i
tab = cnv_chr_zoom.byId[[id]] ## get the subset of data_2 that matches this ID
## For each row in the data_2 subset, is the Position
## inside the interval [Start,End]
## idx is a Boolean (TRUE or FALSE) vector
idx = pos >= tab$Start & pos <= tab$End
## Return the matching Type from the data_2 subset
## or return 2 if nothing matches
## any(idx): does any element of idx == TRUE?, i.e.,
## does the Position match any interval?
## Yes -> tab$Type[idx][1]: return the Type for the first match
## No -> return 2
ifelse( any(idx), tab$CNV_Value[idx][1], 2 )
}
CNV_Value = sapply( 1:nrow(sub_inten), typeF )
sub_inten = cbind(sub_inten, CNV_Value)
baf <- ggplot(sub_inten, aes(x = Position, y =`B Allele Freq`, color = as.factor(CNV_Value))) +
scale_color_manual(values = color_copy) +
#scale_color_manual(values = c("#F8766D", "#A3A500","gray", "#00B0F6", "#E76BF3")) +
theme_bw() +
theme(legend.position = "top",
legend.margin=margin(0, 0, 0, 0),
axis.title.x = element_blank()) +
geom_point(shape = 1) +
scale_x_continuous(breaks = seq(start_position, end_position, by = 250000), labels = unit_format(unit = "", scale = 1e-6)) +
ylim(0,1) +
#labs(x = 'Position (Mb)', y = 'B Allele Freq', color = "Copy_Num")
labs(y = 'B Allele Freq', color = "Copy_Num")
lrr <- ggplot(sub_inten, aes(x = Position, y = `Log R Ratio`, color = as.factor(CNV_Value))) +
#scale_color_manual(values = c("#F8766D", "#A3A500", "gray", "#00B0F6", "#E76BF3")) +
scale_color_manual(values = color_copy) +
theme_bw() +
theme(legend.position = "top",
legend.margin=margin(0, 0, 0, 0),
axis.title.x = element_blank()) +
geom_point(shape = 1) +
scale_x_continuous(breaks = seq(start_position, end_position, by = 250000), labels = unit_format(unit = "", scale = 1e-6)) +
ylim(-2, 2) +
#labs(x = 'Position (Mb)', y = 'Log R Ratio', color = "Copy_Num")
labs(y = 'Log R Ratio', color = "Copy_Num")
}
if(!(missing(prefix_bed))){
ld_x <- LD(x, c(which(x@snps$id == sub_inten$`SNP Name`[1]), which(x@snps$id == sub_inten$`SNP Name`[nrow(sub_inten)])))
#ld_x <- LD(x, c(which(x@snps$id == sub_inten$`SNP Name`[1]), which(x@snps$id == sub_inten$`SNP Name`[10])))
ld_x[is.na(ld_x)] <- 0
snp_info <- x@snps[c(which(x@snps$id == sub_inten$`SNP Name`[1]):which(x@snps$id == sub_inten$`SNP Name`[nrow(sub_inten)])),]
#snp_info <- x@snps[c(which(x@snps$id == sub_inten$`SNP Name`[1]):which(x@snps$id == sub_inten$`SNP Name`[10])),]
#select to plot Classical inverted triangle LD or heatmap of LD
if(ld_heat == FALSE){
try(ld <- ggplotify::as.ggplot(function() LD.plot(LD = ld_x, snp.positions = snp_info$pos, draw.chr = FALSE, graphical.par = list(mar = c(0,0,0,0)), below.space = 0)), silent = TRUE)
###ld <- ggplotify::base2grob(function() LD.plot(LD = ld_x, snp.positions = snp_info$pos, draw.chr = FALSE))
} else{
#plot ld and genotype in an interested region
ld_table <- reshape2::melt(ld_x) #convert
heat_map <-ggplot(ld_table, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_gradientn(colours = c("lightblue", "yellow", "red"), na.value = "black") +
scale_x_discrete(labels = NULL) +
scale_y_discrete(labels = NULL) +
theme(legend.position = "top") + #top right
labs(x = "SNP Ordered by Position", y = "SNP Ordered by Position", fill = "r^2")
#dev.off() #save plot in working directory
}
maf <- ggplot(data = snp_info) +
geom_point(aes(x = pos, y = maf, color = "maf")) +
geom_point(aes(x = pos, y = hz, color = "heterozygosity")) +
geom_line(aes(x = pos, y = callrate, color = "callrate")) +
scale_x_continuous(breaks = seq(start_position, end_position, by = 250000), labels = unit_format(unit = "", scale = 1e-6)) +
ylim(0.0, 1.0) + theme_bw() +
theme(legend.position = "top",
legend.margin=margin(0, 0, 0, 0),
axis.title.x = element_blank()) +
labs(y = "Percentage") +
scale_color_manual(values = c("maf" = "red", "heterozygosity" = "green", "callrate" = "purple"))
}
#preparing save final plot
b <- high_freq$title[i]
dir <- paste( b, ".png", sep = "")
#png(dir, res = 300, width = 2000, height = 5000, bg = "transparent")
#plot_grid(zoom,zoom2, maf, baf, lrr, align = "v", axis = "t", ncol = 1, rel_heights = c(1,1,1,1,1))
png(filename = paste0(folder, "/",dir), width = width_1, height = height_1, res = 300, units = "cm")
if(missing(intensity) & missing(map) & missing(prefix_bed)){
final_plot <- zoom_plot
} else if(missing(prefix_bed) & missing(ld_heat)){
final_plot <- plot_grid(zoom_plot, baf, lrr, align = "v", ncol = 1,
rel_heights = c(1.5, 1, 1))
} else{
if(ld_heat == TRUE){
final_plot <- plot_grid(zoom_plot, baf, lrr, maf, heat_map, align = "v", ncol = 1,
rel_heights = c(1.5, 1, 1, 1, 1.5))
} else{
try(final_plot <- plot_grid(zoom_plot, baf, lrr, maf, ld, align = "v", ncol = 1,
rel_heights = c(1.5, 1, 1, 1, 1.6)))
}
}
print(final_plot)
dev.off()
#ggsave(plot = final_plot, path = paste0(folder, "/"), filename = dir, width = width_1, height = height_1, dpi = 300, units = "cm")
print(paste0("Plot ", i, " (", dir, ") was stored in the ", paste0(folder, "/"), " directory."))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.