R/plot_cnvr_panorama.R

Defines functions plot_cnvr_panorama

Documented in plot_cnvr_panorama

#' Plotting CNVR with all relevant information
#'
#' @param cnvr cnvr list, standard file generated by the call_cnvr function
#' @param cnv_annotation standard file generated by the call_gene function
#' @param intensity the signal intensity file contains LRR and BAF
#' @param map the map corresponding to the reference genome used to create the CNVR file, standard file generated by the convert_map function
#' @param prefix_bed the prefix of the BED, BIM and FAM files in Plink format
#' @param sample_size the total number of unique samples in the CNVR list
#' @param common_cnv_threshold two decimal places, combine with sample_size to set the common threshold
#' @param width_1 default value is 14, unit is 'cm', set the width of final the plot
#' @param height_1 default value is 30, unit is 'cm', set the height of final the plot
#' @param ld_heat If TRUE, will plot a heatmap of LD, otherwise will plot the classic inverted triangle plot. Plotting is faster with TRUE, and plotting the triange with FALSE may fail for large regions.
#' @param folder set the folder name to save results in
#' @param col_0 set colour for 0 copy of CNV
#' @param col_1 set colour for 1 copy of CNV
#' @param col_2 set colour for 2 copy of CNV (which might be ROH)
#' @param col_3 set colour for 3 copy of CNV
#' @param col_4 set colour for 4 copy of CNV
#' @param col_gene set colour of genes
#' @param gene_font_size set the font size of gene labels in CNV annotated plot
#'
#' @import dplyr ggrepel ggplot2 tidyr gaston cowplot
#' @importFrom scales unit_format
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#' @importFrom stringr str_detect
#' @importFrom ggplotify base2grob
#' @return cnvr plot with all CNVs, annotated genes, Log R Ratio, B Allele Frequency, Genotyping rate and LD...
#' @export plot_cnvr_panorama
#'
plot_cnvr_panorama <- function(cnvr, cnv_annotation, intensity = NULL, map = NULL, prefix_bed = NULL, ld_heat = TRUE, sample_size, common_cnv_threshold = 0.05, width_1 = 14, height_1 = 30, folder = "cnvr_panorama", col_0 = "hotpink",  col_1 = "turquoise", col_2 = "NA", col_3 = "tomato", col_4= "deepskyblue", col_gene = "black", gene_font_size = 2.2) {
  if(!file.exists(folder)){
    dir.create(folder)
    print(paste0("A new folder '", folder, "' was created in working directory."))
  }

  if(!(missing(cnvr))){
    #1.Read the CNVR result----
    cnvr <- fread(file = cnvr)
    cnvr <- unite(cnvr, "title", names(cnvr[, c("Chr", "Start", "End", "Type")]), remove = FALSE) #generate a new columns name
    high_freq <- cnvr %>%
      filter(n_Sample >= sample_size * common_cnv_threshold) %>%
      arrange(Length)
    if(nrow(high_freq) == 0){
      print("No CNVRs passed the high frequency threshold, please reset your common_cnv_threshold!")
    } else {
      print(paste0(nrow(high_freq), " CNVRs passed the common frequency threshold."))
    }
  }

  if(!missing(cnv_annotation)){
    #2.read cnv
    cnv <- fread(file = cnv_annotation)
    names(cnv)[names(cnv) == "CNV_Start"] <- "Start"
    names(cnv)[names(cnv) == "CNV_End"] <- "End"
    #merge cnv and cnvr
    setkey(cnv, Chr, Start, End)
    cnv_cnvr <- foverlaps(cnvr, cnv)
  }

  #3. Read SNP intensity file----
  if(!(missing(intensity)) & !(missing(map))){
    default_title <- c("SNP Name", "Sample ID",	"B Allele Freq", "Log R Ratio")
    intensity <- fread(file = intensity, skip = 9, header = TRUE)
    intensity <- intensity %>%
                 select(c(default_title))

  #4.read map plink format
    map <-fread(file = map, header = FALSE)
    names(map) <- c("Chr", "SNP Name", "MorganPos", "Position")
    map$Chr <- suppressWarnings(as.character(map$Chr))
    names(map)[names(map) == "Name"] <- 'SNP Name'
    inten_chr <- merge(intensity, map, by = "SNP Name", all.x =TRUE, sort =F) #add location information to intensity file
    }

  if(!(missing(prefix_bed))){
    #5. Read bed,bim and fam data from plink----
    x <- read.bed.matrix(basename = prefix_bed, rds = NULL) #path and prefix
  }

  #zoom cnv, in order to match cnv value for each snp in Intensity
  # chr_length_ars <- data.frame("Chr" = c(29:1), "Length" = c( 51.098607, 45.94015, 45.612108, 51.992305,
  #                                                               42.350435, 62.317253, 52.498615, 60.773035, 69.862954,
  #                                                               71.974595, 63.449741, 65.820629, 73.167244, 81.013979,
  #                                                               85.00778, 82.403003, 83.472345, 87.216183, 106.982474,
  #                                                               103.308737, 105.454467, 113.31977, 110.682743, 117.80634,
  #                                                               120.089316, 120.000601, 121.005158, 136.231102, 158.53411))
  # chr_length_ars <- chr_length_ars[order(chr_length_ars$Chr),]

  chr_length_ars <- cnv %>%
    group_by(Chr) %>%
    summarise(Length = max(End) / 1000000) %>%
    arrange(as.numeric(Chr)) %>%
    select(Chr, Length)

  print("Starting to make plot...")
  for (i in 1:nrow(high_freq)){
    chr_id = high_freq$Chr[i]
    start_position = high_freq$Start[i]
    end_position = high_freq$End[i]

    cnv_chr <- cnv[cnv$Chr == chr_id, ]
    cnv_chr_zoom <- filter(cnv_chr, Start >= start_position - 1 & End <= end_position + 1)
    names(cnv_chr_zoom)[names(cnv_chr_zoom) == "Sample_ID"] <- "Sample ID"
    id_coord <- data.frame("Sample_ID" = sort(unique(cnv_chr_zoom$`Sample ID`))) #extract unique ID prepare coordinate
    try(id_coord$Order <- seq(1, nrow(id_coord),1), silent = FALSE)
    #id_coord$x <- chr_length_ars[chr_id, 2]
    max_len = chr_length_ars %>%
      filter(Chr == chr_id) %>%
      select(Length)
    id_coord$x <- max_len$Length
    id_coord$y <- (id_coord$Order-1)*5 + 1
    names(id_coord)[names(id_coord) == "Sample_ID"] <- "Sample ID"
    cnv_chr_zoom <- merge(cnv_chr_zoom, id_coord, by = "Sample ID", all.x = TRUE, sort = FALSE) #prepare original data
    cnv_chr_zoom$zoom_x <- end_position
    cnv_chr_zoom$gene_order <- max(cnv_chr_zoom$Order) + 3
    #gene_coord <- group_by(cnv_chr_zoom, name2) %>% slice(1) %>% drop_na(g_Start)# generate gene data
    #gene_coord$CNV_Start <- gene_coord$g_Start
    #gene_coord$Order <- gene_coord$gene_order
    g_order <- max(cnv_chr_zoom$Order) + 3
    g_order_1 <- max(cnv_chr_zoom$Order) + 6
    g_order_2 <- max(cnv_chr_zoom$Order) + 9
    g_order_3 <- max(cnv_chr_zoom$Order) + 12
    g_order_4 <- max(cnv_chr_zoom$Order) + 15

    gene_coord <- cnv_chr_zoom %>%
                  group_by(name2) %>%
                  drop_na(g_Start) %>%
                  arrange(g_Start) %>%
                  distinct(name2, .keep_all = T) %>%
                  filter(str_detect(name2, "[:alpha:]")) %>% #drop the gene without a certain name
                  #filter(!(name2 > 0)) %>%
                  ungroup() %>%
                    mutate(Order = case_when(length(Chr) <= 5 ~ rep_len(c(g_order, g_order_1),length.out = length(Chr)),
                               length(Chr) > 5 & length(name2) <= 15 ~ rep_len(c(g_order, g_order_1, g_order_2),length.out = length(Chr)),
                               length(Chr) > 15 & length(name2) <= 25 ~ rep_len(c(g_order, g_order_1, g_order_2, g_order_3),length.out = length(Chr)),
                               length(name2) > 25 ~ rep_len(c(g_order, g_order_1, g_order_2, g_order_3, g_order_4),length.out = length(name2))))
    try(gene_coord$CNV_Value <- "5", silent = FALSE)

    #zoom_name <- paste0("Chr", chr_id, "_",start_position,"-",end_position, "Mb", ".png")
    id_number <- nrow(id_coord)
    zoom_title <- paste0("Chr", chr_id, ":", round(start_position/1000000, 2),"-", round(end_position/1000000, 2), "Mb", " with ", id_number," Samples")
    #png(res = 300, filename = zoom_name, width = 3500, height = 2000)
    color_copy <- c("0" = col_0,
                    "1" = col_1,
                    "2" = col_2,
                    "3" = col_3,
                    "4" = col_4)
    zoom_plot <- ggplot() +
      geom_rect(data = cnv_chr_zoom, aes(xmin = Start/1000000, xmax = End/1000000, ymin = (Order-1)*5, ymax = (Order-1)*5 + 3, fill = as.factor(CNV_Value))) +
      scale_fill_manual(values = color_copy) +
      geom_rect(data = gene_coord, aes(xmin = g_Start/1000000, xmax = g_End/1000000, ymin = (Order-1)*5, ymax = (Order-1)*5 + 3), fill = col_gene) +
      geom_text_repel(data = gene_coord, aes(x = g_Start/1000000, y = (Order-1)*5 + 4, label = name2), size = gene_font_size) +
      geom_hline(yintercept = (max(cnv_chr_zoom$Order) + 2)*5 - 2, linetype = "dashed") +
      scale_x_continuous(limits = c(start_position/1000000, end_position/1000000, by = 0.25)) +
      #scale_x_continuous(breaks = seq(start_position, end_position, by = 250000), labels = unit_format(unit = "", scale = 1e-6)) +
      #geom_text(aes(zoom_x, y, label = Sample_ID), size = 2.5) +
      #scale_color_manual(values = c("#F8766D", "#A3A500", "#00B0F6", "#E76BF3", "black")) +
      theme_bw() +
      theme(legend.key.size = unit(0.5,"line"),
            legend.title = element_text(size = 6),
            legend.text  = element_text(size = 6),
            legend.margin=margin(0, 0, 0, 0),
            #legend.position = c(0.95, -0.087),
            ) +
      {if(missing(intensity)) theme(legend.position = "right")} +
      {if(!missing(intensity)) theme(legend.position = "top")} +
      #scale_y_continuous(labels = NULL) +
      #scale_x_continuous(breaks = seq(round(start_position,2), round(end_position,2), by = 0.2)) +
      labs(x = "Physical Position (Mb)", y ="Individual", title = zoom_title, fill = "Copy")

    if(!(missing(intensity)) & !(missing(map))){
      #5.6 Plot BAF, LRR, LF MAF----
      #extract intensity for individual which with cnv in cnvr, so start and end position from cnvr
      sub_inten <- inten_chr[which(inten_chr$Chr == chr_id),]
      sub_inten <- sub_inten[order(sub_inten$Position), ]
      sub_inten <- filter(sub_inten, Position >= start_position & Position <= end_position)
      sub_inten <- sub_inten[which(sub_inten$`Sample ID` %in% cnv_chr_zoom$`Sample ID`), ]

      cnv_chr_zoom.byId = split(cnv_chr_zoom, cnv_chr_zoom$`Sample ID`)
      typeF = function(i) {
        if ( nrow(sub_inten) == 0 ) {
          return(2)
        }
        id = sub_inten[i,'Sample ID'][[1]]         ## get the ID in data_1 for row i
        pos = sub_inten[i,'Position']  ## get the Position in data_1 for row i
        tab = cnv_chr_zoom.byId[[id]]     ## get the subset of data_2 that matches this ID
        ## For each row in the data_2 subset, is the Position
        ## inside the interval [Start,End]
        ## idx is a Boolean (TRUE or FALSE) vector
        idx = pos >= tab$Start & pos <= tab$End
        ## Return the matching Type from the data_2 subset
        ## or return 2 if nothing matches
        ## any(idx): does any element of idx == TRUE?, i.e.,
        ## does the Position match any interval?
        ## Yes -> tab$Type[idx][1]: return the Type for the first match
        ## No  -> return 2
        ifelse( any(idx), tab$CNV_Value[idx][1], 2 )
      }

      CNV_Value = sapply( 1:nrow(sub_inten), typeF )
      sub_inten = cbind(sub_inten, CNV_Value)

      baf <- ggplot(sub_inten, aes(x = Position, y =`B Allele Freq`, color = as.factor(CNV_Value))) +
        scale_color_manual(values = color_copy) +
        #scale_color_manual(values = c("#F8766D", "#A3A500","gray", "#00B0F6", "#E76BF3")) +
        theme_bw() +
        theme(legend.position = "top",
              legend.margin=margin(0, 0, 0, 0),
              axis.title.x = element_blank()) +
        geom_point(shape = 1) +
        scale_x_continuous(breaks = seq(start_position, end_position, by = 250000), labels = unit_format(unit = "", scale = 1e-6)) +
        ylim(0,1) +
        #labs(x = 'Position (Mb)', y = 'B Allele Freq', color = "Copy_Num")
        labs(y = 'B Allele Freq', color = "Copy_Num")

      lrr <- ggplot(sub_inten, aes(x = Position, y = `Log R Ratio`, color = as.factor(CNV_Value))) +
        #scale_color_manual(values = c("#F8766D", "#A3A500", "gray", "#00B0F6", "#E76BF3")) +
        scale_color_manual(values = color_copy) +
        theme_bw() +
        theme(legend.position = "top",
              legend.margin=margin(0, 0, 0, 0),
              axis.title.x = element_blank()) +
        geom_point(shape = 1) +
        scale_x_continuous(breaks = seq(start_position, end_position, by = 250000), labels = unit_format(unit = "", scale = 1e-6)) +
        ylim(-2, 2) +
        #labs(x = 'Position (Mb)', y = 'Log R Ratio', color = "Copy_Num")
        labs(y = 'Log R Ratio', color = "Copy_Num")
    }

    if(!(missing(prefix_bed))){
      ld_x <- LD(x, c(which(x@snps$id == sub_inten$`SNP Name`[1]), which(x@snps$id == sub_inten$`SNP Name`[nrow(sub_inten)])))
      #ld_x <- LD(x, c(which(x@snps$id == sub_inten$`SNP Name`[1]), which(x@snps$id == sub_inten$`SNP Name`[10])))
      ld_x[is.na(ld_x)] <- 0
      snp_info <- x@snps[c(which(x@snps$id == sub_inten$`SNP Name`[1]):which(x@snps$id == sub_inten$`SNP Name`[nrow(sub_inten)])),]
      #snp_info <- x@snps[c(which(x@snps$id == sub_inten$`SNP Name`[1]):which(x@snps$id == sub_inten$`SNP Name`[10])),]

      #select to plot Classical inverted triangle LD or heatmap of LD
      if(ld_heat == FALSE){
        try(ld <- ggplotify::as.ggplot(function() LD.plot(LD = ld_x, snp.positions = snp_info$pos, draw.chr = FALSE, graphical.par = list(mar = c(0,0,0,0)), below.space = 0)), silent = TRUE)
        ###ld <- ggplotify::base2grob(function() LD.plot(LD = ld_x, snp.positions = snp_info$pos, draw.chr = FALSE))
      } else{
        #plot ld and genotype in an interested region
        ld_table <- reshape2::melt(ld_x) #convert
        heat_map <-ggplot(ld_table, aes(Var1, Var2, fill = value)) +
          geom_tile() +
          scale_fill_gradientn(colours = c("lightblue", "yellow", "red"), na.value = "black") +
          scale_x_discrete(labels = NULL) +
          scale_y_discrete(labels = NULL) +
          theme(legend.position = "top") + #top right
          labs(x = "SNP Ordered by Position", y = "SNP Ordered by Position", fill = "r^2")
        #dev.off() #save plot in working directory
      }

      maf <- ggplot(data = snp_info) +
        geom_point(aes(x = pos, y = maf, color = "maf")) +
        geom_point(aes(x = pos, y = hz, color = "heterozygosity")) +
        geom_line(aes(x = pos, y = callrate, color = "callrate")) +
        scale_x_continuous(breaks = seq(start_position, end_position, by = 250000), labels = unit_format(unit = "", scale = 1e-6)) +
        ylim(0.0, 1.0) + theme_bw() +
        theme(legend.position = "top",
              legend.margin=margin(0, 0, 0, 0),
              axis.title.x = element_blank()) +
        labs(y = "Percentage") +
        scale_color_manual(values = c("maf" = "red", "heterozygosity" = "green", "callrate" = "purple"))
    }

    #preparing save final plot
    b <- high_freq$title[i]
    dir <- paste( b, ".png", sep = "")
    #png(dir, res = 300, width = 2000, height = 5000, bg = "transparent")
    #plot_grid(zoom,zoom2, maf, baf, lrr, align = "v", axis = "t", ncol = 1, rel_heights = c(1,1,1,1,1))
    png(filename = paste0(folder, "/",dir), width = width_1, height = height_1, res = 300, units = "cm")

    if(missing(intensity) & missing(map) & missing(prefix_bed)){
      final_plot <- zoom_plot
    } else if(missing(prefix_bed) & missing(ld_heat)){
      final_plot <- plot_grid(zoom_plot, baf, lrr, align = "v", ncol = 1,
                              rel_heights = c(1.5, 1, 1))
    } else{
      if(ld_heat == TRUE){
        final_plot <- plot_grid(zoom_plot, baf, lrr, maf, heat_map, align = "v", ncol = 1,
                                rel_heights = c(1.5, 1, 1, 1, 1.5))
      } else{
        try(final_plot <- plot_grid(zoom_plot, baf, lrr, maf, ld, align = "v", ncol = 1,
                                    rel_heights = c(1.5, 1, 1, 1, 1.6)))
      }
    }

    print(final_plot)
    dev.off()
    #ggsave(plot = final_plot, path = paste0(folder, "/"), filename = dir, width = width_1, height = height_1, dpi = 300, units = "cm")
    print(paste0("Plot ", i, " (", dir, ") was stored in the ", paste0(folder, "/"), " directory."))
  }
}
JH-Zhou/HandyCNV documentation built on Dec. 18, 2021, 12:25 a.m.