R/cnv.R

Defines functions mean_cnv_plot quantile_plot cnv_heatmap prep_cnv_dat

Documented in cnv_heatmap mean_cnv_plot prep_cnv_dat quantile_plot

#' Prepare data for CNV heat map
#'
#' This function caps CNV scores, adds annotation columns for plotting,
#' performs hierarchical clustering of bins based on similar CNV score, and plots nUMI per bin 
#'
#' @param dat_bin data.table of CNV scores per bin
#' @param lower numeric float to represent the lower cap for CNV scores
#' @param upper numeric float to represent the upper cap for CNV scores 
#' @param hc_function character for which hierarchical clustering function to use
#' @param plot_directory output plot directory path
#' @return A list object for downstream cnv plotting and analysis
#'         all = data.table of CNV scores of all bins x (metadata + genes)
#'         malig = data.table of CNV scores of just malignant bins x (metadata + genes)
#'         all_wide = data.frame in wide format of CNV scores of all bins x (metadata + genes)
#'         malig_wide = data.frame in wide format of CNV scores of just malignant bins x (metadata + genes)
#'         hcl = hclust object that describes the hierarchical clustering for malignant bins 
#'         hcl_all = hclust object that describes the hierarchical clustering for all bins 

#' @export
prep_cnv_dat <- function(dat_bin, 
                         lower=0.6, 
                         upper=1.4, 
                         hc_function = 'ward.D2', 
                         plot_directory) {     
    
    # store old par value to restore upon exit
    old_par <- graphics::par(no.readonly = TRUE) 
    on.exit(graphics::par(old_par)) 
    
    # separate data into all (malignant + non-malignant), malignant, and non-malignant cells
    malig_cells=unique(dat_bin[cluster_type == 'Malignant',]$variable)
    ref_cells=unique(dat_bin[cluster_type == 'Non-malignant',]$variable)
    all_cells=unique(dat_bin$variable)
    
    dat_malig <- dat_bin[variable %in% malig_cells]
    dat_ref <- dat_bin[variable %in% ref_cells]
    
    # cap data
    dat_bin[value>upper]$value <- upper
    dat_malig[value>upper]$value <- upper
    dat_bin[value<lower]$value <- lower
    dat_malig[value<lower]$value <- lower
    
    # factorize cells, chrs, and add order column
    dat_malig[,variable:=factor(variable,levels=malig_cells),]
    dat_malig$chr <- factor(dat_malig$chr, 
                            levels = unique(dat_malig$chr))
    dat_malig[,total_order:=c(1:nrow(dat_malig)),]
    dat_bin[,variable:=factor(variable,levels=all_cells),]
    dat_bin$chr <- factor(dat_bin$chr, 
                          levels = unique(dat_bin$chr))
    dat_bin[,total_order:=c(1:nrow(dat_bin)),]
    
    # transform to wide format for a bins vs genes data frame
    malig_wide=as.data.frame(data.table::dcast(dat_malig,variable~chr+plot_order,
                                               value.var = "value"))
    row.names(malig_wide)=malig_wide$variable
    malig_wide$variable=NULL
    malig_wide <- malig_wide[,stringr::str_sort(colnames(malig_wide), numeric = TRUE)]

    all_wide=as.data.frame(data.table::dcast(dat_bin,variable~chr+plot_order,
                                             value.var = "value"))
    row.names(all_wide)=all_wide$variable
    all_wide$variable=NULL
    all_wide <- all_wide[,stringr::str_sort(colnames(all_wide),
                                   numeric = TRUE)]

    # hierarchical clustering of bins by CNV score
    hcl=stats::hclust(stats::dist(malig_wide), method=hc_function)
    hcl_all=stats::hclust(stats::dist(all_wide), method=hc_function)
    
    # plot dendrogram of hierarchical clustering
    grDevices::pdf(file = paste0(plot_directory,"/hcl_cnv.pdf"), width = 10, height = 6)
    graphics::par(mfrow=c(1,2)) 
    plot(hcl,labels=FALSE)
    plot(hcl_all,labels=FALSE)
    grDevices::dev.off()
    
    dat_malig[,variable:=factor(variable,levels=hcl$labels[hcl$order])]
    dat_bin[,variable:=factor(variable,levels=hcl_all$labels[hcl_all$order])]

    # plot distribution of values across number of beads/bin and umi/bin
    grDevices::pdf(file = paste0(plot_directory,"/beads_umi_per_bin.pdf"), width = 10, height = 6)
    graphics::par(mfrow=c(1,2)) 
    graphics::boxplot(dat_malig$value ~ dat_malig$N_bin)
    graphics::boxplot(dat_malig$value ~ dat_malig$umi_bin)
    grDevices::dev.off()
    
    cnv_data <- list("all" = dat_bin, 
                     "malig" = dat_malig, 
                     "all_wide" = all_wide, 
                     "malig_wide" = malig_wide, 
                     "hcl"=hcl,
                     "hcl_all" = hcl_all)
    return(cnv_data)
}
utils::globalVariables(c("cluster_type", "variable", "value", "total_order"))

#' Plot CNV scores on a heat map
#'
#' This function prepares data for plotting and makes a heat map of CNV scores per bead across all genes
#'
#' @param cnv_data list object of cnv data from SlideCNA::prep_cnv_dat()
#' @param md data.table of metadata of each bead
#' @param chrom_colors vector of colors labeled by which chromosome they correspond to
#' @param hc_function character for which hierarchical clustering function to use
#' @param plot_directory output plot directory path
#' @return None

#' @export
cnv_heatmap <- function(cnv_data, 
                        md, 
                        chrom_colors, 
                        hc_function = 'ward.D2', 
                        plot_directory) {
    
    # get chromosome names
    annot_col <- data.frame(chr=matrix(unlist(strsplit(colnames(cnv_data$all_wide), "_")), 
                                       ncol=2, 
                                       byrow=TRUE)[,1], 
                            row.names=colnames(cnv_data$all_wide))
    
    # Select columns to plot
    md_sub <- unique(md[,c('bin_all', 'cluster_type')])
    annot_row <- data.frame(md_sub$cluster_type, row.names=md_sub$bin_all)
    colnames(annot_row)[1] = 'Tissue Type'
    
    # Order by plot order (genomic position)
    cnv_data$all_wide <- cnv_data$all_wide[order(as.numeric(row.names(cnv_data$all_wide))),]
    cnv_data$all_wide <- rbind(cnv_data$all_wide[1:nrow(cnv_data$malig_wide),]
                          [match(rownames(cnv_data$malig_wide),
                                 rownames(cnv_data$all_wide[1:nrow(cnv_data$malig_wide),])),],
                               cnv_data$all_wide[(nrow(cnv_data$malig_wide)+1):nrow(cnv_data$all_wide),])
    
    # Choose colors to represnt tissue types
    ann_colors = list("Tissue Type" = c("Malignant"="#F8776D", "Non-malignant"="#00BFC4"), chr = chrom_colors)
    
    # Plot heat map
    grDevices::png(file = paste0(plot_directory,"/cnv_heatmap.png"), width = 1500, height = 1000) # png version
    print(pheatmap::pheatmap(cnv_data$all_wide, 
                             color = grDevices::colorRampPalette(c("navy", "white","firebrick3"))(50), 
                             cluster_rows = TRUE, 
                             cluster_cols = FALSE, 
                             clustering_distance_rows = "euclidean",
                             clustering_method = hc_function, 
                             annotation_row = annot_row, 
                             annotation_col = annot_col,
                             annotation_colors=ann_colors, 
                             show_rownames=FALSE, 
                             show_colnames=FALSE, 
                             filename=paste0(plot_directory,"/cnv_heatmap.png")))
    grDevices::dev.off()
 
    # Plot heat map (pdf)
    pheatmap::pheatmap(cnv_data$all_wide, 
                             color = grDevices::colorRampPalette(c("navy", "white","firebrick3"))(50), 
                             cluster_rows = TRUE, 
                             cluster_cols = FALSE, 
                             clustering_distance_rows = "euclidean",
                             clustering_method = hc_function, 
                             annotation_row = annot_row, 
                             annotation_col = annot_col,
                             annotation_colors=ann_colors, 
                             show_rownames=FALSE, 
                             show_colnames=FALSE, 
                             filename=paste0(plot_directory,"/cnv_heatmap.pdf"),
                              width=18,
                              height=15)
}

#' Plot CNV score quantiles per bin and per chromosome
#'
#' This function colors and plots each bin by its CNV score quantiles (min, 1st quartile, median, 3rd quartile, max)
#' on spatial coordinates for each chromosome
#'
#' @param cnv_data list object of cnv data from SlideCNA::prep_cnv_dat()
#' @param cluster_label character string of which column name to keep 
#' @param text_size integer of text size for ggplot
#' @param title_size integer of title size for ggplot
#' @param legend_height_bar integer of bar height of legend for ggplot
#' @param plot_directory output plot directory path
#’ @Import ggplot2
#' @return None

#' @export
quantile_plot <- function(cnv_data, 
                          cluster_label="seurat_clusters", 
                          text_size, 
                          title_size, 
                          legend_height_bar, 
                          plot_directory) {
    plot_data <- cnv_data$all
    plot_data$chr <- factor(plot_data$chr, levels = unique(plot_data$chr))
    
    # get quantile values
    min_plot <- plot_data[,new_value:=min(value),by=c("variable","chr")]
    min_plot$level <- '1'
    first_plot <- plot_data[,new_value:=stats::quantile(value, .25),by=c("variable","chr")]
    first_plot$level <- '2'
    med_plot <- plot_data[,new_value:=stats::median(value),by=c("variable","chr")]
    med_plot$level <- '3'
    third_plot <- plot_data[,new_value:=stats::quantile(value, .75),by=c("variable","chr")]
    third_plot$level <- '4'
    max_plot <- plot_data[,new_value:=max(value),by=c("variable","chr")]
    max_plot$level <- '5'

    # combined data.frame of quamtiles
    quant_combined <- rbind(min_plot, first_plot, med_plot, third_plot, max_plot)
    quant_combined=dplyr::distinct(dplyr::select(quant_combined,
                                                 c(variable, chr, pos_x, pos_y, cluster_label, new_value, level)))
    # Plot CNV scores across space
    legend_title='CNV Score'
    gg <- ggplot2::ggplot(quant_combined) +
    geom_point(aes(pos_x, pos_y, color = new_value)) + 
    facet_wrap(chr~level,scales="free_x") +
    scale_color_gradient2(midpoint=1, low="blue", mid="white", high="red") +
    theme_bw() +
    theme(panel.background = element_rect(fill = "black"),
          axis.title=element_text(size=title_size, face='bold'), 
          strip.text.x = element_text(size=text_size),
          legend.title=element_text(size=title_size, face='bold'),
          legend.text=element_text(size=text_size),
          legend.key.height = unit(legend_height_bar,"cm"),
          plot.background = element_blank(),
          panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    xlab("Position X") + 
    ylab("Position Y") + 
    labs(color = legend_title)
    
    grDevices::png(file = paste0(plot_directory,"/cnv_score_quantiles.png"), width = 1000, height = 1200) # png version
    print(gg)
    grDevices::dev.off()
}                              
utils::globalVariables(c("new_value", "value", "variable", "chr", "pos_x", "pos_y", "level"))

#' Plot mean CNV scores per bin and per chromosome
#'
#' This function colors and plots each bin by its mean CNV score
#' on spatial coordinates for each chromosome
#'
#' @param cnv_data list object of cnv data from SlideCNA::prep_cnv_dat()
#' @param text_size integer of text size for ggplot
#' @param title_size integer of title size for ggplot
#' @param legend_height_bar integer of bar height of legend for ggplot
#' @param plot_directory output plot directory path
#’ @Import ggplot2
#' @return None

#' @export
mean_cnv_plot <- function(cnv_data, 
                          text_size, 
                          title_size, 
                          legend_height_bar, 
                          plot_directory) {
    
#     plot_data <- data.table()
#     for (col in colnames(cnv_data$all)) {
#         plot_data <- cbind(plot_data, cnv_data$all[[col]])
#     }
#     colnames(plot_data) <- colnames(cnv_data$all)
    
    plot_data <- data.table::copy(cnv_data$all)
    # add new column that takes the mean cnv score across all genes
    mean_plot <- plot_data[,cnv_score:=mean(value),by=c("variable","chr")]
    
    # plot CNV scores
    legend_title='CNV Score'
    gg <- ggplot2::ggplot(mean_plot) +
    geom_point(aes(pos_x, pos_y, color = cnv_score)) + 
    facet_wrap(~chr,scales="free_x") +
    scale_color_gradient2(midpoint=1, low="blue", mid="white", high="red") +
    theme_bw() +
    theme(panel.background = element_rect(fill = "black"), 
          axis.text=element_text(size=text_size), 
          axis.title=element_text(size=title_size, face='bold'), 
          strip.text.x = element_text(size=text_size),
          legend.title=element_text(size=title_size, face='bold'), 
          legend.text=element_text(size=text_size),
          legend.key.height = unit(legend_height_bar,"cm"),
          plot.background = element_blank(),
          panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + 
    xlab("Position X") + ylab("Position Y") + labs(color = legend_title) 
    
    grDevices::png(file = paste0(plot_directory,"/mean_cnv_score.png"), width = 1200, height = 1200) # png version
    print(gg)
    grDevices::dev.off()
}                              
utils::globalVariables(c("cnv_score", "value", "pos_x", "pos_y"))
dkzhang777/SlideCNA documentation built on Jan. 25, 2025, 5:53 p.m.