R/bin.R

Defines functions long_to_bin SpatialPlot dat_to_long bin bin_metadata

Documented in bin bin_metadata dat_to_long long_to_bin SpatialPlot

#' Spatio-molecular binning of relative expression intensities
#'
#' This function combines metadata with binned relative expression intensities 
#'
#' @param md data.table of metadata of each bead
#' @param dat data.table of smoothed relative expression intensities 
#' @param avg_bead_per_bin integer of average number of beads there should be per bin 
#' @param pos TRUE if doing spatial and expressional binning, FALSE if just expressional binning
#' @param pos_k positional weight
#' @param ex_k expressional weight
#' @param hc_function hierarchical clustering function
#' @param plot_directory output plot directory path
#' @return A data.table of bead metadata combined with binned expression intensities for all genes for all beads

#' @export
bin_metadata <- function(md, 
                         dat, 
                         avg_bead_per_bin=12, 
                         pos=TRUE, 
                         pos_k=55, 
                         ex_k=1, 
                         hc_function='ward.D2',
                         plot_directory) {
    
    # bin malignant beads
    md_mal <- md[cluster_type=='Malignant']
    dat_mal <- cbind(dat[,c("GENE", "chr", "start", "end", "rel_gene_pos", "length")], 
                     dat[,which(dat[,c(colnames(dat)%in%md_mal$bc)]), with=FALSE])
    
    n_bin_mal <- round(nrow(md_mal)/avg_bead_per_bin)
    futile.logger::flog.info("Number of malignant bins: %s", n_bin_mal)
    md_mal <- SlideCNA::bin(dat_mal, md_mal, n_bin_mal, pos, pos_k, ex_k, hc_function, plot_directory)
    
    # bin reference beads
    md_ref <- md[cluster_type=='Non-malignant']
    dat_ref <- cbind(dat[,c("GENE", "chr", "start", "end", "rel_gene_pos", "length")], 
                     dat[,which(dat[,c(colnames(dat)%in%md_ref$bc)]), with=FALSE])
    
    n_bin_ref <- round(nrow(md_ref)/avg_bead_per_bin)
    futile.logger::flog.info("Number of non-malignant bins: %s", n_bin_ref)
    md_ref <- SlideCNA::bin(dat_ref, md_ref, n_bin_ref, pos, pos_k, ex_k, hc_function, plot_directory)
    md_ref$bin_all <- md_ref$bin_all + max(md_mal$bin_all) 
    
    md <- rbind(md_mal, md_ref)
    md[,pct_mal:=sum(icluster_type)/.N,by=bin_all]
    return(md)
    
}   
utils::globalVariables(c("cluster_type", "pct_mal", "icluster_type", ".N", "bin_all"))

#' Subfunction of bin_metadata() for expression/positional binning 
#'
#' This function computes a pseudospatial distance between beads that combines spatial distance and distance from the expression space, then using the silhouette score and hierarchical clustering, segregates beads into bins
#'
#' @param dat data.table of smoothed relative expression intensities 
#' @param md data.table of metadata of each bead
#' @param k number of malignant bins to set
#' @param pos number of malignant clusters
#' @param pos TRUE if doing spatial and expressional binning, FALSE if just expressional binning
#' @param pos_k positional weight
#' @param ex_k expressional weight
#' @param hc_function hierarchical clustering function
#' @param plot_directory output plot directory path
#' @return A data.table of bead metadata combined with bin designations
                              
#' @export
bin <- function(dat, 
                md, 
                k, 
                pos=TRUE, 
                pos_k=55, 
                ex_k=1, 
                hc_function = 'ward.D2', 
                plot_directory) {
    
    dat_var <- t(dat[,!c("chr", "start", "end", "rel_gene_pos", "length")]) 
    colnames(dat_var) <- dat_var[1,]
    dat_var <- as.data.frame(dat_var[-1,])
    dat_var$bc <- rownames(dat_var)
    dat_var <- as.data.frame(merge(dplyr::select(md, bc), dat_var, by="bc"))
    row.names(dat_var) <- dat_var$bc
    dat_var <- dat_var[,-1]  
    dat_var <- scale(sapply(dat_var, as.numeric))
    expr_distance <- stats::dist(dat_var) # get distances from expression matrix
    
    md[,icluster_type:=ifelse(cluster_type=='Non-malignant',0,1)]
    icluster_type <- as.data.frame(dplyr::select(md, bc, icluster_type))
    row.names(icluster_type) <- icluster_type$bc
    icluster_type <- icluster_type[,-1]
    
    # If using spatial information
    if (pos==TRUE) {
        pos <- as.data.frame(dplyr::select(md, bc, pos_x, pos_y))
        row.names(pos) <- pos$bc
        pos <- pos[,-1]
        pos_distance <- stats::dist(pos) # get distances from position matrix
        
        # linearly combine expression and position matrices
        distance <- pos_k * pos_distance + ex_k * expr_distance
    }
    else {
        distance <- expr_distance
    }
    
    # hierarchical clustering on combined distances
    hcl <- stats::hclust(distance, method=hc_function)
    grDevices::pdf(file = paste0(plot_directory, "/bin_hclust_dend.pdf"), width = 10, height = 6)
    plot(hcl)
    grDevices::dev.off()
    
    # get bins 
    hcl_sub <- stats::cutree(hcl, k = k)
    
    # update bead metadata with bin designations
    new_md <- dplyr::mutate(md, bin_all = hcl_sub)
    new_md[,N_bin:=.N,by=bin_all] #.N is number of instances that bin combo exists in dataset
    new_md[,umi_bin:=sum(nCount_RNA),by=bin_all]
    
    return(new_md)
}
utils::globalVariables(c("bc", "cluster_type", "pos_x", "pos_y", "N_bin", ".N",
                         "bin_all", "umi_bin", "nCount_RNA"))

#' Convert data to long format and add in metadata
#'
#' This function will create rows for each bead and gene combination, adding in new metadata with bin designations
#'
#' @param dat data.table of smoothed relative expression intensities 
#' @param md data.table of metadata per bead 
#' @return A data.table of bead expression intensities per gene with metadata in long format 

#' @export
dat_to_long <- function(dat, 
                        md) {
    # change to long format
    dat_long=data.table::as.data.table(reshape2::melt(dat,id.vars = c("GENE","chr","start","end","rel_gene_pos", "length")))
    # add metadata
    dat_long=merge(dat_long,md,by.x="variable",by.y="bc")
    return(dat_long)
}  

#' Spatial plots of meta data
#'
#' This function will plot information about beads and bins on x and y coordinates
#'
#' @param dat_long data.table of bead expression intensities per gene with metadata in long format
#' @param vars character vector of features to plot/columns of metadata
#' @param text_size Ggplot2 text size
#' @param title_size Ggplot2 title size
#' @param legend_size_pt Ggplot2 legend_size_pt
#' @param legend_height_bar Ggplot2 legend_height_bar
#' @param plot_directory output plot directory path
#' @return None

#' @export
SpatialPlot <- function(dat_long, 
                        vars=NULL, 
                        text_size, 
                        title_size, 
                        legend_size_pt, 
                        legend_height_bar, 
                        plot_directory) {
    dat_distinct <- dplyr::distinct(dplyr::select(dat_long, 
                                                  c("variable", "pos_x", "pos_y", vars)))
    
    for (var in vars) {
        
        if (var == "seurat_clusters") {
            legend_title = "Seurat Cluster"
        }
        else if (var == "nmf_ct") {
            legend_title = "NMF Cell Types"
        }
        else if (var == "Cross_Section") {
            legend_title = "Tumor Subsample"
        }
        else if (var == "bin_all") {
            legend_title = "Bin"
        }
        else if (var == "N_bin") {
            legend_title = "Number of Beads per Bin"
        }
        else if (var == "umi_bin") {
            legend_title = "Number of UMIs per Bin"
        }
        else if (var == "cluster_type") {
             legend_title = "Tissue Type"
        }
        
        if (var %in% c('seurat_clusters', 'nmf_ct', 'cluster_type', 'Cross_Section')) {
            gg <- ggplot2::ggplot(dat_distinct) +
            geom_point(aes(pos_x, pos_y, color = eval(as.name(var))), size = 0.5) + 
            theme_bw() +
            theme(axis.text=element_text(size=text_size), 
                  axis.title=element_text(size=title_size, face='bold'),
                  legend.title=element_text(size=title_size, face='bold'), 
                  legend.text=element_text(size=text_size),
                  plot.background = element_blank(),
                  panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
            xlab("Position X") + 
            ylab("Position Y") + 
            labs(color = legend_title) +
            coord_fixed() +
            guides(color = guide_legend(override.aes = list(size = legend_size_pt)))
            
            grDevices::pdf(file = paste0(plot_directory,"/", var, "_spatial.pdf"), width = 6, height = 8)
            print(gg)
            grDevices::dev.off()
        }
        
        else if (var == 'bin_all') {
            gg <- ggplot2::ggplot(dat_distinct) +
            geom_point(aes(pos_x, pos_y, color = eval(as.name(var))), size=0.5) + 
            theme_bw() + 
            theme(axis.text=element_text(size=text_size), 
                  axis.title=element_text(size=title_size, face='bold'),
                  legend.title=element_text(size=title_size, face='bold'),
                  legend.text=element_text(size=text_size),
                  legend.key.height = unit(legend_height_bar,"cm"),
                  plot.background = element_blank(),
                  panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
            xlab("Position X") + 
            ylab("Position Y") + 
            labs(color = legend_title) + 
            coord_fixed() +
            scale_color_gradientn(colours = sample(grDevices::rainbow(500), 500))
            
            grDevices::pdf(file = paste0(plot_directory,"/", var, "_spatial.pdf"), width = 6, height = 8)
            print(gg)
            grDevices::dev.off()
        }
        
        else {
            gg <- ggplot2::ggplot(dat_distinct) + 
            geom_point(aes(pos_x, pos_y, color = eval(as.name(var))), size = 0.6) + 
            theme_bw() + 
            theme(axis.text=element_text(size=text_size), 
                  axis.title=element_text(size=title_size, face='bold'),
                  legend.title=element_text(size=title_size, face='bold'), 
                  legend.text=element_text(size=text_size),
                  legend.key.height = unit(legend_height_bar,"cm"),
                  plot.background = element_blank(),
                  panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
            xlab("Position X") + 
            ylab("Position Y") + 
            labs(color = legend_title) +
            coord_fixed()
            
            grDevices::pdf(file = paste0(plot_directory,"/", var, "_spatial.pdf"), width = 6, height = 8)
            print(gg)
            grDevices::dev.off()
        }        
             
    }
    
}
utils::globalVariables(c("pos_x", "pos_y"))

#' Convert to wide bin x genes + metadata format
#'
#' This function will combine beads into bins, taking the average expression intensities, average positions,  
#' most common cluster seurat cluster, and most common cluster/tissue type of constituent beads
#'
#' @param dat_long data.table of bead expression intensities per gene with metadata in long format
#' @param plot_directory output plot directory path
#' @param spatial True if using spatial information
#' @return data.table of expression intensities at aggregated bin level
#' @export
#' @import ggplot2

#' @export
long_to_bin <- function(dat_long, 
                        plot_directory, 
                        spatial=TRUE) {
    
    # Using spatial information
    if (isTRUE(spatial)) {
        # combine constituent beads to bins
        dat_bin=dat_long[,.(value=mean(value),
                            pos_x=mean(pos_x),
                            pos_y=mean(pos_y),
                            seurat_clusters=mode(seurat_clusters), 
                            cluster_type=mode(cluster_type)),
                            by=c("bin_all","GENE","chr","start","end","rel_gene_pos", 
                                 "length", "N_bin", "umi_bin")]
        data.table::setnames(dat_bin,"bin_all","variable")
        dat_bin[,plot_order:=c(1:length(value)),by=c("variable","chr")] # plot order is order of genes on chrom.

        # Plot max seurat cluster of a given bin
        gg <- ggplot2::ggplot(unique(dat_bin[,c('pos_x','pos_y','seurat_clusters')])) +
          geom_jitter(aes(pos_x, pos_y, color = seurat_clusters), width = 0.1)

        grDevices::pdf(file = paste0(plot_directory,"/seurat_clusters_bin_spatial.pdf"), width = 10, height = 6)
        print(gg)
        grDevices::dev.off()
    }
    
    # Not using spatial information
    else {
        # combine constituent beads to bins
        dat_bin=dat_long[,.(value=mean(value),
                            cluster_type=mode(cluster_type)), 
                            by=c("bin_all", "GENE", "chr",
                                 "start","end","rel_gene_pos",
                                 "length", "N_bin", "umi_bin")]
        
        data.table::setnames(dat_bin,"bin_all","variable")
        dat_bin[,plot_order:=c(1:length(value)),by=c("variable","chr")] # plot order is order of genes on chrom.
    }
    
    return(dat_bin)
}                               
utils::globalVariables(c("value", "pos_x", "pos_y", "seurat_clusters", "cluster_type", "plot_order", "."))

#' Subfunction of long_to_bin() that finds mode of vector/column
#'
#' This function finds the mode of a vector
#'
#' @param x vector (column in data.table) to calculate the mode from
#' @return mode of the vector
                              
#' @export
mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

#' Scale for nUMI (UMI Count) to generate CNV scores
#'
#' This function re-scales expression intensities to be in a smaller range, normalizes for nUMI per bin,
#' and subtracts reference bead signal
#'
#' @param dat_bin data.table of relative expression intensities per bin
#' @param thresh_hard TRUE if using strict thresholds for expression thresholds and FALSE if adjusting 
#' thresholds based on 1 + or - the mean of absolute min and max values
#' @return data.table of CNV scores per bin

#' @export
scale_nUMI <- function(dat_bin, 
                       thresh_hard=FALSE) {
    
    # Setting thresholds for scale range
    if (!isTRUE(thresh_hard)) {
        bot <- stats::quantile(dat_bin$value, na.rm=TRUE)[[1]] # min
        top <- stats::quantile(dat_bin$value, na.rm=TRUE)[[5]] # max

        thresh=mean(abs(c(bot, top)))
        bot_thresh <- 1-thresh
        top_thresh <- 1+thresh
    }
    else {
        bot_thresh=0.5
        top_thresh=1.5
    }

    for (nbin in unique(dat_bin$umi_bin)) {

            start = bot_thresh
            end = top_thresh
        
            # Capping extreme values and normalizing for nUMIs for each bin
            dat_bin[dat_bin$umi_bin==nbin,]$value <- SlideCNA::scalefit(dat_bin, nbin, start, end)
        
            # Adjust for reference beads
            normal_mean <- mean(dat_bin[dat_bin$cluster_type=='Non-malignant',]$value) # mean of reference beads
            dat_bin$value <- dat_bin$value - normal_mean + 1 
        }
    
    return(dat_bin)
}                  

#' Subfunction for scale_nUMI that normalizes a given bin for UMI count and centers the  mean CNV score at 1
#'
#' This function re-scales expression intensities to be in a smaller range, normalizes for nUMI per bin,
#' and centers the CNV scores to have a mean of 1
#'
#' @param obj data.table of relative expression intensities per bin
#' @param nbin nUMIs in that specific bin
#' @param start lower bound of CNV scores
#' @param end upper bound of CNV scores
#' @return vector of adjusted CNV scores for that bins with nbin number of nUMIs within the range (inclusive) of start to end

#' @export
scalefit <- function(obj, 
                     nbin, 
                     start, 
                     end) {
    scaled <- scales::rescale(obj[obj$umi_bin==nbin,]$value, to=c(start,end)) # scale for nUMIs 
    scaled <- scale(scaled, scale=FALSE) + 1 # re-center at 1
    return(scaled)
}
dkzhang777/SlideCNA documentation built on Jan. 25, 2025, 5:53 p.m.