R/Select_normal.R

Defines functions Select_normal

Documented in Select_normal

#' Identify candidate normal cells and normal regions for cell coverage normalization
#'
#' @param Obj_filtered An Alleloscope object with major haplotype proportion (theta_hat) for each cell of each region in the "rds_list".
#' @param raw_counts A large binned coverage matrix (bin by cell) with values being read counts for all chromosomal regions of tumor sample.
#' @param cell_nclust Integer. Number of clusters used in identifying normal cells in the sample.
#' @param pre_sel Logical (TRUE/FALSE). Whether or not to use theta_hat from regions without segmentation (each chromosome) to identify normal cells for segmentation.
#' @param plot_theta Logical (TRUE/FALSE). Whether or not to plot the hierarchical clustering result using the theta_hat values across regions.
#' @param cell_type A matrix with two columns: COL1- cell barcodes; COL2- cell types ("tumor" and others)
#' @param cutree_rows Integer. Number of clusters the rows are divided into for visualization (inherited from the pheatmap function).
#' @param mincell An integer to filter out regions with minimum number of cells.
#'
#' @return A Alleloscope object with a "select_normal" list added.
#' A "select_normal" list includes
#' "barcode_normal": Barcodes of the identified normal cells in the tumor sample.
#' "region_normal": A vector of ordered potential normal regions for selection. (1st is the most possible.)
#' "region_normal_rank": A table with the potential "normal regions" for the k clusters from hierarchical clustering.
#' "k_normal": An integer indicates the kth clsuter that is idenfied as "normal cells"
#'
#' @export
Select_normal=function(Obj_filtered=NULL, raw_counts=NULL, cell_nclust=5 , plot_theta=FALSE,pre_sel=FALSE, cell_type=NULL, cutree_rows=3 , mincell=NULL ){
  
  
  # check parameters
  if(is.null(Obj_filtered)){
    stop("Please provide a valid Alleloscope object for Obj_filtered.")
  }else if(length(unlist(strsplit(rownames(raw_counts)[2],'-')))!=3){
    stop("The rownames for the raw_counts matrix should be formatted as: chr1-1-20000.")
  }else if(!(nrow(raw_counts)>0 & ncol(raw_counts)>0)){
    stop("raw_counts matrix is not valid.")
  }else if(cell_nclust<0 | cell_nclust > length(Obj_filtered$barcodes)){
    stop("cell_nclust should be in the range [0, ncell].")
  }else if(cutree_rows<0 | cutree_rows > length(Obj_filtered$barcodes)){
    stop("cutree_rows should be in the range [0, ncell].")
  }
  
  # set values
  EMresult=Obj_filtered$rds_list
  filtered_seg_table=Obj_filtered$seg_table_filtered
  #cell_info=Obj_filtered$cell_info
  
  plot_path=paste0(Obj_filtered$dir_path,'/plots/')
  
  if(!dir.exists(plot_path)){
    dir.create(plot_path)
  }
  
  if(!dir.exists(paste0(Obj_filtered$dir_path,"/rds"))){
    dir.create(paste0(Obj_filtered$dir_path,"/rds"))
  }
  
  samplename=Obj_filtered$samplename
  cell_barcodes=Obj_filtered$barcodes
  assay=Obj_filtered$assay
  ncell=length(Obj_filtered$barcodes)
  distance_seg=as.numeric(filtered_seg_table$end)-as.numeric(filtered_seg_table$start) ## for cytoarm
  size=Obj_filtered$size
  cell_total=Matrix::colSums(Obj_filtered$total_all)
  if(is.null(mincell)){
    mincell=ncol(Obj_filtered$total_all)*0.9
  }
  if(mincell<0 | mincell> length(Obj_filtered$barcodes)){
    stop("mincell should be in the range [0, ncell].")
  }
  
  
  if(pre_sel==FALSE){
    ## raw/ref count matrix info
    raw_chr=sapply(strsplit(rownames(raw_counts),'-'),'[',1)
    raw_start=as.numeric(sapply(strsplit(rownames(raw_counts),'-'),'[',2))
    raw_end=as.numeric(sapply(strsplit(rownames(raw_counts),'-'),'[',3))
  }
  theta_N=list()
  
  for(chrr in as.character(filtered_seg_table$chrr)){ # for cytoarm
    
    chrrn=unlist(strsplit(chrr,':'))[1]
    result=EMresult[[paste0('chr', chrr)]]
    theta_hat=result$theta_hat
    names(theta_hat)=result$barcodes
    barcodes=result$barcodes#
    
    if(pre_sel==FALSE){
      
      if(length(result$barcodes)<mincell){
        cat(paste0("Exclude ",chrr," region:<",mincell," cells\n"))
        next
      }
      
      # subset the coverge info
      raw_counts_chr=raw_counts[which(raw_chr %in% paste0('chr', as.character(chrrn))),]
      raw_chr_sub=raw_chr[which(raw_chr %in% paste0('chr', as.character(chrrn)))]
      raw_start_sub=raw_start[which(raw_chr %in% paste0('chr', as.character(chrrn)))]
      raw_end_sub=raw_end[which(raw_chr %in% paste0('chr', as.character(chrrn)))]
      
      query=GenomicRanges::GRanges(paste0('chr',chrrn),IRanges::IRanges(as.numeric(filtered_seg_table[which(filtered_seg_table$chrr == chrr),2]), as.numeric(filtered_seg_table[which(filtered_seg_table$chrr == chrr),3])))
      subject=GenomicRanges::GRanges(raw_chr_sub, IRanges::IRanges(as.numeric(raw_start_sub),as.numeric(raw_end_sub))) ## cytoband 1-based start and 1-based end
      ov=findOverlaps(query, subject)
      ov=as.matrix(ov)
      
      
      bin_start=min(ov[,2])
      bin_end=max(ov[,2])
      raw_counts_chr=raw_counts_chr[bin_start:bin_end,] ## for p and q #for cytoarm
      
      raw_counts_chr=raw_counts_chr[, 1:length(Obj_filtered$barcodes)][,match(barcodes, cell_barcodes)]
      Nr=colSums(raw_counts_chr)
      
      # N0=cell_info$num_mapped_dedup_reads
      N0=cell_total
      
      barcodes_non_noisy=cell_barcodes#cell_info$barcode[which(cell_info$is_noisy==0)]
      barcodes_non_noisy=intersect(barcodes_non_noisy, barcodes)
      
      names(Nr)=barcodes
      Nr=Nr[match(barcodes_non_noisy, barcodes)]
      N0=N0[match(barcodes_non_noisy, barcodes)]
      Ni=Nr/N0*sum(size)/distance_seg[which(filtered_seg_table$chrr==chrr)]
      names(Ni)=barcodes_non_noisy
      Ni[Ni>quantile(Ni, 0.99)]=quantile(Ni, 0.99)
      
      
      theta_hat=theta_hat[match(barcodes_non_noisy, names(theta_hat))]
      
      
      df=data.frame(Nir_Ni0=Ni, theta_hat=theta_hat)
      
      theta_N[[paste0("rho_",as.character(chrr))]]=Ni
      theta_N[[paste0("theta_",as.character(chrr))]]=theta_hat
    }else{
      df=data.frame(theta_hat=theta_hat)
      theta_N[[paste0("theta_",as.character(chrr))]]=theta_hat
    }
    
    
    cat(paste0(chrr," "))
  }
  
  cat("\n")
  #####3 remove the region that have too few cells!
  
  cell_list<-lapply(theta_N, function(x) {
    names(x)
  })
  
  cell_intersect <- Reduce(intersect, cell_list)
  
  
  theta_hat_cbn <- sapply(theta_N,function(x){
    x[match(cell_intersect, names(x))]
  })
  
  rownames(theta_hat_cbn) <- cell_intersect
  
  if(pre_sel==FALSE){
    saveRDS(theta_hat_cbn, paste0(Obj_filtered$dir_path,"/rds/theta_N_seg.rds"))
    
    theta_hat_cbn2=theta_hat_cbn[,which(stringr::str_sub(colnames(theta_hat_cbn), end=1)=='t'), drop=F]
    rho_hat_cbn2=theta_hat_cbn[,which(stringr::str_sub(colnames(theta_hat_cbn), end=1)=='r'), drop=F]
    
    tmp=pheatmap::pheatmap(theta_hat_cbn2, cluster_cols = F, cluster_rows = T, show_rownames = F, clustering_method = "ward.D2", silent = TRUE)#, gaps_col=gaps_col), annotation_row = cell_label, annotation_col =region_label)
    
    
    if(plot_theta==TRUE){
      pdf(paste0(plot_path,"/hierarchcial_clustering_theta.pdf"), width = 12,height = 6)
      if(ncol(theta_hat_cbn2)<2){
        gaps_col=NULL
      }else{
        gaps_col=2*(1:(dim(theta_hat_cbn2)[2]/2))
      }
      region_name=paste0('chr',sapply(strsplit(colnames(theta_hat_cbn2),"_"),'[',2))
      if(is.null(cell_type)){
        tmp=pheatmap::pheatmap(theta_hat_cbn2, cluster_cols = F, cluster_rows = T, show_rownames = F, clustering_method = "ward.D2", gaps_col=gaps_col,  labels_col=region_name)#, annotation_row = cell_label, annotation_col =region_label)
      }else{
        barcodes_tumor=cell_type[which(cell_type[,2]=='tumor'),1]
        barcodes_normal=cell_type[which(cell_type[,2]!='tumor'),1]
        cell_label=rep('unknown', dim(theta_hat_cbn2)[1])
        cell_label[which(rownames(theta_hat_cbn2) %in% barcodes_tumor)]='tumor'
        cell_label[which(rownames(theta_hat_cbn2) %in% barcodes_normal)]='normal'
        cell_label=data.frame('cell type'=cell_label, row.names = rownames(theta_hat_cbn2))
        #cell_label=data.frame('cell type'=cell_type[,2], row.names = cell_type[,1])
        my_colour = list(
          cell.type = c(tumor = "#d53e4f", normal="#1f78b4",unknown="#696969" )
        )
        
        tmp=pheatmap::pheatmap(theta_hat_cbn2, cluster_cols = F, cluster_rows = T, show_rownames = F, clustering_method = "ward.D2", gaps_col=1:(dim(theta_hat_cbn2)[2]), annotation_row = cell_label, annotation_colors = my_colour, cutree_rows = cutree_rows,  labels_col=region_name)
        #tmp=pheatmap::pheatmap(theta_hat_cbn2, cluster_cols = F, cluster_rows = T, show_rownames = F, clustering_method = "ward.D2", gaps_col=1:(dim(theta_hat_cbn2)[2]), annotation_row = cell_label, cutree_rows = cutree_rows,  labels_col=region_name)
      }
      dev.off()
      
      cat(paste0("Plot for theta_hat clustering across all regions is saved in the path:", plot_path,"\n"))
    }
    
    
    ## select normal cells
    k=cell_nclust
    clust=cutree(tmp$tree_row, k=k)
    theta_ss_k=c()
    for(ii in 1:k){
      theta_sub=theta_hat_cbn2[which(clust==ii),, drop=F]
      theta_mean=apply(theta_sub, 2, mean)
      theta_ss=sum((theta_mean-0.5)^2)
      theta_ss_k=c(theta_ss_k, theta_ss)
    }
    
    (k_normal=which(theta_ss_k==min(theta_ss_k)))
    
    barcode_normal=names(clust)[which(clust==k_normal)]
    
    
    
    ## select normal regions
    #k=5)
    region_normal_rank5=matrix(nrow = 10, ncol=k)
    for(ii in 1:k){
      theta_sub=theta_hat_cbn2[which(clust==ii),, drop=F]
      rho_sub=rho_hat_cbn2[which(clust==ii),, drop=F]
      theta_ss_region=apply(theta_sub, 2, function(x) sum((x-0.55)^2))
      names(theta_ss_region)=colnames(theta_hat_cbn2)
      #rho_cv_region=apply(rho_hat_cbn2, 2, function(x) sd(x)/mean(x))
      rho_med_region=apply(rho_sub,2, mean)
      normal_rank=sort(rank(theta_ss_region)+rank(rho_med_region))
      normal_regions=sapply(strsplit(names(normal_rank),"_"),'[',2)
      region_normal_rank5[,ii]=normal_regions[1:10]
    }
    
    
    colnames(region_normal_rank5)=paste0('k',1:k)
    rownames(region_normal_rank5)=paste0('rank',1:10)
    
    #tmp=c()
    #for(rr in 1:3){
    #  tmp=c(tmp, names(table(region_normal_rank5[rr,-k_normal])))
    #}
    nrname=unique(as.character(region_normal_rank5))[!is.na(unique(as.character(region_normal_rank5)))]
    rorder=rep(0, length(nrname))
    
    for(rr in 1:ncol(region_normal_rank5)){
      if(rr!= k_normal){
        korder=match(nrname, region_normal_rank5[,rr])
        korder[is.na(korder)]=11
        rorder=rorder+korder
      }
    }
    names(rorder)=nrname
    
    (region_normal=names(sort(rorder))[1:10])
    
    select_normal=list("barcode_normal"=barcode_normal, "region_normal"=region_normal, "region_normal_rank"=region_normal_rank5, "k_normal"=k_normal )
    
  }else{
    saveRDS(theta_hat_cbn, paste0(Obj_filtered$dir_path,"/rds/theta_seg_pre_sel.rds"))
    
    theta_hat_cbn2=theta_hat_cbn[,which(stringr::str_sub(colnames(theta_hat_cbn), end=1)=='t'), drop=F]
    
    tmp=pheatmap::pheatmap(theta_hat_cbn2, cluster_cols = F, cluster_rows = T, show_rownames = F, clustering_method = "ward.D2", silent = TRUE)#, gaps_col=gaps_col), annotation_row = cell_label, annotation_col =region_label)
    
    
    if(plot_theta==TRUE){
      pdf(paste0(plot_path,"/hierarchcial_clustering_theta_pre_sel.pdf"), width = 12,height = 6)
      if(ncol(theta_hat_cbn2)<2){
        gaps_col=NULL
      }else{
        gaps_col=2*(1:(dim(theta_hat_cbn2)[2]/2))
      }
      region_name=paste0('chr',sapply(strsplit(colnames(theta_hat_cbn2),"_"),'[',2))
      if(is.null(cell_type)){
        tmp=pheatmap::pheatmap(theta_hat_cbn2, cluster_cols = F, cluster_rows = T, show_rownames = F, clustering_method = "ward.D2", gaps_col=gaps_col,  labels_col=region_name)#, annotation_row = cell_label, annotation_col =region_label)
      }else{
        barcodes_tumor=cell_type[which(cell_type[,2]=='tumor'),1]
        barcodes_normal=cell_type[which(cell_type[,2]!='tumor'),1]
        cell_label=rep('unknown', dim(theta_hat_cbn2)[1])
        cell_label[which(rownames(theta_hat_cbn2) %in% barcodes_tumor)]='tumor'
        cell_label[which(rownames(theta_hat_cbn2) %in% barcodes_normal)]='normal'
        cell_label=data.frame('cell type'=cell_label, row.names = rownames(theta_hat_cbn2))
        my_colour = list(
          cell.type = c(tumor = "#d53e4f", normal="#1f78b4",unknown="#696969" )
        )
        
        tmp=pheatmap::pheatmap(theta_hat_cbn2, cluster_cols = F, cluster_rows = T, show_rownames = F, clustering_method = "ward.D2", gaps_col=1:(dim(theta_hat_cbn2)[2]), annotation_row = cell_label, annotation_colors = my_colour, cutree_rows = cutree_rows,  labels_col=region_name)
      }
      dev.off()
      
      cat(paste0("Plot for theta_hat clustering across all regions is saved in the path:", plot_path,"\n"))
    }
    
    
    ## select normal cells
    k=cell_nclust
    clust=cutree(tmp$tree_row, k=k)
    theta_ss_k=c()
    for(ii in 1:k){
      theta_sub=theta_hat_cbn2[which(clust==ii),, drop=F]
      theta_mean=apply(theta_sub, 2, mean)
      theta_ss=sum((theta_mean-0.5)^2)
      theta_ss_k=c(theta_ss_k, theta_ss)
    }
    
    (k_normal=which(theta_ss_k==min(theta_ss_k)))
    
    barcode_normal=names(clust)[which(clust==k_normal)]
    
    select_normal=list("barcode_normal"=barcode_normal ) 
  }
  
  
  Obj_filtered$select_normal=select_normal
  Obj_filtered$theta_hats=theta_hat_cbn2
  
  message("Candidate normal cell and normal region info is in \"Obj_filtered$select_normal\".")
  return(Obj_filtered)
  
}
seasoncloud/Alleloscope documentation built on March 17, 2023, 1:14 a.m.