R/Cov_value.R

Defines functions Cov_value

Documented in Cov_value

#' Normalize coverage using identified/ specified normal cells and one normal region and generate a table with (rho_hat) of each cell for all regions.
#'
#' rho_hat: Relative coverage change for each cell in a region
#'
#' @param Obj_filtered An Alleloscope object with segments, specified normal cells and a normal region
#' @param type Specify whether the sample is a "tumor" or "cellline". If "type" is a "cellline", param "ref_counts" needs to be specified for normal sample.
#' @param raw_counts (required) A large binned coverage matrix (m1 bin by n1 cell) with values being read counts for all chromosomal regions of tumor sample.
#' @param ref_counts (required only when type = "cellline") A binned coverage matrix (m2 bin by n2 cell) with values being read counts for all chromosomal regions of normal sample. n2 can be 1 for bulk sample.
#' @param cov_adj An integer for coverage adjustment for tumor cells. 
#' @param qt_filter Logical (TRUE/ FALSE). Whether or not to exclude cells with rho_hat>0.99 or <0.01 for each region. 
#' @param refr Logical (TRUE/ FALSE). Whether or not to use diplid region for normalization (otherwise, cell size is used).
#' 
#' @return (rho_hat) of each cell for all region in the "cov_values".
#' Every column in the cov_values is (rho_hat) of each region. Each row is a cell.
#'
#' @import rtracklayer
#' @import pheatmap
#' @export
Cov_value=function(Obj_filtered=NULL, type="tumor", raw_counts=NULL, ref_counts=NULL, cov_adj=1, qt_filter=FALSE, refr=TRUE, plot_path=NULL){
  
  ## check parameters
  if(is.null(Obj_filtered)){
    stop("Please provide a valid Alleloscope object for Obj_filttered.")
  }else if(!(type=="tumor" | type=="cellline")){
    stop("type should be either tumor or cellline.")
  }else if(length(unlist(strsplit(rownames(raw_counts)[2],'-')))!=3){
    stop("The rownames for the raw_counts matrix should be formatted as: chr1-1-20000.")
  }else if(!(nrow(raw_counts)>0 & ncol(raw_counts)>0)){
    stop("raw_counts matrix is not valid.")
  }
  
  
  if(!dir.exists(paste0(Obj_filtered$dir_path,"/rds"))){
    dir.create(paste0(Obj_filtered$dir_path,"/rds"))
  }
  
  # set values
  samplename=Obj_filtered$samplename
  dir_path=Obj_filtered$dir_path
  assay=Obj_filtered$assay
  barcode_normal=Obj_filtered$select_normal$barcode_normal
  if(refr==TRUE){
  ref=Obj_filtered$ref
  refn=sapply((strsplit(ref,":")),'[',1)
  }else{
    ref="cellsize"
    refn="0"
  }
  seg_table_filtered=Obj_filtered$seg_table_filtered
  #rds_list=Obj_filtered$rds_list
  cell_barcodes=Obj_filtered$barcodes
  
  ncell=length(Obj_filtered$barcodes)
  theta_N_nr_nc=list()
  
  
  ##ref
  # if(!is.null(ref_gtv)){
  #   dna_gt=ref_gtv
  #   dna_gt=dna_gt[,which(sapply(strsplit(colnames(dna_gt),"_"),'[',1)=='rho')]
  # }
  # 
  # if(is.null(mincell)){
  #   mincell=round(ncol(Obj_filtered$total_all)*0.9)
  # }
  
  # if(mincell<0 | mincell> length(Obj_filtered$barcodes)){
  #   stop("mincell should be in the range [0, ncell].")
  # }
  
  
  ## raw/ref count matrix info
  raw_chr=sapply(strsplit(rownames(raw_counts),'-'),'[',1)
  raw_start=as.numeric(sapply(strsplit(rownames(raw_counts),'-'),'[',2))
  raw_end=as.numeric(sapply(strsplit(rownames(raw_counts),'-'),'[',3))
  
  #if(is.null(ref_gtv) & type=='cellline'){
  if( type=='cellline'){
    if(refr==TRUE){
      ref_chr=sapply(strsplit(rownames(ref_counts),'-'),'[',1)
      ref_start=as.numeric(sapply(strsplit(rownames(ref_counts),'-'),'[',2))
      ref_end=as.numeric(sapply(strsplit(rownames(ref_counts),'-'),'[',3))}
  }
  
  N0_all=colSums(raw_counts) ## for p and q #for cytoarm
  
  message("Start estimating cell specific (rho_hat) for each region.")
  
  
  
  ####
  if(!is.null(plot_path)){
  #regions=gsub('chr','',names(Obj_filtered$rds_list))
  regions=gsub('chr','',seg_table_filtered$chrr)
  for(chrr in regions){
    chrrn=unlist(strsplit(chrr,':'))[1]
    
    #result=rds_list[[paste0('chr',chrr)]]
    #theta_hat=result$theta_hat
    #names(theta_hat)=result$barcodes
    
    raw_counts_chr=raw_counts[which(raw_chr %in% paste0('chr', as.character(chrrn))),]
    raw_chr_sub=raw_chr[which(raw_chr %in% paste0('chr', as.character(chrrn)))]
    raw_start_sub=raw_start[which(raw_chr %in% paste0('chr', as.character(chrrn)))]
    raw_end_sub=raw_end[which(raw_chr %in% paste0('chr', as.character(chrrn)))]
    
    
    
    
    ## subsetting region
    query=GenomicRanges::GRanges(paste0('chr',chrrn),IRanges::IRanges(as.numeric(seg_table_filtered[which(seg_table_filtered$chrr == chrr),2]), as.numeric(seg_table_filtered[which(seg_table_filtered$chrr == chrr),3])))
    subject=GenomicRanges::GRanges(raw_chr_sub, IRanges::IRanges(as.numeric(raw_start_sub),as.numeric(raw_end_sub))) ## cytoband 1-based start and 1-based end
    ov=findOverlaps(query, subject)
    ov=as.matrix(ov)
    
    bin_start=min(ov[,2])
    bin_end=max(ov[,2])
    #raw_counts_chr=raw_counts_chr[bin_start:bin_end,match(result$barcodes, cell_barcodes)] ## for p and q #for cytoarm
    raw_counts_chr=raw_counts_chr[bin_start:bin_end,] ## for p and q #for cytoarm
    Nr=colSums(raw_counts_chr)
    #names(Nr)=result$barcodes
    names(Nr)=cell_barcodes
    
    
    # subsetting normal
    if(refr==TRUE){
      raw_counts_ref=raw_counts[which(raw_chr %in% paste0('chr', as.character(refn))),]
      raw_ref_chr_sub=raw_chr[which(raw_chr %in% paste0('chr', as.character(refn)))]
      raw_ref_start_sub=raw_start[which(raw_chr %in% paste0('chr', as.character(refn)))]
      raw_ref_end_sub=raw_end[which(raw_chr %in% paste0('chr', as.character(refn)))]
      
      query=GenomicRanges::GRanges(paste0('chr',refn),IRanges::IRanges(as.numeric(seg_table_filtered[which(seg_table_filtered$chrr == ref),2]), as.numeric(seg_table_filtered[which(seg_table_filtered$chrr == ref),3])))
      subject=GenomicRanges::GRanges(raw_ref_chr_sub, IRanges::IRanges(as.numeric(raw_ref_start_sub),as.numeric(raw_ref_end_sub))) ## cytoband 1-based start and 1-based end
      ov=findOverlaps(query, subject)
      ov=as.matrix(ov)
      
      bin_start=min(ov[,2])
      bin_end=max(ov[,2])
      #raw_counts_ref=raw_counts_ref[bin_start:bin_end,match(result$barcodes, cell_barcodes)] ## for p and q #for cytoarm
      raw_counts_ref=raw_counts_ref[bin_start:bin_end,] 
      N0=colSums(raw_counts_ref)
    }else{
      #raw_counts_ref=raw_counts[,match(result$barcodes, cell_barcodes)] ## for p and q #for cytoarm
      #N0=N0_all[match(result$barcodes, cell_barcodes)]
      N0=N0_all
      ref="cell_size"
      Obj_filtered$ref=ref
    }
    
    
    ## non_noisy
    barcodes_non_noisy=cell_barcodes#cell_info$barcode[which(cell_info$is_noisy==0)]
    #barcodes_non_noisy=intersect(barcodes_non_noisy, result$barcodes)
    
    
    #Nr=Nr[match(barcodes_non_noisy, result$barcodes)]
    #N0=N0[match(barcodes_non_noisy, result$barcodes)]
    Ni=Nr/N0
    names(Ni)=barcodes_non_noisy
    
    #if(is.null(ref_gtv)){
      # if(length(result$barcodes)<mincell){
      #   cat(paste0("Exclude ",chrr," region:<",mincell," cells\n"))
      #   next
      # }
      if(type=='tumor'){
        ref_ncell=length(barcode_normal)
        Nrref=Nr[which(names(Nr) %in% barcode_normal)]
        N0ref=N0[which(names(Nr) %in% barcode_normal)]
        Ni_ref=median(Nrref/N0ref)
        
      }else if(type=='cellline'){
        #normal sample from other normal dataset
        ref_counts_chr=ref_counts[which(ref_chr %in% paste0('chr', as.character(chrrn))),]
        ref_counts_ref=ref_counts[which(ref_chr %in% paste0('chr', as.character(refn))),]
        
        ref_chr_sub=ref_chr[which(ref_chr %in% paste0('chr', as.character(chrrn)))]
        ref_start_sub=ref_start[which(ref_chr %in% paste0('chr', as.character(chrrn)))]
        ref_end_sub=ref_end[which(ref_chr %in% paste0('chr', as.character(chrrn)))]
        
        ref_ref_chr_sub=ref_chr[which(ref_chr %in% paste0('chr', as.character(refn)))]
        ref_ref_start_sub=ref_start[which(ref_chr %in% paste0('chr', as.character(refn)))]
        ref_ref_end_sub=ref_end[which(ref_chr %in% paste0('chr', as.character(refn)))]
        
        # ref chr
        query=GenomicRanges::GRanges(paste0('chr',chrrn),IRanges::IRanges(as.numeric(seg_table_filtered[which(seg_table_filtered$chrr == chrr),2]), as.numeric(seg_table_filtered[which(seg_table_filtered$chrr == chrr),3])))
        subject=GenomicRanges::GRanges(ref_chr_sub, IRanges::IRanges(as.numeric(ref_start_sub),as.numeric(ref_end_sub))) ## cytoband 1-based start and 1-based end
        ov=findOverlaps(query, subject)
        ov=as.matrix(ov)
        
        bin_start=min(ov[,2])
        bin_end=max(ov[,2])
        ref_counts_chr=ref_counts_chr[bin_start:bin_end,] ## for p and q #for cytoarm
        
        ## ref ref
        query=GenomicRanges::GRanges(paste0('chr',refn),IRanges::IRanges(as.numeric(seg_table_filtered[which(seg_table_filtered$chrr == ref),2]), as.numeric(seg_table_filtered[which(seg_table_filtered$chrr == ref),3])))
        subject=GenomicRanges::GRanges(ref_ref_chr_sub, IRanges::IRanges(as.numeric(ref_ref_start_sub),as.numeric(ref_ref_end_sub))) ## cytoband 1-based start and 1-based end
        ov=findOverlaps(query, subject)
        ov=as.matrix(ov)
        
        bin_start=min(ov[,2])
        bin_end=max(ov[,2])
        ref_counts_ref=ref_counts_ref[bin_start:bin_end,] ## for p and q #for cytoarm
        
        Nrref=colSums(ref_counts_chr)
        N0ref=colSums(ref_counts_ref)
        #
        Ni_ref=Nrref/N0ref
        Ni_ref[is.na(Ni_ref)]=0
        Ni_ref=median(Ni_ref)
      }
      
      if(type=='cellline'){
        Ni=Ni*cov_adj
      }else{
        Ni[which(! names(Nr) %in% barcode_normal)]=Ni[which(! names(Nr) %in% barcode_normal)]*cov_adj
      }
      
      Ni=Ni/Ni_ref
      Ni[is.na(Ni)]=0
      Ni[which(Ni==Inf)]=0
      if(qt_filter==TRUE){
        Niq=Ni[which(Ni<=quantile(Ni, 0.99) & Ni>=quantile(Ni, 0.01))] ##
      }else{
        Niq=Ni
        Niq=pmin(Niq, quantile(Ni, 0.99))
        Niq=pmax(Niq, quantile(Ni, 0.01))
      }
      barcodes_nn_q=names(Niq)
      
    # }else{
    #   gt=as.numeric(dna_gt[,which(colnames(dna_gt)==paste0('rho_',chrr))])
    #   if(length(Ni)>(ncell/2)){
    #     md=median(gt)
    #     Ni=md/median(Ni, na.rm = TRUE)*Ni
    #   }else{
    #     md=quantile(gt,0.2)
    #     Ni=md/quantile(Ni,0.2, na.rm=TRUE)*Ni}
    #   Niq=Ni
    #   barcodes_nn_q=names(Niq)
    #   
    #   
    # }
    
    #theta_hat=theta_hat[match(barcodes_nn_q, names(theta_hat))]
    
    #w1=result$w1[match(names(Ni), names(result$w1))]
    #w2=result$w2[match(names(Ni), names(result$w2))]
    
    theta_N_nr_nc[[paste0("rho_",as.character(chrr))]]=Niq
    #theta_N_nr_nc[[paste0("theta_",as.character(chrr))]]=theta_hat
    #theta_N_nr_nc[[paste0("h1_",as.character(chrr))]]=w1
    #theta_N_nr_nc[[paste0("h2_",as.character(chrr))]]=w2
    
    
    cat(paste0(chrr," "))
  }
  cat('\n')
  
  cell_list<-lapply(theta_N_nr_nc, function(x) {
    names(x)
  })
  
  # if(!is.null(ref_gtv)){
  #   cell_intersect <- Reduce(union, cell_list)
  # }else{
   # if(cell_filter==TRUE){
      cell_intersect <- Reduce(intersect, cell_list)
  #  }else{
  #    cell_intersect <- Reduce(union, cell_list) 
  #  }
  #}
  
  
  theta_hat_cbn <- sapply(theta_N_nr_nc,function(x){
    x[match(cell_intersect, names(x))]
  })
  
  rownames(theta_hat_cbn) <- cell_intersect
  
  saveRDS(theta_hat_cbn, paste0(Obj_filtered$dir_path, "/rds/cov_values.rds"))
  
  Obj_filtered$cov_values=theta_hat_cbn
  message("\"cov_values\" is added to the Obj_filtered object.")
  cat(paste0("Matrix for cell specific (rho) for each region is stored as cov_values.rds in the path:",dir_path,"\n"))
  
  
  #### plotting
  message("Plotting the coverage values for each segment...")
  
  cluster_cbn=theta_hat_cbn
  cluster_cbn=cluster_cbn*2
  cluster_cbn=apply(cluster_cbn, c(1,2), function(x) if(x<=2.5 & x>=1.5){x=2}else{x=x})
  #cluster_cbn=apply(cluster_cbn, c(1,2), function(x) if(x<=2.3 & x>=1.7){x=2}else{x=x})
  #cluster_cbn=apply(cluster_cbn, c(1,2), function(x){pmin(x,5)})
  
  segmentation=Obj_filtered$seg_table_filtered
  nrep=round(as.numeric(segmentation[,5])/5000000)
  cnrep=c(0,cumsum(nrep))
  cluster_cbn2_all=matrix(ncol=sum(nrep), nrow=nrow(cluster_cbn))
  
  for(ii in 1:ncol(cluster_cbn)){
    if(nrep[ii]>0){
      rr=(replicate(nrep[ii],cluster_cbn[,ii]))
      cluster_cbn2_all[,(cnrep[ii]+1):(cnrep[ii+1])]=rr
    }
  }
  
  rownames(cluster_cbn2_all)=rownames(cluster_cbn)
  
  
  chrgap=c()
  for(ii in 1:22){
    chrgap=c(chrgap,sum(nrep[which(segmentation$chr==ii)]))
  }
  
  
  nclust=3
  celltype=as.data.frame(celltypes)
  rownames(celltype)=celltype[,1]
  
  plot_matrix<- cluster_cbn2_all
  
  breaklength = 50
  setcolor = colorRampPalette(c("blue", "white", "red"))(breaklength)
  setbreaks = c(seq(min(plot_matrix), 1.7, length.out=ceiling(breaklength/2) + 1), 
                c(2.3,seq((max(plot_matrix)-2.3)/breaklength+2.3, max(plot_matrix), 
                          length.out=floor(breaklength/2)))[1:(breaklength/2)])
  
  col_lab=rep(" ", ncol(cluster_cbn2_all))
  col_lab[c(0, cumsum(chrgap)[1:(length(chrgap)-1)])+chrgap/2]=paste0("chr",as.character(1:22))
  
  png(plot_path,width=800, height=450)
  tmp=pheatmap::pheatmap(plot_matrix,
                         cluster_cols = F, cluster_rows = TRUE,
                         show_rownames = F,
                         show_colnames = T,
                         color=setcolor,
                         breaks = setbreaks,
                         labels_col=col_lab,
                         clustering_distance_rows = "correlation",
                         clustering_method = "ward.D2",
                         gaps_col=cumsum(chrgap),
                         cutree_rows = nclust,
                         annotation_row=celltype[, ncol(celltype),drop=F])
  
  dev.off()
  
  message(paste0("Plot is successfully saved in the path:",plot_path))
  }
  
  return(Obj_filtered)
}
seasoncloud/Alleloscope documentation built on March 17, 2023, 1:14 a.m.