R/coverage.R

Defines functions count_coverage rawfile_ToBigWig smoothBin concatenate_scBed_into_clusters generate_coverage_tracks

Documented in concatenate_scBed_into_clusters count_coverage generate_coverage_tracks rawfile_ToBigWig smoothBin

#' Generate cell cluster pseudo-bulk coverage tracks
#' 
#' @description  Generate cell cluster pseudo-bulk coverage tracks. First, scBED
#' files are concatenated into cell clusters contained in the 'by' 
#' column of your SingleCellExperiment object. To do so, for each sample in the 
#' given list, the barcodes of each cluster are grepped and BED files are 
#' merged into pseudo-bulk of clusters (C1,C2...). Two cells from different can
#' have the same barcode ID as cell affectation is done sample by sample.
#'  Then coverage of pseudo-bulk BED files is calculated by averaging &
#'   smoothing reads on small genomic window (150bp per default). The pseudo 
#'   bulk BED and BigWigs coverage tracks are writtend to the output directory.
#'   This functionality is not available on Windows as it uses the 'cat' and 
#'   'gzip' utilities from Unix OS.
#' 
#' @param scExp_cf A SingleCellExperiment with cluster selected.
#' (see \code{\link{choose_cluster_scExp}}). It is recommended having a minimum
#' of ~100 cells per cluster in order to obtain smooth tracks.
#' @param input Either a named list of character vector of path towards 
#' single-cell BED files or a sparse raw matrix of small bins (<<500bp). If 
#' a named list specifying scBED the names MUST correspond to the 'sample_id' 
#' column in your SingleCellExperiment object. The single-cell BED files names MUST 
#' match the  barcode names in your SingleCellExperiment (column 'barcode'). The
#' scBED files can be gzipped or not. 
#' @param odir The output directory to write the cumulative BED and BigWig
#'  files.
#' @param format  File format, either "raw_mat", "BED" or "BAM"
#' @param ref_genome The genome of reference, used to constrain to canonical 
#' chromosomes. Either 'hg38' or 'mm10'. 'hg38' per default. 
#' @param bin_width The width of the bin to create the coverage track. The 
#' smaller the greater the resolution & runtime. Default to 150.
#' @param n_smoothBin Number of bins left & right to average ('smooth') the 
#' signal on. Default to 5.
#' @param read_size The estimated size of reads. Default to 101.
#' @param quantile_for_peak_calling The quantile to define the threshold above 
#' which signal is considered as a peak.
#' @param by A character specifying a categoricla column of scExp_cf metadata
#'  by which to group cells and generate coverage tracks and peaks.
#' @param progress A Progress object for Shiny. Default to NULL.
#'
#' @return Generate coverage tracks (.bigwig) for each group in the 
#' SingleCellExperiment "by" column.
#' @export
#'
#' @examples \dontrun{
#' data(scExp)
#' input_files_coverage = list(
#'   "scChIP_Jurkat_K4me3" = paste0("/path/to/",scExp$barcode[1:51],".bed"),
#'   "scChIP_Ramos_K4me3" = paste0("/path/to/",scExp$barcode[52:106],".bed")
#' )
#' generate_coverage_tracks(scExp, input_files_coverage, "/path/to/output",
#' ref_genome = "hg38")
#' }
generate_coverage_tracks <- function(scExp_cf, input, odir, 
                                     format = "scBED",
                                     ref_genome = c("hg38","mm10", "ce11")[1],
                                     bin_width = 150,  n_smoothBin = 5,
                                     read_size = 101,
                                     quantile_for_peak_calling = 0.85,
                                     by = "cell_cluster",
                                     progress = NULL){
  stopifnot(is(scExp_cf,"SingleCellExperiment"), is.character(format),
            dir.exists(odir), ref_genome %in% c("hg38","mm10", "ce11"), 
            is.numeric(bin_width), is.numeric(n_smoothBin),
            is.numeric(read_size))
  
  if(format == "scBED") stopifnot(length(intersect(scExp_cf$sample_id, names(input))) > 0)
  
  affectation = SingleCellExperiment::colData(scExp_cf)
  affectation[,by] = as.factor(affectation[,by])
  
  nclust = length(unique(affectation[,by]))
  if (!is.null(progress)) progress$set(message=paste0('Generating coverage tracks for k= ', nclust,' group ...'), value = 0.1)
  
  
  if (!is.null(progress)) progress$set(detail = "Generating pseudo-bulk...", value = 0.2)
  if(format == "scBED") {
    concatenate_scBed_into_clusters(affectation, input, odir)
  } 
  
  if (!is.null(progress)) progress$set(detail = "Creating coverage files for each cluster...", value = 0.40)
  
  if(format == "raw_mat"){
    original_bins = rownames(input)
    original_bins =  strsplit(original_bins, "_", fixed = TRUE)
    original_bins_chr = as.character(lapply(original_bins, function(x) x[1]))
    original_bins_start = as.numeric(lapply(original_bins, function(x) x[2]))
    original_bins_end = as.numeric(lapply(original_bins, function(x) x[3]))
    gc()
    original_bins = GenomicRanges::GRanges(
      seqnames = original_bins_chr, ranges = IRanges::IRanges(original_bins_start, original_bins_end))
    input = input[,match(colnames(scExp_cf), colnames(input))]
  }
  suffix = ".bed"
  n = 0
  for(class in unique(affectation[,by])) {
    n = n + 1
    if (!is.null(progress)) progress$set(detail = paste0("Coverage ",class,"..."),
                                         value = 0.4 + n * (0.6/length(unique(affectation[, by]))) )
    out_bw = file.path(odir, paste0(class,".bw"))
    norm_factor = sum(affectation$total_counts[affectation[, by] == class])
    if(format == "scBED") {
      input = file.path(odir, paste0(class,suffix))
      rawfile_ToBigWig(input, out_bw, "BED", bin_width = bin_width,
                       norm_factor = norm_factor,
                       n_smoothBin = n_smoothBin,  ref = ref_genome,
                       read_size = read_size,
                       quantile_for_peak_calling = quantile_for_peak_calling)
    } else{
      input. = input[,which(affectation[, by] %in% class)]
      rawfile_ToBigWig(input = input., BigWig_filename = out_bw,
                       format = "raw_mat", bin_width = bin_width,
                       norm_factor = norm_factor,
                       n_smoothBin = n_smoothBin,  ref = ref_genome,
                       read_size = read_size,
                       original_bins = original_bins,
                       quantile_for_peak_calling = quantile_for_peak_calling)
    }
  }
  if (!is.null(progress)) progress$set(detail = "Done !", value = 0.95)
}

#' Concatenate single-cell BED into clusters
#'
#' @param affectation Annotation data.frame containing cluster information
#' @param files_list Named list of scBED file paths to concatenate. List Names 
#' must match affectation$sample_id and basenames must match
#'  affectation$barcode.
#' @param odir Output directory to write concatenate pseudo-bulk BEDs.
#'
#' @return Merge single-cell BED files into cluster BED files. Ungzip file if 
#' BED is gzipped.
#'
concatenate_scBed_into_clusters <- function(affectation, files_list, odir){
  unlink(file.path(odir, "*.bed"))
  gzipped = grepl(".gz", files_list[[1]][1])
  suffix = ""
  if(gzipped) suffix = ".gz"
  for(sample in names(files_list)){
    message("ChromSCape:::concatenate_scBed_into_clusters - concatenating ",
            "files for ", sample, " files...")
    file_pool = files_list[[sample]]
    for (class in levels(factor(affectation[,by])))
    {
      message("ChromSCape:::concatenate_scBed_into_clusters - concatenating ", class, "...")
      class_barcodes <- affectation$barcode[which(affectation[,by] == as.character(class) &
                                                    affectation$sample_id == sample)]
      
      files_class = c()
      if(length(class_barcodes)>500){
        
        while(length(class_barcodes)>500){
          class_barcodes_chunk = class_barcodes[1:500]
          files_class = c(files_class,
                          file_pool[grep(paste(class_barcodes_chunk, collapse="|"),
                                         file_pool,useBytes = TRUE, perl = TRUE)])
          class_barcodes = class_barcodes[-c(1:500)]
        }
        files_class = c(files_class,
                        file_pool[grep(paste(class_barcodes, collapse="|"),
                                       file_pool,useBytes = TRUE, perl = TRUE)])
        
      } else {
        files_class = file_pool[grep(paste(class_barcodes, collapse="|"),
                                     file_pool,useBytes = TRUE, perl = TRUE)]
      }
      if(length(class_barcodes)>0){
        for(file in files_class){
          command = paste0("cat '", file, "' >> '",
                           file.path(odir, paste0(class, ".bed",suffix,"'")))
          system(command)
        }
      }
    }
  }
  for(class in levels(factor(affectation[,by]))){
    if(gzipped) {
      command = paste0("gzip -cd '", file.path(
        odir,paste0(class,".bed", suffix)),
        "' > '", file.path(odir,paste0(class, ".bed'")))
      system(command)
    }
  }
}


#' Smooth a vector of values with nb_bins left and righ values
#'
#' @param bin_score A numeric vector of values to be smoothed
#' @param nb_bins Number of values to take left and right
#'  
#'  @importFrom BiocParallel bpvec
#' @return A smooth vector of the same size 
#' 
smoothBin <- function(bin_score, nb_bins = 10){
  bin_original = bin_score
  fun <-function(v){
    v_or = v
    start = nb_bins + 1
    end = length(v) - nb_bins -1
    for(i in start:end){
      v[i] = mean(v_or[(i-nb_bins):(i+nb_bins)])
    }
    return(v)
  }
  bin_score = unlist(BiocParallel::bpvec(bin_score, fun))
  return(bin_score)
}

#' rawfile_ToBigWig : reads in BAM file and write out BigWig coverage file, 
#' normalized and smoothed
#'
#' @param input Either a named list of character vector of path towards 
#' single-cell BED files or a sparse raw matrix of small bins (<<500bp). If 
#' a named list specifying scBEDn the names MUST correspond to the 'sample_id' 
#' column in your SingleCellExperiment object. The single-cell BED files names MUST 
#' match the  barcode names in your SingleCellExperiment (column 'barcode'). The
#' scBED files can be gzipped or not.
#' @param BigWig_filename Path to write the output BigWig file
#' @param format File format, either "BAM" or "BED"
#' @param bin_width Bin size for coverage
#' @param norm_factor Then number of cells or total number of reads in the given 
#' sample, for normalization.
#' @param n_smoothBin Number of bins for smoothing values
#' @param ref Reference genome.
#' @param read_size Length of the reads.
#' @param original_bins Original bins GenomicRanges in case the format is raw
#' matrix.
#' @param quantile_for_peak_calling The quantile to define the threshold above 
#' which signal is considered as a peak.
#'
#' @return Writes in the output directory a bigwig file displaying the 
#' cumulative coverage of cells and a basic set of peaks called by taking all 
#' peaks above a given threshold
#' 
#' @importFrom rtracklayer export.bw export.bed
#' @importFrom GenomicRanges tileGenome width seqnames
#' @return Writes a BigWig file as output
#' 
rawfile_ToBigWig <- function(input, BigWig_filename, format = "BAM",
                             bin_width = 150, norm_factor,
                             n_smoothBin = 5, ref = "hg38",
                             read_size = 101, original_bins = NULL,
                             quantile_for_peak_calling = 0.85){
  bins = NULL
  eval(parse(text = paste0("data(",ref, ".chromosomes)")))
  canonical_chr <-  eval(parse(text = paste0(ref, ".chromosomes")))
  canonical_chr$start = 1
  canonical_chr <- GenomicRanges::GRanges(seqnames = canonical_chr$chr,
                                          ranges = IRanges::IRanges(
                                            start = canonical_chr$start,
                                            end = canonical_chr$end
                                          ))
  GenomicRanges::seqinfo(canonical_chr)@seqlengths  = end(canonical_chr)
  if(format != "raw_mat") {
    message("ChromSCape:::rawfile_ToBigWig - generating bigwig for  ",
            basename(BigWig_filename), " file...")
    
    
    bins <- unlist(GenomicRanges::tileGenome(
      setNames(
        GenomicRanges::width(canonical_chr),
        GenomicRanges::seqnames(canonical_chr)
      ),
      tilewidth = bin_width
    ))
    gc()
  } else if(format == "raw_mat")  {
    stopifnot(!is.null(original_bins), is(original_bins, "GRanges"))
    message("ChromSCape:::rawfile_ToBigWig - generating bigwig from the ",
            "raw matrix...")
  } else{
    message("ChromSCape:::rawfile_ToBigWig - format must be either 'raw_mat'",
            ", 'BAM' or 'BED'. Returning.")
    return()
  }
  
  bins <- count_coverage(input, format, bins, canonical_chr, norm_factor, 
                         n_smoothBin, ref, read_size, original_bins)
  ## export as bigWig
  GenomicRanges::seqinfo(bins)@seqlengths = 
      GenomicRanges::seqinfo(canonical_chr)@seqlengths[
          match(GenomicRanges::seqinfo(bins)@seqnames,
                GenomicRanges::seqinfo(canonical_chr)@seqnames)]
  
  print(BigWig_filename)
  rtracklayer::export.bw(bins, BigWig_filename)
  
  peaks = bins[bins$score > quantile(bins$score, quantile_for_peak_calling)]
  peaks = GenomicRanges::reduce(peaks, min.gapwidth = 1000, 
                                ignore.strand = TRUE)
  rtracklayer::export.bed(peaks, gsub(".bw",".bed.gz", BigWig_filename))
}

#' Create a smoothed and normalized coverage track from a BAM file and
#' given a bin GenomicRanges object (same as deepTools bamCoverage)
#'
#' Normalization is CPM, smoothing is done by averaging on n_smoothBin regions
#' left and right of any given region.
#' 
#' @param input Either a named list of character vector of path towards 
#' single-cell BED files or a sparse raw matrix of small bins (<<500bp). If 
#' a named list specifying scBEDn the names MUST correspond to the 'sample_id' 
#' column in your SingleCellExperiment object. The single-cell BED files names MUST 
#' match the  barcode names in your SingleCellExperiment (column 'barcode'). The
#' scBED files can be gzipped or not.
#' @param format File format, either "BAM" or "BED"
#' @param bins A GenomicRanges object of binned genome
#' @param canonical_chr GenomicRanges of the chromosomes to read the BAM file.
#' @param norm_factor Then number of cells or total number of reads in the given 
#' sample, for normalization.
#' @param n_smoothBin Number of bins left and right to smooth the signal.
#' @param ref Genomic reference
#' @param read_size Length of the reads
#' @param original_bins Original bins GenomicRanges in case the format is raw
#' 
#' matrix.
#' @importFrom Rsamtools ScanBamParam scanBam scanBamWhat
#' @importFrom GenomicRanges tileGenome width seqnames findOverlaps
#' GRanges reduce split
#' @importFrom IRanges IRanges
#' 
#' 
#'  
#' @return A binned GenomicRanges that can be readily exported into bigwig file.
#'
count_coverage <-function(input, format = "BAM", bins, canonical_chr, norm_factor,
                          n_smoothBin = 5, ref = "hg38", read_size = 101,
                          original_bins = NULL)
{
  if(format == "BAM"){
    param = Rsamtools::ScanBamParam(which = canonical_chr,
                                    what = Rsamtools::scanBamWhat()[c(3,5)])
    gr <- Rsamtools::scanBam(file = input, param = param)
    chr = unlist(lapply(1:length(canonical_chr), function(i){gr[[i]]$rname}))
    start = unlist(lapply(1:length(canonical_chr), function(i){gr[[i]]$pos}))
    end = start + 101
    gr = GenomicRanges::GRanges(seqnames = chr,
                                ranges = IRanges::IRanges("start" = start,
                                                          "end" = end))
  } else if(format == "BED") {
    gr <- rtracklayer::import(input)
  } else if(format == "raw_mat"){
    bins <- original_bins
  } else {
    message("ChromSCape::count_coverage - format must be either 'BAM', 'BED',",
            " or 'raw_mat'. Returning.")
    return()
  }
  
  if(format != "raw_mat"){
    hits <- GenomicRanges::findOverlaps(
      bins, gr, minoverlap = 1, ignore.strand = TRUE)
    hits = as.matrix(hits)
    hits_agg = table(hits[,1])
    bins$score = 0
    bins$score[as.numeric(names(hits_agg))] = hits_agg
    rm(hits_agg)
    gc()
  } else{
    bins$score = rowSums(input)
  }
  
  bins$score = 10^6* bins$score / length(bins)
  bins$score = smoothBin(bins$score, n_smoothBin)
  bins = bins[-which(bins$score==0)]
  bins$score = 10^6 * bins$score / norm_factor
  
  gc()
  return(bins)
}
vallotlab/ChromSCape documentation built on Oct. 15, 2023, 1:47 p.m.