R/process_chip_data.R

Defines functions add_chip_signals_to_sites add_chip_peak_labels_to_sites normalize_chip count_normalize_chip

Documented in add_chip_peak_labels_to_sites add_chip_signals_to_sites count_normalize_chip normalize_chip

#' @title Counts and normalizes (and transforms) ChIP-seq read coverage
#' @description Counts ChIP-seq read coverage for each BAM file
#' using the \code{coverage} function from \code{bedtools}.
#' Then normalizes the read counts
#' by scaling to the reference ChIP-seq library size,
#' and transforms the ChIP-seq read counts.
#' @param sites A data frame containing the candidate sites.
#' @param chip_bam_files ChIP-seq BAM files (may include multiple replicates).
#' @param chip_idxstats_files ChIP-seq \code{idxstats} files
#' (may include multiple replicates).
#' (Default: uses the corresponding \code{.idxstats.txt} files in
#' the same directory of the BAM files).
#' @param chrom_size_file Chromosome size file.
#' @param ref_size Scales to ChIP-seq reference library size
#' (Default: 2e7)
#' @param transform Transformation for ChIP-seq read counts.
#' Options are \sQuote{none} (no transformation), \sQuote{asinh}, \sQuote{log2}, and \sQuote{'sqrt'}.
#' @param bedtools_path Path to \code{bedtools} executable.
#' (default: 'bedtools')
#' @return A data frame of candidate sites and normalized (and transformed)
#' ChIP-seq read counts around each candidate site.
#' @importFrom data.table fread fwrite
#' @export
#' @examples
#' \dontrun{
#' sites_chip <- count_normalize_chip(sites,
#'                                    chip_bam_files=c('ChIPseq.rep1.bam', 'ChIPseq.rep2.bam'),
#'                                    chrom_size_file='hg38.chrom.sizes',
#'                                    ref_size=2e7)
#' }
count_normalize_chip = function(sites,
                                chip_bam_files,
                                chip_idxstats_files,
                                chrom_size_file,
                                ref_size=2e7,
                                transform=c('none', 'asinh', 'log2', 'sqrt'),
                                bedtools_path='bedtools'){

  transform <- match.arg(transform)

  if ( Sys.which(bedtools_path) == '' ) {
    stop( 'bedtools could not be executed. Please install bedtools and set bedtools_path.' )
  }

  if ( length(chip_bam_files) == 0 || chip_bam_files == '' || is.na(chip_bam_files)) {
    stop('No ChIP-seq files! \n')
  }

  if(missing(chip_idxstats_files)){
    chip_idxstats_files <- paste0(chip_bam_files, '.idxstats.txt')
  }

  # Counts total ChIP-seq reads (pooling replicates together)
  cat('Counting ChIP-seq read coverage ... \n')
  chip_bam_files <- paste(chip_bam_files, collapse = ' ')
  chip_count_file <- tempfile(pattern = 'totalcounts')
  sites_file <- tempfile('tmp_sites')
  fwrite(sites, sites_file, sep = '\t', col.names = FALSE, scipen = 999)

  cmd <- paste(bedtools_path, 'coverage -a', sites_file, '-b', chip_bam_files,
               '-counts -sorted -g', chrom_size_file, '>', chip_count_file)
  if(.Platform$OS.type == "windows") shell(cmd) else system(cmd)

  sites_chip <- as.data.frame(data.table::fread(chip_count_file))
  colnames(sites_chip) <- c(colnames(sites), 'chip')

  # Normalizes (scales) ChIP-seq read counts to the reference library size
  cat('Normalize (scale) ChIP-seq library to', ref_size / 1e6, 'million reads... \n')
  total_readsMapped <- sum(sapply(chip_idxstats_files, get_total_reads, select_chr = TRUE))
  scaling_factor <- ref_size / total_readsMapped
  sites_chip$chip <- sites_chip$chip * scaling_factor

  # Transforms ChIP-seq read counts
  if (transform == 'asinh') {
    sites_chip$chip <- asinh(sites_chip$chip)
  } else if (transform == 'log2') {
    sites_chip$chip <- log2(sites_chip$chip + 1)
  } else if (transform == 'sqrt') {
    sites_chip$chip <- sqrt(sites_chip$chip)
  }

  unlink(chip_count_file)

  return(sites_chip)

}


#' @title Normalizes (and transforms) ChIP-seq read coverage
#' @description Normalizes the ChIP-seq read counts by scaling
#' to the reference ChIP-seq library size,
#' and transforms the ChIP-seq read counts.
#' @param chip_counts A vector of ChIP-seq counts.
#' @param idxstats_file The \code{idxstats} file generated by \code{samtools}.
#' @param ref_size ChIP-Seq reference library size (Default: 20 million).
#' @param transform Type of transformation for the ChIP read counts.
#' Options are \sQuote{none} (no transformation), \sQuote{asinh}, \sQuote{log2}, and \sQuote{'sqrt'}.
#' @return A vector of transformed ChIP-seq counts.
#' @export
#'
normalize_chip <- function(chip_counts,
                           idxstats_file,
                           ref_size=2e7,
                           transform=c('none', 'asinh', 'log2', 'sqrt')) {

  transform <- match.arg(transform)

  # Count total mapped ChIP-seq reads (chr1:22)
  total_readsMapped <- get_total_reads(idxstats_file, select_chr = TRUE)

  # Normalize (scale) ChIP-seq read counts
  cat('Normalize ChIP-seq reads to', ref_size / 1e6, 'million \n')
  scaling_factor <- ref_size / total_readsMapped
  chip_counts <- chip_counts * scaling_factor

  # Transform the ChIP-seq counts
  if (transform == 'asinh') {
    chip_counts <- asinh(chip_counts)
  } else if (transform == 'log2') {
    chip_counts <- log2(chip_counts + 1)
  } else if (transform == 'sqrt') {
    chip_counts <- sqrt(chip_counts)
  }

  return(chip_counts)

}

#' @title Adds ChIP-seq peak labels to the candidate sites
#' @description Adds ChIP-seq peak labels to the candidate sites.
#' It uses ChIP-seq peaks from \code{chip_peak_file} if available.
#' If \code{chip_peak_file} is not available, it will download the
#' ChIP-seq peaks from ENCODE database (using
#' the sample ID provided in \code{chip_peak_sampleID},
#' and save to the directory \code{chip_peak_dir}).
#' @param sites A data frame containing the candidate sites
#' @param chip_peak_file ChIP-seq peak file (.bed.gz format)
#' @param chip_peak_sampleID ENCODE sample ID of ChIP-seq peaks (.bed.gz format)
#' @param chip_peak_dir Directory to save the downloaded ChIP-seq peaks
#' @return A data frame of candidate sites and binary ChIP-seq peak labels
#' (bound: 1, unbound: 0).
#' @importFrom GenomicRanges makeGRangesFromDataFrame countOverlaps
#' @importFrom data.table fread
#' @export
#' @examples
#' \dontrun{
#' sites_chip_labels <- add_chip_peak_labels_to_sites(sites, chip_peak_file='ChIPseq.peaks.bed.gz')
#' }
add_chip_peak_labels_to_sites <- function(sites,
                                          chip_peak_file=NULL,
                                          chip_peak_sampleID=NULL,
                                          chip_peak_dir='./'){

  if(is.null(chip_peak_file) || !file.exists(chip_peak_file)){
    if( is.null(chip_peak_sampleID) || is.na(chip_peak_sampleID) || chip_peak_sampleID == '' ){
      cat('No ChIP peak data. \n')
      return(sites)
    }else{
      chip_peak_file <- file.path(chip_peak_dir, paste0(chip_peak_sampleID,'.bed.gz'))
      if(!file.exists(chip_peak_file)){
        cat('Download ChIP peak file from ENCODE ... \n')
        chip_peak_url <- sprintf('https://www.encodeproject.org/files/%s/@@download/%s.bed.gz',
                                 chip_peak_sampleID, chip_peak_sampleID)
        system(paste('wget', chip_peak_url, '-P', chip_peak_dir))
      }
    }
  }

  # Get ChIP peak labels for candidate sites
  cat('Get ChIP peak labels for candidate sites... \n')
  chip_peaks <- as.data.frame(data.table::fread(chip_peak_file))
  colnames(chip_peaks) <- c('chr', 'start', 'end', 'name', 'score', 'strand',
                            'signalValue', 'pValue', 'qValue', 'peak')
  chip_peaks.gr <- makeGRangesFromDataFrame(chip_peaks, keep.extra.columns = TRUE)
  sites.gr <- makeGRangesFromDataFrame(sites, keep.extra.columns = TRUE)

  sites_chip <- cbind(sites, chip_label = 0)
  in.peaks <- which(countOverlaps(sites.gr, chip_peaks.gr) > 0)
  sites_chip$chip_label[in.peaks] <- 1

  return(sites_chip)

}

#' @title Adds ChIP-seq signals to the candidate sites
#' @description Adds ChIP-seq signals to the candidate sites.
#' It uses ChIP-seq signals (in BigWig format) from \code{chip_signal_file}.
#' If \code{chip_peak_file} is not available, it will download the
#' ChIP-seq signals from ENCODE database (using
#' the sample ID provided in \code{chip_signal_sampleID},
#' and save to the directory \code{chip_signal_dir}).
#' @param sites A data frame containing the candidate sites
#' @param chip_signal_file ChIP-seq signal file (BigWig format).
#' @param chip_signal_sampleID ENCODE sample ID of ChIP-seq signals (BigWig format).
#' @param chip_signal_dir Directory to save the downloaded ChIP-seq signals.
#' @param bigWigAverageOverBed_path Path to \code{bigWigAverageOverBed} executable.
#' @return A data frame of candidate sites and ChIP-seq signals.
#' @importFrom data.table fread fwrite
#'
#' @export
#' @examples
#' \dontrun{
#' sites_chip_signals - add_chip_signals_to_sites(sites, chip_signal_file='ChIPseq.signals.bigWig')
#' }
add_chip_signals_to_sites <- function(sites,
                                      chip_signal_file=NULL,
                                      chip_signal_sampleID=NULL,
                                      chip_signal_dir='./',
                                      bigWigAverageOverBed_path='bigWigAverageOverBed'){

  if ( Sys.which(bigWigAverageOverBed_path) == '' ) {
    stop( 'bigWigAverageOverBed could not be executed. Please install bigWigAverageOverBed and set bigWigAverageOverBed_path.' )
  }

  if(is.null(chip_signal_file) || !file.exists(chip_signal_file)){
    if( is.null(chip_signal_sampleID) || is.na(chip_signal_sampleID) || chip_signal_sampleID == '' ){
      cat('No ChIP signal data. \n')
      return(sites)
    }else{
      chip_signal_file <- file.path(chip_signal_dir, paste0(chip_signal_sampleID, '.bigWig'))

      if(!file.exists(chip_signal_file)){
        cat('Download ChIP signal file from ENCODE ... \n')
        chip_signal_url <- sprintf('https://www.encodeproject.org/files/%s/@@download/%s.bigWig',
                                   chip_signal_sampleID, chip_signal_sampleID)
        system(paste('wget', chip_signal_url, '-P', chip_signal_dir))
      }
    }
  }

  # Adds ChIP-seq signal information
  sites_file <- tempfile('tmp_sites')
  fwrite(sites[,1:4], sites_file, sep = '\t', col.names = FALSE, scipen = 999)

  # Takes the average signal values in each site
  sites_signals_file <- tempfile('tmp_signals')
  cmd <- paste(bigWigAverageOverBed_path, chip_signal_file, sites_file, sites_signals_file)
  if(.Platform$OS.type == "windows") shell(cmd) else system(cmd)

  sites_signals <- as.data.frame(fread(sites_signals_file))
  colnames(sites_signals) <- c('name', 'size', 'covered', 'sum', 'mean0', 'mean')

  m <- match(sites$name, sites_signals$name)
  sites_chip <- cbind(sites, chip_signal=sites_signals$mean[m])

  unlink(c(sites_file, sites_signals_file))

  return(sites_chip)

}
HarteminkLab/TOP documentation built on July 27, 2023, 6:14 p.m.