#' @title Counts and normalizes (and transforms) ChIP-seq read coverage
#' @description Counts ChIP-seq read coverage for each BAM file
#' using the \code{coverage} function from \code{bedtools}.
#' Then normalizes the read counts
#' by scaling to the reference ChIP-seq library size,
#' and transforms the ChIP-seq read counts.
#' @param sites A data frame containing the candidate sites.
#' @param chip_bam_files ChIP-seq BAM files (may include multiple replicates).
#' @param chip_idxstats_files ChIP-seq \code{idxstats} files
#' (may include multiple replicates).
#' (Default: uses the corresponding \code{.idxstats.txt} files in
#' the same directory of the BAM files).
#' @param chrom_size_file Chromosome size file.
#' @param ref_size Scales to ChIP-seq reference library size
#' (Default: 2e7)
#' @param transform Transformation for ChIP-seq read counts.
#' Options are \sQuote{none} (no transformation), \sQuote{asinh}, \sQuote{log2}, and \sQuote{'sqrt'}.
#' @param bedtools_path Path to \code{bedtools} executable.
#' (default: 'bedtools')
#' @return A data frame of candidate sites and normalized (and transformed)
#' ChIP-seq read counts around each candidate site.
#' @importFrom data.table fread fwrite
#' @export
#' @examples
#' \dontrun{
#' sites_chip <- count_normalize_chip(sites,
#' chip_bam_files=c('ChIPseq.rep1.bam', 'ChIPseq.rep2.bam'),
#' chrom_size_file='hg38.chrom.sizes',
#' ref_size=2e7)
#' }
count_normalize_chip = function(sites,
chip_bam_files,
chip_idxstats_files,
chrom_size_file,
ref_size=2e7,
transform=c('none', 'asinh', 'log2', 'sqrt'),
bedtools_path='bedtools'){
transform <- match.arg(transform)
if ( Sys.which(bedtools_path) == '' ) {
stop( 'bedtools could not be executed. Please install bedtools and set bedtools_path.' )
}
if ( length(chip_bam_files) == 0 || chip_bam_files == '' || is.na(chip_bam_files)) {
stop('No ChIP-seq files! \n')
}
if(missing(chip_idxstats_files)){
chip_idxstats_files <- paste0(chip_bam_files, '.idxstats.txt')
}
# Counts total ChIP-seq reads (pooling replicates together)
cat('Counting ChIP-seq read coverage ... \n')
chip_bam_files <- paste(chip_bam_files, collapse = ' ')
chip_count_file <- tempfile(pattern = 'totalcounts')
sites_file <- tempfile('tmp_sites')
fwrite(sites, sites_file, sep = '\t', col.names = FALSE, scipen = 999)
cmd <- paste(bedtools_path, 'coverage -a', sites_file, '-b', chip_bam_files,
'-counts -sorted -g', chrom_size_file, '>', chip_count_file)
if(.Platform$OS.type == "windows") shell(cmd) else system(cmd)
sites_chip <- as.data.frame(data.table::fread(chip_count_file))
colnames(sites_chip) <- c(colnames(sites), 'chip')
# Normalizes (scales) ChIP-seq read counts to the reference library size
cat('Normalize (scale) ChIP-seq library to', ref_size / 1e6, 'million reads... \n')
total_readsMapped <- sum(sapply(chip_idxstats_files, get_total_reads, select_chr = TRUE))
scaling_factor <- ref_size / total_readsMapped
sites_chip$chip <- sites_chip$chip * scaling_factor
# Transforms ChIP-seq read counts
if (transform == 'asinh') {
sites_chip$chip <- asinh(sites_chip$chip)
} else if (transform == 'log2') {
sites_chip$chip <- log2(sites_chip$chip + 1)
} else if (transform == 'sqrt') {
sites_chip$chip <- sqrt(sites_chip$chip)
}
unlink(chip_count_file)
return(sites_chip)
}
#' @title Normalizes (and transforms) ChIP-seq read coverage
#' @description Normalizes the ChIP-seq read counts by scaling
#' to the reference ChIP-seq library size,
#' and transforms the ChIP-seq read counts.
#' @param chip_counts A vector of ChIP-seq counts.
#' @param idxstats_file The \code{idxstats} file generated by \code{samtools}.
#' @param ref_size ChIP-Seq reference library size (Default: 20 million).
#' @param transform Type of transformation for the ChIP read counts.
#' Options are \sQuote{none} (no transformation), \sQuote{asinh}, \sQuote{log2}, and \sQuote{'sqrt'}.
#' @return A vector of transformed ChIP-seq counts.
#' @export
#'
normalize_chip <- function(chip_counts,
idxstats_file,
ref_size=2e7,
transform=c('none', 'asinh', 'log2', 'sqrt')) {
transform <- match.arg(transform)
# Count total mapped ChIP-seq reads (chr1:22)
total_readsMapped <- get_total_reads(idxstats_file, select_chr = TRUE)
# Normalize (scale) ChIP-seq read counts
cat('Normalize ChIP-seq reads to', ref_size / 1e6, 'million \n')
scaling_factor <- ref_size / total_readsMapped
chip_counts <- chip_counts * scaling_factor
# Transform the ChIP-seq counts
if (transform == 'asinh') {
chip_counts <- asinh(chip_counts)
} else if (transform == 'log2') {
chip_counts <- log2(chip_counts + 1)
} else if (transform == 'sqrt') {
chip_counts <- sqrt(chip_counts)
}
return(chip_counts)
}
#' @title Adds ChIP-seq peak labels to the candidate sites
#' @description Adds ChIP-seq peak labels to the candidate sites.
#' It uses ChIP-seq peaks from \code{chip_peak_file} if available.
#' If \code{chip_peak_file} is not available, it will download the
#' ChIP-seq peaks from ENCODE database (using
#' the sample ID provided in \code{chip_peak_sampleID},
#' and save to the directory \code{chip_peak_dir}).
#' @param sites A data frame containing the candidate sites
#' @param chip_peak_file ChIP-seq peak file (.bed.gz format)
#' @param chip_peak_sampleID ENCODE sample ID of ChIP-seq peaks (.bed.gz format)
#' @param chip_peak_dir Directory to save the downloaded ChIP-seq peaks
#' @return A data frame of candidate sites and binary ChIP-seq peak labels
#' (bound: 1, unbound: 0).
#' @importFrom GenomicRanges makeGRangesFromDataFrame countOverlaps
#' @importFrom data.table fread
#' @export
#' @examples
#' \dontrun{
#' sites_chip_labels <- add_chip_peak_labels_to_sites(sites, chip_peak_file='ChIPseq.peaks.bed.gz')
#' }
add_chip_peak_labels_to_sites <- function(sites,
chip_peak_file=NULL,
chip_peak_sampleID=NULL,
chip_peak_dir='./'){
if(is.null(chip_peak_file) || !file.exists(chip_peak_file)){
if( is.null(chip_peak_sampleID) || is.na(chip_peak_sampleID) || chip_peak_sampleID == '' ){
cat('No ChIP peak data. \n')
return(sites)
}else{
chip_peak_file <- file.path(chip_peak_dir, paste0(chip_peak_sampleID,'.bed.gz'))
if(!file.exists(chip_peak_file)){
cat('Download ChIP peak file from ENCODE ... \n')
chip_peak_url <- sprintf('https://www.encodeproject.org/files/%s/@@download/%s.bed.gz',
chip_peak_sampleID, chip_peak_sampleID)
system(paste('wget', chip_peak_url, '-P', chip_peak_dir))
}
}
}
# Get ChIP peak labels for candidate sites
cat('Get ChIP peak labels for candidate sites... \n')
chip_peaks <- as.data.frame(data.table::fread(chip_peak_file))
colnames(chip_peaks) <- c('chr', 'start', 'end', 'name', 'score', 'strand',
'signalValue', 'pValue', 'qValue', 'peak')
chip_peaks.gr <- makeGRangesFromDataFrame(chip_peaks, keep.extra.columns = TRUE)
sites.gr <- makeGRangesFromDataFrame(sites, keep.extra.columns = TRUE)
sites_chip <- cbind(sites, chip_label = 0)
in.peaks <- which(countOverlaps(sites.gr, chip_peaks.gr) > 0)
sites_chip$chip_label[in.peaks] <- 1
return(sites_chip)
}
#' @title Adds ChIP-seq signals to the candidate sites
#' @description Adds ChIP-seq signals to the candidate sites.
#' It uses ChIP-seq signals (in BigWig format) from \code{chip_signal_file}.
#' If \code{chip_peak_file} is not available, it will download the
#' ChIP-seq signals from ENCODE database (using
#' the sample ID provided in \code{chip_signal_sampleID},
#' and save to the directory \code{chip_signal_dir}).
#' @param sites A data frame containing the candidate sites
#' @param chip_signal_file ChIP-seq signal file (BigWig format).
#' @param chip_signal_sampleID ENCODE sample ID of ChIP-seq signals (BigWig format).
#' @param chip_signal_dir Directory to save the downloaded ChIP-seq signals.
#' @param bigWigAverageOverBed_path Path to \code{bigWigAverageOverBed} executable.
#' @return A data frame of candidate sites and ChIP-seq signals.
#' @importFrom data.table fread fwrite
#'
#' @export
#' @examples
#' \dontrun{
#' sites_chip_signals - add_chip_signals_to_sites(sites, chip_signal_file='ChIPseq.signals.bigWig')
#' }
add_chip_signals_to_sites <- function(sites,
chip_signal_file=NULL,
chip_signal_sampleID=NULL,
chip_signal_dir='./',
bigWigAverageOverBed_path='bigWigAverageOverBed'){
if ( Sys.which(bigWigAverageOverBed_path) == '' ) {
stop( 'bigWigAverageOverBed could not be executed. Please install bigWigAverageOverBed and set bigWigAverageOverBed_path.' )
}
if(is.null(chip_signal_file) || !file.exists(chip_signal_file)){
if( is.null(chip_signal_sampleID) || is.na(chip_signal_sampleID) || chip_signal_sampleID == '' ){
cat('No ChIP signal data. \n')
return(sites)
}else{
chip_signal_file <- file.path(chip_signal_dir, paste0(chip_signal_sampleID, '.bigWig'))
if(!file.exists(chip_signal_file)){
cat('Download ChIP signal file from ENCODE ... \n')
chip_signal_url <- sprintf('https://www.encodeproject.org/files/%s/@@download/%s.bigWig',
chip_signal_sampleID, chip_signal_sampleID)
system(paste('wget', chip_signal_url, '-P', chip_signal_dir))
}
}
}
# Adds ChIP-seq signal information
sites_file <- tempfile('tmp_sites')
fwrite(sites[,1:4], sites_file, sep = '\t', col.names = FALSE, scipen = 999)
# Takes the average signal values in each site
sites_signals_file <- tempfile('tmp_signals')
cmd <- paste(bigWigAverageOverBed_path, chip_signal_file, sites_file, sites_signals_file)
if(.Platform$OS.type == "windows") shell(cmd) else system(cmd)
sites_signals <- as.data.frame(fread(sites_signals_file))
colnames(sites_signals) <- c('name', 'size', 'covered', 'sum', 'mean0', 'mean')
m <- match(sites$name, sites_signals$name)
sites_chip <- cbind(sites, chip_signal=sites_signals$mean[m])
unlink(c(sites_file, sites_signals_file))
return(sites_chip)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.