######################################################
#' generate genome-wide bins for counting purposes
#'
#' Given a chrom.size file, this function generates your own sliding windows (bins).
#' @param chromFile a chrom.size file. Given "hg19", "mm10", "mm9" or "hg38", the chrom.size file is loaded from the package folder.
#' @param binSize size of bins (bp)
#' @param overlap overlap between two consecutive bins (bp)
#' @param withChr if TRUE, chromosome names in the bin file carry the "chr" prefix; if FALSE, they do not
#' @param minChrSize exclude contigs smaller than this size (bp). When NULL (the default), minChrSize equals binSize x 2 (bp).
#' @return A data.frame of generated bins
#' @export
#' @examples
#' ## 1. generate a mm10 binFile without chr, using a binSize of 1000 bp
#' ## and an overlap of 500 bp between two consecutive bins
#'
#' ## "mm10" will be parsed as system.file("extdata", "mm10.chrom.sizes",
#' ## package = "ChIPseqSpikeInFree")
#' # binDF <- GenerateBins(chromFile="mm10",binSize=1000, overlap=500,
#' # withChr=FALSE)
#'
#' ## 2. generate a hg19 binFile with chr and use a binSize of 2000 bp
#'
#' ## "hg19" will be parsed as system.file("extdata", "hg19.chrom.sizes",
#' ## package = "ChIPseqSpikeInFree")
#' # binDF<- GenerateBins(chromFile="hg19",binSize=2000, overlap=0,
#' # withChr=TRUE,minChrSize=2000)
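#'
#' ## 3. a minimal sketch using your own chrom.size file; assumes
#' ## "your/path/custom.chrom.sizes" is a tab-delimited, two-column file
#' ## of chromosome name and length (bp)
#' # binDF <- GenerateBins(chromFile="your/path/custom.chrom.sizes",
#' # binSize=5000, overlap=0, withChr=TRUE)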
GenerateBins <- function(chromFile, binSize = 1000, overlap = 0, withChr = TRUE, minChrSize=NULL) {
# generate genome-wide bins for counting purpose
options(stringsAsFactors = FALSE)
if (!file.exists(chromFile)) {
chromFile <- system.file("extdata", paste0(chromFile, ".chrom.sizes"), package = "ChIPseqSpikeInFree")
if (!file.exists(chromFile)) {
stop(paste0("\nchromFile was not found:", chromFile, "\n"))
}
}
if (binSize < 200 || binSize > 10000) {
stop(paste0("\n**Recommended binSize range 200 ~ 10000 (bp); your binSize is ", binSize, " **\n"))
}
cat(paste0("\n\tFollowing file will be used to generate sliding windows: ", basename(chromFile)))
chromSize <- read.table(chromFile, sep = "\t", header = F, fill = TRUE, quote = "")
options(scipen = 999) # disable scientific notation when writing out
excluded <- grepl("random", chromSize[, 1]) | grepl("Un", chromSize[, 1]) | grepl("alt", chromSize[, 1]) | grepl("hap", chromSize[, 1])
chromSize <- chromSize[!excluded, ]
chrNotation <- gsub("chrM", "MT", chromSize[, 1])
chrNotation <- gsub("chr", "", chrNotation)
if (withChr) {
chrNotation <- gsub("^", "chr", chrNotation)
chrNotation <- gsub("chrMT", "chrM", chrNotation)
}
chromSize[, 1] <- chrNotation
cat(paste0("\n\twarn: exclude small contigs [n=", sum(chromSize[,2] < minChrSize),"]"))
if (is.null(minChrSize)){
minChrSize <- binSize * 2
}
chromSize <- chromSize[chromSize[,2] >= minChrSize,]
beds <- NULL
cat("\n\tstart:", as.character(Sys.time()), "\n\t")
for (r in 1:nrow(chromSize)) {
chrName <- chromSize[r, 1]
chrLength <- chromSize[r, 2]
realBinSize <- binSize - overlap
starts <- seq(0, chrLength, realBinSize) + 1
if (chrLength < realBinSize){
ends <- chrLength
}else{
ends <- c(seq(binSize, chrLength, realBinSize), chrLength)
}
starts <- starts[1:length(ends)]
bed <- data.frame(chr = chrName, start = starts, end = ends)
beds <- rbind(beds, bed) # rbind(NULL, bed) returns bed on the first iteration
}
return(beds)
}
######################################################
#' count raw reads for all bins
#'
#' This function counts raw reads for each bin.
#' @param bamFiles a vector of bam filenames.
#' @param chromFile a chrom.size file. Given "hg19","mm10","mm9" or "hg38", the chrom.size file is loaded from the package folder. Otherwise, provide your own chrom.size file.
#' @param prefix prefix of the output file name
#' @param singleEnd to count paired-end reads, set singleEnd=FALSE
#' @param binSize size of bins (bp). Recommend a value between 200 and 10000
#' @return a data.frame of raw counts for each bin
#' @import Rsamtools
#' @import GenomicRanges
#' @import GenomicAlignments
#' @import IRanges
#' @export
#' @examples
#' ## 1.count reads using mm9 bams
#' # bams <- c("your/path/ChIPseq1.bam","your/path/ChIPseq2.bam")
#' # rawCountDF <- CountRawReads(bamFiles=bams,chromFile="mm9",
#' # prefix="your/path/test",singleEnd=TRUE)
CountRawReads <- function(bamFiles, chromFile = "hg19", prefix = "test", singleEnd = TRUE, binSize = 1000) {
# count raw reads for each bin across the genome
# check file
if (sum(!file.exists(bamFiles)) > 0) {
cat("\n** some bam files are unavailable:**")
cat("\t", bamFiles[!file.exists(bamFiles)], "\n")
stop("Invalid bam file list!")
}
if (binSize < 200 || binSize > 10000) {
stop(paste0("\n**Recommended binSize range 200 ~ 10000 (bp); your binSize is ", binSize, " **\n"))
}
# Function to check chromosome notation in bam file
# return a logical vector
ChrCheck <- function(bamFiles) {
bamHeader <- scanBamHeader(bamFiles)
res <- unlist(
lapply(
1:length(bamHeader),
function(x) {
any( grepl ("^chr", names(bamHeader[[x]]$targets)))
}
)
)
res
}
# Function to count reads
# return a data frame
CountingReads <- function(bamFiles, withChr) {
binDF <- GenerateBins(chromFile = chromFile, binSize = binSize, overlap = 0, withChr = withChr)
bins <- binDF
myCoords <- GRanges(
seqnames = bins[, 1],
ranges = IRanges(start = bins[, 2], end = bins[, 3], names = paste0(bins[, 1], ":", bins[, 2], "-", bins[, 3])),
strand = "+"
)
bamlist <- BamFileList(bamFiles, yieldSize = 5e4)
names(bamlist) <- basename(bamFiles)
assay <- NULL ## To please R CMD check
counts <- summarizeOverlaps(
features = myCoords,
reads = bamlist,
ignore.strand = TRUE,
singleEnd = singleEnd
)
dat <- as.data.frame(assay(counts))
return(dat)
}
chrFlags <- ChrCheck(bamFiles)
bams.Chr <- bamFiles[chrFlags]
bams.NoChr <- bamFiles[!chrFlags]
cat("\n\tThis step could take some time. Please be patient...")
dat.Chr <- NULL
dat.NoChr <- NULL
if (length(bams.Chr) > 0) {
dat.Chr <- CountingReads(bams.Chr, TRUE)
}
if (length(bams.NoChr) > 0) {
dat.NoChr <- CountingReads(bams.NoChr, FALSE)
}
if (!is.null(dat.Chr) && !is.null(dat.NoChr)) {
datOut <- data.frame(bin = rownames(dat.Chr), dat.Chr, dat.NoChr, check.names = F)
} else if (!is.null(dat.Chr)) {
datOut <- data.frame(bin = rownames(dat.Chr), dat.Chr, check.names = F)
} else {
datOut <- data.frame(bin = rownames(dat.NoChr), dat.NoChr, check.names = F)
}
libSizeFlag <- colSums(datOut[, -1]) > 10^6
keptSamples <- colnames(datOut[, -1])[libSizeFlag] # drop a sample if its total library size is less than 10^6
if (length(keptSamples) < ncol(datOut[, -1])) {
cat("\n*WARNING* Some samples were removed due to library size (< 10^6) or no read count [ n=", sum(!libSizeFlag), "]:")
cat("\n\t", colnames(datOut[, -1])[!libSizeFlag], "\n")
}
datOut <- datOut[, c("bin", keptSamples)]
rm(list = c("dat.Chr", "dat.NoChr"))
options(scipen = 99) # disable scientific notation when writing out
outFile <- paste0(prefix, "_rawCounts.txt")
write.table(datOut, outFile, sep = "\t", quote = F, row.names = F, col.names = T)
return(invisible(datOut[, -1]))
}
######################################################
#' read in sample metadata file
#'
#' This function allows you to load metadata into an R data.frame and returns the object.
#' In addition, it validates the meta_info format and adds a COLOR column if one is undefined.
#' @param metaFile a metadata file name; the file must have three columns: ID (bam filename without full path), ANTIBODY and GROUP. The COLOR column is optional and is used for plotting.
#' @return A data.frame of metaFile
#' @export
#' @examples
#' ## 1. load an example of metadata file
#'
#' metaFile <- system.file("extdata", "sample_meta.txt", package = "ChIPseqSpikeInFree")
#' meta <- ReadMeta(metaFile)
#' head(meta, n = 1)
#' meta
#' # ID ANTIBODY GROUP COLOR
#' # H3K27me3-NSH.K27M.A.bam H3K27me3-NSH.K27M.A.bam H3K27me3 K27M green
#' # H3K27me3-NSH.K27M.B.bam H3K27me3-NSH.K27M.B.bam H3K27me3 K27M green
#' # H3K27me3-NSH.K27M.C.bam H3K27me3-NSH.K27M.C.bam H3K27me3 K27M green
#' # H3K27me3-NSH.WT.D.bam H3K27me3-NSH.WT.D.bam H3K27me3 WT grey
#' # H3K27me3-NSH.WT.E.bam H3K27me3-NSH.WT.E.bam H3K27me3 WT grey
#' # H3K27me3-NSH.WT.F.bam H3K27me3-NSH.WT.F.bam H3K27me3 WT grey
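#'
#' ## 2. a quick sanity check of the loaded metadata; this simply
#' ## cross-tabulates the ANTIBODY and GROUP columns shown above
#' table(meta$ANTIBODY, meta$GROUP)
#' ## for the bundled example file, this gives 3 K27M and 3 WT samples
#' ## in the H3K27me3 row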
ReadMeta <- function(metaFile = "sample_meta.txt") {
# read in sample metadata file
if (!file.exists(metaFile)) {
stop(paste0("\nFile was not found:", metaFile, "\n"))
}
meta <- read.table(metaFile, sep = "\t", header = TRUE, fill = TRUE, stringsAsFactors = FALSE, quote = "", check.names = F)
colnames(meta) <- toupper(colnames(meta))
# check duplicate ID
if (sum(duplicated(meta$ID)) > 0) {
cat("\n**Warning: found duplicate ID(s) in your metaFile.**\n\n")
print(unique(meta$ID[duplicated(meta$ID)]))
stop("Please correct your metaFile and then re-run the pipeline.\n\n")
}
if (!"ID" %in% colnames(meta) || !"ANTIBODY" %in% colnames(meta) || !"GROUP" %in% colnames(meta)) {
cat("\n**ERROR: Invalid meta file. **\n\tID, ANTIBODY, GROUP column(s) are required. COLOR column is optional.\n")
return(NA)
}
rownames(meta) <- meta$ID
if (!"COLOR" %in% colnames(meta)) {
myColors <- c(
"turquoise", "blue", "brown", "yellow", "green", "red", "black", "pink", "magenta", "purple", "greenyellow", "tan", "salmon", "cyan", "midnightblue", "lightcyan",
"grey60", "lightgreen", "royalblue", "darkred", "darkgreen", "darkturquoise", "darkgrey", "orange", "darkorange", "skyblue", "gold1", "lightyellow",
"saddlebrown", "steelblue", "paleturquoise", "violet", "darkolivegreen", "darkmagenta", "sienna3", "yellowgreen", "skyblue3", "plum1", "orangered4", "mediumpurple3",
"lightsteelblue1", "lightcyan1", "ivory", "floralwhite", "darkorange2", "brown4", "bisque4", "darkslateblue", "plum2", "thistle2", "thistle1", "salmon4",
"palevioletred3", "navajowhite2", "maroon", "lightpink4", "lavenderblush3", "honeydew1", "darkseagreen4", "coral1", "antiquewhite4", "coral2", "mediumorchid", "skyblue2",
"yellow4", "skyblue1", "plum", "orangered3", "mediumpurple2", "lightsteelblue", "lightcoral", "indianred4", "firebrick4", "darkolivegreen4", "brown2", "blue2",
"darkviolet", "plum3", "thistle3", "thistle", "salmon2", "palevioletred2", "navajowhite1", "magenta4", "lightpink3", "lavenderblush2", "honeydew", "darkseagreen3",
"coral", "antiquewhite2", "coral3", "mediumpurple4", "skyblue4", "yellow3", "sienna4", "pink4", "orangered1", "mediumpurple1", "lightslateblue", "lightblue4",
"indianred3", "firebrick3", "darkolivegreen2", "blueviolet", "blue4", "deeppink", "plum4", "thistle4", "tan4", "salmon1", "palevioletred1", "navajowhite",
"magenta3", "lightpink2", "lavenderblush1", "green4", "darkseagreen2", "chocolate4", "antiquewhite1", "coral4", "mistyrose", "slateblue", "yellow2", "sienna2",
"pink3", "orangered", "mediumpurple", "lightskyblue4", "lightblue3", "indianred2", "firebrick2", "darkolivegreen1", "blue3", "brown1", "deeppink1", "powderblue",
"tomato", "tan3", "royalblue3", "palevioletred", "moccasin", "magenta2", "lightpink1", "lavenderblush", "green3", "darkseagreen1", "chocolate3", "aliceblue",
"cornflowerblue", "navajowhite3", "slateblue1", "whitesmoke", "sienna1", "pink2", "orange4", "mediumorchid4", "lightskyblue3", "lightblue2", "indianred1", "firebrick",
"darkgoldenrod4", "blue1", "brown3", "deeppink2", "purple2", "tomato2", "tan2", "royalblue2", "paleturquoise4", "mistyrose4", "magenta1", "lightpink",
"lavender", "green2", "darkseagreen", "chocolate2", "antiquewhite", "cornsilk", "navajowhite4", "slateblue2", "wheat3", "sienna", "pink1", "orange3",
"mediumorchid3", "lightskyblue2", "lightblue1", "indianred", "dodgerblue4", "darkgoldenrod3", "blanchedalmond", "burlywood", "deepskyblue", "red1", "tomato4", "tan1",
"rosybrown4", "paleturquoise3", "mistyrose3", "linen", "lightgoldenrodyellow", "khaki4", "green1", "darksalmon", "chocolate1", "antiquewhite3", "cornsilk2", "oldlace",
"slateblue3", "wheat1", "seashell4", "peru", "orange2", "mediumorchid2", "lightskyblue1", "lightblue", "hotpink4", "dodgerblue3", "darkgoldenrod1", "bisque3",
"burlywood1", "deepskyblue4", "red4", "turquoise2", "steelblue4", "rosybrown3", "paleturquoise1", "mistyrose2", "limegreen", "lightgoldenrod4", "khaki3", "goldenrod4",
"darkorchid4", "chocolate", "aquamarine", "cyan1", "orange1", "slateblue4", "violetred4", "seashell3", "peachpuff4", "olivedrab4", "mediumorchid1", "lightskyblue",
"lemonchiffon4", "hotpink3", "dodgerblue1", "darkgoldenrod", "bisque2", "burlywood2", "dodgerblue2", "rosybrown2", "turquoise4", "steelblue3", "rosybrown1", "palegreen4",
"mistyrose1", "lightyellow4", "lightgoldenrod3", "khaki2", "goldenrod3", "darkorchid3", "chartreuse4", "aquamarine1", "cyan4", "orangered2", "snow", "violetred2",
"seashell2", "peachpuff3", "olivedrab3", "mediumblue", "lightseagreen", "lemonchiffon3", "hotpink2", "dodgerblue", "darkblue", "bisque1", "burlywood3", "firebrick1",
"royalblue1", "violetred1", "steelblue1", "rosybrown", "palegreen3", "mintcream", "lightyellow3", "lightgoldenrod2", "khaki1", "goldenrod2", "darkorchid2", "chartreuse3",
"aquamarine2", "darkcyan", "orchid", "snow2", "violetred", "seashell1", "peachpuff2", "olivedrab2", "mediumaquamarine", "lightsalmon4", "lemonchiffon2", "hotpink1",
"deepskyblue3", "cyan3", "bisque", "burlywood4", "forestgreen", "royalblue4", "violetred3", "springgreen3", "red3", "palegreen1", "mediumvioletred", "lightyellow2",
"lightgoldenrod1", "khaki", "goldenrod1", "darkorchid1", "chartreuse2", "aquamarine3", "darkgoldenrod2", "orchid1", "snow4", "turquoise3", "seashell", "peachpuff1",
"olivedrab1", "maroon4", "lightsalmon3", "lemonchiffon1", "hotpink", "deepskyblue2", "cyan2", "beige", "cadetblue", "gainsboro", "salmon3", "wheat",
"springgreen2", "red2", "palegreen", "mediumturquoise", "lightyellow1", "lightgoldenrod", "ivory4", "goldenrod", "darkorchid", "chartreuse1", "aquamarine4", "darkkhaki",
"orchid3", "springgreen1", "turquoise1", "seagreen4", "peachpuff", "olivedrab", "maroon3", "lightsalmon2", "lemonchiffon", "honeydew4", "deepskyblue1", "cornsilk4",
"azure4", "cadetblue1", "ghostwhite", "sandybrown", "wheat2", "springgreen", "purple4", "palegoldenrod", "mediumspringgreen", "lightsteelblue4", "lightcyan4", "ivory3",
"gold3", "darkorange4", "chartreuse", "azure", "darkolivegreen3", "palegreen2", "springgreen4", "tomato3", "seagreen3", "papayawhip", "navyblue", "maroon2",
"lightsalmon1", "lawngreen", "honeydew3", "deeppink4", "cornsilk3", "azure3", "cadetblue2", "gold", "seagreen", "wheat4", "snow3", "purple3",
"orchid4", "mediumslateblue", "lightsteelblue3", "lightcyan3", "ivory2", "gold2", "darkorange3", "cadetblue4", "azure1", "darkorange1", "paleturquoise2", "steelblue2",
"tomato1", "seagreen2", "palevioletred4", "navy", "maroon1", "lightsalmon", "lavenderblush4", "honeydew2", "deeppink3", "cornsilk1", "azure2", "cadetblue3",
"gold4", "seagreen1", "yellow1", "snow1", "purple1", "orchid2", "mediumseagreen", "lightsteelblue2", "lightcyan2", "ivory1", "white"
)
groupNum <- as.numeric(factor(paste(meta$ANTIBODY, meta$GROUP, sep = "")))
meta$COLOR <- myColors[groupNum]
}
invisible(meta)
}
######################################################
#' parse readCounts matrix
#'
#' This function allows you to parse a rawCount table (generated by the CountRawReads() function) into a parsedMatrix of cutoffs and the percent of reads that cumulatively pass each cutoff in each sample.
#' @param data a data.frame returned by CountRawReads() or a file name of a rawCount table
#' @param metaFile a data.frame of metadata returned by ReadMeta(), or a filename of a metadata file.
#' @param by step used to define cutoffs; ParseReadCounts cumulatively calculates the percent of reads that pass each cutoff.
#' @param prefix prefix of the output filename used to save the parsedMatrix (cutoff, and percent of reads cumulatively passing each cutoff in each sample).
#' @param binSize size of bins (bp). Recommend a value between 200 and 10000
#' @param ncores number of cores for parallel computing.
#' @import parallel
#' @return A data.frame of parsed data.
#' @export
#' @examples
#'
#' ## prerequisite step 1. count raw reads
#' ## (if your bam files were aligned to mm9 genome with chr in reference chromosomes).
#' # bams <- c("your/path/ChIPseq1.bam","your/path/ChIPseq2.bam")
#' # rawCountDF <- CountRawReads(bamFiles=bams,chromFile="mm9",prefix="your/path/test")
#' ## output file will be "your/path/test_rawCounts.txt"
#' # head(rawCountDF,n=2)
#'
#' # bin ChIPseq1.bam ChIPseq2.bam
#' # chr1:1-1000 0 0
#' # chr1:1001-2000 0 0
#'
#' ## prerequisite step 2: generate your sample_meta.txt.
#' ## A tab-delimited txt file has three required columns
#' # ID ANTIBODY GROUP
#' # ChIPseq1.bam H3K27me3 WT
#' # ChIPseq2.bam H3K27me3 K27M
#'
#' ## 1.parse readCount table using this function.
#' # metaFile <- "your/path/sample_meta.txt"
#' # dat <- ParseReadCounts(data="your/path/test_rawCounts.txt",
#' # metaFile=metaFile, prefix="your/path/test")
#' ## output file will be "your/path/test_parsedMatrix.txt"
ParseReadCounts <- function(data, metaFile = "sample_meta.txt", by = 0.05, prefix = "test", binSize = 1000, ncores = 2) {
######################################################
parParseRaw <- function(x, SEQ) {
x <- x[x > 0 ]
totalCPM <- sum(x)
res <- NULL
pct <- NULL
for (K in SEQ) {
pct <- sum(x[x <= K]) / totalCPM
res <- c(res, pct)
if (pct > 0.999) { # exit loop when 99.9% reads have been used
res <- c(res, rep(1, length(SEQ) - length(res)))
break
}
}
res
}
######################################################
if (class(metaFile) == "character") { # given a filename, need to load it
metaFile <- ReadMeta(metaFile)
}
if (class(data) != "data.frame") {
if (file.exists(data)) {
data <- read.table(data, sep = "\t", header = TRUE, fill = TRUE, stringsAsFactors = FALSE, quote = "", row.names = 1, check.names = F)
} else {
stop("Not found rawCount table and your input is not a data.frame.\n")
}
}
libSizeFlag <- colSums(data) > 10^6
keptSamples <- colnames(data)[libSizeFlag] # drop a sample if its total library size is less than 10^6
if (length(keptSamples) < ncol(data)) {
cat("\n*WARNING* Some samples were removed due to small library size or no read count [ n=", sum(!libSizeFlag), "]:")
cat("\n\t", colnames(data)[!libSizeFlag], "\n")
}
data <- data[, keptSamples]
kept <- intersect(metaFile$ID, colnames(data))
if (length(kept) == 0) {
cat("\n** Oooops: you need to change metadata file **\n")
stop("Please check whether you IDs in metaFile match with colnames(bam filenames) in parsedMatrix.\n\n")
}
metaFile <- metaFile[kept, ] # only use samples listed in metaFile
# Calculate the number of cores
avai_cores <- detectCores() - 1
ncores <- ifelse(ncores > avai_cores, avai_cores, ncores)
if (ncores > 1) {
cat(paste0("\n\tEnabling parallel computing ( ", ncores, " cores)...\n"))
}
tryCatch({
# Initiate cluster
cl <- makeCluster(ncores)
clusterExport(cl, varlist = c("data"), envir = environment())
# calculate CPM
CPM <- parLapply(
cl, 1:ncol(data),
function(x) data[, x] / sum(data[, x]) * 1000000 * (1000 / binSize)
)
CPM <- Reduce("cbind", CPM)
colnames(CPM) <- colnames(data)
# zero out bins containing extreme values (CPMW > 150), which typically arise from centromeric regions
MAX_CPMW <- 150
CPM[rowSums(CPM > MAX_CPMW) > 0, ] <- 0
clusterExport(cl, varlist = "CPM", envir = environment())
# calculate data range and times of loop
MAX <- 0
MAX <- parLapply(
cl, 1:ncol(CPM),
function(x) max(CPM[, x])
)
MAX <- Reduce("cbind", MAX)
MAX <- as.numeric(ceiling(max(MAX)))
if (!is.finite(MAX)) {
MAX <- MAX_CPMW
}
SEQ <- seq(0, MAX, by = by)
if (by > MAX || length(SEQ) < 100) {
by <- MAX / 100
SEQ <- seq(0, MAX, by = by)
}
clusterExport(cl, varlist = c("SEQ", "CPM", "parParseRaw"), envir = environment())
options(scipen = 999)
res <- NULL
dat <- NULL
res <- parLapply(
cl, 1:ncol(CPM),
function(x) parParseRaw(CPM[, x], SEQ)
)
stopCluster(cl)
res <- Reduce("cbind", res)
cat("\nCPM was parsed.\n")
colnames(res) <- colnames(CPM)
dat <- data.frame(cutoff = SEQ, res, check.names = F)
if (is.null(res) || is.null(dat) ){
stop("\n Failed to parse raw read counts [ParseReadCounts]!\n\n")
}
kept <- rowSums(res) != ncol(res) # delete rows with all 1
dat <- dat[kept, ]
output <- paste0(prefix, "_parsedMatrix.txt")
write.table(dat, output, sep = "\t", quote = F, row.names = F, col.names = T)
cat("\n\t", output, "[saved]")
},
error = function(e) print(e)
)
return(dat)
}
######################################################
#' calculate scaling factors and save results
#'
#' This function allows you to plot curves and calculate the SF (scaling factor) per antibody based on a parsedMatrix.
#' In addition, it returns a data.frame of updated metadata.
#' If you run ChIPseqSpikeInFree() separately for two batches, the scaling factors will not be comparable between the two batches.
#' The correct way is to combine the bamFiles parameter and create a new metadata file that includes all bam files, then re-run ChIPseqSpikeInFree().
#' @param data a data.frame generated by the function ParseReadCounts() or a file name of a parsed matrix
#' @param metaFile a data.frame of metadata returned by ReadMeta(), or a filename of a metadata file.
#' @param minFirstTurn "auto" or a numeric value [between 0.20 and 0.80] to define the minimum fraction of noisy reads in non-enriched regions.
#' @param maxLastTurn a numeric value [between 0.95 and 0.99] to define the maximum fraction of reads to be included when identifying the last turning point [0.99 by default]. A slightly smaller maxLastTurn may improve results when the enrichment is not ideal.
#' @param cutoff_QC a numeric value [between 0.90 and 1.20] to identify samples with QC failure or poor enrichment [1.2 by default]. For some challenging ChIP-seq targets like H3K9me3, you may try a looser cutoff like 1.
#' @return A data.frame of the updated metadata with scaling factors (scaling factor table)
#' @export
#' @examples
#' ## 1. start from a parsedMatrix file
#'
#' # parsedMatrixFile <- "your/path/test_parsedMatrix.txt"
#' # metaFile <- "your/path/sample_meta.txt"
#' # parsedDF <- read.table(parsedMatrixFile, sep="\t",header=TRUE,fill=TRUE,
#' # quote="",row.names=NULL ,check.names=F)
#' # SF <- CalculateSF (data=parsedDF,metaFile=metaFile)
#' ## For ChIP experiments with less-than-ideal enrichment like H3K9me3, you may
#' ## try a looser cutoff (1) but use 97% of total reads to improve performance.
#' # SF <- CalculateSF(data=parsedDF, metaFile = metaFile, maxLastTurn=0.97, cutoff_QC=1)
#'
#' ## 2. start from a rawCount file
#'
#' # metaFile <- "your/path/sample_meta.txt"
#' # parsedDF <- ParseReadCounts(data="your/path/test_rawCounts.txt", metaFile=metaFile,
#' # prefix="your/path/test_parsedMatrix.txt")
#' # SF <- CalculateSF (data=parsedDF,metaFile=metaFile)
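#'
#' ## 3. a sketch of inspecting the returned table; CalculateSF() adds QC,
#' ## TURNS, SLOPE and SF columns, and QC-failed samples get SF = NA
#' # SF[, c("ID", "ANTIBODY", "GROUP", "QC", "SF")]
#' # subset(SF, QC == "pass") # keep only samples that passed QC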
CalculateSF <- function(data, metaFile = "sample_meta.txt",minFirstTurn = "auto", maxLastTurn=0.99, cutoff_QC=1.2) {
# calculate scaling factors
options(stringsAsFactors = F)
if (class(metaFile) == "character") { # given a filename, need to load it
meta <- ReadMeta(metaFile)
} else {
meta <- metaFile
}
if (class(data) == "character") { # given a filename, need to load it
if (file.exists(data)) {
data <- read.table(data, sep = "\t", header = TRUE, fill = TRUE, stringsAsFactors = FALSE, quote = "", row.names = NULL, check.names = F)
} else {
stop("Not found parsed matrix and your input is not a data.frame.\n")
}
}
kept <- intersect(meta$ID, colnames(data))
if (length(kept) == 0) {
stop("meta data IDs don't match with column names in parsedMatrix.txt.")
}
meta <- meta[kept, ] # only use samples listed in meta
data <- data[, c(colnames(data)[1], kept)] # reorder and subset
#---------FUNCTION for quality control-------------------------------------
QC <- function(data, minFirstTurn = "auto", maxLastTurn = 0.99, cutoff_QC = 1.2) {
# minFirstTurn: "auto" [default] or a value between 0.1 and 0.8; 0.5 works well empirically
# maxLastTurn: a value between 0.95 and 0.99; 0.99 works well empirically
# cutoff_QC: 1.2 detects 90% of input samples as QC failures [default]
findLastTurnByPeak <- function(x, maxLastTurn1=0.99) {
# define the last turning point as the highest peak in the density distribution.
# x must be a vector with names
# por : proportion of reads
x <- x[x > 0 & x < maxLastTurn1]
d <- density(x)
dValues <- diff(d$y)
turns <- which(dValues[-1] * dValues[-length(dValues)] < 0) + 1
if (length(turns) == 0) {
ind <- which.max(d$y)
} else {
ind <- max(turns)
}
list(
por = round(d$x[ind], 4),
cpmw = as.numeric(names(x[x >= d$x[ind]][1]))
)
}
findLastTurnByCutoff <- function(x, minDiff = 0.001, maxLastTurn2=0.99) {
# find the last turning point where dValue is close to minDiff
# x must be a vector with names
# por : proportion of reads
x <- x[x > 0 & x < maxLastTurn2] #
dValues <- diff(x)
dValues <- dValues[dValues > 0]
minDiff <- ifelse(min(dValues) > minDiff, min(dValues), minDiff)
dValuesAfterPeak <- dValues[which.max(dValues):length(dValues)]
ind <- which(dValuesAfterPeak > 0 & dValuesAfterPeak <= minDiff)[1]
list(
por = round(x[names(dValues)[ind]], 4),
cpmw = as.numeric(names(dValues)[ind])
)
}
findLastTurn <- function(x, minDiff = 0.001, maxLastTurn4=0.99) {
# determine the optimal last turning point from the two methods
x <- x[x > 0 & x < maxLastTurn4]
res1 <- findLastTurnByPeak(x,maxLastTurn1=maxLastTurn4)
res2 <- findLastTurnByCutoff(x,minDiff, maxLastTurn2=maxLastTurn4)
delta <- max(x) - res1$por
# to deal with samples having bad enrichment and a long tail in the distribution
if (delta < 0.01) {
return(res2)
}
return(res1)
}
findFirstTurn <- function(x, cutoff = minFirstTurn, maxLastTurn3=0.99) {
# define the first turning point before the highest peak
# x must be a vector with names
# cutoff: "auto" or value between 0.1-0.8
# por : proportion of reads
if (tolower(as.character(cutoff)) == "auto") {
x <- x[x > 0 & x < maxLastTurn3]
d1 <- density(x)
Xpeak <- d1$x[which.max(d1$y)]
d2 <- density(d1$x[d1$x < Xpeak])
por <- min(d2$x[which.max(d2$y):length(d2$x)])
cpmw <- as.numeric(names(x[x >= por][1]))
} else if (as.numeric(cutoff) >= 0.1 & as.numeric(cutoff) <= 0.8) {
por <- x[which(as.numeric(names(x)) >= cutoff)[1]]
cpmw <- cutoff
} else {
stop("\n**Invalid cutoff for detection of the first turning point**\n")
}
list(por = round(por, 4), cpmw = cpmw)
}
QC.list <- apply(data[, 2:ncol(data)], 2, FUN = function(x, cutoffQC = cutoff_QC) {
names(x) <- data[, 1]
# find last turning point
turnLast <- findLastTurn(x, minDiff = 0.001, maxLastTurn4=maxLastTurn)
turnFirst <- findFirstTurn(x, cutoff = minFirstTurn, maxLastTurn3=maxLastTurn)
if (turnLast$cpmw >= cutoffQC) {
QCstr <- "pass"
} else {
QCstr <- "fail: complete loss, input or poor enrichment"
}
slope <- (turnLast$por - turnFirst$por) / (turnLast$cpmw - turnFirst$cpmw)
list(
QC = QCstr, TURNS = paste0(turnFirst$cpmw, ",", turnFirst$por, ",", turnLast$cpmw, ",", turnLast$por),
xMin = turnFirst$cpmw, yMin = turnFirst$por, xMax = turnLast$cpmw, yMax = turnLast$por, SLOPE = slope
)
})
QC.df <- do.call(rbind.data.frame, QC.list)
QC.df
}
#---------calculate SF-------------------------------------
QC.df <- QC(data, minFirstTurn = minFirstTurn, maxLastTurn=maxLastTurn, cutoff_QC = cutoff_QC)
meta <- meta[, !colnames(meta) %in% c("TURNS", "QC", "SF")]
meta <- cbind(meta, QC.df[rownames(meta), ])
meta$SF <- NA
for (ab in unique(meta$ANTIBODY)) {
inds <- which(meta$ANTIBODY == ab) # exact matching avoids regex issues with special characters in antibody names
SF <- round(max(meta$SLOPE[inds]) / meta$SLOPE[inds], 2)
meta$SF[inds] <- SF
}
meta$SF[meta$SF == 0 ] <- 1 # can happen when Xa equals Xb, only if the user manually sets maxLastTurn and cutoff_QC inappropriately
meta$SF[meta$QC != "pass"] <- NA
return(invisible(meta))
}
#-----------------------------------------------------
#' This function generates CPMW distribution curves and a barplot using the scaling factor table.
#'
#' @param data a data.frame generated by the function ParseReadCounts() or a file name of a parsed matrix
#' @param SF a data.frame generated by CalculateSF(), or a filename of a scaling factor table.
#' @param prefix prefix of the output filename used to save the plots and scaling factor values.
#' @param xlimMaxCPMW NULL or a numeric value [between 10 and 100] to define the maximum CPMW in the distribution plot (xlim).
#' @return None
#' @export
#' @examples
#' ## 1. start from a parsedMatrix file
#'
#' # parsedMatrixFile <- "your/path/test_parsedMatrix.txt"
#' # metaFile <- "your/path/sample_meta.txt"
#' # parsedDF <- read.table(parsedMatrixFile, sep="\t",header=TRUE,fill=TRUE,
#' # quote="",row.names=NULL ,check.names=F)
#'
#' ## use default setting
#' # SF <- CalculateSF(data = parsedDF, metaFile = metaFile)
#' # PlotDistr (data=parsedDF,SF=SF, prefix="your/path/test", xlimMaxCPMW=NULL)
#'
#' ## use custom setting for H3K9me3
#' # SF <- CalculateSF(data = parsedDF, metaFile = metaFile,
#' # maxLastTurn=0.97, cutoff_QC=1)
#' # PlotDistr (data=parsedDF,SF=SF, prefix="your/path/test_manual_cutoff", xlimMaxCPMW=NULL)
#'
#' ## zoom out distribution curve
#' # PlotDistr (data=parsedDF,SF=SF, prefix="your/path/test_manual_cutoff_zoom", xlimMaxCPMW=50)
#'
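#' ## 2. a sketch of re-plotting from files written by a previous run; assumes
#' ## "your/path/test_parsedMatrix.txt" and "your/path/test_SF.txt" both exist
#' # PlotDistr(data="your/path/test_parsedMatrix.txt", SF="your/path/test_SF.txt",
#' # prefix="your/path/test_replot", xlimMaxCPMW=NULL)
#'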
######################################################
PlotDistr <- function(data, SF = "test_SF.txt", prefix = "test", xlimMaxCPMW = NULL) {
# plot distribution of CPMW
if (is.character(SF)) { # given a filename, need to load it
if (file.exists(SF)) {
meta <- ReadMeta(SF)
} else {
stop("SF file not found and your SF input is not a data.frame.\n")
}
} else { # assume it is a data.frame
meta <- SF
}
if (class(data) == "character") { # given a filename, need to load it
if (file.exists(data)) {
data <- read.table(data, sep = "\t", header = TRUE, fill = TRUE, stringsAsFactors = FALSE, quote = "", row.names = NULL, check.names = F)
} else {
stop("Not found parsed matrix and your input is not a data.frame.\n")
}
}
x <- data[, 1]
imgOutput <- paste0(prefix, "_distribution.pdf")
pdf(imgOutput, width = 14, height = 8)
slopesByAb <- NULL
for (ab in c("All Antibodies", unique(meta$ANTIBODY))) {
if (ab == "All Antibodies") {
metaByAb <- meta
} else {
metaByAb <- meta[meta$ANTIBODY == ab, ]
}
if (length(unique(meta$ANTIBODY)) == 1) {
# avoid redundant and identical plot
if (ab == unique(meta$ANTIBODY)) {
next
} else {
ab <- unique(meta$ANTIBODY)
}
}
subsetByAb <- as.data.frame(data[, metaByAb$ID], check.names = F)
colnames(subsetByAb) <- metaByAb$ID
# delete rows where every sample's value already exceeds 0.99 (the curves have plateaued)
if (ncol(subsetByAb) > 1) {
kept <- rowSums(subsetByAb > 0.99) != ncol(subsetByAb)
x <- data[kept, 1]
subsetByAb <- subsetByAb[kept, ]
} else {
x <- data[, 1]
}
# Set plot layout
par(mfrow = c(1, 3), oma = c(0, 0, 5, 0))
layout.matrix <- matrix(c(1, 2, 3), nrow = 1, ncol = 3)
layout(
mat = layout.matrix,
widths = c(4, 3, 1)
) # Widths of the three columns
par(mar = c(10, 6, 6, 2))
#-----------------------------------------------------
#-------------plot1: curves--------------------------
if (is.null(xlimMaxCPMW)){
Xbs <- strsplit(meta$TURNS,",")
MAX_CPMW <- max(as.numeric(unlist(lapply(Xbs,'[',3) ))) + 10
}else{
MAX_CPMW <- min(max(x), xlimMaxCPMW, 80)
}
if (ncol(subsetByAb) == 1) {
totalPages <- 1
} else {
totalPages <- 1:ncol(subsetByAb)
}
for (r in totalPages) {
y <- subsetByAb[, r]
id <- colnames(subsetByAb)[r]
used <- data.frame(x = x, y = y)
used <- na.omit(used)
if (r == 1) {
plot(used,
main = "Cumulative Distribution", col = metaByAb$COLOR[r], lwd = 2, xlab = "Cutoff (CPMW)",
cex.lab = 1.5, cex.axis = 1.5, type = "l", ylab = "Proportion of reads", xlim = c(0, MAX_CPMW),
ylim = c(0.0, 1.1), cex.main = 1.5
)
} else {
lines(used, col = metaByAb$COLOR[r], lty = 1, lwd = 2, pch = 20, cex = 0.1)
}
if (!is.na(metaByAb$SF[r])) { # skip slope line for QC-failed sample
lines(x = c(metaByAb$xMin[r], metaByAb$xMax[r]), y = c(metaByAb$yMin[r], metaByAb$yMax[r]), col = metaByAb$COLOR[r], lty = 3)
}
}
labelLength <- max(nchar(colnames(subsetByAb)))
fontSize <- ifelse(labelLength < 40, 1.5, 0.8)
legFontSize <- ifelse(ncol(subsetByAb) < 10, fontSize, 0.4 + 5 / ncol(subsetByAb))
legend("bottomright",
legend = paste(gsub(".bam", "", metaByAb$ID), paste(", SF=", metaByAb$SF, sep = ""), sep = ""),
col = metaByAb$COLOR, text.col = metaByAb$COLOR, pch = 15, bty = "n", ncol = 1, cex = legFontSize
)
#-----------plot2: barplot----------------------------
cpmw_grps <- do.call(rbind.data.frame, strsplit(metaByAb$TURNS, ","))
cpmw_Xa <- as.numeric(as.character(cpmw_grps[, 1]))
cpmw_Xb <- as.numeric(as.character(cpmw_grps[, 3]))
datTmp1 <- data.frame(factor = metaByAb$GROUP, value = cpmw_Xa)
Xa <- max(with(datTmp1, tapply(value, factor, median)))
datTmp2 <- data.frame(factor = metaByAb$GROUP, value = cpmw_Xb)
Xb <- min(with(datTmp2, tapply(value, factor, median)))
cutoffLow <- round(Xa, 2)
cutoffHigh <- round(Xb, 2)
if (ab == "All Antibodies") { # for all antibody
cutoffLow <- 0.5
cutoffHigh <- 1.2
}
if (cutoffHigh >= 3) { # high enrichment
myCols <- c(gray(8 / 9), "pink", "red")
} else if (cutoffHigh >= 1.2) { # median enrichment
myCols <- c(gray(c(8, 7) / 9), "pink")
} else { # poor enrichment
myCols <- gray(c(8, 7, 6) / 9)
}
Low <- which.min(abs(x - cutoffLow))
High <- which.min(abs(x - cutoffHigh))
barplotDF <- rbind(subsetByAb[Low, ], subsetByAb[High, ] - subsetByAb[Low, ])
barplotDF <- rbind(barplotDF, 1 - subsetByAb[High, ])
barplotDF <- as.data.frame(barplotDF, check.names = F)
colnames(barplotDF) <- metaByAb$ID
rownames(barplotDF) <- c(
paste0("Low (0-", cutoffLow, ")"),
paste0("Low [", cutoffLow, "~", cutoffHigh, "]"),
paste0("High (>", cutoffHigh, ")")
)
xLabels <- colnames(barplotDF)
xLabels <- gsub("\\.bam$", "", xLabels, ignore.case = TRUE)
barNum <- ncol(subsetByAb)
if (max(nchar(xLabels)) > 40 || barNum > 10) {
fontSize <- 0.8
} else {
fontSize <- 1.5
}
if (max(nchar(xLabels)) > 50) {
xLabels <- strtrim(xLabels, 50)
bottomMargin <- 32
} else if (max(nchar(xLabels)) < 20) {
bottomMargin <- 16
} else {
xLabels <- strtrim(xLabels, 40)
bottomMargin <- 25
}
legLbls <- rownames(barplotDF)
par(mar = c(bottomMargin, 10, 6, 0)) # c(bottom, left, top, right)
xlimMax <- 1
barWidth <- ifelse(barNum < 10, xlimMax / barNum * 0.7, xlimMax / barNum * 0.9)
barSpace <- ifelse(barNum < 10, xlimMax / barNum * 0.45, xlimMax / barNum * 0.25)
bp <- barplot(as.matrix(barplotDF),
col = myCols, beside = F, main = "Group by enrichment level", ylab = "Proportion of reads",
axes = FALSE, axisnames = FALSE, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5, cex.names = 1.5,
xlim = c(0, xlimMax), width = barWidth, space = barSpace
)
axis(2, cex.axis = 2)
axis(1, at = bp, labels = FALSE, tck = -0.02)
text(
x = bp, y = par("usr")[3] - (par("usr")[4] - par("usr")[3]) / 30, labels = xLabels,
col = metaByAb$COLOR, srt = 60, adj = 1, xpd = TRUE, cex = fontSize
)
#---------plot3: legend---------------------------------
# c(bottom, left, top, right)
par(mar = c(6, 2, 6, 2), xpd = T)
plot.new()
legend("top", fill = rev(myCols), legend = rev(legLbls), ncol = 1, cex = 1.2, bty = "n", title = "Group[CPMW]")
mtext(ab, outer = TRUE, cex = 1.5, line = 2)
mtext(imgOutput, outer = TRUE, cex = 1, line = 0)
}
garbage <- dev.off()
cat("\n\t", imgOutput, "[saved]")
#-----------------------------------------------------
output <- paste0(prefix, "_SF.txt")
outDF <- meta[, !colnames(meta) %in% c("SLOPE", "xMin", "yMin", "xMax", "yMax")]
write.table(outDF, output, sep = "\t", quote = F, row.names = F, col.names = T)
cat("\n\t", output, "[saved]")
cat("\n\n Reporting summary")
if ("GROUP" %in% colnames(meta)) {
combs <- paste(meta$ANTIBODY, ".", meta$GROUP, sep = "")
groups <- unique(combs)
for (group in groups) {
inds <- which(combs == group) # exact matching avoids regex issues with "." in group names
cat("\n\t", group, ",\tave.SF =", round(mean(meta$SF[inds], na.rm = TRUE), 3))
# cat(",\tave.SLOPE =", round(mean(meta$SLOPE[inds]),4))
}
cat("\n")
}
}
######################################################
#' This function generates a boxplot using the scaling factor table. It is included in the last step of ChIPseqSpikeInFree().
#'
#' @param input a file/data.frame generated/returned by CalculateSF() or ChIPseqSpikeInFree().
#' It looks like a metadata file but has extra columns SF and COLOR.
#' @param prefix prefix of the output filename.
#' @return The filename of the generated boxplot
#' @export
#' @examples
#' ## 1. re-generate the boxplot of ChIPseqSpikeInFree scaling factors
#' ## After you run ChIPseqSpikeInFree(), a scaling factor table
#' ## (for example, test_SF.txt) will be generated.
#'
#' # BoxplotSF(input="test_SF.txt",prefix="test")
BoxplotSF <- function(input, prefix = "test") {
# This function generates a boxplot using the scaling factor table
if (is.character(input)) {
input <- read.table(input, sep = "\t", header = TRUE, fill = TRUE, quote = "", stringsAsFactors = FALSE, check.names = F)
rownames(input) <- input$ID
}
if (!"SF" %in% colnames(input) || !"ANTIBODY" %in% colnames(input) || !"GROUP" %in% colnames(input) || !"QC" %in% colnames(input)) {
stop("Input looks invalid for BoxplotSF().\n")
}
input <- na.omit(input) # exclude samples with SF==NA
if (nrow(input) == 0) {
cat("\n **no data for boxplot**\n")
return(NULL)
}
# =======================================================
plotByAb <- function(metaByAb, myTitle, PAGE) {
# no return value; used only for its side effect of generating plots
groupSize <- length(unique(metaByAb$GROUP2))
fontSize <- ifelse(groupSize > 10, 0.6, 1.0)
if (PAGE == 1) {
nCol <- ifelse(groupSizeAll > 10, 2, 1)
marB <- 10
marL <- 4
} else {
nCol <- 1
marB <- 15
marL <- 12
}
groupLabels <- sort(unique(metaByAb$GROUP2))
nameOrder <- ordered(metaByAb$GROUP2, levels = groupLabels)
if (!"COLOR" %in% colnames(metaByAb)) {
tim10equal <- c("skyblue", "#EF0000", "grey", "#00DFFF", "#50FFAF", "#BFFF40", "#FFCF00", "#FF6000", "#0000FF", "#800000")
myCols <- tim10equal[as.factor(metaByAb$GROUP2)]
} else {
myCols <- metaByAb[!duplicated(metaByAb$GROUP2), "COLOR"]
}
#---------panel left---------------
par(mar = c(marB, marL, marB / 2, 1))
bp <- boxplot(SF ~ nameOrder,
data = metaByAb,
main = myTitle, ylim = c(0, max(metaByAb$SF)),
las = 2, ylab = "Scaling Factors", xlab = "", xaxt = "n", cex.axis = fontSize, col = myCols,
cex.lab = 1, cex.main = 1, outline = T, pars = list(outcol = "white")
)
axis(1, at = seq_along(groupLabels), labels = FALSE, tck = -0.02)
text(
x = seq_along(groupLabels), y = par("usr")[3] - (par("usr")[4] - par("usr")[3]) / 30, srt = 45, adj = 1,
labels = groupLabels, col = myCols, xpd = TRUE, cex = fontSize
)
stripchart(SF ~ nameOrder,
data = metaByAb,
vertical = T,
method = "jitter", add = TRUE, jitter = 0.2, cex = 0.8, pch = 1, col = "#595959"
)
#---------panel right---------------
par(mar = c(marB, 0, marB / 2, 3), xpd = T)
# c(bottom, left, top, right)
plot.new()
legend("top", legend = groupLabels, col = myCols, pch = 15, bty = "n", ncol = nCol, cex = fontSize)
# mtext(myTitle, outer = TRUE, cex = 1, line = 0)
}
# =======================================================
input$SF <- as.numeric(input$SF)
input$GROUP2 <- paste(input$GROUP, input$ANTIBODY, sep = ".")
input <- input[order(input$ANTIBODY), ]
groupSizeAll <- length(unique(input$GROUP2))
antibodyList <- unique(input$ANTIBODY)
imgWidth <- ifelse(groupSizeAll > 15, 8 + groupSizeAll / 10,
ifelse(length(antibodyList) > 1, 12, 8)
)
output <- paste0(prefix, "_boxplot.pdf")
pdf(output, width = imgWidth, height = 7)
# page1: All antibodies
layout.matrix <- matrix(c(1, 2), nrow = 1, ncol = 2)
layout(layout.matrix, widths = c(4, 3))
myTitle <- "ScalingFactor ~ All antibodies"
par(oma = c(2, 2, 2, 2))
metaByAb <- input
plotByAb(metaByAb, myTitle, 1)
if (length(antibodyList) > 1) {
# avoid redundant and identical plot
# page2~END: 2 antibodies per page
num <- 2
layout.matrix <- matrix(1:(num * 2), nrow = 1, ncol = num * 2)
layout(layout.matrix, widths = rep(c(4, 3), num))
for (ab in antibodyList) {
metaByAb <- input[input$ANTIBODY == ab, ]
myTitle <- paste0("ScalingFactor ~ ", ab)
plotByAb(metaByAb, myTitle, 2)
}
}
garbage <- dev.off()
cat("\n\t", output, "[saved]")
invisible(output)
}
######################################################
#' wrapper function - perform ChIP-seq spike-free normalization in one step.
#'
#' This function wraps all steps.
#' If you run ChIPseqSpikeInFree() separately for two batches, the scaling factors will not be comparable between the two batches.
#' The correct way is to combine the bamFiles parameter and create a new metadata file that includes all bam files, then re-run ChIPseqSpikeInFree().
#' @param bamFiles a vector of bam filenames.
#' @param chromFile chrom.size file. Given "hg19","mm10","mm9" or "hg38", the chrom.size file is loaded from the package folder.
#' @param metaFile a filename of a metadata file. The file must have three columns: ID (bam filename without full path), ANTIBODY and GROUP
#' @param binSize size of bins (bp). Recommend a value between 200 and 10000
#' @param minFirstTurn a numeric value [between 0.3 and 0.8] to define the minimum fraction of reads to be ignored as background [0.5 by default], or the character "auto" to allow automatic selection of the value.
#' @param maxLastTurn a numeric value [between 0.95 and 0.99] to define the maximum fraction of reads to be included when identifying the last turning point [0.99 by default]. A slightly smaller maxLastTurn value may improve results when the enrichment is not ideal.
#' @param singleEnd a logical value to define whether your reads are single-end [TRUE by default] or paired-end [FALSE].
#' @param cutoff_QC a numeric value [between 0.95 and 1.20] to identify samples with QC failure or poor enrichment [1.2 by default]. A slightly smaller cutoff_QC value (like 1) may improve results when the enrichment is not ideal.
#' @param disableOverwritten a logical value to define whether to overwrite previous counting matrix & parsed matrix [FALSE by default].
#' @param ncores number of cores for parallel computing.
#' @param prefix prefix of output filename.
#' @return A data.frame of the updated metaFile with scaling factor
#' @export
#' @examples
#' ## 1 first You need to generate a sample_meta.txt (tab-delimited txt file).
#' # metaFile <- "your/path/sample_meta.txt"
#' # meta <- ReadMeta(metaFile)
#' # head(meta)
#' # ID ANTIBODY GROUP
#' # ChIPseq1.bam H3K27me3 WT
#' # ChIPseq2.bam H3K27me3 K27M
#'
#' ## 2. bam files
#' # bams <- c("ChIPseq1.bam","ChIPseq2.bam")
#' # prefix <- "test"
#'
#' ## 3. run ChIPseqSpikeInFree pipeline
#' # ChIPseqSpikeInFree(bamFiles=bams, chromFile="mm9",metaFile=metaFile,prefix="test")
#'
#' ## 4. run ChIPseqSpikeInFree pipeline with custom arguments for H3K9me3 with unideal enrichment
#' # ChIPseqSpikeInFree(bamFiles=bams, chromFile="mm9",
#' # metaFile=metaFile,prefix="test_manual_cutoffs",
#' # cutoff_QC = 1, maxLastTurn=0.97)
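#'
#' ## 5. a sketch of reloading results afterwards; the run above writes
#' ## "test_SF.txt", which can be read back for downstream use
#' # SF <- read.table("test_SF.txt", sep="\t", header=TRUE,
#' # stringsAsFactors=FALSE, check.names=FALSE)
#' # SF[, c("ID", "ANTIBODY", "GROUP", "QC", "SF")]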
ChIPseqSpikeInFree <- function(bamFiles,
chromFile = "hg19",
metaFile = "sample_meta.txt",
prefix = "test",
binSize = 1000,
cutoff_QC = 1.2,
minFirstTurn = 0.5,
maxLastTurn = 0.99,
singleEnd = TRUE,
disableOverwritten = TRUE,
ncores = 2
) {
# perform ChIP-seq spike-free normalization in one step
if (binSize < 200 || binSize > 10000) {
cat(paste0("\n**recommended binSize range 200~10000 (bp); your binSize is ", binSize, " **\n"))
}
cat("\nstep1. loading metadata file...")
meta <- ReadMeta(metaFile)
cat("\n\t[--done--]\n")
cat("\nstep2. counting reads...")
output1 <- paste0(prefix, "_rawCounts.txt")
# in case of downstream errors, just load existing outFile to avoid re-counting
if (file.exists(output1) && disableOverwritten) {
cat("\n\t", output1, "[just loading the existing file; delete this file first to do re-counting ]")
rawCountDF <- read.table(output1, sep = "\t", header = TRUE, fill = TRUE, stringsAsFactors = FALSE, quote = "", row.names = 1, check.names = F)
} else {
rawCountDF <- CountRawReads(bamFiles = bamFiles, chromFile = chromFile, prefix = prefix, binSize = binSize, singleEnd = singleEnd)
}
cat("\n\t[--done--]\n")
cat("\nstep3. parsing raw counts...")
output2 <- paste0(prefix, "_parsedMatrix.txt")
# in case of downstream errors, just load existing outFile to avoid re-processing raw counts
if (file.exists(output2) && disableOverwritten) {
cat("\n\t", output2, "[just loading the existing file; delete this file first to re-parse rawCount table]")
parsedDF <- read.table(output2, sep = "\t", header = TRUE, fill = TRUE, stringsAsFactors = FALSE, quote = "", row.names = NULL, check.names = F)
} else {
steps <- 0.05 * binSize / 1000
parsedDF <- ParseReadCounts(data = rawCountDF, metaFile = meta, prefix = prefix, by = steps, binSize = binSize, ncores = ncores)
}
cat("\n\t[--done--]\n")
cat("\nstep4. calculating scaling factors...")
SF <- CalculateSF(data = parsedDF, metaFile = meta, minFirstTurn = minFirstTurn, maxLastTurn=maxLastTurn, cutoff_QC=cutoff_QC)
cat("\nstep5. ploting distribution curves and bars ...")
PlotDistr(parsedDF, SF, prefix, xlimMaxCPMW=NULL )
cat("\t[--done--]\n")
cat("\nstep6. ploting scaling factors...")
BoxplotSF(SF, prefix)
cat("\n\t[--done--]\n")
}