dptviz: dptviz

# ###################################################################
# Author: Jeffrey Bhasin <jeffb@case.edu>
# ###################################################################

# ###################################################################
# Main Report Functions

# -------------------------------------------------------------------
#' Summary plots of data preprocessed in bins by iDPT
#'
#' iDPT performs a preprocessing stage to convert a set of alignments in a BAM file to average coverage values in windows of a user-specified size genome-wide. This function saves a PDF file containing plots that summarize the distribution of these average coverage values and their relationship to CpG density on a specified reference genome.
#' @param dptpath Path to a folder containing an iDPT run (the folder that contains the "DPT_ws" folder)
#' @param chrs A vector of chromosome names to use (by default, the autosomes and sex chromosomes from hg19)
#' @param bsgenome A BSgenome object for the reference genome used in the alignments contained in the input BAM files given to iDPT (i.e. for UCSC's hg19, you would load "library(BSgenome.Hsapiens.UCSC.hg19)" and give BSgenome.Hsapiens.UCSC.hg19 to this argument). This is needed to compute CpG density in the windows used by iDPT.
#' @param file Filename to save the PDF report to (if NULL, will use the name of the folder as a prefix and save "prefix.reportPrepro.pdf" in the current working directory")
#' @param ncore Number of concurrent threads to use
#' @export
reportPrepro <- function(dptpath, chrs=c(sapply(seq(1,22),function(x) paste("chr",x,sep="")),"chrX","chrY"), bsgenome, file=NULL, ncore=1)
{
	# Set number of cores for %dopar%
	registerDoMC(ncore)

	# Set filename using dptpath if none given as argument
	if(is.null(file))
	{
		prefix <- str_split(dptpath, "/")[[1]]
		prefix <- prefix[length(prefix)]
		file <- paste(prefix, ".reportPrepro.pdf", sep="")
	}

	# Load data as matrix
	message("Loading data matrix")
	mat <- dptviz:::loadPrepro(dptpath, chrs)

	# Load bin locations as GRanges
	message("Extracting bin coordinates")
	bins.gr <- dptviz:::getBins(dptpath, chrs)

	# Calculate CpG density in reference genome for each bin
	message("Calculating CpG density for each bin")
	cpgctvec <- dptviz:::getCpG(bins.gr, bsgenome)

	# Make the average coverage into integer counts
	mat.r <- round(mat,0)

	# Start Plotting

	# Open PDF graphics device
	pdf(file=file, width=10.5, height=8)
	message(paste("Plotting as PDF: ",file, sep=""))

	# PLOT: Proportional Bar Graph of Zero to Nonzero
	df <- data.frame(sample=rownames(mat), zero=rowSums(mat==0), "less.than.10"=rowSums((mat<=10)&(mat!=0)), "from.11.to.100"=rowSums((mat>10)&(mat<=100)), "greater.than.100"=rowSums(mat>100))
	df <- melt(df)
	print(ggplot(df, aes(x=sample, y=value, fill=variable)) + geom_bar(stat="identity") + dptviz:::ggnice() + coord_flip() + labs(title="Proportion of Windows with Zero Reads", x="Sample", y="Number of Windows"))

	# PLOT: Kernel Density Plot of Read Counts by Windows
	dat <- melt(mat[,colSums(mat)>0])
	dat <- dat[dat$value>0,]
	print(ggplot(dat, aes(x=value, y=..count..)) + geom_density(aes(group=X1, colour=X1, fill=X1), alpha=0.3) + dptviz:::ggnice() + labs(title="Unrounded Read Distribution, Windows with > 0 Reads", x="log10(Read Count)", y="Number of Windows with Count"))

	# PLOT: Zoomed Kernel Density
	datsub <- dat[dat$value<25,]
	print(ggplot(datsub, aes(x=value, y=..count..)) + geom_density(aes(group=X1, colour=X1, fill=X1), alpha=0.3) + dptviz:::ggnice() + labs(title="Unrounded Read Distribution, Windows with > 0 Reads", x="Read Count", y="Number of Windows with Count"))

	# PLOT: Histogram of exact counts at read levels near the initfilter cutoff zone
	df <- foreach(i=1:nrow(mat), .combine="rbind") %dopar%
	{
		matsub <- mat.r[i,(mat.r[i,]>0)&(mat.r[i,]<=20)]
		h <- hist(matsub, breaks=c(1:21), include.lowest=TRUE, right=FALSE, plot=FALSE)
		bins <- h$breaks[1:(length(h$breaks)-1)]
		bins <- factor(bins, levels=bins)
		ret <- data.frame(sample=rownames(mat)[i], reads=bins, windows=h$counts)
		ret
	}
	print(ggplot(df, aes(x=reads, y=windows, fill=sample)) + geom_bar(stat="identity") + dptviz:::ggnice() + labs(title="Histogram of Rounded Read Counts", x="Read Count", y="Number of Windows at Count") + facet_grid(sample ~ .))

	# PLOT: Histogram of CpG count in windows with and without signal
	df1 <- foreach(i=1:nrow(mat), .combine="rbind") %dopar%
	{
		wins <- mat.r[i,]>0
		cg <- cpgctvec[wins]
		h <- hist(cg, breaks=0:(max(cpgctvec)+1), include.lowest=TRUE, right=FALSE, plot=FALSE)
		bins <- h$breaks[1:(length(h$breaks)-1)]
		bins <- factor(bins, levels=bins)
		ret <- data.frame(sample=rownames(mat)[i], type="greater_than_zero", ncpg=bins, nwindows=h$counts, per=(h$counts/sum(wins))*100)
		ret
	}
	df2 <- foreach(i=1:nrow(mat), .combine="rbind") %dopar%
	{
		wins <- mat.r[i,]==0
		cg <- cpgctvec[wins]
		h <- hist(cg, breaks=0:(max(cpgctvec)+1), include.lowest=TRUE, right=FALSE, plot=FALSE)
		bins <- h$breaks[1:(length(h$breaks)-1)]
		bins <- factor(bins, levels=bins)
		ret <- data.frame(sample=rownames(mat)[i], type="zero", ncpg=bins, nwindows=h$counts, per=(h$counts/sum(wins))*100)
		ret
	}
	df3 <- foreach(i=1:nrow(mat), .combine="rbind") %dopar%
	{
		wins <- mat.r[i,]>3
		cg <- cpgctvec[wins]
		h <- hist(cg, breaks=0:(max(cpgctvec)+1), include.lowest=TRUE, right=FALSE, plot=FALSE)
		bins <- h$breaks[1:(length(h$breaks)-1)]
		bins <- factor(bins, levels=bins)
		ret <- data.frame(sample=rownames(mat)[i], type="greater_than_3", ncpg=bins, nwindows=h$counts, per=(h$counts/sum(wins))*100)
		ret
	}
	df <- rbind(df1,df3,df2)
	print(ggplot(df, aes(x=ncpg, y=per, fill=sample)) + geom_bar(stat="identity") + dptviz:::ggnice() + labs(title="Percent of Zero and Nonzero Signal Windows at CpG Counts", x="Reference Genome CpG Count", y="Percent of Windows at CpG Count") + facet_grid(sample ~ type))

	# PLOT: CpG to nReads Relationship by Sample and Pooled Overall
	df <- foreach(i=1:nrow(mat), .combine="rbind") %dopar%
	{
		df <- data.frame(sample=rownames(mat)[i], ncpg=cpgctvec[cpgctvec>0], reads=mat.r[i,cpgctvec>0])
		df$logreads <- log10(df$reads)
		df <- df[df$reads>3,]
		df <- df[df$reads<100,]
		df$ncpg.fact <- factor(df$ncpg)
		df
	}
	print(ggplot(df, aes(x=ncpg.fact, y=reads, fill=sample)) + geom_boxplot() + ggnice() + facet_grid(sample ~ .) + labs(title="Read Count Distributions at Increasing CpG Counts", x="CpG Count", y="Read Count"))
	cor <- cor(df$ncpg,df$reads,use="pairwise.complete.obs")
	reg <- lm(df$reads ~ df$ncpg)
	print(smoothScatter(df$ncpg, df$reads,nrpoints=1000,main=paste("nCpG vs Reads, Pooled for All Samples", " (cor=",round(cor,digits=4),")",sep=""),xlab="CpG", ylab="Reads"))
	print(abline(reg))

	# Close Graphics Device
	dev.off()

	NULL
}
# -------------------------------------------------------------------

# -------------------------------------------------------------------
# ' Summary plots of sites detected by iDPT
#'
#' iDPT performs a preprocessing stage to convert a set of alignments in a BAM file to average coverage values in windows of a user-specified size genome-wide. This function saves a PDF file containing plots that summarize the distribution of these average coverage values and their relationship to CpG density on a specified reference genome.
#' @param dptpath Path to a folder containing an iDPT run (the folder that contains the "DPT_ws" folder)
#' @param sitespath Path to the "sites" csv file output from running iDPT (if NULL, will search for a "*.sites.csv" file in dptpath and choose the first one it finds)
#' @param chrs A vector of chromosome names to use (by default, the autosomes and sex chromosomes from hg19)
#' @param file Filename to save the PDF report to (if NULL, will use the name of the folder as a prefix and save "prefix.reportPrepro.pdf" in the current working directory")
#' @param ncore Number of concurrent threads to use
#' @export
reportSites <- function(dptpath, sitespath=NULL, chrs=c(sapply(seq(1,22),function(x) paste("chr",x,sep="")),"chrX","chrY"), file=NULL, ncore=1)
{
	# Set number of cores for %dopar%
	registerDoMC(ncore)

	# Set filename using dptpath if none given as argument
	if(is.null(file))
	{
		prefix <- str_split(dptpath, "/")[[1]]
		prefix <- prefix[length(prefix)]
		file <- paste(prefix, ".reportSites.pdf", sep="")
	}

	

	NULL
}
# -------------------------------------------------------------------

# -------------------------------------------------------------------
# ' Annotation (via the goldmine package) of the genomic contexts of sites detected by iDPT

# -------------------------------------------------------------------

# ###################################################################

# ###################################################################
# Utility Functions

# Nice clean ggplot2 theme
ggnice <- function()
{
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), axis.text = element_text(color="black"))
}

# Loads bcrvg.RData and converts object to a matrix (rows=samples, cols=windows genome-wide in order of chrs)
loadPrepro <- function(dptpath, chrs)
{
	# Load iDPT workspace
	bcvrg.path <- paste(dptpath, "/DPT_ws/Preprocess/bcvrg.RData", sep="")
	load(bcvrg.path)

	# Build matrix of coverage values
	samples <- names(bcvrg$data)

	mat <- foreach(sample=samples, .combine=rbind, .verbose=FALSE) %dopar%
	{
		# get genome-wide vector
		foreach(chr=chrs, .combine=c) %do%	
		{
			print(paste("Doing",sample, "chr", chr, sep=" "))
			vec <- as.vector(bcvrg$data[[sample]][[chr]])
			vec
		}
	}
	rownames(mat) <- samples
	rownames(mat) <- str_replace(rownames(mat),".bam","")
	mat
}

# Returns GenomicRanges object of each bin iDPT used
getBins <- function(dptpath, chrs)
{
	bcvrg.path <- paste(dptpath, "/DPT_ws/Preprocess/bcvrg.RData", sep="")
	load(bcvrg.path)
	bcvrg.gr2 <- GRanges(seqnames=seqnames(bcvrg.rd), ranges=IRanges(start=start(bcvrg.rd), end=end(bcvrg.rd)))
	gbins <- bcvrg.gr2[seqnames(bcvrg.gr2) %in% chrs]
	seqlevels(gbins) <- chrs
	gbins <- gbins[order(seqnames(gbins))]
}

# Returns vector of CpG frequency in each region of a GenomicRanges object
getCpG <- function(gr, bsgenome)
{	
	# Parallel by seqnames (chrs)
	out <- foreach(chr=as.character(unique(seqnames(gr))), .combine="c") %dopar%
	{
		print(paste("Doing",chr,sep=" "))
		seq <- getSeq(bsgenome, names=gr[seqnames(gr)==chr])
		difreq <- dinucleotideFrequency(seq, as.prob=FALSE)
		cpgfreq <- difreq[,colnames(difreq)=="CG"]
		cpgfreq
	}
	out
}

# Read an iDPT sites CSV as a data.frame
readSitesCSV <- function(dptpath)
{
	# Set pattern and contrast as character class
	csv <- read.csv(file=dptpath, header=TRUE, stringsAsFactors=FALSE, nrows=5)
	classes <- sapply(csv, class)
	classes[c(5,6)] <- "character"
	csv <- read.csv(file=dptpath, header=TRUE, colClasses=classes)
	csv
}

# Take a vector of binary indicators, and flip their state (i.e. 0->1 and 1->0 so 101 becomes 010)
invertBinary <- function(bin)
{
	bin <- str_replace_all(bin, "0", "A")
	bin <- str_replace_all(bin, "1", "B")
	bin <- str_replace_all(bin, "A", "1")
	bin <- str_replace_all(bin, "B", "0")
	bin
}

# flip the fold change ratio in cases where the contrast is opposite to the pattern
fixContrasts <- function(sites)
{
	# The all "1" state pattern for this number of samples
	allpatt <- paste(rep(1,str_length(sites[1,]$pattern)),collapse="")

	# Grab all sites where contrast does not match pattern and pattern is not 111
	s.index <- (sites$pattern!=sites$contrast)&(sites$pattern!=allpatt)
	s <- sites[s.index,]

	# Invert the binary of the contrast
	s$contrast <- invertBinary(s$contrast)

	# Flip the sign of the ratio
	s$ratio <- s$ratio*-1

	# Merge back with the original results and return
	sites[s.index,] <- s

	# Take all negative ratio pattern 111 and invert the contrast/flip the sign
	s.index <- ((sites$pattern==allpatt)&(sites$ratio<0))
	s <- sites[s.index,]
	s$contrast <- invertBinary(s$contrast)
	sites[s.index,] <- s

	# Join "111" sites with their contrasts as patterns
	sites[sites$pattern==allpatt,]$pattern <- paste(sites[sites$pattern==allpatt,]$pattern, sites[sites$pattern==allpatt,]$contrast, sep="_")
	sites
}
# ###################################################################