##########################################################################
# Genotyping Uncertainty with Sequencing data - Base package (GUSbase)
# Copyright 2017-2020 Timothy P. Bilton
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#########################################################################
#' Produce a comet plot, rocket plot or RDD plot
#'
#' Diagnostic graphs for assessing the validity of the binomial model for read count data
#' generated using high-throughput sequencing technology
#'
#' @usage
#' \preformatted{
#' ## Method for RA object
#' RAobj$cometPlot(ploid=2, filename=NULL, cex=1, maxdepth=500, maxSNPs=1e5, res=300, ind=FALSE, ncores=1, ...)
#'
#' RAobj$rocketPlot(ploid=2, filename=NULL, cex=1, maxdepth=500, maxSNPs=1e5, res=300, scaled=TRUE, ...)
#'
#' RAobj$RDDPlot(ploid=2, filename=NULL, maxdepth=500, maxSNPs=1e5, ...)
#' }
#'
#' ## Stand-alone functions
#' cometPlot(ref, alt, ploid=2, gfreq=NULL, file=NULL, ind=FALSE, cex=1, maxdepth=500, maxSNPs=1e5, res=300,
#' color=NULL, ncores=1, indID=NULL, ...)
#'
#' rocketPlot(ref, alt, ploid=2, gfreq=NULL, file=NULL, cex=1, maxdepth=500, maxSNPs=1e5, res=300, scaled=TRUE, ...)
#'
#' RDDPlot(ref, alt, ploid=2, gfreq=NULL, file=NULL, maxdepth=500, maxSNPs=1e5, ...)
#'
#' @param ref Integer matrix: Read counts for the reference allele
#' @param alt Integer matrix: Read counts for the alternate allele
#' @param ploid Non-negative integer: The ploidy level of the individuals
#' @param gfreq Numeric matrix: Genotype probabilities for each SNP and individual
#' @param filename Character: The name of the file to save the plot to
#' @param cex Numeric value: A magnification value for the size of the text and labels on the axes
#' @param maxdepth Postive integer: The maximum depth for the x-axis and y-axis
#' @param maxSNPs Postive integer: The maximum number of SNPs to use in creating the graph
#' @param res Positive numeric value: The resolution of the graph when saving to a file
#' @param scaled Logical: If TRUE, the counts in the rocket plot are scaled
#' @param color Vector: Color palette used for the heatmap.
#' @param ind Logical: If TRUE, cometPlots are produced for each individual separately
#' @param indID Character vector: Sample IDs corresponding to the rows of the reference and alternative allele matrices
#' @param ncores Positive integer: Number of cores to use in the parallelization when
#' constructing comet plots for each individual.
#' @param xleg Positive value: Scaling factor for the horizontal width of the legend.
#' @param yleg Positive value: Scaling factor for the vertical height of the legend.
#'
#' @aliases cometPlot rocketPlot $rocketPlot $cometPlot RDDplot $RDDplot
#' @author Timothy P. Bilton
#' @examples
#'
#' ## Read in simulated data in GUSbase package
#' vcffile <- simDS()$vcf
#' rafile <- VCFtoRA(vcffile)
#' radata <- readRA(rafile)
#'
#' ## Produce a comet plot
#' radata$cometPlot()
#'
#' ## Produce a rocket plot
#' radata$rocketPlot()
#'
#' ## RDD plot
#' radata$RDDPlot()
#'
#' @name DiagPlots
#' @export cometPlot
cometPlot <- function(ref, alt, ploid=2, gfreq=NULL, file=NULL, ind=FALSE, cex=1, maxdepth=500, maxSNPs=1e5, res=300,
color=NULL, ncores=1, indID=NULL, yleg=5, xleg=2, ...){
## Do some checks
if(!is.matrix(ref) || GUSbase::checkVector(as.vector(ref)))
stop("Matrix of reference allele counts is invalid")
if(!is.matrix(alt) || GUSbase::checkVector(as.vector(alt)))
stop("Matrix of alternate allele counts is invalid")
if(GUSbase::checkVector(ploid, minv=2))
stop("Argument `ploid` is invalid")
if (!is.null(gfreq)) {
if (!is.matrix(gfreq) || !is.numeric(gfreq) || any(is.na(gfreq)) ||
any(gfreq < 0) || any(gfreq > 1) || nrow(gfreq) !=
(ploid + 1) || ncol(gfreq) != ncol(ref) ||
any(sapply(colSums(gfreq), function(x) isTRUE(!all.equal(x, 1, tolerance = 1e-04)))))
stop("Matrix of genotype frequencies is invalid")
}
if(GUSbase::checkVector(maxdepth, minv = 2))
stop("Argument `maxdepth` is invalid")
if(GUSbase::checkVector(maxSNPs, minv = 2))
stop("Argument `maxSNPs` is invalid")
if(GUSbase::checkVector(res, type="pos_numeric", minv = 2))
stop("Argument `res` is invalid")
if(GUSbase::checkVector(cex, type="pos_numeric"))
stop("Argument `cex` is invalid")
if(!is.null(color) & (!is.vector(color) | !all(network::is.color(color))))
stop("Argument `color` is invalid")
if(!is.logical(ind) || length(ind) != 1 || is.na(ind))
stop("Argument `ind` is invalid")
if(GUSbase::checkVector(ncores, type="pos_integer",minv=1) || length(ncores) != 1)
stop("Argument `ncores` is invalid")
if(GUSbase::checkVector(yleg, type="pos_numeric",minv=0, equal=F) || length(yleg) != 1)
stop("Argument `yleg` is invalid")
if(GUSbase::checkVector(xleg, type="pos_numeric",minv=0, equal=F) || length(xleg) != 1)
stop("Argument `xleg` is invalid")
if(!ind){
### check for filename
if(!is.null(file)){
if(!is.vector(file) || !is.character(file) || length(file) != 1)
stop("Filename input is invalid")
filename <- paste0(file,"_comet.png")
if(!file.create(filename,showWarnings = F))
stop("Unable to create output file.")
}
### check if there are too many SNPs
if(ncol(ref) > maxSNPs){
SNPsam <- sample(1:ncol(ref), size=maxSNPs)
ref <- ref[,SNPsam]
alt <- alt[,SNPsam]
}
## Remove SNPs with really large read depths
depth <- ref+alt
toRemove = which(depth>2*maxdepth)
ref[toRemove] <- 0
alt[toRemove] <- 0
depth[toRemove] <- 0
## compute the observed depths
heteCall <- which(ref > 0 | alt > 0)
maxCount <- 2*min(max(ref,alt,100),maxdepth)
ref_sub <- factor(ref[heteCall],levels=0:maxCount,labels=0:maxCount)
alt_sub <- factor(alt[heteCall],levels=0:maxCount,labels=0:maxCount)
obsTab <- table(ref_sub,alt_sub)
## table for cometplot
obsTab[upper.tri(obsTab)] <- obsTab[upper.tri(obsTab)] + t(obsTab)[upper.tri(t(obsTab))]
## set up the count matrix
countMat <- matrix(nrow=maxCount+2,ncol=maxCount+2)
countMat[upper.tri(countMat)] <- log(obsTab[-which(lower.tri(obsTab))]+1)
rm(list=c("ref_sub","alt_sub"))
gc()
## Compute the expected counts
if(is.null(gfreq)){ # if genotype frequencies are not specified
p <- matrix(colMeans(ref/(ref+alt),na.rm=T), nrow=nrow(ref), ncol=ncol(ref), byrow=T)
uni_depth <- 2:maxCount
expCounts <- sapply(uni_depth, function(x){
indx <- which(depth == x)
if(length(indx) == 0) return(numeric(floor(x/2)+1))
else{
pvec <- sort(unique(round(p[indx],2)))
countp <- tabulate(round(p[indx]*100))
if(any(pvec == 0)) countp <- c(sum(round(p[indx]*100) == 0), countp)
countp <- countp[which(countp > 0)]
out <- as.matrix(sapply(1:length(pvec), function(z){
temp <- countp[z]*sapply(0:x, function(y) sum(dbinom(y, x, prob=0:ploid/ploid)*dbinom(0:ploid, size=ploid, prob=(1-pvec[z]))))
if((x %% 2) == 0)
return(temp[0:(x/2)+1] + c(temp[x:(x/2+1)+1],0))
else
return(temp[0:floor(x/2)+1] + temp[x:ceiling(x/2)+1])
}))
return(rowSums(out))
}
})
} else{
pdist <- unique(gfreq, MARGIN=2)
indx_p <- sapply(1:ncol(pdist), function(x) which(apply(gfreq,2, function(y) identical(y,pdist[,x]))), simplify = F)
pvec <- numeric(ncol(ref))
for(i in 1:length(indx_p))
pvec[indx_p[[i]]] <- i
expCounts <- sapply(2:maxCount, function(x){
indx <- which(depth == x, arr.ind = T)[,2]
if(length(indx) == 0) return(numeric(x+1))
else{
countp <- tabulate(pvec[indx])
nzp <- which(countp > 0)
countp <- countp[nzp]
out <- as.matrix(sapply(1:length(countp), function(z){
temp <- countp[z]*sapply(0:x, function(y) sum(dbinom(y, x, prob=0:ploid/ploid)*pdist[,z]))
if((x %% 2) == 0)
return(temp[0:(x/2)+1] + c(temp[x:(x/2+1)+1],0))
else
return(temp[0:floor(x/2)+1] + temp[x:ceiling(x/2)+1])
}))
return(rowSums(out))
}
})
}
expCounts <- c(0,obsTab[1,2],expCounts)
maxCol = length(tail(expCounts,1)[[1]])
expCounts_order <- c(list(cbind(unlist(lapply(expCounts, function(x) x[1])), 0:(maxCount)+1, 1)),
sapply(2:maxCol, function(x)
cbind(unlist(lapply(expCounts[(x*2-1):(maxCount+1)], function(y) y[x])), (x-1):(maxCount+1-x)+1,x)))
temp <- do.call("rbind",expCounts_order)
indx_entry <- temp[,c(2:3)]
indx_entry[,1] <- indx_entry[,1]+1
countMat[indx_entry] <- log(temp[,1]+1)
countMat[which(countMat < 0)] <- 0
countMat[1,2] <- countMat[2,1] <- NA
### Produce the plot
maxplot <- min(max(ref,alt,100),maxdepth)
if(!is.null(file))
png(filename, width=max(maxplot*3,640)*4+sqrt(cex)*maxplot,height=max(maxplot*3,640)*4+sqrt(cex)*maxplot, res=res)
par(mar = c(5.1,5.1,5.1,5.1)*sqrt(cex), ...)
#newCol <- colorRampPalette(c("white","red","orange","yellow","green","cyan","blue"))(200)
if(is.null(color)) newCol <- viridis::viridis(200, direction = -1, option = "A")
else newCol <- color
grid_add <- function(){
xaxp <- par("xaxp")
ticks <- seq(par()$xaxp[1],par()$xaxp[2],par()$xaxp[2]/par()$xaxp[3])
for(i in ticks){
lines(x=rep(i-0.5,2),y=c(-2,i-0.5), lty=3, col="grey", cex=cex)
lines(x=c(i-1.5,maxplot),y=rep(i,2)-1.5, lty=3, col="grey", cex=cex)
lines(y=rep(i-0.5,2),x=c(-2,i-0.5), lty=3, col="grey", cex=cex)
lines(y=c(i-1.5,maxplot),x=rep(i,2)-1.5, lty=3, col="grey", cex=cex)
}
return(invisible(NULL))
}
image(0:nrow(countMat)-2,0:ncol(countMat)-2,countMat, xlab="Major Read Depth (Expected)",ylab="Major Read Depth (Observed)",col=newCol,zlim=c(0,max(countMat,na.rm=T)),
cex.lab=cex,cex.axis=cex, xlim=c(-2,maxplot), ylim=c(-2,maxplot), mgp=c(3*sqrt(cex),sqrt(cex),0),
xaxt = "n", yaxt="n")
box()
grid_add()
ticks <- seq(par()$xaxp[1],par()$xaxp[2],par()$xaxp[2]/par()$xaxp[3])
axis(side = 1, at = ticks-0.5,labels = ticks, cex.axis=cex)
axis(side = 2, at = ticks-0.5,labels = ticks, cex.axis=cex)
axis(side = 3, at = ticks-1.5,labels = ticks, cex.axis=cex)
axis(side = 4, at = ticks-1.5,labels = ticks, cex.axis=cex)
abline(0,1, cex=cex)
if(ploid > 2){
rat1 = 1:(ceiling(ploid/2)-1)
rat2 = (ploid-1):(floor(ploid/2)+1)
junk <- sapply(1:length(rat1), function(x) abline(-1.5,rat1[x]/rat2[x], lty=3, cex=cex))
junk <- sapply(1:length(rat1), function(x) abline(rat2[x]/rat1[x]*1.5,rat2[x]/rat1[x], lty=3, cex=cex))
}
mtext("Minor Read Depth (Observed)",side=3,cex=cex, font=1, line=2.5*sqrt(cex))
mtext("Minor Read Depth (Expected)", side=4,cex=cex, font=1,line=3.5*sqrt(cex))
legend_image <- as.raster(matrix(rev(newCol), ncol = 1))
adj = min(max(ref,alt),maxplot)*0.05
rasterImage(legend_image,xleft = min(max(ref,alt),maxplot)-adj*xleg, ybottom = adj, ytop = yleg*adj, xright = min(max(ref,alt),maxplot)-adj)
lines(x = c(rep(min(max(ref,alt),maxplot)-adj*xleg,2),rep(min(max(ref,alt),maxplot)-adj,2),min(max(ref,alt),maxplot)-adj*xleg), y=c(adj,rep(yleg*adj,2),adj,adj),cex=cex)
text(x = min(max(ref,alt),maxplot)-adj*xleg-1, y = adj + seq(0,(yleg-1)*adj,length.out=6), labels = format(exp(seq(0,max(countMat,na.rm=T),length.out=6))-1,digits=1, scientific=F),cex=cex, pos=2)
junk <- sapply(adj + seq(0,(yleg-1)*adj,length.out=6), function(y) lines(min(max(ref,alt),maxplot) - adj*xleg + c(0,-1), y=rep(y,2), cex=cex))
if(!is.null(file))
dev.off()
return(invisible())
} else{ ## plots for each individual
## check the sample ID vector
if(is.null(indID))
indID = 1:nrow(ref)
else if(!is.vector(indID) || !is.character(indID) || length(indID) != nrow(ref) || any(is.na(indID)))
stop("Argument `indID` is invalid")
## Check the filename
if(is.null(file))
stop("A filename must be specified to produce plots for each individual")
else {
if(!is.vector(file) || !is.character(file) || length(file) != 1)
stop("Filename is invalid")
filename <- paste0(file,"_test.png")
if(!file.create(filename,showWarnings = F))
stop("Unable to create output file.")
else {
file.remove(filename)
filename <- paste0(file,"_comet")
}
}
## If genotype frequencies not given, estimate allele frequecies
if(is.null(gfreq))
p <- colMeans(ref/(ref+alt),na.rm=T)
doParallel::registerDoParallel(cores=ncores)
junk = foreach::foreach(ind = 1:nrow(ref)) %dopar% {
ref_ind = ref[ind,]
alt_ind = alt[ind,]
### check if there are too many SNPs
#if(length(ref_ind) > maxSNPs){
# SNPsam <- sample(1:length(ref), size=maxSNPs)
# ref_ind <- ref_ind[,SNPsam]
# alt_ind <- alt_ind[,SNPsam]
#}
## Remove entries with really large read depths and missing values
depth <- ref_ind+alt_ind
tokeep = which(!(depth>2*maxdepth | depth == 0))
# Check to see if there are any reads left
if(length(tokeep) == 0){
warnings(paste0("Individual ",ind,": No non-missing enteries with read depths below ",maxdepth))
return(invisible(NULL))
}
ref_ind = ref_ind[tokeep]
alt_ind = alt_ind[tokeep]
depth = depth[tokeep]
if(is.null(gfreq))
p_ind = p[tokeep]
else
gfreq_ind = gfreq[,tokeep]
## compute the observed depths
heteCall <- which(ref_ind > 0 | alt_ind > 0)
maxCount <- 2*min(max(ref,alt,100),maxdepth)
ref_sub <- factor(ref_ind[heteCall],levels=0:maxCount,labels=0:maxCount)
alt_sub <- factor(alt_ind[heteCall],levels=0:maxCount,labels=0:maxCount)
obsTab <- table(ref_sub,alt_sub)
## table for cometplot
obsTab[upper.tri(obsTab)] <- obsTab[upper.tri(obsTab)] + t(obsTab)[upper.tri(t(obsTab))]
## set up the count matrix
countMat <- matrix(nrow=maxCount+2,ncol=maxCount+2)
countMat[upper.tri(countMat)] <- log(obsTab[-which(lower.tri(obsTab))]+1)
rm(list=c("ref_sub","alt_sub"))
gc()
## Compute the expected counts
if(is.null(gfreq)){ # if genotype frequencies are not specified
expCounts <- sapply(2:maxCount, function(x){
indx <- which(depth == x)
if(length(indx) == 0) return(numeric(floor(x/2)+1))
else{
pvec <- sort(unique(round(p_ind[indx],2)))
countp <- tabulate(round(p_ind[indx]*100))
if(any(pvec == 0)) countp <- c(sum(round(p_ind[indx]*100) == 0), countp)
countp <- countp[which(countp > 0)]
out <- as.matrix(sapply(1:length(pvec), function(z){
temp <- countp[z]*sapply(0:x, function(y) sum(dbinom(y, x, prob=0:ploid/ploid)*dbinom(0:ploid, size=ploid, prob=(1-pvec[z]))))
if((x %% 2) == 0)
return(temp[0:(x/2)+1] + c(temp[x:(x/2+1)+1],0))
else
return(temp[0:floor(x/2)+1] + temp[x:ceiling(x/2)+1])
}))
return(rowSums(out))
}
})
} else{
pdist <- unique(gfreq_ind, MARGIN=2)
indx_p <- sapply(1:ncol(pdist), function(x) which(apply(gfreq_ind,2, function(y) identical(y,pdist[,x]))), simplify = F)
pvec <- numeric(length(ref_ind))
for(i in 1:length(indx_p))
pvec[indx_p[[i]]] <- i
expCounts <- sapply(2:maxCount, function(x){
indx <- which(depth == x)
if(length(indx) == 0) return(numeric(x+1))
else{
countp <- tabulate(pvec[indx])
nzp <- which(countp > 0)
countp <- countp[nzp]
out <- as.matrix(sapply(1:length(countp), function(z){
temp <- countp[z]*sapply(0:x, function(y) sum(dbinom(y, x, prob=0:ploid/ploid)*pdist[,z]))
if((x %% 2) == 0)
return(temp[0:(x/2)+1] + c(temp[x:(x/2+1)+1],0))
else
return(temp[0:floor(x/2)+1] + temp[x:ceiling(x/2)+1])
}))
return(rowSums(out))
}
})
}
expCounts <- c(0,obsTab[1,2],expCounts)
maxCol = length(tail(expCounts,1)[[1]])
expCounts_order <- c(list(cbind(unlist(lapply(expCounts, function(x) x[1])), 0:maxCount+1, 1)),
sapply(2:maxCol, function(x)
cbind(unlist(lapply(expCounts[(x*2-1):(maxCount+1)], function(y) y[x])), (x-1):(maxCount+1-x)+1,x)))
temp <- do.call("rbind",expCounts_order)
indx_entry <- temp[,c(2:3)]
indx_entry[,1] <- indx_entry[,1]+1
countMat[indx_entry] <- log(temp[,1]+1)
countMat[which(countMat < 0)] <- 0
countMat[1,2] <- countMat[2,1] <- NA
### Produce the plot
maxplot <- min(max(ref,alt,100),maxdepth)
tempfilename = paste0(filename,"_ind-",indID[ind],".png")
png(tempfilename, width=max(maxplot*3,640)*4+sqrt(cex)*maxplot,height=max(maxplot*3,640)*4+sqrt(cex)*maxplot, res=res)
par(mar = c(5.1,5.1,5.1,5.1)*sqrt(cex), ...)
#newCol <- colorRampPalette(c("white","red","orange","yellow","green","cyan","blue"))(200)
if(is.null(color)) newCol <- viridis::viridis(200, direction = -1, option = "A")
else newCol <- color
grid_add <- function(){
xaxp <- par("xaxp")
ticks <- seq(par()$xaxp[1],par()$xaxp[2],par()$xaxp[2]/par()$xaxp[3])
for(i in ticks){
lines(x=rep(i-0.5,2),y=c(-2,i-0.5), lty=3, col="grey", cex=cex)
lines(x=c(i-1.5,maxplot),y=rep(i,2)-1.5, lty=3, col="grey", cex=cex)
lines(y=rep(i-0.5,2),x=c(-2,i-0.5), lty=3, col="grey", cex=cex)
lines(y=c(i-1.5,maxplot),x=rep(i,2)-1.5, lty=3, col="grey", cex=cex)
}
return(invisible(NULL))
}
image(0:nrow(countMat)-2,0:ncol(countMat)-2,countMat, xlab="Major Read Depth",ylab="Major Read Depth",col=newCol,zlim=c(0,max(countMat,na.rm=T)),
cex.lab=cex,cex.axis=cex, xlim=c(-2,maxplot), ylim=c(-2,maxplot), mgp=c(3*sqrt(cex),sqrt(cex),0),
xaxt = "n", yaxt="n")
box()
grid_add()
ticks <- seq(par()$xaxp[1],par()$xaxp[2],par()$xaxp[2]/par()$xaxp[3])
axis(side = 1, at = ticks-0.5,labels = ticks, cex.axis=cex)
axis(side = 2, at = ticks-0.5,labels = ticks, cex.axis=cex)
axis(side = 3, at = ticks-1.5,labels = ticks, cex.axis=cex)
axis(side = 4, at = ticks-1.5,labels = ticks, cex.axis=cex)
abline(0,1, cex=cex)
if(ploid > 2){
rat1 = 1:(ceiling(ploid/2)-1)
rat2 = (ploid-1):(floor(ploid/2)+1)
junk <- sapply(1:length(rat1), function(x) abline(-1.5,rat1[x]/rat2[x], lty=3, cex=cex))
junk <- sapply(1:length(rat1), function(x) abline(rat2[x]/rat1[x]*1.5,rat2[x]/rat1[x], lty=3, cex=cex))
}
mtext("(Observed)\nMinor Read Depth",side=3,cex=cex, font=1, line=2.5*sqrt(cex))
mtext("Minor Read Depth\n(Expected)", side=4,cex=cex, font=1,line=3.5*sqrt(cex))
legend_image <- as.raster(matrix(rev(newCol), ncol = 1))
adj = min(max(ref,alt),maxplot)*0.05
rasterImage(legend_image,xleft = min(max(ref,alt),maxplot)-adj*2, ybottom = adj, ytop = 5*adj, xright = min(max(ref,alt),maxplot)-adj)
lines(x = c(rep(min(max(ref,alt),maxplot)-adj*2,2),rep(min(max(ref,alt),maxplot)-adj,2),min(max(ref,alt),maxplot)-adj*2), y=c(adj,rep(5*adj,2),adj,adj),cex=cex)
text(x = min(max(ref,alt),maxplot)-adj*2-1, y = adj + seq(0,4*adj,length.out=6), labels = format(exp(seq(0,max(countMat,na.rm=T),length.out=6))-1,digits=1, scientific=F),cex=cex, pos=2)
junk <- sapply(adj + seq(0,4*adj,length.out=6), function(y) lines(min(max(ref,alt),maxplot) - adj*2 + c(0,-1), y=rep(y,2), cex=cex))
dev.off()
return(invisible(NULL))
}
}
return(invisible(NULL))
}
#' @export rocketPlot
rocketPlot <- function(ref, alt, ploid=2, gfreq=NULL, file=NULL, cex=1, maxdepth=500, maxSNPs=1e5, res=300,
scaled=TRUE, ...){
## Do some checks
if(!is.matrix(ref) || GUSbase::checkVector(as.vector(ref)))
stop("Matrix of reference allele counts is invalid")
if(!is.matrix(alt) || GUSbase::checkVector(as.vector(alt)))
stop("Matrix of alternate allele counts is invalid")
if(GUSbase::checkVector(ploid, minv=2))
stop("Matrix of alternate allele counts is invalid")
if (!is.null(gfreq)) {
if (!is.matrix(gfreq) || !is.numeric(gfreq) || any(is.na(gfreq)) ||
any(gfreq < 0) || any(gfreq > 1) || nrow(gfreq) !=
(ploid + 1) || ncol(gfreq) != ncol(gfreq) ||
any(sapply(colSums(gfreq), function(x) isTRUE(!all.equal(x, 1, tolerance = 1e-04)))))
stop("Matrix of genotype frequencies is invalid")
}
if(!is.null(file)){
if(!is.vector(file) || !is.character(file) || length(file) != 1)
stop("Filename input is invalid")
filename <- paste0(file,"_rocket.png")
if(!file.create(filename,showWarnings = F))
stop("Unable to create output file.")
}
if(GUSbase::checkVector(maxdepth, minv = 2))
stop("Argument `maxdepth` is invalid")
if(GUSbase::checkVector(maxSNPs, minv = 2))
stop("Argument `maxSNPs` is invalid")
if(GUSbase::checkVector(res, type="pos_numeric", minv = 2))
stop("Argument `res` is invalid")
if(GUSbase::checkVector(cex, type="pos_numeric"))
stop("Argument `cex` is invalid")
if(!is.vector(scaled) || !is.logical(scaled) || length(scaled) != 1)
stop("Argument `scaled` is invalid")
depth <- ref+alt
toRemove = which(depth>2*maxdepth)
ref[toRemove] <- 0
alt[toRemove] <- 0
depth[toRemove] <- 0
## compute the observed depths
heteCall <- which(ref > 0 | alt > 0)
maxCount <- max(100,2*maxdepth)
ref_sub <- factor(ref[heteCall],levels=0:maxCount,labels=0:maxCount)
alt_sub <- factor(alt[heteCall],levels=0:maxCount,labels=0:maxCount)
obsTab <- table(ref_sub,alt_sub)
rm(list=c("ref_sub","alt_sub"))
gc()
## Compute the expected counts
if(is.null(gfreq)){ # if genotype frequencies are not specified
p <- matrix(colMeans(ref/(ref+alt),na.rm=T), nrow=nrow(ref), ncol=ncol(ref), byrow=T)
## data for producing density plots
expCounts <- sapply(1:maxCount, function(x){
indx <- which(depth == x)
if(length(indx) == 0) return(numeric(x+1))
else{
pvec <- sort(unique(round(p[indx],2)))
countp <- tabulate(round(p[indx]*100))
if(any(pvec == 0)) countp <- c(sum(round(p[indx]*100) == 0), countp)
countp <- countp[which(countp > 0)]
out <- sapply(1:length(pvec), function(z){
return(countp[z]*sapply(0:x, function(y) sum(dbinom(y, x, prob=0:ploid/ploid)*dbinom(0:ploid, size=ploid, prob=pvec[z]))))
})
return(rev(rowSums(out)))
}
})
} else{
pdist <- unique(gfreq, MARGIN=2)
indx_p <- sapply(1:ncol(pdist), function(x) which(apply(gfreq,2, function(y) identical(y,pdist[,x]))), simplify = F)
pvec <- numeric(ncol(ref))
for(i in 1:length(indx_p))
pvec[indx_p[[i]]] <- i
expCounts <- sapply(1:maxCount, function(x){
indx <- which(depth == x, arr.ind = T)[,2]
if(length(indx) == 0) return(numeric(x+1))
else{
countp <- tabulate(pvec[indx])
nzp <- which(countp > 0)
countp <- countp[nzp]
out <- sapply(nzp, function(z){
return(countp[z]*sapply(0:x, function(y) sum(dbinom(y, x, prob=0:ploid/ploid)*pdist[,z])))
})
return(rev(rowSums(out)))
}
})
}
expCounts <- c(0,expCounts)
expMat <- matrix(nrow=maxCount+1, ncol=maxCount+1)
expCounts_order <- sapply(1:(maxCount+1), function(x)
cbind(unlist(lapply(expCounts[x:(maxCount+1)], function(y) y[x])), 0:(maxCount+1-x)+1,x))
temp <- do.call("rbind",expCounts_order)
indx_entry <- temp[,c(2:3)]
expMat[indx_entry] <- temp[,1]
## Produce the plot
if(scaled == TRUE){
newCol <- colorRampPalette(c("red","orange","yellow","white","cyan","#0080FF","blue"))(500)
maxdepth <- min(max(ref,alt),maxdepth)
newMat <- (obsTab-expMat)
newMat2 <- newMat[0:maxdepth+1,0:maxdepth+1]
newMat2 <- newMat2/sqrt(expMat[0:maxdepth+1,0:maxdepth+1])
newMat2[which(newMat2< -20)] <- -20
newMat2[which(newMat2> 20)] <- 20
ran <- max(abs(newMat2),na.rm=T)
if(!is.null(file))
png(filename, width=max(maxdepth*3,640)*4+sqrt(cex)*maxdepth,height=max(maxdepth*3,640)*4+sqrt(cex)*maxdepth, res=res)
par(mar = c(5.1,5.1,1.1,1.1)*sqrt(cex), ...)
image(0:maxdepth,0:maxdepth,t(newMat2), zlim=c(-ran,ran), col=newCol, cex.lab=cex,cex.axis=cex,
xaxt = "n", yaxt="n", ylab="Reference Allele Count", xlab="Alternate Allele Count",mgp=c(3*sqrt(cex),sqrt(cex),0))
ticks <- seq(par()$xaxp[1],par()$xaxp[2],par()$xaxp[2]/par()$xaxp[3])
graphics::axis(side = 1, at = ticks+0,labels = ticks, cex.axis=cex)
graphics::axis(side = 2, at = ticks+0,labels = ticks, cex.axis=cex)
graphics::grid()
graphics::box()
if(ploid > 2){
rat1 = 1:(ploid-1)
rat2 = rev(rat1)
junk <- base::sapply(1:length(rat1), function(x) graphics::abline(0,rat1[x]/rat2[x], lty=3, cex=cex))
} else if(ploid == 2) graphics::abline(0,1, lty=3, cex=cex)
legend_image <- as.raster(matrix(rev(newCol), ncol = 1))
adj = min(max(ref,alt),maxdepth)*0.05
rasterImage(legend_image,xleft = min(max(ref,alt),maxdepth)-adj*2, ybottom = adj, ytop = 6*adj, xright = min(max(ref,alt),maxdepth)-adj )
lines(x = c(rep(min(max(ref,alt),maxdepth)-adj*2,2),rep(min(max(ref,alt),maxdepth)-adj,2),min(max(ref,alt),maxdepth)-adj*2), y=c(adj,rep(6*adj,2),adj,adj),cex=cex)
num <- unique(c(-seq(20,0, length.out=4),seq(0,20,length.out=4)))
text(x = min(max(ref,alt),maxdepth)-adj*2-1, y = adj + seq(0,5*adj,length.out=7), labels = format(num,digits=2, scientific=F),cex=cex, pos=2)
junk <- sapply(adj + seq(0,5*adj,length.out=7), function(y) lines(min(max(ref,alt),maxdepth) - adj*2 + c(0,-1), y=rep(y,2), cex=cex))
if(!is.null(file))
dev.off()
} else{
newMat <- (obsTab-expMat)
maxDiff <- max(abs(newMat), na.rm=T)
indx <- which(newMat > 0)
newMat[indx] <- log(newMat[indx]+1)
indx <- which(newMat < 0)
newMat[indx] <- -log(abs(newMat[indx])+1)
newCol <- colorRampPalette(c("red","orange","yellow","white","cyan","#0080FF","blue"))(200)
maxdepth <- min(max(ref,alt),maxdepth)
ran <- max(abs(newMat),na.rm=T)
if(!is.null(file))
png(filename, width=max(maxdepth*3,640)*4+sqrt(cex)*maxdepth,height=max(maxdepth*3,640)*4+sqrt(cex)*maxdepth, res=res)
graphics::par(mar = c(5.1,5.1,1.1,1.1)*sqrt(cex), ...)
image(0:maxdepth,0:maxdepth,t(newMat[0:maxdepth+1,0:maxdepth+1]), zlim=c(-ran,ran), col=newCol, cex.lab=cex,cex.axis=cex,
xaxt = "n", yaxt="n", ylab="Reference Allele Count", xlab="Alternate Allele Count",mgp=c(3*sqrt(cex),sqrt(cex),0))
ticks <- base::seq(par()$xaxp[1],graphics::par()$xaxp[2],graphics::par()$xaxp[2]/graphics::par()$xaxp[3])
graphics::axis(side = 1, at = ticks+0,labels = ticks, cex.axis=cex)
graphics::axis(side = 2, at = ticks+0,labels = ticks, cex.axis=cex)
graphics::grid()
graphics::box()
if(ploid > 2){
rat1 = 1:(ploid-1)
rat2 = rev(rat1)
junk <- base::sapply(1:length(rat1), function(x) graphics::abline(0,rat1[x]/rat2[x], lty=3, cex=cex))
} else if(ploid == 2) graphics::abline(0,1, lty=3, cex=cex)
legend_image <- as.raster(matrix(rev(newCol), ncol = 1))
adj = min(max(ref,alt),maxdepth)*0.05
rasterImage(legend_image,xleft = min(max(ref,alt),maxdepth)-adj*2, ybottom = adj, ytop = 6*adj, xright = min(max(ref,alt),maxdepth)-adj )
lines(x = c(rep(min(max(ref,alt),maxdepth)-adj*2,2),rep(min(max(ref,alt),maxdepth)-adj,2),min(max(ref,alt),maxdepth)-adj*2), y=c(adj,rep(6*adj,2),adj,adj),cex=cex)
num <- unique(c(-(exp(seq(max(abs(newMat),na.rm=T),0, length.out=4))-1),exp(seq(0,max(abs(newMat), na.rm=T),length.out=4))-1))
text(x = min(max(ref,alt),maxdepth)-adj*2-1, y = adj + seq(0,5*adj,length.out=7), labels = format(num,digits=1, scientific=F),cex=cex, pos=2)
junk <- sapply(adj + seq(0,5*adj,length.out=7), function(y) lines(min(max(ref,alt),maxdepth) - adj*2 + c(0,-1), y=rep(y,2), cex=cex))
if(!is.null(file))
dev.off()
}
return(invisible())
}
#' @export RDDPlot
RDDPlot <- function(ref, alt, ploid=2, gfreq=NULL, file=NULL, maxdepth=500, maxSNPs=1e5, ...){
## Do some checks
if(!is.matrix(ref) || GUSbase::checkVector(as.vector(ref)))
stop("Matrix of reference allele counts is invalid")
if(!is.matrix(alt) || GUSbase::checkVector(as.vector(alt)))
stop("Matrix of alternate allele counts is invalid")
if(GUSbase::checkVector(ploid, minv=2))
stop("Matrix of alternate allele counts is invalid")
if (!is.null(gfreq)) {
if (!is.matrix(gfreq) || !is.numeric(gfreq) || any(is.na(gfreq)) ||
any(gfreq < 0) || any(gfreq > 1) || nrow(gfreq) !=
(ploid + 1) || ncol(gfreq) != ncol(gfreq) ||
any(sapply(colSums(gfreq), function(x) isTRUE(!all.equal(x, 1, tolerance = 1e-04)))))
stop("Matrix of genotype frequencies is invalid")
}
if(!is.null(file)){
if(!is.vector(file) || !is.character(file) || length(file) != 1)
stop("Filename input is invalid")
filename <- paste0(file,"_RDD.png")
if(!file.create(filename,showWarnings = F))
stop("Unable to create output file.")
}
if(GUSbase::checkVector(maxdepth, minv = 2))
stop("Argument `maxdepth` is invalid")
if(GUSbase::checkVector(maxSNPs, minv = 2))
stop("Argument `maxSNPs` is invalid")
### check if there are too many SNPs
if(ncol(ref) > maxSNPs){
SNPsam <- sample(1:ncol(ref), size=maxSNPs)
ref <- ref[,SNPsam]
alt <- alt[,SNPsam]
}
## Remove SNPs with really large read depths
depth <- ref+alt
toRemove = which(depth>maxdepth)
ref[toRemove] <- 0
alt[toRemove] <- 0
depth[toRemove] <- 0
## compute the observed depths
heteCall <- which(ref > 0 | alt > 0)
maxCount <- max(50,max(depth))
ref_sub <- factor(ref[heteCall],levels=0:maxCount,labels=0:maxCount)
alt_sub <- factor(alt[heteCall],levels=0:maxCount,labels=0:maxCount)
obsTab <- table(ref_sub,alt_sub)
## varibles for histrogram
counts_obs <- as.vector(obsTab)[-1]
counts_obs <- counts_obs/sum(counts_obs)
temp <- seq(maxCount+1)-1
tempref <- matrix(temp, nrow=maxCount+1,ncol=maxCount+1, byrow=F)
tempd <- matrix(temp, nrow=maxCount+1,ncol=maxCount+1, byrow=T) + tempref
ratio_obs <- tempref/tempd
ratio_obs <- as.vector(ratio_obs)[-1]
rm(list=c("ref_sub","alt_sub"))
gc()
## Compute the expected counts
if(is.null(gfreq)){ # if genotype frequencies are not specified
p <- matrix(colMeans(ref/(ref+alt),na.rm=T), nrow=nrow(ref), ncol=ncol(ref), byrow=T)
## data for producing density plots
expCounts <- sapply(1:maxCount, function(x){
indx <- which(depth == x)
if(length(indx) == 0) return(numeric(x+1))
else{
pvec <- sort(unique(round(p[indx],2)))
countp <- tabulate(round(p[indx]*100))
countp <- countp[which(countp > 0)]
out <- sapply(1:length(pvec), function(z){
return(countp[z]*sapply(0:x, function(y) sum(dbinom(y, x, prob=0:ploid/ploid)*dbinom(0:ploid, size=ploid, prob=pvec[z]))))
})
return(rev(rowSums(out)))
}
})
} else{
pdist <- unique(gfreq, MARGIN=2)
indx_p <- sapply(1:ncol(pdist), function(x) which(apply(gfreq,2, function(y) identical(y,pdist[,x]))), simplify = F)
pvec <- numeric(ncol(ref))
for(i in 1:length(indx_p))
pvec[indx_p[[i]]] <- i
expCounts <- sapply(1:maxCount, function(x){
indx <- which(depth == x, arr.ind = T)[,2]
if(length(indx) == 0) return(numeric(x+1))
else{
countp <- tabulate(pvec[indx])
nzp <- which(countp > 0)
countp <- countp[nzp]
out <- sapply(nzp, function(z){
return(countp[z]*sapply(0:x, function(y) sum(dbinom(y, x, prob=0:ploid/ploid)*pdist[,z])))
})
return(rev(rowSums(out)))
}
})
}
expCounts <- c(0,expCounts)
tempMat <- matrix(nrow=maxCount+1, ncol=maxCount+1)
expCounts_order <- sapply(1:(maxCount+1), function(x)
cbind(unlist(lapply(expCounts[x:(maxCount+1)], function(y) y[x])), 0:(maxCount+1-x)+1,x))
temp <- do.call("rbind",expCounts_order)
indx_entry <- temp[,c(2:3)]
tempMat[indx_entry] <- temp[,1]
counts_exp <- as.vector(tempMat)[-1]
counts_exp[which(is.na(counts_exp))] <- 0
counts_exp <- counts_exp/sum(counts_exp)
temp_df <- data.frame(counts=c(counts_exp,counts_obs),ratio=rep(ratio_obs,2)*ploid,
method=rep(c("Expected","Observed"),rep(length(ratio_obs),2)))
## Produce RRD plot
pp <- ggplot2::ggplot(temp_df, ggplot2::aes(x=ratio, fill=method)) +
ggplot2::geom_density(ggplot2::aes(weight=counts), alpha=0.2) + ggplot2::theme_bw() +
ggplot2::xlab("Dosage of reference allele") + ggplot2::theme(legend.title = ggplot2::element_blank(), legend.position="top")
if(is.null(file)) print(pp)
else ggplot2::ggsave(filename=filename, plot=pp,...)
return(invisible(NULL))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.