Nothing
## ggplotting-methods.R -- new plotting functions that will coexist
## with existing qrqc plotting functions (however these others will
## warn that they are deprecated).
setMethod("getQual", signature(x="FASTQSummary"),
# Given a FASTQSummary object, extract a dataframe of quality
# distribution statistics using binned2boxplot.
function(x) {
tmp <- apply(x@qual.freqs[, -1], 1, binned2boxplot)
tmp <- t(tmp)
tmp <- cbind(1:nrow(tmp), tmp)
vals <- as.numeric(names(x@qual.freqs[, -1]))
tmp <- cbind(tmp, apply(x@qual.freqs[, -1], 1, function(x) weighted.mean(vals, x)))
tmp <- as.data.frame(tmp)
colnames(tmp) <- c('position', 'ymin', 'alt.lower', 'lower',
'middle', 'upper', 'alt.upper', 'ymax', 'mean')
return(tmp)
})
setMethod("getKmer", signature(x="SequenceSummary"),
# Given a SequenceSummary object, extract the k-mer dataframe
function(x) {
x@kmer
})
setMethod("getGC", signature(x="SequenceSummary"),
# Given a SequenceSummary object, extract a dataframe of GC content.
function(x) {
gcd <- local({
base.props <- getBaseProps(x)
d <- subset(base.props, base.props$base %in% c('G', 'C'))
gc <- aggregate(d$proportion, list(d$position), sum)
colnames(gc) <- c('position', 'gc')
return(gc)
})
gcd
})
setMethod("getBase", signature(x="SequenceSummary"),
# Given a SequenceSummary object, extract a dataframe of base content.
function(x, drop=TRUE) {
base.freqs <- local({
tmp <- reshape::melt(x@base.freqs, id='position')
colnames(tmp) <- c('position', 'base', 'frequency')
return(tmp)
})
if (drop) {
# is zero for all bases?
bt <- aggregate(base.freqs$frequency, list(base=base.freqs$base), sum)
base.keep <- bt$base[bt$x > 0]
return(subset(base.freqs, base %in% base.keep))
}
base.freqs
})
setMethod("getBaseProp", signature(x="SequenceSummary"),
# Given an object that inherits from SummarySequence, return the bases
# by proportion.
function(x, drop=TRUE) {
base.props <- local({
tmp <- prop.table(as.matrix(x@base.freqs[, -1]), margin=1)
tmp <- reshape::melt(tmp, id='position')
colnames(tmp) <- c('position', 'base', 'proportion')
return(tmp)
})
if (drop) {
# is zero for all bases?
bt <- aggregate(base.props$proportion, list(base=base.props$base), sum)
base.keep <- bt$base[bt$x > 0]
return(subset(base.props, base %in% base.keep))
}
return(base.props)
})
setMethod("getSeqlen", signature(x="SequenceSummary"),
# Get sequence length dataframe from object that inherits from
# SequenceSummary.
function(x) {
l <- x@seq.lengths
x <- 1:length(l)
data.frame(length=x, count=l)
})
setMethod("getMCQual", signature(x="FASTQSummary"),
# Generate random draws from a FASTQSummary object so that a lowess
# can be fit to quickly view the general trend of a quality summary.
function(x, n=100) {
set.seed(0)
vals <- as.numeric(colnames(x@qual.freqs)[-1])
binsample <- function(x) {
sample(vals, n, prob=x/sum(x), replace=TRUE)
}
# Use binsample() to sample from the binned quality frequency data, then
# do some data munging to get it in a usable format.
s <- apply(x@qual.freqs[, -1], 1, binsample)
tmp <- t(s)
nr <- nrow(tmp)
dim(tmp) <- c(ncol(tmp)*nrow(tmp), 1)
data.frame(position=1:nr, quality=tmp)
})
list2df <-
# Use mapply
function(x, fun) {
if (!length(names(x)))
stop("list 'x' must have named elements.")
d <- mapply(function(x, n) {
d.tmp <- fun(x)
d.tmp$sample <- n
d.tmp
}, x, names(x), SIMPLIFY=FALSE)
as.data.frame(do.call(rbind, d))
}
setMethod("qualPlot", signature(x="FASTQSummary"),
# Plot a single FASTQSummary object.
function(x, smooth=TRUE, extreme.color="grey", quartile.color="orange",
mean.color="blue", median.color=NULL) {
qd <- getQual(x)
p <- ggplot(qd)
p <- p + geom_qlinerange(extreme.color=extreme.color, quartile.color=quartile.color,
mean.color=mean.color, median.color=median.color)
p <- p + scale_y_continuous("quality")
if (smooth) {
mcd <- getMCQual(x)
p <- p + geom_smooth(aes(x=position, y=quality), data=mcd, se=FALSE, color="purple")
}
p
})
setMethod("qualPlot", signature(x="list"),
# Plot a list of FASTQSummary objects as facets.
function(x, smooth=TRUE, extreme.color="grey", quartile.color="orange",
mean.color="blue", median.color=NULL) {
if (!length(names(x)))
stop("A list pased into qualPlot must have named elements.")
if (!all(sapply(x, class) == "FASTQSummary"))
stop("All items in list must have class FASTQSummary.")
qd <- list2df(x, getQual)
p <- ggplot(qd)
p <- p + geom_qlinerange(extreme.color=extreme.color, quartile.color=quartile.color,
mean.color=mean.color, median.color=median.color)
p <- p + scale_y_continuous("quality")
if (smooth) {
mcd <- list2df(x, getMCQual)
p <- p + geom_smooth(aes(x=position, y=quality), data=mcd, se=FALSE)
}
p <- p + facet_wrap( ~ sample)
p
})
setMethod("gcPlot", signature(x="SequenceSummary"),
# Plot GC by read position.
function(x, color="red") {
gcd <- getGC(x)
p <- ggplot(gcd) + geom_line(aes(x=position, y=gc), color=color)
p
})
setMethod("gcPlot", signature(x="list"),
# Plot GC by read position.
function(x, color="red") {
gcd <- list2df(x, getGC)
p <- ggplot(gcd) + geom_line(aes(x=position, y=gc), color=color) + facet_wrap( ~ sample)
p
})
setMethod("basePlot", signature(x="SequenceSummary"),
# Plot Bases by read position.
function(x, geom=c("line", "bar", "dodge"), type=c("frequency", "proportion"),
bases=DNA_BASES_N, colorvalues=getBioColor("DNA_BASES_N")) {
colorvalues <- colorvalues[bases]
# get the type
type <- match.arg(type)
fun <- list(frequency=getBase, proportion=getBaseProp)[[type]]
# get the geom
geom <- match.arg(geom)
geom.list <- list(line=geom_line(aes_string(x="position", y=type, color="base")),
bar=geom_bar(aes_string(x="position", y=type, fill="base"), stat="identity"),
dodge=geom_bar(aes_string(x="position", y=type, fill="base"), stat="identity", position="dodge"))
g <- geom.list[[geom]]
bd <- fun(x, drop=FALSE) # drop is FALSE if a user asks just to see N, and there are none
p <- ggplot(subset(bd, base %in% bases)) + g
p <- p + scale_colour_manual(values=colorvalues)
p <- p + scale_fill_manual(values=colorvalues)
p
})
setMethod("basePlot", signature(x="list"),
# Plot Bases by read position for a list.
function(x, geom=c("line", "bar", "dodge"), type=c("frequency", "proportion"),
bases=DNA_BASES_N, colorvalues=getBioColor("DNA_BASES_N")) {
colorvalues <- colorvalues[bases]
# get the type
type <- match.arg(type)
fun <- list(frequency=getBase, proportion=getBaseProp)[[type]]
bd <- list2df(x, fun)
# get the geom
geom <- match.arg(geom)
geom.list <- list(line=geom_line(aes_string(x="position", y=type, color="base")),
bar=geom_bar(aes_string(x="position", y=type, fill="base"), stat="identity"),
dodge=geom_bar(aes_string(x="position", y=type, fill="base"), stat="identity", position="dodge"))
g <- geom.list[[geom]]
p <- ggplot(subset(bd, base %in% bases)) + g + facet_wrap( ~ sample)
p <- p + scale_colour_manual(values=colorvalues)
p <- p + scale_fill_manual(values=colorvalues)
p
})
setMethod("seqlenPlot", signature(x="SequenceSummary"),
# Plot sequence lengths.
function(x) {
ld <- getSeqlen(x)
p <- ggplot(ld, aes(x=length, y=count)) + geom_bar(stat="identity")
p
})
setMethod("seqlenPlot", signature(x="list"),
# Plot sequence lengths.
function(x) {
ld <- list2df(x, getSeqlen)
p <- ggplot(ld, aes(x=length, y=count)) + geom_bar(stat="identity")
p <- p + facet_wrap( ~ sample)
p
})
geom_qlinerange <-
# Add a series of geoms to plot quality statistics.
function(extreme.color="grey", quartile.color="orange", mean.color="blue", median.color=NULL) {
l <- list(extreme.color=geom_linerange(aes_string(x="position", ymin="alt.lower", ymax="alt.upper"), color=extreme.color),
quartile.color=geom_linerange(aes_string(x="position", ymin="lower", ymax="upper", y="mean"), color=quartile.color, size=1.2),
mean.color=geom_point(aes_string(x="position", y="mean"), color=mean.color),
median.color=geom_point(aes_string(x="position", y="middle"), color=median.color))
keep <- sapply(names(l), function(n) !is.null(get(n)))
l[keep]
}
scale_color_dna <-
# Make a color scheme using biovizBase's good choices.
function() {
scale_color_manual(values=getBioColor("DNA_BASES_N"))
}
scale_color_iupac <-
# Make a color scheme using biovizBase's good choices.
function() {
scale_color_manual(values=getBioColor("IUPAC_CODE_MAP"))
}
calcKL <- function(x) {
d <- getKmer(x)
kmerDist <- function(x) {
# Given x, find the distribution of k-mers averaged across
# position.
tmp <- aggregate(x$count, list(kmer=x$kmer), sum)
data.frame(kmer=tmp$kmer, prop=tmp$x/sum(tmp$x))
}
kmer.dist <- kmerDist(d)
kmer.by.pos <- split(d, list(position=d$position))
## We want to calculate the K-L divergence, but our sample spaces
## are not the same, as our k-mer for all positions is populated
## with more k-mers than most positions (as there aren't all k-mers
## in every position). Thus we calculate the probabilities of k-mers
## for all positions for the sub-sample space of the positional
## k-mers.
y <- lapply(kmer.by.pos, kmerDist)
kl <- mapply(function(x, p) {
x$q <- kmer.dist$prop[match(x$kmer, kmer.dist$kmer)]
klout <- data.frame(kmer=x$kmer, position=p,
kl=x$prop*log2(x$prop/x$q), p=x$prop, q=x$q)
# by definition of K-L divergence, we only keep if for i, p(i) > 0
# AND q(i) > 0.
subset(klout, p > 0 & q > 0)
}, y, as.numeric(names(y)), SIMPLIFY=FALSE)
kld <- do.call(rbind, kl)
kld
}
setMethod("kmerKLPlot", signature(x="SequenceSummary"),
# K-L divergence of empirical distribution across all reads to
# positional distribution.
function(x, n.kmers=20) {
if (!nrow(getKmer(x)))
stop("Data frame of k-mer counts by position is empty. Rerun readSeqFile with kmer=TRUE.")
kld <- calcKL(x)
kld.subset <- subset(kld, kmer %in% kld$kmer[order(kld$kl, decreasing=TRUE)[seq_len(n.kmers)]])
p <- ggplot(kld.subset)
p <- p + geom_bar(aes(x=position, y=kl, fill=kmer), stat="identity")
p + scale_y_continuous("K-L divergence of selected components")
})
setMethod("kmerKLPlot", signature(x="list"),
# K-L divergence of empirical distribution across all reads to
# positional distribution, facted by object.
function(x, n.kmers=20) {
if (!all(sapply(x, function(x) nrow(getKmer(x)))))
stop("Data frame of k-mer counts by position is empty. Rerun readSeqFile with kmer=TRUE.")
kld.subset <- list2df(x, function(x) {
kld <- calcKL(x)
subset(kld, kmer %in% kld$kmer[order(kld$kl, decreasing=TRUE)[seq_len(n.kmers)]])
})
p <- ggplot(kld.subset)
p <- p + geom_bar(aes(x=position, y=kl, fill=kmer), stat="identity")
p + facet_wrap(~ sample) + scale_y_continuous("K-L divergence of selected components")
})
kmerEntropy <-
# Get a dataframe of k-mer entropy by position.
function(x) {
d <- getKmer(x)
y <- split(d, list(position=d$position))
kmer.entropies <- sapply(y, function(x) {
p <- x$count/sum(x$count)
-sum(p*log2(p))
})
data.frame(position=as.numeric(names(y)), entropy=kmer.entropies)
}
setMethod("kmerEntropyPlot", signature(x="SequenceSummary"),
# Plot the k-mer entropy by position.
function(x) {
ggplot(data=kmerEntropy(x)) + geom_line(aes(x=position, y=entropy), color="blue")
})
setMethod("kmerEntropyPlot", signature(x="list"),
# Plot the k-mer entropy by position.
function(x) {
ke <- list2df(x, kmerEntropy)
p <- ggplot(data=ke) + geom_line(aes(x=position, y=entropy), color="blue")
p <- p + facet_wrap(~ sample)
p
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.