#' Plot aligned reads over annotated region.
#' @param aln read alignments in GAlignments objet (GenomicTRanges)
#' @param annots genome annotations in GRanges object
#' @param plotfile path for output plot
#' @importFrom ggbio autoplot tracks
#' @importFrom ggplot2 aes
#' @export
PlotAlignAnnot <- function(aln, annots, plotfile) {
reads.track <- autoplot(aln)
annots.track <- autoplot(annots,
aes(color=group, fill=group))
png(filename=plotfile,
width=1024, height=512)
print(tracks(alignments=reads.track,
annotation=annots.track,
heights=c(5,1)))
dev.off()
}
#' Plot aligned reads over annotated region.
#' @param aln read alignments in GAlignments objet (GenomicTRanges)
#' @param annots genome annotations in GRanges object
#' @param plotfile path for output plot
#' @importFrom ggbio autoplot tracks
#' @importFrom ggplot2 aes
#' @importFrom IRanges coverage
#' @export
PlotCoverageAnnot <- function(aln, annots, plotfile) {
cov <- coverage(aln)
coverage.track <- autoplot(cov, binwidth=10)
annots.track <- autoplot(annots,
aes(color=group, fill=group))
png(filename=plotfile,
width=1024, height=512)
print(tracks(coverage=coverage.track,
annotation=annots.track,
heights=c(5,1)))
dev.off()
}
#' Plot quality scores per cycle
#' @param data.dir string path to data
#' @param file.pattern string pattern for input files
#' @return plot object
#' @importFrom ShortRead qa
#' @importFrom ggplot2 ggplot geom_boxplot
PlotPerCycleQuality <- function(data.dir, file.pattern) {
input.type <- "fastq"
sreadq.qa <- qa(data.dir, file.pattern, type=input.type)
perCycle <- sreadq.qa[["perCycle"]]
perCycle.q <- perCycle$quality
pcq <- perCycle.q[rep(seq_len(nrow(perCycle.q)), perCycle.q$Count), ]
p <- ggplot(pcq, aes(factor(Cycle), Score))
p <- p + geom_boxplot()
return(p)
}
#' Plot mean quality per read
#' @param data.dir path to data dir
#' @param file.pattern pattern of input fastq
#' @return plot object
PlotMeanReadQuality <- function(data.dir, file.pattern) {
count.qmeans <- function(s, data.dir) {
fq <- yield(FastqSampler(file.path(data.dir, s), n=1000000))
qm <- as(quality(fq), "matrix")
row.means <- rowMeans(qm, na.rm=T)
qmeans <- data.frame(mean.qual=row.means, lane=rep(s, length(row.means)))
return(qmeans)
}
samples <- list.files(data.dir, file.pattern)
qmeans <- adply(samples, 1, count.qmeans, data.dir=data.dir)
p <- ggplot(qmeans, aes(x=mean.qual))
p <- p + geom_histogram(aes(y=..density..),
alpha=.1,
fill="green",
colour="darkgreen",
binwidth=.5)
p <- p + geom_density() + facet_wrap(~lane)
return(p)
}
#' Plot read length distribution
#' @param data.dir path to data dir
#' @param file.pattern pattern of input fastq
#' @return plot object
PlotReadLengthDistribution <- function(data.dir, file.pattern) {
samples <- list.files(data.dir, file.pattern)
count.rlens <- function(s, data.dir) {
fq <- yield(FastqSampler(file.path(data.dir, s), n=1000000))
rlens <- width(fq)
rlen95 <- quantile(width(fq), .95)
rlens <- rlens[rlens <= rlen95]
rlens.df <- data.frame(read.len=rlens, lane=rep(s, length(rlens)))
return(rlens.df)
}
rlens <- adply(samples, 1, count.rlens, data.dir=data.dir)
p <- ggplot(rlens, aes(x=read.len))
p <- p + geom_histogram(aes(y=..density..),
alpha=0.02,
fill="green",
colour="darkgreen",
binwidth=1)
p <- p + geom_density(adjust=3) + facet_wrap(~lane)
return(p)
}
#' Plot number of reads in each sample.
#'
#' Bar plot representing number of reads in each sample.
#' Samples are expected to be separate fastq files.
#' @param fqc FastQA from package ShortRead
#' @return plot object
#' @importFrom ggplot2 ggplot geom_bar theme labs
#' @export
Aplot <- function(fqc) {
df <- data.frame(nReads=fqc[['readCounts']]$read,
sample=row.names(fqc[['readCounts']]))
p <- ggplot(df, aes(x=sample, y=nReads, fill=sample))
p <- p + geom_bar(alpha=0.6, stat='identity')
p <- p + theme(legend.position="none", axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(x="Sample", y="Number of Reads")
return(p)
}
#' Plot number of reads in each sample for each grouping factor.
#'
#' Bar plots representing number of reads in each sample.
#' There is one bar plot per grouping factor.
#' Samples are grouped and clored by grouping factor.
#' Grouping factors are extracted from design table.
#' Samples are expected to be separate fastq files.
#' @param fqc FastQA from package ShortRead
#' @param design.table data.frame holds information about experimantal design
#' @return list of plot objects
#' @importFrom ggplot2 ggplot geom_bar theme labs
#' @importFrom stringr str_replace
#' @export
A.design.plot <- function(fqc, design.table) {
# get names of groups from desing file
groups <- names(design.table)
groups <- groups[groups != "sampleid"]
df <- data.frame(nReads=fqc[['readCounts']]$read,
sampleid=str_replace(row.names(fqc[['readCounts']]), "\\.f(ast)?q", ""))
df <- merge(df, design.table)
plots <- lapply(groups, function(g.factor) {
# order by grouping factor
df <- df[order(df[g.factor]), ]
.e <- environment()
p <- ggplot(df, aes(x=sampleid, y=nReads, fill=factor(df[,g.factor])),
environment=.e)
p <- p + geom_bar(alpha=0.6, stat='identity')
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + guides(fill=guide_legend(title=g.factor))
p <- p + labs(x="Sample", y="Number of Reads")
return(p)
})
names(plots) <- groups
return(plots)
}
#' Plot read lenght distribution
#'
#' Density plot representing freaquencies or readlenths.
#' Samples are explected to be separate fastq files.
#' @param samples ShortReadQ object from package ShortRead
#' @return plot object
#' @importFrom ggplot2 ggplot geom_density labs
#' @export
B1plot <- function(samples) {
rlens <- lapply(samples, width)
df <- data.frame(sample=rep(names(samples), lapply(rlens, length)),
rlen=unlist(rlens), row.names=NULL)
q95rlen <- quantile(df$rlen, 0.95)
p <- ggplot(df, aes(rlen, group=sample, colour=sample))
p <- p + geom_density(alpha=I(0.4), adjust=3) + xlim(0, q95rlen)
p <- p + labs(x="Read Length", y="Fraction of Reads", colour="Sample")
}
#' Plot read lenght distribution
#'
#' Density plot representing freaquencies or readlenths.
#' There is one plot per grouping factor.
#' Samples clored by grouping factor.
#' Grouping factors are extracted from design.table.
#' Samples are explected to be separate fastq files.
#' @param samples ShortReadQ object from package ShortRead
#' @param design.table data.frame holds information about experimantal design
#' @return list of plot objects
#' @importFrom ggplot2 ggplot geom_density labs
#' @importFrom stringr str_replace
#' @export
B1.design.plot <- function(samples, design.table) {
groups <- names(design.table)
groups <- groups[groups != "sampleid"]
rlens <- lapply(samples, width)
df <- data.frame(sampleid=rep(names(samples), lapply(rlens, length)),
rlen=unlist(rlens), row.names=NULL)
df <- merge(df, design.table)
q95rlen <- quantile(df$rlen, 0.95)
plots <- lapply(groups, function(g.factor){
.e <- environment()
p <- ggplot(df, aes(rlen, group=sampleid, colour=factor(df[,g.factor])),
environment=.e)
p <- p + geom_density(alpha=I(0.4), adjust=3) + xlim(0, q95rlen)
p <- p + guides(colour=guide_legend(title=g.factor))
p <- p + labs(x="Read Length", y="Fraction of Reads")
})
names(plots) <- groups
return(plots)
}
#' Plot fraction of reads with particular lengh or longer.
#'
#' Line plot showing fraction of reads on y axis
#' and minimal read length on x axis.
#' One line per sample.
#' Samples are explected to be separage fastq files.
#' @param samples ShortReadQ object from package ShortRead
#' @return plot object
#' @importFrom ggplot2 ggplot geom_line xlim labs
#' @import dplyr
#' @export
B2plot <- function(samples) {
rlens <- lapply(samples, width)
df <- data.frame(sampleid=rep(names(rlens), lapply(rlens, length)),
rlen=unlist(rlens), row.names=NULL)
q95rlen <- quantile(df$rlen, 0.95)
df <- df %>% group_by(sampleid, rlen) %>% summarise(count=n())
df <- df %>% group_by(sampleid) %>% mutate(cum.count=cumsum(count),
frac.count=cum.count/sum(count))
p <- ggplot(df, aes(group=sampleid, colour=sampleid ))
p <- p + geom_line(aes(rlen, 1-frac.count), alpha=0.4)
p <- p + xlim(min(df$rlen), q95rlen)
p <- p + labs(x="Read Length", y="Fraction of Reads", colour="Sampleid")
}
#' Plot fraction of reads with particular lengh or longer.
#'
#' Line plot showing fraction of reads on y axis
#' and minimal read length on x axis.
#' One line per sample.
#' There is one plot per grouping factor.
#' Samples clored by grouping factor.
#' Grouping factors are extracted from design.table.
#' Samples are explected to be separage fastq files.
#' @param samples ShortReadQ object from package ShortRead
#' @param design.table data.frame holds information about experimantal design
#' @return plots list of plot objects
#' @importFrom ggplot2 ggplot geom_line xlim labs
#' @import dplyr
#' @export
B2.design.plot <- function(samples, design.table) {
groups <- names(design.table)
groups <- groups[groups != "sampleid"]
rlens <- lapply(samples, width)
df <- data.frame(sampleid=rep(names(rlens), lapply(rlens, length)),
rlen=unlist(rlens), row.names=NULL)
q95rlen <- quantile(df$rlen, 0.95)
df <- df %>% group_by(sampleid, rlen) %>% summarise(count=n())
df <- df %>% group_by(sampleid) %>% mutate(cum.count=cumsum(count),
frac.count=cum.count/sum(count))
df <- merge(df,design.table)
plots <- lapply(groups, function(g.factor){
.e <- environment()
p <- ggplot(df, aes(group=sampleid, colour=factor(df[ ,g.factor])),
environment=.e)
p <- p + geom_line(aes(rlen, 1-frac.count), alpha=0.4)
p <- p + xlim(min(df$rlen), q95rlen)
p <- p + guides(colour=guide_legend(title=g.factor))
p <- p + labs(x="Read Length", y="Fraction of Reads", colour="Sampleid")
})
names(plots) <- groups
return(plots)
}
#' Plot mean read quality distribution per sample.
#'
#' Boxplot showing the distribution of mean read quality.
#' One boxplot per sample.
#' Samples are explected to be separage fastq files.
#' @param samples ShortReadQ object from package ShortRead
#' @return plot object
#' @importFrom ggplot2 ggplot geom_boxplot theme labs
#' @export
C1plot <- function(samples) {
calc.qmeans <- function(fq) {
qm <- as(quality(fq), "matrix")
row.means <- rowMeans(qm, na.rm=T)
return(row.means)
}
qmeans <- lapply(samples, calc.qmeans)
df <- data.frame(sample=rep(names(qmeans), lapply(qmeans, length)),
qmeans=unlist(qmeans), row.names=NULL)
p <- ggplot(df, aes(factor(sample), qmeans, fill=sample))
p <- p + geom_boxplot(alpha=0.6, outlier.size=0)
p <- p + theme(legend.position="none", axis.text.x=element_text(angle=45, hjust=1))
p <- p + labs(x="Sample", y="Mean Read Quality")
}
#' Plot mean read quality distribution per sample.
#'
#' Boxplot showing the distribution of mean read quality.
#' One boxplot per sample.
#' There is one plot per grouping factor.
#' Samples clored by grouping factor.
#' Grouping factors are extracted from design.table.
#' Samples are explected to be separage fastq files.
#' @param samples ShortReadQ object from package ShortRead
#' @param design.table data.frame holds information about experimantal design
#' @return list of plot objects
#' @importFrom ggplot2 ggplot geom_boxplot theme labs
#' @export
C1.design.plot <- function(samples, design.table) {
groups <- names(design.table)
groups <- groups[groups != "sampleid"]
calc.qmeans <- function(fq) {
qm <- as(quality(fq), "matrix")
row.means <- rowMeans(qm, na.rm=T)
return(row.means)
}
qmeans <- lapply(samples, calc.qmeans)
df <- data.frame(sampleid=rep(names(qmeans), lapply(qmeans, length)),
qmeans=unlist(qmeans), row.names=NULL)
df <- merge(df, design.table)
plots <- lapply(groups, function(g.factor) {
df <- df[order(df[g.factor]), ]
.e <- environment()
p <- ggplot(df, aes(factor(sampleid), qmeans, fill=factor(df[ ,g.factor])),
environment=.e)
p <- p + geom_boxplot(alpha=0.6, outlier.size=0)
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p <- p + guides(fill=guide_legend(title=g.factor))
p <- p + labs(x="Sampleid", y="Mean Read Quality")
})
names(plots) <- groups
return(plots)
}
#' Plot mean base quality at particualar position in the read.
#'
#' Line plot showing mean quality of the bases per position in the read.
#' One line per sample.
#' Samples are explected to be separage fastq files.
#' @param samples ShortReadQ object from package ShortRead
#' @return plot object
#' @import dplyr
#' @importFrom ggplot2 ggplot geom_line xlim labs
#' @export
C2plot <- function(samples) {
calc.qmeans <- function(fq) {
qm <- as(quality(fq), "matrix")
col.means <- colMeans(qm, na.rm=T)
}
q95rlen <- quantile(unlist(sapply(samples, width)), 0.95)
qmeans <- lapply(samples, calc.qmeans)
df <- data.frame(sampleid=rep(names(qmeans), lapply(qmeans, length)),
qmeans=unlist(qmeans), row.names=NULL)
df <- df %>% group_by(sampleid) %>% mutate(ones=1, pos=cumsum(ones))
p <- ggplot(df, aes(pos, qmeans, group=sampleid, colour=sampleid))
p <- p + geom_line(alpha=0.4) + xlim(0, q95rlen)
p <- p + labs(x="Position in the Read", y="Mean Base Quality", colour="Sampleid")
}
#' Plot mean base quality at particualar position in the read.
#'
#' Line plot showing mean quality of the bases per position in the read.
#' One line per sample.
#' Samples are explected to be separage fastq files.
#' There is one plot per grouping factor.
#' Samples colored by grouping factor.
#' Grouping factors are extracted from design.table.
#' @param samples ShortReadQ object from package ShortRead
#' @param design.table data.frame holds information about experimantal design
#' @return list of plot objects
#' @import dplyr
#' @importFrom ggplot2 ggplot geom_line xlim labs
#' @export
C2.design.plot <- function(samples, design.table) {
groups <- names(design.table)
groups <- groups[groups != "sampleid"]
calc.qmeans <- function(fq) {
qm <- as(quality(fq), "matrix")
col.means <- colMeans(qm, na.rm=T)
}
q95rlen <- quantile(unlist(sapply(samples, width)), 0.95)
qmeans <- lapply(samples, calc.qmeans)
df <- data.frame(sampleid=rep(names(qmeans), lapply(qmeans, length)),
qmeans=unlist(qmeans), row.names=NULL)
df <- df %>% group_by(sampleid) %>% mutate(ones=1, pos=cumsum(ones))
df <- merge(df, design.table)
plots <- lapply(groups, function(g.factor){
.e <- environment()
p <- ggplot(df, aes(pos, qmeans, group=sampleid, colour=factor(df[ ,g.factor])),
environment=.e)
p <- p + geom_line(alpha=0.4) + xlim(0, q95rlen)
p <- p + guides(colour=guide_legend(title=g.factor))
p <- p + labs(x="Position in the Read", y="Mean Base Quality")
})
names(plots) <- groups
return(plots)
}
#' Plot fraction of reads with partucular quality or higher per position.
#'
#' 4 line plots, for qualities 18, 20, 24 and 28, showing fraction
#' of reads with one of those qualities per position in the read.
#' One line per sample.
#' Samples are explected to be separage fastq files.
#' There is one plot per grouping factor.
#' Samples colored by grouping factor.
#' Grouping factors are extracted from design.table.
#' @param samples ShortReadQ object from package ShortRead
#' @param fqc FastQA from package ShortRead
#' @param design.table data.frame holds information about experimantal design
#' @return list of plot objects
#' @import dplyr
#' @importFrom ggplot2 ggplot geom_line facet_wrap ylim labs
#' @export
C3.design.plot <- function(samples, fqc, design.table) {
groups <- names(design.table)
groups <- groups[groups != "sampleid"]
border.quals <- c(18, 20, 24, 28)
q95rlen <- quantile(unlist(sapply(samples, width)), 0.95)
pcq <- fqc[["perCycle"]]$quality
pcq$sampleid <- str_replace(pcq$lane, "\\.f(ast)?q", "")
pcq <- merge(pcq, design.table, by='sampleid')
pcq <- pcq %>% group_by(sampleid, Cycle) %>% mutate(CycleCounts=sum(Count),
CountFrac=Count/CycleCounts,
ScoreCumSum=cumsum(CountFrac)) %>% ungroup()
pcq <- pcq[pcq$Cycle <= q95rlen, ]
subpcq <- pcq[pcq$Score %in% border.quals, ]
plots <- lapply(c(groups), function(g.factor){
.e <- environment()
p <- ggplot(subpcq, environment=.e)
p <- p + geom_line(aes_string(x='Cycle', y='1 - ScoreCumSum',
group='sampleid', colour=g.factor),
alpha=I(0.4))
p <- p + facet_wrap(~Score) + ylim(0,1)
p <- p + guides(colour=guide_legend(title=g.factor))
p <- p + labs(x="Position in the Read", y="Fraction of Reads")
})
names(plots) <- groups
return(plots)
}
#' Plot fraction of reads with partucular quality or higher per position.
#'
#' 4 line plots, for qualities 18, 20, 24 and 28, showing fraction
#' of reads with one of those qualities per position in the read.
#' One line per sample.
#' Samples are explected to be separage fastq files.
#' @param fqc FastQA from package ShortRead
#' @param samples ShortReadQ object from package ShortRead
#' @return plot object
#' @import dplyr
#' @importFrom ggplot2 ggplot geom_line facet_wrap ylim labs
#' @export
C3plot <- function(fqc, samples) {
border.quals <- c(18, 20, 24, 28)
q95rlen <- quantile(unlist(sapply(samples, width)), 0.95)
pcq <- fqc[["perCycle"]]$quality
pcq$sampleid <- str_replace(pcq$lane, "\\.f(ast)?q", "")
pcq <- pcq %>% group_by(sampleid, Cycle) %>% mutate(CycleCounts=sum(Count),
CountFrac=Count/CycleCounts,
ScoreCumSum=cumsum(CountFrac)) %>% ungroup()
pcq <- pcq[pcq$Cycle <= q95rlen, ]
subpcq <- pcq[pcq$Score %in% border.quals, ]
p <- ggplot(subpcq)
p <- p + geom_line(aes(Cycle, 1-ScoreCumSum, group=sampleid, colour=sampleid), alpha=I(0.4))
p <- p + facet_wrap(~Score) + ylim(0,1)
p <- p + labs(x="Position in the Read", y="Fraction of Reads", colour="Sampleid")
}
#' Plot base frequnecy for first 30 nt.
#'
#' Linepolots representing frequencies of bases, one line per base,
#' for first 30 nt in the read (from 5')
#' One plot per sample.
#' Each sample is a fastq files.
#' @param samples ShortReadQ object from package ShortRead
#' @return plot object
#' @importFrom reshape2 melt
#' @importFrom ggplot2 ggplot geom_line scale_x_discrete facet_wrap labs
#' @importFrom ggplot2 element_rect
#' @import dplyr
#' @export
D1plot <- function(samples) {
countLetterFreq <- function(fq) {
bases <- c("A", "C", "G", "T")
sr <- sread(fq)
alpha.freq <- alphabetByCycle(sr, bases)[, 1:30]
alpha.freq <- data.frame(alpha.freq)
colnames(alpha.freq) <- 1:30
alpha.freq$base <- rownames(alpha.freq)
df <- melt(alpha.freq, id=c("base"))
colnames(df) <- c("base", "pos", "count")
df <- df %>% group_by(pos) %>% mutate(total.count=sum(count),
frac.count=count/total.count)
# df <- ddply(df, .(pos), mutate,
# total.count=sum(count),
# frac.count=count/total.count)
return(df)
}
df.list <- lapply(samples, countLetterFreq)
df <- do.call("rbind", df.list)
df$sampleid <- rep(names(samples),each=4*30)
# df$sampleid <-sapply(rownames(df),
# function(rn) {str_split(rn, "\\.")[[1]][1]},
# simplify=TRUE, USE.NAMES=FALSE)
p <- ggplot(df, aes(pos, frac.count, group=base, colour=base))
p <- p + geom_line(alpha=0.4) + scale_x_discrete(breaks=seq(0, 30, 5))
p <- p + ylim(0,1)
p <- p + facet_wrap(~sampleid)
p <- p + labs(x="Position in the Read", y="Base Frequency", colour="Base")
p <- p + theme(panel.background=element_rect(fill="white", colour="grey"))
}
#' Plot base frequnecy for last 30 nt.
#'
#' Lineplots representing frequencies of bases, one line per base,
#' for last 30 nt in the read (from 3')
#' One plot per sample.
#' Each sample is a fastq files.
#' @param samples ShortReadQ object from package ShortRead
#' @return plot object
#' @importFrom reshape2 melt
#' @importFrom ggplot2 ggplot geom_line scale_x_discrete facet_wrap labs
#' @importFrom ggplot2 element_rect
#' @import dplyr
#' @export
D2plot <- function(samples) {
countLetterFreq <- function(fq) {
sr <- reverse(sread(fq))
bases <- c("A", "C", "G", "T")
alpha.freq <- alphabetByCycle(sr, bases)[, 1:30]
alpha.freq <- data.frame(alpha.freq)
colnames(alpha.freq) <- -1:-30
alpha.freq$base <- rownames(alpha.freq)
df <- melt(alpha.freq, id=c("base"))
colnames(df) <- c("base", "pos", "count")
df <- df %>% group_by(pos) %>% mutate(total.count=sum(count),
frac.count=count/total.count)
return(df)
}
df.list <- lapply(samples, countLetterFreq)
df <- do.call("rbind", df.list)
df$sampleid <- rep(names(samples),each=4*30)
# df$sampleid <-sapply(rownames(df),
# function(rn) {str_split(rn, "\\.")[[1]][1]},
# simplify=TRUE, USE.NAMES=FALSE)
p <- ggplot(df, aes(pos, frac.count, group=base, colour=base))
p <- p + geom_line(alpha=0.4) + scale_x_discrete(breaks=seq(-30, 0, 5))
p <- p + ylim(0,1)
p <- p + facet_wrap(~sampleid)
p <- p + labs(x="Position in the Read", y="Base Frequency", colour="Base")
p <- p + theme(panel.background=element_rect(fill="white", colour="white"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.