# Parse output from a FastQC run
parseFastQC<-function(fn.zip, exdir='.') {
# fn.zip Path to a .zip file generated by a FastQC run
fn<-unzip(fn.zip, exdir=exdir);
# Summary statistics
fn.summ<-fn[grep('/summary.txt$', fn)];
summ<-read.table(fn.summ, header=FALSE, sep='\t', row=2, stringsAsFactors = FALSE);
summ<-as.matrix(summ)[, 1];
# Load statistic tables
fn.data<-fn[grep("/fastqc_data.txt$", fn)];
lns<-scan(fn.data, what = '', flush = TRUE, sep = '\n');
ind<-grep('^>>', lns);
ind1<-ind[seq(1, length(ind), 2)];
ind2<-ind[seq(2, length(ind), 2)];
ttl<-sapply(strsplit(sub('^>>', '', lns[ind1]), '\t'), function(x) x[1]);
tbls<-lapply(1:length(ttl), function(i) {
if ((ind2[i]-ind1[i]) == 1) NULL else {
if (i == 9) ind1[i]<-ind1[i]+1;
hd<-strsplit(sub('^#', '', lns[ind1[i]+1]), '\t')[[1]];
ln<-lns[(ind1[i]+2):(ind2[i]-1)];
tb<-data.frame(do.call('rbind', strsplit(ln, '\t')), stringsAsFactors = FALSE);
names(tb)<-hd;
tb
}
});
names(tbls)<-ttl;
# Basic Statistics
basic<-as.list(tbls[["Basic Statistics"]][, 2]);
basic[4:length(basic)]<-sapply(4:length(basic), function(i) as.numeric(basic[[i]]));
names(basic)<-tbls[["Basic Statistics"]][, 1];
# Per base sequence quality
if (identical(NULL, tbls[["Per base sequence quality"]])) base.qual<-NULL else {
t<-tbls[["Per base sequence quality"]][, -1];
for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
base.qual<-as.matrix(t);
rownames(base.qual)<-tbls[["Per base sequence quality"]][, 1];
}
# Per tile sequence quality
if (identical(NULL, tbls[["Per tile sequence quality"]])) tile.qual<-NULL else {
rnm<-unique(tbls[["Per tile sequence quality"]][, 2]);
cnm<-unique(tbls[["Per tile sequence quality"]][, 1]);
mn<-as.numeric(tbls[["Per tile sequence quality"]][, 3]);
names(mn)<-tbls[["Per tile sequence quality"]][, 2];
mn<-split(mn, tbls[["Per tile sequence quality"]][, 1]);
tile.qual<-sapply(mn, function(mn) mn[rnm]);
}
# Per sequence quality scores
if (identical(NULL, tbls[["Per sequence quality scores"]])) seq.count<-NULL else {
t<-tbls[["Per sequence quality scores"]];
for (i in 1:ncol(t)) t[, i]<-as.integer(t[, i]);
seq.count<-as.matrix(t);
}
# Per base sequence content
if (identical(NULL, tbls[["Per base sequence content"]])) base.pcnt<-NULL else {
t<-tbls[["Per base sequence content"]][, -1];
for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
rownames(t)<-tbls[["Per base sequence content"]][, 1];
base.pcnt<-as.matrix(t);
}
# Per sequence GC content
if (identical(NULL, tbls[["Per sequence GC content"]])) gc.count<-NULL else {
t<-tbls[["Per sequence GC content"]];
for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
gc.count<-as.matrix(t);
}
# Per base N content
if (identical(NULL, tbls[["Per base N content"]])) n.percent<-NULL else {
n.percent<-as.numeric(tbls[["Per base N content"]][, 2]);
names(n.percent)<-tbls[["Per base N content"]][, 1];
}
# Sequence Length Distribution
if (identical(NULL, tbls[["Sequence Length Distribution"]])) len.dist<-NULL else {
t<-tbls[["Sequence Length Distribution"]];
for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
len.dist<-as.matrix(t);
}
# Sequence Duplication Levels
if (identical(NULL, tbls[["Sequence Duplication Levels"]])) dup.level<-NULL else {
t<-tbls[["Sequence Duplication Levels"]][, -1];
for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
rownames(t)<-tbls[["Sequence Duplication Levels"]][, 1];
dup.level<-as.matrix(t);
}
dup.ttl<-strsplit(sub('^#', '', lns[ind1[9]+1]), '\t')[[1]];
basic[[dup.ttl[1]]]<-round(as.numeric(dup.ttl[2]), 2);
# Overrepresented sequences
if (identical(NULL, tbls[["Overrepresented sequences"]])) over.seqs<-NULL else {
t<-tbls[["Overrepresented sequences"]];
over.seqs<-t[, -1];
rownames(over.seqs)<-t[, 1];
over.seqs[, 1]<-as.integer(t[, 2]);
over.seqs[, 2]<-as.numeric(t[, 3]);
}
# Adapter Content
if (identical(NULL, tbls[["Adapter Content"]])) adap.cont<-NULL else {
t<-tbls[["Adapter Content"]][, -1];
for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
rownames(t)<-tbls[["Adapter Content"]][, 1];
adap.cont<-as.matrix(t);
}
# Kmer Content
if (identical(NULL, tbls[["Kmer Content"]])) kmer.cont<-NULL else {
t<-tbls[["Kmer Content"]];
kmer.cont<-t[, -1];
rownames(kmer.cont)<-t[, 1];
kmer.cont[, 1]<-as.integer(t[, 2]);
kmer.cont[, 2]<-round(as.numeric(t[, 3]), 2);
kmer.cont[, 3]<-round(as.numeric(t[, 4]), 2);
}
out<-list(summ, basic, base.qual, tile.qual, seq.count, base.pcnt, gc.count, n.percent, len.dist, dup.level,
over.seqs, adap.cont, kmer.cont);
names(out)<-c("Summary", "Basic Statistics", "Per base sequence quality", "Per tile sequence quality",
"Per sequence quality scores", "Per base sequence content", "Per sequence GC content",
"Per base N content", "Sequence Length Distribution", "Sequence Duplication Levels",
"Overrepresented sequences", "Adapter Content", "Kmer Content" );
out;
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.