R/parseFastQC.r

Defines functions parseFastQC

# Parse output from a FastQC run
parseFastQC<-function(fn.zip, exdir='.') {
  # fn.zip    Path to a .zip file generated by a FastQC run
  
  fn<-unzip(fn.zip, exdir=exdir);
  
  # Summary statistics
  fn.summ<-fn[grep('/summary.txt$', fn)];
  summ<-read.table(fn.summ, header=FALSE, sep='\t', row=2, stringsAsFactors = FALSE);
  summ<-as.matrix(summ)[, 1];
  
  # Load statistic tables
  fn.data<-fn[grep("/fastqc_data.txt$", fn)];
  lns<-scan(fn.data, what = '', flush = TRUE, sep = '\n');
  ind<-grep('^>>', lns);
  ind1<-ind[seq(1, length(ind), 2)];
  ind2<-ind[seq(2, length(ind), 2)];
  ttl<-sapply(strsplit(sub('^>>', '', lns[ind1]), '\t'), function(x) x[1]);
  tbls<-lapply(1:length(ttl), function(i) {
    if ((ind2[i]-ind1[i]) == 1) NULL else {
      if (i == 9) ind1[i]<-ind1[i]+1;
      hd<-strsplit(sub('^#', '', lns[ind1[i]+1]), '\t')[[1]];
      ln<-lns[(ind1[i]+2):(ind2[i]-1)];
      tb<-data.frame(do.call('rbind', strsplit(ln, '\t')), stringsAsFactors = FALSE);
      names(tb)<-hd;
      tb
    }
  });
  names(tbls)<-ttl;
  
  # Basic Statistics
  basic<-as.list(tbls[["Basic Statistics"]][, 2]);
  basic[4:length(basic)]<-sapply(4:length(basic), function(i) as.numeric(basic[[i]]));
  names(basic)<-tbls[["Basic Statistics"]][, 1];
  
  # Per base sequence quality
  if (identical(NULL, tbls[["Per base sequence quality"]])) base.qual<-NULL else {
    t<-tbls[["Per base sequence quality"]][, -1];
    for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
    base.qual<-as.matrix(t);
    rownames(base.qual)<-tbls[["Per base sequence quality"]][, 1];
  }
  
  # Per tile sequence quality
  if (identical(NULL, tbls[["Per tile sequence quality"]])) tile.qual<-NULL else {
    rnm<-unique(tbls[["Per tile sequence quality"]][, 2]);
    cnm<-unique(tbls[["Per tile sequence quality"]][, 1]);
    mn<-as.numeric(tbls[["Per tile sequence quality"]][, 3]);
    names(mn)<-tbls[["Per tile sequence quality"]][, 2];
    mn<-split(mn, tbls[["Per tile sequence quality"]][, 1]);
    tile.qual<-sapply(mn, function(mn) mn[rnm]);
  }

  # Per sequence quality scores
  if (identical(NULL, tbls[["Per sequence quality scores"]])) seq.count<-NULL else {
    t<-tbls[["Per sequence quality scores"]];
    for (i in 1:ncol(t)) t[, i]<-as.integer(t[, i]);
    seq.count<-as.matrix(t);
  }
  
  # Per base sequence content
  if (identical(NULL, tbls[["Per base sequence content"]])) base.pcnt<-NULL else {
    t<-tbls[["Per base sequence content"]][, -1];
    for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
    rownames(t)<-tbls[["Per base sequence content"]][, 1];
    base.pcnt<-as.matrix(t);
  }
  
  # Per sequence GC content
  if (identical(NULL, tbls[["Per sequence GC content"]])) gc.count<-NULL else {
    t<-tbls[["Per sequence GC content"]];
    for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
    gc.count<-as.matrix(t);
  }
  
  # Per base N content
  if (identical(NULL, tbls[["Per base N content"]])) n.percent<-NULL else {
    n.percent<-as.numeric(tbls[["Per base N content"]][, 2]);
    names(n.percent)<-tbls[["Per base N content"]][, 1];
  }
  
  # Sequence Length Distribution
  if (identical(NULL, tbls[["Sequence Length Distribution"]])) len.dist<-NULL else {
    t<-tbls[["Sequence Length Distribution"]];
    for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
    len.dist<-as.matrix(t);
  }
  
  # Sequence Duplication Levels
  if (identical(NULL, tbls[["Sequence Duplication Levels"]])) dup.level<-NULL else {
    t<-tbls[["Sequence Duplication Levels"]][, -1]; 
    for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
    rownames(t)<-tbls[["Sequence Duplication Levels"]][, 1];
    dup.level<-as.matrix(t);
  }
  dup.ttl<-strsplit(sub('^#', '', lns[ind1[9]+1]), '\t')[[1]];
  basic[[dup.ttl[1]]]<-round(as.numeric(dup.ttl[2]), 2);
  
  # Overrepresented sequences
  if (identical(NULL, tbls[["Overrepresented sequences"]])) over.seqs<-NULL else {
    t<-tbls[["Overrepresented sequences"]];
    over.seqs<-t[, -1];
    rownames(over.seqs)<-t[, 1];
    over.seqs[, 1]<-as.integer(t[, 2]);
    over.seqs[, 2]<-as.numeric(t[, 3]);
  }
  
  # Adapter Content
  if (identical(NULL, tbls[["Adapter Content"]])) adap.cont<-NULL else {
    t<-tbls[["Adapter Content"]][, -1]; 
    for (i in 1:ncol(t)) t[, i]<-as.numeric(t[, i]);
    rownames(t)<-tbls[["Adapter Content"]][, 1];
    adap.cont<-as.matrix(t);
  }
  
  # Kmer Content
  if (identical(NULL, tbls[["Kmer Content"]])) kmer.cont<-NULL else {
    t<-tbls[["Kmer Content"]];
    kmer.cont<-t[, -1];
    rownames(kmer.cont)<-t[, 1];
    kmer.cont[, 1]<-as.integer(t[, 2]);
    kmer.cont[, 2]<-round(as.numeric(t[, 3]), 2);
    kmer.cont[, 3]<-round(as.numeric(t[, 4]), 2);
  }
  
  out<-list(summ, basic, base.qual, tile.qual, seq.count, base.pcnt, gc.count, n.percent, len.dist, dup.level,
            over.seqs, adap.cont, kmer.cont);
  names(out)<-c("Summary", "Basic Statistics", "Per base sequence quality", "Per tile sequence quality", 
                "Per sequence quality scores", "Per base sequence content", "Per sequence GC content",   
                "Per base N content", "Sequence Length Distribution", "Sequence Duplication Levels",
                "Overrepresented sequences", "Adapter Content", "Kmer Content" );
  
  out;
}
zhezhangsh/GtUtility documentation built on May 4, 2019, 11:20 p.m.