R/qQCReport.R

Defines functions .qa_ShortRead plotFragmentDistribution plotMismatchTypes plotErrorsByCycle plotUniqueness plotMappings plotDuplicated plotNuclByCycle plotQualByCycle truncStringToPlotWidth qQCReport calcMmInformation calcQaInformation

Documented in qQCReport

# proj        : qProject object, fasta file, fastq file or bam file
# pdfFilename : Name of the output file. If NULL then the graph are displayed 
# chunkSize   : chunk/yield size of the sample
# clObj       : cluster object for parallelization

calcQaInformation <- function(filename, label, filetype, chunkSize){
  if (any(filetype == "fasta") && any(compressedFileFormat(filename) != "none")) {
      warning("compressed 'fasta' input is not yet supported")
      return(NULL)
  } else {
    reads <- switch(as.character(filetype),
                    fastq = {
                      f <- ShortRead::FastqSampler(filename, n = chunkSize)
                      reads <- ShortRead::yield(f)
                      close(f)
                      reads[ShortRead::width(reads) > 0]
                    },
                    fasta = {
                      reads <- ShortRead::readFasta(as.character(filename), nrec = chunkSize)
                      reads[ShortRead::width(reads) > 0]
                    },
                    bam = {
                      bf <- Rsamtools::BamFile(filename, yieldSize = 1e6)
                      myyield <- function(x) {
                        tmp <- Rsamtools::scanBam(x, param = Rsamtools::ScanBamParam(what = c("seq", "qual", "strand")))[[1]]
                        minusStrand <- !is.na(tmp$strand) & tmp$strand == "-"
                        ShortRead::ShortReadQ(sread = c(tmp$seq[!minusStrand],
                                                      Biostrings::reverseComplement(tmp$seq[minusStrand])),
                                              quality = c(tmp$qual[!minusStrand],
                                                          IRanges::reverse(tmp$qual[minusStrand])))
                      }
                      reads <- reduceByYield(X = bf, YIELD = myyield, MAP = identity,
                                             REDUCE = GenomicFiles::REDUCEsampler(sampleSize = chunkSize, verbose = FALSE),
                                             parallel = FALSE)
                      reads[ShortRead::width(reads) > 0]
                    })
    }
    return(qa(reads, label))
}

calcMmInformation <- function(filename, genome, chunkSize){
    
    # get bamfile index statistics
    stats <- as.data.frame(.Call(idxstatsBam, filename))
    # get chromosome with mapped reads
    trg <- stats$mapped
    names(trg) <- stats$seqname
    selChr <- names(trg)[ trg != 0 ]

    # get sequence length
    if(is(genome, "BSgenome")) {
        # BSgenome
        ref <- genome
        seqlen <- seqlengths(ref)[selChr]
    } else {
        # Fasta File
        ref <- Rsamtools::FaFile(genome)
        seqInfo <- Rsamtools::scanFaIndex(ref)
        seqlen <- seqlengths(seqInfo)[selChr]
    }

    # if no mapped alignments then return array of NAs
    if(length(seqlen) == 0){
        mmDist <- list(array(NA,
                             dim=c(5,5,30),
                             dimnames=list(ref=c("A","C","G","T","N"),
                                           read=c("A","C","G","T","N"))),
                       array(NA,
                             dim=c(5,5,30),
                             dimnames=list(ref=c("A","C","G","T","N"),
                                           read=c("A","C","G","T","N"))),
                       rep(NA,100),
                       rep(NA,2))
        return(mmDist)
    }
        
    # create query regions
    if(sum(trg[selChr]) <= chunkSize){
        # number of mapped reads is smaller or equal than chunkSize
        # query all chromosome with mapped reads
        gr <- GRanges(seqnames=names(seqlen), ranges=IRanges(1, seqlen))
    } else {
        # number of mapped reads is bigger than chunkSize
        #  total number of alignments: sum(trg[selChr])
        #  fraction of alignments to be sampled: chunkSize / sum(trg[selChr])
        #  assuming uniform coverage, fraction of chromosome to retrieve: fraction of alignments to be samples * chromosome length
        reflen <- ceiling(chunkSize / sum(trg[selChr]) * seqlen[selChr])
        # use only chromosoms with the most mapped reads
        #seq <- names(sort(trg[selChr], decreasing=T)[1:ceiling(length(trg[selChr]) * chunkSize / sum(trg[selChr]))])
        seq <- selChr
        gr <- unlist(GRangesList(lapply(seq, function(s){
            GRanges(seqnames=s, ranges=breakInChunks(seqlen[s], chunksize=reflen[s]))
        })))
        rm(seq,reflen)
    }

    # get nucleotide alignment frequencies
    cntFor <- 0
    maxLen <- 0
    # allocate result vector for "nucleotideAlignmentFrequencies" function
    mmDist <- list(integer(25*1000), # mismatch distribution read1 
                   integer(25*1000), # mismatch distribution read2
                   integer(10000), # fragment length distribution
                   integer(2)) # uniqueness (unique/total)
    set.seed(0)
    for(s in sample(length(gr))){
        refseq <- as.character(getSeq(ref, gr[s], as.character=FALSE))
        reftid <- as.integer(match(seqnames(gr[s]), names(trg)) - 1)
        refstart <- start(gr[s])
        len <- .Call(nucleotideAlignmentFrequencies, filename, refseq, reftid,
                     refstart, mmDist, as.integer(chunkSize))
        if(len > maxLen)
            maxLen <- len
        if(sum(mmDist[[1]][1:25]) >= chunkSize || sum(mmDist[[2]][1:25]) >= chunkSize)
            break
        cntFor <- cntFor + 1
    }

    mmDist[[1]] <- array(mmDist[[1]],
                    dim=c(5,5,maxLen),
                    dimnames=list(ref=c("A","C","G","T","N"),
                                  read=c("A","C","G","T","N"),
                                  pos=1:maxLen))
    mmDist[[2]] <- array(mmDist[[2]],
                    dim=c(5,5,maxLen),
                    dimnames=list(ref=c("A","C","G","T","N"),
                                  read=c("A","C","G","T","N"),
                                  pos=1:maxLen))
    names(mmDist[[3]]) <- c(1:(length(mmDist[[3]])-1), paste(">", length(mmDist[[3]])-1, sep=""))
    names(mmDist) <- c("NucleotidMismatchRead1","NucleotidMismatchRead2","FragmentLength", "Uniqueness")
    return(mmDist)
}


qQCReport <- function(input, pdfFilename=NULL, chunkSize=1e6L, useSampleNames=FALSE, clObj=NULL, a4layout=TRUE, ...)
{
    if (grDevices::dev.cur() != 1L) { # only query current par if a device is open
        gpars <- par(no.readonly = TRUE)
        on.exit(par(gpars))
    }
    # 'proj' is correct type?
    if(inherits(input, "qProject", which=FALSE)){
        filetype <- input@samplesFormat
        if(input@paired == "no"){
            readFilename <- if(filetype == "bam") input@alignments$FileName else input@reads$FileName
            if(useSampleNames) {
                label <- sprintf("%i. %s", 1:length(readFilename), input@alignments$SampleName)
            } else {
                label <- sprintf("%i. %s", 1:length(readFilename), basename(readFilename))
            }
            mapLabel <- label
        } else {
            if(filetype == "bam"){
                readFilename <- rep(input@alignments$FileName, each=2)
                if(useSampleNames) {
                    label <- sprintf("%i(R%i). %s", rep(1:nrow(input@alignments),each=2), 1:2, rep(input@alignments$SampleName,each=2))
                    mapLabel <- sprintf("%i. %s", 1:nrow(input@alignments), input@alignments$SampleName)
                } else {
                    label <- sprintf("%i(R%i). %s", rep(1:nrow(input@alignments),each=2), 1:2, basename(readFilename))
                    mapLabel <- sprintf("%i. %s", 1:nrow(input@alignments), basename(input@alignments$FileName))
                }
            } else {
                readFilename <- as.vector(rbind(input@reads$FileName1, input@reads$FileName2))
                if(useSampleNames) {
                    label <- sprintf("%i(R%i). %s", rep(1:nrow(input@reads),each=2), 1:2, rep(input@reads$SampleName,each=2))
                    mapLabel <- sprintf("%i. %s", 1:nrow(input@reads), input@reads$SampleName)
                } else {
                    label <- sprintf("%i(R%i). %s", rep(1:nrow(input@reads),each=2), 1:2, basename(readFilename))
                    mapLabel <- sprintf("%i. %s", 1:nrow(input@reads), basename(input@reads$FileName1))
                }
            }
        }
        alnFilename <- input@alignments$FileName
        if(input@genomeFormat == "BSgenome"){
           require(input@genome, character.only=TRUE, quietly=TRUE)
           genome <- get(input@genome)
        } else {
            genome <- input@genome
        }
    }else if(is.character(input)){
        filetype <- unique(consolidateFileExtensions(input, compressed=TRUE))
        if(length(filetype) > 1L)
            stop("parameter 'input' must consist of unique filetype")
        if(useSampleNames && !is.null(names(input))) {
            label <- sprintf("%i. %s", 1:length(input), names(input))
        } else {
            label <- sprintf("%i. %s", 1:length(input), basename(input))
        }
        mapLabel <- label
        if(all(file.exists(input))){
            if(filetype != "bam"){
                readFilename <- input
                alnFilename <- NULL
                genome <- NULL
            } else {
                readFilename <- input
                alnFilename <- input
                genome <- NULL
            }
        } else {
            stop("could not find the files '", paste(input[!file.exists(input)], collapse=" "), "'")           
        }        
    } else {
        stop("'input' must be an object of type 'qProject' (returned by 'qAlign') or filenames")
    }

    # digest clObj
    clparam <- getListOfBiocParallelParam(clObj)
    nworkers <- unlist(lapply(clparam, BiocParallel::bpworkers))
    clsel <- which.max(nworkers) # will use only one layer of parallelization, select best
    if(!inherits(clparam[[clsel]],c("BatchJobsParam","SerialParam"))) { # don't test loading of QuasR on current or BatchJobs cluster nodes
        message("preparing to run on ", min(length(readFilename),nworkers[clsel]), " ", class(clparam[[clsel]]), " nodes...", appendLF=FALSE)
        ret <- BiocParallel::bplapply(seq.int(nworkers[clsel]), function(i) library(QuasR), BPPARAM=clparam[[clsel]])
        if(!all(sapply(ret, function(x) "QuasR" %in% x)))
            stop("'QuasR' package could not be loaded on all nodes")
        message("done")
    }

    message("collecting quality control data")
    # FASTQ/A quality control
    if(!inherits(clparam[[clsel]],"SerialParam"))
        nthreads <- .Call(ShortRead:::.set_omp_threads, 1L) # avoid nested parallelization
    qc1L <- BiocParallel::bpmapply(calcQaInformation, readFilename, label, MoreArgs=list(filetype=filetype, chunkSize=chunkSize), BPPARAM=clparam[[clsel]])
    if(!inherits(clparam[[clsel]],"SerialParam"))
        .Call(ShortRead:::.set_omp_threads, nthreads)
    qa <- do.call(rbind, qc1L)

    # BAM quality control, mismatch distribution
    if(!is.null(alnFilename) && !is.null(genome)) {
      
        # get bamfile index statistics for all bam files
        seqLen_bam_compatL <- lapply(alnFilename, function(x) seqlengths(BamFile(x)))
        
        # get sequence length of the genome
        if(is(genome, "BSgenome")) {
            # BSgenome
            seqlen_genome_compat <- seqlengths(genome)
        } else {
            # Fasta File
            seqlen_genome_compat <- seqlengths(Rsamtools::scanFaIndex(Rsamtools::FaFile(genome)))
        }
        
        # test if all sequence lengths in all bam files as well as the genome are identical
        # ... add the sequences lengths of the genome to the sequence lengths of all bam files
        seqlen_all_compatL <- c(list(seqlen_genome_compat), seqLen_bam_compatL)
        
        # ... sort by sequence name in case there is a difference in the order
        seqlen_all_compat_sort_L <- lapply(seqlen_all_compatL, function(x) x[order(names(x))])
        
        # ... check if sequence lengths are identical
        dupStatus <- !duplicated(seqlen_all_compat_sort_L)[-1]
        if(sum(dupStatus) != 0){
            stop("The chromosome names/lengths in the specified genome do not match the ones in the provided bam files")
        }
      
        distL <- BiocParallel::bplapply(alnFilename, calcMmInformation, genome=genome, chunkSize=chunkSize, BPPARAM=clparam[[clsel]])

        if(input@paired == "no") {
            unique <- lapply(distL,"[[", 4)
            frag <- NULL
            mm <- lapply(distL,"[[", 1)
        } else {
            unique <- lapply(distL,"[[", 4)
            frag <- lapply(distL,"[[", 3)
            names(frag) <- mapLabel
            mm <- do.call(c, lapply(distL,"[", c(1,2)))      
        }
        names(unique) <- mapLabel
        names(mm) <- label
    } else {
        unique <- NULL
        mm <- NULL
        frag <- NULL
    }
    
    # open/close pdf stream
    if(!is.null(pdfFilename)){
        pdf(pdfFilename, paper="default", onefile=TRUE, width=0, height=0)
        on.exit(dev.off())
    }

    # Plot
    message("creating QC plots")
    plotdata <- list(raw=list(qa=qa, mm=mm, frag=frag, unique=unique, mapdata=NULL))
    if(!is.null(qa)){
        if(filetype == "fastq" || filetype == "bam"){
            if(is.null(pdfFilename))
                dev.new()
            plotdata[['qualByCycle']] <- plotQualByCycle(qa, ...)
        }
    
        if(is.null(pdfFilename))
            dev.new()
        plotdata[['nuclByCycle']] <- plotNuclByCycle(qa, ...)
    
        if(is.null(pdfFilename))
            dev.new()
        plotdata[['duplicated']] <- plotDuplicated(qa, ...)

    } else if(!is.null(mm)){
        if(is.null(pdfFilename))
            dev.new()
        plotdata[['nuclByCycle']] <- plotNuclByCycle(mm, ...)        
    }

    if(!is.null(alnFilename)){
        # get bamfile index statistics
        mapdata <- lapply(alnFilename, function(file){
            colSums(as.data.frame(.Call(idxstatsBam, file)[c("mapped","unmapped")]))
        })
        mapdata <- do.call(rbind, mapdata)
        rownames(mapdata) <- mapLabel
        plotdata[['raw']][['mapdata']] <- mapdata
        if(is.null(pdfFilename))
            dev.new()
        plotdata[['mappings']] <- plotMappings(mapdata, a4layout = a4layout, ...)
    }

    if(!is.null(unique)){
        if(is.null(pdfFilename))
            dev.new()
        plotdata[['uniqueness']] <- plotUniqueness(unique, a4layout = a4layout, ...)
    }
        
    if(!is.null(mm)){
        if(is.null(pdfFilename))
            dev.new()
        plotdata[['errorsByCycle']] <- plotErrorsByCycle(mm, ...)

        if(is.null(pdfFilename))
            dev.new()
        plotdata[['mismatchTypes']] <- plotMismatchTypes(mm, ...)
    }

    if(!is.null(frag)){        
        if(is.null(pdfFilename))
            dev.new()
        plotdata[['fragDistribution']] <- plotFragmentDistribution(frag, ...)
    }

    invisible(plotdata)
}

truncStringToPlotWidth <- function(s, plotwidth) {
    sw <- strwidth(s)
    if(any(sw > plotwidth)) {
        w <- 10 # number of character to replace with ".."
        l <- nchar(s)
        news <- s
        i <- sw > plotwidth
        while(w < l && any(i)) {
            news <- ifelse(i, paste(substr(s, 1, ceiling((l-w)/2)+5), substr(s, floor((l+w)/2)+5, l), sep="..."), news)
            sw <- strwidth(news)
            i <- sw > plotwidth
            w <- w + 2
        }
        return(news)
    } else {
        return(s)
    }
}

plotQualByCycle <- function(qcdata, lmat=matrix(1:12, nrow=6, byrow=TRUE)) {
    data <- qcdata[['perCycle']][['quality']]
    qtiles <- by(list(data$Score, data$Count), list(data$lane, data$Cycle), function(x) {
        coef <- 1.5
        scoreRle <- Rle(x[[1]], x[[2]])
        n <- length(scoreRle)
        nna <- !is.na(scoreRle)
        stats <- c(min(scoreRle), quantile(scoreRle, c(0.25, 0.5, 0.75)), max(scoreRle))
        iqr <- diff(stats[c(2,4)])
        out <- if (!is.na(iqr)) {
            scoreRle < (stats[2L] - coef * iqr) | scoreRle > (stats[4L] + coef * iqr)
        } else !is.finite(scoreRle)
        if (any(out[nna], na.rm = TRUE))
            stats[c(1, 5)] <- range(scoreRle[!out], na.rm = TRUE)
        conf <- stats[3L] + c(-1.58, 1.58) * iqr/sqrt(n)
        list(stats=stats, n=n, conf=conf, out=which(out))
    }, simplify=FALSE)
    ns <- nrow(qtiles)

    qtilesL <- lapply(1:ns, function(i) {
        tmpconf <- do.call(cbind,lapply(qtiles[i,], "[[", 'conf'))
        tmpxn <- ncol(tmpconf)
        list(stats=do.call(cbind,lapply(qtiles[i,][1:tmpxn], "[[", 'stats')),
             n=sapply(qtiles[i,][1:tmpxn], "[[", 'n'),
             conf=tmpconf,
             out=numeric(0), #sapply(qtiles[i,][1:tmpxn], "[[", 'out'),
             group=numeric(0), #rep(1:ncol(qtiles), sapply(qtiles[i,], function(x) length(x$out))),
             names=colnames(tmpconf))
    })
    names(qtilesL) <- rownames(qtiles)

    layout(lmat)
    for(i in 1:ns) {
        xn <- length(qtilesL[[i]]$names)
        ym <- max(35,max(qtilesL[[i]]$stats))
        par(mar=c(5-1,4-1,4-4,2-1)+.1, mgp=c(3-1,1-0.25,0))
        plot(0:1,0:1,type="n", xlab="Position in read (bp)", ylab="Quality score", xlim=c(0,xn)+0.5, xaxs="i", ylim=c(0,ym))
        rect(xleft=seq.int(xn)-0.5, ybottom=-10, xright=seq.int(xn)+0.5, ytop=20,    col=c("#e6afaf","#e6c3c3"), border=NA)
        rect(xleft=seq.int(xn)-0.5, ybottom=20,  xright=seq.int(xn)+0.5, ytop=28,    col=c("#e6d7af","#e6dcc3"), border=NA)
        rect(xleft=seq.int(xn)-0.5, ybottom=28,  xright=seq.int(xn)+0.5, ytop=ym+10, col=c("#afe6af","#c3e6c3"), border=NA)
        do.call("bxp", c(list(qtilesL[[i]], notch=FALSE, width=NULL, varwidth=FALSE, log="", border=par('fg'),
                              pars=list(boxwex=0.8, staplewex=0.5,  outwex=0.5, boxfill="#99999944"),
                              outline=FALSE, horizontal=FALSE, add=TRUE, at=1:xn, axes=FALSE)))
        cxy <- par('cxy')
        text(x=par('usr')[1]+cxy[1]/4, y=par('usr')[3]+cxy[2]/4, adj=c(0,0),
             labels=truncStringToPlotWidth(rownames(qtiles)[i], diff(par("usr")[1:2]) - cxy[1]/2))
        box()
    }

    invisible(qtilesL)
}

plotNuclByCycle <- function(qcdata, lmat=matrix(1:12, nrow=6, byrow=TRUE)) {
    if(!is.null(qcdata[['perCycle']])){
        data <- qcdata[['perCycle']][['baseCall']]
        nfreq <- by(list(data$Base, data$Count), list(data$lane, data$Cycle), function(x) {
            y <- x[[2]] /sum(x[[2]]) *100
            names(y) <- x[[1]]
            y
        }, simplify=TRUE)
        ns <- nrow(nfreq)
    
        nfreqL <- lapply(1:ns, function(i) {
            tmp <- do.call(cbind, lapply(nfreq[i,], function(x) x[c('A','C','G','T','N')]))
            tmp[is.na(tmp)] <- 0
            rownames(tmp) <- c('A','C','G','T','N')
            tmp
        })
        names(nfreqL) <- rownames(nfreq)
    } else {
        nfreqL <- lapply(qcdata, function(x){
            nfreq <- do.call(rbind, lapply(1:dim(x)[3], function(j){
                  c(A=sum(x[,"A",j]),
                       C=sum(x[,"C",j]),
                       G=sum(x[,"G",j]),
                       T=sum(x[,"T",j]),
                       N=sum(x[,"N",j]))
            }))
            rownames(nfreq) <- 1:dim(x)[3]
            t(nfreq / rowSums(nfreq) * 100)
        })
        ns <- length(nfreqL)
#         names(nfreqL) <- names(qcdata)
    }
    nfreqL[is.na.data.frame(nfreqL)] <- 0

    mycols <- c("#5050ff","#e00000","#00c000","#e6e600","darkgray")
    layout(lmat)
    for(i in 1:ns) {
        xn <- ncol(nfreqL[[i]])
        ym <- max(50,ceiling(max(nfreqL[[i]], na.rm = TRUE) /5) *5 +5)
        par(mar=c(5-1,4-1,4-4,2-1)+.1, mgp=c(3-1,1-0.25,0))
        matplot(1:xn, t(nfreqL[[i]]), type="o", xlab="Position in read (bp)", ylab="Nucleotide frequency (%)",
                xlim=c(0,xn)+0.5, xaxs="i", ylim=c(0,ym), lwd=2, lty=1, pch=20, cex=0.6, col=mycols)
        abline(h=0,lty=2,col="gray")
        cxy <- par('cxy')
        text(x=par('usr')[1]+cxy[1]/4, y=par('usr')[4]-cxy[2]/4, adj=c(0,1),
             labels=truncStringToPlotWidth(names(nfreqL)[i], diff(par('usr')[1:2])-1.5*cxy[1]-strwidth(paste(rownames(nfreqL[[i]]), collapse=" "))))
        text(x=par('usr')[2]-cxy[1]/4-(4:0)*cxy[1]*0.8, y=par('usr')[4]-cxy[2]/4, adj=c(1,1), col=mycols, labels=rownames(nfreqL[[i]]))
    }

    invisible(nfreqL)
}

plotDuplicated <- function(qcdata, breaks=c(1:10), lmat=matrix(1:6, nrow=3, byrow=TRUE)) {
    if(breaks[length(breaks)]<Inf)
        breaks <- c(breaks,breaks[length(breaks)]+1,Inf)
    breakNames <- c(as.character(breaks[1:(length(breaks)-2)]),paste(">",breaks[length(breaks)-2],sep=""))
    data <- qcdata[['sequenceDistribution']]
    nocc <- by(list(data$nOccurrences, data$nReads), list(data$lane),
               function(x) Rle(values=x[[1]], lengths=x[[2]]), simplify=FALSE)
    ns <- nrow(nocc)

    nbin <- length(breaks)-1
    bocc <- do.call(rbind, lapply(1:ns, function(i) {
        bin <- findInterval(runValue(nocc[[i]]), breaks)
        tmp <- tapply(runLength(nocc[[i]]),bin,sum) /length(nocc[[i]]) *100
        tmp2 <- numeric(nbin)
        tmp2[ as.integer(names(tmp)) ] <- tmp
        tmp2
    }))
    dimnames(bocc) <- list(sampleName=rownames(nocc), duplicationLevel=breakNames)

    layout(lmat)
    for(i in 1:ns) {
        nm <- rownames(nocc)[i]
        fs <- qcdata[['frequentSequences']]
        fs <- fs[ fs$lane==nm, ]
        xn <- length(breaks)-1
        ym <- max(50,ceiling(max(bocc[i,]) /5) *5)
        par(mar=c(5-1,4-1,4-4,2-1)+.1, mgp=c(3-1,1-0.25,0))
        plot(1:xn, bocc[i,], type="o", xlab="Sequence duplication level", ylab="Percent of unique sequences",
             xlim=c(0,xn)+0.5, xaxs="i", ylim=c(0,ym), lwd=2, lty=1, pch=20, cex=0.6, axes=FALSE,
             panel.first=abline(h=0, lty=2, col="gray"))
        axis(1, at=1:xn, labels=breakNames)
        axis(2)
        box()
        frqseqcex <- 0.8
        frqseqS <- sprintf("%-*s",max(nchar(as.character(fs[,'sequence']))),fs[,'sequence'])
        frqseqF <- sprintf("(%6.0f)",fs[,'count']/sum(runValue(nocc[[i]])*as.numeric(runLength(nocc[[i]])))*1e6)
        frqseqJ <- " "
        frqseqW <- max(nchar(as.character(fs[,'sequence'])))
        xleft <- par('usr')[2] - max(strwidth(paste(frqseqS," ",frqseqF,sep=""), cex=frqseqcex, family="mono"))
        while(xleft < 1 && frqseqW > 7) {
            frqseqJ <- ".. "
            frqseqW <- frqseqW - 2
            frqseqS <- strtrim(frqseqS,frqseqW)
            xleft <- par('usr')[2] - max(strwidth(paste(frqseqS,frqseqJ,frqseqF,sep=""), cex=frqseqcex, family="mono"))
        }
        if(xleft >= 1 && frqseqW > 5) {
            cxy <- par('cxy')
            ytop <- par('usr')[4] - 2.0*cxy[2]
            yoff <- ytop - 1.8*cumsum(strheight(frqseqS, cex=frqseqcex, family="mono"))
            ii <- yoff+diff(yoff[1:2]) > max(bocc[i, 1:xn > floor(xleft)])
            if(any(ii)) {
                text(x=xleft, y=ytop,     adj=c(0,0),
                     labels=paste(truncStringToPlotWidth(nm,par("usr")[2]-xleft-cxy[1]/2),"frequent sequences (per Mio.):",sep="\n"))
                text(x=xleft, y=yoff[ii], adj=c(0,1),
                     labels=paste(frqseqS,frqseqJ,frqseqF,sep="")[ii], family="mono", cex=frqseqcex)
            } else {
                text(x=xleft, y=ytop,     adj=c(0,0),
                     labels=truncStringToPlotWidth(nm,par("usr")[2]-xleft-cxy[1]/2))
            }
        }
    }

    invisible(bocc)
}

plotMappings <- function(mapdata, cols=c("#006D2C","#E41A1C"), a4layout=TRUE) {
    nr <- nrow(mapdata)
    
    # set page layout
    if(a4layout)
        layout(rbind(c(0,1),c(0,2),c(0,0)), widths=c(2,3), heights=c(2, nr+2, max(25-nr,0.5)))
    else
        layout(rbind(c(0,1),c(0,2)), widths=c(2,3), heights=c(2, nr+2))

    lapply(seq(1, nr, by=29), function(i) {
        mapdataChunk <- mapdata[min(nr,i+29-1):i, , drop=FALSE]

        if(a4layout && nr>29 && nrow(mapdataChunk)<29)
            mapdataChunk <- rbind(mapdataChunk, matrix(NA, ncol=2, nrow=29-nrow(mapdataChunk)))
        
        # draw legend
        par(mar=c(0,1,0,3)+.1)
        plot(0:1, 0:1, type="n", xlab="", ylab="", axes=FALSE)
        legend(x="center", xjust=.5, yjust=.5, bty='n', x.intersp=0.25,
               fill=cols, ncol=length(cols), legend=colnames(mapdataChunk), xpd=NA)

        # draw bars
        par(mar=c(5,1,0,3)+.1)
        ymax <- nrow(mapdataChunk)*1.25
        mp <- barplot(t(mapdataChunk/rowSums(mapdataChunk))*100, horiz=TRUE, beside=FALSE, col=cols, border=NA,
                      ylim=c(0,ymax), names.arg=rep("",nrow(mapdataChunk)),
                      main='', xlab='Percent of sequences', ylab='', xpd=NA)

        # draw bar annotation
        cxy <- par('cxy')
        text(x=rep(par('usr')[1]+cxy[1]/3, nrow(mapdataChunk)), y=mp, col="white", adj=c(0,0.5),
             labels=sprintf("%.1f%%",mapdataChunk[,'mapped']/rowSums(mapdataChunk)*100), xpd=NA)
        text(x=rep(mean(par('usr')[1:2]), nrow(mapdataChunk)), y=mp, col="white", adj=c(0.5,0.5),
             labels=sprintf("total=%.3g",mapdataChunk[,'mapped']+mapdataChunk[,'unmapped']), xpd=NA)
        text(x=rep(par('usr')[2]-cxy[1]/5, nrow(mapdataChunk)), y=mp, col="white", adj=c(1,0.5),
             labels=sprintf("%.1f%%",mapdataChunk[,'unmapped']/rowSums(mapdataChunk)*100), xpd=NA)

        # draw sample names
        text(x=par('usr')[1] - 1.0*cxy[1], y=mp, col="black", adj=c(1,0.5),
             labels=truncStringToPlotWidth(rownames(mapdataChunk), ((diff(par("usr")[1:2]) + 4*par("cxy")[1]) /3 *2) - 3*par("cxy")[1]), xpd=NA)
    })

    invisible(mapdata)
}

plotUniqueness <- function(data, cols=c("#ff8c00","#4682b4"), a4layout=TRUE) {
    data <- do.call(rbind, data)
    nr <- nrow(data)
 
    data[,2] <- data[,2] - data[,1]
    colnames(data) <- c("unique","non-unique")
    
    # set page layout
    if(a4layout)
        layout(rbind(c(0,1),c(0,2),c(0,0)), widths=c(2,3), heights=c(2, nr+2, max(25-nr,0.5)))
    else
        layout(rbind(c(0,1),c(0,2)), widths=c(2,3), heights=c(2, nr+2))
    
    lapply(seq(1, nr, by=29), function(i) {
        dataChunk <- data[min(nr,i+29-1):i, , drop=FALSE]

        if(a4layout && nr>29 && nrow(dataChunk)<29)
            dataChunk <- rbind(dataChunk, matrix(NA, ncol=2, nrow=29-nrow(dataChunk)))

        # draw legend
        par(mar=c(0,1,0,3)+.1)
        plot(0:1, 0:1, type="n", xlab="", ylab="", axes=FALSE)
        legend(x="center", xjust=.5, yjust=.5, bty='n', x.intersp=0.25,
               fill=cols, ncol=length(cols), legend=colnames(dataChunk), xpd=NA)
    
        # draw bars
        par(mar=c(5,1,0,3)+.1)
        ymax <- nrow(dataChunk)*1.25
        mp <- barplot(t(dataChunk/rowSums(dataChunk))*100, horiz=TRUE, beside=FALSE, col=cols, border=NA,
                      ylim=c(0,ymax), names.arg=rep("",nrow(dataChunk)),
                      main='', xlab='Percent of unique alignment positions', ylab='', xpd=NA)

        # draw bar annotation
        cxy <- par('cxy')
        text(x=rep(par('usr')[1]+cxy[1]/3, nrow(dataChunk)), y=mp, col="white", adj=c(0,0.5),
             labels=sprintf("%.1f%%",dataChunk[,'unique']/rowSums(dataChunk)*100), xpd=NA)
        text(x=rep(mean(par('usr')[1:2]), nrow(dataChunk)), y=mp, col="white", adj=c(0.5,0.5),
             labels=sprintf("total=%.3g",dataChunk[,'unique']+dataChunk[,'non-unique']), xpd=NA)
        text(x=rep(par('usr')[2]-cxy[1]/5, nrow(dataChunk)), y=mp, col="white", adj=c(1,0.5),
             labels=sprintf("%.1f%%",dataChunk[,'non-unique']/rowSums(dataChunk)*100), xpd=NA)
    
        # draw sample names
        text(x=par('usr')[1] - 1.0*cxy[1], y=mp, col="black", adj=c(1,0.5),
             labels=truncStringToPlotWidth(rownames(dataChunk), ((diff(par("usr")[1:2]) + 4*par("cxy")[1]) /3 *2) - 3*par("cxy")[1]), xpd=NA)
    })
    
    invisible(data)
}

plotErrorsByCycle <- function(data, lmat=matrix(1:12, nrow=6, byrow=TRUE)) {

    ns <- length(data)
    layout(lmat)
    for(i in 1:ns) {
        xn <- dim(data[[i]])[3]
        cumcvg <- unlist(lapply(1:xn, function(j){ sum(data[[i]][,,j])}))
        nErr <- cumcvg - unlist(lapply(1:xn, function(j){ sum(diag(data[[i]][,,j]))}))
        frq <- nErr / cumcvg * 100
        ym <- max(5, ceiling(max(frq)), na.rm=TRUE)
        par(mar=c(5-1,4-1,4-4,2-1)+.1, mgp=c(3-1,1-0.25,0))
        plot(1:xn, frq, type='l', lwd=2, col="#E41A1C", ylim=c(0,ym), xaxs="i", xlim=c(0,xn)+.5, lty=1, pch=20, cex=0.6,
             main="", xlab='Position in read (bp)', ylab='Mismatche bases (%)')
        abline(h=0, lty=2, col='gray')
        #abline(v=c(12,25), lty=3, col='red')
        cxy <- par('cxy')
        text(x=par('usr')[1]+cxy[1]/4, y=par('usr')[4]-cxy[2]/4, adj=c(0,1),
             labels=truncStringToPlotWidth(names(data)[i], diff(par("usr")[1:2]) - cxy[1]/2))
        if(all(is.na(data[[i]])))
            text(x=mean(par('usr')[1:2]), y=mean(par('usr')[3:4]), adj=c(0.5,0.5), labels="no data")
    }

    invisible(data)
}

plotMismatchTypes <- function(data, lmat=matrix(1:12, nrow=6, byrow=TRUE)) {
    ns <- length(data)
    layout(lmat)
    mycols <- c("#5050ff","#e00000","#00c000","#e6e600","darkgray")
    for(i in 1:ns) {
        mtypes <- apply(data[[i]],1,rowSums)
        pmtypes <- mtypes *100 /sum(mtypes)
        diag(pmtypes) <- 0 # don't plot matches
        pmtypes <- pmtypes[,colnames(pmtypes)!="N"] # don't plot genomic N

        barplot(pmtypes, ylab="% of aligned bases", xlab="Genome", col=mycols,
                ylim=c(0,max(colSums(pmtypes),0.1,na.rm=TRUE)*1.16))
        if(all(is.na(mtypes)))
            text(x=mean(par('usr')[1:2]), y=mean(par('usr')[3:4]), adj=c(0.5,0.5), labels="no data")
        box()
        cxy <- par("cxy")
        text(x=par('usr')[2]-cxy[1]/4-(4:0)*cxy[1]*0.8, y=par('usr')[4]-cxy[2]/4,
             adj=c(1,1), col=mycols, labels=rownames(pmtypes))
        text(x=par('usr')[2]-cxy[1]/4-5*cxy[1]*0.8, y=par('usr')[4]-cxy[2]/4,
             col="black", labels="Read:", adj=c(1,1))
        text(x=par('usr')[1]+cxy[1]/4, y=par('usr')[4]-cxy[2]/4, adj=c(0,1), col="black",
             labels=truncStringToPlotWidth(names(data)[i], par('usr')[2]-cxy[1]/4-5*cxy[1]*0.8-strwidth("Read:")-cxy[1]*1.0))
    }

    invisible(data)
}

plotFragmentDistribution <- function(data, lmat=matrix(1:12, nrow=6, byrow=TRUE)) {
    ns <- length(data)
    frag <- do.call(cbind, data)
    xn <- dim(frag)[1]
    # trim vector
    if(sum(frag[xn,], na.rm=TRUE) == 0L){
        xn <- max(as.integer(rownames(frag)[rowSums(frag, na.rm=TRUE) > 0]), 100)
        frag <- frag[1:xn,, drop=FALSE]
    } 
      
    layout(lmat)
    for(i in 1:ns) {
        dens <- frag[,i]/sum(frag[,i])
        ym <- max(dens, 0.01, na.rm=TRUE)
        par(mar=c(5-1,4-1,4-4,2-1)+.1, mgp=c(3-1,1-0.25,0))

        #plot(dens, type='l', lwd=2, col="#377EB8", ylim=c(0,ym), xaxs="i", xlim=c(0,xn)+.5, lty=1, pch=20, cex=0.6,
        #     main="", xlab='Fragment size (nt)', ylab='Density')
        plot(dens, type='n', ylim=c(0,ym), xaxs="i", xlim=c(0,xn)+.5, lty=1, pch=20, cex=0.6,
             main="", xlab='Fragment size (nt)', ylab='Density', xpd=NA)
        rect(xleft=1:xn-.5, ybottom=0, xright=1:xn+.5, ytop=dens, col="#377EB8", border=NA)
        abline(h=0, lty=2, col='gray')
        if(any(ii <- grepl("^>",names(dens))))
            axis(side=1, at=xn, labels=names(dens)[xn])
        cxy <- par('cxy')
        text(x=par('usr')[1]+cxy[1]/4, y=par('usr')[4]-cxy[2]/4, adj=c(0,1),
             labels=truncStringToPlotWidth(names(data)[i], diff(par("usr")[1:2]) - cxy[1]/2))
        medlen <- median(Rle(values=1:xn, lengths=frag[,i]))
        text(x=par('usr')[1]+cxy[1]/4, y=par('usr')[4]-5*cxy[2]/4, adj=c(0,1), col="#377EB8", labels=sprintf("median = %1.f",medlen))
        abline(v=medlen, lty=3, col="black")
        if(all(is.na(data[[i]])))
            text(x=mean(par('usr')[1:2]), y=mean(par('usr')[3:4]), adj=c(0.5,0.5), labels="no data")
    }
    
    invisible(frag)
}

.qa_ShortRead <-
    function(dirPath, lane, ..., verbose=FALSE)
{
    if (missing(lane))
        stop("Paramteter 'lane' is missing.")
    obj <- dirPath
    alf <- alphabetFrequency(sread(obj), baseOnly=TRUE, collapse=TRUE)
#     bqtbl <- alphabetFrequency(quality(obj), collapse=TRUE)
#     rqs <- .qa_qdensity(quality(obj))
    freqtbl <- tables(sread(obj))
    abc <- alphabetByCycle(obj)
    names(dimnames(abc)) <- c("base", "cycle")
    dimnames(abc)$cycle <- as.character(1:dim(abc)[2])
    ac <- ShortRead:::.qa_adapterContamination(obj, lane, ...)
    perCycleBaseCall <- data.frame(Cycle = as.integer(colnames(abc)[col(abc)]), 
        Base = factor(rownames(abc)[row(abc)]), Count = as.vector(abc), 
        lane = lane, row.names = NULL)
    perCycleBaseCall <- perCycleBaseCall[perCycleBaseCall$Count != 0, ]
#     perCycleBaseCall <- ShortRead:::.qa_perCycleBaseCall(abc, lane)
#     perCycleQuality <- .qa_perCycleQuality(abc, quality(obj), lane)
    lst <- list(readCounts=data.frame(
           read=length(obj), filter=NA, aligned=NA,
           row.names=lane),
         baseCalls=data.frame(
           A=alf[["A"]], C=alf[["C"]], G=alf[["G"]], T=alf[["T"]],
           N=alf[["other"]], row.names=lane),
          readQualityScore=data.frame(
           quality=NULL, #rqs$x,
           density=NULL, #rqs$y,
           lane=NULL, #lane,
           type=NULL #"read"
           ),
          baseQuality=data.frame(
           score=NULL, #names(bqtbl),
           count=NULL, #as.vector(bqtbl),
           lane=NULL #lane
           ),
         alignQuality=data.frame(
           score=as.numeric(NA),
           count=as.numeric(NA),
           lane=lane, row.names=NULL),
         frequentSequences=data.frame(
           sequence=names(freqtbl$top),
           count=as.integer(freqtbl$top),
           type="read",
           lane=lane),
         sequenceDistribution=cbind(
           freqtbl$distribution,
           type="read",
           lane=lane),
          perCycle=list(
            baseCall=perCycleBaseCall,
            quality=NULL #perCycleQuality
              ),
         perTile=list(
           readCounts=data.frame(
             count=integer(0), type=character(0),
             tile=integer(0), lane=character(0)),
           medianReadQualityScore=data.frame(
             score=integer(), type=character(), tile=integer(),
             lane=integer(), row.names=NULL)),
         adapterContamination=ac

         )

    ShortRead:::.ShortReadQQA(lst)
}

setMethod(qa, "ShortRead", .qa_ShortRead)

Try the QuasR package in your browser

Any scripts or data that you put into this service are public.

QuasR documentation built on Nov. 8, 2020, 8:31 p.m.