R/utilities.R

Defines functions checkParam getMotifInfo .qa_ShortRead plotDuplicated plotNuclByCycle plotQualByCycle truncStringToPlotWidth calcQaInformation compressedFileFormat consolidateFileExtensions qQCReport

Documented in getMotifInfo

################################################################################
##                                                                            ##
##                                                                            ##
##            Functions related with qQCReport are from QuasR.                ##
##     We integrate these function in order to support the pipeline           ##
##     in our package. We recommend using qQCReport in QuauR if you           ##
##          want do quality control for your sequencing solely.               ##
##                                                                            ##
##                                                                            ##
################################################################################
#' @importFrom ShortRead FastqSampler
#' @importFrom ShortRead yield
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead sread
#' @importFrom ShortRead alphabetByCycle
#' @importFrom tools file_ext
#' @importFrom IRanges width
#' @importFrom graphics strwidth
#' @importFrom BiocGenerics which
#' @importFrom S4Vectors runValue
#' @importFrom S4Vectors runLength
#' @importFrom Biostrings alphabetFrequency
#' @importFrom graphics layout
#' @importFrom ShortRead tables
#' @importFrom graphics par
#' @importFrom graphics box
#' @importFrom graphics strheight
#' @importFrom graphics text
#' @importFrom graphics matplot
#' @importFrom graphics rect
#' @importFrom grDevices dev.new
qQCReport <- function(input, pdfFilename=NULL, chunkSize=1e6L, useSampleNames=FALSE, ...){
    if(is.character(input)){
        filetype <- unique(consolidateFileExtensions(input, compressed=TRUE))
        if(length(filetype) > 1L)
            stop("parameter 'input' must consist of unique filetype")
        if(useSampleNames && !is.null(names(input))) {
            label <- sprintf("%i. %s", 1:length(input), names(input))
        } else {
            label <- sprintf("%i. %s", 1:length(input), basename(input))
        }
        mapLabel <- label
        if(all(file.exists(input))){
            if(filetype != "bam"){
                readFilename <- input
                alnFilename <- NULL
                genome <- NULL
            } else {
                readFilename <- input
                alnFilename <- input
                genome <- NULL
            }
        } else {
            stop("could not find the files '", paste(input[!file.exists(input)], collapse=" "), "'")
        }
    } else {
        stop("'input' must be filenames")
    }

    qc1L <- mapply(FUN = calcQaInformation, filename = as.list(readFilename),
                   label = as.list(label), MoreArgs = list(filetype=filetype, chunkSize=chunkSize))

    qa <- do.call(rbind, qc1L)

    # open/close pdf stream
    if(!is.null(pdfFilename)){
        pdf(pdfFilename, paper="default", onefile=TRUE, width=0, height=0)
        on.exit(dev.off())
    }

    unique <- NULL
    mm <- NULL
    frag <- NULL

    message("creating QC plots")
    plotdata <- list(raw=list(qa=qa, mm=mm, frag=frag, unique=unique, mapdata=NULL))
    if(!is.null(qa)){
        if(filetype == "fastq" || filetype == "bam"){
            if(is.null(pdfFilename))
                dev.new()
            plotdata[['qualByCycle']] <- plotQualByCycle(qa)
        }

        if(is.null(pdfFilename))
            dev.new()
        plotdata[['nuclByCycle']] <- plotNuclByCycle(qa)

        if(is.null(pdfFilename))
            dev.new()
        plotdata[['duplicated']] <- plotDuplicated(qa)

    } else if(!is.null(mm)){
        if(is.null(pdfFilename))
            dev.new()
        plotdata[['nuclByCycle']] <- plotNuclByCycle(mm)
    }

    invisible(plotdata)
}

consolidateFileExtensions <- function(filename, compressed=FALSE) {
    # convert various sequence file extension to one representative
    if(compressed)
        fileExtension <- tolower(tools::file_ext(sub("[.](gz|bz2|xz)$","",filename)))
    else
        fileExtension <- tolower(tools::file_ext(filename))
    fileExtension[ fileExtension %in% c("fa", "fna", "fasta") ] <- "fasta"
    fileExtension[ fileExtension %in% c("fq", "fastq") ] <- "fastq"
    return(fileExtension)
}

compressedFileFormat <- function(filename) {
    ifelse(grepl("[.](gz|bz2|xz)$",filename),
           c("gz"="gzip", "bz2"="bzip2", "xz"="xz")[tools::file_ext(filename)],
           "none")
}

calcQaInformation <- function(filename, label, filetype, chunkSize){
    if(any(filetype == "fasta") && any(compressedFileFormat(filename) != "none")){
        qa <- NULL
        warning("compressed 'fasta' input is not yet supported")
    }else{
        reads <- switch(as.character(filetype),
                        fastq = {
                            f <- ShortRead::FastqSampler(filename, n=chunkSize)
                            reads <- ShortRead::yield(f)
                            close(f)
                            reads[IRanges::width(reads)>0]
                        },
                        fasta = {
                            reads <- ShortRead::readFasta(as.character(filename), nrec=chunkSize)
                            reads[IRanges::width(reads)>0]
                        }
                        # bam are not in consideration
                        # bam = {
                        #     bf <- Rsamtools::BamFile(filename, yieldSize=1e6)
                        #     myyield <- function(x) {
                        #         tmp <- Rsamtools::scanBam(x, param=ScanBamParam(what=c("seq", "qual", "strand")))[[1]]
                        #         minusStrand <- !is.na(tmp$strand) & tmp$strand == "-"
                        #         ShortRead::ShortReadQ(sread=c(tmp$seq[!minusStrand],Biostrings::reverseComplement(tmp$seq[minusStrand])),
                        #                               quality=c(tmp$qual[!minusStrand],Biostrings::reverse(tmp$qual[minusStrand])))
                        #     }
                        #     reads <- reduceByYield(X=bf, YIELD=myyield, MAP=identity,
                        #                            REDUCE=REDUCEsampler(sampleSize=chunkSize, verbose=FALSE),
                        #                            parallel=FALSE)
                        #     reads[width(reads)>0]
                        # }
        )
    }
    return(qa(reads, label))
}

truncStringToPlotWidth <- function(s, plotwidth) {
    sw <- graphics::strwidth(s)
    if(any(sw > plotwidth)) {
        w <- 10 # number of character to replace with ".."
        l <- nchar(s)
        news <- s
        i <- sw > plotwidth
        while(w < l && any(i)) {
            news <- ifelse(i, paste(substr(s, 1, ceiling((l-w)/2)+5), substr(s, floor((l+w)/2)+5, l), sep="..."), news)
            sw <- graphics::strwidth(news)
            i <- sw > plotwidth
            w <- w + 2
        }
        return(news)
    } else {
        return(s)
    }
}

plotQualByCycle <- function(qcdata, lmat=matrix(1:12, nrow=6, byrow=TRUE)) {
    data <- qcdata[['perCycle']][['quality']]
    qtiles <- by(list(data$Score, data$Count), list(data$lane, data$Cycle), function(x) {
        coef <- 1.5
        scoreRle <- Rle(x[[1]], x[[2]])
        n <- length(scoreRle)
        nna <- !is.na(scoreRle)
        stats <- c(min(scoreRle), quantile(scoreRle, c(0.25, 0.5, 0.75)), max(scoreRle))
        iqr <- diff(stats[c(2,4)])
        out <- if (!is.na(iqr)) {
            scoreRle < (stats[2L] - coef * iqr) | scoreRle > (stats[4L] + coef * iqr)
        } else !is.finite(scoreRle)
        if (any(out[nna], na.rm = TRUE))
            stats[c(1, 5)] <- range(scoreRle[!out], na.rm = TRUE)
        conf <- stats[3L] + c(-1.58, 1.58) * iqr/sqrt(n)
        list(stats=stats, n=n, conf=conf, out=BiocGenerics::which(out))
    }, simplify=FALSE)
    ns <- nrow(qtiles)

    qtilesL <- lapply(1:ns, function(i) {
        tmpconf <- do.call(cbind,lapply(qtiles[i,], "[[", 'conf'))
        tmpxn <- ncol(tmpconf)
        list(stats=do.call(cbind,lapply(qtiles[i,][1:tmpxn], "[[", 'stats')),
             n=sapply(qtiles[i,][1:tmpxn], "[[", 'n'),
             conf=tmpconf,
             out=numeric(0), #sapply(qtiles[i,][1:tmpxn], "[[", 'out'),
             group=numeric(0), #rep(1:ncol(qtiles), sapply(qtiles[i,], function(x) length(x$out))),
             names=colnames(tmpconf))
    })
    names(qtilesL) <- rownames(qtiles)

    graphics::layout(lmat)
    for(i in 1:ns) {
        xn <- length(qtilesL[[i]]$names)
        ym <- max(35,max(qtilesL[[i]]$stats))
        par(mar=c(5-1,4-1,4-4,2-1)+.1, mgp=c(3-1,1-0.25,0))
        plot(0:1,0:1,type="n", xlab="Position in read (bp)", ylab="Quality score", xlim=c(0,xn)+0.5, xaxs="i", ylim=c(0,ym))
        rect(xleft=seq.int(xn)-0.5, ybottom=-10, xright=seq.int(xn)+0.5, ytop=20,    col=c("#e6afaf","#e6c3c3"), border=NA)
        rect(xleft=seq.int(xn)-0.5, ybottom=20,  xright=seq.int(xn)+0.5, ytop=28,    col=c("#e6d7af","#e6dcc3"), border=NA)
        rect(xleft=seq.int(xn)-0.5, ybottom=28,  xright=seq.int(xn)+0.5, ytop=ym+10, col=c("#afe6af","#c3e6c3"), border=NA)
        do.call("bxp", c(list(qtilesL[[i]], notch=FALSE, width=NULL, varwidth=FALSE, log="", border=par('fg'),
                              pars=list(boxwex=0.8, staplewex=0.5,  outwex=0.5, boxfill="#99999944"),
                              outline=FALSE, horizontal=FALSE, add=TRUE, at=1:xn, axes=FALSE)))
        cxy <- par('cxy')
        text(x=par('usr')[1]+cxy[1]/4, y=par('usr')[3]+cxy[2]/4, adj=c(0,0),
             labels=truncStringToPlotWidth(rownames(qtiles)[i], diff(par("usr")[1:2]) - cxy[1]/2))
        box()
    }

    invisible(qtilesL)
}

plotNuclByCycle <- function(qcdata, lmat=matrix(1:12, nrow=6, byrow=TRUE)) {
    if(!is.null(qcdata[['perCycle']])){
        data <- qcdata[['perCycle']][['baseCall']]
        nfreq <- by(list(data$Base, data$Count), list(data$lane, data$Cycle), function(x) {
            y <- x[[2]] /sum(x[[2]]) *100
            names(y) <- x[[1]]
            y
        }, simplify=TRUE)
        ns <- nrow(nfreq)

        nfreqL <- lapply(1:ns, function(i) {
            tmp <- do.call(cbind, lapply(nfreq[i,], function(x) x[c('A','C','G','T','N')]))
            tmp[is.na(tmp)] <- 0
            rownames(tmp) <- c('A','C','G','T','N')
            tmp
        })
        names(nfreqL) <- rownames(nfreq)
    } else {
        nfreqL <- lapply(qcdata, function(x){
            nfreq <- do.call(rbind, lapply(1:dim(x)[3], function(j){
                c(A=sum(x[,"A",j]),
                  C=sum(x[,"C",j]),
                  G=sum(x[,"G",j]),
                  T=sum(x[,"T",j]),
                  N=sum(x[,"N",j]))
            }))
            rownames(nfreq) <- 1:dim(x)[3]
            t(nfreq / rowSums(nfreq) * 100)
        })
        ns <- length(nfreqL)
        #         names(nfreqL) <- names(qcdata)
    }
    nfreqL[is.na.data.frame(nfreqL)] <- 0

    mycols <- c("#5050ff","#e00000","#00c000","#e6e600","darkgray")
    graphics::layout(lmat)
    for(i in 1:ns) {
        xn <- ncol(nfreqL[[i]])
        ym <- max(50,ceiling(max(nfreqL[[i]], na.rm = TRUE) /5) *5 +5)
        par(mar=c(5-1,4-1,4-4,2-1)+.1, mgp=c(3-1,1-0.25,0))
        matplot(1:xn, t(nfreqL[[i]]), type="o", xlab="Position in read (bp)", ylab="Nucleotide frequency (%)",
                xlim=c(0,xn)+0.5, xaxs="i", ylim=c(0,ym), lwd=2, lty=1, pch=20, cex=0.6, col=mycols)
        abline(h=0,lty=2,col="gray")
        cxy <- par('cxy')
        text(x=par('usr')[1]+cxy[1]/4, y=par('usr')[4]-cxy[2]/4, adj=c(0,1),
             labels=truncStringToPlotWidth(names(nfreqL)[i], diff(par('usr')[1:2])-1.5*cxy[1]-strwidth(paste(rownames(nfreqL[[i]]), collapse=" "))))
        text(x=par('usr')[2]-cxy[1]/4-(4:0)*cxy[1]*0.8, y=par('usr')[4]-cxy[2]/4, adj=c(1,1), col=mycols, labels=rownames(nfreqL[[i]]))
    }

    invisible(nfreqL)
}

plotDuplicated <- function(qcdata, breaks=c(1:10), lmat=matrix(1:6, nrow=3, byrow=TRUE)) {
    if(breaks[length(breaks)]<Inf)
        breaks <- c(breaks,breaks[length(breaks)]+1,Inf)
    breakNames <- c(as.character(breaks[1:(length(breaks)-2)]),paste(">",breaks[length(breaks)-2],sep=""))
    data <- qcdata[['sequenceDistribution']]
    nocc <- by(list(data$nOccurrences, data$nReads), list(data$lane),
               function(x) Rle(values=x[[1]], lengths=x[[2]]), simplify=FALSE)
    ns <- nrow(nocc)

    nbin <- length(breaks)-1
    bocc <- do.call(rbind, lapply(1:ns, function(i) {
        bin <- findInterval(S4Vectors::runValue(nocc[[i]]), breaks)
        tmp <- tapply(S4Vectors::runLength(nocc[[i]]),bin,sum) /length(nocc[[i]]) *100
        tmp2 <- numeric(nbin)
        tmp2[ as.integer(names(tmp)) ] <- tmp
        tmp2
    }))
    dimnames(bocc) <- list(sampleName=rownames(nocc), duplicationLevel=breakNames)

    graphics::layout(lmat)
    for(i in 1:ns) {
        nm <- rownames(nocc)[i]
        fs <- qcdata[['frequentSequences']]
        fs <- fs[ fs$lane==nm, ]
        xn <- length(breaks)-1
        ym <- max(50,ceiling(max(bocc[i,]) /5) *5)
        graphics::par(mar=c(5-1,4-1,4-4,2-1)+.1, mgp=c(3-1,1-0.25,0))
        plot(1:xn, bocc[i,], type="o", xlab="Sequence duplication level", ylab="Percent of unique sequences",
             xlim=c(0,xn)+0.5, xaxs="i", ylim=c(0,ym), lwd=2, lty=1, pch=20, cex=0.6, axes=FALSE,
             panel.first=abline(h=0, lty=2, col="gray"))
        axis(1, at=1:xn, labels=breakNames)
        axis(2)
        box()
        frqseqcex <- 0.8
        frqseqS <- sprintf("%-*s",max(nchar(as.character(fs[,'sequence']))),fs[,'sequence'])
        frqseqF <- sprintf("(%6.0f)",fs[,'count']/sum(runValue(nocc[[i]])*as.numeric(runLength(nocc[[i]])))*1e6)
        frqseqJ <- " "
        frqseqW <- max(nchar(as.character(fs[,'sequence'])))
        xleft <- par('usr')[2] - max(strwidth(paste(frqseqS," ",frqseqF,sep=""), cex=frqseqcex, family="mono"))
        while(xleft < 1 && frqseqW > 7) {
            frqseqJ <- ".. "
            frqseqW <- frqseqW - 2
            frqseqS <- strtrim(frqseqS,frqseqW)
            xleft <- par('usr')[2] - max(strwidth(paste(frqseqS,frqseqJ,frqseqF,sep=""), cex=frqseqcex, family="mono"))
        }
        if(xleft >= 1 && frqseqW > 5) {
            cxy <- par('cxy')
            ytop <- par('usr')[4] - 2.0*cxy[2]
            yoff <- ytop - 1.8*cumsum(strheight(frqseqS, cex=frqseqcex, family="mono"))
            ii <- yoff+diff(yoff[1:2]) > max(bocc[i, 1:xn > floor(xleft)])
            if(any(ii)) {
                text(x=xleft, y=ytop,     adj=c(0,0),
                     labels=paste(truncStringToPlotWidth(nm,par("usr")[2]-xleft-cxy[1]/2),"frequent sequences (per Mio.):",sep="\n"))
                text(x=xleft, y=yoff[ii], adj=c(0,1),
                     labels=paste(frqseqS,frqseqJ,frqseqF,sep="")[ii], family="mono", cex=frqseqcex)
            } else {
                text(x=xleft, y=ytop,     adj=c(0,0),
                     labels=truncStringToPlotWidth(nm,par("usr")[2]-xleft-cxy[1]/2))
            }
        }
    }

    invisible(bocc)
}

.qa_ShortRead <- function(dirPath, lane, ..., verbose=FALSE) {
    if (missing(lane))
        stop("Paramteter 'lane' is missing.")
    obj <- dirPath
    alf <- Biostrings::alphabetFrequency(ShortRead::sread(obj), baseOnly=TRUE, collapse=TRUE)
    #     bqtbl <- alphabetFrequency(quality(obj), collapse=TRUE)
    #     rqs <- .qa_qdensity(quality(obj))
    freqtbl <- ShortRead::tables(ShortRead::sread(obj))
    abc <- ShortRead::alphabetByCycle(obj)
    names(dimnames(abc)) <- c("base", "cycle")
    dimnames(abc)$cycle <- as.character(1:dim(abc)[2])
    ac <- ShortRead:::.qa_adapterContamination(obj, lane, ...)
    perCycleBaseCall <- data.frame(Cycle = as.integer(colnames(abc)[col(abc)]),
                                   Base = factor(rownames(abc)[row(abc)]), Count = as.vector(abc),
                                   lane = lane, row.names = NULL)
    perCycleBaseCall <- perCycleBaseCall[perCycleBaseCall$Count != 0, ]
    #     perCycleBaseCall <- ShortRead:::.qa_perCycleBaseCall(abc, lane)
    #     perCycleQuality <- .qa_perCycleQuality(abc, quality(obj), lane)
    lst <- list(readCounts=data.frame(
        read=length(obj), filter=NA, aligned=NA,
        row.names=lane),
        baseCalls=data.frame(
            A=alf[["A"]], C=alf[["C"]], G=alf[["G"]], T=alf[["T"]],
            N=alf[["other"]], row.names=lane),
        readQualityScore=data.frame(
            quality=NULL, #rqs$x,
            density=NULL, #rqs$y,
            lane=NULL, #lane,
            type=NULL #"read"
        ),
        baseQuality=data.frame(
            score=NULL, #names(bqtbl),
            count=NULL, #as.vector(bqtbl),
            lane=NULL #lane
        ),
        alignQuality=data.frame(
            score=as.numeric(NA),
            count=as.numeric(NA),
            lane=lane, row.names=NULL),
        frequentSequences=data.frame(
            sequence=names(freqtbl$top),
            count=as.integer(freqtbl$top),
            type="read",
            lane=lane),
        sequenceDistribution=cbind(
            freqtbl$distribution,
            type="read",
            lane=lane),
        perCycle=list(
            baseCall=perCycleBaseCall,
            quality=NULL #perCycleQuality
        ),
        perTile=list(
            readCounts=data.frame(
                count=integer(0), type=character(0),
                tile=integer(0), lane=character(0)),
            medianReadQualityScore=data.frame(
                score=integer(), type=character(), tile=integer(),
                lane=integer(), row.names=NULL)),
        adapterContamination=ac

    )

    ShortRead:::.ShortReadQQA(lst)
}

setMethod(qa, "ShortRead", .qa_ShortRead)



############################get PWM matrix from user's file#####################
#' @name getMotifInfo
#' @title Generate PFMatrix or PFMatrixList from file.
#' @description
#' atacMotifScan and atacMotifScanPair accept PFM in a \code{list}, this
#' function convert JASPAR PFM file to \code{\link{PFMatrix}} or \code{\link{PFMatrixList}}.
#' @param motif.file Motif PFM file downloaded from JASPAR.
#' @details Generate \code{\link{PFMatrix}} or \code{\link{PFMatrixList}}.
#' @return \code{\link{PFMatrix}} or \code{\link{PFMatrixList}}.
#' @author Wei Zhang
#' @importFrom TFBSTools PFMatrix
#' @importFrom TFBSTools as.matrix
#' @importFrom TFBSTools PFMatrixList
#' @examples
#'
#' motif_file <- system.file("extdata", "CustomizedMotif.txt", package="esATAC")
#' pfm <- getMotifInfo(motif.file = motif_file)
#'
#' @export
getMotifInfo <- function(motif.file = NULL){
    PWMList <- list()
    name.flag <- FALSE
    A.flag <- FALSE
    C.flag <- FALSE
    G.flag <- FALSE
    T.flag <- FALSE

    con <- file(motif.file, "r")
    line <- readLines(con, n = 1)
    while( length(line) != 0 ){

        if(substring(line, 1, 1) == ">"){
            name_str <- unlist(strsplit(x = sub(pattern = ">", replacement = "", x = line), split = "\\s+"))
            motif_name <- tail(name_str, n = 1)
            name.flag <- TRUE
        }else if(substring(line, 1, 1) == "A"){
            A_str_num <- regmatches(line, gregexpr("[[:digit:]]+", line))
            A_num <- as.numeric(unlist(A_str_num))
            A.flag <- TRUE
        }else if(substring(line, 1, 1) == "C"){
            C_str_num <- regmatches(line, gregexpr("[[:digit:]]+", line))
            C_num <- as.numeric(unlist(C_str_num))
            C.flag <- TRUE
        }else if(substring(line, 1, 1) == "G"){
            G_str_num <- regmatches(line, gregexpr("[[:digit:]]+", line))
            G_num <- as.numeric(unlist(G_str_num))
            G.flag <- TRUE
        }else if(substring(line, 1, 1) == "T"){
            T_str_num <- regmatches(line, gregexpr("[[:digit:]]+", line))
            T_num <- as.numeric(unlist(T_str_num))
            T.flag <- TRUE
        }

        if(name.flag & A.flag & C.flag & G.flag & T.flag){
            p_matrix <- matrix(data = c(A_num, C_num, G_num, T_num), nrow = 4,
                               byrow = TRUE,  dimnames=list(c("A", "C", "G", "T")))

            p_matrix <- TFBSTools::PFMatrix(profileMatrix = p_matrix)

            PWMList[[motif_name]] <- p_matrix

            name.flag <- FALSE
            A.flag <- FALSE
            C.flag <- FALSE
            G.flag <- FALSE
            T.flag <- FALSE
        }

        line <- readLines(con, n = 1)
    }
    close(con)

    PWMList <- do.call(PFMatrixList, PWMList)

    return(PWMList)
}

checkParam <- function(paramlist,paramPattern,...){
    rs<-grepl(paramPattern, paramlist)
    if(sum(rs)>0){
        banp=paste(paramlist[rs], collapse = "'/'")
        stop(sprintf("Parameter(s) '%s' are not acceptable in paramList. it should be set as fix parameter.",banp))
    }
}
wzthu/ATACpipe documentation built on Aug. 12, 2022, 7:38 a.m.