R/seqTools.r

Defines functions .onUnload fastqq fastqKmerLocs fastqKmerSubsetLocs pd_l10 listMelt

Documented in fastqKmerLocs fastqKmerSubsetLocs fastqq

## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
##                                                                           ##
##  Project   :   seqTools                                                   ##
##  Created   :   26.August.2013                                             ##
##  Author    :   W. Kaisers                                                 ##
##  Content   :   Doing some diagnostic and interventional tasks on fastq    ##
##                and fasta                                                  ##
##                esp. DNA k-mer counts.                                     ##
##  Version   :   0.99.34                                                    ##
##                                                                           ##
##  Changelog :                                                              ##
##  26.08.13  :   0.0.1 Project created                                      ##
##  03.09.13  :   0.0.6 C-Code valgrind tested                               ##
##  27.09.13  :   0.1.0 Added fastqDnaKmers                                  ##
##  14.10.13  :   0.1.1 Added C-library for parsing gz fasta and fastq files ##
##  17.10.13  :   0.1.3 C-Wrapper for fastq working.                         ##
##  17.10.13  :   0.1.6 First version of R package                           ##
##  21.10.13  :   0.3.0 New C library for fastq parsing                      ##
##  28.10.13  :   0.4.0 Added fastq-loc functions                            ##
##  29.10.13  :   0.4.4 seqTools C-code valgrind tested.                     ##
##  01.11.13  :   0.5.0 Distance matrices implemented                        ##
##  02.11.13  :   0.5.1 First working version with clustering based on       ##
##                      K-mers                                               ##
##  07.11.13  :   0.5.4 countFastaKmers now resizes arrays automatically     ##
##  09.11.13  :   0.6.0 count_fastq now resizes arrays automatically         ##
##  11.11.13  :   0.6.5 Added fastq simulation functions                     ##
##  19.11.13  :   0.7.0 Added trimFastq function                             ##
##  30.11.13  :   0.9.0 Separated R source files                             ##
##  22.12.13  :   0.99.2 Added '['-operator for Fastqq class                 ##
##  19.01.14  :   0.99.7 Added zlib version control for correction of        ##
##                       gzBuffer                                            ##
##                       error                                               ##
##                        + checks: cran win-builder + valgrind              ##
##  21.05.14  :   0.99.34 Corrected error in count_fasta_Kmers               ##
##                        which freezed function                             ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##

.onUnload <- function(libpath) { library.dynam.unload("seqTools", libpath) }

## see: http://bioconductor.org/developers/how-to/coding-style/

## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Data collection functions:
## Fastqq,  fastqKmerLocs,  fastqKmerSubsetLocs
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##

fastqq <- function(filenames, k=6, probeLabel)
{
    k <- as.integer(k)
    
    tl <- list()
    tl$start <- Sys.time()
    filenames <- path.expand(filenames)
    
    res <- .Call("count_fastq", filenames, k, PACKAGE="seqTools")
    
    tl$end <- Sys.time()
    res@collectTime <- tl
    
    # Correct minSeqLen when empty files are counted
    if(any(res@nReads==0))
        res@seqLen[1,res@nReads==0] <- 0
    
    if(!missing(probeLabel))
    {
        if(length(probeLabel) == res@nFiles)
            res@probeLabel <- as.character(probeLabel) 
        else{
            warning("[Fastqq] probeLabel and filenames must have equal length.")
            res@probeLabel <- as.character(1:res@nFiles)
        }
    }else{
        res@probeLabel <- as.character(1:res@nFiles)
    }
    return(res)
}

## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## fastq K-mer locs
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##

fastqKmerLocs <- function(filenames, k=4)
{
    if(!is.numeric(k))
        stop("'k' must be numeric.")
    k <- as.integer(k)
    
    if( (k < 0) || (k > max_k) )
        stop("'k' must be in range 0, ... , 16.")
    
    return(.Call("fastq_Kmer_locs", filenames, k, PACKAGE="seqTools"))
}


fastqKmerSubsetLocs <- function(filenames, k=4, kIndex)
{
    # Returns list with matrix elements.
    if(!is.numeric(k))
        stop("'k' must be numeric.")
    
    k <- as.integer(k)
    
    if( (k < 0) || (k > max_k) )
        stop("'k' must be in range 0, ... , ", max_k, ".")
    
    if(!is.numeric(kIndex))
        stop("'kIndex' must be numeric.")
    kIndex <- as.integer(kIndex)
    
    if(any(kIndex < 0))
        stop("No negative 'kIndex' values allowed.")
    
    if(any(kIndex > (4^k)) )
        stop("'kIndex' out of range (>4^k).")
    
    return(.Call("fastq_KmerSubset_locs",
                                filenames, k, kIndex, PACKAGE="seqTools"))
}

## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Data collection functions.
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##


## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Standard slot accessor functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##


setMethod("fileNames", "Fastqq", function(object)
                                            {return(object@filenames)})

setMethod("collectTime", "Fastqq", function(object)
                                            {return(object@collectTime)})

setMethod("collectDur", "Fastqq", function(object) {
return(as.numeric(difftime(object@collectTime$end, object@collectTime$start,
                                                    units = "secs")))
})

setMethod("getK", "Fastqq", function(object) {return(object@k)})

setMethod("nFiles", "Fastqq", function(object) {return(object@nFiles)})

setMethod("nNnucs", "Fastqq", function(object) {return(object@nN)})

setMethod("nReads", "Fastqq", function(object) {return(object@nReads)})

setMethod("maxSeqLen", "Fastqq", function(object) {return(object@maxSeqLen)})

setMethod("seqLenCount", "Fastqq", function(object)
{
    res<-object@seqLenCount
    colnames(res) <- object@probeLabel
    return(res)
})

setMethod("nucFreq", "Fastqq", function(object, i)
{
    if(missing(i))
        stop("Argument 'i' is not optional.")
    
    if(!is.numeric(i))
        stop("'i' must be numeric.")
    
    i <- as.integer(i)
    if( (i < 1) || (i > object@nFiles) )
        stop("'i' must be >0 and < nFiles.")
    
    return(object@nac[[i]])
})

setMethod("gcContent", "Fastqq", function(object, i)
{
    if(missing(i))
        stop("Argument 'i' is not optional.")
    
    if(!is.numeric(i))
        stop("'i' must be numeric.")
    
    i <- as.integer(i)
    if( (i < 1) || (i > object@nFiles))
        stop("'i' must be >0 and < nFiles.")
    
    return(object@gcContent[, i])
})


setMethod("gcContentMatrix", "Fastqq", function(object)
{
    gcc <- object@gcContent
    colnames(gcc) <- object@probeLabel
    return(gcc)
})

setMethod("seqLen", "Fastqq", function(object)
{
    sql <- object@seqLen
    colnames(sql) <- object@probeLabel
    return(sql)
})

setMethod("kmerCount", "Fastqq", function(object)
{
    kmer <- object@kmer
    colnames(kmer) <- object@probeLabel
    return(kmer)
})


setMethod("probeLabel", "Fastqq", function(object){return(object@probeLabel)})
setReplaceMethod("probeLabel", "Fastqq", function(object, value)
{
    if(length(value) != nFiles(object))
        stop("'value' must have length ", nFiles(object), ".")
    
    val <- as.character(value)
    if(any(table(val)) > 1)
    {
        warning("[probeLabel <- .Fastqq] probeLabel unique suffix added.")
        val <- paste(1:nFiles(object), val, sep="_")
    }
    object@probeLabel <- val
    
    return(object)
})

setMethod("phred", signature="Fastqq", definition=function(object, i)
{
    if(missing(i))
        stop("Argument 'i' is not optional.")
    
    if(!is.numeric(i))
        stop("'i' must be numeric.")
    
    i <- as.integer(i)
    if( (i < 1) || (i > object@nFiles) )
        stop("'i' must be >0 and < nFiles.")
    
    return(object@phred[[i]])
})


## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Standard slot accessor functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##


setMethod("show", "Fastqq", function(object)
{
    bm <- Sys.localeconv()[7]
    w <- 20
    r <- "right"
    cat("Class       : ", format(class(object), w=w, j=r)                              , "\n", sep="")
    cat("nFiles      : ", format(format(nFiles(object)          , big.m=bm), w=w, j=r) , "\n", sep="")
    cat("maxSeqLen   : ", format(format(maxSeqLen(object)       , big.m=bm), w=w, j=r) , "\n", sep="")
    cat("k (Kmer len): ", format(format(getK(object)            , big.m=bm), w=w, j=r) , "\n", sep="")
    cat("\n")
    cat("nReads      : ", format(format(sum(as.numeric(nReads(object)))    , big.m=bm), w=w, j=r)   , "\n", sep="")
    cat("nr  N   nuc : ", format(format(sum(nNnucs(object))     , big.m=bm), w=w, j=r) , "\n", sep="")
    cat("Min seq len : ", format(format(min(seqLen(object)[1, ]), big.m=bm), w=w, j=r) , "\n", sep="")
    cat("Max seq len : ", format(format(max(seqLen(object)[2, ]), big.m=bm), w=w, j=r) , "\n", sep="")
    return(invisible())
})


## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Phred related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##

setMethod("phredQuantiles", "Fastqq", function(object, quantiles, i, ...)
{
    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    # Checking arguments
    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    
    ## Check quantiles argument
    if(missing(quantiles))
        stop("'quantiles' argument is not optional")
    
    if(!is.numeric(quantiles))
        stop("Quantiles must be numeric.")
    
    if(!(all(quantiles >= 0) & all(quantiles <= 1) ))
        stop("All quantiles mustbe in [0, 1]")
    
    quantiles <- sort(unique(round(quantiles, 2)))
    
    ## Check 'i' argument
    if(missing(i))
        stop("'i' argument is not optional")
    
    if(!is.numeric(i))
        stop("'i' must be numeric.")
    
    if(length(i) > 1)
        stop("'i' must have length 1")
    
    i <- as.integer(i)
    if( (i < 1) || (i > object@nFiles) )
            stop("'i' must be >0 and <nFiles.")

    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    # Count qual values for each sequence position
    # Convert integer counts into column-wise relative values
    # Maximum counted read length
    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    vec <- 1:seqLen(object)[2, i]
    qrel <- as.data.frame(apply(object@phred[[i]][, vec], 2, rel_int))
    names(qrel) <- vec
    
    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    # Walk through each column and extract row number
    # for given quantile values
    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE="seqTools")
    return(res)
})


setMethod("plotPhredQuant", "Fastqq", function(object, i, main, ...){
    if(!is.numeric(i))
        stop("'i' must be numeric.")
    
    i <- as.integer(i)
    
    if( (i < 1) || (i > object@nFiles) )
        stop("'i' must be >0 and <nFiles.")
    
    quant <- c(0.1, 0.25, 0.5, 0.75, 0.9)
    cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4") 
    qq <- phredQuantiles(object, quant, i)
    maxQ = floor(1.2*max(qq))
    xv <- 1:ncol(qq)
    
    if(missing(main))
    {
        main <- paste("Position wise Phred Quantiles (", 
                                probeLabel(object)[i], ")", sep="")
    }

    plot(xv, xv, ylim=c(0, maxQ), type="n", bty="n", las=1,
        ylab = "Phred score", xlab="Sequence position", main=main, ...)
     
    lines(xv, qq[1, ], col=cols[1], lty=2)
    lines(xv, qq[2, ], col=cols[2], lty=1)
    lines(xv, qq[3, ], col=cols[3], lwd=2)
    lines(xv, qq[4, ], col=cols[4], lty=1)
    lines(xv, qq[5, ], col=cols[5], lty=2)
    
    legend("top", ncol=6, lty=c(2, 1, 1, 1, 2),
            lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5,
            legend=c("10%", "25%", "50%", "75%", "90%"), bty="n", cex=0.8)
    return(invisible())
})




## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Global Phred content functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("phredDist", "Fastqq", function(object, i){
    idx <- 1:nFiles(object)
    
    if(missing(i))
        i <- idx
    else
    {
        if(!is.numeric(i))
            stop("'i' must be numeric.")
        
        if(!all(is.element(i, idx)))
            stop("'i' is out of range.")
    }
    
    phred <- Reduce("+", object@phred[i])
    phred <- matrix(as.numeric(phred), nrow=nrow(phred))
    
    phred_vals <- apply(phred, 1, sum)
    phred_dist <- phred_vals/sum(phred_vals)
    names(phred_dist) <- rownames(object@phred[[1]])
    
    return(phred_dist)    
})


setMethod("plotPhredDist", "Fastqq", function(object, i, maxp=45, col, ...)
{
    if(!is.numeric(maxp))
        stop("maxp must be numeric")
    
    if(!is.integer(maxp))
        maxp<-as.integer(maxp)
    
    if(maxp <= 0)
        stop("maxp must be >=0")
    
    if(missing(col))
        col <- topo.colors(10)[3]
    
    phred <- phredDist(object, i)
    maxy <- ceiling(max(phred) * 5) / 5
    x <- 1:maxp
    xmax <- 10 * (ceiling(maxp / 10))
    
    plot(x, phred[x], ylim=c(0, maxy), xlim=c(0, xmax), type="l", lwd=2,
            col=col, yaxt="n", bty="n", xlab="Phred value",
            ylab="Content (%)", ...)
    
    ylb <- 0:(10 * maxy) / 10
    axis(side=2, at=ylb, labels=100 * ylb, las=1)
    return(invisible())
})


# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Not exported:
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
pd_l10 <- function(x){ x <- phredDist(x); return(sum(x[1:10]) / sum(x))}


setMethod("propPhred","Fastqq",function(object, greater=30, less=93)
{
    if(!is.numeric(greater))
        stop("'greater' must be numeric.")
    
    if(length(greater) != 1)
        stop("'greater' must have length 1.")
    
    if(!is.numeric(less))
        stop("'less' must be numeric.")
    
    if(length(less) != 1)
        stop("'less must have length 1.")
    
    ## + + + + + + + + + + + + + + + + + + + + + + ##
    ## greater and less shall be used as
    ## array indices: increase greater
    ## + + + + + + + + + + + + + + + + + + + + + + ##
    greater<-as.integer(greater+1)
    less<-as.integer(less)
    
    if(greater < 1)
        stop("'greater' must be >=0.")
    
    if(less > 93)
        stop("'less' must be < 94.")
    
    if(greater >= less)
        stop("'greater' must be <= 'less'")
    
    n <- nFiles(object)
    res <- numeric(n)
    for(i in 1:n)
    {
        pd <- phredDist(object, i)
        res[i] <- sum(pd[greater:less])
    }
    names(res) <- probeLabel(object)
    return(res)
})

## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Global Phred content functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##

setMethod("mergedPhred", "Fastqq", function(object){
    sql <- seqLen(object)
    maxSeqLen <- max(sql[2, ])
    
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    ## as.numeric: Sum on integer is likely to exceed 
    ##                 maximal 32-bit integer  values
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    
    if(sql[2, 1] < maxSeqLen)
    {
        mp <- as.numeric(.Call("r_enlarge_int_mat", object@phred[[1]], 
                c(nrow(object@phred[[1]]), maxSeqLen), PACKAGE="seqTools"))
    }else{
        mp <- as.numeric(object@phred[[1]])
    }
    
    
    n <- nFiles(object)
    if(n > 1)
    {
        for(i in 2:n)
        {
            if(sql[2, i] < maxSeqLen)  
            {
                mp <- mp + as.numeric(.Call("r_enlarge_int_mat",
                            object@phred[[i]],
                            c(nrow(object@phred[[i]]), maxSeqLen), 
                            PACKAGE="seqTools"))
            }else{
                mp <- mp + as.numeric(object@phred[[i]])
            }
        }
    }
    mp <- matrix(mp, ncol = maxSeqLen)
    rownames(mp) <- rownames(object@phred[[1]])
    colnames(mp) <- 1:maxSeqLen
    return(mp)
})

setMethod("mergedPhredQuantiles", "Fastqq", function(object, quantiles)
{
    if(!(all(quantiles >= 0) & all(quantiles <= 1)) )
        stop("[mergedPhredQuantiles.Fastqq] all quantiles mustbe in [0, 1]")
    quantiles <- sort(unique(round(quantiles, 2)))
  
    sql <- seqLen(object)
    maxSeqLen <- max(sql[2, ])
    
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    ## Count qual values for each sequence position
    ## Convert counts into column-wise relative values
    ## Maximum counted read length
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    vec <- 1:maxSeqLen
    mrg <- mergedPhred(object)
    qrel <- as.data.frame(apply(mrg[, vec], 2, rel_real))
    names(qrel)
    names(qrel) <- vec
    
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    ## Walk through each column and extract row number
    ## for given quantile values
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE="seqTools")
    return(res)
})


setMethod("plotMergedPhredQuant", "Fastqq", function(object, main, ...)
{
    quant <- c(0.1, 0.25, 0.5, 0.75, 0.9)
    cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4")
    qq <- mergedPhredQuantiles(object, quant)
    maxQ = floor(1.2*max(qq))
    xv <- 1:ncol(qq)
    
    if(missing(main))
        main <- paste("Merged position wise Phred Quantiles.", sep = "")
    
    plot(xv, xv, ylim=c(0, maxQ), type="n", bty="n", las=1,
        ylab="Phred score", xlab="Sequence position", main=main, ...)
    
    
    lines(xv, qq[1, ], col=cols[1], lty=2)
    lines(xv, qq[2, ], col=cols[2], lty=1)
    lines(xv, qq[3, ], col=cols[3], lwd=2)
    lines(xv, qq[4, ], col=cols[4], lty=1)
    lines(xv, qq[5, ], col=cols[5], lty=2)
    
    legend("top", ncol=6, lty=c(2, 1, 1, 1, 2), 
            lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5, 
            legend=c("10%", "25%", "50%", "75%", "90%"), bty="n", cex=0.8)
    
    return(invisible()) 
})

## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Phred related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##

## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Nucleotide frequency related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##


setMethod("plotNucFreq", "Fastqq", function(object, i, main, maxx, ...)
{
    if(!is.numeric(i))
        stop("'i' must be numeric.")
    
    i <- as.integer(i)
    
    if( (i < 1) || (i > object@nFiles) )
        stop("'i' must be >0 and <nFiles.")
    
    maxSeqlen <- max(seqLen(object)[2, ])
    if(missing(maxx))
    {
        maxx <- maxSeqlen
    }
    else
    {
        if(!is.numeric(maxx))
            stop("'maxx' must be numeric.")
        maxx <- as.integer(maxx)
        if(maxx < 1)
            stop("'maxx' must be >0.")
        if(maxx > maxSeqlen)
            maxx <- maxSeqlen
    }
    
    # Skip extra line
    x <- 1:maxx
    nac <- object@nac[[i]][1:4, x]
    nacrel <- apply(nac, 2, rel_int)
    maxY = round(1.4 * max(nacrel), 1)
    
    # Maximum counted read length
    nacrel <- apply(nac, 2, rel_int)
    cols <- c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00")
    
    if(missing(main))
        main <- paste("Position wise Nucleotide frequency (",
                                probeLabel(object)[i], ")", sep="")
  
    plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
                ylab="Nucleotide fequency", xlab="sequence position",
                main=main, ...)
 
    lines(x, nacrel[1, ], col=cols[1], lwd=2)
    lines(x, nacrel[2, ], col=cols[2], lwd=2)
    lines(x, nacrel[3, ], col=cols[3], lwd=2)
    lines(x, nacrel[4, ], col=cols[4], lwd=2)
 
    legend("top", ncol=6, 
            lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5, 
            legend=c("A", "C", "G", "T"), bty="n", cex=0.8)
    
    return(invisible())
})


setMethod("plotGCcontent", "Fastqq", function(object,main,...)
{
    cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", 
            "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
    
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    ## Normalize matrix to colsum = 1
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    
    gc <- apply(object@gcContent, 2, rel_int)
    maxY = round(1.3*max(gc), 2)
    nf <- nFiles(object)
    x <- 1:nrow(gc)
    
    if(missing(main))
        main<-"GC content"
    
    plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
        ylab="Proportion of reads (%)", xlab="Relative GC content (%)", 
        main=main, ...) 
  
    for(i in 1:nf)
        lines(x, gc[, i], col=cols[i], lwd=2)
  
    legend("right", lwd=2, col=cols, legend=probeLabel(object),
                                                bty="n", cex=0.8)
  
    return(invisible())
})


setMethod("plotNucCount", "Fastqq", function(object, nucs=16, maxx, ...)
{
  
    # j = 16: N,    j = 2:3: gc
    if(!is.numeric(nucs))
        stop("'nucs' must be numeric.")
    
    nucs <- as.integer(nucs)
    if(any(nucs < 1) || any(nucs > 19))
        stop("'nucs' must be >0 and <20.")

    maxSeqlen <- max(seqLen(object)[2, ])
    
    if(missing(maxx))
    {
        maxx <- maxSeqlen
    }
    else
    {
        if(!is.numeric(maxx))
            stop("'maxx' must be numeric.")
        
        maxx <- as.integer(maxx)
        
        if(maxx<1)
            stop("'maxx must be >0.")
        
        if(maxx > maxSeqlen)
            maxx <- maxSeqlen
    }
    
    n <- nFiles(object)
    fvec <- 1:n
    
    i <- 1
    
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    ## Skip extra line
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    x <- 1:maxx
    nac <- object@nac[[i]][, x]
    nacrel <- apply(nac, 2, rel_int)
    
    if(length(nucs) == 1){
        dfr <- data.frame(a = nacrel[nucs, ])
    }else{
        dfr <- data.frame(a = apply(nacrel[nucs, ], 2, sum))
    }
    
    if(n > 1)
    {
        for(i in 2:n)
        {
            nac <- object@nac[[i]][, x]
            nacrel <- apply(nac, 2, rel_int)
            if(length(nucs) == 1){
                dfr[, i] <- nacrel[nucs, ]
            }else{
                dfr[, i] <- apply(nacrel[nucs, ], 2, sum)
            }
        }        
    }
    
    maxY = round(1.4 * max(dfr), 3)
    cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", 
                    "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
    
    nv <- paste(iupac_chars[nucs], collapse = "")
    
    plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
        ylab="Nucleotide fequency", xlab="sequence position",
        main=paste("Position wise Nucleotide frequency:  '",
                                            nv, "'", sep=""))
  
    for(i in fvec)
        lines(x, dfr[, i], col=cols[i %% 10], lwd=2)
    
    legend("top", ncol=6, 
        lwd=c(1, 1, 2, 1, 1), col=cols[fvec %% 10], xjust=0.5,
        legend=probeLabel(object), bty="n", cex=0.8)
    
    return(invisible())
})

## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Nucleotide frequency related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##



setMethod("plotKmerCount", "Fastqq",
                    function(object, index, mxey, main="K-mer count", ...)
{
    n <- nFiles(object)
    if(missing(index))
    {
        index <- 1:n
    }else{
        if(!is.numeric(index))
            stop("'index' must be numeric.")
        
        index <- sort(unique(as.integer(index)))
        
        if(any(index < 0) || any(index > n))
                stop("'index' must be in 1, .., ", n, " ( = nFiles).")
    }
    
    if(!missing(mxey))
    {
        if(!is.numeric(mxey))
            stop("'mxey' must be numeric.")
        mxey <- as.integer(mxey)
        
        if(mxey<1)
            stop("'mxey' must be positive.")
    }
    
    cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", 
                "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")  
    pk <- 6
    if(object@k <= pk)
    {
        pk <- object@k
        x <- 1:(4^pk)
        
        if(missing(mxey))
            lg2y <- floor(log2(1.2 * (max(object@kmer))))
        else
            lg2y <- mxey
        
        maxY <- 2^lg2y

        
        plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
             ylab="K-mer count", xlab="K-mer index", main=main,
                                                axes=FALSE, ...)
    
    for(i in index)
        lines(x, object@kmer[, i], col=cols[i], lwd=2)
    }
    else
    {
        x <- 1:(4^pk)
        melt_factor <- as.integer(4^(object@k - pk))
        
        y_factor <- max(.Call("melt_vector", object@kmer[, 1], melt_factor,
                        PACKAGE="seqTools")) / max(object@kmer[, 1])
    
        if(missing(mxey))
            lg2y <- floor(log2(1.2 * (max(object@kmer)) * y_factor))
        else
            lg2y <- mxey
        maxY <- 2^lg2y
    
        plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
            ylab="K-mer count", xlab="K-mer index", 
                            main=main, axes=FALSE, ...)
    
        for(i in index)
        {
            lines(x, .Call("melt_vector", object@kmer[, i], melt_factor,
                        PACKAGE="seqTools"), col=cols[i], lwd=2)
        }
    }
    axis(side=1, at=0:4 * 4^(pk - 1), labels=c("A", "C", "G", "T", ""))
  
    axis(side=2, at=c(0, maxY),
                    labels=c(0, paste("2^", lg2y, sep="")), las=1)
  
    legend("right", lwd=2, col=cols[index], 
                legend=probeLabel(object)[index], bty="n", cex=0.8)
  
    return(invisible())
})



## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Merging and melting Fastqq objects
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##

## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("mergeFastqq", "Fastqq", function(lhs, rhs)
{
    
    res <- new("Fastqq")
    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
    ## Simple concatenations
    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
    
    res@filenames <- c(lhs@filenames, rhs@filenames)
    res@nFiles <- lhs@nFiles+rhs@nFiles
    res@nReads <- c(lhs@nReads, rhs@nReads)
    res@nN <- c(lhs@nN, rhs@nN)
    res@seqLenCount <- cbind(lhs@seqLenCount, rhs@seqLenCount)
    res@gcContent <- cbind(gcContentMatrix(lhs), gcContentMatrix(rhs))
    res@seqLen <- cbind(lhs@seqLen, rhs@seqLen)
    
    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
    ## Singularize probeLabel
    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
    
    res@probeLabel <- c(lhs@probeLabel, rhs@probeLabel)  
    if(any(table(res@probeLabel) > 1))
    {
        message("[mergeFastqq] Singularizing probeLabel (new suffix).")
        res@probeLabel <- paste(1:res@nFiles, res@probeLabel, sep="_")
    }
    
    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
    ## Eventually resize arrays when lhs and rhs have different maxSeqLen
    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
    if(lhs@maxSeqLen > rhs@maxSeqLen)
    {
        message("[mergeFastqq] Resizing rhs.")
        msl <- lhs@maxSeqLen
        
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        ## nac
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        
        new_dim <- as.integer(c(nrow(rhs@nac), msl))
        rhs_nac <- .Call("r_enlarge_int_mat", rhs@nac, new_dim, 
                                                    PACKAGE="seqTools")
        res@nac <- c(lhs@nac, rhs_nac)
        
        # seqLenCount
        new_dim <- as.integer(c(msl, rhs@nFiles))
        rhs_seqLenCount <- .Call("r_enlarge_int_mat", 
                                rhs@seqLenCount, new_dim, PACKAGE="seqTools")
        
        res@seqLenCount <- c(lhs@seqLenCount, rhs_seqLenCount)
        
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        ## phred
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        
        new_dim <- as.integer(nrow(rhs@phred), msl)
        rhs_phred_list <- list()
        
        for(i in 1:nFiles(rhs)){
                rhs_phred_list[[i]] <- .Call("r_enlarge_int_mat", 
                                rhs@phred[[i]], new_dim, PACKAGE="seqTools")
        }
        
        res@phred <- c(lhs@phred, rhs_phred_list)
        res@maxSeqLen <- lhs@maxSeqLen
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        
    } else if(rhs@maxSeqLen > lhs@maxSeqLen)
    {
        
        message("[mergeFastqq] Resizing lhs.")
        msl <- rhs@maxSeqLen
        
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        ## nac
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        
        new_dim <- as.integer(c(nrow(lhs@nac), msl))
        
        lhs_nac <- .Call("r_enlarge_int_mat",
                                lhs@nac, new_dim, PACKAGE="seqTools")
        
        res@nac <- c(rhs@nac, lhs_nac)
        
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        ## seqLenCount
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        
        new_dim <- as.integer(c(msl, lhs@nFiles))
        
        lhs_seqLenCount <- .Call("r_enlarge_int_mat",
                    lhs@seqLenCount, new_dim, PACKAGE="seqTools")
        
        res@seqLenCount <- c(rhs@seqLenCount, lhs_seqLenCount)
        
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        ## phred
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        
        new_dim <- as.integer(nrow(lhs@phred), msl)
        lhs_phred_list <- list()
        
        for(i in 1:nFiles(lhs))
        {
            lhs_phred_list[[i]] <- .Call("r_enlarge_int_mat", 
                    lhs@phred[[i]], new_dim, PACKAGE="seqTools")
        }
        res@phred <- c(rhs@phred, lhs_phred_list)
        res@maxSeqLen <- rhs@maxSeqLen
        
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        
    } else { 
        
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        ## rhs@maxSeqLen == lhs@maxSeqLen
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        
        res@maxSeqLen = lhs@maxSeqLen
        res@nac <- c(lhs@nac, rhs@nac)
        res@phred <- c(lhs@phred, rhs@phred)
        
        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
        
    }
    
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    ## Eventually melt down k-mer count matrix
    ## when lhs and rhs have different k
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    res@k <- pmin(lhs@k, rhs@k)
    
    kml <- kmerCount(lhs)
    if(lhs@k > res@k)
    {
        kml <- .Call("melt_kmer_matrix",
                    kml, c(lhs@k, res@k), PACKAGE="seqTools")
    }
    
    kmr <- kmerCount(rhs)
    if(rhs@k > res@k)
    {
        kmr <- .Call("melt_kmer_matrix",
                    kmr, c(rhs@k, res@k), PACKAGE="seqTools")
    }
    res@kmer <- cbind(kml, kmr)
    
    fkml <- lhs@firstKmer
    if(lhs@k > res@k)
    {
        fkml <- .Call("melt_kmer_matrix",
                    fkml, c(lhs@k, res@k), PACKAGE="seqTools")
    }
    fkmr <- rhs@firstKmer
    
    if(rhs@k > res@k)
    {
        fkmr <- .Call("melt_kmer_matrix",
                    fkmr, c(rhs@k, res@k), PACKAGE="seqTools")
    }
    res@firstKmer <- cbind(fkml, fkmr)
    
    return(res)
})


setMethod("meltDownK", "Fastqq", function(object, newK)
{
  
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    if(!is.numeric(newK))
        stop("'newK' must be numeric")
    
    newK <- as.integer(newK)
    
    if(length(newK) != 1)
        stop("'newK' must have length 1.")
    
    if(newK < 1 || newK >=  getK(object))
        stop("'newK' must be >= 1 and <= ", getK(object), ".")
    
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    res <- new("Fastqq")
    res@filenames <- object@filenames
    res@nFiles <- object@nFiles
    res@k <- newK
    res@maxSeqLen <- object@maxSeqLen
    
    res@kmer <- .Call("melt_kmer_matrix",
        object@kmer, c(getK(object), newK), PACKAGE="seqTools")
    
    res@firstKmer <- .Call("melt_kmer_matrix",
        object@firstKmer, c(getK(object), newK), PACKAGE="seqTools")
    
    res@nReads <- object@nReads
    res@nN <- object@nN
    res@seqLenCount <- object@seqLenCount
    res@gcContent <- object@gcContent
    res@nac <- object@nac
    res@phred <- object@phred
    res@seqLen <- object@seqLen
    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
  
    return(res)
})


listMelt <- function(x, oldK, newK)
{
    f <- function(x) .Call("melt_kmer_matrix", x, 
            as.integer(c(oldK, newK)), PACKAGE="seqTools")
  
    return(lapply(x, f))
}



setMethod("[", "Fastqq", function(x, i, j, drop="missing")
{
  
    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    res <- new("Fastqq")
    res@filenames <- x@filenames[i]
    res@probeLabel <- x@probeLabel[i]
    res@nFiles <- length(i)
    res@k <- x@k
    res@seqLen <- x@seqLen[, i, drop=FALSE]
    res@maxSeqLen <- max(res@seqLen[2, ])
    res@kmer <- x@kmer[, i, drop=FALSE]
    res@firstKmer <- x@firstKmer[, i, drop=FALSE]
    res@nN <- x@nN[i]
    res@seqLenCount <- x@seqLenCount[, i, drop=FALSE]
    res@gcContent <- x@gcContent[, i, drop=FALSE]
    res@nac <- x@nac[i]
    res@phred <- x@phred[i]
    res@nReads <- x@nReads[i]
    res@collectTime <- x@collectTime
    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
    
    return(res)
})


## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Calculation of distance matrix based on Canberra distance
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##

setMethod("cbDistMatrix", "Fastqq", 
        function(object, nReadNorm=max(nReads(object)))
{
    if(!is.numeric(nReadNorm))
        stop("'nReadNorm' must be numeric.")
    nReadNorm <- as.integer(nReadNorm)
    
    if(nReadNorm < max(nReads(object)))
        stop("'nReadNorm' must be greater than all nRead.")
    
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    ## Column-wise normalizing read counts (by upscaling) 
    ## so that column sums become nearly equal in order to
    ## compensate sequencing depth artifacts in 
    ## Canberra distance values
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
    
    scale <- nReadNorm/nReads(object)
    
    scaled <- .Call("scale_kmer_matrix",
            kmerCount(object), scale, PACKAGE="seqTools")
    
    colnames(scaled) <- object@probeLabel
    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
  
    return(.Call("cb_distance_matrix", scaled, PACKAGE="seqTools"))
})


## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## END OF FILE
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
wokai/seqTools documentation built on May 4, 2019, 9:46 a.m.