# calcWIGmetrics.R
# calculate some metrics about how the WIG reads compare to the annotation
# changing from bins to the wiggle tracks compressed objects
`calcWIGmetrics` <- function( WIG, asDataFrame=FALSE, logFile=NULL ) {
setCurrentSpecies( WIG$Info$Species)
seqMap <- getCurrentSeqMap()
geneMap <- getCurrentGeneMap()
exonMap <- getCurrentExonMap()
if ( ! is.null( logFile)) {
sink( file=logFile, split=TRUE)
}
# count up how many reads hit the right and wrong strands, as per the genome annotation
Nright <- Nwrong <- NinterG <- 0
Nright_combo <- Nwrong_combo <- NinterG_combo <- 0
ngenes <- nngs <- 0
gOut <- readsOut <- correctPct <- vector()
curSeqID <- ""
nGenes1deep <- nGenes10deep <- nGenes100deep <- 0
readLength <- WIG$Info$ReadLength
if ( is.na(readLength) || readLength < 32) readLength <- 32
for( ig in 1:nrow( geneMap)) {
if ( ("REAL_G" %in% colnames( geneMap)) && geneMap$REAL_G[ig] == FALSE) {
nngs <- nngs + 1
isReal <- FALSE
} else {
ngenes <- ngenes + 1
isReal <- TRUE
}
# get that chromosome's set of wiggle bins
thisSeqID <- geneMap$SEQ_ID[ig]
if ( curSeqID != thisSeqID) {
wiggleChunk <- WIG_getWigglesOneSeq( WIG, thisSeqID)
curSeqID <- thisSeqID
}
# get the chunks for this gene
start <- geneMap$POSITION[ig]
stop <- geneMap$END[ig]
# get the read counts for this region
counts <- WIG_getReadCountsInRegion( wiggleChunk, start, stop, readLength=readLength)
nPlusMulti <- counts$Plus
nPlusUnique <- counts$PlusUnique
nMinusMulti <- counts$Minus
nMinusUnique <- counts$MinusUnique
if ( ig %% 100 == 0) cat( "\r", ig, geneMap$GENE_ID[ig], "\t",
nPlusMulti+nMinusMulti+nPlusUnique+nMinusUnique, " ")
# inter-genic regions may get special treatment
if ( is.na( geneMap$STRAND[ig])) {
NinterG <- NinterG + nPlusUnique + nMinusUnique
NinterG_combo <- NinterG_combo + nPlusMulti + nMinusMulti
# no other intergenic stuff
next
} else if ( geneMap$STRAND[ig] %in% c("","+")) {
Nright <- Nright + nPlusUnique
Nright_combo <- Nright_combo + nPlusMulti
goodCnts <- nPlusUnique
Nwrong <- Nwrong + nMinusUnique
Nwrong_combo <- Nwrong_combo + nMinusMulti
badCnts <- nMinusUnique
} else {
Nright <- Nright + nMinusUnique
Nright_combo <- Nright_combo + nMinusMulti
goodCnts <- nMinusUnique
Nwrong <- Nwrong + nPlusUnique
Nwrong_combo <- Nwrong_combo + nPlusMulti
badCnts <- nPlusUnique
}
# other gene based metrics...
if ( isReal) {
nGenes1deep <- nGenes1deep + sum( goodCnts >= 1)
nGenes10deep <- nGenes10deep + sum( goodCnts >= 10)
nGenes100deep <- nGenes100deep + sum( goodCnts >= 100)
}
# build up some coverage numbers for each gene
gOut[ ngenes] <- geneMap$GENE_ID[ ig]
readsOut[ ngenes] <- goodCnts
correctPct[ ngenes] <- ( goodCnts - badCnts) / ( goodCnts + badCnts)
}
cat("\n\nStrand Correctness of 'In Gene' reads: \n")
Nright <- round( Nright)
Nwrong <- round( Nwrong)
total <- Nright + Nwrong
options( digits=4)
cat( "\nUnique Reads hitting expected strand: ", Nright, "\t Pct: ", as.percent( Nright / max(total,1)))
cat( "\nUnique Reads hitting wrong strand: ", Nwrong, "\t Pct: ", as.percent( Nwrong / max(total,1)))
Nright_combo <- round( Nright_combo)
Nwrong_combo <- round( Nwrong_combo)
total_combo <- Nright_combo + Nwrong_combo
cat( "\nAll Reads hitting expected strand: ", Nright_combo, "\t Pct: ",
as.percent( Nright_combo / max(total_combo,1)))
cat( "\nAll Reads hitting wrong strand: ", Nwrong_combo, "\t Pct: ",
as.percent( Nwrong_combo / max(total_combo,1)))
cat( "\n\nGene Specificity of reads: \n")
NinterG <- round( NinterG)
NinterG_combo <- round( NinterG_combo)
totalAllReads <- total + NinterG
cat( "\nUnique Reads hitting annotated genes: ", total, "\t Pct: ",
as.percent( total / max(totalAllReads,1)))
cat( "\nUnique Reads hitting intergenic areas: ", NinterG, "\t Pct: ",
as.percent( NinterG / max(totalAllReads,1)))
totalAllReads_combo <- total_combo + NinterG_combo
cat( "\nAll Reads hitting annotated genes: ", total_combo, "\t Pct: ",
as.percent( total_combo / max(totalAllReads_combo,1)))
cat( "\nAll Reads hitting intergenic areas: ", NinterG_combo, "\t Pct: ",
as.percent( NinterG_combo / max(totalAllReads_combo,1)))
cat( "\n\nGene Sensitivity of reads: \n")
cat( "\nGenes with at least 1 read: ", nGenes1deep, "\t Pct: ",
as.percent( nGenes1deep / ngenes))
cat( "\nGenes with at least 10 reads: ", nGenes10deep, "\t Pct: ",
as.percent( nGenes10deep / ngenes))
cat( "\nGenes with at least 100 reads: ", nGenes100deep, "\t Pct: ",
as.percent( nGenes100deep / ngenes))
cat( "\n")
if ( ! is.null( logFile)) {
sink( file=NULL)
cat( "\n\nWrote Bin Metrics summary file: ", logFile, "\n")
}
# prep the output data.frame
if ( asDataFrame) {
out <- data.frame( gOut, readsOut, correctPct, stringsAsFactors=FALSE)
colnames( out) <- c( "GENE_ID", "RAW_READS_U", "STRAND_CC")
ord <- base::order( out$RAW_READS_U, decreasing=TRUE)
out <- out[ ord, ]
rownames( out) <- 1:nrow( out)
return(out)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.