# given a GRanges object, measure the features and make statements about spread around the genome.
# ideally, the GRanges object should have seqlengths info.
# TODO get this to export a table
# TDOO including clustering metric
# Could do this using distribution of distances between sequential features (works with overlapping sets)
# OR could do using reflect() and then size distribution of what are the gaps (but what about overlapping features?)
analyseGenomeSpread.GR <- function(x, show.plots=FALSE, n.plotChroms=10, return.table=FALSE, fileName=NULL, runTestBinSizes=c(1000, 10000, 100000, 1000000)) {
stopifnot(require(GenomicRanges))
stopifnot(class(x) == "GRanges")
op <- options(width=300) # to keep rows of stats table together while using sink.
on.exit(options(op))
if(!is.null(fileName)) {
sink(fileName)
#sink()
}
formatBig <- function(x) format(x,big.mark=",",scientific=FALSE, trim=TRUE) # for repeated use on large numbers.
Mode <- function(x) { # https://stackoverflow.com/a/8189441/1129734
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
print("Genome spread analysis. analyseGenomeSpread.GR() from https://github.com/davetgerrard/utilsGerrardDT ")
allChromNames <- seqnames(seqinfo(x))
chrom.canonical <- grep("_", allChromNames, invert=TRUE, value=TRUE)
chrom.canonical <- chrom.canonical[order(seqlengths(x)[chrom.canonical])]
# order by size
genomeSize <- sum(as.numeric(seqlengths(x)))
if(is.na(genomeSize)) {
print("Genome size information is missing or incomplete. Only limited results will be shown. To fix this, set the seqinfo() of your GRanges object")
}
outTable <- data.frame()
x.canonical <- x[as.vector(seqnames(x)) %in% chrom.canonical]
#x.canonical <- x
#seqinfo(x.canonical) <- seqinfo(x)[chrom.canonical]
seqlevels(x.canonical, pruning.mode="coarse") <- chrom.canonical # will throw out any data on noncanonical chroms.
genomeSize.canonical <- sum(as.numeric(seqlengths(x.canonical)))
### Drop all unused seqlevels: - could be useful later..
#seqlevels(gr) <- seqlevelsInUse(gr)
#print(seqinfo(x))
print(paste("Object with", formatBig(length(x)), "regions from genome", unique(genome(x))))
print(paste("The full genome has length", formatBig(genomeSize), "over", length(allChromNames), "sequences"))
print(paste("The canonical genome has length", formatBig(genomeSize.canonical), "over", length(chrom.canonical), "sequences"))
n.chroms.used <- length(seqlevelsInUse(x.canonical))
choms.notUsed <- setdiff(chrom.canonical,seqlevelsInUse(x.canonical))
n.features <- length(x.canonical)
print(paste("Your track contains", formatBig(n.features), "features over", n.chroms.used, "sequences"))
if(length(choms.notUsed) > 0) {
print(paste("The following chromosomes have zero features:-", paste(choms.notUsed, collapse=",")))
} else {
print("All canonical chroms have features")
}
coverage.canonical <- sum(width(x.canonical))
x.reduced <- reduce(x.canonical)
n.features.reduced <- length(x.reduced)
coverage.reduced <- sum(width(x.reduced))
print(paste("The features cover", formatBig(coverage.canonical), "bases or ", round((coverage.canonical/genomeSize.canonical)*100, digits=3) , "% of the genome"))
if(n.features.reduced < n.features) {
print(paste("Some features overlap each other."))
print(paste("The intersect of features cover", formatBig(coverage.reduced), "bases or ", round((coverage.reduced/genomeSize.canonical)*100, digits=3) , "% of the genome"))
x.use <- x.reduced
} else {
print(paste("No features overlap each other"))
x.use <- x.canonical
}
# calc counts per chrom
chrom.counts <- table(seqnames(x.canonical))
max.count <- max(chrom.counts)
max.count.chrom <- names(chrom.counts)[which.max(chrom.counts)]
min.count <- min(chrom.counts)
min.count.chrom <- names(chrom.counts)[which.min(chrom.counts)]
print(paste("The sequence with most features is", max.count.chrom, "with" , formatBig(max.count)))
print(paste("The sequence with fewest features is", min.count.chrom, "with" , formatBig(min.count)))
# calc count per kilobase
cpkb <- (chrom.counts[chrom.canonical] / seqlengths(x.canonical)) * 1000000
max.cpkb <- max(cpkb)
max.cpkb.chrom <- names(cpkb)[which.max(cpkb)]
min.cpkb <- min(cpkb)
min.cpkb.chrom <- names(cpkb)[which.min(cpkb)]
print(paste("The sequence with most features per Mb is", max.cpkb.chrom, "with" , round(max.cpkb, digits=3)))
print(paste("The sequence with fewest features per Mb is", min.cpkb.chrom, "with" , round(min.cpkb, digits=3)))
# calc coverage per chrom.
chrom.sums <- unlist(lapply(coverage(x.canonical), sum))
chrom.sums.vec <- as.integer(chrom.sums)
chrom.sums.reduced <- unlist(lapply(coverage(x.reduced), sum))
chrom.sums.vec.reduced <- as.integer(chrom.sums.reduced)
names(chrom.sums.vec) <- names(chrom.sums)
names(chrom.sums.vec.reduced) <- names(chrom.sums.reduced)
if(show.plots) {
plot(seqlengths(x.canonical)[chrom.canonical],chrom.sums.vec[chrom.canonical] , ylim=c(0, max(chrom.sums.vec)),xlab="sequence length", ylab="coverage")
points(seqlengths(x.reduced)[chrom.canonical],chrom.sums.vec.reduced[chrom.canonical], col="blue")
}
# plot coverage per chrom against seqlengths.
# add on non-redundant coverage per chrom.
print(paste("Correlation between coverage and chromosome lengths:" , round(cor(seqlengths(x.use)[chrom.canonical],chrom.sums.vec[chrom.canonical], use="complete.obs"), digits=3 )))
x.max <- max(width(x.use))
x.min <- min(width(x.use))
x.mean <- mean(width(x.use))
x.meanTrimmed <- mean(width(x.use), trim=.1)
x.sd <- sd(width(x.use))
x.CoV <- x.sd / x.mean
# do some stats on distribution of lengths.
# Are all features from the same distribution?
print(paste("Minimumm length:", x.min))
print(paste("Maximum length:", x.max))
print(paste("Mean length:", round(x.mean, digits=3)))
print(paste("Trimmed mean length:", x.meanTrimmed))
print(paste("Standard deviation:", round(x.sd, digits=3)))
print(paste("Coefficient of variation:", round(x.CoV, digits=3)))
#print(paste())
# What about clumping within chromosomes?
if(is.numeric(runTestBinSizes)) { # use run.test at different resolutions to test for spatial association
if(require(randtests) & require(GenomicRanges)) {
x_midpoints.GR <- GRanges(seqnames(x.canonical), IRanges(start=end(x.canonical) - floor((end(x.canonical) - start(x.canonical))/2), width=1))
for(thisBinSize in runTestBinSizes) {
genome.bins <- tileGenome(seqlengths(x.canonical), tilewidth = thisBinSize, cut.last.tile.in.chrom = TRUE)
binCounts <- countOverlaps(genome.bins, x_midpoints.GR)
runTestResult <- runs.test(binCounts, threshold=median(binCounts) +.1) # use median +0.1 for threshold as often the median value is zero.
print(paste("Runs test using bins of width", thisBinSize, ":", format.pval(runTestResult$p.value, digits=3), "suggests", ifelse(runTestResult$p.value <=0.05, "non-randomness", "randomness")))
}
} else {
print("No run tests peformed, please install package randstats")
}
}
#if(return.table) {
for(thisChrom in chrom.canonical) {
#starts <- NA
chromFeatures <-x[seqnames(x)==thisChrom]
if(length(chromFeatures) > 1) {
starts <- sort(start(chromFeatures))
gapsVec <- starts[2:length(starts)] -starts[1:(length(starts)-1)]
# sometimes there'll be one or two very large gaps for centromeres and sequencing gaps..
} else {
gapsVec=NA
}
modalWidth=Mode(width(chromFeatures))
modalGap=Mode(gapsVec)
thisRow <- data.frame(chr=thisChrom, n=length(x[seqnames(x)==thisChrom]),
meanWidth=mean(width(chromFeatures)),
minWidth=min(width(chromFeatures)),
maxWidth=max(width(chromFeatures)),
modalWidth=modalWidth, modalWidthFreq=sum(width(chromFeatures) == modalWidth),
meanGap=mean(gapsVec), medianGap=median(gapsVec),
modalGap=modalGap, modalGapFreq=sum(gapsVec == modalGap),
minGap=min(gapsVec), maxGap=max(gapsVec))
outTable <- rbind(outTable, thisRow)
}
#}
if(return.table) {
print("Chromosome level stats:-")
print(outTable)
}
print(paste("Finished analyseGenomeSpread.GR() ! "))
if(!is.null(fileName)) {
#sink(fileName)
sink()
print(paste("output directed to ", fileName))
}
if(return.table) return(outTable)
}
# source("C:/Users/Dave/utilsGerrardDT/R/analyseGenomeSpread.R")
#library(BSgenome.Hsapiens.UCSC.hg38)
#myData <- GRanges(paste0("chr", rep(c(1:22, "X", "Y", "M", "1_GL383518v1_alt"), length.out=1000)) , IRanges(1:1000, width=1:1000), seqinfo=seqinfo(BSgenome.Hsapiens.UCSC.hg38))
#analyseGenomeSpread.GR(myData, show.plots=TRUE)
# ideas for good variance measures
# http://stats.stackexchange.com/questions/18661/mean-and-variance-of-a-zero-inflated-poisson-distribution
# https://stat.ethz.ch/pipermail/r-help/2007-November/146285.html
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.