inst/doc/GenomicScores.R

## ----setup, echo=FALSE--------------------------------------------------------
options(width=80)

## ----library_install, message=FALSE, cache=FALSE, eval=FALSE------------------
#  install.packages("BiocManager")
#  BiocManager::install("GenomicScores")

## ----library_upload, message=FALSE, warning=FALSE, cache=FALSE----------------
library(GenomicScores)

## ---- message=FALSE, warning=FALSE, cache=FALSE-------------------------------
library(phastCons100way.UCSC.hg19)
phast <- phastCons100way.UCSC.hg19
class(phast)

## -----------------------------------------------------------------------------
gscores(phast, GRanges(seqnames="chr22", IRanges(start=50967020:50967025, width=1)))

## -----------------------------------------------------------------------------
phast

## -----------------------------------------------------------------------------
citation(phast)

## -----------------------------------------------------------------------------
provider(phast)
providerVersion(phast)
organism(phast)
seqlevelsStyle(phast)

## ---- echo=FALSE--------------------------------------------------------------
avgs <- readRDS(system.file("extdata", "avgs.rds", package="GenomicScores"))

## ----retrieve2, message=FALSE, cache=FALSE, eval=FALSE------------------------
#  availableGScores()

## ---- echo=FALSE--------------------------------------------------------------
avgs

## ----retrieve3, message=FALSE, cache=FALSE, eval=FALSE------------------------
#  phast <- getGScores("phastCons100way.UCSC.hg19")

## ----retrieve4, message=FALSE, cache=FALSE------------------------------------
gscores(phast, GRanges(seqnames="chr22", IRanges(start=50967020:50967025, width=1)))

## ----eval=FALSE---------------------------------------------------------------
#  makeGScoresPackage(phast, maintainer="Me <me@example.com>", author="Me", version="1.0.0")

## ----echo=FALSE---------------------------------------------------------------
cat("Creating package in ./phastCons100way.UCSC.hg19\n")

## ---- echo=FALSE--------------------------------------------------------------
obj <- readRDS(system.file("extdata", "mcap.v1.0.hg19.chr22.rds", package="GenomicScores"))
mdobj <- metadata(obj)
mcap <- GScores(provider=mdobj$provider,
                provider_version=mdobj$provider_version,
                download_url=mdobj$download_url,
                download_date=mdobj$download_date,
                reference_genome=mdobj$reference_genome,
                data_pkgname=mdobj$data_pkgname,
                data_dirpath="../inst/extdata",
                data_serialized_objnames=c(mcap.v1.0.hg19.chr22.rds="mcap.v1.0.hg19.chr22.rds"),
                data_tag="MCAP")
gscopops <- get(mdobj$data_pkgname, envir=mcap@.data_cache)
gscopops[["default"]] <- RleList(compress=FALSE)
gscopops[["default"]][[mdobj$seqname]] <- obj
assign(mdobj$data_pkgname, gscopops, envir=mcap@.data_cache)

## ---- eval=FALSE--------------------------------------------------------------
#  mcap <- getGScores("mcap.v1.0.hg19")

## -----------------------------------------------------------------------------
mcap
citation(mcap)
gr <- GRanges(seqnames="chr22", IRanges(50967020:50967025, width=1))
gscores(mcap, gr)

## ---- message=FALSE-----------------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg19)

refAlleles <- as.character(getSeq(Hsapiens, gr))
altAlleles <- DNA_BASES[(match(refAlleles, DNA_BASES)) %% 4 + 1]
cbind(REF=refAlleles, ALT=altAlleles)
gscores(mcap, gr, ref=refAlleles, alt=altAlleles)

## -----------------------------------------------------------------------------
gr1 <- GRanges(seqnames="chr22", IRanges(start=50967020:50967025, width=1))
gr1sco <- gscores(phast, gr1)
gr1sco
mean(gr1sco$default)
gr2 <- GRanges("chr22:50967020-50967025")
gscores(phast, gr2)

## -----------------------------------------------------------------------------
gscores(phast, gr2, summaryFun=max)
gscores(phast, gr2, summaryFun=min)
gscores(phast, gr2, summaryFun=median)

## -----------------------------------------------------------------------------
phastqfun <- qfun(phast)
phastqfun
phastdqfun <- dqfun(phast)
phastdqfun

## -----------------------------------------------------------------------------
gr1sco <- gscores(phast, gr1, quantized=TRUE)
gr1sco

## -----------------------------------------------------------------------------
phastdqfun(gr1sco$default)

## -----------------------------------------------------------------------------
populations(phast)

## -----------------------------------------------------------------------------
defaultPopulation(phast)

## -----------------------------------------------------------------------------
gscores(phast, gr1, pop="DP2")
qfun(phast, pop="DP2")
dqfun(phast, pop="DP2")

## -----------------------------------------------------------------------------
defaultPopulation(phast) <- "DP2"
phast
gscores(phast, gr1)
qfun(phast)
dqfun(phast)

## ---- message=FALSE-----------------------------------------------------------
library(MafDb.1Kgenomes.phase1.hs37d5)

mafdb <- MafDb.1Kgenomes.phase1.hs37d5
mafdb

populations(mafdb)

## ---- eval=FALSE, message=FALSE-----------------------------------------------
#  library(gwascat)
#  
#  gwascat <- makeCurrentGwascat()
#  eyersids <- unique(subsetByTraits(gwascat, tr="Eye color")$SNPS)

## ---- echo=FALSE--------------------------------------------------------------
eyersids <- readRDS(file.path(system.file("extdata", package="GenomicScores"),
                              "eyersids.rds"))

## -----------------------------------------------------------------------------
eyersids

## ---- message=FALSE-----------------------------------------------------------
library(SNPlocs.Hsapiens.dbSNP144.GRCh37)
rng <- snpsById(SNPlocs.Hsapiens.dbSNP144.GRCh37, ids=eyersids)
rng

## ---- message=FALSE-----------------------------------------------------------
eyecolormafs <- gscores(mafdb, rng, pop=c("AF", "EUR_AF", "AFR_AF"))
eyecolormafs

## ---- message=FALSE-----------------------------------------------------------
gscores(mafdb, eyersids, pop=c("AF", "EUR_AF", "AFR_AF"))

## ----eyecolormafs, fig.cap = "Eye color MAFs. Minor allele frequencies (MAFs) of variants associated with eye color for global, european and african populations of the Phase 1 data from the 1000 Genomes Project.", fig.height=5, fig.wide = TRUE, echo=TRUE----
par(mar=c(3, 5, 1, 1))
mafpops <- c("AF", "EUR_AF", "AFR_AF")
bp <- barplot(t(as.matrix(mcols(eyecolormafs)[, mafpops])), beside=TRUE,
              col=c("darkblue", "darkgreen", "darkorange"),
              ylim=c(0, 0.55), las=1, ylab="Minor allele frequency")
axis(1, bp[2, ], 1:length(eyecolormafs))
mtext("Variant", side=1, line=2)
legend("topright", mafpops, fill=c("darkblue", "darkgreen", "darkorange"))

## ---- message=FALSE-----------------------------------------------------------
library(VariantAnnotation)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

## -----------------------------------------------------------------------------
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl)##, "hg19")
seqlevelsStyle(vcf)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
seqlevelsStyle(txdb)

## -----------------------------------------------------------------------------
seqlevelsStyle(vcf) <- seqlevelsStyle(txdb)

## ---- message=FALSE-----------------------------------------------------------
loc <- locateVariants(vcf, txdb, AllVariants())
loc[1:3]
table(loc$LOCATION)

## -----------------------------------------------------------------------------
loc$PHASTCONS <- score(phast, loc)
loc[1:3]

## ----plot1, fig.cap = "Distribution of phastCons conservation scores in variants across different annotated regions. Diamonds indicate mean values.", echo = FALSE, fig.height=5, fig.wide = TRUE, echo=TRUE----
x <- split(loc$PHASTCONS, loc$LOCATION)
mask <- elementNROWS(x) > 0
boxplot(x[mask], ylab="phastCons score", las=1, cex.axis=1.2, cex.lab=1.5, col="gray")
points(1:length(x[mask])+0.25, sapply(x[mask], mean, na.rm=TRUE), pch=23, bg="black")

## -----------------------------------------------------------------------------
maskSNVs <- isSNV(vcf)[loc$QUERYID]
loc$MCAP <- rep(NA_real_, length(loc))
loc$MCAP[maskSNVs] <- score(mcap, loc[maskSNVs],
                            ref=ref(vcf)[loc$QUERYID[maskSNVs]],
                            alt=alt(vcf)[loc$QUERYID[maskSNVs]])

## ---- echo=FALSE--------------------------------------------------------------
obj <- readRDS(system.file("extdata", "cadd.v1.3.hg19.chr22.rds", package="GenomicScores"))
mdobj <- metadata(obj)
cadd <- GScores(provider=mdobj$provider,
                provider_version=mdobj$provider_version,
                download_url=mdobj$download_url,
                download_date=mdobj$download_date,
                reference_genome=mdobj$reference_genome,
                data_pkgname=mdobj$data_pkgname,
                data_dirpath="../inst/extdata",
                data_serialized_objnames=c(cadd.v1.3.hg19.chr22.rds="cadd.v1.3.hg19.chr22.rds"),
                data_tag="CADD")
gscopops <- get(mdobj$data_pkgname, envir=cadd@.data_cache)
gscopops[["default"]] <- RleList(compress=FALSE)
gscopops[["default"]][[mdobj$seqname]] <- obj
assign(mdobj$data_pkgname, gscopops, envir=cadd@.data_cache)

## -----------------------------------------------------------------------------
maskSNVs <- isSNV(vcf)[loc$QUERYID]
loc$CADD <- rep(NA_real_, length(loc))
loc$CADD[maskSNVs] <- score(cadd, loc[maskSNVs],
                            ref=ref(vcf)[loc$QUERYID[maskSNVs]],
                            alt=alt(vcf)[loc$QUERYID[maskSNVs]])

## ----mcapvscadd, fig.cap = "Comparison of M-CAP and CADD scores. Values on the x- and y-axis are jittered to facilitate visualization.", echo = FALSE, fig.height=5, fig.width=7, dpi=100, echo=TRUE----
library(RColorBrewer)
par(mar=c(4, 5, 1, 1))
hmcol <- colorRampPalette(brewer.pal(nlevels(loc$LOCATION), "Set1"))(nlevels(loc$LOCATION))
plot(jitter(loc$MCAP, factor=2), jitter(loc$CADD, factor=2), pch=19,
     col=hmcol, xlab="M-CAP scores", ylab="CADD scores",
     las=1, cex.axis=1.2, cex.lab=1.5, panel.first=grid())
legend("bottomright", levels(loc$LOCATION), pch=19, col=hmcol, inset=0.01)

## -----------------------------------------------------------------------------
seqlevelsStyle(loc) <- seqlevelsStyle(mafdb)[1]
seqinfo(loc, new2old=match(seqlevels(mafdb), seqlevels(loc))) <- seqinfo(mafdb)
maskSNVs <- isSNV(vcf)[loc$QUERYID]
loc$MAF[maskSNVs] <- gscores(mafdb, loc[maskSNVs])$AF
loc$MAF[!maskSNVs] <- gscores(mafdb, loc[!maskSNVs], type="nonsnrs")$AF
loc[1:3]

## -----------------------------------------------------------------------------
afs <- DataFrame(matrix(NA, nrow=length(loc), ncol=2,
                        dimnames=list(NULL, c("AF_REF", "AF_ALT"))))
afs[maskSNVs, ] <- score(mafdb, loc[maskSNVs],
                         ref=ref(vcf)[loc$QUERYID[maskSNVs]],
                         alt=alt(vcf)[loc$QUERYID[maskSNVs]])
afs[!maskSNVs, ] <- score(mafdb, loc[!maskSNVs],
                          ref=ref(vcf)[loc$QUERYID[!maskSNVs]],
                          alt=alt(vcf)[loc$QUERYID[!maskSNVs]],
                          type="nonsnrs")
mcols(loc) <- cbind(mcols(loc), afs)
loc[1:3]

## ----showpositions, message=FALSE, cache=FALSE--------------------------------
origpcscoCDS <- readRDS(system.file("extdata", "origphastCons100wayhg19CDS.rds", package="GenomicScores"))
origpcscoCDS

length(unique(origpcscoCDS$score))

## -----------------------------------------------------------------------------
numDecimals <- function(x) {
  spl <- strsplit(as.character(x+1), "\\.")
  spl <- sapply(spl, "[", 2)
  spl[is.na(spl)] <- ""
  nchar(spl)
}

nd1 <- numDecimals(origpcscoCDS$score)
table(nd1)

## ----showpositions2, message=FALSE, cache=FALSE-------------------------------
origpcsco3UTRs <- readRDS(system.file("extdata", "origphastCons100wayhg193UTR.rds", package="GenomicScores"))
origpcsco3UTRs

length(table(origpcsco3UTRs$score))

nd2 <- numDecimals(origpcsco3UTRs$score)
table(nd2)

## -----------------------------------------------------------------------------
defaultPopulation(phast) <- "default"
phast

## -----------------------------------------------------------------------------
pkgpcscoCDS <- score(phast, origpcscoCDS)
pkgpcsco3UTRs <- score(phast, origpcsco3UTRs)

## ----plot2, fig.height=9, fig.width=8, dpi=100, fig.cap = "Original and lossy-compressed phastCons scores. Top panels (a, b): comparison of the distribution of values. Bottom panels (c, d): comparison of the cumulative distribution", echo = FALSE----
labelPanel <- function(lab, font=2, cex=2, offsetx=0.05, offsety=0.05) {
  par(xpd=TRUE)
  w <- par("usr")[2] - par("usr")[1]
  h <- par("usr")[4] - par("usr")[3]
  text(par("usr")[1]-w*offsetx, par("usr")[4]+h*offsety, lab, font=font, cex=cex)
  par(xpd=FALSE)
}

par(mfrow=c(2, 2), mar=c(4, 5, 2, 1))
plot(origpcscoCDS$score, jitter(pkgpcscoCDS), pch=19, cex=1, cex.lab=1.2,
     xaxt="n", yaxt="n", xlab="Original phastCons scores (CDS)",
     ylab="Compressed phastCons scores (CDS)")
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
abline(0, 1)
labelPanel(letters[1])
plot(origpcsco3UTRs$score, jitter(pkgpcsco3UTRs), pch=19, cex=1, cex.lab=1.2,
     xaxt="n", yaxt="n", xlab="Original phastCons scores (3' UTR)",
     ylab="Compressed phastCons scores (3' UTR)")
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
abline(0, 1)
labelPanel(letters[2])
ForigCDS <- ecdf(origpcscoCDS$score)
FpkgCDS <- ecdf(pkgpcscoCDS)
plot(sort(origpcscoCDS$score), ForigCDS(sort(origpcscoCDS$score)), xaxt="n", yaxt="n", cex.lab=1.2,
     pch=".", cex=4, xlab="phastCons scores (CDS)", ylab="F(x)", ylim=c(0, 1))
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
points(sort(pkgpcscoCDS), FpkgCDS(sort(pkgpcscoCDS)), pch=19, cex=1)
legend("topleft", c("Original score", "Rounded score"), pch=c(46, 19),
       pt.cex=c(4, 1), inset=0.01, bg="white")
labelPanel(letters[3])
Forig3UTRs <- ecdf(origpcsco3UTRs$score)
Fpkg3UTRs <- ecdf(pkgpcsco3UTRs)
plot(sort(origpcsco3UTRs$score), Forig3UTRs(sort(origpcsco3UTRs$score)), xaxt="n", yaxt="n", cex.lab=1.2,
     pch=".", cex=4, xlab="phastCons scores (3'UTR)", ylab="F(x)", ylim=c(0, 1))
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
points(sort(pkgpcsco3UTRs), Fpkg3UTRs(sort(pkgpcsco3UTRs)), pch=19, cex=1)
legend("topleft", c("Original score", "Rounded score"), pch=c(46, 19),
       pt.cex=c(4, 1), inset=0.01, bg="white")
labelPanel(letters[4])

## ----session_info, cache=FALSE------------------------------------------------
sessionInfo()

Try the GenomicScores package in your browser

Any scripts or data that you put into this service are public.

GenomicScores documentation built on Nov. 8, 2020, 5:21 p.m.