Nothing
## ----install_h5vcData, eval=FALSE---------------------------------------------
# if (!requireNamespace("BiocManager", quietly=TRUE))
# install.packages("BiocManager")
# BiocManager::install("h5vcData")
## ----loadPackages-------------------------------------------------------------
suppressPackageStartupMessages(library(h5vc))
suppressPackageStartupMessages(library(rhdf5))
tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
## ----listTallyFile------------------------------------------------------------
h5ls(tallyFile)
## ----getSampleData------------------------------------------------------------
sampleData <- getSampleData( tallyFile, "/ExampleStudy/16" )
sampleData
## ----setSampleData------------------------------------------------------------
sampleData$ClinicalVariable <- rnorm(nrow(sampleData))
setSampleData( tallyFile, "/ExampleStudy/16", sampleData )
sampleData
## ----h5readBlockExample-------------------------------------------------------
data <- h5readBlock(
filename = tallyFile,
group = "/ExampleStudy/16",
names = c( "Coverages", "Counts" ),
range = c(29000000,29001000)
)
str(data)
## ----h5dapplyGRanges----------------------------------------------------------
suppressPackageStartupMessages(require(GenomicRanges))
data <- h5dapply(
filename = tallyFile,
group = "/ExampleStudy",
names = c( "Coverages" ),
dims = c(3),
range = GRanges("16", ranges = IRanges(start = seq(29e6, 30e6, 5e6), width = 1000))
)
str(data)
## ----h5dapplyExample----------------------------------------------------------
#rangeA <- GRanges("16", ranges = IRanges(start = seq(29e6, 29.5e6, 1e5), width = 1000))
#rangeB <- GRanges("22", ranges = IRanges(start = seq(39e6, 39.5e6, 1e5), width = 1000))
range <- GRanges(
rep(c("16", "22"), each = 6),
ranges = IRanges(
start = c(seq(29e6, 29.5e6, 1e5),seq(39e6, 39.5e6, 1e5)),
width = 1000
))
coverages <- h5dapply(
filename = tallyFile,
group = "/ExampleStudy",
names = c( "Coverages" ),
dims = c(3),
range = range,
FUN = binnedCoverage,
sampledata = sampleData
)
#options(scipen=10)
coverages <- do.call( rbind, lapply( coverages, function(x) do.call(rbind, x) ))
#rownames(coverages) <- NULL #remove block-ids used as row-names
coverages
## ----callVariantsExample------------------------------------------------------
variants <- h5dapply(
filename = tallyFile,
group = "/ExampleStudy/16",
names = c( "Coverages", "Counts", "Deletions", "Reference" ),
range = c(29950000,30000000),
blocksize = 10000,
FUN = callVariantsPaired,
sampledata = sampleData,
cl = vcConfParams(returnDataPoints = TRUE)
)
variants <- do.call( rbind, variants )
variants$AF <- (variants$caseCountFwd + variants$caseCountRev) / (variants$caseCoverageFwd + variants$caseCoverageRev)
variants <- variants[variants$AF > 0.2,]
rownames(variants) <- NULL # remove rownames to save some space on output :D
variants
## ----mismatchPlotExample, fig.width=10.5, fig.height=8.5, dpi=300, out.width="750px", fig.retina=1----
windowsize <- 35
position <- variants$Start[2]
data <- h5readBlock(
filename = tallyFile,
group = paste( "/ExampleStudy", variants$Chrom[2], sep="/" ),
names = c("Coverages","Counts","Deletions", "Reference"),
range = c( position - windowsize, position + windowsize)
)
patient <- sampleData$Patient[sampleData$Sample == variants$Sample[2]]
samples <- sampleData$Sample[sampleData$Patient == patient]
p <- mismatchPlot(
data = data,
sampledata = sampleData,
samples = samples,
windowsize = windowsize,
position = position
)
print(p)
## ----mismatchPlotExamplesTheming, fig.width=10.5, fig.height=8.5, dpi=300, out.width="750px", fig.show='asis'----
print(p + theme(strip.text.y = element_text(family="serif", size=16, angle=0)))
## -----------------------------------------------------------------------------
suppressPackageStartupMessages(require(IRanges))
suppressPackageStartupMessages(require(biomaRt))
mart <- useDataset(dataset = "hsapiens_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org"))
exons <- getBM(attributes = c("ensembl_exon_id", "exon_chrom_start", "exon_chrom_end"), filters = c("chromosome_name"), values = c("16"), mart)
exons <- subset(exons, exon_chrom_start > 29e6 & exon_chrom_end < 30e6)
ranges <- IRanges(start = exons$exon_chrom_start, end = exons$exon_chrom_end)
coverages <- h5dapply(
filename = tallyFile,
group = "/ExampleStudy/16",
names = c( "Coverages" ),
dims = c(3),
range = ranges,
FUN = binnedCoverage,
sampledata = sampleData
)
options(scipen=10)
coverages <- do.call( rbind, coverages )
rownames(coverages) <- NULL #remove block-ids used as row-names
coverages$ExonID <- exons$ensembl_exon_id
head(coverages)
## ----mismatchPlotRangesExample, fig.width=10.5, fig.height=8.5, dpi=300, out.width="750px", fig.retina=1----
windowsize <- 35
position <- variants$Start[2]
data <- h5dapply(
filename = tallyFile,
group = paste( "/ExampleStudy", variants$Chrom[2], sep="/" ),
names = c("Coverages","Counts","Deletions", "Reference"),
range = flank( IRanges(start = c(position - 10, position, position + 10), width = 1), width = 15, both = TRUE )
)
p <- mismatchPlot(
data = data,
sampledata = sampleData,
samples <- c("PT5ControlDNA", "PT5PrimaryDNA", "PT5RelapseDNA", "PT8ControlDNA", "PT8EarlyStageDNA", "PT8PrimaryDNA")
)
print(p)
## ----loadFiles, eval = .Platform$OS.type == "unix", results="hide"------------
suppressPackageStartupMessages(library("h5vc"))
suppressPackageStartupMessages(library("rhdf5"))
files <- list.files( system.file("extdata", package = "h5vcData"), "Pt.*bam$" )
files
bamFiles <- file.path( system.file("extdata", package = "h5vcData"), files)
## ----chromDim, eval = (.Platform$OS.type == "unix")---------------------------
suppressPackageStartupMessages(library("Rsamtools"))
chromdim <- sapply( scanBamHeader(bamFiles), function(x) x$targets )
colnames(chromdim) <- files
head(chromdim)
## ----hdf5SetUp, eval = .Platform$OS.type == "unix"----------------------------
chrom <- "2"
chromlength <- chromdim[chrom,1]
study <- "/DNMT3A"
tallyFile <- file.path( tempdir(), "DNMT3A.tally.hfs5" )
if( file.exists(tallyFile) ){
file.remove(tallyFile)
}
if( prepareTallyFile( tallyFile, study, chrom, chromlength, nsamples = length(files) ) ){
h5ls(tallyFile)
}else{
message( paste( "Preparation of:", tallyFile, "failed" ) )
}
## ----sampleDataSetUp, eval = .Platform$OS.type == "unix"----------------------
sampleData <- data.frame(
File = files,
Type = "Control",
stringsAsFactors = FALSE
)
sampleData$Sample <- gsub(x = sampleData$File, pattern = ".bam", replacement = "")
sampleData$Patient <- substr(sampleData$Sample, start = 1, 4)
sampleData$Column <- seq_along(files)
sampleData$Type[grep(pattern = "Cancer", x = sampleData$Sample)] <- "Case"
group <- paste( study, chrom, sep = "/" )
setSampleData( tallyFile, group, sampleData )
getSampleData( tallyFile, group )
## ----tallyingTheBamFiles, eval = .Platform$OS.type == "unix"------------------
suppressPackageStartupMessages(require(BSgenome.Hsapiens.NCBI.GRCh38))
suppressPackageStartupMessages(require(GenomicRanges))
dnmt3a <- read.table(system.file("extdata", "dnmt3a.txt", package = "h5vcData"), header=TRUE, stringsAsFactors = FALSE)
dnmt3a <- with( dnmt3a, GRanges(seqname, ranges = IRanges(start = start, end = end)))
dnmt3a <- reduce(dnmt3a)
require(BiocParallel)
register(MulticoreParam())
theData <- tallyRanges( bamFiles, ranges = dnmt3a, reference = Hsapiens )
str(theData[[1]])
## ----writingToTallyFile, eval = .Platform$OS.type == "unix"-------------------
writeToTallyFile(theData, tallyFile, study = "/DNMT3A", ranges = dnmt3a)
## ----extractingData, eval = .Platform$OS.type == "unix"-----------------------
data <- h5dapply(
filename = tallyFile,
group = "/DNMT3A",
range = dnmt3a
)
str(data[["2"]][[1]])
## ----callingVariants, eval = .Platform$OS.type == "unix"----------------------
vars <- h5dapply(
filename = tallyFile,
group = "/DNMT3A",
FUN = callVariantsPaired,
sampledata = getSampleData(tallyFile,group),
cl = vcConfParams(),
range = dnmt3a
)
vars <- do.call(rbind, vars[["2"]])
vars
## ----plottingVariant, eval = .Platform$OS.type == "unix"----------------------
position <- vars$End[1]
windowsize <- 30
data <- h5readBlock(
filename = tallyFile,
group = group,
range = c(position - windowsize, position + windowsize)
)
sampleData <- getSampleData(tallyFile,group)
p <- mismatchPlot( data, sampleData, samples = c("Pt17Control", "Pt17Cancer"), windowsize=windowsize, position=position )
print(p)
## -----------------------------------------------------------------------------
tallyFileMS <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
data( "example.variants", package = "h5vcData" ) #example variant calls
head(variantCalls)
ms = mutationSpectrum( variantCalls, tallyFileMS, "/ExampleStudy" )
head(ms)
## ----plottingMS, fig.width=10.5, fig.height=7, dpi=300, out.width="750px"-----
plotMutationSpectrum(ms) + theme(
strip.text.y = element_text(angle=0, size=10),
axis.text.x = element_text(size = 7),
axis.text.y = element_text(size = 10)) + scale_y_continuous(breaks = c(0,5,10,15))
## ----parallelTally, eval = .Platform$OS.type == "unix"------------------------
register(MulticoreParam())
multicore.time <- system.time(theData <- tallyRanges( bamFiles, ranges = dnmt3a, reference = Hsapiens ))
register(SerialParam())
serial.time <- system.time(theData <- tallyRanges( bamFiles, ranges = dnmt3a, reference = Hsapiens ))
serial.time["elapsed"]
multicore.time["elapsed"]
## ----parallelTallyToFile, eval = .Platform$OS.type == "unix"------------------
register(MulticoreParam())
multicore.time <- system.time(tallyRangesToFile( tallyFile, study, bamFiles, ranges = dnmt3a, reference = Hsapiens ))
register(SerialParam())
serial.time <- system.time(tallyRangesToFile( tallyFile, study, bamFiles, ranges = dnmt3a, reference = Hsapiens ))
serial.time["elapsed"]
multicore.time["elapsed"]
## ---- eval = .Platform$OS.type == "unix"--------------------------------------
tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")
theRanges <- GRanges("16", ranges = IRanges(start = seq(29e6,29.2e6,1000), width = 1000))
register(SerialParam())
system.time(
coverages_serial <- h5dapply(
filename = tallyFile,
group = "/ExampleStudy",
names = c( "Coverages" ),
dims = c(3),
range = theRanges,
FUN = binnedCoverage,
sampledata = sampleData
)
)
register(MulticoreParam())
system.time(
coverages_parallel <- h5dapply(
filename = tallyFile,
group = "/ExampleStudy",
names = c( "Coverages" ),
dims = c(3),
range = theRanges,
FUN = binnedCoverage,
sampledata = sampleData
)
)
## ---- eval = FALSE------------------------------------------------------------
# tallyRangesBatch( tallyFile, study = "/DNMT3A", bamfiles = bamFiles, ranges = dnmt3a, reference = Hsapiens )
## ----batchjobsconf, eval=FALSE------------------------------------------------
# cluster.functions <- makeClusterFunctionsLSF("/home/pyl/batchjobs/lsf.tmpl")
# mail.start <- "first"
# mail.done <- "last"
# mail.error <- "all"
# mail.from <- "<paul.theodor.pyl@embl.de>"
# mail.to <- "<pyl@embl.de>"
# mail.control <- list(smtpServer="smtp.embl.de")
## ----eval=FALSE---------------------------------------------------------------
# ## Default resources can be set in your .BatchJobs.R by defining the variable
# ## 'default.resources' as a named list.
#
# ## remove everthing in [] if your cluster does not support arrayjobs
# #BSUB-J <%= job.name %>[1-<%= arrayjobs %>] # name of the job / array jobs
# #BSUB-o <%= log.file %> # output is sent to logfile, stdout + stderr by default
# #BSUB-n <%= resources$ncpus %> # Number of CPUs on the node
# #BSUB-q <%= resources$queue %> # Job queue
# #BSUB-W <%= resources$walltime %> # Walltime in minutes
# #BSUB-M <%= resources$memory %> # Memory requirements in Kbytes
#
# # we merge R output with stdout from LSF, which gets then logged via -o option
# R CMD BATCH --no-save --no-restore "<%= rscript %>" /dev/stdout
## ----eval=FALSE---------------------------------------------------------------
# library("BiocParallel")
# library("BatchJobs")
# cf <- makeClusterFunctionsLSF("/home/pyl/batchjobs/lsf.tmpl")
# bjp <- BatchJobsParam( cluster.functions=cf, resources = list("queue" = "medium_priority", "memory"="4000", "ncpus"="4", walltime="00:30") )
# bplapply(1:10, sqrt)
# bplapply(1:10, sqrt, BPPARAM=bjp)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.