Basic genomic regions exploration

Project: r project.

Introduction

This report is meant to help explore a set of genomic regions and was generated using the regionReport r Citep(bib[['regionReport']]) package. While the report is rich, it is meant to just start the exploration of the results and exemplify some of the code used to do so. If you need a more in-depth analysis for your specific data set you might want to use the customCode argument.

Most plots were made with using ggplot2 r Citep(bib[['ggplot2']]).

Code setup

## knitrBoostrap and device chunk options
library('knitr')
opts_chunk$set(bootstrap.show.code = FALSE, dev = device, crop = NULL)
if(!outputIsHTML) opts_chunk$set(bootstrap.show.code = FALSE, dev = device, echo = FALSE)
#### Libraries needed

## Bioconductor
library('bumphunter')
library('derfinder')
library('derfinderPlot')
library('GenomeInfoDb')
library('GenomicRanges')
library('ggbio')

## Transcription database to use by default
if(is.null(txdb)) {
    library('TxDb.Hsapiens.UCSC.hg19.knownGene')
    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
}

## CRAN
library('ggplot2')
if(!is.null(theme)) theme_set(theme)
library('grid')
library('gridExtra')
library('knitr')
library('RColorBrewer')
library('mgcv')
library('whisker')
library('DT')
library('sessioninfo')

#### Code setup

## For ggplot
tmp <- regions
names(tmp) <- seq_len(length(tmp))
regions.df <- as.data.frame(tmp)
regions.df$width <- width(tmp)
rm(tmp)

## Special subsets: need at least 3 points for a density plot
keepChr <- table(regions.df$seqnames) > 2
regions.df.plot <- subset(regions.df, seqnames %in% names(keepChr[keepChr]))

if(hasSignificant) {
    ## Keep only those sig
    regions.df.sig <- regions.df[significantVar, ]
    keepChr <- table(regions.df.sig$seqnames) > 2
    regions.df.sig <- subset(regions.df.sig, seqnames %in% names(keepChr[keepChr]))
}

## Find which chrs are present in the data set
chrs <- levels(seqnames(regions))

## areaVar initialize
areaVar <- NULL

Quality checks

for(i in seq_len(length(pvalueVars))) {
    densityVarName <- names(pvalueVars[i])
    densityVarName <- ifelse(is.null(densityVarName), pvalueVars[i], densityVarName)
   cat(knit_child(text = whisker.render(templatePvalueDensityInUse, list(varName = pvalueVars[i], densityVarName = densityVarName)), quiet = TRUE), sep = '\n')
}

Region width

xrange <- range(log10(regions.df.plot$width)) * c(0.95, 1.05)
p2a <- ggplot(regions.df.plot, aes(x=log10(width), colour=seqnames)) + 
    geom_line(stat='density') + labs(title='Density of region lengths') +
    xlab('Region width (log10)') + scale_colour_discrete(limits=chrs) +
    xlim(xrange) + theme(legend.title=element_blank())
p2b <- ggplot(regions.df.sig, aes(x=log10(width), colour=seqnames)) +
    geom_line(stat='density') +
    labs(title='Density of region lengths (significant only)') +
    xlab('Region width (log10)') + scale_colour_discrete(limits=chrs) +
    xlim(xrange) + theme(legend.title=element_blank())
grid.arrange(p2a, p2b)
p2a <- ggplot(regions.df.plot, aes(x=log10(width), colour=seqnames)) +
    geom_line(stat='density') + labs(title='Density of region lengths') +
    xlab('Region width (log10)') + scale_colour_discrete(limits=chrs) +
    theme(legend.title=element_blank())
p2a

This plot shows the density of the region lengths for all regions. r ifelse(hasSignificant, 'The bottom panel is restricted to significant regions.', '')

for(i in seq_len(length(densityVars))) {
    densityVarName <- names(densityVars[i])
    densityVarName <- ifelse(is.null(densityVarName), densityVars[i], densityVarName)
   cat(knit_child(text = whisker.render(templateDensityInUse, list(varName = densityVars[i], densityVarName = densityVarName)), quiet = TRUE), sep = '\n')
}

Genomic overview

The following plots were made using ggbio r Citep(bib[['ggbio']]) which in turn uses ggplot2 r Citep(bib[['ggplot2']]). For more details check plotOverview in derfinderPlot r Citep(bib[['derfinderPlot']]).

P-values

## Choose what variable to show on the top
tmp <- regions
tmp$significant <- factor(significantVar, levels = c('TRUE', 'FALSE'))
if(!'area' %in% colnames(mcols(tmp))) {
    if(hasDensityVars) {
        tmp$area <- mcols(tmp)[[densityVars[1]]]
        areaVar <- densityVars[1]
        areaVar <- ifelse(is.null(names(areaVar)), densityVars[1], names(areaVar))
    } else {
        tmp$area <- 0
        areaVar <- NULL
    }
} else {
    areaVar <- 'area'
}
plotOverview(regions=tmp, type='pval', base_size=overviewParams$base_size, areaRel=overviewParams$areaRel, legend.position=c(0.97, 0.12))
rm(tmp)

This plot shows the genomic locations of the regions found in the analysis. The significant regions are highlighted and the r areaVar of the regions is shown on top of each chromosome r ifelse(is.null(areaVar), '(skipped because there was no applicable variable)', '(shown in a relative scale)').

for(i in seq_len(length(pvalueVars))) {
    densityVarName <- names(pvalueVars[i])
    densityVarName <- ifelse(is.null(densityVarName), pvalueVars[i], densityVarName)
   cat(knit_child(text = whisker.render(templateManhattanInUse, list(varName = pvalueVars[i], densityVarName = densityVarName)), quiet = TRUE), sep = '\n')
}

Annotation

## Annotate regions with bumphunter
if(is.null(annotation)) {
    genes <- annotateTranscripts(txdb = txdb)
    annotation <- matchGenes(x = regions, subject = genes)
}


## Make the plot
plotOverview(regions=regions, annotation=annotation, type='annotation', base_size=overviewParams$base_size, areaRel=overviewParams$areaRel, legend.position=c(0.97, 0.12))

This genomic overview plot shows the annotation region type for the regions as determined using bumphunter r Citep(bib[['bumphunter']]). Note that the regions are shown only if the annotation information is available. Below is a table of the actual number of results per annotation region type.

annoReg <- table(annotation$region, useNA='always')
annoReg.df <- data.frame(Region=names(annoReg), Count=as.vector(annoReg))
if(outputIsHTML) {
    kable(annoReg.df, format = 'markdown', align=rep('c', 3))
} else {
    kable(annoReg.df)
}

Annotation (significant)

plotOverview(regions=regions[significantVar, ], annotation=annotation[significantVar, ], type='annotation', base_size=overviewParams$base_size, areaRel=overviewParams$areaRel, legend.position=c(0.97, 0.12))

This genomic overview plot shows the annotation region type for the statistically significant regions. Note that the regions are shown only if the annotation information is available. r ifelse(hasSignificant, '', 'Plot skipped because there are no significant regions.')

Best regions

Genomic states

Below is a table summarizing the number of genomic states per region as determined using derfinder r Citep(bib[['derfinder']]).

## Construct genomic state object
genomicState <- makeGenomicState(txdb = txdb, chrs = chrs, verbose = FALSE)

## Annotate regions by genomic state
annotatedRegions <- annotateRegions(regions, genomicState$fullGenome, verbose = FALSE)

## Genomic states table
info <- do.call(rbind, lapply(annotatedRegions$countTable, function(x) { data.frame(table(x)) }))
colnames(info) <- c('Number of Overlapping States', 'Frequency')
info$State <- gsub('\\..*', '', rownames(info))
rownames(info) <- NULL
if(outputIsHTML) {
    kable(info, format = 'markdown', align=rep('c', 4))
} else {
    kable(info)
}

The following is a venn diagram showing how many regions overlap known exons, introns, and intergenic segments, none of them, or multiple of these groups.

## Venn diagram for all regions
venn <- vennRegions(annotatedRegions, counts.col = 'blue', 
    main = 'Regions overlapping genomic states')

r ifelse(hasSignificant, 'The following plot is the genomic states venn diagram only for the significant regions.', '')

## Venn diagram for all regions
vennSig <- vennRegions(annotatedRegions, counts.col = 'blue', 
    main = 'Significant regions overlapping genomic states',
    subsetIndex = significantVar)

Region information

Below is an interactive table with the top r min(nrow(regions.df), nBestRegions) regions (out of r nrow(regions.df)) as ranked by p-value r ifelse(hasPvalueVars, '', 'without ranking because no p-value information was provided'). Inf and -Inf are shown as 1e100 and -1e100 respectively. r ifelse(outputIsHTML, 'Use the search function to find your region of interest or sort by one of the columns.', 'Since the report is in PDF format, only the top 20 regions are shown.')

## Add annotation information
regions.df <- cbind(regions.df, annotation)

## Rank by p-value (first pvalue variable supplied)
if(hasPvalueVars){
    topRegions <- head(regions.df[order(regions.df[, pvalueVars[1]], 
        decreasing = FALSE), ], nBestRegions)
    topRegions <- cbind(data.frame('pvalueRank' = seq_len(nrow(topRegions))), 
        topRegions)
} else {
    topRegions <- head(regions.df, nBestRegions)
}

## Clean up -Inf, Inf if present
## More details at https://github.com/ramnathv/rCharts/issues/259
replaceInf <- function(df, colsubset=seq_len(ncol(df))) {
    for(i in colsubset) {
        inf.idx <- !is.finite(df[, i])
        if(any(inf.idx)) {
            inf.sign <- sign(df[inf.idx, i])
            df[inf.idx, i] <- inf.sign * 1e100
        }
    }
    return(df)
}
topRegions <- replaceInf(topRegions, which(sapply(topRegions, function(x) {
    class(x) %in% c('numeric', 'integer')})))

## Make the table
greptext <- 'value$|area$|mean|log2FoldChange'
greppval <- 'pvalues$|qvalues$|fwer$'
if(hasPvalueVars) {
    greppval <- paste0(paste(pvalueVars, collapse = '$|'), '$|', greppval)
}
if(hasDensityVars) {
    greptext <- paste0(paste(densityVars, collapse = '$|'), '$|', greptext)
}

for(i in which(grepl(greppval, colnames(topRegions)))) topRegions[, i] <- format(topRegions[, i], scientific = TRUE)

if(outputIsHTML) {
    datatable(topRegions, options = list(pagingType='full_numbers', pageLength=10, scrollX='100%'), rownames = FALSE) %>% formatRound(which(grepl(greptext, colnames(topRegions))), digits)
} else {
    ## Only print the top part if your output is a PDF file
    df_top <- head(topRegions, 20)
    for(i in which(grepl(greptext, colnames(topRegions)))) df_top[, i] <- round(df_top[, i], digits)
    kable(df_top)
}

Reproducibility

This report was generated in path r tmpdir using the following call to renderReport():

theCall

Date the report was generated.

## Date the report was generated
Sys.time()

Wallclock time spent generating the report.

## Processing time in seconds
totalTime <- diff(c(startTime, Sys.time()))
round(totalTime, digits=3)

R session information.

## Session info
options(width = 120)
session_info()

Pandoc version used: r rmarkdown::pandoc_version().

Bibliography

This report was created with regionReport r Citep(bib[['regionReport']]) using rmarkdown r Citep(bib[['rmarkdown']]) while knitr r Citep(bib[['knitr']]) and DT r Citep(bib[['DT']]) were running behind the scenes. whisker r Citep(bib[['whisker']]) was used for creating templates for the pvalueVars and densityVars.

Citations made with r CRANpkg('RefManageR') r Citep(bib[['RefManageR']]). The BibTeX file can be found here.

## Print bibliography
PrintBibliography(bib, .opts = list(hyperlink = "to.doc", style = "html"))


Try the regionReport package in your browser

Any scripts or data that you put into this service are public.

regionReport documentation built on Dec. 20, 2020, 2:01 a.m.