knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.align = "center" )
valr
includes several functions for exploring statistical relationships between sets of intervals.
Calculate significance of overlaps between sets of intervals with bed_fisher()
and bed_projection()
.
Quantify relative and absolute distances between sets of intervals with bed_reldist()
and bed_absdist()
.
Quantify extent of overlap between sets of intervals with bed_jaccard()
.
In this vignette we explore the relationship between transcription start sites and repetitive elements in the human genome.
library(valr) library(dplyr) library(ggplot2) library(cowplot) library(tidyr) # load repeats and genes. Data in the valr package is restricted to chr22; the entire # files can be downloaded from UCSC. rpts <- read_bed(valr_example("hg19.rmsk.chr22.bed.gz")) genes <- read_bed12(valr_example("hg19.refGene.chr22.bed.gz")) # load chrom sizes genome <- read_genome(valr_example("hg19.chrom.sizes.gz")) # create 1 bp intervals representing transcription start sites tss <- create_tss(genes) tss
First we define a function that takes x
and y
intervals and computes distance statistics (using bed_reldist()
and bed_absdist()
) for specified groups. The value of each statistic is assigned to a .value
column.
distance_stats <- function(x, y, genome, group_var, type = NA) { group_by(x, !!rlang::sym(group_var)) |> do( reldist = bed_reldist(., y, detail = TRUE) |> select(.value = .reldist), absdist = bed_absdist(., y, genome) |> select(.value = .absdist) ) |> tidyr::pivot_longer( cols = -name, names_to = "stat", values_to = "value" ) |> mutate(type = type) }
We use the distance_stats()
function to apply the bed_absdist()
function to each group of data.
obs_stats <- distance_stats(rpts, tss, genome, "name", "obs") obs_stats
And the same is done for a set of shuffled group of data. bed_shuffle()
is used to shuffle coordinates of the repeats within each chromosome (i.e., the coordinates change, but the chromosome stays the same.)
shfs <- bed_shuffle(rpts, genome, within = TRUE) shf_stats <- distance_stats(shfs, tss, genome, "name", "shuf")
Now we can bind the observed and shuffled data together, and do some tidying to put the data into a format appropriate for a statistical test. This involves:
unnest()
ing the data framesname
), stat (reldist
or absdist
) and type (obs
or shf
)tidyr::pivot_wider()
to create two new obs
and shuf
columnsNA
values.res <- bind_rows(obs_stats, shf_stats) |> tidyr::unnest(value) |> group_by(name, stat, type) |> mutate(.id = row_number()) |> tidyr::pivot_wider( names_from = "type", values_from = ".value" ) |> na.omit() res
Now that the data are formatted, we can use the non-parametric ks.test()
to determine whether there are significant differences between the observed and shuffled data for each group. broom::tidy()
is used to reformat the results of each test into a tibble
, and the results of each test are pivot
ed to into a type
column for each test type.
library(broom) pvals <- res |> do( twosided = tidy(ks.test(.$obs, .$shuf)), less = tidy(ks.test(.$obs, .$shuf, alternative = "less")), greater = tidy(ks.test(.$obs, .$shuf, alternative = "greater")) ) |> tidyr::pivot_longer(cols = -c(name, stat), names_to = "alt", values_to = "type") |> unnest(type) |> select(name:p.value) |> arrange(p.value)
Histgrams of the different stats help visualize the distribution of p.values.
ggplot(pvals, aes(p.value)) + geom_histogram(binwidth = 0.05) + facet_grid(stat ~ alt) + theme_cowplot()
We can also assess false discovery rates (q.values) using p.adjust()
.
pvals <- group_by(pvals, stat, alt) |> mutate(q.value = p.adjust(p.value)) |> ungroup() |> arrange(q.value)
Finally we can visualize these results using stat_ecdf()
.
res_gather <- tidyr::pivot_longer(res, cols = -c(name, stat, .id), names_to = "type", values_to = "value" ) signif <- head(pvals, 5) res_signif <- signif |> left_join(res_gather, by = c("name", "stat")) ggplot(res_signif, aes(x = value, color = type)) + stat_ecdf() + facet_grid(stat ~ name) + theme_cowplot() + scale_x_log10() + scale_color_brewer(palette = "Set1")
bed_projection()
is a statistical approach to assess the relationship between two intervals based on the binomial distribution. Here, we examine the distribution of repetitive elements within the promoters of coding or non-coding genes.
First we'll extract 5 kb regions upstream of the transcription start sites to represent the promoter regions for coding and non-coding genes.
# create intervals 5kb upstream of tss representing promoters promoters <- bed_flank(genes, genome, left = 5000, strand = TRUE) |> mutate(name = ifelse(grepl("NR_", name), "non-coding", "coding")) |> select(chrom:strand) # select coding and non-coding promoters promoters_coding <- filter(promoters, name == "coding") promoters_ncoding <- filter(promoters, name == "non-coding") promoters_coding promoters_ncoding
Next we'll apply the bed_projection()
test for each repeat class for both coding and non-coding regions.
# function to apply bed_projection to groups projection_stats <- function(x, y, genome, group_var, type = NA) { group_by(x, !!rlang::sym(group_var)) |> do( n_repeats = nrow(.), projection = bed_projection(., y, genome) ) |> mutate(type = type) } pvals_coding <- projection_stats(rpts, promoters_coding, genome, "name", "coding") pvals_ncoding <- projection_stats(rpts, promoters_ncoding, genome, "name", "non_coding") pvals <- bind_rows(pvals_ncoding, pvals_coding) |> ungroup() |> tidyr::unnest(cols = c(n_repeats, projection)) |> select(-chrom) # filter for repeat classes with at least 10 intervals pvals <- filter( pvals, n_repeats > 10, obs_exp_ratio != 0 ) # adjust pvalues pvals <- mutate(pvals, q.value = p.adjust(p.value)) pvals
The projection test is a two-tailed statistical test. A significant p-value indicates either enrichment or depletion of query intervals compared to the reference interval sets. A value of lower_tail = TRUE
column indicates that the query intervals are depleted, whereas lower_tail = FALSE
indicates that the query intervals are enriched.
library(DT) # find and show top 5 most significant repeats signif_tests <- pvals |> arrange(q.value) |> group_by(type) |> top_n(-5, q.value) |> arrange(type) DT::datatable(signif_tests)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.