In jayhesselberth/valr: Genome Interval Arithmetic

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = "center"
)

Overview

valr includes several functions for exploring statistical relationships between sets of intervals.

Calculate significance of overlaps between sets of intervals with bed_fisher() and bed_projection().
Quantify relative and absolute distances between sets of intervals with bed_reldist() and bed_absdist().
Quantify extent of overlap between sets of intervals with bed_jaccard().

In this vignette we explore the relationship between transcription start sites and repetitive elements in the human genome.

library(valr)
library(dplyr)
library(ggplot2)
library(cowplot)
library(tidyr)

# load repeats and genes. Data in the valr package is restricted to chr22; the entire
# files can be downloaded from UCSC.
rpts <- read_bed(valr_example("hg19.rmsk.chr22.bed.gz"))
genes <- read_bed12(valr_example("hg19.refGene.chr22.bed.gz"))

# load chrom sizes
genome <- read_genome(valr_example("hg19.chrom.sizes.gz"))

# create 1 bp intervals representing transcription start sites
tss <- create_tss(genes)

tss

Distance metrics

First we define a function that takes x and y intervals and computes distance statistics (using bed_reldist() and bed_absdist()) for specified groups. The value of each statistic is assigned to a .value column.

distance_stats <- function(x, y, genome, group_var, type = NA) {
  group_by(x, !!rlang::sym(group_var)) |>
    do(
      reldist = bed_reldist(., y, detail = TRUE) |>
        select(.value = .reldist),
      absdist = bed_absdist(., y, genome) |>
        select(.value = .absdist)
    ) |>
    tidyr::pivot_longer(
      cols = -name,
      names_to = "stat",
      values_to = "value"
    ) |>
    mutate(type = type)
}

We use the distance_stats() function to apply the bed_absdist() function to each group of data.

obs_stats <- distance_stats(rpts, tss, genome, "name", "obs")
obs_stats

And the same is done for a set of shuffled group of data. bed_shuffle() is used to shuffle coordinates of the repeats within each chromosome (i.e., the coordinates change, but the chromosome stays the same.)

shfs <- bed_shuffle(rpts, genome, within = TRUE)
shf_stats <- distance_stats(shfs, tss, genome, "name", "shuf")

Now we can bind the observed and shuffled data together, and do some tidying to put the data into a format appropriate for a statistical test. This involves:

unnest()ing the data frames
creating groups for each repeat (name), stat (reldist or absdist) and type (obs or shf)
adding unique surrogate row numbers for each group
using tidyr::pivot_wider() to create two new obs and shuf columns
removing rows with NA values.

res <- bind_rows(obs_stats, shf_stats) |>
  tidyr::unnest(value) |>
  group_by(name, stat, type) |>
  mutate(.id = row_number()) |>
  tidyr::pivot_wider(
    names_from = "type",
    values_from = ".value"
  ) |>
  na.omit()

res

Now that the data are formatted, we can use the non-parametric ks.test() to determine whether there are significant differences between the observed and shuffled data for each group. broom::tidy() is used to reformat the results of each test into a tibble, and the results of each test are pivoted to into a type column for each test type.

library(broom)

pvals <- res |>
  do(
    twosided = tidy(ks.test(.$obs, .$shuf)),
    less = tidy(ks.test(.$obs, .$shuf, alternative = "less")),
    greater = tidy(ks.test(.$obs, .$shuf, alternative = "greater"))
  ) |>
  tidyr::pivot_longer(cols = -c(name, stat), names_to = "alt", values_to = "type") |>
  unnest(type) |>
  select(name:p.value) |>
  arrange(p.value)

Histgrams of the different stats help visualize the distribution of p.values.

ggplot(pvals, aes(p.value)) +
  geom_histogram(binwidth = 0.05) +
  facet_grid(stat ~ alt) +
  theme_cowplot()

We can also assess false discovery rates (q.values) using p.adjust().

pvals <-
  group_by(pvals, stat, alt) |>
  mutate(q.value = p.adjust(p.value)) |>
  ungroup() |>
  arrange(q.value)

Finally we can visualize these results using stat_ecdf().

res_gather <- tidyr::pivot_longer(res,
  cols = -c(name, stat, .id),
  names_to = "type",
  values_to = "value"
)

signif <- head(pvals, 5)

res_signif <-
  signif |>
  left_join(res_gather, by = c("name", "stat"))

ggplot(res_signif, aes(x = value, color = type)) +
  stat_ecdf() +
  facet_grid(stat ~ name) +
  theme_cowplot() +
  scale_x_log10() +
  scale_color_brewer(palette = "Set1")

Projection test

bed_projection() is a statistical approach to assess the relationship between two intervals based on the binomial distribution. Here, we examine the distribution of repetitive elements within the promoters of coding or non-coding genes.

First we'll extract 5 kb regions upstream of the transcription start sites to represent the promoter regions for coding and non-coding genes.

# create intervals 5kb upstream of tss representing promoters
promoters <-
  bed_flank(genes, genome, left = 5000, strand = TRUE) |>
  mutate(name = ifelse(grepl("NR_", name), "non-coding", "coding")) |>
  select(chrom:strand)

# select coding and non-coding promoters
promoters_coding <- filter(promoters, name == "coding")
promoters_ncoding <- filter(promoters, name == "non-coding")

promoters_coding

promoters_ncoding

Next we'll apply the bed_projection() test for each repeat class for both coding and non-coding regions.

# function to apply bed_projection to groups
projection_stats <- function(x, y, genome, group_var, type = NA) {
  group_by(x, !!rlang::sym(group_var)) |>
    do(
      n_repeats = nrow(.),
      projection = bed_projection(., y, genome)
    ) |>
    mutate(type = type)
}

pvals_coding <- projection_stats(rpts, promoters_coding, genome, "name", "coding")
pvals_ncoding <- projection_stats(rpts, promoters_ncoding, genome, "name", "non_coding")

pvals <-
  bind_rows(pvals_ncoding, pvals_coding) |>
  ungroup() |>
  tidyr::unnest(cols = c(n_repeats, projection)) |>
  select(-chrom)

# filter for repeat classes with at least 10 intervals
pvals <- filter(
  pvals,
  n_repeats > 10,
  obs_exp_ratio != 0
)

# adjust pvalues
pvals <- mutate(pvals, q.value = p.adjust(p.value))

pvals

The projection test is a two-tailed statistical test. A significant p-value indicates either enrichment or depletion of query intervals compared to the reference interval sets. A value of lower_tail = TRUE column indicates that the query intervals are depleted, whereas lower_tail = FALSE indicates that the query intervals are enriched.

library(DT)

# find and show top 5 most significant repeats
signif_tests <-
  pvals |>
  arrange(q.value) |>
  group_by(type) |>
  top_n(-5, q.value) |>
  arrange(type)

DT::datatable(signif_tests)