knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = "center"
)

Overview

valr includes several functions for exploring statistical relationships between sets of intervals.

In this vignette we explore the relationship between transcription start sites and repetitive elements in the human genome.

library(valr)
library(dplyr)
library(ggplot2)
library(cowplot)
library(tidyr)

# load repeats and genes. Data in the valr package is restricted to chr22; the entire
# files can be downloaded from UCSC.
rpts <- read_bed(valr_example("hg19.rmsk.chr22.bed.gz"))
genes <- read_bed12(valr_example("hg19.refGene.chr22.bed.gz"))

# load chrom sizes
genome <- read_genome(valr_example("hg19.chrom.sizes.gz"))

# create 1 bp intervals representing transcription start sites
tss <- create_tss(genes)

tss

Distance metrics

First we define a function that takes x and y intervals and computes distance statistics (using bed_reldist() and bed_absdist()) for specified groups. The value of each statistic is assigned to a .value column.

distance_stats <- function(x, y, genome, group_var, type = NA) {
  group_by(x, !!rlang::sym(group_var)) |>
    do(
      reldist = bed_reldist(., y, detail = TRUE) |>
        select(.value = .reldist),
      absdist = bed_absdist(., y, genome) |>
        select(.value = .absdist)
    ) |>
    tidyr::pivot_longer(
      cols = -name,
      names_to = "stat",
      values_to = "value"
    ) |>
    mutate(type = type)
}

We use the distance_stats() function to apply the bed_absdist() function to each group of data.

obs_stats <- distance_stats(rpts, tss, genome, "name", "obs")
obs_stats

And the same is done for a set of shuffled group of data. bed_shuffle() is used to shuffle coordinates of the repeats within each chromosome (i.e., the coordinates change, but the chromosome stays the same.)

shfs <- bed_shuffle(rpts, genome, within = TRUE)
shf_stats <- distance_stats(shfs, tss, genome, "name", "shuf")

Now we can bind the observed and shuffled data together, and do some tidying to put the data into a format appropriate for a statistical test. This involves:

  1. unnest()ing the data frames
  2. creating groups for each repeat (name), stat (reldist or absdist) and type (obs or shf)
  3. adding unique surrogate row numbers for each group
  4. using tidyr::pivot_wider() to create two new obs and shuf columns
  5. removing rows with NA values.
res <- bind_rows(obs_stats, shf_stats) |>
  tidyr::unnest(value) |>
  group_by(name, stat, type) |>
  mutate(.id = row_number()) |>
  tidyr::pivot_wider(
    names_from = "type",
    values_from = ".value"
  ) |>
  na.omit()

res

Now that the data are formatted, we can use the non-parametric ks.test() to determine whether there are significant differences between the observed and shuffled data for each group. broom::tidy() is used to reformat the results of each test into a tibble, and the results of each test are pivoted to into a type column for each test type.

library(broom)

pvals <- res |>
  do(
    twosided = tidy(ks.test(.$obs, .$shuf)),
    less = tidy(ks.test(.$obs, .$shuf, alternative = "less")),
    greater = tidy(ks.test(.$obs, .$shuf, alternative = "greater"))
  ) |>
  tidyr::pivot_longer(cols = -c(name, stat), names_to = "alt", values_to = "type") |>
  unnest(type) |>
  select(name:p.value) |>
  arrange(p.value)

Histgrams of the different stats help visualize the distribution of p.values.

ggplot(pvals, aes(p.value)) +
  geom_histogram(binwidth = 0.05) +
  facet_grid(stat ~ alt) +
  theme_cowplot()

We can also assess false discovery rates (q.values) using p.adjust().

pvals <-
  group_by(pvals, stat, alt) |>
  mutate(q.value = p.adjust(p.value)) |>
  ungroup() |>
  arrange(q.value)

Finally we can visualize these results using stat_ecdf().

res_gather <- tidyr::pivot_longer(res,
  cols = -c(name, stat, .id),
  names_to = "type",
  values_to = "value"
)

signif <- head(pvals, 5)

res_signif <-
  signif |>
  left_join(res_gather, by = c("name", "stat"))

ggplot(res_signif, aes(x = value, color = type)) +
  stat_ecdf() +
  facet_grid(stat ~ name) +
  theme_cowplot() +
  scale_x_log10() +
  scale_color_brewer(palette = "Set1")

Projection test

bed_projection() is a statistical approach to assess the relationship between two intervals based on the binomial distribution. Here, we examine the distribution of repetitive elements within the promoters of coding or non-coding genes.

First we'll extract 5 kb regions upstream of the transcription start sites to represent the promoter regions for coding and non-coding genes.

# create intervals 5kb upstream of tss representing promoters
promoters <-
  bed_flank(genes, genome, left = 5000, strand = TRUE) |>
  mutate(name = ifelse(grepl("NR_", name), "non-coding", "coding")) |>
  select(chrom:strand)

# select coding and non-coding promoters
promoters_coding <- filter(promoters, name == "coding")
promoters_ncoding <- filter(promoters, name == "non-coding")

promoters_coding

promoters_ncoding

Next we'll apply the bed_projection() test for each repeat class for both coding and non-coding regions.

# function to apply bed_projection to groups
projection_stats <- function(x, y, genome, group_var, type = NA) {
  group_by(x, !!rlang::sym(group_var)) |>
    do(
      n_repeats = nrow(.),
      projection = bed_projection(., y, genome)
    ) |>
    mutate(type = type)
}

pvals_coding <- projection_stats(rpts, promoters_coding, genome, "name", "coding")
pvals_ncoding <- projection_stats(rpts, promoters_ncoding, genome, "name", "non_coding")

pvals <-
  bind_rows(pvals_ncoding, pvals_coding) |>
  ungroup() |>
  tidyr::unnest(cols = c(n_repeats, projection)) |>
  select(-chrom)

# filter for repeat classes with at least 10 intervals
pvals <- filter(
  pvals,
  n_repeats > 10,
  obs_exp_ratio != 0
)

# adjust pvalues
pvals <- mutate(pvals, q.value = p.adjust(p.value))

pvals

The projection test is a two-tailed statistical test. A significant p-value indicates either enrichment or depletion of query intervals compared to the reference interval sets. A value of lower_tail = TRUE column indicates that the query intervals are depleted, whereas lower_tail = FALSE indicates that the query intervals are enriched.

library(DT)

# find and show top 5 most significant repeats
signif_tests <-
  pvals |>
  arrange(q.value) |>
  group_by(type) |>
  top_n(-5, q.value) |>
  arrange(type)

DT::datatable(signif_tests)


rnabioco/valr documentation built on April 25, 2024, 11:22 a.m.