Find overlaps between mutations and a genomic region.

Share:

Description

Function finds the number of mutations that reside in genomic region and takes surveyed area of genome into account.

Usage

1
genomic_distribution(vcf_list, surveyed_list, region_list)

Arguments

vcf_list

A list with VCF GRanges objects

surveyed_list

A list with Granges of regions of the genome that have been surveyed (e.g. determined using GATK CallableLoci)

region_list

List with GRanges objects containing locations of genomic regions

Value

A data.frame containing the number observed and number of expected mutations in each genomic region.

See Also

read_vcfs_as_granges

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
## See the 'read_vcfs_as_granges()' example for how we obtained the
## following data:
vcfs <- readRDS(system.file("states/read_vcfs_as_granges_output.rds",
                package="MutationalPatterns"))

## Exclude mitochondrial and allosomal chromosomes.
autosomal = extractSeqlevelsByGroup(species="Homo_sapiens",
                                    style="UCSC",
                                    group="auto")

vcfs <- lapply(vcfs, function(x) keepSeqlevels(x, autosomal))

## Use biomaRt to obtain data.
## We can query the BioMart database, but this may take a long time
## though, so we take some shortcuts by loading the results from our
## examples.  The corresponding code for downloading data can be
## found above the command we run.

# mart="ensemble"
# library(biomaRt)

# regulatory <- useEnsembl(biomart="regulation",
#                          dataset="hsapiens_regulatory_feature",
#                          GRCh = 37)
regulatory <- readRDS(system.file("states/regulatory_data.rds",
                                    package="MutationalPatterns"))

## Download the regulatory CTCF binding sites and convert them to
## a GRanges object.
# CTCF <- getBM(attributes = c('chromosome_name',
#                             'chromosome_start',
#                             'chromosome_end',
#                             'feature_type_name',
#                             'cell_type_name'),
#              filters = "regulatory_feature_type_name", 
#              values = "CTCF Binding Site", 
#              mart = regulatory)
#
# CTCF_g <- reduce(GRanges(CTCF$chromosome_name,
#                 IRanges(CTCF$chromosome_start,
#                 CTCF$chromosome_end)))

CTCF_g <- readRDS(system.file("states/CTCF_g_data.rds",
                    package="MutationalPatterns"))

## Download the promoter regions and conver them to a GRanges object.
# promoter = getBM(attributes = c('chromosome_name', 'chromosome_start',
#                                 'chromosome_end', 'feature_type_name'),
#                  filters = "regulatory_feature_type_name", 
#                  values = "Promoter", 
#                  mart = regulatory)
# promoter_g = reduce(GRanges(promoter$chromosome_name,
#                     IRanges(promoter$chromosome_start,
#                             promoter$chromosome_end)))

promoter_g <- readRDS(system.file("states/promoter_g_data.rds",
                        package="MutationalPatterns"))

# open = getBM(attributes = c('chromosome_name', 'chromosome_start',
#                             'chromosome_end', 'feature_type_name'),
#               filters = "regulatory_feature_type_name",
#               values = "Open chromatin",
#               mart = regulatory)
# open_g = reduce(GRanges(open$chromosome_name,
#                 IRanges(open$chromosome_start,
#                         open$chromosome_end)))

open_g <- readRDS(system.file("states/open_g_data.rds",
                    package="MutationalPatterns"))

# flanking = getBM(attributes = c('chromosome_name',
#                                 'chromosome_start',
#                                 'chromosome_end',
#                                 'feature_type_name'),
#                  filters = "regulatory_feature_type_name", 
#                  values = "Promoter Flanking Region", 
#                  mart = regulatory)
# flanking_g = reduce(GRanges(
#                        flanking$chromosome_name,
#                        IRanges(flanking$chromosome_start,
#                        flanking$chromosome_end)))

flanking_g <- readRDS(system.file("states/promoter_flanking_g_data.rds",
                                    package="MutationalPatterns"))

# TF_binding = getBM(attributes = c('chromosome_name', 'chromosome_start',
#                                   'chromosome_end', 'feature_type_name'),
#                      filters = "regulatory_feature_type_name",
#                      values = "TF binding site",
#                      mart = regulatory)
# TF_binding_g = reduce(GRanges(TF_binding$chromosome_name,
#                               IRanges(TF_binding$chromosome_start,
#                               TF_binding$chromosome_end)))

TF_binding_g <- readRDS(system.file("states/TF_binding_g_data.rds",
                                    package="MutationalPatterns"))

regions <- GRangesList(promoter_g, flanking_g, CTCF_g, open_g, TF_binding_g)

names(regions) <- c("Promoter", "Promoter flanking", "CTCF",
                    "Open chromatin", "TF binding")

# Use a naming standard consistently.
seqlevelsStyle(regions) <- "UCSC"

## Get the filename with surveyed/callable regions
surveyed_file <- list.files(system.file("extdata",
                            package="MutationalPatterns"),
                            pattern = ".bed",
                            full.names = TRUE)

## Import the file using rtracklayer and use the UCSC naming standard
library(rtracklayer)
surveyed <- import(surveyed_file)
seqlevelsStyle(surveyed) <- "UCSC"

## For this example we use the same surveyed file for each sample.
surveyed_list <- rep(list(surveyed), 9)

## Calculate the number of observed and expected number of mutations in
## each genomic regions for each sample.
distr <- genomic_distribution(vcfs, surveyed_list, regions)

Want to suggest features or report bugs for rdrr.io? Use the GitHub issue tracker.