extractRegionProperty-methods: Method extractRegionProperty

Description Usage Arguments Details Value Author(s) See Also Examples

Description

Methods for annotating the query GenomicRanges with various properties on the subject regions, and the subject regions could be either GenomicRanges or GRangesList objects.

The extractRegionProperty function extract the annotations on each query ranges from the properties of their overlapping/nearest subject regions.

Other functions are specific implementations of extractRegionProperty, and their purpose is to summarize the genomic features frequently involved in functional genomics analysis.

extractRegionOverlap computes the overlapping of each query ranges with the subject regions. Likewise, the extractRegionLength, extractRegionLetterFrequency, and extractRegionScores functions compute the Lengths, Sequence Contents, and GenomicScores of the overlapped subjects.

extractRegionYCount and extractRegionNearestDistToY quantify the density/distance of an annotation (Y) over the subject regions, the resulting properties can capture the clustering effect between the subject region and the annotation object.

The functions extractDistToRegion5end & extractDistToRegion3end retrieve the distance between the query and ends of the subject, and the extractRegionRelativePosition function extract the relative position of the query in the matched regions. Specifically, the relative position is calculated by dividing the distance of the query to the 5'-end of the region by the length of the region.

Usage

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
extractRegionProperty(
  x,
  region,
  property,
  ambiguityMethod = c("auto", "mean", "sum", "min", "max"),
  maxgap = 0L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "FALSE", "nearest"),
  ignore.strand = FALSE
)

extractRegionOverlap(
  x,
  region = NULL,
  ambiguityMethod = c("auto", "mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  ignore.strand = FALSE,
  output.logical = TRUE
)

extractRegionLength(
  x,
  region = NULL,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  ignore.strand = FALSE
)

extractRegionLetterFrequency(
  x,
  sequence,
  region = NULL,
  letters = "GC",
  as.prob = TRUE,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  ignore.strand = FALSE,
  efficient = TRUE,
  ...
)

extractRegionScores(
  x,
  gscores,
  region = NULL,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  missingScores = c("zero", "mean", "none"),
  ignore.strand = FALSE,
  efficient = TRUE,
  ...
)

extractRegionYCount(
  x,
  y,
  region = NULL,
  normalize = FALSE,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  ignore.strand = FALSE,
  efficient = TRUE
)

extractRegionNearestDistToY(
  x,
  y,
  region = NULL,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  maxDist = 3e+06,
  ignore.strand = FALSE
)

extractRegionRelativePosition(
  x,
  region = NULL,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  nomapValue = c("NA", "0"),
  ignore.strand = FALSE
)

extractDistToRegion5end(
  x,
  region = NULL,
  ignore.strand = FALSE,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest")
)

extractDistToRegion3end(
  x,
  region = NULL,
  ignore.strand = FALSE,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest")
)

## S4 method for signature 'GRanges'
extractRegionProperty(
  x,
  region,
  property,
  ambiguityMethod = c("auto", "mean", "sum", "min", "max"),
  maxgap = 0L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "FALSE", "nearest"),
  ignore.strand = FALSE
)

## S4 method for signature 'GRanges'
extractRegionOverlap(
  x,
  region = NULL,
  ambiguityMethod = c("auto", "mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  ignore.strand = FALSE,
  output.logical = TRUE
)

## S4 method for signature 'GRanges'
extractRegionLength(
  x,
  region = NULL,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  ignore.strand = FALSE
)

## S4 method for signature 'GRanges'
extractRegionLetterFrequency(
  x,
  sequence,
  region = NULL,
  letters = "GC",
  as.prob = TRUE,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  ignore.strand = FALSE,
  efficient = TRUE,
  ...
)

## S4 method for signature 'GRanges'
extractRegionScores(
  x,
  gscores,
  region = NULL,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  missingScores = c("zero", "mean", "none"),
  ignore.strand = FALSE,
  efficient = TRUE,
  ...
)

## S4 method for signature 'GRanges'
extractRegionYCount(
  x,
  y,
  region = NULL,
  normalize = FALSE,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  ignore.strand = FALSE,
  efficient = TRUE
)

## S4 method for signature 'GRanges'
extractRegionNearestDistToY(
  x,
  y,
  region = NULL,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest"),
  maxDist = 3e+06,
  ignore.strand = FALSE
)

## S4 method for signature 'GRanges'
extractRegionRelativePosition(
  x,
  region = NULL,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  nomapValue = c("NA", "0"),
  ignore.strand = FALSE
)

## S4 method for signature 'GRanges'
extractDistToRegion5end(
  x,
  region = NULL,
  ignore.strand = FALSE,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest")
)

## S4 method for signature 'GRanges'
extractDistToRegion3end(
  x,
  region = NULL,
  ignore.strand = FALSE,
  ambiguityMethod = c("mean", "sum", "min", "max"),
  maxgap = -1L,
  minoverlap = 0L,
  type = c("any", "start", "end", "within", "equal"),
  nomapValue = c("NA", "0", "nearest")
)

Arguments

x

A GRanges object for the query.

region

A GRanges or GRangesList object for the subject. If not provided (region=NULL), the returned properties will be calculated directly from x.

property

A vector specifying the properties/attributes of the region.

ambiguityMethod

By default, for logical property input, if x overlaps multiple regions, as long as any of the properties is TRUE, the returned value will be TRUE. For other types of property input, the returned property will be the average of the properties over multiple overlapped regions. If ambiguityMethod is "mean", "sum", "min", or "max", then the mean, sum, minimum, and maximum values of the >1 mapping will be returned in the output value.

maxgap, minoverlap, type

See ?findOverlaps in the IRanges package for a description of these arguments.

nomapValue

When nomapValue is "NA", "0", or "FALSE", the x that do not match the region will return NA, 0, and FALSE respectively. If nomapValue is "nearest", the not matched x will be set to be the property value on its nearest region.

ignore.strand

When set to TRUE, the strand information is ignored in the overlap calculations.

output.logical

If TRUE then the returned value is logical, otherwise numeric.

sequence

A BSgenome or XStringSet object. See the available.genomes function for how to install a genome.

as.prob, letters

See ?letterFrequency in the Biostrings package for a description of these arguments.

efficient

TRUE if only internally extract the properties on regions overlap with x, the option makes the computation more efficient if the number of gnomic regions is much larger than the query.

...

Additional arguments, it passes to letterFrequency in method extractRegionLetterFrequency, and it passes to score in method extractRegionScores.

gscores

A GScores object. See the vignette for more details: vignette("GenomicScores", package = "GenomicScores").

missingScores

Specifying the imputation methods on missing scores in extractRegionScores, default is returning zero for unknown gscores under the region.

y

A GRanges or GRangesList object for the clustering annotation.

normalize

If TRUE, then returned count in extractRegionYCount will be divided by the length of the region. The normalized count can be understood as the density of the annotation on the area.

maxDist

A value of the maximum nucleotide distance returned in extractRegionNearestDistToY, default: 3e+06.

Details

Please check the documentation of extractRegionProperty for details.

For specific extractor functions, when only x is provided with region = NULL, the functions directly use the query ranges to calculate the corresponding properties.

In case the region ranges are provided, the extractor functions first find the index where x overlaps with the region and then matches the property of the region to the query GRanges according to the overlapping index.

For nomapValue = "nearest", when ranges in x do not overlap with ranges in regions, the extractor functions will additionally match the unmapped x with its nearest region.

For extractRegionNearestDistToY, if ranges in y overlap the ranges in x, the overlapped ranges in y are skipped and the ranges precede or follow x are used to calculate the distances.

Value

A logical or a numeric vector with the same length as x.

For extractRegionProperty, the vector type depends on the type of property.

For extractRegionOverlap either logical when output.logical = TRUE or a numeric otherwise.

Author(s)

Zhen Wei

See Also

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
## ---------------------------------------------------------------------
## BASIC EXAMPLES
## ---------------------------------------------------------------------
library(GenomicRanges)

## Build the query GRanges object:
x_gr <- GRanges(rep(c("chr1", "chr2"), c(5, 15)),
                IRanges(c(sample(11874:12127, 5), sample(38814:41527,15)), width=100),
                strand=Rle(c("+", "-"), c(5, 15)))
x_gr    

## The region GRanges or GRangesList object:
exons_gr <- GRanges(c("chr1","chr2","chr2"),
                    IRanges(start=c(11874,38814,45440),end=c(12227,41627,46588)),
                     strand=c("+","-","-"))
genes_grl <- GRangesList(gene1=exons_gr[1],gene2=exons_gr[c(2,3)])        

## Extract lengths of the query:
extractRegionLength(x_gr)

## Extract length of the exon overlapped by the query:
extractRegionLength(x_gr, exons_gr)

## Extract exonic length of the genes overlapped by the query:
extractRegionLength(x_gr, genes_grl)

## Exract self defined property on exons overlapped by the query:
exons_property <- c(1,6,8)
extractRegionProperty(x_gr, exons_gr, exons_property)

## ---------------------------------------------------------------------
## MORE FEATURE EXTRACTORS
## ---------------------------------------------------------------------

## Quantifying clustering effect with annotation y:

## Self clustering
extractRegionYCount(x_gr, x_gr)

## Self clustering on overlapping exons/genes
extractRegionYCount(x_gr, x_gr, exons_gr)
extractRegionYCount(x_gr, x_gr, genes_grl)

## Clustering with another annotation:
y_gr <- GRanges(rep(c("chr1", "chr2"), c(50, 50)),
                IRanges(c(sample(11874:12127, 50), 
                          sample(38814:41527,50)), width=1),
                strand=Rle(c("+", "-"), c(50, 50)))

extractRegionYCount(x_gr, y_gr)
extractRegionYCount(x_gr, y_gr, exons_gr)
extractRegionYCount(x_gr, y_gr, genes_grl)
extractRegionNearestDistToY(x_gr, y_gr)

## Relative position on exons/genes:
extractRegionRelativePosition(x_gr, exons_gr)
extractRegionRelativePosition(x_gr, genes_grl)

## Distance to 5'end/3'end of exons/genes:
extractDistToRegion5end(x_gr, exons_gr)
extractDistToRegion5end(x_gr, genes_grl)
extractDistToRegion3end(x_gr, exons_gr)
extractDistToRegion3end(x_gr, genes_grl)

## Extract features that depends on sequence/annotation packages:

## Not run: 

library(BSgenome.Hsapiens.UCSC.hg19)
bsgenome <- BSgenome.Hsapiens.UCSC.hg19

## GC content of the query:
extractRegionLetterFrequency(x_gr, bsgenome, letters="GC")

## GC contents of exons/genes containing the query:
extractRegionLetterFrequency(x_gr, bsgenome, exons_gr, letters="GC")
extractRegionLetterFrequency(x_gr, bsgenome, genes_grl, letters="GC")

library(phastCons100way.UCSC.hg19)
gscores <- phastCons100way.UCSC.hg19

## PhastCons scores of the query:
extractRegionScores(x_gr, gscores)

## PhastCons scores of exons/genes containing the query:
extractRegionScores(x_gr, gscores, exons_gr)
extractRegionScores(x_gr, gscores, genes_grl)


## End(Not run)

## ---------------------------------------------------------------------
## VISUALIZE FEATURE IN PREDICTION MODEL
## ---------------------------------------------------------------------

## Load the GRanges of the m6A miCLIP dataset prepared for the classification model:
GSE63753_abcam <- readRDS(system.file("extdata", "GSE63753_abcam.rds", package = "WhistleR"))

## The metadata column of the GRanges is a 0/1 vector, 1 for the positive m6A site, 0 is the negative DRACH:
GSE63753_abcam
table(GSE63753_abcam$target)

## Extract exon length overlapped by the DRACH sites:
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

GSE63753_abcam$exon_length <- extractRegionLength(GSE63753_abcam, exons(TxDb.Hsapiens.UCSC.hg19.knownGene))

## Plot the logistic regression fit with cubic splines:
library(ggplot2)

ggplot(na.omit(as.data.frame(mcols(GSE63753_abcam))), aes(log(exon_length), target)) +
  geom_smooth(formula = y ~ splines::ns(x, 3), method = "glm", method.args = list(family = "binomial")) + 
  geom_hline(yintercept = 0.5, linetype = 3) + 
  scale_x_continuous(limits = c(4,9)) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1)) +
  theme_classic() + labs(x = "log(length of exon)", y = "prob of m6A = 1", title = "LR fit with cubic splines")

ZW-xjtlu/WhistleR documentation built on March 13, 2021, 10:50 a.m.