R/Rcpp_interface.R

Defines functions binary_search extract_sites rowWhichMax dist_by_closeness

Documented in extract_sites

# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

binary_search = function(breaks, search, left_index = TRUE) {
    .Call('epik_binary_search', PACKAGE = 'epik', breaks, search, left_index) + 1
}

# == title
# Extract subset of sites in a set of intervals
#
# == param
# -start      start positions, a single integer or a numeric vector
# -end        end positions, a single integer or a numeric vector
# -site       positions of all sites, a numeric vector, should be sorted increasingly.
# -return_index   whether return the index in the position vector or just the position itself?
# -min_sites  minimal number of sites in an interval, regions which contain sites less than this value will be filtered out.
#
# == details
# Providing a huge vector of genomic positions, we want to extract subset of positions which
# locate in a specific group of regions (e.g. extract CpG sites in DMRs). The simplest way is to use:
#
#   site = sort(sample(10000000, 1000000))
#   start = 123456
#   end = 654321
#   subsite = site[site >= start & site <= end]
#
# Unfortunately, in above code, the whole vector ``site`` will be scanned four times
# (``>=``, ``<=``, ``&`` and ``[``).
# If you want to look for sites in more than one regions (e.g. 1000 regions), in every
# loop, the whole ``site`` vector will be re-scanned again and again which is very time-consuming.
#
# Here we have `extract_sites` function which uses binary search to do subsetting.
# Of course, ``site`` should be sorted non-decreasing beforehand.
#
#   subsite = extract_sites(start, end, site, index = FALSE)
#
# Not only for single interval, you can also extract sites in multiple genomic regins,
# by setting ``start`` and ``end`` as vectors. E.g. extracting CpG sites in exons in every gene.
#
#   start = c(123456, 234567, 345678)
#   end = c(133456, 244567, 355678)
#   subsite = extract_sites(start, end, site)
#
# You can choose to return index only or positions.
#
#   subsite = extract_sites(start, end, site, return_index = FALSE)
#   head(subsite)
#   subsite_index = extract_sites(start, end, site, return_index = TRUE)
#   head(subsite_index)
#   head(site[subsite_index])
#
# Regions that include sites less than ``min_site`` will be filtered out.
#
# This kind of overlapping can also be done by defining an `IRanges::IRanges` or `GenomicRanges::GRanges`
# object and calling `GenomicRanges::findOverlaps` function. Following compares the running time for using `IRanges::findOverlaps`
# and `extract_sites`:
#
#   set.seed(123)
#   site = sort(sample(1000, 100))
#   pos = do.call("rbind", lapply(1:10, function(i) sort(sample(max(site), 2))))
#   pos_ir = IRanges(pos[, 1], pos[, 2])
#   site_ir = IRanges(site, site)
#   library(microbenchmark)
#   microbenchmark(f1 = {r1 = extract_sites(pos[, 1], pos[, 2], site)},
#       f2 = {
#       mtch = findOverlaps(pos_ir, site_ir)
#       r2 = start(site_ir[unique(subjectHits(mtch))])
#   })
#   # Unit: microseconds
#   #  expr      min        lq       mean    median       uq      max neval cld
#   #    f1    6.052    8.3925   13.99696   15.5855   18.132   29.366   100  a
#   #    f2 1105.678 1175.9220 1237.87162 1192.6070 1271.972 2054.595   100   b
#
# == value
# A vector of positions or index.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# site = sort(sample(1000, 100))
# pos = do.call("rbind", lapply(1:10, function(i) sort(sample(max(site), 2))))
# extract_sites(pos[, 1], pos[, 2], site)
extract_sites = function(start, end, site, return_index = FALSE, min_sites = 0) {
    .Call('epik_extract_sites', PACKAGE = 'epik', start, end, site, return_index, min_sites)
}

rowWhichMax = function(m) {
    .Call('epik_rowWhichMax', PACKAGE = 'epik', m)
}

dist_by_closeness <- function(mat) {
    as.dist(.Call('epik_dist_by_closeness', PACKAGE = 'epik', mat))
}
jokergoo/epik documentation built on Sept. 28, 2019, 9:20 a.m.