R/extract_site.R

#  title
# Extract subset of sites which are in a set of intervals
#
# == param
# -start      start position, a vector
# -end        end position, a vector. Note there should be no overlap between all ``[start, end]``
#             (You may use `IRanges::reduce` to merge the overlapping intervals.)
# -site       positions of all sites, should be sorted.
# -index      whether return the index in the position vector or just the position itself?
# -filter_fun filter sites in the interval. If there are more than one intervals, sites
#             in each interval will be filtered by ``filter_fun``. Argument ``s`` contains positions of sites in every interval.
#             For example, you can filter intervals if number of sites in 
#             the interval is small (< 10), by ``filter_fun = function(s) length(s) >= 10``
#
# == details
# Providing a huge vector of genomic positions, we want to extract subset of sites which
# locate in a specific region (e.g. extract CpG sites in DMRs). Normally, we will use:
#
# 	site = sort(sample(10000000, 1000000))
# 	start = 123456
# 	end = 654321
# 	subsite = site[site >= start & site <= end]
#
# Unfortunately, in above code, the whole vector ``site`` will be scaned for four times
# (``>=``, ``<=``, ``&`` and ``[``).
# If you want to look for sites in more than one regions (e.g. 1000 regions), in every
# loop, the whole ``site`` vector will be re-scanned again and again which is very time-consuming.
#
# Here we have `extract_sites` function which uses binary search to do subsetting.
# Of course, ``site`` should be sorted non-decreasing beforehand.
#
# 	subsite = extract_sites(start, end, site, index = FALSE)
#
# Not only for single interval, you can also extract sites in a list of genomic regins,
# by setting ``start`` and ``end`` as a vector.
#
# 	start = c(123456, 234567, 345678)
# 	end = c(133456, 244567, 355678)
# 	subsite = extract_sites(start, end, site)
#
# ``filter_fun`` can filter sites in each interval by looking at the number of sites in it.
# Following code filters out intervals that have less than 10 sites.
#
# 	subsite = extract_sites(start, end, site, filter_fun = function(x) length(x) >= 10)
#
# You can choose to return index only or positions.
#
# 	subsite = extract_sites(start, end, site, index = FALSE)
# 	head(subsite)
# 	subsite_index = extract_sites(start, end, site, index = TRUE)
# 	head(subsite_index)
# 	head(site[subsite_index])
#
# Follows compare the normal subsetting and binary search.
#
# We first extract all the index that we want to test so that it would not affect the comparison.
# In following code, we explicitly transform ``site`` and ``pos`` to ``double`` mode because binary search
# is implemented in C-level, and it expects parameter stored in ``double`` mode.
#
# 	pos = do.call("rbind", lapply(1:1000, function(i) sort(sample(max(site), 2))))
# 	head(pos)
#
# 	t1 = system.time(for(i in 1:1000) {
# 		which(site >= pos[i, 1] & site <= pos[i, 2])
# 	})
# 	t1
#   #   user  system elapsed
#   # 29.878   2.904  33.432
#
# 	site2 = as.double(site)
# 	pos2 = as.double(pos)
# 	dim(pos2) = dim(pos)
# 	t2 = system.time(for(i in 1:1000) {
# 		extract_sites(pos2[i, 1], pos2[i, 2], site2)
# 	})
# 	t2
#   #  user  system elapsed
#   # 1.515   0.783   2.436
#
#
# == value
# a vector of positions or index. If there is no sites in the interval, it will return ``NULL``.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# extract_sites = function(start, end, site, index = TRUE, filter_fun = NULL) {

# 	i1 = binary_search(site, start, include_left = TRUE, include_right = FALSE)
# 	i2 = binary_search(site, end, include_left = FALSE, include_right = TRUE)

# 	l = !is.na(i1) & !is.na(l2)

# 	if(sum(l) == 0) {
# 		return(integer(0))
# 	}

# 	i1 = i1[l]
# 	i2 = i2[l]

# 	if(!is.null(filter_fun)) {
# 		sapply(seq_len(i1))
# 	}

# 	if(index) {
# 		unlist()
# 	}

# 	if(!is.double(site)) site = as.double(site)
# 	if(!is.double(start)) start = as.double(start)
# 	if(!is.double(end)) end = as.double(end)
	
# 	.extract = function(start, end) {
# 		n = length(site)
		
# 		# --------o------o---- site
# 		# --+--+--------------
# 		if(end < site[1]) return(NULL)
		
# 		# --------o------o---- site
# 		# -----------------+-+
# 		if(start > site[n]) return(NULL)
		
# 		# --------o------o---- site
# 		# -----+------------+-
# 		if(start <= site[1] & end >= site[n]) return(seq_along(site))
		
# 		if(start <= site[1]) {
# 			i1 = 1
# 		} else {
# 			i1 = .Internal(findInterval(site, start, FALSE, FALSE))
# 		}

# 		if(end >= site[n]) {
# 			i2 = n
# 		} else {
# 			i2 = .Internal(findInterval(site, end, FALSE, FALSE))
# 		}
		
# 		# --------o------o---- site
# 		# ----------+--+------
# 		if(i1 == i2) {
# 			if(site[i1] < start && site[i2+1] > end) return(NULL)
# 		}
		
# 		if(start > site[1] && site[i1] < start) i1 = i1 + 1
# 		return(i1:i2)
# 	}
	
# 	id = integer(0)
# 	if(is.null(filter_fun)) {
# 		for(i in seq_along(start)) {
# 			ind = .extract(start[i], end[i])
# 			id = c(id, ind)
# 		}
# 	} else {
# 		for(i in seq_along(start)) {
# 			ind = .extract(start[i], end[i])
# 			if(filter_fun(site[ind])) {
# 				id = c(id, ind)
# 			}
# 		}
# 	}
	
# 	if(!index) {
# 		return(as.integer(site[id]))
# 	} else {
# 		return(id)
# 	}
# }
eilslabs/epic documentation built on May 16, 2019, 1:24 a.m.