Introduction

This vignette gives some indication of how kd build and query times vary with the length of the query, target input data. These are based on real world data for Drosophila neurons (see Costa et al 2016). Since the source data are large (>16000 neurons) we have prepared a smaller random sample.

# control whether to run - this vignette takes a long time and
# downloads some large (50 Mb) sample data.
runv=nzchar(Sys.getenv("RUN_RANN2_VIGNETTES"))
knitr::opts_chunk$set(eval=runv)

Load packages

library(knitr)
rgl::setupKnitr()
library(microbenchmark)
library(RANN2)
library(plyr)
library(ggplot2)
library(rgl)

This is how we prepared the sample data:

tf=tempfile(fileext = 'dpscanon.rds')
download.file("http://virtualflybrain.org/data/VFB/nblast/flycircuit/dpscanon.rds", destfile = tf)
dps=readRDS(tf)
set.seed(42)
dps500=sample(dps,500)
saveRDS(dps500, file='dps500.rds')
unlink(tf)

OK now we're going to use that canned sample. It has been made available by a github release. This allows it to be downloaded directly (rather than bloating the git repository).

tf2=tempfile(fileext = 'dps500.rds')
download.file("https://github.com/jefferis/RANN2/releases/download/v0.12pre/dps500.rds", destfile = tf2)
dps500=readRDS(tf2)
unlink(tf2)

Here we define a utility function borrowed from the nat.nblast package:

neuron_pairs <- function (query, target, n = NA, ignoreSelf = TRUE) 
{
    if (!is.character(query)) 
        query = names(query)
    if (missing(target)) 
        target = query
    else if (!is.character(target)) 
        target = names(target)
    if (is.na(n)) {
        rval = expand.grid(query = query, target = target, stringsAsFactors = FALSE, 
            KEEP.OUT.ATTRS = FALSE)
        if (ignoreSelf) 
            rval <- rval[rval$target != rval$query, ]
        return(rval)
    }
    remove_self = ignoreSelf && any(query %in% target)
    q = sample(query, n, replace = TRUE)
    t = sapply(q, function(z) sample(if (remove_self) 
        setdiff(target, z)
    else target, 1))
    data.frame(query = q, target = t, stringsAsFactors = F)
}

Choosee

np=neuron_pairs(dps500, n = 500)
sl=sapply(dps500, function(x) nrow(x$points))
np$query.len=sl[np$query]
np$target.len=sl[np$target]

Let's investigate the build time for the kdtree of those neurons

times=llply(np$target, function(n)
  microbenchmark(nn2(dps500[[n]]$points,matrix(0,ncol=3,nrow=1), k=1), times = 20))
np$target.time=sapply(times,function(x) median(x$time)/1e6)
library(ggplot2)
ggplot(np, aes(target.len,target.time)) +
  scale_x_log10() + 
  scale_y_log10('Tree build time /ms') +
  geom_point() + 
  geom_smooth()
l=lm(target.time~target.len, data=np)
l$coefficients[2]

So in conclusion, ~ 0.5 ms / 1000 points to build.

For queries, total time will include build time, + query time depending O(n) on number of query points and O(n log n) on number of target points.

times2=mlply(np, function(query, target, ...)
  microbenchmark(nn2(dps500[[target]]$points,dps500[[query]]$points, k=1), times = 20))
np$search.time=sapply(times2,function(x) median(x$time)/1e6)
with(np, plot3d(log2(query.len), log2(target.len), log2(search.time)))
set.seed(421)
dps20=sample(dps500, 20)
times3=llply(dps20, function(query, ...)
  microbenchmark(nn2(dps20[[1]]$points,query$points, k=1), times = 20))
df=data.frame(query=names(dps20),target= names(dps20)[1], stringsAsFactors = F)
df$search.time=sapply(times3,function(x) median(x$time)/1e6)
df$query.len=sl[df$query]
ggplot(data=df, aes(query.len, search.time)) +
  scale_y_continuous('Query time /ms') +
  geom_point() + 
  geom_smooth(method = 'lm')
l=lm(search.time~query.len, data=df)

Slope is r l$coefficients[2].

np40=neuron_pairs(dps20[1:2],dps20)
times4=mlply(np40, function(target, query, ...)
  microbenchmark(nn2(dps20[[target]]$points, dps20[[query]]$points, k=1), times = 20))

np40$search.time=sapply(times4,function(x) median(x$time)/1e6)
np40$target.len=sl[np40$target]
np40$query.len=sl[np40$query]
ggplot(np40, aes(target.len, search.time)) +
  scale_y_continuous('Query time /ms') +
  geom_point() +
  geom_smooth(method = 'lm')
l=lm(search.time~target.len, data=np40)

Slope is r l$coefficients[2].



jefferis/RANN2 documentation built on Feb. 27, 2024, 4:42 p.m.