# When working, also examine
# using cudaMallocPitch()
# and passing the two matrices separately to the euclidean_kernel routine
# Load RCUDA and obtain the compiled kernel module.
library(RCUDA)

# Prefer the prebuilt PTX shipped with the package; fall back to compiling
# the .cu source on the fly with nvcc().
kfile <- system.file("sampleKernels", "distance_gputools.ptx", package = "RCUDA")
if (!file.exists(kfile)) {
  kfile <- nvcc(system.file("sampleKernels", "distance_gputools.cu", package = "RCUDA"),
                "distance_gputools.ptx")
}
mod <- loadModule(kfile)

# Simulate two matrices sharing the same column count and stack them row-wise.
N <- c(A = 1e2L, B = 499L)  # row counts for the two matrices
p <- 200L                   # shared number of columns
A <- matrix(rnorm(N["A"] * p), N["A"], p)
B <- matrix(rnorm(N["B"] * p), N["B"], p)
AB <- rbind(A, B)
# Compute the pairwise Euclidean distance matrix for the rows of AB on the
# GPU via the euclidean_kernel_same kernel.
#
# AB        numeric matrix; distances are computed between its rows
# mod       loaded CUDA module providing euclidean_kernel_same
# blockSize threads per block passed as blockDim to the kernel launch
# .async    if TRUE, launch asynchronously and return the .gpu() result
#           object; fetch the values later (e.g. o$ans[]) after a
#           device synchronization
# ...       additional arguments forwarded to .gpu()
#
# Returns an nrow(AB) x nrow(AB) matrix of distances, or the asynchronous
# result object when .async = TRUE.
gdist.same <-
function(AB, mod, blockSize = 32L, .async = FALSE, ...)
{
    # The kernel receives the transposed data; output slot 8 ("ans")
    # collects the flattened nrow(AB)^2 distance values.  The positional
    # argument order below matches the kernel's signature — do not reorder.
    out <- .gpu(mod$euclidean_kernel_same,
                t(AB), ncol(AB), nrow(AB),
                NULL, 0L, 0L,
                ncol(AB), ans = numeric(nrow(AB)^2), nrow(AB), 2.0,
                outputs = 8L, gridDim = c(nrow(AB), nrow(AB)),
                blockDim = blockSize, .async = .async, ...)
    if (!.async) {
        matrix(out, nrow(AB), nrow(AB))
    } else {
        out
    }
}
# Warm up: the first launch pays one-time context/module initialization costs,
# so exclude it from the timings below.
invisible(gdist.same(matrix(rnorm(100), 20, 5), mod))

# Number of CPU work repetitions.  Renamed from B to avoid silently
# clobbering the matrix B created earlier in this script.
n_reps <- 100

# Parallel version: launch the GPU kernel asynchronously, do unrelated CPU
# work while it runs, then synchronize and collect the answer.
tm.parallel <- system.time({
  o <- gdist.same(AB, mod, .async = TRUE)
  # CPU-side work that overlaps with the GPU computation.
  replicate(n_reps, prod(rnorm(1e6)))
  cudaDeviceSynchronize()
  distances <- matrix(o$ans[], nrow(AB), nrow(AB))
})

# Serial version: do the CPU work first, then make a blocking GPU call.
tm.serial <- system.time({
  replicate(n_reps, prod(rnorm(1e6)))
  gdist.same(AB, mod)
})

print(tm.parallel)
print(tm.serial)
# (Scraped documentation-page footer, not R code — kept as a comment so the
# file parses:)
# "Add the following code to your website.
#  For more information on customizing the embed code, read Embedding Snippets."