tests/rgamma.R

library(RCUDA)

fix_seed <- TRUE

cat("Setting cuGetContext(TRUE)...\n")
cuGetContext(TRUE)
cat("done. Profiling CUDA code...\n")

cat("Loading module...\n")
m <- loadModule(system.file("sampleKernels", "rgamma.ptx", package = "RCUDA"))
cat("done. Extracting kernels...\n")
k_setup <- m$setup_kernel
k_rgamma <- m$rgamma_kernel

cat("done. Setting up miscellaneous stuff...\n")
N <- as.integer(500000) # 1e6L fails on my mac... :/
n_states <- as.integer(5000) # (this is the number of *useful* threads to be launched)

# Gamma parameters:
a <- 10.0
b <- 2.0

# if...
# N = 1,000,000
# => 1954 blocks of 512 threads will suffice
# => (62 x 32) grid, (512 x 1 x 1) blocks

# Fix block dims:
threads_per_block <- 512L # 512L
block_dims <- c(threads_per_block, 1L, 1L)
grid_d1 <- floor(sqrt(n_states/threads_per_block))
grid_d2 <- ceiling(n_states/(grid_d1*threads_per_block))
grid_dims <- c(grid_d1, grid_d2, 1L)

cat("Grid size:\n")
print(grid_dims)
cat("Block size:\n")
print(block_dims)

nthreads <- prod(grid_dims)*prod(block_dims) 
cat("Total number of threads to launch = ",nthreads,"\n")
if (nthreads < n_states){
    stop("Grid is not large enough...!")
}

# Need to setup RNG (this doesn't work yet since
# getElementSize() isn't defined for curandState...
# In the meantime, on my Mac: 
# sizeof(curandState) = 48
cat("Allocating memory on device for curandStates...\n")
cu_rng_alloc_time <- system.time({
    rng_states <- cudaMalloc(elType = "curandState", numEls=n_states, sizeof=48L) 
})

if (fix_seed){
    set.seed(512312)
}
rng_seeds <- as.integer(198 + 6*seq(0,n_states,by=1)) # runif(n=n_states,min=1,max=1e7))
cu_rng_seeds <- copyToDevice(rng_seeds)

# Need to allocate space for results:
cat("done. Allocating space for results...\n")
x_double <- rep(0.0,N)
x_d_mem <- copyToDevice(x_double) 

# Initializing RNG's...
cat("Launching CUDA kernel for RNG setup...\n")
cu_init_time <- system.time({
    .cuda(k_setup, rng_states, cu_rng_seeds, n_states, inplace = TRUE, gridDim = grid_dims, blockDim = block_dims)
})

# Call RNGs...
cat("Launching rgamma CUDA kernel...\n")
cu_rgamma_time <- system.time({
    .cuda(k_rgamma, rng_states, n_states, x_d_mem, N, a, 1.0/b, inplace = TRUE, gridDim = grid_dims, blockDim = block_dims)
})

cat("Copying result back from device...\n")
cu_rgamma_copy_time <- system.time({
    cu_rgamma_x <- copyFromDevice(obj=x_d_mem,nels=x_d_mem@nels,type="float")
})

r_rgamma_time <- system.time({
    r_rgamma_x <- rgamma(n=N,a,b)
})

cat("Quantiles for CUDA:\n")
print(quantile(cu_rgamma_x))
cat("Quantiles for R:\n")
print(quantile(r_rgamma_x))

cat("Timings (N=",N,"):\n",sep="")
cat("CUDA init+fluff time = ",(cu_rng_alloc_time+cu_init_time+cu_rgamma_copy_time)[3],"\n",sep="")
cat("CUDA rgamma time     = ",cu_rgamma_time[3],"\n",sep="")
cat("rgamma time          = ",r_rgamma_time[3],"\n",sep="")
duncantl/RCUDA documentation built on May 15, 2019, 5:26 p.m.