RCUDA: R Bindings for the CUDA Library for GPU Computing

setClass("size_t", contains = "numeric")
setClass("CUdevice", contains = "integer")

tmp = function(from)
        new("CUdevice", as.integer(from - 1L))
setAs("numeric", "CUdevice", tmp)
setAs("integer", "CUdevice", tmp)



setClass("CUmodule", contains = "RC++Reference")
setClass("CUfunction", contains = "RC++Reference")
setClass("CUcontext", contains = "RC++Reference")

setClass("CUdeviceptr", contains = "RC++Reference")

setClass("CUstream", contains = "RC++Reference")
setClass("cudastream_t", contains = "RC++Reference")

setClass("CUevent", contains = "RC++Reference")
setClass("cudaEvent_t", contains = "RC++Reference")


setClass("cudaPtr", contains = "RC++Reference")
setClass("cudaPtrWithLength", representation(nels = "integer", elSize = "integer", elTypeName = "character"), contains = "RC++Reference")
setClass("cudaFloatPtr", contains = "cudaPtr")
setClass("cudaIntPtr", contains = "cudaPtr")

setClass("cudaDoubleArray", contains= "cudaPtrWithLength")
setClass("cudaFloatArray", contains= "cudaPtrWithLength")
setClass("cudaIntArray", contains= "cudaPtrWithLength")


setClass("CUresult", contains = "cudaError")
CUresultValues = cudaErrorValues

# Should be the other way around, but not a big deal.
#setClass("cudaError", contains = "cudaError_t")

  # This is 0 based.  When converting from integer to CUDeviceNum, we do the subtraction for the user.
setClass("CUDeviceNum", contains = "integer")


tmp = function(from)   new("CUDeviceNum", as.integer(from - 1L))
setAs("numeric", "CUDeviceNum", tmp)
setAs("integer", "CUDeviceNum", tmp)

setMethod("[[", c("CUmodule", "character", "missing"),
           function(x, i, j, globalVar = FALSE, ...) {
              if(globalVar)
                cuModuleGetGlobal(x, i)
              else
                getFunction(x, i)
           })

setMethod("$", c("CUmodule"),
           function(x, name) {
              getFunction(x, name)
           })

getFunction =
function(module, name)
{
  ans = .Call("R_Module_getFunction", as(module, "CUmodule"),  as.character(name))
  if(is.integer(ans))  #  !is(ans, "RC++Reference"))
    raiseError(ans, msg = c("failed to get function ", name))

  ans
}

loadModule =
#
#
#
function(filename, ctx = cuGetContext(TRUE), 
         isCode = is(filename, "raw") || is(filename, "AsIs") || grepl(" __cudaparm", filename),
          ...)
{
  
  if(isCode) {
     force(ctx)
     return(cuModuleLoadDataEx(filename, ...))
  } 

  filename = path.expand(filename)
  if(!file.exists(filename))
    stop("no such file ", filename)

  force(ctx)
  
  ans = .Call("R_loadModule", as.character(filename))
  if(is.integer(ans))  #  !is(ans, "RC++Reference"))
    raiseError(ans, msg = c("failed to load module ", filename))
  else
    ans
}


raiseError =
function(status, msg = character(), ...)
{
 i =  match(status, CUresultValues)
 type = names(CUresultValues)[i]
 e = simpleError(paste(c(msg, names(status), "(", type, ")"), collapse = " "), ...)
 class(e) = c(type, class(e))
 stop(e)
}


createContext =
function(flags = 0L, device = 1L)
{
  ans = .Call("R_createContext", as(flags, "CUctx_flags"), as(as(device, "CUDeviceNum"), "integer"))
  if(is(ans, "CUresult") && ans != 0)  
     raiseError(ans, msg = c("failed to create context"))
   
  ans
}



.gpu = .cuda =
# should we have a .gpu argument to specify which GPU to use
# Problem is we have to get the context also. 
# User can do this directly and avoid overhead by doing it once.
function(fun, ..., .args = list(...), gridDim, blockDim,
         sharedMemBytes = 0L, stream = NULL, inplace = FALSE,
          outputs = logical(), .gc = TRUE, gridBy = NULL, 
          .async = !is.null(stream), .numericAsDouble = getOption("CUDA.useDouble", FALSE))
{
   if(.gc)
     gc()

#  if(!missing(.gpu)) {
#    ctxt = createContext( device = .gpu)
#    on.exit(cuCtxPopCurrent())
#  }

#   .args = list(...)   

   .numericAsDouble = as.logical(.numericAsDouble)

   if(length(.numericAsDouble) != length(.args))
      .numericAsDouble = rep(.numericAsDouble, length = length(.args))

   if(!missing(gridBy) && missing(gridDim)) {
#
# We could use the names of the arguments by 
# examining the call and determining their names
# This is probably overkill and expensive.
# So instead, have the caller specify the actual objects
#     call = substitute(gridBy)
#     if(is.call(call))
#        call = call[-1]
#     vars = sapply(call, function(x) if(is.name(x)) .args[[as.character(x)]]
#                                     else if(is.numeric(x)) .args[[x]])
                                    
     if(missing(blockDim))
       blockDim = getDeviceProperties(1L)@maxThreadsPerBlock

        # allow for a list of objects or a single vector
     lens = if(is.list(gridBy)) 
               sapply(gridBy, length)
            else if(length(gridBy) == 1)
                gridBy
            else
                length(gridBy)
     tmp = getGridSize(lens, blockDim)
     gridDim = tmp$grid
   }
   
   fun = as(fun, "CUfunction")

   if(length(gridDim) < 3)
      gridDim = c(gridDim, c(1L, 1L, 1L))[1:3]
   if(length(blockDim) < 3)
      blockDim = c(blockDim, c(1L, 1L, 1L))[1:3]
   
   mustCopy = sapply(.args, function(x) is.atomic(x) && length(x) > 1)
   if(any(mustCopy))
     .args[mustCopy] = mapply(function(obj, strict)
                                 copyToDevice(obj, strict = strict),
                             .args[mustCopy], .numericAsDouble[mustCopy]) #, SIMPLIFY = FALSE, USE.NAMES = FALSE)
   
   ans = .Call("R_cuLaunchKernel", fun, as.integer(gridDim), as.integer(blockDim), .args, 
                                   as.integer(sharedMemBytes), stream, .numericAsDouble)

   if(is(ans, "cudaError_t"))  #  !is(ans, "RC++Reference"))
      raiseError(ans, msg = c("failed to launch kernel"))

   if(.async) 
     return(.args[mustCopy])

  ans = cuCtxSynchronize() # cuStreamSynchronize(stream) 
  if(is(ans, "cudaError_t"))  #  !is(ans, "RC++Reference"))
      raiseError(ans, msg = c("failed to launch kernel"))

  if(!missing(outputs)) {
     if(length(outputs) == 0 || is.logical(outputs) && !any(outputs))
         return(NULL)

     ans = lapply(.args[outputs], function(x) if(is(x, "cudaPtrWithLength")) x[] else x)
     return(if(length(ans) == 1) ans[[1]] else ans)
   }
   
   if(any(mustCopy)) {
      ans = lapply(.args[mustCopy], `[`)
      if(sum(mustCopy) == 1)
        ans[[1]]
      else
        ans
   } else
     ans
}


getElementSize =
function(obj, type = typeof(obj), strict = inherits(obj, "AsIs"))
{
    if(!strict && type == "double")
       type = "float"

    i = match(type, names(CUDAStructSizes))
    if(!is.na(i)) 
      return(CUDAStructSizes[i])

   switch(type,
           logical=, integer= 4L,
           float=, double=, numeric = 8L,
           stop("don't know size of elements"))
}

cudaAlloc = cudaMalloc =
function(numEls, sizeof = 4L, elType = NA, strict = !missing(elType) || inherits(elType, "AsIs"))
{
  if(missing(sizeof) && !missing(elType)) 
     sizeof = getElementSize(type = elType, strict = strict)

  ans = .Call("R_cudaMalloc", as.numeric(as.numeric(numEls) * sizeof))
  if(is.integer(ans))  #  !is(ans, "RC++Reference"))
     raiseError(ans, msg = c("failed to create context"))

  k =  "cudaPtrWithLength"
  if(!is.na(elType)) {
    classType = if(elType == "double" && strict)
                  "Double"
                else if(elType %in% c("integer", "logical")) 
                  "Int" 
                else if(elType %in% c("float", "double", "numeric")) 
                  "Float" 
                else NA
    if(!is.na(classType))
      k = sprintf("cuda%sArray",  classType)
  }

  new(k, ref = ans@ref, nels = as.integer(numEls), elSize = as.integer(sizeof), elTypeName = as.character(elType))
}

copyToDevice =
function(obj, to = cudaMalloc(length(obj), elType = elType, strict = strict), 
          elType = typeof(obj), strict = !missing(elType))
{
  ans = .Call("R_manual_cudaMemcpy", obj, to, to@elSize)
  if(is(ans, "CUresult"))
    raiseError(ans, "copying data to GPU")

  to
}

setGeneric("copyFromDevice",
            function(obj, nels, type)
              standardGeneric("copyFromDevice"))

setMethod("copyFromDevice", "cudaFloatArray",
            function(obj, nels, type)
                copyFromDevice(obj@ref, obj@nels, "float"))

setMethod("copyFromDevice", "cudaDoubleArray",
            function(obj, nels, type)
                copyFromDevice(obj@ref, obj@nels, "double"))


setMethod("copyFromDevice", "cudaFloatArray",
            function(obj, nels, type)
                copyFromDevice(obj@ref, obj@nels, "float"))


setMethod("copyFromDevice", "cudaIntArray",
            function(obj, nels, type)
                copyFromDevice(obj@ref, obj@nels, "integer"))



setMethod("copyFromDevice", c("ANY"),
function(obj, nels, type)
{
  nels = as.integer(nels)
  ans = if(type == "integer")
          .Call("R_getCudaIntVector", obj, nels)
        else if(type == "logical")
          .Call("R_getCudaIntVector", obj, nels)
        else if(type == "float" || type == "numeric")
          .Call("R_getCudaFloatVector", obj, nels, NULL)
        else if(type == "double")
          .Call("R_getCudaDoubleVector", obj, nels, NULL)

  if(is(ans, "CUresult"))
      raiseError(ans, "copying data on device")

  ans
})

# Allow
#  p = cudaMalloc()
#  p[] = x
# to be shorthand for this.

setMethod("[", c("cudaFloatArray", "missing", "missing"),
           function(x, i, j, ...) {
             copyFromDevice(x, x@nels, type = "float")
           })

setMethod("[", c("cudaDoubleArray", "missing", "missing"),
           function(x, i, j, ...) {
             copyFromDevice(x, x@nels, type = "double")
           })

setMethod("[", c("cudaIntArray", "missing", "missing"),
           function(x, i, j, ...) {
             copyFromDevice(x, x@nels, type = "integer")
           })

# Called for integer or numeric
setMethod("[", c("cudaFloatArray", "numeric", "missing"),
           function(x, i, j, ...) {

             if(all(i < 0))
               return(x[][i])

             i = as.integer(i - 1L)
             # do we need to add 1 to max(i)
             .Call("R_getCudaFloatVector", x, max(i) + 1L, i)
           })

setMethod("[", c("cudaIntArray", "numeric", "missing"),
           function(x, i, j, ...) {
             if(all(i < 0))
               return(x[][i])

             i = as.integer(i)
             ans = .Call("R_getCudaIntVector", x, max(i))
             ans[i]
           })



setMethod("[", c("cudaPtrWithLength", "logical", "missing"),
           function(x, i, j, ...) {
             x[which(i)]
           })

################


setMethod("[<-", c("cudaPtrWithLength", "missing", "missing"),
           function(x, i, j, ..., value) {
             if(length(value) > x@nels)
               warning("only copying ", x@nels, " elements")
             # coerce to the correct type
             copyToDevice(rep(value, length = x@nels), x)
           })


# Should be able to use the single generic version of this above
# and have the C code avoid copying the data.
setMethod("[<-", c("cudaFloatArray", "missing", "missing"),
           function(x, i, j, ..., value) {
             if(length(value) > x@nels)
               warning("only copying ", x@nels, " elements")             
             copyToDevice(rep(as.numeric(value), length = x@nels), x)
           })

setMethod("[<-", c("cudaIntArray", "missing", "missing"),
           function(x, i, j, ..., value) {
             if(length(value) > x@nels)
               warning("only copying ", x@nels, " elements")             
             copyToDevice(rep(as.integer(value), length = x@nels), x)
           })




cuInit =
function(flags = 0L)
{  
  ans = .Call("R_cuInit", as.integer(flags), FALSE)
  if(ans != 0)
     raiseError(ans, "failed to initialize CUDA")
  ans
}

cuGetContext =
function(create = TRUE, ..., asContext = TRUE)
{  
  ans = .Call("R_cuCtxGetCurrent", as.logical(asContext))
  if(is.integer(ans))
     raiseError(ans, "failed to get current CUDA context")

  if(isNativeNull(ans) && create)
    ans = createContext(...)
  
  ans
}




cudaVersion = cuVersion =
function()
{
  structure(.Call("R_cuGetVersion"), names = c("driver", "runtime"))
}


cudaErrorString =
function()
  .Call("R_cudaGetLastError")




cuMemGetInfo = cuMemInfo =
function()
{
  ans = .Call("R_cuMemGetInfo")
  if(is.integer(ans))
     raiseError(ans, "failed to get current CUDA context")

  ans[3] = ans[1]/ans[2]
  names(ans) = c("free", "total", "% free")
  ans
}




cuFuncGetAttributes =
function(func)
{
     # ignore the MAX entry 
   vals = CUfunction_attributeValues
   sapply(unclass(vals)[-length(vals)], cuFuncGetAttribute,  as(func, "CUfunction"))
}







cuModuleLoadDataEx <-
function( image , options = integer(), opts = FALSE)
{
  cuGetContext()
  options = if(length(options))
              sapply(options, function(x) as.integer(as(x, "CUjit_option")))
            else
              integer()
  
  ans = .Call('R_cuModuleLoadDataEx', image, options, as.logical(opts))
  
  if(is(ans, 'CUresult') && ans != 0)
    raiseError(ans, 'R_cuModuleLoadDataEx')
  if(opts)
    ans
  else
    ans
}


cudaSetDevice = 
function(device = 1L)
{
   device = as.integer(device) - 1L
   if(device  < 0)
     stop("negative index for device. Must be a positive number")

   .Call("R_cudaSetDevice", device)
}