R/kernelManager.R

Defines functions getGroupSize hasKernel readCode kernel.getSharedMem kernel.getOption .kernel

Documented in .kernel kernel.getOption

#' Excute the openCL function
#' 
#' The function serves as a bridge between R and openCL, it sends the openCL code and 
#' R matrix object to the device and excutes it on the device. The function has an auto-type 
#' ability which can make the openCL code independent with the type of its function argument,
#' see detail and examples for the usage.
#' 
#' @details 
#' The function `.kernel()` is the low level API to communicate with openCL device.
#' It provides a way to run the customized code on the device, the source code should be
#' openCL code and the kernel is the kernel function that you want to run on the device.
#' 
#' You can specify with device the code should be run on by specifying the `.device` argument.
#' By default, if you do not specify any device, the first device in the device list will be used
#' 
#' The argument `.globalThreadNum` specifys the number of threads that will be used to excute the kernel.
#' The concept is the same as `global_work_size`` in openCL functions
#' 
#' There are multiple options that you can change in the kernel function. 
#' You can call the function `kernel.getOption()` to obtain the default setting. 
#' The most distinguishable feature in this package is probably the auto type function, which 
#' can set the type of the kernel arguments as an macro in the openCL code. This feature allows the 
#' user to create a type-free code. If the `kernelOption$autoType` in `.options` is true(Default),
#' four macros will be defined, they are(X is the position of the function arguments):
#' 
#' autoX: The variable type
#' 
#' gAutoX: Short for global autoX
#' 
#' lAutoX: short for local autoX
#' 
#' autoX_v4: Define a vector of length 4 with the same variable type as the X th function argument
#' 
#' Please refer to the example for the usage
#' 
#' 
#' @param src the source code, it can be either a file directory or the code
#' @param kernel the kernel function that will be called on the device
#' @param parms a list containing the function arguments. 
#' The number of elements in the list has to match the number of function arguments.
#' @param .device the device that will excute the function. 
#' If not specified, all the selected devices will be used.
#' @param .globalThreadNum the number of threads that will be created to excute the kernel. 
#' If not specified, the length of the first argument will be used as the thread number
#' @param .options the kernel options
#' @examples 
#' #The GPU code
#' code='
#' kernel void matAdd(gAuto1* A,gAuto2* B,gAuto3* C,gAuto4* size){
#' uint col_id=get_global_id(0);
#' uint rowNum=*size;
#' for(uint i=0;i<rowNum;i++){
#' C[i+col_id*rowNum]=A[i+col_id*rowNum]+B[i+col_id*rowNum];
#' }
#' }
#' '
#' #Create data in R
#' m=100
#' n=200
#' A=matrix(runif(m*n),m,n)
#' B=matrix(runif(m*n),m,n)
#' #Send the data to GPU
#' A_dev=gpuMatrix(A,type='double')
#' B_dev=gpuMatrix(B,type='double')
#' #Create an empty data matrix in GPU
#' C_dev=gpuEmptMatrix(row=m,col=n,type='double')
#' 
#' #Get the default options
#' options=kernel.getOption()
#' #Run the GPU function with n threads, each thread computes one column addition
#' .kernel(src = code,kernel='matAdd',parms=list(A_dev,B_dev,C_dev,m),
#' .globalThreadNum = n,.options = options)
#' 
#' #Retrieve the data
#' C_dev=download(C_dev)
#' C=as.matrix(C_dev)
#' #Check the error
#' range(C-A-B)
#' @return A vector or a matrix
#' @export
.kernel <- function(src = "", kernel, parms, .device = "auto", .globalThreadNum = "length(FirstArg)", 
    .options = kernel.getOption()) {
    verbose = .options$verbose
    kernelMsg = .options$kernelMsg
    kernelOption = .options$kernelOption
    
    if (verbose || sum(as.matrix(kernelMsg[, -grep("warning", colnames(kernelMsg))])) != 
        0) 
        message("======kelnel compilation=========")
    # Read the opencl code
    codePack = readCode(src)
    src = codePack$src
    srcSig = codePack$srcSig
    
    
    # Find the device and platform id
    if (length(.device) > 1) 
      stop("Multiple devices are not supported!")
    if (.device == "auto") {
      deviceId = getFirstSelectedDevice()
    } else {
      deviceId = .device
    }
    device = getSelectedDevice(deviceId)
    
    dataType = c()
    for (i in seq_len(length(parms))) {
      if (is(parms[[i]],"list")) {
        dataType[i] = parms[[i]]$type
        next
      }
      if (!is(parms[[i]],"gpuMatrix")) {
        parms[[i]] = gpuMatrix(parms[[i]], device = deviceId)
      }
      dataType[i] = .type(parms[[i]])
      if (.device(parms[[i]]) != deviceId) 
        stop("The data is not in the same device!")
    }
    
    #Find the global thread number
    if (.globalThreadNum == "length(FirstArg)") {
      .globalThreadNum = length(parms[[1]])
    }
    
    #Find the local thread number
    localThreadNum=getThreadNumber(kernelOption)
    #Limit the thread number to the hardware limitation
    localThreadNum=min(localThreadNum,getDeviceInfo(deviceId)$work_group_size)
    #Make sure the number is compatible with the global thread number
    localThreadNum=gcd(localThreadNum,.globalThreadNum)
    
    if (kernelOption$localThreadNumMacro) {
      threadMacro = paste0("#define cl_local_thread_num ", localThreadNum)
      srcSig = paste0(srcSig, localThreadNum)
      src = paste0(threadMacro, "\n", src)
    }
    
    # Create the signature for the kernel function
    sig = paste0(srcSig, kernelOption$flag, kernelOption$signature)
    
    
    # Create the data type macros Add the signature if needed
    if (kernelOption$autoType && length(dataType) != 0) {
        gAUTO = paste0("#define gAuto", seq_along(dataType), " global ", 
            dataType, "\n", collapse = "")
        lAUTO = paste0("#define lAuto", seq_along(dataType), " local ", 
            dataType, "\n", collapse = "")
        pAUTO = paste0("#define auto", seq_along(dataType), " ", dataType, 
            "\n", collapse = "")
        
        AUTOVector = paste0("#define auto", seq_along(dataType), "_v4 ", 
            dataType, "4", "\n", collapse = "")
        
        src = paste0(gAUTO, lAUTO, pAUTO, AUTOVector, src)
        sig = c(sig, paste0(dataType, collapse = ""))
    }
    
    sig_hash = digest(sig)
    # Create the kernel if it does not exist
    if (!hasKernel(device, sig_hash, kernel)) {
        if (verbose || kernelMsg$compilation.msg) 
            message("OpenCL compiler message: The kernel does not exist and will be created")
        .Call(C_createKernel, device[1], device[2], sig_hash, kernelOption$flag, 
            src, kernel)
    }
    # Compute the usage of the shared memory and global memory upload the
    # parameters
    global_memory = 0
    share_memory = 0
    for (i in seq_len(length(parms))) {
        if (is(parms[[i]],"list")) {
            share_memory = share_memory + parms[[i]]$size
            
            .Call(C_setSharedParameter, device[1], device[2], sig_hash, 
                kernel, as.integer(parms[[i]]$size), as.integer(i - 1))
        } else {
            global_memory = global_memory + getSize(parms[[i]])
            # message(getSize(parms[[i]]))
            .Call(C_setParameter, device[1], device[2], sig_hash, kernel, 
                .getAddress(parms[[i]]), as.integer(i - 1))
        }
    }
    if (verbose || kernelMsg$memory.usage.msg) {
        message("OpenCL memory usage report:")
        message("Global memory: ", format_memory_size_output(global_memory))
        message("Shared memory: ", format_memory_size_output(share_memory))
    }
    
    
    
    if (verbose || kernelMsg$thread.num.msg) {
        message("OpenCL thread Number report:")
        message(paste0("Total thread number: ", .globalThreadNum))
        message(paste0("block number: ", .globalThreadNum/localThreadNum))
        message(paste0("Thread number per block: ", localThreadNum))
    }
    
    localThreadNum_preferred=getGroupSize(device,sig_hash,kernel)
    if(localThreadNum_preferred<localThreadNum){
      if(verbose&&kernelOption$localThreadNumMacro){
        message("The current thread number is larger than the preferred setting, please consider to reduce it\n",
                "current value:",localThreadNum,"  Preferred value:",localThreadNum_preferred)
      }
    }
    
    .Call(C_launchKernel, device[1], device[2], sig_hash, kernel, as.integer(.globalThreadNum), 
        as.integer(localThreadNum))
    
    invisible()
}

#' Get the openCL compilation options
#' 
#' @details 
#' #' `verbose` turn the verbose mode on and off.
#' 
#' `kernelOption$localThreadNum` controls the local thread number in each group, 
#' the local thread number should be a divisor of the argument `.globalThreadNum`.
#' If it is set to `auto`, the suggested number of local thread number will be
#' obtained from openCL API and reduced to a divisor of `.globalThreadNum`.
#' 
#' `kernelOption$localThreadNumMacro` specifys whether the local thread number 
#' should be inserted into the code as an macro. If it is TRUE, the macro 
#' `cl_local_thread_num` will be defined. It is useful when you want to dynamically
#' allocate the memory \(Mostly local memory\) according to the local thread number
#' 
#' `kernelOption$signature` This is for internal usage only, please do not change it
#' 
#' `kernelOption$flag` The openCL compiler flag.
#' 
#' `kernelOption$autoType` Determine whether the type of kernel arguments should be 
#' defined as an macro, see the `.kernel` document for detail
#' @examples 
#' opt=kernel.getOption()
#' opt
#' @return A list of available options
#' @export
kernel.getOption <- function() {
  curOp = list()
  curOp$verbose = FALSE
  
  curOp$kernelMsg = data.frame(
    compilation.msg = FALSE, 
    memory.usage.msg = FALSE, 
    thread.num.msg = FALSE)
  
  curOp$kernelOption = data.frame(
    localThreadNum = "auto", 
    localThreadNumMacro = FALSE, 
    signature = "", 
    flag = "", 
    autoType = TRUE, 
    stringsAsFactors = FALSE)
  curOp = structure(curOp, class = "options")
  curOp
}

kernel.getSharedMem <- function(length, type) {
    checkTypeSupport(type)
    return(list(length = length, size = length * getTypeSize(type), type = type))
}

readCode <- function(src) {
    # if the src can be treated as a file name, then read the file,
    # otherwise check if it is code.
    if (nchar(src) < 128 && file.exists(src)) {
        fileName = src
        ## Read source file
        src = readChar(fileName, file.info(fileName)$size)
        sig = as.character(file.mtime(fileName))
        # Add a space to make it more stable
        src = paste0(" ", src)
    } else {
        if (nchar(src) < 40) {
            if (length(grep("\\(.*\\)", src)) == 0) {
                warning("It looks like the src variable is a file address, but I cannot find it.")
            }
        }
        sig = digest(src)
    }
    return(list(src = src, srcSig = sig))
}


# Check if the kernel is already exist
hasKernel <- function(device, sig, kernel) {
    res = .Call(C_hasKernel, device[1], device[2], sig, kernel)
    res
}

getGroupSize <- function(device, sig, kernel) {
    res = .Call(C_getPreferredGroupSize, device[1], device[2], sig, kernel)
    res
}
Jiefei-Wang/gpuMagic documentation built on Oct. 25, 2019, 10:18 a.m.