Currently job artifacts in CI/CD pipelines on LRZ GitLab never expire. Starting from Wed 26.1.2022 the default expiration time will be 30 days (GitLab default). Currently existing artifacts in already completed jobs will not be affected by the change. The latest artifacts for all jobs in the latest successful pipelines will be kept. More information: https://gitlab.lrz.de/help/user/admin_area/settings/continuous_integration.html#default-artifacts-expiration

Commit adc98fe6 authored by Benedikt Zoennchen's avatar Benedikt Zoennchen
Browse files

implementation of Bitonic sort on the GPU.

parent caff4ed7
Pipeline #61189 failed with stage
in 21 seconds
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#define LOCAL_SIZE_LIMIT 256U
inline void ComparatorPrivate(
uint *keyA,
uint *valA,
uint *keyB,
uint *valB,
uint dir
){
if( (*keyA > *keyB) == dir ){
uint t;
t = *keyA; *keyA = *keyB; *keyB = t;
t = *valA; *valA = *valB; *valB = t;
}
}
inline void ComparatorLocal(
__local uint *keyA,
__local uint *valA,
__local uint *keyB,
__local uint *valB,
uint dir
){
if( (*keyA > *keyB) == dir ){
uint t;
t = *keyA; *keyA = *keyB; *keyB = t;
t = *valA; *valA = *valB; *valB = t;
}
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
__kernel void bitonicSortLocal(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal,
uint arrayLength,
uint dir
){
__local uint l_key[LOCAL_SIZE_LIMIT];
__local uint l_val[LOCAL_SIZE_LIMIT];
//Offset to the beginning of subbatch and load data
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
for(uint size = 2; size < arrayLength; size <<= 1){
//Bitonic merge
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
dir
);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
__kernel void bitonicSortLocal1(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal
){
__local uint l_key[LOCAL_SIZE_LIMIT];
__local uint l_val[LOCAL_SIZE_LIMIT];
//Offset to the beginning of subarray and load data
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
//Bitonic merge
uint ddd = (comparatorI & (size / 2)) != 0;
for(uint stride = size / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
}
//Odd / even arrays of LOCAL_SIZE_LIMIT elements
//sorted in opposite directions
{
uint ddd = (get_group_id(0) & 1);
for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void bitonicMergeGlobal(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal,
uint arrayLength,
uint size,
uint stride,
uint dir
){
uint global_comparatorI = get_global_id(0);
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
uint keyA = d_SrcKey[pos + 0];
uint valA = d_SrcVal[pos + 0];
uint keyB = d_SrcKey[pos + stride];
uint valB = d_SrcVal[pos + stride];
ComparatorPrivate(
&keyA, &valA,
&keyB, &valB,
ddd
);
d_DstKey[pos + 0] = keyA;
d_DstVal[pos + 0] = valA;
d_DstKey[pos + stride] = keyB;
d_DstVal[pos + stride] = valB;
}
//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
__kernel void bitonicMergeLocal(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal,
uint arrayLength,
uint stride,
uint size,
uint dir
){
__local uint l_key[LOCAL_SIZE_LIMIT];
__local uint l_val[LOCAL_SIZE_LIMIT];
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
//Bitonic merge
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for(; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
\ No newline at end of file
package org.vadere.util.opencl;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.jetbrains.annotations.NotNull;
import org.lwjgl.PointerBuffer;
import org.lwjgl.opencl.CLContextCallback;
import org.lwjgl.opencl.CLProgramCallback;
import org.lwjgl.system.Configuration;
import org.lwjgl.system.MemoryStack;
import org.lwjgl.system.MemoryUtil;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import static org.lwjgl.opencl.CL10.CL_CONTEXT_PLATFORM;
import static org.lwjgl.opencl.CL10.CL_DEVICE_NAME;
import static org.lwjgl.opencl.CL10.CL_DEVICE_TYPE_GPU;
import static org.lwjgl.opencl.CL10.CL_MEM_ALLOC_HOST_PTR;
import static org.lwjgl.opencl.CL10.CL_MEM_COPY_HOST_PTR;
import static org.lwjgl.opencl.CL10.CL_MEM_READ_ONLY;
import static org.lwjgl.opencl.CL10.CL_MEM_READ_WRITE;
import static org.lwjgl.opencl.CL10.CL_MEM_WRITE_ONLY;
import static org.lwjgl.opencl.CL10.CL_PROGRAM_BUILD_STATUS;
import static org.lwjgl.opencl.CL10.CL_SUCCESS;
import static org.lwjgl.opencl.CL10.clBuildProgram;
import static org.lwjgl.opencl.CL10.clCreateBuffer;
import static org.lwjgl.opencl.CL10.clCreateCommandQueue;
import static org.lwjgl.opencl.CL10.clCreateContext;
import static org.lwjgl.opencl.CL10.clCreateKernel;
import static org.lwjgl.opencl.CL10.clCreateProgramWithSource;
import static org.lwjgl.opencl.CL10.clEnqueueNDRangeKernel;
import static org.lwjgl.opencl.CL10.clEnqueueReadBuffer;
import static org.lwjgl.opencl.CL10.clEnqueueWriteBuffer;
import static org.lwjgl.opencl.CL10.clFinish;
import static org.lwjgl.opencl.CL10.clGetDeviceIDs;
import static org.lwjgl.opencl.CL10.clGetPlatformIDs;
import static org.lwjgl.opencl.CL10.clReleaseCommandQueue;
import static org.lwjgl.opencl.CL10.clReleaseContext;
import static org.lwjgl.opencl.CL10.clReleaseKernel;
import static org.lwjgl.opencl.CL10.clReleaseMemObject;
import static org.lwjgl.opencl.CL10.clReleaseProgram;
import static org.lwjgl.opencl.CL10.clSetKernelArg1i;
import static org.lwjgl.opencl.CL10.clSetKernelArg1p;
import static org.lwjgl.system.MemoryStack.stackPush;
import static org.lwjgl.system.MemoryUtil.NULL;
import static org.lwjgl.system.MemoryUtil.memUTF8;
/**
* @author Benedikt Zoennchen
*/
public class CLBitonicSort {
private static Logger log = LogManager.getLogger(CLBitonicSort.class);
// CL ids
private long clPlatform;
private long clDevice;
private long clContext;
private long clQueue;
private long clProgram;
// CL Memory
private long clInKeys;
private long clOutKeys;
private long clInValues;
private long clOutValues;
// Host Memory
private IntBuffer inKeys;
private IntBuffer outKeys;
private IntBuffer inValues;
private IntBuffer outValues;
private ByteBuffer source;
// CL callbacks
private CLContextCallback contextCB;
private CLProgramCallback programCB;
// CL kernel
private long clBitonicSortLocal;
private long clBitonicSortLocal1;
private long clBitonicMergeGlobal;
private long clBitonicMergeLocal;
private long clKernel;
private int[] keys;
private int[] values;
private int[] resultValues;
private int[] resultKeys;
//Note: logically shared with BitonicSort.cl!
private static final int LOCAL_SIZE_LIMIT = 256;
private boolean debug = false;
public enum KernelType {
Separate,
Col,
Row,
NonSeparate
}
public CLBitonicSort() throws OpenCLException {
if(debug) {
Configuration.DEBUG.set(true);
Configuration.DEBUG_MEMORY_ALLOCATOR.set(true);
Configuration.DEBUG_STACK.set(true);
}
init();
}
public int[] getResultKeys() {
return resultKeys;
}
public int[] getResultValues() {
return resultValues;
}
public void init() throws OpenCLException {
initCallbacks();
initCL();
buildProgram();
}
public void sort(@NotNull final int[] keys, @NotNull final int[] values) throws OpenCLException {
assert factorRadix2(keys.length) == 1 && keys.length == values.length;
inKeys = CLUtils.toIntBuffer(keys, CLUtils.toIntBuffer(keys));
outKeys = CLUtils.toIntBuffer(keys);
inValues = CLUtils.toIntBuffer(values, CLUtils.toIntBuffer(values));
outValues = CLUtils.toIntBuffer(values);
try (MemoryStack stack = stackPush()) {
int dir = 1;
PointerBuffer clGlobalWorkSize = stack.callocPointer(1);
PointerBuffer clLocalWorkSize = stack.callocPointer(1);
IntBuffer errcode_ret = stack.callocInt(1);
// host memory to gpu memory
clInKeys = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, inKeys, errcode_ret);
CLInfo.checkCLError(errcode_ret);
clOutKeys = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * keys.length, errcode_ret);
CLInfo.checkCLError(errcode_ret);
clInValues = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, inValues, errcode_ret);
CLInfo.checkCLError(errcode_ret);
clOutValues = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * keys.length, errcode_ret);
CLInfo.checkCLError(errcode_ret);
// small sorts
if(keys.length <= LOCAL_SIZE_LIMIT)
{
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 0, clOutKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 1, clOutValues));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 2, clInKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 3, clInValues));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicSortLocal, 4, keys.length));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicSortLocal, 5, 1));
clGlobalWorkSize.put(0, keys.length / 2);
clLocalWorkSize.put(0, keys.length / 2);
// run the kernel and read the result
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
}
else {
//Launch bitonicSortLocal1
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 0, clOutKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 1, clOutValues));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 2, clInKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 3, clInValues));
clGlobalWorkSize = stack.callocPointer(1);
clLocalWorkSize = stack.callocPointer(1);
clGlobalWorkSize.put(0, keys.length / 2);
clLocalWorkSize.put(0, LOCAL_SIZE_LIMIT / 2);
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal1, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
for(int size = 2 * LOCAL_SIZE_LIMIT; size <= keys.length; size <<= 1)
{
for(int stride = size / 2; stride > 0; stride >>= 1)
{
if(stride >= LOCAL_SIZE_LIMIT)
{
//Launch bitonicMergeGlobal
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 0, clOutKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 1, clOutValues));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 2, clOutKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 3, clOutValues));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 4, keys.length));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 5, size));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 6, stride));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 7, dir));
clGlobalWorkSize = stack.callocPointer(1);
clLocalWorkSize = stack.callocPointer(1);
clGlobalWorkSize.put(0, keys.length / 2);
clLocalWorkSize.put(0, LOCAL_SIZE_LIMIT / 4);
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeGlobal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
}
else
{
//Launch bitonicMergeLocal
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 0, clOutKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 1, clOutValues));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 2, clOutKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 3, clOutValues));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 4, keys.length));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 5, stride));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 6, size));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 7, dir));
clGlobalWorkSize = stack.callocPointer(1);
clLocalWorkSize = stack.callocPointer(1);
clGlobalWorkSize.put(0, keys.length / 2);
clLocalWorkSize.put(0, LOCAL_SIZE_LIMIT / 2);
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
break;
}
}
}
}
clEnqueueReadBuffer(clQueue, clOutKeys, true, 0, outKeys, null, null);
clEnqueueReadBuffer(clQueue, clOutValues, true, 0, outValues, null, null);
resultKeys = CLUtils.toIntArray(outKeys, keys.length);
resultValues = CLUtils.toIntArray(outValues, values.length);
}
clearCL();
}
static long factorRadix2(long L){
if(L==0){
return 0;
}else{
for(int log2L = 0; (L & 1) == 0; L >>= 1, log2L++);
return L;
}
}
public void clear() throws OpenCLException {
clearMemory();
}
private void clearMemory() throws OpenCLException {
// release memory and devices
try {
CLInfo.checkCLError(clReleaseMemObject(clInKeys));
CLInfo.checkCLError(clReleaseMemObject(clOutKeys));
CLInfo.checkCLError(clReleaseMemObject(clInValues));
CLInfo.checkCLError(clReleaseMemObject(clOutValues));
}
catch (OpenCLException ex) {
throw ex;
}
finally {
MemoryUtil.memFree(inKeys);
MemoryUtil.memFree(outKeys);
MemoryUtil.memFree(inValues);
MemoryUtil.memFree(inKeys);
MemoryUtil.memFree(source);
}
}
private void clearCL() throws OpenCLException {
CLInfo.checkCLError(clReleaseKernel(clBitonicSortLocal));
CLInfo.checkCLError(clReleaseKernel(clBitonicSortLocal1));
CLInfo.checkCLError(clReleaseKernel(clBitonicMergeGlobal));
CLInfo.checkCLError(clReleaseKernel(clBitonicMergeLocal));
CLInfo.checkCLError(clReleaseCommandQueue(clQueue));
CLInfo.checkCLError(clReleaseProgram(clProgram));
CLInfo.checkCLError(clReleaseContext(clContext));
contextCB.free();
programCB.free();
}
// private helpers
private void initCallbacks() {
contextCB = CLContextCallback.create((errinfo, private_info, cb, user_data) ->
{
log.debug("[LWJGL] cl_context_callback" + "\tInfo: " + memUTF8(errinfo));
});
programCB = CLProgramCallback.create((program, user_data) ->
{
try {
log.debug("The cl_program [0x"+program+"] was built " + (CLInfo.getProgramBuildInfoInt(program, clDevice, CL_PROGRAM_BUILD_STATUS) == CL_SUCCESS ? "successfully" : "unsuccessfully"));
} catch (OpenCLException e) {
e.printStackTrace();
}
});
}
private void initCL() throws OpenCLException {
try (MemoryStack stack = stackPush()) {
IntBuffer errcode_ret = stack.callocInt(1);
IntBuffer numberOfPlatforms = stack.mallocInt(1);
CLInfo.checkCLError(clGetPlatformIDs(null, numberOfPlatforms));
PointerBuffer platformIDs = stack.mallocPointer(numberOfPlatforms.get(0));
CLInfo.checkCLError(clGetPlatformIDs(platformIDs, numberOfPlatforms));
clPlatform = platformIDs.get(0);
IntBuffer numberOfDevices = stack.mallocInt(1);
CLInfo.checkCLError(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, null, numberOfDevices));
PointerBuffer deviceIDs = stack.mallocPointer(numberOfDevices.get(0));
CLInfo.checkCLError(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, deviceIDs, numberOfDevices));
clDevice = deviceIDs.get(0);
log.debug("CL_DEVICE_NAME = " + CLInfo.getDeviceInfoStringUTF8(clDevice, CL_DEVICE_NAME));
PointerBuffer ctxProps = stack.mallocPointer(3);
ctxProps.put(CL_CONTEXT_PLATFORM)
.put(clPlatform)
.put(NULL)
.flip();
clContext = clCreateContext(ctxProps, clDevice, contextCB, NULL, errcode_ret);