diff --git a/VadereUtils/resources/BitonicSort.cl b/VadereUtils/resources/BitonicSort.cl new file mode 100644 index 0000000000000000000000000000000000000000..722ebfc8a0881adf1520030b324df62a725da6c7 --- /dev/null +++ b/VadereUtils/resources/BitonicSort.cl @@ -0,0 +1,241 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +#define LOCAL_SIZE_LIMIT 256U + +inline void ComparatorPrivate( + uint *keyA, + uint *valA, + uint *keyB, + uint *valB, + uint dir +){ + if( (*keyA > *keyB) == dir ){ + uint t; + t = *keyA; *keyA = *keyB; *keyB = t; + t = *valA; *valA = *valB; *valB = t; + } +} + +inline void ComparatorLocal( + __local uint *keyA, + __local uint *valA, + __local uint *keyB, + __local uint *valB, + uint dir +){ + if( (*keyA > *keyB) == dir ){ + uint t; + t = *keyA; *keyA = *keyB; *keyB = t; + t = *valA; *valA = *valB; *valB = t; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Monolithic bitonic sort kernel for short arrays fitting into local memory +//////////////////////////////////////////////////////////////////////////////// +__kernel void bitonicSortLocal( + __global uint *d_DstKey, + __global uint *d_DstVal, + __global uint *d_SrcKey, + __global uint *d_SrcVal, + uint arrayLength, + uint dir +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + __local uint l_val[LOCAL_SIZE_LIMIT]; + + //Offset to the beginning of subbatch and load data + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; + + for(uint size = 2; size < arrayLength; size <<= 1){ + //Bitonic merge + uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 ); + for(uint stride = size / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + ddd + ); + } + } + + //ddd == dir for the last bitonic merge step + { + for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + dir + ); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstVal[ 0] = l_val[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; + d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Bitonic sort kernel for large arrays (not fitting into local memory) +//////////////////////////////////////////////////////////////////////////////// +//Bottom-level bitonic sort +//Almost the same as bitonicSortLocal with the only exception +//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being +//sorted in opposite directions +__kernel void bitonicSortLocal1( + __global uint *d_DstKey, + __global uint *d_DstVal, + __global uint *d_SrcKey, + __global uint *d_SrcVal +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + __local uint l_val[LOCAL_SIZE_LIMIT]; + + //Offset to the beginning of subarray and load data + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; + + uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); + + for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ + //Bitonic merge + uint ddd = (comparatorI & (size / 2)) != 0; + for(uint stride = size / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + ddd + ); + } + } + + //Odd / even arrays of LOCAL_SIZE_LIMIT elements + //sorted in opposite directions + { + uint ddd = (get_group_id(0) & 1); + for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + ddd + ); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstVal[ 0] = l_val[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; + d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} + +//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT +__kernel void bitonicMergeGlobal( + __global uint *d_DstKey, + __global uint *d_DstVal, + __global uint *d_SrcKey, + __global uint *d_SrcVal, + uint arrayLength, + uint size, + uint stride, + uint dir +){ + uint global_comparatorI = get_global_id(0); + uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); + + //Bitonic merge + uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); + + uint keyA = d_SrcKey[pos + 0]; + uint valA = d_SrcVal[pos + 0]; + uint keyB = d_SrcKey[pos + stride]; + uint valB = d_SrcVal[pos + stride]; + + ComparatorPrivate( + &keyA, &valA, + &keyB, &valB, + ddd + ); + + d_DstKey[pos + 0] = keyA; + d_DstVal[pos + 0] = valA; + d_DstKey[pos + stride] = keyB; + d_DstVal[pos + stride] = valB; +} + +//Combined bitonic merge steps for +//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] +__kernel void bitonicMergeLocal( + __global uint *d_DstKey, + __global uint *d_DstVal, + __global uint *d_SrcKey, + __global uint *d_SrcVal, + uint arrayLength, + uint stride, + uint size, + uint dir +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + __local uint l_val[LOCAL_SIZE_LIMIT]; + + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; + + //Bitonic merge + uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); + uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + for(; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + ddd + ); + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstVal[ 0] = l_val[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; + d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} \ No newline at end of file diff --git a/VadereUtils/src/org/vadere/util/opencl/CLBitonicSort.java b/VadereUtils/src/org/vadere/util/opencl/CLBitonicSort.java new file mode 100644 index 0000000000000000000000000000000000000000..b8ac32a08cebf98494abb03145f662b996183a73 --- /dev/null +++ b/VadereUtils/src/org/vadere/util/opencl/CLBitonicSort.java @@ -0,0 +1,374 @@ +package org.vadere.util.opencl; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.NotNull; +import org.lwjgl.PointerBuffer; +import org.lwjgl.opencl.CLContextCallback; +import org.lwjgl.opencl.CLProgramCallback; +import org.lwjgl.system.Configuration; +import org.lwjgl.system.MemoryStack; +import org.lwjgl.system.MemoryUtil; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.FloatBuffer; +import java.nio.IntBuffer; + +import static org.lwjgl.opencl.CL10.CL_CONTEXT_PLATFORM; +import static org.lwjgl.opencl.CL10.CL_DEVICE_NAME; +import static org.lwjgl.opencl.CL10.CL_DEVICE_TYPE_GPU; +import static org.lwjgl.opencl.CL10.CL_MEM_ALLOC_HOST_PTR; +import static org.lwjgl.opencl.CL10.CL_MEM_COPY_HOST_PTR; +import static org.lwjgl.opencl.CL10.CL_MEM_READ_ONLY; +import static org.lwjgl.opencl.CL10.CL_MEM_READ_WRITE; +import static org.lwjgl.opencl.CL10.CL_MEM_WRITE_ONLY; +import static org.lwjgl.opencl.CL10.CL_PROGRAM_BUILD_STATUS; +import static org.lwjgl.opencl.CL10.CL_SUCCESS; +import static org.lwjgl.opencl.CL10.clBuildProgram; +import static org.lwjgl.opencl.CL10.clCreateBuffer; +import static org.lwjgl.opencl.CL10.clCreateCommandQueue; +import static org.lwjgl.opencl.CL10.clCreateContext; +import static org.lwjgl.opencl.CL10.clCreateKernel; +import static org.lwjgl.opencl.CL10.clCreateProgramWithSource; +import static org.lwjgl.opencl.CL10.clEnqueueNDRangeKernel; +import static org.lwjgl.opencl.CL10.clEnqueueReadBuffer; +import static org.lwjgl.opencl.CL10.clEnqueueWriteBuffer; +import static org.lwjgl.opencl.CL10.clFinish; +import static org.lwjgl.opencl.CL10.clGetDeviceIDs; +import static org.lwjgl.opencl.CL10.clGetPlatformIDs; +import static org.lwjgl.opencl.CL10.clReleaseCommandQueue; +import static org.lwjgl.opencl.CL10.clReleaseContext; +import static org.lwjgl.opencl.CL10.clReleaseKernel; +import static org.lwjgl.opencl.CL10.clReleaseMemObject; +import static org.lwjgl.opencl.CL10.clReleaseProgram; +import static org.lwjgl.opencl.CL10.clSetKernelArg1i; +import static org.lwjgl.opencl.CL10.clSetKernelArg1p; +import static org.lwjgl.system.MemoryStack.stackPush; +import static org.lwjgl.system.MemoryUtil.NULL; +import static org.lwjgl.system.MemoryUtil.memUTF8; + +/** + * @author Benedikt Zoennchen + */ +public class CLBitonicSort { + private static Logger log = LogManager.getLogger(CLBitonicSort.class); + + // CL ids + private long clPlatform; + private long clDevice; + private long clContext; + private long clQueue; + private long clProgram; + + // CL Memory + private long clInKeys; + private long clOutKeys; + private long clInValues; + private long clOutValues; + + // Host Memory + private IntBuffer inKeys; + private IntBuffer outKeys; + private IntBuffer inValues; + private IntBuffer outValues; + + private ByteBuffer source; + + // CL callbacks + private CLContextCallback contextCB; + private CLProgramCallback programCB; + + // CL kernel + private long clBitonicSortLocal; + private long clBitonicSortLocal1; + private long clBitonicMergeGlobal; + private long clBitonicMergeLocal; + private long clKernel; + + private int[] keys; + private int[] values; + + private int[] resultValues; + private int[] resultKeys; + + //Note: logically shared with BitonicSort.cl! + private static final int LOCAL_SIZE_LIMIT = 256; + + private boolean debug = false; + + public enum KernelType { + Separate, + Col, + Row, + NonSeparate + } + + public CLBitonicSort() throws OpenCLException { + if(debug) { + Configuration.DEBUG.set(true); + Configuration.DEBUG_MEMORY_ALLOCATOR.set(true); + Configuration.DEBUG_STACK.set(true); + } + init(); + } + + public int[] getResultKeys() { + return resultKeys; + } + + public int[] getResultValues() { + return resultValues; + } + + public void init() throws OpenCLException { + initCallbacks(); + initCL(); + buildProgram(); + } + + public void sort(@NotNull final int[] keys, @NotNull final int[] values) throws OpenCLException { + assert factorRadix2(keys.length) == 1 && keys.length == values.length; + + inKeys = CLUtils.toIntBuffer(keys, CLUtils.toIntBuffer(keys)); + outKeys = CLUtils.toIntBuffer(keys); + inValues = CLUtils.toIntBuffer(values, CLUtils.toIntBuffer(values)); + outValues = CLUtils.toIntBuffer(values); + + try (MemoryStack stack = stackPush()) { + + int dir = 1; + + PointerBuffer clGlobalWorkSize = stack.callocPointer(1); + PointerBuffer clLocalWorkSize = stack.callocPointer(1); + IntBuffer errcode_ret = stack.callocInt(1); + // host memory to gpu memory + clInKeys = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, inKeys, errcode_ret); + CLInfo.checkCLError(errcode_ret); + clOutKeys = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * keys.length, errcode_ret); + CLInfo.checkCLError(errcode_ret); + clInValues = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, inValues, errcode_ret); + CLInfo.checkCLError(errcode_ret); + clOutValues = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * keys.length, errcode_ret); + CLInfo.checkCLError(errcode_ret); + + + // small sorts + if(keys.length <= LOCAL_SIZE_LIMIT) + { + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 0, clOutKeys)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 1, clOutValues)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 2, clInKeys)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 3, clInValues)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicSortLocal, 4, keys.length)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicSortLocal, 5, 1)); + clGlobalWorkSize.put(0, keys.length / 2); + clLocalWorkSize.put(0, keys.length / 2); + + // run the kernel and read the result + CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null)); + CLInfo.checkCLError(clFinish(clQueue)); + } + else { + //Launch bitonicSortLocal1 + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 0, clOutKeys)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 1, clOutValues)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 2, clInKeys)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 3, clInValues)); + + clGlobalWorkSize = stack.callocPointer(1); + clLocalWorkSize = stack.callocPointer(1); + clGlobalWorkSize.put(0, keys.length / 2); + clLocalWorkSize.put(0, LOCAL_SIZE_LIMIT / 2); + + CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal1, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null)); + CLInfo.checkCLError(clFinish(clQueue)); + + for(int size = 2 * LOCAL_SIZE_LIMIT; size <= keys.length; size <<= 1) + { + for(int stride = size / 2; stride > 0; stride >>= 1) + { + if(stride >= LOCAL_SIZE_LIMIT) + { + //Launch bitonicMergeGlobal + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 0, clOutKeys)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 1, clOutValues)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 2, clOutKeys)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 3, clOutValues)); + + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 4, keys.length)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 5, size)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 6, stride)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 7, dir)); + + clGlobalWorkSize = stack.callocPointer(1); + clLocalWorkSize = stack.callocPointer(1); + clGlobalWorkSize.put(0, keys.length / 2); + clLocalWorkSize.put(0, LOCAL_SIZE_LIMIT / 4); + + CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeGlobal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null)); + CLInfo.checkCLError(clFinish(clQueue)); + } + else + { + //Launch bitonicMergeLocal + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 0, clOutKeys)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 1, clOutValues)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 2, clOutKeys)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 3, clOutValues)); + + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 4, keys.length)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 5, stride)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 6, size)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 7, dir)); + + clGlobalWorkSize = stack.callocPointer(1); + clLocalWorkSize = stack.callocPointer(1); + clGlobalWorkSize.put(0, keys.length / 2); + clLocalWorkSize.put(0, LOCAL_SIZE_LIMIT / 2); + + CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null)); + CLInfo.checkCLError(clFinish(clQueue)); + break; + } + } + } + } + clEnqueueReadBuffer(clQueue, clOutKeys, true, 0, outKeys, null, null); + clEnqueueReadBuffer(clQueue, clOutValues, true, 0, outValues, null, null); + resultKeys = CLUtils.toIntArray(outKeys, keys.length); + resultValues = CLUtils.toIntArray(outValues, values.length); + } + + clearCL(); + } + + static long factorRadix2(long L){ + if(L==0){ + return 0; + }else{ + for(int log2L = 0; (L & 1) == 0; L >>= 1, log2L++); + return L; + } + } + + public void clear() throws OpenCLException { + clearMemory(); + } + + private void clearMemory() throws OpenCLException { + // release memory and devices + try { + CLInfo.checkCLError(clReleaseMemObject(clInKeys)); + CLInfo.checkCLError(clReleaseMemObject(clOutKeys)); + CLInfo.checkCLError(clReleaseMemObject(clInValues)); + CLInfo.checkCLError(clReleaseMemObject(clOutValues)); + } + catch (OpenCLException ex) { + throw ex; + } + finally { + MemoryUtil.memFree(inKeys); + MemoryUtil.memFree(outKeys); + MemoryUtil.memFree(inValues); + MemoryUtil.memFree(inKeys); + MemoryUtil.memFree(source); + } + } + + private void clearCL() throws OpenCLException { + CLInfo.checkCLError(clReleaseKernel(clBitonicSortLocal)); + CLInfo.checkCLError(clReleaseKernel(clBitonicSortLocal1)); + CLInfo.checkCLError(clReleaseKernel(clBitonicMergeGlobal)); + CLInfo.checkCLError(clReleaseKernel(clBitonicMergeLocal)); + + CLInfo.checkCLError(clReleaseCommandQueue(clQueue)); + CLInfo.checkCLError(clReleaseProgram(clProgram)); + CLInfo.checkCLError(clReleaseContext(clContext)); + contextCB.free(); + programCB.free(); + } + + // private helpers + private void initCallbacks() { + contextCB = CLContextCallback.create((errinfo, private_info, cb, user_data) -> + { + log.debug("[LWJGL] cl_context_callback" + "\tInfo: " + memUTF8(errinfo)); + }); + + programCB = CLProgramCallback.create((program, user_data) -> + { + try { + log.debug("The cl_program [0x"+program+"] was built " + (CLInfo.getProgramBuildInfoInt(program, clDevice, CL_PROGRAM_BUILD_STATUS) == CL_SUCCESS ? "successfully" : "unsuccessfully")); + } catch (OpenCLException e) { + e.printStackTrace(); + } + }); + } + + private void initCL() throws OpenCLException { + try (MemoryStack stack = stackPush()) { + IntBuffer errcode_ret = stack.callocInt(1); + IntBuffer numberOfPlatforms = stack.mallocInt(1); + + CLInfo.checkCLError(clGetPlatformIDs(null, numberOfPlatforms)); + PointerBuffer platformIDs = stack.mallocPointer(numberOfPlatforms.get(0)); + CLInfo.checkCLError(clGetPlatformIDs(platformIDs, numberOfPlatforms)); + + clPlatform = platformIDs.get(0); + + IntBuffer numberOfDevices = stack.mallocInt(1); + CLInfo.checkCLError(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, null, numberOfDevices)); + PointerBuffer deviceIDs = stack.mallocPointer(numberOfDevices.get(0)); + CLInfo.checkCLError(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, deviceIDs, numberOfDevices)); + + clDevice = deviceIDs.get(0); + + log.debug("CL_DEVICE_NAME = " + CLInfo.getDeviceInfoStringUTF8(clDevice, CL_DEVICE_NAME)); + + PointerBuffer ctxProps = stack.mallocPointer(3); + ctxProps.put(CL_CONTEXT_PLATFORM) + .put(clPlatform) + .put(NULL) + .flip(); + + clContext = clCreateContext(ctxProps, clDevice, contextCB, NULL, errcode_ret); + CLInfo.checkCLError(errcode_ret); + + clQueue = clCreateCommandQueue(clContext, clDevice, 0, errcode_ret); + CLInfo.checkCLError(errcode_ret); + } + } + + private void buildProgram() throws OpenCLException { + try (MemoryStack stack = stackPush()) { + IntBuffer errcode_ret = stack.callocInt(1); + + PointerBuffer strings = stack.mallocPointer(1); + PointerBuffer lengths = stack.mallocPointer(1); + + try { + source = CLUtils.ioResourceToByteBuffer("BitonicSort.cl", 4096); + } catch (IOException e) { + throw new OpenCLException(e.getMessage()); + } + + + strings.put(0, source); + lengths.put(0, source.remaining()); + + clProgram = clCreateProgramWithSource(clContext, strings, lengths, errcode_ret); + CLInfo.checkCLError(clBuildProgram(clProgram, clDevice, "", programCB, NULL)); + clBitonicSortLocal = clCreateKernel(clProgram, "bitonicSortLocal", errcode_ret); + CLInfo.checkCLError(errcode_ret); + clBitonicSortLocal1 = clCreateKernel(clProgram, "bitonicSortLocal1", errcode_ret); + CLInfo.checkCLError(errcode_ret); + clBitonicMergeGlobal = clCreateKernel(clProgram, "bitonicMergeGlobal", errcode_ret); + CLInfo.checkCLError(errcode_ret); + clBitonicMergeLocal = clCreateKernel(clProgram, "bitonicMergeLocal", errcode_ret); + CLInfo.checkCLError(errcode_ret); + + } + + } +} diff --git a/VadereUtils/src/org/vadere/util/opencl/CLUtils.java b/VadereUtils/src/org/vadere/util/opencl/CLUtils.java index 6ae433c11cc605c2009f187d2cfb203a5c4136a1..cae8a492fd11d6271bc0ffec956778e698b5bdb4 100644 --- a/VadereUtils/src/org/vadere/util/opencl/CLUtils.java +++ b/VadereUtils/src/org/vadere/util/opencl/CLUtils.java @@ -8,6 +8,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.FloatBuffer; +import java.nio.IntBuffer; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.nio.channels.SeekableByteChannel; @@ -63,6 +64,18 @@ public class CLUtils { return buffer; } + public static IntBuffer toIntBuffer(@NotNull final int[] array) { + IntBuffer intBuffer = MemoryUtil.memAllocInt(array.length); + return intBuffer; + } + + public static IntBuffer toIntBuffer(@NotNull final int[] array, @NotNull final IntBuffer intBuffer) { + for(int i = 0; i < array.length; i++) { + intBuffer.put(i, array[i]); + } + return intBuffer; + } + public static FloatBuffer toFloatBuffer(@NotNull final float[] floats) { FloatBuffer floatBuffer = MemoryUtil.memAllocFloat(floats.length); return toFloatBuffer(floats, floatBuffer); @@ -75,7 +88,16 @@ public class CLUtils { return floatBuffer; } - public static float[] toFloatArray(@NotNull final FloatBuffer floatBuffer, final int size) { + public static int[] toIntArray(@NotNull final IntBuffer floatBuffer, final int size) { + int[] result = new int[size]; + for(int i = 0; i < size; i++) { + result[i] = floatBuffer.get(i); + } + return result; + } + + + public static float[] toFloatArray(@NotNull final FloatBuffer floatBuffer, final int size) { float[] result = new float[size]; for(int i = 0; i < size; i++) { result[i] = floatBuffer.get(i); diff --git a/VadereUtils/tests/org/vadere/util/math/TestBitonicSort.java b/VadereUtils/tests/org/vadere/util/math/TestBitonicSort.java new file mode 100644 index 0000000000000000000000000000000000000000..0b20ccefea36201f1cf9d4a56118f0a0db621b60 --- /dev/null +++ b/VadereUtils/tests/org/vadere/util/math/TestBitonicSort.java @@ -0,0 +1,77 @@ +package org.vadere.util.math; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.Before; +import org.junit.Test; +import org.vadere.util.opencl.CLBitonicSort; +import org.vadere.util.opencl.OpenCLException; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Random; + +import static org.junit.Assert.assertTrue; + +/** + * @author Benedikt Zoennchen + */ +public class TestBitonicSort { + + private static Logger logger = LogManager.getLogger(TestConvolution.class); + + private static Random random = new Random(); + + @Before + public void setUp() throws Exception {} + + @Test + public void testLocalSort() throws IOException, OpenCLException { + int[] keys = randomArray(256); + int[] values = randomArray(256); + + CLBitonicSort clBitonicSort = new CLBitonicSort(); + clBitonicSort.sort(keys, values); + + int[] resultKeys = clBitonicSort.getResultKeys(); + int[] resultValues = clBitonicSort.getResultValues(); + + Arrays.sort(keys); + assertTrue(Arrays.equals(keys, resultKeys)); + } + + @Test + public void testGlobalSort() throws IOException, OpenCLException { + int[] keys = randomArray(2048*2*2*2*2); + int[] values = randomArray(2048*2*2*2*2); + + CLBitonicSort clBitonicSort = new CLBitonicSort(); + clBitonicSort.sort(keys, values); + + int[] resultKeys = clBitonicSort.getResultKeys(); + int[] resultValues = clBitonicSort.getResultValues(); + + Arrays.sort(keys); + assertTrue(Arrays.equals(keys, resultKeys)); + } + + public static int[] randomArray(final int size) { + List list = new ArrayList<>(size); + int[] array = new int[size]; + + for(int i = 0; i < size; i++) { + list.add(i); + } + + Collections.shuffle(list); + + for(int i = 0; i < size; i++) { + array[i] = list.get(i); + } + + return array; + } +}