2.12.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

Commit d3774a1b authored by Benedikt Zoennchen's avatar Benedikt Zoennchen
Browse files

GPU bitonic sorft for arrays of length unequal to n**2

parent 29258581
......@@ -167,7 +167,7 @@ public class CLParallelOptimalStepsModel {
private boolean debug = false;
private boolean profiling = false;
private int numberOfSortElements;
private long numberOfSortElements;
public enum KernelType {
Separate,
......@@ -330,7 +330,7 @@ public class CLParallelOptimalStepsModel {
this.iCellSize = (float)cellSize;
// support also not multiple of 2 !
numberOfSortElements = expOf(pedestrians.size(), 2);
numberOfSortElements = CLUtils.power(pedestrians.size(), 2);
int toRemove = 0;
int originalSize = pedestrians.size();
......@@ -663,7 +663,7 @@ public class CLParallelOptimalStepsModel {
}
else {
localWorkSize = maxWorkGroupSize;
globalWorkSize = multipleOf(numberOfElements, localWorkSize);
globalWorkSize = CLUtils.multiple(numberOfElements, localWorkSize);
}
clGlobalWorkSize.put(0, globalWorkSize);
......@@ -706,7 +706,7 @@ public class CLParallelOptimalStepsModel {
}
else {
localWorkSize = maxWorkGroupSize;
globalWorkSize = multipleOf(numberOfElements, localWorkSize);
globalWorkSize = CLUtils.multiple(numberOfElements, localWorkSize);
}
clGlobalWorkSize.put(0, globalWorkSize);
......@@ -761,22 +761,6 @@ public class CLParallelOptimalStepsModel {
}
}
private int expOf(int value, int multiple) {
int result = 2;
while (result < value) {
result *= multiple;
}
return result;
}
private long multipleOf(long value, long multiple) {
long result = multiple;
while (result < value) {
result += multiple;
}
return result;
}
// TODO: global and local work size computation
private void clBitonicSort(
final long clKeysIn,
......
......@@ -3,6 +3,7 @@ package org.vadere.util.opencl;
import org.jetbrains.annotations.NotNull;
import org.lwjgl.PointerBuffer;
import org.lwjgl.opencl.CL;
import org.lwjgl.opencl.CLContextCallback;
import org.lwjgl.opencl.CLProgramCallback;
import org.lwjgl.system.Configuration;
......@@ -74,8 +75,6 @@ public class CLBitonicSort {
private IntBuffer inValues;
private IntBuffer outValues;
private ByteBuffer source;
// CL callbacks
private CLContextCallback contextCB;
private CLProgramCallback programCB;
......@@ -87,9 +86,6 @@ public class CLBitonicSort {
private long clBitonicMergeLocal;
private long clKernel;
private int[] keys;
private int[] values;
private int[] resultValues;
private int[] resultKeys;
......@@ -98,13 +94,6 @@ public class CLBitonicSort {
private boolean debug = false;
public enum KernelType {
Separate,
Col,
Row,
NonSeparate
}
public CLBitonicSort() throws OpenCLException {
if(debug) {
Configuration.DEBUG.set(true);
......@@ -122,51 +111,80 @@ public class CLBitonicSort {
return resultValues;
}
public void init() throws OpenCLException {
/**
* Builds all OpenCL resources. This does not initialize or reserve any device or host memory.
*
* @throws OpenCLException
*/
private void init() throws OpenCLException {
initCallbacks();
initCL();
buildProgram();
}
public void sort(@NotNull final int[] keys, @NotNull final int[] values) throws OpenCLException {
assert factorRadix2(keys.length) == 1 && keys.length == values.length;
assert keys.length == values.length;
final int[] padKeys;
final int[] padValues;
boolean padding = CLUtils.factorRadix2(keys.length) != 1;
// padding is required!
if(padding) {
int k = (int)CLUtils.power(keys.length, 2);
assert k > keys.length && CLUtils.factorRadix2(k) == 1;
padKeys = new int[k];
padValues = new int[k];
System.arraycopy(keys, 0, padKeys, 0, keys.length);
System.arraycopy(values, 0, padValues, 0, values.length);
for(int i = keys.length; i < padKeys.length; i++) {
padKeys[i] = Integer.MAX_VALUE;
}
} else {
padKeys = keys;
padValues = values;
}
inKeys = CLUtils.toIntBuffer(keys, CLUtils.toIntBuffer(keys));
outKeys = CLUtils.toIntBuffer(keys);
inValues = CLUtils.toIntBuffer(values, CLUtils.toIntBuffer(values));
outValues = CLUtils.toIntBuffer(values);
/**
* We use non-stack memory because the stack might be too small.
*/
inKeys = CLUtils.toIntBuffer(padKeys, CLUtils.toIntBuffer(padKeys));
outKeys = CLUtils.toIntBuffer(padKeys);
inValues = CLUtils.toIntBuffer(padValues, CLUtils.toIntBuffer(padValues));
outValues = CLUtils.toIntBuffer(padValues);
try (MemoryStack stack = stackPush()) {
int dir = 1;
PointerBuffer clGlobalWorkSize = stack.callocPointer(1);
PointerBuffer clLocalWorkSize = stack.callocPointer(1);
PointerBuffer clGlobalWorkSize = stack.mallocPointer(1);
PointerBuffer clLocalWorkSize = stack.mallocPointer(1);
IntBuffer errcode_ret = stack.callocInt(1);
// host memory to gpu memory
clInKeys = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, inKeys, errcode_ret);
CLInfo.checkCLError(errcode_ret);
clOutKeys = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * keys.length, errcode_ret);
clOutKeys = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * padKeys.length, errcode_ret);
CLInfo.checkCLError(errcode_ret);
clInValues = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, inValues, errcode_ret);
CLInfo.checkCLError(errcode_ret);
clOutValues = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * keys.length, errcode_ret);
clOutValues = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * padKeys.length, errcode_ret);
CLInfo.checkCLError(errcode_ret);
long ms = System.currentTimeMillis();
// small sorts
if(keys.length <= max_work_group_size)
if(padKeys.length <= max_work_group_size)
{
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 0, clOutKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 1, clOutValues));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 2, clInKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 3, clInValues));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicSortLocal, 4, keys.length));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicSortLocal, 4, padKeys.length));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicSortLocal, 5, 1));
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal, 6, keys.length * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal, 7, keys.length * 4)); // local memory
clGlobalWorkSize.put(0, keys.length / 2);
clLocalWorkSize.put(0, keys.length / 2);
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal, 6, padKeys.length * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal, 7, padKeys.length * 4)); // local memory
clGlobalWorkSize.put(0, padKeys.length / 2);
clLocalWorkSize.put(0, padKeys.length / 2);
// run the kernel and read the result
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
......@@ -181,15 +199,15 @@ public class CLBitonicSort {
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal1, 4, max_work_group_size * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal1, 5, max_work_group_size * 4)); // local memory
clGlobalWorkSize = stack.callocPointer(1);
clLocalWorkSize = stack.callocPointer(1);
clGlobalWorkSize.put(0, keys.length / 2);
clGlobalWorkSize = stack.mallocPointer(1);
clLocalWorkSize = stack.mallocPointer(1);
clGlobalWorkSize.put(0, padKeys.length / 2);
clLocalWorkSize.put(0, max_work_group_size / 2);
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal1, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
for(int size = 2 * max_work_group_size; size <= keys.length; size <<= 1)
for(int size = 2 * max_work_group_size; size <= padKeys.length; size <<= 1)
{
for(int stride = size / 2; stride > 0; stride >>= 1)
{
......@@ -201,18 +219,18 @@ public class CLBitonicSort {
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 2, clOutKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 3, clOutValues));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 4, keys.length));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 4, padKeys.length));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 5, size));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 6, stride));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 7, dir));
clGlobalWorkSize = stack.callocPointer(1);
clLocalWorkSize = stack.callocPointer(1);
clGlobalWorkSize.put(0, keys.length / 2);
clGlobalWorkSize = stack.mallocPointer(1);
clLocalWorkSize = stack.mallocPointer(1);
clGlobalWorkSize.put(0, padKeys.length / 2);
clLocalWorkSize.put(0, max_work_group_size / 4);
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeGlobal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
//CLInfo.checkCLError(clFinish(clQueue));
}
else
{
......@@ -222,47 +240,46 @@ public class CLBitonicSort {
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 2, clOutKeys));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 3, clOutValues));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 4, keys.length));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 4, padKeys.length));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 5, stride));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 6, size));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 7, dir));
CLInfo.checkCLError(clSetKernelArg(clBitonicMergeLocal, 8, max_work_group_size * 4 )); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicMergeLocal, 9, max_work_group_size * 4)); // local memory
clGlobalWorkSize = stack.callocPointer(1);
clLocalWorkSize = stack.callocPointer(1);
clGlobalWorkSize.put(0, keys.length / 2);
clGlobalWorkSize = stack.mallocPointer(1);
clLocalWorkSize = stack.mallocPointer(1);
clGlobalWorkSize.put(0, padKeys.length / 2);
clLocalWorkSize.put(0, max_work_group_size / 2);
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
//CLInfo.checkCLError(clFinish(clQueue));
break;
}
}
}
}
//System.out.println(System.currentTimeMillis() - ms);
clEnqueueReadBuffer(clQueue, clOutKeys, true, 0, outKeys, null, null);
clEnqueueReadBuffer(clQueue, clOutValues, true, 0, outValues, null, null);
resultKeys = CLUtils.toIntArray(outKeys, keys.length);
resultValues = CLUtils.toIntArray(outValues, values.length);
}
clearCL();
clearMemory();
}
static long factorRadix2(long L){
if(L==0){
return 0;
}else{
for(int log2L = 0; (L & 1) == 0; L >>= 1, log2L++);
return L;
}
}
public void clear() throws OpenCLException {
clearMemory();
clearCL();
//clearMemory();
}
/**
* Clears the device and host memory.
*
* @throws OpenCLException
*/
private void clearMemory() throws OpenCLException {
// release memory and devices
try {
......@@ -275,15 +292,20 @@ public class CLBitonicSort {
throw ex;
}
finally {
MemoryUtil.memFree(inKeys);
MemoryUtil.memFree(outKeys);
MemoryUtil.memFree(inValues);
MemoryUtil.memFree(inKeys);
MemoryUtil.memFree(source);
MemoryUtil.memFree(inValues);
MemoryUtil.memFree(inKeys);
MemoryUtil.memFree(outValues);
MemoryUtil.memFree(outKeys);
// release host memory.
}
}
private void clearCL() throws OpenCLException {
/**
* Clears the OpenCL resources i.e. kernels, queues and programs.
*
* @throws OpenCLException
*/
private void clearCL() throws OpenCLException {
CLInfo.checkCLError(clReleaseKernel(clBitonicSortLocal));
CLInfo.checkCLError(clReleaseKernel(clBitonicSortLocal1));
CLInfo.checkCLError(clReleaseKernel(clBitonicMergeGlobal));
......@@ -315,7 +337,7 @@ public class CLBitonicSort {
private void initCL() throws OpenCLException {
try (MemoryStack stack = stackPush()) {
IntBuffer errcode_ret = stack.callocInt(1);
IntBuffer errcode_ret = stack.mallocInt(1);
IntBuffer numberOfPlatforms = stack.mallocInt(1);
CLInfo.checkCLError(clGetPlatformIDs(null, numberOfPlatforms));
......@@ -349,10 +371,11 @@ public class CLBitonicSort {
private void buildProgram() throws OpenCLException {
try (MemoryStack stack = stackPush()) {
IntBuffer errcode_ret = stack.callocInt(1);
IntBuffer errcode_ret = stack.mallocInt(1);
PointerBuffer strings = stack.mallocPointer(1);
PointerBuffer lengths = stack.mallocPointer(1);
ByteBuffer source = null;
try {
source = CLUtils.ioResourceToByteBuffer("BitonicSort.cl", 4096);
......@@ -378,6 +401,8 @@ public class CLBitonicSort {
PointerBuffer pp = stack.mallocPointer(1);
clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, pp, null);
max_work_group_size = (int)pp.get(0);
//System.out.println(max_work_group_size);
MemoryUtil.memFree(source);
}
}
......
......@@ -201,11 +201,11 @@ public class CLConvolution extends CLOperation {
private void convolveSeparate() throws OpenCLException {
//init();
try (MemoryStack stack = stackPush()) {
PointerBuffer clGlobalWorkSizeEdges = stack.callocPointer(2);
PointerBuffer clGlobalWorkSizeEdges = stack.mallocPointer(2);
clGlobalWorkSizeEdges.put(0, matrixWidth);
clGlobalWorkSizeEdges.put(1, matrixHeight);
PointerBuffer ev = stack.callocPointer(1);
PointerBuffer ev = stack.mallocPointer(1);
// run the kernel and read the result
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clKernelConvolveCol, 2, null, clGlobalWorkSizeEdges, null, null, null));
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clKernelConvolveRow, 2, null, clGlobalWorkSizeEdges, null, null, null));
......@@ -215,7 +215,7 @@ public class CLConvolution extends CLOperation {
private void convolve(final long clKernel) throws OpenCLException {
try (MemoryStack stack = stackPush()) {
PointerBuffer clGlobalWorkSizeEdges = stack.callocPointer(2);
PointerBuffer clGlobalWorkSizeEdges = stack.mallocPointer(2);
clGlobalWorkSizeEdges.put(0, matrixWidth);
clGlobalWorkSizeEdges.put(1, matrixHeight);
......@@ -227,7 +227,7 @@ public class CLConvolution extends CLOperation {
private void setArguments(final long clKernel) throws OpenCLException {
try (MemoryStack stack = stackPush()) {
IntBuffer errcode_ret = stack.callocInt(1);
IntBuffer errcode_ret = stack.mallocInt(+1);
// host memory to gpu memory
clInput = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * matrixWidth * matrixHeight, errcode_ret);
......@@ -245,7 +245,7 @@ public class CLConvolution extends CLOperation {
private void setArguments(final long clKernelConvolveCol, final long clKernelConvolveRow) throws OpenCLException {
try (MemoryStack stack = stackPush()) {
IntBuffer errcode_ret = stack.callocInt(1);
IntBuffer errcode_ret = stack.mallocInt(1);
clTmp = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * matrixWidth * matrixHeight, errcode_ret);
clInput = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * matrixWidth * matrixHeight, errcode_ret);
......@@ -310,7 +310,7 @@ public class CLConvolution extends CLOperation {
private void buildProgram() throws OpenCLException {
try (MemoryStack stack = stackPush()) {
IntBuffer errcode_ret = stack.callocInt(1);
IntBuffer errcode_ret = stack.mallocInt(1);
PointerBuffer strings = stack.mallocPointer(1);
PointerBuffer lengths = stack.mallocPointer(1);
......
......@@ -223,4 +223,55 @@ public class CLUtils {
MemoryUtil.memFree(buffer);
return newBuffer;
}
/**
* Returns an integer n such that n = <tt>base</tt>^k, where k > 0 is the smallest integer such that
* n >= <tt>value</tt>.
*
* @param value the value
* @param base the base
*
* @return an integer n such that n = 2 * <tt>multiple</tt>^k
*/
public static long power(long value, long base) {
assert value > 0 && base > 0;
long result = base;
while (result < value) {
result *= base;
}
return result;
}
/**
* Returns an long n such that n = <tt>base</tt> * k, where k > 0 is the smallest long such that
* n >= <tt>value</tt>.
*
* @param value the value
* @param base the multiple
*
* @return an integer n such that n = 2 * <tt>multiple</tt>^k
*/
public static long multiple(long value, long base) {
long result = base;
while (result < value) {
result += base;
}
return result;
}
/**
* Computes the the factor radix which is 1 for all long of the form 2^k.
*
* @param L
* @return
*/
public static long factorRadix2(long L){
if(L==0){
return 0;
}else{
for(int log2L = 0; (L & 1) == 0; L >>= 1, log2L++);
return L;
}
}
}
......@@ -12,6 +12,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import static org.junit.Assert.assertTrue;
......@@ -21,52 +22,75 @@ import static org.junit.Assert.assertTrue;
public class TestBitonicSort {
private static Logger logger = Logger.getLogger(TestConvolution.class);
private static Random random = new Random();
@Before
public void setUp() throws Exception {}
public void setUp() throws Exception {
logger.setDebug();
}
@Test
public void testLocalSort() throws IOException, OpenCLException {
int[] keys = randomArray(32);
int[] values = randomArray(32);
int size = 123;
int[] keys = randomArray(size);
int[] values = randomArray(size);
CLBitonicSort clBitonicSort = new CLBitonicSort();
long ms = System.currentTimeMillis();
clBitonicSort.sort(keys, values);
long diff = System.currentTimeMillis() - ms;
logger.debug("Sort (GPU):" + diff + "[ms], size:" + size);
clBitonicSort.clear();
int[] resultKeys = clBitonicSort.getResultKeys();
int[] resultValues = clBitonicSort.getResultValues();
ms = System.currentTimeMillis();
Arrays.sort(keys);
diff = System.currentTimeMillis() - ms;
logger.debug("Sort (CPU):" + diff + "[ms], size:" + size);
assertTrue(Arrays.equals(keys, resultKeys));
}
@Test
public void testGlobalSort() throws IOException, OpenCLException {
int[] keys = randomArray(2048*2*2*2*2);
int[] values = randomArray(2048*2*2*2*2);
int size = Integer.MAX_VALUE / 1024;
int[] keys = randomArray(size);
int[] values = randomArray(size);
CLBitonicSort clBitonicSort = new CLBitonicSort();
long ms = System.currentTimeMillis();
clBitonicSort.sort(keys, values);
long diff = System.currentTimeMillis() - ms;
logger.debug("Sort (GPU):" + diff + "[ms], size:" + size);
clBitonicSort.clear();
int[] resultKeys = clBitonicSort.getResultKeys();
int[] resultValues = clBitonicSort.getResultValues();
ms = System.currentTimeMillis();
Arrays.sort(keys);
diff = System.currentTimeMillis() - ms;
logger.debug("Sort (CPU):" + diff + "[ms], size:" + size);
assertTrue(Arrays.equals(keys, resultKeys));
}
public static int[] randomArray(final int size) {
List<Integer> list = new ArrayList<>(size);
//List<Integer> list = new ArrayList<>(size);
int[] array = new int[size];
for(int i = 0; i < size; i++) {
list.add(i);
array[i] = i;
}
Collections.shuffle(list);
// shuffle
for(int i = 0; i < size; i++) {
array[i] = list.get(i);
int j = random.nextInt(size);
int tmp = array[j];
array[j] = array[i];
array[i] = tmp;
}
return array;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment