From a58b9135b6de9e796a74775f07839132a76c5844 Mon Sep 17 00:00:00 2001 From: Benedikt Zoennchen Date: Thu, 12 Jul 2018 15:08:48 +0200 Subject: [PATCH] implement the linked cell on the GPU. --- VadereUtils/resources/Particles.cl | 523 +++++++++++++++ .../util/opencl/CLUniformHashedGrid.java | 607 ++++++++++++++++++ .../org/vadere/util/math/TestBitonicSort.java | 2 - .../vadere/util/math/TestCellGridSort.java | 107 +++ 4 files changed, 1237 insertions(+), 2 deletions(-) create mode 100644 VadereUtils/resources/Particles.cl create mode 100644 VadereUtils/src/org/vadere/util/opencl/CLUniformHashedGrid.java create mode 100644 VadereUtils/tests/org/vadere/util/math/TestCellGridSort.java diff --git a/VadereUtils/resources/Particles.cl b/VadereUtils/resources/Particles.cl new file mode 100644 index 000000000..3a33ccccd --- /dev/null +++ b/VadereUtils/resources/Particles.cl @@ -0,0 +1,523 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + + + +//////////////////////////////////////////////////////////////////////////////// +// Common definitions +//////////////////////////////////////////////////////////////////////////////// +#define UMAD(a, b, c) ( (a) * (b) + (c) ) + +typedef struct{ + float x; + float y; + float z; +} Float3; + +typedef struct{ + uint x; + uint y; + uint z; +}Uint3; + +typedef struct{ + int x; + int y; + int z; +}Int3; + + +typedef struct{ + Float3 colliderPos; + float colliderRadius; + + Float3 gravity; + float globalDamping; + float particleRadius; + + Uint3 gridSize; + uint numCells; + Float3 worldOrigin; + Float3 cellSize; + + uint numBodies; + uint maxParticlesPerCell; + + float spring; + float damping; + float shear; + float attraction; + float boundaryDamping; +} simParams_t; + +typedef struct { + float2 position; + float stepLength; +} pedestrian; + +#define LOCAL_SIZE_LIMIT 16U + +inline void ComparatorPrivate( + uint *keyA, + uint *valA, + uint *keyB, + uint *valB, + uint dir +){ + if( (*keyA > *keyB) == dir ){ + uint t; + t = *keyA; *keyA = *keyB; *keyB = t; + t = *valA; *valA = *valB; *valB = t; + } +} + +inline void ComparatorLocal( + __local uint *keyA, + __local uint *valA, + __local uint *keyB, + __local uint *valB, + uint dir +){ + if( (*keyA > *keyB) == dir ){ + uint t; + t = *keyA; *keyA = *keyB; *keyB = t; + t = *valA; *valA = *valB; *valB = t; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Save particle grid cell hashes and indices +//////////////////////////////////////////////////////////////////////////////// +uint2 getGridPos(float2 p, __constant float* cellSize, __constant float2* worldOrigin){ + uint2 gridPos; + float2 wordOr = (*worldOrigin); + gridPos.x = (int)floor((p.x - wordOr.x) / (*cellSize)); + gridPos.y = (int)floor((p.y - wordOr.y) / (*cellSize)); + return gridPos; +} + +//Calculate address in grid from position (clamping to edges) +uint getGridHash(uint2 gridPos, __constant uint2* gridSize){ + //Wrap addressing, assume power-of-two grid dimensions + gridPos.x = gridPos.x & ((*gridSize).x - 1); + gridPos.y = gridPos.y & ((*gridSize).y - 1); + return UMAD( UMAD(1.0, (*gridSize).y, gridPos.y), (*gridSize).x, gridPos.x ); +} + + +//Calculate 
grid hash value for each particle +__kernel void calcHash( + __global uint *d_Hash, //output + __global uint *d_Index, //output + __global const float2 *d_Pos, //input: positions + __constant float* cellSize, + __constant float2* worldOrigin, + __constant uint2* gridSize, + uint numParticles +){ + const uint index = get_global_id(0); + if(index >= numParticles) + return; + + float2 p = d_Pos[index]; + + //Get address in grid + uint2 gridPos = getGridPos(p, cellSize, worldOrigin); + uint gridHash = getGridHash(gridPos, gridSize); + + //Store grid hash and particle index + d_Hash[index] = gridHash; + d_Index[index] = index; +} + + + +//////////////////////////////////////////////////////////////////////////////// +// Find cell bounds and reorder positions+velocities by sorted indices +//////////////////////////////////////////////////////////////////////////////// +__kernel void Memset( + __global uint *d_Data, + uint val, + uint N +){ + if(get_global_id(0) < N) + d_Data[get_global_id(0)] = val; +} + +__kernel void findCellBoundsAndReorder( + __global uint *d_CellStart, //output: cell start index + __global uint *d_CellEnd, //output: cell end index + __global float2 *d_ReorderedPos, //output: reordered by cell hash positions + __global float2 *d_ReorderedVel, //output: reordered by cell hash velocities + + __global const uint *d_Hash, //input: sorted grid hashes + __global const uint *d_Index, //input: particle indices sorted by hash + __global const float2 *d_Pos, //input: positions array sorted by hash + __local uint *localHash, //get_group_size(0) + 1 elements + uint numParticles +){ + uint hash; + const uint index = get_global_id(0); + + //Handle case when no. of particles not multiple of block size + if(index < numParticles){ + hash = d_Hash[index]; + + //Load hash data into local memory so that we can look + //at neighboring particle's hash value without loading + //two hash values per thread + localHash[get_local_id(0) + 1] = hash; + + //First thread in block must load neighbor particle hash + if(index > 0 && get_local_id(0) == 0) + localHash[0] = d_Hash[index - 1]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(index < numParticles){ + //Border case + if(index == 0) + d_CellStart[hash] = 0; + + //Main case + else{ + if(hash != localHash[get_local_id(0)]) + d_CellEnd[localHash[get_local_id(0)]] = d_CellStart[hash] = index; + }; + + //Another border case + if(index == numParticles - 1) + d_CellEnd[hash] = numParticles; + + + //Now use the sorted index to reorder the pos and vel arrays + uint sortedIndex = d_Index[index]; + float2 pos = d_Pos[sortedIndex]; + + d_ReorderedPos[index] = pos; + } +} + + + +//////////////////////////////////////////////////////////////////////////////// +// Process collisions (calculate accelerations) +//////////////////////////////////////////////////////////////////////////////// +float4 collideSpheres( + float4 posA, + float4 posB, + float4 velA, + float4 velB, + float radiusA, + float radiusB, + float spring, + float damping, + float shear, + float attraction +){ + //Calculate relative position + float4 relPos = (float4)(posB.x - posA.x, posB.y - posA.y, posB.z - posA.z, 0); + float dist = sqrt(relPos.x * relPos.x + relPos.y * relPos.y + relPos.z * relPos.z); + float collideDist = radiusA + radiusB; + + float4 force = (float4)(0, 0, 0, 0); + if(dist < collideDist){ + float4 norm = (float4)(relPos.x / dist, relPos.y / dist, relPos.z / dist, 0); + + //Relative velocity + float4 relVel = (float4)(velB.x - velA.x, velB.y - velA.y, velB.z - velA.z, 0); + + 
//Relative tangential velocity + float relVelDotNorm = relVel.x * norm.x + relVel.y * norm.y + relVel.z * norm.z; + float4 tanVel = (float4)(relVel.x - relVelDotNorm * norm.x, relVel.y - relVelDotNorm * norm.y, relVel.z - relVelDotNorm * norm.z, 0); + + //Spring force (potential) + float springFactor = -spring * (collideDist - dist); + force = (float4)( + springFactor * norm.x + damping * relVel.x + shear * tanVel.x + attraction * relPos.x, + springFactor * norm.y + damping * relVel.y + shear * tanVel.y + attraction * relPos.y, + springFactor * norm.z + damping * relVel.z + shear * tanVel.z + attraction * relPos.z, + 0 + ); + } + + return force; +} + + + +__kernel void collide( + __global float2 *d_Vel, //output: new velocity + __global const float2 *d_ReorderedPos, //input: reordered positions + __global const float2 *d_ReorderedVel, //input: reordered velocities + __global const uint *d_Index, //input: reordered particle indices + __global const uint *d_CellStart, //input: cell boundaries + __global const uint *d_CellEnd, + __constant float* cellSize, + __constant float2* worldOrigin, + __constant uint2* gridSize, + uint numParticles +){ + uint index = get_global_id(0); + if(index >= numParticles) + return; + + float2 pos = d_ReorderedPos[index]; + float2 vel = d_ReorderedVel[index]; + float2 force = (float2)(0, 0); + + //Get address in grid + uint2 gridPos = getGridPos(pos, cellSize, worldOrigin); + + //Accumulate surrounding cells + for(int z = -1; z <= 1; z++) + for(int y = -1; y <= 1; y++) + for(int x = -1; x <= 1; x++){ + //Get start particle index for this cell + uint hash = getGridHash(gridPos + (uint2)(x, y), gridSize); + uint startI = d_CellStart[hash]; + + //Skip empty cell + if(startI == 0xFFFFFFFFU) + continue; + + //Iterate over particles in this cell + uint endI = d_CellEnd[hash]; + for(uint j = startI; j < endI; j++){ + if(j == index) + continue; + + float2 pos2 = d_ReorderedPos[j]; + float2 vel2 = d_ReorderedVel[j]; + + //Collide two spheres + /*force += collideSpheres( + pos, pos2, + vel, vel2, + params->particleRadius, params->particleRadius, + params->spring, params->damping, params->shear, params->attraction + );*/ + } + } + + //Collide with cursor sphere + /*force += collideSpheres( + pos, (float4)(params->colliderPos.x, params->colliderPos.y, params->colliderPos.z, 0), + vel, (float4)(0, 0, 0, 0), + params->particleRadius, params->colliderRadius, + params->spring, params->damping, params->shear, params->attraction + );*/ + + //Write new velocity back to original unsorted location + d_Vel[d_Index[index]] = vel + force; +} + +//////////////////////////////////////////////////////////////////////////////// +// Monolithic bitonic sort kernel for short arrays fitting into local memory +//////////////////////////////////////////////////////////////////////////////// +__kernel void bitonicSortLocal( + __global uint *d_DstKey, + __global uint *d_DstVal, + __global uint *d_SrcKey, + __global uint *d_SrcVal, + uint arrayLength, + uint dir +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + __local uint l_val[LOCAL_SIZE_LIMIT]; + + //Offset to the beginning of subbatch and load data + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 
2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; + + for(uint size = 2; size < arrayLength; size <<= 1){ + //Bitonic merge + uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 ); + for(uint stride = size / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + ddd + ); + } + } + + //ddd == dir for the last bitonic merge step + { + for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + dir + ); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstVal[ 0] = l_val[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; + d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Bitonic sort kernel for large arrays (not fitting into local memory) +//////////////////////////////////////////////////////////////////////////////// +//Bottom-level bitonic sort +//Almost the same as bitonicSortLocal with the only exception +//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being +//sorted in opposite directions +__kernel void bitonicSortLocal1( + __global uint *d_DstKey, + __global uint *d_DstVal, + __global uint *d_SrcKey, + __global uint *d_SrcVal +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + __local uint l_val[LOCAL_SIZE_LIMIT]; + + //Offset to the beginning of subarray and load data + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; + + uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); + + for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ + //Bitonic merge + uint ddd = (comparatorI & (size / 2)) != 0; + for(uint stride = size / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + ddd + ); + } + } + + //Odd / even arrays of LOCAL_SIZE_LIMIT elements + //sorted in opposite directions + { + uint ddd = (get_group_id(0) & 1); + for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + ddd + ); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstVal[ 0] = l_val[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; + d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = 
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} + +//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT +__kernel void bitonicMergeGlobal( + __global uint *d_DstKey, + __global uint *d_DstVal, + __global uint *d_SrcKey, + __global uint *d_SrcVal, + uint arrayLength, + uint size, + uint stride, + uint dir +){ + uint global_comparatorI = get_global_id(0); + uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); + + //Bitonic merge + uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); + + uint keyA = d_SrcKey[pos + 0]; + uint valA = d_SrcVal[pos + 0]; + uint keyB = d_SrcKey[pos + stride]; + uint valB = d_SrcVal[pos + stride]; + + ComparatorPrivate( + &keyA, &valA, + &keyB, &valB, + ddd + ); + + d_DstKey[pos + 0] = keyA; + d_DstVal[pos + 0] = valA; + d_DstKey[pos + stride] = keyB; + d_DstVal[pos + stride] = valB; +} + +//Combined bitonic merge steps for +//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] +__kernel void bitonicMergeLocal( + __global uint *d_DstKey, + __global uint *d_DstVal, + __global uint *d_SrcKey, + __global uint *d_SrcVal, + uint arrayLength, + uint stride, + uint size, + uint dir +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + __local uint l_val[LOCAL_SIZE_LIMIT]; + + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; + + //Bitonic merge + uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); + uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + for(; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], &l_val[pos + 0], + &l_key[pos + stride], &l_val[pos + stride], + ddd + ); + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstVal[ 0] = l_val[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; + d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} \ No newline at end of file diff --git a/VadereUtils/src/org/vadere/util/opencl/CLUniformHashedGrid.java b/VadereUtils/src/org/vadere/util/opencl/CLUniformHashedGrid.java new file mode 100644 index 000000000..6bc0d78d4 --- /dev/null +++ b/VadereUtils/src/org/vadere/util/opencl/CLUniformHashedGrid.java @@ -0,0 +1,607 @@ +package org.vadere.util.opencl; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.NotNull; +import org.lwjgl.PointerBuffer; +import org.lwjgl.opencl.CLContextCallback; +import org.lwjgl.opencl.CLProgramCallback; +import org.lwjgl.system.Configuration; +import org.lwjgl.system.MemoryStack; +import org.lwjgl.system.MemoryUtil; +import org.vadere.util.geometry.shapes.VPoint; +import org.vadere.util.geometry.shapes.VRectangle; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.FloatBuffer; +import java.nio.IntBuffer; +import java.util.List; + +import static org.lwjgl.opencl.CL10.CL_CONTEXT_PLATFORM; 
+import static org.lwjgl.opencl.CL10.CL_DEVICE_ADDRESS_BITS;
+import static org.lwjgl.opencl.CL10.CL_DEVICE_MAX_WORK_GROUP_SIZE;
+import static org.lwjgl.opencl.CL10.CL_DEVICE_NAME;
+import static org.lwjgl.opencl.CL10.CL_DEVICE_TYPE_GPU;
+import static org.lwjgl.opencl.CL10.CL_MEM_ALLOC_HOST_PTR;
+import static org.lwjgl.opencl.CL10.CL_MEM_COPY_HOST_PTR;
+import static org.lwjgl.opencl.CL10.CL_MEM_READ_ONLY;
+import static org.lwjgl.opencl.CL10.CL_MEM_READ_WRITE;
+import static org.lwjgl.opencl.CL10.CL_PROGRAM_BUILD_STATUS;
+import static org.lwjgl.opencl.CL10.CL_SUCCESS;
+import static org.lwjgl.opencl.CL10.clBuildProgram;
+import static org.lwjgl.opencl.CL10.clCreateBuffer;
+import static org.lwjgl.opencl.CL10.clCreateCommandQueue;
+import static org.lwjgl.opencl.CL10.clCreateContext;
+import static org.lwjgl.opencl.CL10.clCreateKernel;
+import static org.lwjgl.opencl.CL10.clCreateProgramWithSource;
+import static org.lwjgl.opencl.CL10.clEnqueueNDRangeKernel;
+import static org.lwjgl.opencl.CL10.clEnqueueReadBuffer;
+import static org.lwjgl.opencl.CL10.clEnqueueWaitForEvents;
+import static org.lwjgl.opencl.CL10.clEnqueueWriteBuffer;
+import static org.lwjgl.opencl.CL10.clFinish;
+import static org.lwjgl.opencl.CL10.clGetDeviceIDs;
+import static org.lwjgl.opencl.CL10.clGetPlatformIDs;
+import static org.lwjgl.opencl.CL10.clReleaseCommandQueue;
+import static org.lwjgl.opencl.CL10.clReleaseContext;
+import static org.lwjgl.opencl.CL10.clReleaseKernel;
+import static org.lwjgl.opencl.CL10.clReleaseMemObject;
+import static org.lwjgl.opencl.CL10.clReleaseProgram;
+import static org.lwjgl.opencl.CL10.clSetKernelArg;
+import static org.lwjgl.opencl.CL10.clSetKernelArg1i;
+import static org.lwjgl.opencl.CL10.clSetKernelArg1p;
+import static org.lwjgl.system.MemoryStack.stackPush;
+import static org.lwjgl.system.MemoryUtil.NULL;
+import static org.lwjgl.system.MemoryUtil.memUTF8;
+
+/**
+ * @author Benedikt Zoennchen
+ */
+public class CLUniformHashedGrid {
+	private static Logger log = LogManager.getLogger(CLUniformHashedGrid.class);
+
+	// CL ids
+	private long clPlatform;
+	private long clDevice;
+	private long clContext;
+	private long clQueue;
+	private long clProgram;
+
+	// CL Memory
+	private long clHashes;
+	private long clIndices;
+	private long clCellStarts;
+	private long clCellEnds;
+	private long clReorderedPositions;
+	private long clPositions;
+	private long clCellSize;
+	private long clWorldOrigin;
+	private long clGridSize;
+
+	// Host Memory
+	private IntBuffer hashes;
+	private IntBuffer indices;
+	private IntBuffer cellStarts;
+	private IntBuffer cellEnds;
+	private FloatBuffer reorderedPositions;
+	private FloatBuffer positions;
+	private FloatBuffer worldOrigin;
+	private FloatBuffer cellSize;
+	private IntBuffer gridSize;
+
+	private IntBuffer inValues;
+	private IntBuffer outValues;
+
+	private ByteBuffer source;
+	private ByteBuffer particleSource;
+
+	// CL callbacks
+	private CLContextCallback contextCB;
+	private CLProgramCallback programCB;
+
+	// CL kernel
+	private long clBitonicSortLocal;
+	private long clBitonicSortLocal1;
+	private long clBitonicMergeGlobal;
+	private long clBitonicMergeLocal;
+	private long clCalcHash;
+	private long clFindCellBoundsAndReorder;
+
+	private int numberOfElements;
+	private int numberOfGridCells;
+	private VRectangle bound;
+	private float iCellSize;
+	private int[] iGridSize;
+	private List<VPoint> positionList;
+
+	private int[] keys;
+	private int[] values;
+
+	private int[] resultValues;
+	private int[] resultKeys;
+
+	//Note: logically shared with BitonicSort.cl!
+ private static final int LOCAL_SIZE_LIMIT = 16; + + private static final Logger logger = LogManager.getLogger(CLUniformHashedGrid.class); + + private int max_work_group_size; + + private boolean debug = false; + + public enum KernelType { + Separate, + Col, + Row, + NonSeparate + } + + public CLUniformHashedGrid(final int numberOfElements, final VRectangle bound, final double cellSize) throws OpenCLException { + this.numberOfElements = numberOfElements; + this.iGridSize = new int[]{ (int)Math.ceil(bound.getWidth() / cellSize), (int)Math.ceil(bound.getHeight() / cellSize)}; + this.numberOfGridCells = this.iGridSize[0] * this.iGridSize[1]; + this.bound = bound; + this.iCellSize = (float)cellSize; + + if(debug) { + Configuration.DEBUG.set(true); + Configuration.DEBUG_MEMORY_ALLOCATOR.set(true); + Configuration.DEBUG_STACK.set(true); + } + init(); + } + + public class GridCells { + public int[] cellStarts; + public int[] cellEnds; + public float[] reorderedPositions; + public int[] indices; + public int[] hashes; + public float[] positions; + } + + public GridCells calcPositionsInCell(@NotNull final List positions) throws OpenCLException { + assert positions.size() == numberOfElements; + this.positionList = positions; + allocHostMemory(); + allocDeviceMemory(); + + clCalcHash(clHashes, clIndices, clPositions, clCellSize, clWorldOrigin, clGridSize, numberOfElements); + clBitonicSort(clHashes, clIndices, clHashes, clIndices, numberOfElements, 1); + clFindCellBoundsAndReorder(clCellStarts, clCellEnds, clReorderedPositions, clHashes, clIndices, clPositions, numberOfElements); + + /*clEnqueueReadBuffer(clQueue, clCellStarts, true, 0, cellStarts, null, null); + clEnqueueReadBuffer(clQueue, clCellEnds, true, 0, cellEnds, null, null); + clEnqueueReadBuffer(clQueue, clReorderedPositions, true, 0, reorderedPositions, null, null); + clEnqueueReadBuffer(clQueue, clIndices, true, 0, indices, null, null); + clEnqueueReadBuffer(clQueue, clHashes, true, 0, hashes, null, null); + clEnqueueReadBuffer(clQueue, clPositions, true, 0, this.positions, null, null);*/ + + /*int[] aCellStarts = CLUtils.toIntArray(cellStarts, numberOfElements); + int[] aCellEnds = CLUtils.toIntArray(cellEnds, numberOfElements); + float[] aReorderedPositions = CLUtils.toFloatArray(reorderedPositions, numberOfElements * 2); + int[] aIndices = CLUtils.toIntArray(indices, numberOfElements); + int[] aHashes = CLUtils.toIntArray(hashes, numberOfElements); + float[] aPositions = CLUtils.toFloatArray(this.positions, numberOfElements * 2);*/ + + GridCells gridCells = new GridCells(); + /*gridCells.cellEnds = aCellEnds; + gridCells.cellStarts = aCellStarts; + gridCells.reorderedPositions = aReorderedPositions; + gridCells.indices = aIndices; + gridCells.hashes = aHashes; + gridCells.positions = aPositions;*/ + + //clearMemory(); + //clearCL(); + + return gridCells; + //clBitonicSort(clHashes, clIndices, clHashes, clIndices, numberOfElements, 1); + //clFindCellBoundsAndReorder(clCellStarts, clCellEnds, clReorderedPositions, clHashes, clIndices, clPositions, numberOfElements, numberOfGridCells); + } + + public int[] calcSortedHashes(@NotNull final List positions) throws OpenCLException { + assert positions.size() == numberOfElements; + this.positionList = positions; + allocHostMemory(); + allocDeviceMemory(); + + clCalcHash(clHashes, clIndices, clPositions, clCellSize, clWorldOrigin, clGridSize, numberOfElements); + clBitonicSort(clHashes, clIndices, clHashes, clIndices, numberOfElements, 1); + clEnqueueReadBuffer(clQueue, clHashes, true, 0, 
hashes, null, null);
+		int[] result = CLUtils.toIntArray(hashes, numberOfElements);
+
+		clearMemory();
+		clearCL();
+		return result;
+
+		//clBitonicSort(clHashes, clIndices, clHashes, clIndices, numberOfElements, 1);
+		//clFindCellBoundsAndReorder(clCellStarts, clCellEnds, clReorderedPositions, clHashes, clIndices, clPositions, numberOfElements, numberOfGridCells);
+	}
+
+	public int[] calcHashes(@NotNull final List<VPoint> positions) throws OpenCLException {
+		assert positions.size() == numberOfElements;
+		this.positionList = positions;
+		allocHostMemory();
+		allocDeviceMemory();
+
+		clCalcHash(clHashes, clIndices, clPositions, clCellSize, clWorldOrigin, clGridSize, numberOfElements);
+		clEnqueueReadBuffer(clQueue, clHashes, true, 0, hashes, null, null);
+		int[] result = CLUtils.toIntArray(hashes, numberOfElements);
+
+		clearMemory();
+		clearCL();
+		return result;
+
+		//clBitonicSort(clHashes, clIndices, clHashes, clIndices, numberOfElements, 1);
+		//clFindCellBoundsAndReorder(clCellStarts, clCellEnds, clReorderedPositions, clHashes, clIndices, clPositions, numberOfElements, numberOfGridCells);
+	}
+
+	public int[] getGridSize() {
+		return new int[]{iGridSize[0], iGridSize[1]};
+	}
+
+	public float getCellSize() {
+		return iCellSize;
+	}
+
+	public VPoint getWorldOrign() {
+		return new VPoint(bound.getMinX(), bound.getMinY());
+	}
+
+	public void allocHostMemory() {
+		assert positionList.size() == numberOfElements;
+		float[] pos = new float[numberOfElements*2];
+		for(int i = 0; i < numberOfElements; i++) {
+			pos[i*2] = (float)positionList.get(i).getX();
+			pos[i*2+1] = (float)positionList.get(i).getY();
+		}
+		this.positions = CLUtils.toFloatBuffer(pos, CLUtils.toFloatBuffer(pos));
+		this.hashes = MemoryUtil.memAllocInt(numberOfElements);
+
+		float[] originArray = new float[]{(float)bound.getMinX(), (float)bound.getMinY()};
+		this.worldOrigin = CLUtils.toFloatBuffer(originArray, CLUtils.toFloatBuffer(originArray));
+
+		this.cellSize = MemoryUtil.memAllocFloat(1);
+		this.cellSize.put(0, iCellSize);
+
+		this.gridSize = CLUtils.toIntBuffer(iGridSize, CLUtils.toIntBuffer(iGridSize));
+
+		// cellStarts / cellEnds hold one entry per grid cell (they mirror clCellStarts / clCellEnds)
+		this.cellStarts = MemoryUtil.memAllocInt(numberOfGridCells);
+		this.cellEnds = MemoryUtil.memAllocInt(numberOfGridCells);
+		this.indices = MemoryUtil.memAllocInt(numberOfElements);
+		this.reorderedPositions = MemoryUtil.memAllocFloat(numberOfElements * 2);
+	}
+
+	private void allocDeviceMemory() {
+		try (MemoryStack stack = stackPush()) {
+			IntBuffer errcode_ret = stack.callocInt(1);
+			clCellSize = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, cellSize, errcode_ret);
+			clWorldOrigin = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, worldOrigin, errcode_ret);
+			clGridSize = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, gridSize, errcode_ret);
+			clHashes = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * numberOfElements, errcode_ret);
+			clIndices = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * numberOfElements, errcode_ret);
+			clCellStarts = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * numberOfGridCells, errcode_ret);
+			clCellEnds = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 4 * numberOfGridCells, errcode_ret);
+			clReorderedPositions = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2 * 4 * numberOfElements, errcode_ret);
+			clPositions = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2 * 4 * numberOfElements, errcode_ret);
+			clEnqueueWriteBuffer(clQueue, clPositions, true, 0, positions, 
null, null);
+		}
+	}
+
+	public int[] getResultKeys() {
+		return resultKeys;
+	}
+
+	public int[] getResultValues() {
+		return resultValues;
+	}
+
+	public void init() throws OpenCLException {
+		initCallbacks();
+		initCL();
+		buildProgram();
+	}
+
+	/*__kernel void calcHash(
+		__global uint *d_Hash,          //output
+		__global uint *d_Index,         //output
+		__global const float2 *d_Pos,   //input: positions
+		__constant float* cellSize,
+		__constant float2* worldOrigin,
+		__constant uint2* gridSize,
+		uint numParticles)*/
+
+	private void clCalcHash(
+			final long clHashes,
+			final long clIndices,
+			final long clPositions,
+			final long clCellSize,
+			final long clWorldOrign,
+			final long clGridSize,
+			final int numberOfElements) throws OpenCLException {
+		try (MemoryStack stack = stackPush()) {
+			PointerBuffer clGlobalWorkSize = stack.callocPointer(1);
+			CLInfo.checkCLError(clSetKernelArg1p(clCalcHash, 0, clHashes));
+			CLInfo.checkCLError(clSetKernelArg1p(clCalcHash, 1, clIndices));
+			CLInfo.checkCLError(clSetKernelArg1p(clCalcHash, 2, clPositions));
+			CLInfo.checkCLError(clSetKernelArg1p(clCalcHash, 3, clCellSize));
+			CLInfo.checkCLError(clSetKernelArg1p(clCalcHash, 4, clWorldOrign));
+			CLInfo.checkCLError(clSetKernelArg1p(clCalcHash, 5, clGridSize));
+			CLInfo.checkCLError(clSetKernelArg1i(clCalcHash, 6, numberOfElements));
+			clGlobalWorkSize.put(0, numberOfElements);
+			//TODO: local work size?
+			CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clCalcHash, 1, null, clGlobalWorkSize, null, null, null));
+		}
+	}
+
+	private void clFindCellBoundsAndReorder(
+			final long clCellStarts,
+			final long clCellEnds,
+			final long clReorderedPositions,
+			final long clHashes,
+			final long clIndices,
+			final long clPositions,
+			final int numberOfElements) throws OpenCLException {
+
+		try (MemoryStack stack = stackPush()) {
+
+			PointerBuffer clGlobalWorkSize = stack.callocPointer(1);
+			PointerBuffer clLocalWorkSize = stack.callocPointer(1);
+			IntBuffer errcode_ret = stack.callocInt(1);
+
+			CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 0, clCellStarts));
+			CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 1, clCellEnds));
+			CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 2, clReorderedPositions));
+			// the kernel declares d_ReorderedVel but does not use it yet; pass the positions buffer so that every argument is set
+			CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 3, clReorderedPositions));
+			CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 4, clHashes));
+			CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 5, clIndices));
+			CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 6, clPositions));
+			CLInfo.checkCLError(clSetKernelArg(clFindCellBoundsAndReorder, 7, (max_work_group_size+1) * 4)); // __local localHash needs (local size + 1) uints; only the size is passed
+			CLInfo.checkCLError(clSetKernelArg1i(clFindCellBoundsAndReorder, 8, numberOfElements));
+
+			clGlobalWorkSize.put(0, numberOfElements);
+			clLocalWorkSize.put(0, max_work_group_size);
+			//TODO: local work size? check the power-of-two constraint!
+ CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clFindCellBoundsAndReorder, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null)); + } + } + + private void clBitonicSort( + final long clKeysIn, + final long clValuesIn, + final long clKeysOut, + final long clValuesOut, + final int numberOfElements, + final int dir) throws OpenCLException { + try (MemoryStack stack = stackPush()) { + + PointerBuffer clGlobalWorkSize = stack.callocPointer(1); + PointerBuffer clLocalWorkSize = stack.callocPointer(1); + IntBuffer errcode_ret = stack.callocInt(1); + + // small sorts + if (numberOfElements <= LOCAL_SIZE_LIMIT) { + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 0, clKeysOut)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 1, clValuesOut)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 2, clKeysIn)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 3, clValuesIn)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicSortLocal, 4, numberOfElements)); + //TODO: check the hard coded 1, and the waiting of the queue + CLInfo.checkCLError(clSetKernelArg1i(clBitonicSortLocal, 5, 1)); + clGlobalWorkSize.put(0, numberOfElements / 2); + clLocalWorkSize.put(0, numberOfElements / 2); + + // run the kernel and read the result + CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null)); + CLInfo.checkCLError(clFinish(clQueue)); + } else { + //Launch bitonicSortLocal1 + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 0, clKeysOut)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 1, clValuesOut)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 2, clKeysIn)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 3, clValuesIn)); + + clGlobalWorkSize = stack.callocPointer(1); + clLocalWorkSize = stack.callocPointer(1); + clGlobalWorkSize.put(0, numberOfElements / 2); + clLocalWorkSize.put(0, LOCAL_SIZE_LIMIT / 2); + + CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal1, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null)); + CLInfo.checkCLError(clFinish(clQueue)); + + for (int size = 2 * LOCAL_SIZE_LIMIT; size <= numberOfElements; size <<= 1) { + for (int stride = size / 2; stride > 0; stride >>= 1) { + if (stride >= LOCAL_SIZE_LIMIT) { + //Launch bitonicMergeGlobal + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 0, clKeysOut)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 1, clValuesOut)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 2, clKeysOut)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 3, clValuesOut)); + + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 4, numberOfElements)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 5, size)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 6, stride)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeGlobal, 7, dir)); + + clGlobalWorkSize = stack.callocPointer(1); + clLocalWorkSize = stack.callocPointer(1); + clGlobalWorkSize.put(0, numberOfElements / 2); + clLocalWorkSize.put(0, LOCAL_SIZE_LIMIT / 4); + + CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeGlobal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null)); + CLInfo.checkCLError(clFinish(clQueue)); + } else { + //Launch bitonicMergeLocal + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 0, clKeysOut)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 1, clValuesOut)); + 
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 2, clKeysOut)); + CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeLocal, 3, clValuesOut)); + + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 4, numberOfElements)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 5, stride)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 6, size)); + CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 7, dir)); + + clGlobalWorkSize = stack.callocPointer(1); + clLocalWorkSize = stack.callocPointer(1); + clGlobalWorkSize.put(0, numberOfElements / 2); + clLocalWorkSize.put(0, LOCAL_SIZE_LIMIT / 2); + + CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null)); + CLInfo.checkCLError(clFinish(clQueue)); + break; + } + } + } + } + } + } + + static long factorRadix2(long L){ + if(L==0){ + return 0; + }else{ + for(int log2L = 0; (L & 1) == 0; L >>= 1, log2L++); + return L; + } + } + + public void clear() throws OpenCLException { + clearMemory(); + } + + private void clearMemory() throws OpenCLException { + // release memory and devices + try { + CLInfo.checkCLError(clReleaseMemObject(clHashes)); + CLInfo.checkCLError(clReleaseMemObject(clIndices)); + CLInfo.checkCLError(clReleaseMemObject(clCellStarts)); + CLInfo.checkCLError(clReleaseMemObject(clCellEnds)); + CLInfo.checkCLError(clReleaseMemObject(clReorderedPositions)); + CLInfo.checkCLError(clReleaseMemObject(clPositions)); + CLInfo.checkCLError(clReleaseMemObject(clCellSize)); + CLInfo.checkCLError(clReleaseMemObject(clWorldOrigin)); + CLInfo.checkCLError(clReleaseMemObject(clGridSize)); + } + catch (OpenCLException ex) { + throw ex; + } + finally { + MemoryUtil.memFree(hashes); + MemoryUtil.memFree(indices); + MemoryUtil.memFree(cellStarts); + MemoryUtil.memFree(cellEnds); + MemoryUtil.memFree(reorderedPositions); + MemoryUtil.memFree(positions); + MemoryUtil.memFree(worldOrigin); + MemoryUtil.memFree(cellSize); + MemoryUtil.memFree(gridSize); + } + } + + private void clearCL() throws OpenCLException { + CLInfo.checkCLError(clReleaseKernel(clBitonicSortLocal)); + CLInfo.checkCLError(clReleaseKernel(clBitonicSortLocal1)); + CLInfo.checkCLError(clReleaseKernel(clBitonicMergeGlobal)); + CLInfo.checkCLError(clReleaseKernel(clBitonicMergeLocal)); + CLInfo.checkCLError(clReleaseKernel(clCalcHash)); + CLInfo.checkCLError(clReleaseKernel(clFindCellBoundsAndReorder)); + + CLInfo.checkCLError(clReleaseCommandQueue(clQueue)); + CLInfo.checkCLError(clReleaseProgram(clProgram)); + CLInfo.checkCLError(clReleaseContext(clContext)); + contextCB.free(); + programCB.free(); + } + + // private helpers + private void initCallbacks() { + contextCB = CLContextCallback.create((errinfo, private_info, cb, user_data) -> + { + log.debug("[LWJGL] cl_context_callback" + "\tInfo: " + memUTF8(errinfo)); + }); + + programCB = CLProgramCallback.create((program, user_data) -> + { + try { + log.debug("The cl_program [0x"+program+"] was built " + (CLInfo.getProgramBuildInfoInt(program, clDevice, CL_PROGRAM_BUILD_STATUS) == CL_SUCCESS ? 
"successfully" : "unsuccessfully")); + } catch (OpenCLException e) { + e.printStackTrace(); + } + }); + } + + private void initCL() throws OpenCLException { + try (MemoryStack stack = stackPush()) { + IntBuffer errcode_ret = stack.callocInt(1); + IntBuffer numberOfPlatforms = stack.mallocInt(1); + + CLInfo.checkCLError(clGetPlatformIDs(null, numberOfPlatforms)); + PointerBuffer platformIDs = stack.mallocPointer(numberOfPlatforms.get(0)); + CLInfo.checkCLError(clGetPlatformIDs(platformIDs, numberOfPlatforms)); + + clPlatform = platformIDs.get(0); + + IntBuffer numberOfDevices = stack.mallocInt(1); + CLInfo.checkCLError(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, null, numberOfDevices)); + PointerBuffer deviceIDs = stack.mallocPointer(numberOfDevices.get(0)); + CLInfo.checkCLError(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, deviceIDs, numberOfDevices)); + + clDevice = deviceIDs.get(0); + + log.debug("CL_DEVICE_NAME = " + CLInfo.getDeviceInfoStringUTF8(clDevice, CL_DEVICE_NAME)); + + PointerBuffer ctxProps = stack.mallocPointer(3); + ctxProps.put(CL_CONTEXT_PLATFORM) + .put(clPlatform) + .put(NULL) + .flip(); + + clContext = clCreateContext(ctxProps, clDevice, contextCB, NULL, errcode_ret); + CLInfo.checkCLError(errcode_ret); + + clQueue = clCreateCommandQueue(clContext, clDevice, 0, errcode_ret); + CLInfo.checkCLError(errcode_ret); + } + } + + private void buildProgram() throws OpenCLException { + try (MemoryStack stack = stackPush()) { + IntBuffer errcode_ret = stack.callocInt(1); + + PointerBuffer strings = stack.mallocPointer(1); + PointerBuffer lengths = stack.mallocPointer(1); + + // TODO delete memory? + + try { + source = CLUtils.ioResourceToByteBuffer("Particles.cl", 4096); + } catch (IOException e) { + throw new OpenCLException(e.getMessage()); + } + + strings.put(0, source); + lengths.put(0, source.remaining()); + + clProgram = clCreateProgramWithSource(clContext, strings, lengths, errcode_ret); + CLInfo.checkCLError(clBuildProgram(clProgram, clDevice, "", programCB, NULL)); + clBitonicSortLocal = clCreateKernel(clProgram, "bitonicSortLocal", errcode_ret); + CLInfo.checkCLError(errcode_ret); + clBitonicSortLocal1 = clCreateKernel(clProgram, "bitonicSortLocal1", errcode_ret); + CLInfo.checkCLError(errcode_ret); + clBitonicMergeGlobal = clCreateKernel(clProgram, "bitonicMergeGlobal", errcode_ret); + CLInfo.checkCLError(errcode_ret); + clBitonicMergeLocal = clCreateKernel(clProgram, "bitonicMergeLocal", errcode_ret); + CLInfo.checkCLError(errcode_ret); + + clCalcHash = clCreateKernel(clProgram, "calcHash", errcode_ret); + CLInfo.checkCLError(errcode_ret); + clFindCellBoundsAndReorder = clCreateKernel(clProgram, "findCellBoundsAndReorder", errcode_ret); + CLInfo.checkCLError(errcode_ret); + + max_work_group_size = CLInfo.getDeviceInfoInt(clDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE); + logger.info("CL_DEVICE_MAX_WORK_GROUP_SIZE = " + max_work_group_size); + } + + } +} diff --git a/VadereUtils/tests/org/vadere/util/math/TestBitonicSort.java b/VadereUtils/tests/org/vadere/util/math/TestBitonicSort.java index 0b20ccefe..cd18ce9c0 100644 --- a/VadereUtils/tests/org/vadere/util/math/TestBitonicSort.java +++ b/VadereUtils/tests/org/vadere/util/math/TestBitonicSort.java @@ -23,8 +23,6 @@ public class TestBitonicSort { private static Logger logger = LogManager.getLogger(TestConvolution.class); - private static Random random = new Random(); - @Before public void setUp() throws Exception {} diff --git a/VadereUtils/tests/org/vadere/util/math/TestCellGridSort.java 
b/VadereUtils/tests/org/vadere/util/math/TestCellGridSort.java
new file mode 100644
index 000000000..8d9445ab5
--- /dev/null
+++ b/VadereUtils/tests/org/vadere/util/math/TestCellGridSort.java
@@ -0,0 +1,107 @@
+package org.vadere.util.math;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.junit.Before;
+import org.junit.Test;
+import org.vadere.util.geometry.shapes.VPoint;
+import org.vadere.util.geometry.shapes.VRectangle;
+import org.vadere.util.opencl.CLUniformHashedGrid;
+import org.vadere.util.opencl.OpenCLException;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Random;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author Benedikt Zoennchen
+ */
+public class TestCellGridSort {
+
+	private static Logger logger = LogManager.getLogger(TestCellGridSort.class);
+
+	private static Random random = new Random();
+
+	@Before
+	public void setUp() throws Exception {}
+
+	@Test
+	public void testCalcHash() throws IOException, OpenCLException {
+		CLUniformHashedGrid clUniformHashedGrid = new CLUniformHashedGrid(1024, new VRectangle(0, 0, 10, 10), 1);
+		ArrayList<VPoint> positions = new ArrayList<>();
+		for(int i = 0; i < 1024; i++) {
+			positions.add(new VPoint(random.nextDouble() * 10, random.nextDouble() * 10));
+		}
+		int[] hashes = clUniformHashedGrid.calcHashes(positions);
+
+		assertEquals(hashes.length, positions.size());
+
+		for(int i = 0; i < hashes.length; i++) {
+			int hash = getGridHash(getGridPosition(positions.get(i), clUniformHashedGrid.getCellSize(), clUniformHashedGrid.getWorldOrign()), clUniformHashedGrid.getGridSize());
+			assertEquals(hashes[i], hash);
+		}
+	}
+
+	@Test
+	public void testCalcAndSortHash() throws IOException, OpenCLException {
+		CLUniformHashedGrid clUniformHashedGrid = new CLUniformHashedGrid(1024, new VRectangle(0, 0, 10, 10), 1);
+		ArrayList<VPoint> positions = new ArrayList<>();
+		for(int i = 0; i < 1024; i++) {
+			positions.add(new VPoint(random.nextDouble() * 10, random.nextDouble() * 10));
+		}
+		int[] hashes = clUniformHashedGrid.calcSortedHashes(positions);
+
+		assertEquals(hashes.length, positions.size());
+
+		int[] expectedHashes = new int[positions.size()];
+		for(int i = 0; i < hashes.length; i++) {
+			int hash = getGridHash(getGridPosition(positions.get(i), clUniformHashedGrid.getCellSize(), clUniformHashedGrid.getWorldOrign()), clUniformHashedGrid.getGridSize());
+			expectedHashes[i] = hash;
+		}
+		Arrays.sort(expectedHashes);
+
+		for(int i = 0; i < hashes.length; i++) {
+			assertEquals(hashes[i], expectedHashes[i]);
+		}
+	}
+
+	@Test
+	public void testGridCell() throws IOException, OpenCLException {
+		CLUniformHashedGrid clUniformHashedGrid = new CLUniformHashedGrid(1024, new VRectangle(0, 0, 10, 10), 1);
+		ArrayList<VPoint> positions = new ArrayList<>();
+		for(int i = 0; i < 1024; i++) {
+			positions.add(new VPoint(random.nextDouble() * 10, random.nextDouble() * 10));
+		}
+		CLUniformHashedGrid.GridCells gridCells = clUniformHashedGrid.calcPositionsInCell(positions);
+
+	}
+
+	/**
+	 * Helper to compute the hash values, see OpenCL code in Particles.cl
+	 */
+	private static int umad(int a, int b, int c) {
+		return (a * b) + c;
+	}
+
+	/**
+	 * Computes the grid position of a real world position, see OpenCL code in Particles.cl
+	 */
+	private static int[] getGridPosition(final VPoint p, float cellSize, final VPoint worldOrign) {
+		int[] gridPos = new int[2];
+		gridPos[0] = (int)Math.floor( (p.getX()-worldOrign.getX()) / cellSize );
+		gridPos[1] = (int)Math.floor( 
(p.getY()-worldOrign.getY()) / cellSize ); + return gridPos; + } + + /** + * Computes the hash value of a grid position, see OpenCL code in Particles.cl + */ + private static int getGridHash(final int[] gridPos, int[] gridSize) { + gridPos[0] = gridPos[0] & (gridSize[0] - 1); + gridPos[1] = gridPos[1] & (gridSize[1] - 1); + return umad(umad(1, gridSize[1], gridPos[1]), gridSize[0], gridPos[0]); + } +} -- GitLab
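
Because the read-back in calcPositionsInCell is still commented out, GridCells is returned empty in this commit. Once it is enabled, the cell ranges that findCellBoundsAndReorder computes on the GPU can be cross-checked on the host from the sorted hashes alone. The Java sketch below is not part of the patch: the helper name cellBounds and the java.util.Map/HashMap imports are assumptions, and it only mirrors the idea behind d_CellStart/d_CellEnd — every cell hash maps to the index range [start, end) that its particles occupy in the hash-sorted arrays.

	// Sketch: derive per-cell [start, end) ranges from the array returned by
	// CLUniformHashedGrid.calcSortedHashes(positions); a new range starts whenever
	// the sorted hash value changes, just as the findCellBoundsAndReorder kernel
	// writes d_CellStart/d_CellEnd on hash boundaries.
	private static Map<Integer, int[]> cellBounds(final int[] sortedHashes) {
		Map<Integer, int[]> bounds = new HashMap<>();
		for (int i = 0; i < sortedHashes.length; i++) {
			int hash = sortedHashes[i];
			if (i == 0 || hash != sortedHashes[i - 1]) {
				bounds.put(hash, new int[]{i, i + 1}); // first particle of this cell
			} else {
				bounds.get(hash)[1] = i + 1;           // extend the end of the current cell
			}
		}
		return bounds;
	}

Comparing these ranges against the cellStarts/cellEnds arrays of GridCells could serve as the reference once testGridCell gets real assertions, analogous to testCalcAndSortHash.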