11.3.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

Commit a58b9135 authored by Benedikt Zoennchen's avatar Benedikt Zoennchen

implement the linked cell on the GPU.

parent 9cad2a18
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
////////////////////////////////////////////////////////////////////////////////
// Common definitions
////////////////////////////////////////////////////////////////////////////////
#define UMAD(a, b, c) ( (a) * (b) + (c) )
typedef struct{
float x;
float y;
float z;
} Float3;
typedef struct{
uint x;
uint y;
uint z;
}Uint3;
typedef struct{
int x;
int y;
int z;
}Int3;
typedef struct{
Float3 colliderPos;
float colliderRadius;
Float3 gravity;
float globalDamping;
float particleRadius;
Uint3 gridSize;
uint numCells;
Float3 worldOrigin;
Float3 cellSize;
uint numBodies;
uint maxParticlesPerCell;
float spring;
float damping;
float shear;
float attraction;
float boundaryDamping;
} simParams_t;
typedef struct {
float2 position;
float stepLength;
} pedestrian;
#define LOCAL_SIZE_LIMIT 16U
inline void ComparatorPrivate(
uint *keyA,
uint *valA,
uint *keyB,
uint *valB,
uint dir
){
if( (*keyA > *keyB) == dir ){
uint t;
t = *keyA; *keyA = *keyB; *keyB = t;
t = *valA; *valA = *valB; *valB = t;
}
}
inline void ComparatorLocal(
__local uint *keyA,
__local uint *valA,
__local uint *keyB,
__local uint *valB,
uint dir
){
if( (*keyA > *keyB) == dir ){
uint t;
t = *keyA; *keyA = *keyB; *keyB = t;
t = *valA; *valA = *valB; *valB = t;
}
}
////////////////////////////////////////////////////////////////////////////////
// Save particle grid cell hashes and indices
////////////////////////////////////////////////////////////////////////////////
uint2 getGridPos(float2 p, __constant float* cellSize, __constant float2* worldOrigin){
uint2 gridPos;
float2 wordOr = (*worldOrigin);
gridPos.x = (int)floor((p.x - wordOr.x) / (*cellSize));
gridPos.y = (int)floor((p.y - wordOr.y) / (*cellSize));
return gridPos;
}
//Calculate address in grid from position (clamping to edges)
uint getGridHash(uint2 gridPos, __constant uint2* gridSize){
//Wrap addressing, assume power-of-two grid dimensions
gridPos.x = gridPos.x & ((*gridSize).x - 1);
gridPos.y = gridPos.y & ((*gridSize).y - 1);
return UMAD( UMAD(1.0, (*gridSize).y, gridPos.y), (*gridSize).x, gridPos.x );
}
//Calculate grid hash value for each particle
__kernel void calcHash(
__global uint *d_Hash, //output
__global uint *d_Index, //output
__global const float2 *d_Pos, //input: positions
__constant float* cellSize,
__constant float2* worldOrigin,
__constant uint2* gridSize,
uint numParticles
){
const uint index = get_global_id(0);
if(index >= numParticles)
return;
float2 p = d_Pos[index];
//Get address in grid
uint2 gridPos = getGridPos(p, cellSize, worldOrigin);
uint gridHash = getGridHash(gridPos, gridSize);
//Store grid hash and particle index
d_Hash[index] = gridHash;
d_Index[index] = index;
}
////////////////////////////////////////////////////////////////////////////////
// Find cell bounds and reorder positions+velocities by sorted indices
////////////////////////////////////////////////////////////////////////////////
__kernel void Memset(
__global uint *d_Data,
uint val,
uint N
){
if(get_global_id(0) < N)
d_Data[get_global_id(0)] = val;
}
__kernel void findCellBoundsAndReorder(
__global uint *d_CellStart, //output: cell start index
__global uint *d_CellEnd, //output: cell end index
__global float2 *d_ReorderedPos, //output: reordered by cell hash positions
__global float2 *d_ReorderedVel, //output: reordered by cell hash velocities
__global const uint *d_Hash, //input: sorted grid hashes
__global const uint *d_Index, //input: particle indices sorted by hash
__global const float2 *d_Pos, //input: positions array sorted by hash
__local uint *localHash, //get_group_size(0) + 1 elements
uint numParticles
){
uint hash;
const uint index = get_global_id(0);
//Handle case when no. of particles not multiple of block size
if(index < numParticles){
hash = d_Hash[index];
//Load hash data into local memory so that we can look
//at neighboring particle's hash value without loading
//two hash values per thread
localHash[get_local_id(0) + 1] = hash;
//First thread in block must load neighbor particle hash
if(index > 0 && get_local_id(0) == 0)
localHash[0] = d_Hash[index - 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(index < numParticles){
//Border case
if(index == 0)
d_CellStart[hash] = 0;
//Main case
else{
if(hash != localHash[get_local_id(0)])
d_CellEnd[localHash[get_local_id(0)]] = d_CellStart[hash] = index;
};
//Another border case
if(index == numParticles - 1)
d_CellEnd[hash] = numParticles;
//Now use the sorted index to reorder the pos and vel arrays
uint sortedIndex = d_Index[index];
float2 pos = d_Pos[sortedIndex];
d_ReorderedPos[index] = pos;
}
}
////////////////////////////////////////////////////////////////////////////////
// Process collisions (calculate accelerations)
////////////////////////////////////////////////////////////////////////////////
float4 collideSpheres(
float4 posA,
float4 posB,
float4 velA,
float4 velB,
float radiusA,
float radiusB,
float spring,
float damping,
float shear,
float attraction
){
//Calculate relative position
float4 relPos = (float4)(posB.x - posA.x, posB.y - posA.y, posB.z - posA.z, 0);
float dist = sqrt(relPos.x * relPos.x + relPos.y * relPos.y + relPos.z * relPos.z);
float collideDist = radiusA + radiusB;
float4 force = (float4)(0, 0, 0, 0);
if(dist < collideDist){
float4 norm = (float4)(relPos.x / dist, relPos.y / dist, relPos.z / dist, 0);
//Relative velocity
float4 relVel = (float4)(velB.x - velA.x, velB.y - velA.y, velB.z - velA.z, 0);
//Relative tangential velocity
float relVelDotNorm = relVel.x * norm.x + relVel.y * norm.y + relVel.z * norm.z;
float4 tanVel = (float4)(relVel.x - relVelDotNorm * norm.x, relVel.y - relVelDotNorm * norm.y, relVel.z - relVelDotNorm * norm.z, 0);
//Spring force (potential)
float springFactor = -spring * (collideDist - dist);
force = (float4)(
springFactor * norm.x + damping * relVel.x + shear * tanVel.x + attraction * relPos.x,
springFactor * norm.y + damping * relVel.y + shear * tanVel.y + attraction * relPos.y,
springFactor * norm.z + damping * relVel.z + shear * tanVel.z + attraction * relPos.z,
0
);
}
return force;
}
__kernel void collide(
__global float2 *d_Vel, //output: new velocity
__global const float2 *d_ReorderedPos, //input: reordered positions
__global const float2 *d_ReorderedVel, //input: reordered velocities
__global const uint *d_Index, //input: reordered particle indices
__global const uint *d_CellStart, //input: cell boundaries
__global const uint *d_CellEnd,
__constant float* cellSize,
__constant float2* worldOrigin,
__constant uint2* gridSize,
uint numParticles
){
uint index = get_global_id(0);
if(index >= numParticles)
return;
float2 pos = d_ReorderedPos[index];
float2 vel = d_ReorderedVel[index];
float2 force = (float2)(0, 0);
//Get address in grid
uint2 gridPos = getGridPos(pos, cellSize, worldOrigin);
//Accumulate surrounding cells
for(int z = -1; z <= 1; z++)
for(int y = -1; y <= 1; y++)
for(int x = -1; x <= 1; x++){
//Get start particle index for this cell
uint hash = getGridHash(gridPos + (uint2)(x, y), gridSize);
uint startI = d_CellStart[hash];
//Skip empty cell
if(startI == 0xFFFFFFFFU)
continue;
//Iterate over particles in this cell
uint endI = d_CellEnd[hash];
for(uint j = startI; j < endI; j++){
if(j == index)
continue;
float2 pos2 = d_ReorderedPos[j];
float2 vel2 = d_ReorderedVel[j];
//Collide two spheres
/*force += collideSpheres(
pos, pos2,
vel, vel2,
params->particleRadius, params->particleRadius,
params->spring, params->damping, params->shear, params->attraction
);*/
}
}
//Collide with cursor sphere
/*force += collideSpheres(
pos, (float4)(params->colliderPos.x, params->colliderPos.y, params->colliderPos.z, 0),
vel, (float4)(0, 0, 0, 0),
params->particleRadius, params->colliderRadius,
params->spring, params->damping, params->shear, params->attraction
);*/
//Write new velocity back to original unsorted location
d_Vel[d_Index[index]] = vel + force;
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
__kernel void bitonicSortLocal(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal,
uint arrayLength,
uint dir
){
__local uint l_key[LOCAL_SIZE_LIMIT];
__local uint l_val[LOCAL_SIZE_LIMIT];
//Offset to the beginning of subbatch and load data
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
for(uint size = 2; size < arrayLength; size <<= 1){
//Bitonic merge
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
dir
);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
__kernel void bitonicSortLocal1(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal
){
__local uint l_key[LOCAL_SIZE_LIMIT];
__local uint l_val[LOCAL_SIZE_LIMIT];
//Offset to the beginning of subarray and load data
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
//Bitonic merge
uint ddd = (comparatorI & (size / 2)) != 0;
for(uint stride = size / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
}
//Odd / even arrays of LOCAL_SIZE_LIMIT elements
//sorted in opposite directions
{
uint ddd = (get_group_id(0) & 1);
for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void bitonicMergeGlobal(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal,
uint arrayLength,
uint size,
uint stride,
uint dir
){
uint global_comparatorI = get_global_id(0);
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
uint keyA = d_SrcKey[pos + 0];
uint valA = d_SrcVal[pos + 0];
uint keyB = d_SrcKey[pos + stride];
uint valB = d_SrcVal[pos + stride];
ComparatorPrivate(
&keyA, &valA,
&keyB, &valB,
ddd
);
d_DstKey[pos + 0] = keyA;
d_DstVal[pos + 0] = valA;
d_DstKey[pos + stride] = keyB;
d_DstVal[pos + stride] = valB;
}
//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
__kernel void bitonicMergeLocal(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal,
uint arrayLength,
uint stride,
uint size,
uint dir
){
__local uint l_key[LOCAL_SIZE_LIMIT];
__local uint l_val[LOCAL_SIZE_LIMIT];
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
//Bitonic merge
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for(; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
\ No newline at end of file
...@@ -23,8 +23,6 @@ public class TestBitonicSort { ...@@ -23,8 +23,6 @@ public class TestBitonicSort {
private static Logger logger = LogManager.getLogger(TestConvolution.class); private static Logger logger = LogManager.getLogger(TestConvolution.class);
private static Random random = new Random();
@Before @Before
public void setUp() throws Exception {} public void setUp() throws Exception {}
......
package org.vadere.util.math;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.junit.Before;
import org.junit.Test;
import org.vadere.util.geometry.shapes.VPoint;
import org.vadere.util.geometry.shapes.VRectangle;
import org.vadere.util.opencl.CLUniformHashedGrid;
import org.vadere.util.opencl.OpenCLException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;
import static org.junit.Assert.assertEquals;
/**
* @author Benedikt Zoennchen
*/
public class TestCellGridSort {
private static Logger logger = LogManager.getLogger(TestConvolution.class);
private static Random random = new Random();
@Before
public void setUp() throws Exception {}
@Test
public void testCalcHash() throws IOException, OpenCLException {
CLUniformHashedGrid clUniformHashedGrid = new CLUniformHashedGrid(1024, new VRectangle(0, 0, 10, 10), 1);
ArrayList<VPoint> positions = new ArrayList<>();
for(int i = 0; i < 1024; i++) {
positions.add(new VPoint(random.nextDouble() * 10,random.nextDouble() * 10));
}
int[] hasehs = clUniformHashedGrid.calcHashes(positions);
assertEquals(hasehs.length, positions.size());
for(int i = 0; i < hasehs.length; i++) {
int hash = getGridHash(getGridPosition(positions.get(i), clUniformHashedGrid.getCellSize(), clUniformHashedGrid.getWorldOrign()), clUniformHashedGrid.getGridSize());
assertEquals(hasehs[i], hash);
}
}
@Test
public void testCalcAndSortHash() throws IOException, OpenCLException {
CLUniformHashedGrid clUniformHashedGrid = new CLUniformHashedGrid(1024, new VRectangle(0, 0, 10, 10), 1);
ArrayList<VPoint> positions = new ArrayList<>();
for(int i = 0; i < 1024; i++) {
positions.add(new VPoint(random.nextDouble() * 10,random.nextDouble() * 10));
}
int[] hasehs = clUniformHashedGrid.calcSortedHashes(positions);
assertEquals(hasehs.length, positions.size());
int[] expectedHashes = new int[positions.size()];
for(int i = 0; i < hasehs.length; i++) {
int hash = getGridHash(getGridPosition(positions.get(i), clUniformHashedGrid.getCellSize(), clUniformHashedGrid.getWorldOrign()), clUniformHashedGrid.getGridSize());
expectedHashes[i] = hash;