System updates are being installed. You can continue using GitLab, but it may be unavailable for some minutes during the updates.

Commit a58b9135 authored by Benedikt Zoennchen's avatar Benedikt Zoennchen

implement the linked cell on the GPU.

parent 9cad2a18
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
////////////////////////////////////////////////////////////////////////////////
// Common definitions
////////////////////////////////////////////////////////////////////////////////
#define UMAD(a, b, c) ( (a) * (b) + (c) )
typedef struct{
float x;
float y;
float z;
} Float3;
typedef struct{
uint x;
uint y;
uint z;
}Uint3;
typedef struct{
int x;
int y;
int z;
}Int3;
typedef struct{
Float3 colliderPos;
float colliderRadius;
Float3 gravity;
float globalDamping;
float particleRadius;
Uint3 gridSize;
uint numCells;
Float3 worldOrigin;
Float3 cellSize;
uint numBodies;
uint maxParticlesPerCell;
float spring;
float damping;
float shear;
float attraction;
float boundaryDamping;
} simParams_t;
typedef struct {
float2 position;
float stepLength;
} pedestrian;
#define LOCAL_SIZE_LIMIT 16U
inline void ComparatorPrivate(
uint *keyA,
uint *valA,
uint *keyB,
uint *valB,
uint dir
){
if( (*keyA > *keyB) == dir ){
uint t;
t = *keyA; *keyA = *keyB; *keyB = t;
t = *valA; *valA = *valB; *valB = t;
}
}
inline void ComparatorLocal(
__local uint *keyA,
__local uint *valA,
__local uint *keyB,
__local uint *valB,
uint dir
){
if( (*keyA > *keyB) == dir ){
uint t;
t = *keyA; *keyA = *keyB; *keyB = t;
t = *valA; *valA = *valB; *valB = t;
}
}
////////////////////////////////////////////////////////////////////////////////
// Save particle grid cell hashes and indices
////////////////////////////////////////////////////////////////////////////////
uint2 getGridPos(float2 p, __constant float* cellSize, __constant float2* worldOrigin){
uint2 gridPos;
float2 wordOr = (*worldOrigin);
gridPos.x = (int)floor((p.x - wordOr.x) / (*cellSize));
gridPos.y = (int)floor((p.y - wordOr.y) / (*cellSize));
return gridPos;
}
//Calculate address in grid from position (clamping to edges)
uint getGridHash(uint2 gridPos, __constant uint2* gridSize){
//Wrap addressing, assume power-of-two grid dimensions
gridPos.x = gridPos.x & ((*gridSize).x - 1);
gridPos.y = gridPos.y & ((*gridSize).y - 1);
return UMAD( UMAD(1.0, (*gridSize).y, gridPos.y), (*gridSize).x, gridPos.x );
}
//Calculate grid hash value for each particle
__kernel void calcHash(
__global uint *d_Hash, //output
__global uint *d_Index, //output
__global const float2 *d_Pos, //input: positions
__constant float* cellSize,
__constant float2* worldOrigin,
__constant uint2* gridSize,
uint numParticles
){
const uint index = get_global_id(0);
if(index >= numParticles)
return;
float2 p = d_Pos[index];
//Get address in grid
uint2 gridPos = getGridPos(p, cellSize, worldOrigin);
uint gridHash = getGridHash(gridPos, gridSize);
//Store grid hash and particle index
d_Hash[index] = gridHash;
d_Index[index] = index;
}
////////////////////////////////////////////////////////////////////////////////
// Find cell bounds and reorder positions+velocities by sorted indices
////////////////////////////////////////////////////////////////////////////////
__kernel void Memset(
__global uint *d_Data,
uint val,
uint N
){
if(get_global_id(0) < N)
d_Data[get_global_id(0)] = val;
}
__kernel void findCellBoundsAndReorder(
__global uint *d_CellStart, //output: cell start index
__global uint *d_CellEnd, //output: cell end index
__global float2 *d_ReorderedPos, //output: reordered by cell hash positions
__global float2 *d_ReorderedVel, //output: reordered by cell hash velocities
__global const uint *d_Hash, //input: sorted grid hashes
__global const uint *d_Index, //input: particle indices sorted by hash
__global const float2 *d_Pos, //input: positions array sorted by hash
__local uint *localHash, //get_group_size(0) + 1 elements
uint numParticles
){
uint hash;
const uint index = get_global_id(0);
//Handle case when no. of particles not multiple of block size
if(index < numParticles){
hash = d_Hash[index];
//Load hash data into local memory so that we can look
//at neighboring particle's hash value without loading
//two hash values per thread
localHash[get_local_id(0) + 1] = hash;
//First thread in block must load neighbor particle hash
if(index > 0 && get_local_id(0) == 0)
localHash[0] = d_Hash[index - 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(index < numParticles){
//Border case
if(index == 0)
d_CellStart[hash] = 0;
//Main case
else{
if(hash != localHash[get_local_id(0)])
d_CellEnd[localHash[get_local_id(0)]] = d_CellStart[hash] = index;
};
//Another border case
if(index == numParticles - 1)
d_CellEnd[hash] = numParticles;
//Now use the sorted index to reorder the pos and vel arrays
uint sortedIndex = d_Index[index];
float2 pos = d_Pos[sortedIndex];
d_ReorderedPos[index] = pos;
}
}
////////////////////////////////////////////////////////////////////////////////
// Process collisions (calculate accelerations)
////////////////////////////////////////////////////////////////////////////////
float4 collideSpheres(
float4 posA,
float4 posB,
float4 velA,
float4 velB,
float radiusA,
float radiusB,
float spring,
float damping,
float shear,
float attraction
){
//Calculate relative position
float4 relPos = (float4)(posB.x - posA.x, posB.y - posA.y, posB.z - posA.z, 0);
float dist = sqrt(relPos.x * relPos.x + relPos.y * relPos.y + relPos.z * relPos.z);
float collideDist = radiusA + radiusB;
float4 force = (float4)(0, 0, 0, 0);
if(dist < collideDist){
float4 norm = (float4)(relPos.x / dist, relPos.y / dist, relPos.z / dist, 0);
//Relative velocity
float4 relVel = (float4)(velB.x - velA.x, velB.y - velA.y, velB.z - velA.z, 0);
//Relative tangential velocity
float relVelDotNorm = relVel.x * norm.x + relVel.y * norm.y + relVel.z * norm.z;
float4 tanVel = (float4)(relVel.x - relVelDotNorm * norm.x, relVel.y - relVelDotNorm * norm.y, relVel.z - relVelDotNorm * norm.z, 0);
//Spring force (potential)
float springFactor = -spring * (collideDist - dist);
force = (float4)(
springFactor * norm.x + damping * relVel.x + shear * tanVel.x + attraction * relPos.x,
springFactor * norm.y + damping * relVel.y + shear * tanVel.y + attraction * relPos.y,
springFactor * norm.z + damping * relVel.z + shear * tanVel.z + attraction * relPos.z,
0
);
}
return force;
}
__kernel void collide(
__global float2 *d_Vel, //output: new velocity
__global const float2 *d_ReorderedPos, //input: reordered positions
__global const float2 *d_ReorderedVel, //input: reordered velocities
__global const uint *d_Index, //input: reordered particle indices
__global const uint *d_CellStart, //input: cell boundaries
__global const uint *d_CellEnd,
__constant float* cellSize,
__constant float2* worldOrigin,
__constant uint2* gridSize,
uint numParticles
){
uint index = get_global_id(0);
if(index >= numParticles)
return;
float2 pos = d_ReorderedPos[index];
float2 vel = d_ReorderedVel[index];
float2 force = (float2)(0, 0);
//Get address in grid
uint2 gridPos = getGridPos(pos, cellSize, worldOrigin);
//Accumulate surrounding cells
for(int z = -1; z <= 1; z++)
for(int y = -1; y <= 1; y++)
for(int x = -1; x <= 1; x++){
//Get start particle index for this cell
uint hash = getGridHash(gridPos + (uint2)(x, y), gridSize);
uint startI = d_CellStart[hash];
//Skip empty cell
if(startI == 0xFFFFFFFFU)
continue;
//Iterate over particles in this cell
uint endI = d_CellEnd[hash];
for(uint j = startI; j < endI; j++){
if(j == index)
continue;
float2 pos2 = d_ReorderedPos[j];
float2 vel2 = d_ReorderedVel[j];
//Collide two spheres
/*force += collideSpheres(
pos, pos2,
vel, vel2,
params->particleRadius, params->particleRadius,
params->spring, params->damping, params->shear, params->attraction
);*/
}
}
//Collide with cursor sphere
/*force += collideSpheres(
pos, (float4)(params->colliderPos.x, params->colliderPos.y, params->colliderPos.z, 0),
vel, (float4)(0, 0, 0, 0),
params->particleRadius, params->colliderRadius,
params->spring, params->damping, params->shear, params->attraction
);*/
//Write new velocity back to original unsorted location
d_Vel[d_Index[index]] = vel + force;
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
__kernel void bitonicSortLocal(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal,
uint arrayLength,
uint dir
){
__local uint l_key[LOCAL_SIZE_LIMIT];
__local uint l_val[LOCAL_SIZE_LIMIT];
//Offset to the beginning of subbatch and load data
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
for(uint size = 2; size < arrayLength; size <<= 1){
//Bitonic merge
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
dir
);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
__kernel void bitonicSortLocal1(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal
){
__local uint l_key[LOCAL_SIZE_LIMIT];
__local uint l_val[LOCAL_SIZE_LIMIT];
//Offset to the beginning of subarray and load data
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
//Bitonic merge
uint ddd = (comparatorI & (size / 2)) != 0;
for(uint stride = size / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
}
//Odd / even arrays of LOCAL_SIZE_LIMIT elements
//sorted in opposite directions
{
uint ddd = (get_group_id(0) & 1);
for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void bitonicMergeGlobal(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal,
uint arrayLength,
uint size,
uint stride,
uint dir
){
uint global_comparatorI = get_global_id(0);
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
uint keyA = d_SrcKey[pos + 0];
uint valA = d_SrcVal[pos + 0];
uint keyB = d_SrcKey[pos + stride];
uint valB = d_SrcVal[pos + stride];
ComparatorPrivate(
&keyA, &valA,
&keyB, &valB,
ddd
);
d_DstKey[pos + 0] = keyA;
d_DstVal[pos + 0] = valA;
d_DstKey[pos + stride] = keyB;
d_DstVal[pos + stride] = valB;
}
//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
__kernel void bitonicMergeLocal(
__global uint *d_DstKey,
__global uint *d_DstVal,
__global uint *d_SrcKey,
__global uint *d_SrcVal,
uint arrayLength,
uint stride,
uint size,
uint dir
){
__local uint l_key[LOCAL_SIZE_LIMIT];
__local uint l_val[LOCAL_SIZE_LIMIT];
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
//Bitonic merge
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for(; stride > 0; stride >>= 1){
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
&l_key[pos + 0], &l_val[pos + 0],
&l_key[pos + stride], &l_val[pos + stride],
ddd
);
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
\ No newline at end of file
......@@ -23,8 +23,6 @@ public class TestBitonicSort {
private static Logger logger = LogManager.getLogger(TestConvolution.class);
private static Random random = new Random();
@Before
public void setUp() throws Exception {}
......
package org.vadere.util.math;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.junit.Before;
import org.junit.Test;
import org.vadere.util.geometry.shapes.VPoint;
import org.vadere.util.geometry.shapes.VRectangle;
import org.vadere.util.opencl.CLUniformHashedGrid;
import org.vadere.util.opencl.OpenCLException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;
import static org.junit.Assert.assertEquals;
/**
* @author Benedikt Zoennchen
*/
public class TestCellGridSort {
private static Logger logger = LogManager.getLogger(TestConvolution.class);
private static Random random = new Random();
@Before
public void setUp() throws Exception {}
@Test
public void testCalcHash() throws IOException, OpenCLException {
CLUniformHashedGrid clUniformHashedGrid = new CLUniformHashedGrid(1024, new VRectangle(0, 0, 10, 10), 1);
ArrayList<VPoint> positions = new ArrayList<>();
for(int i = 0; i < 1024; i++) {
positions.add(new VPoint(random.nextDouble() * 10,random.nextDouble() * 10));
}
int[] hasehs = clUniformHashedGrid.calcHashes(positions);
assertEquals(hasehs.length, positions.size());
for(int i = 0; i < hasehs.length; i++) {
int hash = getGridHash(getGridPosition(positions.get(i), clUniformHashedGrid.getCellSize(), clUniformHashedGrid.getWorldOrign()), clUniformHashedGrid.getGridSize());
assertEquals(hasehs[i], hash);
}
}
@Test
public void testCalcAndSortHash() throws IOException, OpenCLException {
CLUniformHashedGrid clUniformHashedGrid = new CLUniformHashedGrid(1024, new VRectangle(0, 0, 10, 10), 1);
ArrayList<VPoint> positions = new ArrayList<>();
for(int i = 0; i < 1024; i++) {
positions.add(new VPoint(random.nextDouble() * 10,random.nextDouble() * 10));
}
int[] hasehs = clUniformHashedGrid.calcSortedHashes(positions);
assertEquals(hasehs.length, positions.size());
int[] expectedHashes = new int[positions.size()];
for(int i = 0; i < hasehs.length; i++) {
int hash = getGridHash(getGridPosition(positions.get(i), clUniformHashedGrid.getCellSize(), clUniformHashedGrid.getWorldOrign()), clUniformHashedGrid.getGridSize());
expectedHashes[i] = hash;
}
Arrays.sort(expectedHashes);
for(int i = 0; i < hasehs.length; i++) {
assertEquals(hasehs[i], expectedHashes[i]);
}
}
@Test
public void testGridCell() throws IOException, OpenCLException {
CLUniformHashedGrid clUniformHashedGrid = new CLUniformHashedGrid(1024, new VRectangle(0, 0, 10, 10), 1);
ArrayList<VPoint> positions = new ArrayList<>();
for(int i = 0; i < 1024; i++) {
positions.add(new VPoint(random.nextDouble() * 10,random.nextDouble() * 10));
}
CLUniformHashedGrid.GridCells gridCells = clUniformHashedGrid.calcPositionsInCell(positions);
}