Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
vadere
vadere
Commits
36865d52
Commit
36865d52
authored
Jul 12, 2018
by
Benedikt Zoennchen
Browse files
documentation (adding comments), the hardware parameters are no longer hardcoded for the sorting.
parent
fdc22cfc
Changes
6
Show whitespace changes
Inline
Side-by-side
VadereUtils/resources/BitonicSort.cl
View file @
36865d52
...
...
@@ -8,9 +8,6 @@
*
is
strictly
prohibited.
*
*/
#
define
LOCAL_SIZE_LIMIT
16U
inline
void
ComparatorPrivate
(
uint
*keyA,
uint
*valA,
...
...
@@ -48,10 +45,11 @@ __kernel void bitonicSortLocal(
__global
uint
*d_SrcKey,
__global
uint
*d_SrcVal,
uint
arrayLength,
uint
dir
uint
dir,
__local
uint
*l_key,
__local
uint
*l_val
)
{
__local
uint
l_key[LOCAL_SIZE_LIMIT]
;
__local
uint
l_val[LOCAL_SIZE_LIMIT]
;
uint
LOCAL_SIZE_LIMIT
=
get_local_size
(
0
)
*
2
;
//Offset
to
the
beginning
of
subbatch
and
load
data
d_SrcKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
...
...
@@ -108,11 +106,11 @@ __kernel void bitonicSortLocal1(
__global
uint
*d_DstKey,
__global
uint
*d_DstVal,
__global
uint
*d_SrcKey,
__global
uint
*d_SrcVal
__global
uint
*d_SrcVal,
__local
uint
*l_key,
__local
uint
*l_val
)
{
__local
uint
l_key[LOCAL_SIZE_LIMIT]
;
__local
uint
l_val[LOCAL_SIZE_LIMIT]
;
uint
LOCAL_SIZE_LIMIT
=
get_local_size
(
0
)
*
2
;
//Offset
to
the
beginning
of
subarray
and
load
data
d_SrcKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_SrcVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
...
...
@@ -206,11 +204,11 @@ __kernel void bitonicMergeLocal(
uint
arrayLength,
uint
stride,
uint
size,
uint
dir
uint
dir,
__local
uint
*l_key,
__local
uint
*l_val
)
{
__local
uint
l_key[LOCAL_SIZE_LIMIT]
;
__local
uint
l_val[LOCAL_SIZE_LIMIT]
;
uint
LOCAL_SIZE_LIMIT
=
get_local_size
(
0
)
*
2
;
d_SrcKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_SrcVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_DstKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
...
...
VadereUtils/resources/Particles.cl
View file @
36865d52
...
...
@@ -63,8 +63,6 @@ typedef struct {
float
stepLength
;
}
pedestrian
;
#
define
LOCAL_SIZE_LIMIT
16U
inline
void
ComparatorPrivate
(
uint
*keyA,
uint
*valA,
...
...
@@ -329,10 +327,11 @@ __kernel void bitonicSortLocal(
__global
uint
*d_SrcKey,
__global
uint
*d_SrcVal,
uint
arrayLength,
uint
dir
uint
dir,
__local
uint
*l_key,
__local
uint
*l_val
)
{
__local
uint
l_key[LOCAL_SIZE_LIMIT]
;
__local
uint
l_val[LOCAL_SIZE_LIMIT]
;
uint
LOCAL_SIZE_LIMIT
=
get_local_size
(
0
)
*
2
;
//Offset
to
the
beginning
of
subbatch
and
load
data
d_SrcKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
...
...
@@ -389,11 +388,11 @@ __kernel void bitonicSortLocal1(
__global
uint
*d_DstKey,
__global
uint
*d_DstVal,
__global
uint
*d_SrcKey,
__global
uint
*d_SrcVal
__global
uint
*d_SrcVal,
__local
uint
*l_key,
__local
uint
*l_val
)
{
__local
uint
l_key[LOCAL_SIZE_LIMIT]
;
__local
uint
l_val[LOCAL_SIZE_LIMIT]
;
uint
LOCAL_SIZE_LIMIT
=
get_local_size
(
0
)
*
2
;
//Offset
to
the
beginning
of
subarray
and
load
data
d_SrcKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_SrcVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
...
...
@@ -487,11 +486,11 @@ __kernel void bitonicMergeLocal(
uint
arrayLength,
uint
stride,
uint
size,
uint
dir
uint
dir,
__local
uint
*l_key,
__local
uint
*l_val
)
{
__local
uint
l_key[LOCAL_SIZE_LIMIT]
;
__local
uint
l_val[LOCAL_SIZE_LIMIT]
;
uint
LOCAL_SIZE_LIMIT
=
get_local_size
(
0
)
*
2
;
d_SrcKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_SrcVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_DstKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
...
...
VadereUtils/src/org/vadere/util/opencl/CLBitonicSort.java
View file @
36865d52
...
...
@@ -12,17 +12,15 @@ import org.lwjgl.system.MemoryUtil;
import
java.io.IOException
;
import
java.nio.ByteBuffer
;
import
java.nio.FloatBuffer
;
import
java.nio.IntBuffer
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_CONTEXT_PLATFORM
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_DEVICE_MAX_WORK_GROUP_SIZE
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_DEVICE_NAME
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_DEVICE_TYPE_GPU
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_ALLOC_HOST_PTR
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_COPY_HOST_PTR
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_READ_ONLY
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_READ_WRITE
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_WRITE_ONLY
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_PROGRAM_BUILD_STATUS
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_SUCCESS
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clBuildProgram
;
...
...
@@ -33,15 +31,16 @@ import static org.lwjgl.opencl.CL10.clCreateKernel;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clCreateProgramWithSource
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clEnqueueNDRangeKernel
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clEnqueueReadBuffer
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clEnqueueWriteBuffer
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clFinish
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clGetDeviceIDs
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clGetDeviceInfo
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clGetPlatformIDs
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseCommandQueue
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseContext
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseKernel
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseMemObject
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseProgram
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clSetKernelArg
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clSetKernelArg1i
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clSetKernelArg1p
;
import
static
org
.
lwjgl
.
system
.
MemoryStack
.
stackPush
;
...
...
@@ -50,6 +49,8 @@ import static org.lwjgl.system.MemoryUtil.memUTF8;
/**
* @author Benedikt Zoennchen
*
* This class implements the bitonic sort using the GPU via OpenCL.
*/
public
class
CLBitonicSort
{
private
static
Logger
log
=
LogManager
.
getLogger
(
CLBitonicSort
.
class
);
...
...
@@ -93,7 +94,7 @@ public class CLBitonicSort {
private
int
[]
resultKeys
;
//Note: logically shared with BitonicSort.cl!
private
static
final
int
LOCAL_SIZE_LIMIT
=
16
;
private
int
max_work_group_size
=
16
;
private
boolean
debug
=
false
;
...
...
@@ -154,7 +155,7 @@ public class CLBitonicSort {
// small sorts
if
(
keys
.
length
<=
LOCAL_SIZE_LIMIT
)
if
(
keys
.
length
<=
max_work_group_size
)
{
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
0
,
clOutKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
1
,
clOutValues
));
...
...
@@ -162,6 +163,8 @@ public class CLBitonicSort {
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
3
,
clInValues
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicSortLocal
,
4
,
keys
.
length
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicSortLocal
,
5
,
1
));
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicSortLocal
,
6
,
keys
.
length
*
4
));
// local memory
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicSortLocal
,
7
,
keys
.
length
*
4
));
// local memory
clGlobalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
clLocalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
...
...
@@ -175,20 +178,22 @@ public class CLBitonicSort {
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
1
,
clOutValues
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
2
,
clInKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
3
,
clInValues
));
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicSortLocal1
,
4
,
max_work_group_size
*
4
));
// local memory
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicSortLocal1
,
5
,
max_work_group_size
*
4
));
// local memory
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
clGlobalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
clLocalWorkSize
.
put
(
0
,
LOCAL_SIZE_LIMIT
/
2
);
clLocalWorkSize
.
put
(
0
,
max_work_group_size
/
2
);
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicSortLocal1
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
for
(
int
size
=
2
*
LOCAL_SIZE_LIMIT
;
size
<=
keys
.
length
;
size
<<=
1
)
for
(
int
size
=
2
*
max_work_group_size
;
size
<=
keys
.
length
;
size
<<=
1
)
{
for
(
int
stride
=
size
/
2
;
stride
>
0
;
stride
>>=
1
)
{
if
(
stride
>=
LOCAL_SIZE_LIMIT
)
if
(
stride
>=
max_work_group_size
)
{
//Launch bitonicMergeGlobal
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeGlobal
,
0
,
clOutKeys
));
...
...
@@ -204,7 +209,7 @@ public class CLBitonicSort {
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
clGlobalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
clLocalWorkSize
.
put
(
0
,
LOCAL_SIZE_LIMIT
/
4
);
clLocalWorkSize
.
put
(
0
,
max_work_group_size
/
4
);
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicMergeGlobal
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
...
...
@@ -221,11 +226,13 @@ public class CLBitonicSort {
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
5
,
stride
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
6
,
size
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
7
,
dir
));
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicMergeLocal
,
8
,
max_work_group_size
*
4
));
// local memory
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicMergeLocal
,
9
,
max_work_group_size
*
4
));
// local memory
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
clGlobalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
clLocalWorkSize
.
put
(
0
,
LOCAL_SIZE_LIMIT
/
2
);
clLocalWorkSize
.
put
(
0
,
max_work_group_size
/
2
);
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicMergeLocal
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
...
...
@@ -368,6 +375,9 @@ public class CLBitonicSort {
clBitonicMergeLocal
=
clCreateKernel
(
clProgram
,
"bitonicMergeLocal"
,
errcode_ret
);
CLInfo
.
checkCLError
(
errcode_ret
);
PointerBuffer
pp
=
stack
.
mallocPointer
(
1
);
clGetDeviceInfo
(
clDevice
,
CL_DEVICE_MAX_WORK_GROUP_SIZE
,
pp
,
null
);
max_work_group_size
=
(
int
)
pp
.
get
(
0
);
}
}
...
...
VadereUtils/src/org/vadere/util/opencl/CL
UniformHashedGrid
.java
→
VadereUtils/src/org/vadere/util/opencl/CL
LinkedCell
.java
View file @
36865d52
...
...
@@ -19,7 +19,6 @@ import java.nio.IntBuffer;
import
java.util.List
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_CONTEXT_PLATFORM
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_DEVICE_ADDRESS_BITS
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_DEVICE_MAX_WORK_GROUP_SIZE
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_DEVICE_NAME
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_DEVICE_TYPE_GPU
;
...
...
@@ -57,9 +56,12 @@ import static org.lwjgl.system.MemoryUtil.memUTF8;
/**
* @author Benedikt Zoennchen
*
* This class offers methods to compute an array-based linked-cell structure which contains 2D-coordinates, i.e. {@link VPoint}s,
* using the GPU (see green-2007, "Building the Grid using Sorting").
*/
public
class
CL
UniformHashedGrid
{
private
static
Logger
log
=
LogManager
.
getLogger
(
CL
UniformHashedGrid
.
class
);
public
class
CL
LinkedCell
{
private
static
Logger
log
=
LogManager
.
getLogger
(
CL
LinkedCell
.
class
);
// CL ids
private
long
clPlatform
;
...
...
@@ -122,12 +124,9 @@ public class CLUniformHashedGrid {
private
int
[]
resultValues
;
private
int
[]
resultKeys
;
//Note: logically shared with BitonicSort.cl!
private
static
final
int
LOCAL_SIZE_LIMIT
=
16
;
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
CLLinkedCell
.
class
);
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
CLUniformHashedGrid
.
class
);
private
long
max_work_group_size
;
private
int
max_work_group_size
;
private
boolean
debug
=
false
;
...
...
@@ -138,7 +137,16 @@ public class CLUniformHashedGrid {
NonSeparate
}
public
CLUniformHashedGrid
(
final
int
numberOfElements
,
final
VRectangle
bound
,
final
double
cellSize
)
throws
OpenCLException
{
/**
* Default constructor.
*
* @param numberOfElements the number of positions contained in the linked cell.
* @param bound the spatial bound of the linked cell.
* @param cellSize the cellSize (in x and y direction) of the linked cell.
*
* @throws OpenCLException
*/
public
CLLinkedCell
(
final
int
numberOfElements
,
final
VRectangle
bound
,
final
double
cellSize
)
throws
OpenCLException
{
this
.
numberOfElements
=
numberOfElements
;
this
.
iGridSize
=
new
int
[]{
(
int
)
Math
.
ceil
(
bound
.
getWidth
()
/
cellSize
),
(
int
)
Math
.
ceil
(
bound
.
getHeight
()
/
cellSize
)};
this
.
numberOfGridCells
=
this
.
iGridSize
[
0
]
*
this
.
iGridSize
[
1
];
...
...
@@ -153,16 +161,53 @@ public class CLUniformHashedGrid {
init
();
}
public
class
GridCells
{
/**
* The data structure representing the linked cell. The elements of cell i lie
* between (reorderedPositions[cellStarts[i]*2], reorderedPositions[cellStarts[i]*2+1])
* and (reorderedPositions[(cellEnds[i]-1)*2], reorderedPositions[(cellEnds[i]-1)*2+1]),
* i.e. each element occupies two consecutive floats (x, y) of reorderedPositions.
*/
public
class
LinkedCell
{
/**
* the index at which each cell starts, i.e. cell i starts at cellStarts[i].
*/
public
int
[]
cellStarts
;
/**
* the index at which each cell ends, i.e. cell i ends at cellEnds[i]
* (the last element of cell i is the element with index cellEnds[i]-1).
*/
public
int
[]
cellEnds
;
/**
* the ordered 2D-coordinates.
*/
public
float
[]
reorderedPositions
;
/**
* the mapping between the unordered (original) positions and the reorderedPositions,
* i.e. reorderedPositions[i] == positions[indices[i]]
* (NOTE(review): presumably meant per 2D point, not per float entry - confirm).
*/
public
int
[]
indices
;
/**
* the hashes, i.e. the cells of the positions: hashes[i] is the cell of positions[i].
*/
public
int
[]
hashes
;
/**
* the original positions in original order.
*/
public
float
[]
positions
;
}
public
GridCells
calcPositionsInCell
(
@NotNull
final
List
<
VPoint
>
positions
)
throws
OpenCLException
{
/**
* Computes the {@link LinkedCell} of the list of positions.
*
* @param positions a list of positions contained in {@link CLLinkedCell#bound}.
* @return the {@link LinkedCell}, which is the linked list in an array-based structure.
*
* @throws OpenCLException
*/
public
LinkedCell
calcLinkedCell
(
@NotNull
final
List
<
VPoint
>
positions
)
throws
OpenCLException
{
assert
positions
.
size
()
==
numberOfElements
;
this
.
positionList
=
positions
;
allocHostMemory
();
...
...
@@ -186,7 +231,7 @@ public class CLUniformHashedGrid {
int
[]
aHashes
=
CLUtils
.
toIntArray
(
hashes
,
numberOfElements
);
float
[]
aPositions
=
CLUtils
.
toFloatArray
(
this
.
positions
,
numberOfElements
*
2
);
Gri
dCell
s
gridCells
=
new
Gri
dCell
s
();
Linke
dCell
gridCells
=
new
Linke
dCell
();
gridCells
.
cellEnds
=
aCellEnds
;
gridCells
.
cellStarts
=
aCellStarts
;
gridCells
.
reorderedPositions
=
aReorderedPositions
;
...
...
@@ -202,6 +247,14 @@ public class CLUniformHashedGrid {
//clFindCellBoundsAndReorder(clCellStarts, clCellEnds, clReorderedPositions, clHashes, clIndices, clPositions, numberOfElements, numberOfGridCells);
}
/**
* Computes the hash value, i.e. the cell, of each position, sorts these hashes and constructs a mapping
* of the rearrangement. This method exists to test the bitonic sort algorithm on the GPU.
*
* @param positions the positions which will be hashed.
* @return the sorted hashes.
* @throws OpenCLException
*/
public
int
[]
calcSortedHashes
(
@NotNull
final
List
<
VPoint
>
positions
)
throws
OpenCLException
{
assert
positions
.
size
()
==
numberOfElements
;
this
.
positionList
=
positions
;
...
...
@@ -221,6 +274,14 @@ public class CLUniformHashedGrid {
//clFindCellBoundsAndReorder(clCellStarts, clCellEnds, clReorderedPositions, clHashes, clIndices, clPositions, numberOfElements, numberOfGridCells);
}
/**
* Computes the hash value, i.e. the cell, of each position.
* This method exists to test the hash computation on the GPU.
*
* @param positions the positions which will be hashed.
* @return the (unsorted) hashes.
* @throws OpenCLException
*/
public
int
[]
calcHashes
(
@NotNull
final
List
<
VPoint
>
positions
)
throws
OpenCLException
{
assert
positions
.
size
()
==
numberOfElements
;
this
.
positionList
=
positions
;
...
...
@@ -239,10 +300,21 @@ public class CLUniformHashedGrid {
//clFindCellBoundsAndReorder(clCellStarts, clCellEnds, clReorderedPositions, clHashes, clIndices, clPositions, numberOfElements, numberOfGridCells);
}
/**
* Returns the grid size of the linked cell, i.e. result[0] is the size in x and
* result[1] the size in y direction. A new array is created on each call, so
* modifying the result does not change the internal grid size.
*
* @return the grid size (2D) stored in an array of length two.
*/
public
int
[]
getGridSize
()
{
return
new
int
[]{
iGridSize
[
0
],
iGridSize
[
1
]};
}
/**
* Returns the cell size, which is equal in x and y direction.
*
* @return the cell size
*/
public
float
getCellSize
()
{
return
iCellSize
;
}
...
...
@@ -299,7 +371,7 @@ public class CLUniformHashedGrid {
return
resultValues
;
}
p
ublic
void
init
()
throws
OpenCLException
{
p
rivate
void
init
()
throws
OpenCLException
{
initCallbacks
();
initCL
();
buildProgram
();
...
...
@@ -373,7 +445,7 @@ public class CLUniformHashedGrid {
IntBuffer
errcode_ret
=
stack
.
callocInt
(
1
);
// small sorts
if
(
numberOfElements
<=
LOCAL_SIZE_LIMIT
)
{
if
(
numberOfElements
<=
max_work_group_size
)
{
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
0
,
clKeysOut
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
1
,
clValuesOut
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
2
,
clKeysIn
));
...
...
@@ -381,6 +453,8 @@ public class CLUniformHashedGrid {
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicSortLocal
,
4
,
numberOfElements
));
//TODO: check the hard coded 1, and the waiting of the queue
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicSortLocal
,
5
,
1
));
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicSortLocal
,
6
,
keys
.
length
*
4
));
// local memory
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicSortLocal
,
7
,
keys
.
length
*
4
));
// local memory
clGlobalWorkSize
.
put
(
0
,
numberOfElements
/
2
);
clLocalWorkSize
.
put
(
0
,
numberOfElements
/
2
);
...
...
@@ -393,18 +467,20 @@ public class CLUniformHashedGrid {
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
1
,
clValuesOut
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
2
,
clKeysIn
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
3
,
clValuesIn
));
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicSortLocal1
,
4
,
max_work_group_size
*
4
));
// local memory
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicSortLocal1
,
5
,
max_work_group_size
*
4
));
// local memory
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
clGlobalWorkSize
.
put
(
0
,
numberOfElements
/
2
);
clLocalWorkSize
.
put
(
0
,
LOCAL_SIZE_LIMIT
/
2
);
clLocalWorkSize
.
put
(
0
,
max_work_group_size
/
2
);
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicSortLocal1
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
for
(
int
size
=
2
*
LOCAL_SIZE_LIMIT
;
size
<=
numberOfElements
;
size
<<=
1
)
{
for
(
int
size
=
2
*
max_work_group_size
;
size
<=
numberOfElements
;
size
<<=
1
)
{
for
(
int
stride
=
size
/
2
;
stride
>
0
;
stride
>>=
1
)
{
if
(
stride
>=
LOCAL_SIZE_LIMIT
)
{
if
(
stride
>=
max_work_group_size
)
{
//Launch bitonicMergeGlobal
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeGlobal
,
0
,
clKeysOut
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeGlobal
,
1
,
clValuesOut
));
...
...
@@ -419,7 +495,7 @@ public class CLUniformHashedGrid {
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
clGlobalWorkSize
.
put
(
0
,
numberOfElements
/
2
);
clLocalWorkSize
.
put
(
0
,
LOCAL_SIZE_LIMIT
/
4
);
clLocalWorkSize
.
put
(
0
,
max_work_group_size
/
4
);
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicMergeGlobal
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
...
...
@@ -434,11 +510,13 @@ public class CLUniformHashedGrid {
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
5
,
stride
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
6
,
size
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
7
,
dir
));
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicMergeLocal
,
8
,
max_work_group_size
*
4
));
// local memory
CLInfo
.
checkCLError
(
clSetKernelArg
(
clBitonicMergeLocal
,
9
,
max_work_group_size
*
4
));
// local memory
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
clGlobalWorkSize
.
put
(
0
,
numberOfElements
/
2
);
clLocalWorkSize
.
put
(
0
,
LOCAL_SIZE_LIMIT
/
2
);
clLocalWorkSize
.
put
(
0
,
max_work_group_size
/
2
);
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicMergeLocal
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
...
...
@@ -594,7 +672,7 @@ public class CLUniformHashedGrid {
PointerBuffer
pp
=
stack
.
mallocPointer
(
1
);
clGetDeviceInfo
(
clDevice
,
CL_DEVICE_MAX_WORK_GROUP_SIZE
,
pp
,
null
);
max_work_group_size
=
pp
.
get
(
0
);
max_work_group_size
=
(
int
)
pp
.
get
(
0
);
logger
.
info
(
"CL_DEVICE_MAX_WORK_GROUP_SIZE = "
+
max_work_group_size
);
}
...
...
VadereUtils/tests/org/vadere/util/math/TestBitonicSort.java
View file @
36865d52
...
...
@@ -28,8 +28,8 @@ public class TestBitonicSort {
@Test
public
void
testLocalSort
()
throws
IOException
,
OpenCLException
{
int
[]
keys
=
randomArray
(
2
56
);
int
[]
values
=
randomArray
(
2
56
);
int
[]
keys
=
randomArray
(
3
2
);
int
[]
values
=
randomArray
(
3
2
);
CLBitonicSort
clBitonicSort
=
new
CLBitonicSort
();
clBitonicSort
.
sort
(
keys
,
values
);
...
...
VadereUtils/tests/org/vadere/util/math/TestC
ellGridSor
t.java
→
VadereUtils/tests/org/vadere/util/math/TestC
LLinkedLis
t.java
View file @
36865d52
...
...
@@ -6,7 +6,7 @@ import org.junit.Before;
import
org.junit.Test
;
import
org.vadere.util.geometry.shapes.VPoint
;
import
org.vadere.util.geometry.shapes.VRectangle
;
import
org.vadere.util.opencl.CL
UniformHashedGrid
;
import
org.vadere.util.opencl.CL
LinkedCell
;
import
org.vadere.util.opencl.OpenCLException
;
import
java.io.IOException
;
...
...
@@ -19,7 +19,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author Benedikt Zoennchen
*/
public
class
TestC
ellGridSor
t
{
public
class
TestC
LLinkedLis
t
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
TestConvolution
.
class
);
...
...
@@ -30,7 +30,7 @@ public class TestCellGridSort {
@Test
public
void
testCalcHash
()
throws
IOException
,
OpenCLException
{
CL
UniformHashedGrid
clUniformHashedGrid
=
new
CL
UniformHashedGrid
(
1024
,
new
VRectangle
(
0
,
0
,
10
,
10
),
1
);
CL
LinkedCell
clUniformHashedGrid
=
new
CL
LinkedCell
(
1024
,
new
VRectangle
(
0
,
0
,
10
,
10
),
1
);
ArrayList
<
VPoint
>
positions
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
1024
;
i
++)
{
positions
.
add
(
new
VPoint
(
random
.
nextDouble
()
*
10
,
random
.
nextDouble
()
*
10
));
...
...
@@ -49,7 +49,7 @@ public class TestCellGridSort {
@Test
public
void
testCalcAndSortHash
()
throws
IOException
,
OpenCLException
{
CL
UniformHashedGrid
clUniformHashedGrid
=
new
CL
UniformHashedGrid
(
1024
,
new
VRectangle
(
0
,
0
,
10
,
10
),
1
);
CL
LinkedCell
clUniformHashedGrid
=
new
CL
LinkedCell
(
1024
,
new
VRectangle
(
0
,
0
,
10
,
10
),
1
);
ArrayList
<
VPoint
>
positions
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
1024
;
i
++)
{
positions
.
add
(
new
VPoint
(
random
.
nextDouble
()
*
10
,
random
.
nextDouble
()
*
10
));
...
...
@@ -72,12 +72,12 @@ public class TestCellGridSort {
@Test
public
void
testGridCell
()
throws
IOException
,
OpenCLException
{
CL
UniformHashedGrid
clUniformHashedGrid
=
new
CL
UniformHashedGrid
(
1024
,
new
VRectangle
(
0
,
0
,
10
,
10
),
1
);
CL
LinkedCell
clUniformHashedGrid
=
new
CL
LinkedCell
(
1024
,
new
VRectangle
(
0
,
0
,
10
,
10
),
1
);
ArrayList
<
VPoint
>
positions
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
1024
;
i
++)
{
positions
.
add
(
new
VPoint
(
random
.
nextDouble
()
*
10
,
random
.
nextDouble
()
*
10
));
}
CL
UniformHashedGrid
.
Gri
dCell
s
gridCells
=
clUniformHashedGrid
.
calc
PositionsIn
Cell
(
positions
);
CL
LinkedCell
.
Linke
dCell
gridCells
=
clUniformHashedGrid
.
calc
Linked
Cell
(
positions
);
int
numberOfCells
=
clUniformHashedGrid
.
getGridSize
()[
0
]
*
clUniformHashedGrid
.
getGridSize
()[
1
];
for
(
int
cell
=
0
;
cell
<
numberOfCells
;
cell
++)
{
int
cellStart
=
gridCells
.
cellStarts
[
cell
];
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment