Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
vadere
vadere
Commits
adc98fe6
Commit
adc98fe6
authored
Jul 09, 2018
by
Benedikt Zoennchen
Browse files
implementation of Bitonic sort on the GPU.
parent
caff4ed7
Pipeline
#61189
failed with stage
in 21 seconds
Changes
4
Pipelines
1
Show whitespace changes
Inline
Side-by-side
VadereUtils/resources/BitonicSort.cl
0 → 100644
View file @
adc98fe6
/*
*
Copyright
1993-2010
NVIDIA
Corporation.
All
rights
reserved.
*
*
Please
refer
to
the
NVIDIA
end
user
license
agreement
(
EULA
)
associated
*
with
this
source
code
for
terms
and
conditions
that
govern
your
use
of
*
this
software.
Any
use,
reproduction,
disclosure,
or
distribution
of
*
this
software
and
related
documentation
outside
the
terms
of
the
EULA
*
is
strictly
prohibited.
*
*/
#
define
LOCAL_SIZE_LIMIT
256U
inline
void
ComparatorPrivate
(
uint
*keyA,
uint
*valA,
uint
*keyB,
uint
*valB,
uint
dir
)
{
if
(
(
*keyA
>
*keyB
)
==
dir
)
{
uint
t
;
t
=
*keyA
; *keyA = *keyB; *keyB = t;
t
=
*valA
; *valA = *valB; *valB = t;
}
}
inline
void
ComparatorLocal
(
__local
uint
*keyA,
__local
uint
*valA,
__local
uint
*keyB,
__local
uint
*valB,
uint
dir
)
{
if
(
(
*keyA
>
*keyB
)
==
dir
)
{
uint
t
;
t
=
*keyA
; *keyA = *keyB; *keyB = t;
t
=
*valA
; *valA = *valB; *valB = t;
}
}
////////////////////////////////////////////////////////////////////////////////
//
Monolithic
bitonic
sort
kernel
for
short
arrays
fitting
into
local
memory
////////////////////////////////////////////////////////////////////////////////
__kernel
void
bitonicSortLocal
(
__global
uint
*d_DstKey,
__global
uint
*d_DstVal,
__global
uint
*d_SrcKey,
__global
uint
*d_SrcVal,
uint
arrayLength,
uint
dir
)
{
__local
uint
l_key[LOCAL_SIZE_LIMIT]
;
__local
uint
l_val[LOCAL_SIZE_LIMIT]
;
//Offset
to
the
beginning
of
subbatch
and
load
data
d_SrcKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_SrcVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_DstKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_DstVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
l_key[get_local_id
(
0
)
+
0]
=
d_SrcKey[
0]
;
l_val[get_local_id
(
0
)
+
0]
=
d_SrcVal[
0]
;
l_key[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
d_SrcKey[
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
l_val[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
d_SrcVal[
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
for
(
uint
size
=
2
; size < arrayLength; size <<= 1){
//Bitonic
merge
uint
ddd
=
dir
^
(
(
get_local_id
(
0
)
&
(
size
/
2
))
!=
0
)
;
for
(
uint
stride
=
size
/
2
; stride > 0; stride >>= 1){
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
uint
pos
=
2
*
get_local_id
(
0
)
-
(
get_local_id
(
0
)
&
(
stride
-
1
))
;
ComparatorLocal
(
&l_key[pos
+
0],
&l_val[pos
+
0],
&l_key[pos
+
stride],
&l_val[pos
+
stride],
ddd
)
;
}
}
//ddd
==
dir
for
the
last
bitonic
merge
step
{
for
(
uint
stride
=
arrayLength
/
2
; stride > 0; stride >>= 1){
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
uint
pos
=
2
*
get_local_id
(
0
)
-
(
get_local_id
(
0
)
&
(
stride
-
1
))
;
ComparatorLocal
(
&l_key[pos
+
0],
&l_val[pos
+
0],
&l_key[pos
+
stride],
&l_val[pos
+
stride],
dir
)
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
d_DstKey[
0]
=
l_key[get_local_id
(
0
)
+
0]
;
d_DstVal[
0]
=
l_val[get_local_id
(
0
)
+
0]
;
d_DstKey[
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
l_key[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
d_DstVal[
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
l_val[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
}
////////////////////////////////////////////////////////////////////////////////
//
Bitonic
sort
kernel
for
large
arrays
(
not
fitting
into
local
memory
)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level
bitonic
sort
//Almost
the
same
as
bitonicSortLocal
with
the
only
exception
//of
even
/
odd
subarrays
(
of
LOCAL_SIZE_LIMIT
points
)
being
//sorted
in
opposite
directions
__kernel
void
bitonicSortLocal1
(
__global
uint
*d_DstKey,
__global
uint
*d_DstVal,
__global
uint
*d_SrcKey,
__global
uint
*d_SrcVal
)
{
__local
uint
l_key[LOCAL_SIZE_LIMIT]
;
__local
uint
l_val[LOCAL_SIZE_LIMIT]
;
//Offset
to
the
beginning
of
subarray
and
load
data
d_SrcKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_SrcVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_DstKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_DstVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
l_key[get_local_id
(
0
)
+
0]
=
d_SrcKey[
0]
;
l_val[get_local_id
(
0
)
+
0]
=
d_SrcVal[
0]
;
l_key[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
d_SrcKey[
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
l_val[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
d_SrcVal[
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
uint
comparatorI
=
get_global_id
(
0
)
&
((
LOCAL_SIZE_LIMIT
/
2
)
-
1
)
;
for
(
uint
size
=
2
; size < LOCAL_SIZE_LIMIT; size <<= 1){
//Bitonic
merge
uint
ddd
=
(
comparatorI
&
(
size
/
2
))
!=
0
;
for
(
uint
stride
=
size
/
2
; stride > 0; stride >>= 1){
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
uint
pos
=
2
*
get_local_id
(
0
)
-
(
get_local_id
(
0
)
&
(
stride
-
1
))
;
ComparatorLocal
(
&l_key[pos
+
0],
&l_val[pos
+
0],
&l_key[pos
+
stride],
&l_val[pos
+
stride],
ddd
)
;
}
}
//Odd
/
even
arrays
of
LOCAL_SIZE_LIMIT
elements
//sorted
in
opposite
directions
{
uint
ddd
=
(
get_group_id
(
0
)
&
1
)
;
for
(
uint
stride
=
LOCAL_SIZE_LIMIT
/
2
; stride > 0; stride >>= 1){
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
uint
pos
=
2
*
get_local_id
(
0
)
-
(
get_local_id
(
0
)
&
(
stride
-
1
))
;
ComparatorLocal
(
&l_key[pos
+
0],
&l_val[pos
+
0],
&l_key[pos
+
stride],
&l_val[pos
+
stride],
ddd
)
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
d_DstKey[
0]
=
l_key[get_local_id
(
0
)
+
0]
;
d_DstVal[
0]
=
l_val[get_local_id
(
0
)
+
0]
;
d_DstKey[
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
l_key[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
d_DstVal[
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
l_val[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
}
//Bitonic
merge
iteration
for
'stride
'
>=
LOCAL_SIZE_LIMIT
__kernel
void
bitonicMergeGlobal
(
__global
uint
*d_DstKey,
__global
uint
*d_DstVal,
__global
uint
*d_SrcKey,
__global
uint
*d_SrcVal,
uint
arrayLength,
uint
size,
uint
stride,
uint
dir
)
{
uint
global_comparatorI
=
get_global_id
(
0
)
;
uint
comparatorI
=
global_comparatorI
&
(
arrayLength
/
2
-
1
)
;
//Bitonic
merge
uint
ddd
=
dir
^
(
(
comparatorI
&
(
size
/
2
))
!=
0
)
;
uint
pos
=
2
*
global_comparatorI
-
(
global_comparatorI
&
(
stride
-
1
))
;
uint
keyA
=
d_SrcKey[pos
+
0]
;
uint
valA
=
d_SrcVal[pos
+
0]
;
uint
keyB
=
d_SrcKey[pos
+
stride]
;
uint
valB
=
d_SrcVal[pos
+
stride]
;
ComparatorPrivate
(
&keyA,
&valA,
&keyB,
&valB,
ddd
)
;
d_DstKey[pos
+
0]
=
keyA
;
d_DstVal[pos
+
0]
=
valA
;
d_DstKey[pos
+
stride]
=
keyB
;
d_DstVal[pos
+
stride]
=
valB
;
}
//Combined
bitonic
merge
steps
for
//
'size
'
>
LOCAL_SIZE_LIMIT
and
'stride
'
=
[1
..
LOCAL_SIZE_LIMIT
/
2]
__kernel
void
bitonicMergeLocal
(
__global
uint
*d_DstKey,
__global
uint
*d_DstVal,
__global
uint
*d_SrcKey,
__global
uint
*d_SrcVal,
uint
arrayLength,
uint
stride,
uint
size,
uint
dir
)
{
__local
uint
l_key[LOCAL_SIZE_LIMIT]
;
__local
uint
l_val[LOCAL_SIZE_LIMIT]
;
d_SrcKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_SrcVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_DstKey
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
d_DstVal
+=
get_group_id
(
0
)
*
LOCAL_SIZE_LIMIT
+
get_local_id
(
0
)
;
l_key[get_local_id
(
0
)
+
0]
=
d_SrcKey[
0]
;
l_val[get_local_id
(
0
)
+
0]
=
d_SrcVal[
0]
;
l_key[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
d_SrcKey[
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
l_val[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
d_SrcVal[
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
//Bitonic
merge
uint
comparatorI
=
get_global_id
(
0
)
&
((
arrayLength
/
2
)
-
1
)
;
uint
ddd
=
dir
^
(
(
comparatorI
&
(
size
/
2
))
!=
0
)
;
for
(
; stride > 0; stride >>= 1){
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
uint
pos
=
2
*
get_local_id
(
0
)
-
(
get_local_id
(
0
)
&
(
stride
-
1
))
;
ComparatorLocal
(
&l_key[pos
+
0],
&l_val[pos
+
0],
&l_key[pos
+
stride],
&l_val[pos
+
stride],
ddd
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
d_DstKey[
0]
=
l_key[get_local_id
(
0
)
+
0]
;
d_DstVal[
0]
=
l_val[get_local_id
(
0
)
+
0]
;
d_DstKey[
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
l_key[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
d_DstVal[
(
LOCAL_SIZE_LIMIT
/
2
)
]
=
l_val[get_local_id
(
0
)
+
(
LOCAL_SIZE_LIMIT
/
2
)
]
;
}
\ No newline at end of file
VadereUtils/src/org/vadere/util/opencl/CLBitonicSort.java
0 → 100644
View file @
adc98fe6
package
org.vadere.util.opencl
;
import
org.apache.log4j.LogManager
;
import
org.apache.log4j.Logger
;
import
org.jetbrains.annotations.NotNull
;
import
org.lwjgl.PointerBuffer
;
import
org.lwjgl.opencl.CLContextCallback
;
import
org.lwjgl.opencl.CLProgramCallback
;
import
org.lwjgl.system.Configuration
;
import
org.lwjgl.system.MemoryStack
;
import
org.lwjgl.system.MemoryUtil
;
import
java.io.IOException
;
import
java.nio.ByteBuffer
;
import
java.nio.FloatBuffer
;
import
java.nio.IntBuffer
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_CONTEXT_PLATFORM
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_DEVICE_NAME
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_DEVICE_TYPE_GPU
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_ALLOC_HOST_PTR
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_COPY_HOST_PTR
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_READ_ONLY
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_READ_WRITE
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_MEM_WRITE_ONLY
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_PROGRAM_BUILD_STATUS
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
CL_SUCCESS
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clBuildProgram
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clCreateBuffer
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clCreateCommandQueue
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clCreateContext
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clCreateKernel
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clCreateProgramWithSource
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clEnqueueNDRangeKernel
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clEnqueueReadBuffer
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clEnqueueWriteBuffer
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clFinish
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clGetDeviceIDs
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clGetPlatformIDs
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseCommandQueue
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseContext
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseKernel
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseMemObject
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clReleaseProgram
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clSetKernelArg1i
;
import
static
org
.
lwjgl
.
opencl
.
CL10
.
clSetKernelArg1p
;
import
static
org
.
lwjgl
.
system
.
MemoryStack
.
stackPush
;
import
static
org
.
lwjgl
.
system
.
MemoryUtil
.
NULL
;
import
static
org
.
lwjgl
.
system
.
MemoryUtil
.
memUTF8
;
/**
* @author Benedikt Zoennchen
*/
public
class
CLBitonicSort
{
private
static
Logger
log
=
LogManager
.
getLogger
(
CLBitonicSort
.
class
);
// CL ids
private
long
clPlatform
;
private
long
clDevice
;
private
long
clContext
;
private
long
clQueue
;
private
long
clProgram
;
// CL Memory
private
long
clInKeys
;
private
long
clOutKeys
;
private
long
clInValues
;
private
long
clOutValues
;
// Host Memory
private
IntBuffer
inKeys
;
private
IntBuffer
outKeys
;
private
IntBuffer
inValues
;
private
IntBuffer
outValues
;
private
ByteBuffer
source
;
// CL callbacks
private
CLContextCallback
contextCB
;
private
CLProgramCallback
programCB
;
// CL kernel
private
long
clBitonicSortLocal
;
private
long
clBitonicSortLocal1
;
private
long
clBitonicMergeGlobal
;
private
long
clBitonicMergeLocal
;
private
long
clKernel
;
private
int
[]
keys
;
private
int
[]
values
;
private
int
[]
resultValues
;
private
int
[]
resultKeys
;
//Note: logically shared with BitonicSort.cl!
private
static
final
int
LOCAL_SIZE_LIMIT
=
256
;
private
boolean
debug
=
false
;
public
enum
KernelType
{
Separate
,
Col
,
Row
,
NonSeparate
}
public
CLBitonicSort
()
throws
OpenCLException
{
if
(
debug
)
{
Configuration
.
DEBUG
.
set
(
true
);
Configuration
.
DEBUG_MEMORY_ALLOCATOR
.
set
(
true
);
Configuration
.
DEBUG_STACK
.
set
(
true
);
}
init
();
}
public
int
[]
getResultKeys
()
{
return
resultKeys
;
}
public
int
[]
getResultValues
()
{
return
resultValues
;
}
public
void
init
()
throws
OpenCLException
{
initCallbacks
();
initCL
();
buildProgram
();
}
public
void
sort
(
@NotNull
final
int
[]
keys
,
@NotNull
final
int
[]
values
)
throws
OpenCLException
{
assert
factorRadix2
(
keys
.
length
)
==
1
&&
keys
.
length
==
values
.
length
;
inKeys
=
CLUtils
.
toIntBuffer
(
keys
,
CLUtils
.
toIntBuffer
(
keys
));
outKeys
=
CLUtils
.
toIntBuffer
(
keys
);
inValues
=
CLUtils
.
toIntBuffer
(
values
,
CLUtils
.
toIntBuffer
(
values
));
outValues
=
CLUtils
.
toIntBuffer
(
values
);
try
(
MemoryStack
stack
=
stackPush
())
{
int
dir
=
1
;
PointerBuffer
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
PointerBuffer
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
IntBuffer
errcode_ret
=
stack
.
callocInt
(
1
);
// host memory to gpu memory
clInKeys
=
clCreateBuffer
(
clContext
,
CL_MEM_READ_WRITE
|
CL_MEM_ALLOC_HOST_PTR
|
CL_MEM_COPY_HOST_PTR
,
inKeys
,
errcode_ret
);
CLInfo
.
checkCLError
(
errcode_ret
);
clOutKeys
=
clCreateBuffer
(
clContext
,
CL_MEM_READ_WRITE
,
4
*
keys
.
length
,
errcode_ret
);
CLInfo
.
checkCLError
(
errcode_ret
);
clInValues
=
clCreateBuffer
(
clContext
,
CL_MEM_READ_WRITE
|
CL_MEM_ALLOC_HOST_PTR
|
CL_MEM_COPY_HOST_PTR
,
inValues
,
errcode_ret
);
CLInfo
.
checkCLError
(
errcode_ret
);
clOutValues
=
clCreateBuffer
(
clContext
,
CL_MEM_READ_WRITE
,
4
*
keys
.
length
,
errcode_ret
);
CLInfo
.
checkCLError
(
errcode_ret
);
// small sorts
if
(
keys
.
length
<=
LOCAL_SIZE_LIMIT
)
{
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
0
,
clOutKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
1
,
clOutValues
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
2
,
clInKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal
,
3
,
clInValues
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicSortLocal
,
4
,
keys
.
length
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicSortLocal
,
5
,
1
));
clGlobalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
clLocalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
// run the kernel and read the result
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicSortLocal
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
}
else
{
//Launch bitonicSortLocal1
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
0
,
clOutKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
1
,
clOutValues
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
2
,
clInKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicSortLocal1
,
3
,
clInValues
));
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
clGlobalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
clLocalWorkSize
.
put
(
0
,
LOCAL_SIZE_LIMIT
/
2
);
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicSortLocal1
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
for
(
int
size
=
2
*
LOCAL_SIZE_LIMIT
;
size
<=
keys
.
length
;
size
<<=
1
)
{
for
(
int
stride
=
size
/
2
;
stride
>
0
;
stride
>>=
1
)
{
if
(
stride
>=
LOCAL_SIZE_LIMIT
)
{
//Launch bitonicMergeGlobal
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeGlobal
,
0
,
clOutKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeGlobal
,
1
,
clOutValues
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeGlobal
,
2
,
clOutKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeGlobal
,
3
,
clOutValues
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeGlobal
,
4
,
keys
.
length
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeGlobal
,
5
,
size
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeGlobal
,
6
,
stride
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeGlobal
,
7
,
dir
));
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
clGlobalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
clLocalWorkSize
.
put
(
0
,
LOCAL_SIZE_LIMIT
/
4
);
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicMergeGlobal
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
}
else
{
//Launch bitonicMergeLocal
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeLocal
,
0
,
clOutKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeLocal
,
1
,
clOutValues
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeLocal
,
2
,
clOutKeys
));
CLInfo
.
checkCLError
(
clSetKernelArg1p
(
clBitonicMergeLocal
,
3
,
clOutValues
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
4
,
keys
.
length
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
5
,
stride
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
6
,
size
));
CLInfo
.
checkCLError
(
clSetKernelArg1i
(
clBitonicMergeLocal
,
7
,
dir
));
clGlobalWorkSize
=
stack
.
callocPointer
(
1
);
clLocalWorkSize
=
stack
.
callocPointer
(
1
);
clGlobalWorkSize
.
put
(
0
,
keys
.
length
/
2
);
clLocalWorkSize
.
put
(
0
,
LOCAL_SIZE_LIMIT
/
2
);
CLInfo
.
checkCLError
(
clEnqueueNDRangeKernel
(
clQueue
,
clBitonicMergeLocal
,
1
,
null
,
clGlobalWorkSize
,
clLocalWorkSize
,
null
,
null
));
CLInfo
.
checkCLError
(
clFinish
(
clQueue
));
break
;
}
}
}
}
clEnqueueReadBuffer
(
clQueue
,
clOutKeys
,
true
,
0
,
outKeys
,
null
,
null
);
clEnqueueReadBuffer
(
clQueue
,
clOutValues
,
true
,
0
,
outValues
,
null
,
null
);
resultKeys
=
CLUtils
.
toIntArray
(
outKeys
,
keys
.
length
);
resultValues
=
CLUtils
.
toIntArray
(
outValues
,
values
.
length
);
}
clearCL
();
}
static
long
factorRadix2
(
long
L
){
if
(
L
==
0
){
return
0
;
}
else
{
for
(
int
log2L
=
0
;
(
L
&
1
)
==
0
;
L
>>=
1
,
log2L
++);
return
L
;
}
}
public
void
clear
()
throws
OpenCLException
{
clearMemory
();
}
private
void
clearMemory
()
throws
OpenCLException
{
// release memory and devices
try
{
CLInfo
.
checkCLError
(
clReleaseMemObject
(
clInKeys
));
CLInfo
.
checkCLError
(
clReleaseMemObject
(
clOutKeys
));
CLInfo
.
checkCLError
(
clReleaseMemObject
(
clInValues
));
CLInfo
.
checkCLError
(
clReleaseMemObject
(
clOutValues
));
}
catch
(
OpenCLException
ex
)
{
throw
ex
;
}
finally
{
MemoryUtil
.
memFree
(
inKeys
);
MemoryUtil
.
memFree
(
outKeys
);
MemoryUtil
.
memFree
(
inValues
);
MemoryUtil
.
memFree
(
inKeys
);
MemoryUtil
.
memFree
(
source
);
}
}
private
void
clearCL
()
throws
OpenCLException
{
CLInfo
.
checkCLError
(
clReleaseKernel
(
clBitonicSortLocal
));
CLInfo
.
checkCLError
(
clReleaseKernel
(
clBitonicSortLocal1
));
CLInfo
.
checkCLError
(
clReleaseKernel
(
clBitonicMergeGlobal
));
CLInfo
.
checkCLError
(
clReleaseKernel
(
clBitonicMergeLocal
));
CLInfo
.
checkCLError
(
clReleaseCommandQueue
(
clQueue
));
CLInfo
.
checkCLError
(
clReleaseProgram
(
clProgram
));
CLInfo
.
checkCLError
(
clReleaseContext
(
clContext
));
contextCB
.
free
();
programCB
.
free
();
}
// private helpers
private
void
initCallbacks
()
{
contextCB
=
CLContextCallback
.
create
((
errinfo
,
private_info
,
cb
,
user_data
)
->
{
log
.
debug
(
"[LWJGL] cl_context_callback"
+
"\tInfo: "
+
memUTF8
(
errinfo
));
});
programCB
=
CLProgramCallback
.
create
((
program
,
user_data
)
->
{
try
{
log
.
debug
(
"The cl_program [0x"
+
program
+
"] was built "
+
(
CLInfo
.
getProgramBuildInfoInt
(
program
,
clDevice
,
CL_PROGRAM_BUILD_STATUS
)
==
CL_SUCCESS
?
"successfully"
:
"unsuccessfully"
));
}
catch
(
OpenCLException
e
)
{
e
.
printStackTrace
();
}
});
}
private
void
initCL
()
throws
OpenCLException
{
try
(
MemoryStack
stack
=
stackPush
())
{
IntBuffer
errcode_ret
=
stack
.
callocInt
(
1
);
IntBuffer
numberOfPlatforms
=
stack
.
mallocInt
(
1
);
CLInfo
.
checkCLError
(
clGetPlatformIDs
(
null
,
numberOfPlatforms
));
PointerBuffer
platformIDs
=
stack
.
mallocPointer
(
numberOfPlatforms
.
get
(
0
));
CLInfo
.
checkCLError
(
clGetPlatformIDs
(
platformIDs
,
numberOfPlatforms
));
clPlatform
=
platformIDs
.
get
(
0
);
IntBuffer
numberOfDevices
=
stack
.
mallocInt
(
1
);
CLInfo
.
checkCLError
(
clGetDeviceIDs
(
clPlatform
,
CL_DEVICE_TYPE_GPU
,
null
,
numberOfDevices
));
PointerBuffer
deviceIDs
=
stack
.
mallocPointer
(
numberOfDevices
.
get
(
0
));
CLInfo
.
checkCLError
(
clGetDeviceIDs
(
clPlatform
,
CL_DEVICE_TYPE_GPU
,
deviceIDs
,
numberOfDevices
));
clDevice
=
deviceIDs
.
get
(
0
);
log
.
debug
(
"CL_DEVICE_NAME = "
+
CLInfo
.
getDeviceInfoStringUTF8
(
clDevice
,
CL_DEVICE_NAME
));
PointerBuffer
ctxProps
=
stack
.
mallocPointer
(
3
);
ctxProps
.
put
(
CL_CONTEXT_PLATFORM
)
.
put
(
clPlatform
)
.
put
(
NULL
)
.
flip
();
clContext
=
clCreateContext
(
ctxProps
,
clDevice
,
contextCB
,
NULL
,
errcode_ret
);
CLInfo