Commit c9d74f71 authored by Benedikt Zoennchen

Update the GPU triangulation algorithm.

parent 8b3b2836
......@@ -2,6 +2,10 @@
//IDistanceFunction distanceFunc = p -> Math.abs(6 - Math.sqrt(p.getX() * p.getX() + p.getY() * p.getY())) - 4;
// abs(6 - length(v))-4
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
// Try to acquire the lock word pointed to by a. atomic_cmpxchg writes 1 only when the
// word currently holds 0 and evaluates to the value it found, so the result is 0 when
// the lock was acquired and non-zero when it was already held.
#define LOCK(a) atomic_cmpxchg(a, 0, 1)
// Release the lock word pointed to by a by atomically storing 0 into it.
// The expression evaluates to the value that was stored there before.
#define UNLOCK(a) atomic_xchg(a, 0)
inline void atomicAdd_g_f(volatile __global float2 *addr, float2 val) {
union{
unsigned int u32;
......@@ -20,7 +24,8 @@ kernel void computeForces(
__global int4* edges,
__global float2* lengths,
__global float* scalingFactor,
__global float2* forces)
__global float2* forces,
__global int* mutexes)
{
int i = get_global_id(0);
int p1Index = edges[i].s0;
......@@ -38,12 +43,15 @@ kernel void computeForces(
float2 partialForce = v * lenDiff; // TODO;
atomicAdd_g_f(&forces[p1Index], partialForce.s0);
//atomicAdd_g_f(&forces[p1Index].s1, partialForce.s1);
volatile __global int* addr = &mutexes[p1Index];
//forces[edges[i].s0] = forces[edges[i].s0] + partialForce; // TODO sync
//forces[edges[i].s0] = (float2) (1.0f, 1.0f);
//forces[edges[i].s1] = forces[edges[i].s1] - partialForce; // TODO sync
//int waiting = 1;
//while (waiting) {
// while (LOCK(addr)) {}
forces[p1Index] = forces[p1Index] + partialForce;
// UNLOCK(addr);
// waiting = 0;
//}
}
kernel void moveVertices(__global float2* vertices, __global float2* forces, const float delta) {
......@@ -93,7 +101,6 @@ kernel void computeLengths(
qLengths[i] = (float2) (length(v)*length(v), desiredLen*desiredLen);
}
// kernel for multiple work-groups
kernel void computePartialSF(__const int size, __global float2* qlengths, __local float2* partialSums, __global float2* output) {
int gid = get_global_id(0);
......@@ -112,7 +119,7 @@ kernel void computePartialSF(__const int size, __global float2* qlengths, __loca
int group_size = get_local_size(0);
float2 len = accumulator;
//float2 len = (float2)(1.f, 1.0f);
//float2 len = (float2)(1.0f, 1.0f);
partialSums[lid] = len;
barrier(CLK_LOCAL_MEM_FENCE);
......
......@@ -44,13 +44,14 @@ kernel void computeForces(
volatile __global int* addr = &mutexes[p1Index];
// TODO: does this synchronization work properly? Does it synchronize too much?
int waiting = 1;
while (waiting) {
while (LOCK(addr)) {}
//int waiting = 1;
//while (waiting) {
// while (LOCK(addr)) {}
forces[p1Index] = forces[p1Index] + partialForce;
UNLOCK(addr);
waiting = 0;
}
// UNLOCK(addr);
// waiting = 0;
//}
}
// Absolute value of a double.
// Negative inputs are negated; every other input is returned unchanged. That
// includes -0.0 and NaN, because the comparison `d < 0` is false for both.
// The function is `static` so the definition has internal linkage: a plain
// C99-style `inline` definition supplies no external definition, which can
// leave a call unresolved whenever the compiler decides not to inline it.
static inline double dabs(double d) {
    return d < 0 ? -d : d;
}
......
......@@ -112,7 +112,7 @@ public class CLDistMesh {
private IntBuffer mutexes;
private AMesh<? extends MPoint> mesh;
private boolean doublePrecision = true;
private boolean doublePrecision = false;
public CLDistMesh(@NotNull AMesh<? extends MPoint> mesh) {
this.mesh = mesh;
......@@ -191,8 +191,10 @@ public class CLDistMesh {
PointerBuffer pp = stack.mallocPointer(1);
clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, pp, null);
maxGroupSize = pp.get(0);
log.info("CL_DEVICE_MAX_WORK_GROUP_SIZE = " + maxGroupSize);
clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, pp, null);
maxComputeUnits = pp.get(0);
log.info("CL_DEVICE_MAX_COMPUTE_UNITS = " + maxComputeUnits);
}
private void buildProgram() {
......@@ -255,9 +257,11 @@ public class CLDistMesh {
private void initialKernelArgs() {
int factor = doublePrecision ? 8 : 4;
int sizeSFPartial = numberOfEdges;
IntBuffer intBuffer = stack.callocInt(1);
IntBuffer intBuffer = stack.callocInt(2);
log.info("CL_DEVICE_TYPE = " + CLInfo.getDeviceInfoPointer(clDevice, CL_DEVICE_TYPE));
clGetKernelWorkGroupInfo(clKernelPartialSF, clDevice, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, intBuffer, null);
prefdWorkGroupSizeMultiple = intBuffer.get(0);
log.info("prefWorkGroupSizeMultiple = " + prefdWorkGroupSizeMultiple);
clSetKernelArg1p(clKernelLengths, 0, clVertices);
clSetKernelArg1p(clKernelLengths, 1, clEdges);
......@@ -305,6 +309,7 @@ public class CLDistMesh {
clGloblWorkSizeSFComplete.put(0, ceilPowerOf2(sizeSFComplete));
clLocalWorkSizeSFComplete.put(0, ceilPowerOf2(sizeSFComplete));
clLocalWorkSizeForces.put(0, 1);
clGlobalWorkSizeForces.put(0, numberOfEdges);
clGlobalWorkSizeEdges = BufferUtils.createPointerBuffer(1);
......@@ -333,9 +338,9 @@ public class CLDistMesh {
clFinish(clQueue);
// TODO: remove, it's only for testing!
readResult();
//readResult();
//printResult();
updateMesh();
//updateMesh();
}
private void readResult() {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment