...
 
Commits (11)
......@@ -39,6 +39,6 @@ __kernel void dft(
out[i] = tot;
} else {
// backward transform (frequential -> space)
out[i] = tot / (double)length;
out[i] = (float2) (tot.x / (double)length, tot.y/(double) length);
}
}
\ No newline at end of file
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void dft1Dim(const __global float2 * input, __global float2 * output,
const int length, const int direction)
{
int N = length; // only halft the length because of float2 vector
int i = get_global_id(0);
if (i >= N)
return;
float2 sum = (float2)(0.0f,0.0f);
float param = (-2*direction * i) * M_PI_F / (float) N;
for (int k = 0; k < N; ++k) {
float2 value = input[k];
float c;
float s = sincos(k* param, &c);
sum += (float2)(value.x * c - value.y*s,value.x * s + value.y*c);
}
if (direction == 1) {
output[i] = sum;
} else {
output[i] = (float2)(sum.x/(float)N,sum.y/(float)N);
}
}
\ No newline at end of file
__kernel void fft(const __global float2* g_data, __local float2* l_data, uint points_per_group, uint size, int dir)
{
__kernel void fft(const __global float2 * g_data, __local float2 * l_data, uint size, int dir) {
int workitems_per_workgroup = get_local_size(0);
int number_of_workgroups = get_num_groups(0);
int points_per_group = size/number_of_workgroups;
int points_per_item = points_per_group / get_local_size(0);
int l_addr = get_local_id(0) * points_per_item;
int g_addr = get_group_id(0) * points_per_group + l_addr;
int num_vectors = N/4;
uint4 index, br;
uint maks_left, mask_right, shift_pos;
uint mask_left, mask_right, shift_pos;
float2 x1, x2, x3, x4;
for(int i = 0; i < points_per_item; i += 4) {
index = (uint4) (g_addr, g_addr + 1, g_addr + 2, g_addr + 3);
mask_left = size / 2;
mask_right = 1;
shift_pos = log2(size) - 1;
shift_pos = (int)log2((float) size) - 1;
br = (index << shift_pos) & mask_left;
br |= (index >> shift_pos) & mask_right;
......@@ -31,10 +33,10 @@ __kernel void fft(const __global float2* g_data, __local float2* l_data, uint p
x3 = g_data[br.s2];
x4 = g_data[br.s3];
sum12 = x1 + x2;
diff12 = x1 - x2;
sum34 = x3 + x4;
diff34 = (float2) (x3.s1 - x4.s1, x4.s0 - x3.s0) * dir;
float2 sum12 = x1 + x2;
float2 diff12 = x1 - x2;
float2 sum34 = x3 + x4;
float2 diff34 = (float2) (x3.s1 - x4.s1, x4.s0 - x3.s0) * dir;
l_data[l_addr] = sum12 + sum34;
l_data[l_addr+1] = diff12 + diff34;
......@@ -55,8 +57,8 @@ __kernel void fft(const __global float2* g_data, __local float2* l_data, uint p
l_data[l_addr + N2] = x1 - l_data[l_addr + N2];
for(int i = 1; i < N2; i++) {
cosine = cos(M_PI_F * i/N2);
sine = dir * sin(M_PI_F * i/N2);
float param = (M_PI_F * i)/N2;
sine = sincos(param,&cosine);
wk = (float2) (
l_data[l_addr + N2 + i].s0 * cosine +
l_data[l_addr + N2 + i].s1 * sine,
......@@ -70,15 +72,18 @@ __kernel void fft(const __global float2* g_data, __local float2* l_data, uint p
}
barrier(CLK_LOCAL_MEM_FENCE);
unit start, angle, stage;
uint start, angle, stage;
stage = 2;
for(int N2 = points_per_item; N2 < points_per_group; N2 <<=1) {
start = (get_local_id(0) + (get_local_id(0)/stage)*stage) * (points_per_item/2);
angle = start % (N2*2);
for(int i = start; i < start + points_per_item / 2; i++) {
cosine = cos(M_PI_F * angle/N2);
sine = dir * sin(M_PI_F * angle/N2);
float param = M_PI_F * angle/N2;
sine = sincos(param,&cosine);
sine = direction*sine;
wk = (float2) (
l_data[N2 + i].s0 * cosine +
l_data[N2 + i].s1 * sine,
......@@ -91,4 +96,23 @@ __kernel void fft(const __global float2* g_data, __local float2* l_data, uint p
stage <<= 1;
barrier(CLK_LOCAL_MEM_FENCE);
}
// write back into input
barrier(CLK_GLOBAL_MEM_FENCE);
l_addr = get_local_id(0)*points_per_item; // address within one workitem
g_addr = get_group_id(0)*points_per_group + l_addr;
float factor = (1/(float)N);
for (int i = 0; i < points_per_item; ++i) {
int index = g_addr+i;
if (direction == 1) {
g_data[index] = l_data[l_addr+i];
} else { // inverse FFT
float2 val = l_data[l_addr+i];
g_data[index] = (float2)(factor*val.x,factor*val.y);
}
}
}
\ No newline at end of file
__kernel void fft1Dim(const __global float2 *input,
__local float2 *local_data,
__global float2 *output,
const uint points_per_group,
const uint N,
const int direction) {
int g_id = get_global_id(0); // Returns the element of the work-item’s global ID for a given dimension
if (g_id >= N)
return;
int local_size = get_local_size(0); // number of work items in workgroup
int points_per_item = points_per_group/local_size; // points in one workitem
int l_addr = get_local_id(0)*points_per_item; // address within one workitem
int global_addr = get_group_id(0)*points_per_group + l_addr; // address within the hole workgroup
uint4 index, br;
uint mask_left, mask_right, shift_pos;
float2 x1, x2, x3, x4;
for(int i = 0; i < points_per_item; i += 4) {
index = (uint4) (global_addr, global_addr + 1, global_addr + 2, global_addr + 3);
mask_left = N / 2;
mask_right = 1;
shift_pos = (int)log2((float)N) - 1;
br = (index << shift_pos) & mask_left;
br |= (index >> shift_pos) & mask_right;
while(shift_pos > 1) {
shift_pos -= 2;
mask_left >>= 1;
mask_right <<= 1;
br |= (index << shift_pos) & mask_left;
br |= (index >> shift_pos) & mask_right;
}
x1 = input[br.s0];
x2 = input[br.s1];
x3 = input[br.s2];
x4 = input[br.s3];
float2 sum12 = x1 + x2;
float2 diff12 = x1 - x2;
float2 sum34 = x3 + x4;
float2 diff34 = (float2) (x3.y - x4.y, x4.x - x3.x)*direction; // turned aroung because of w1 (0,-i)
local_data[l_addr] = sum12 + sum34;
local_data[l_addr+1] = diff12 + diff34;
local_data[l_addr+2] = sum12 + (float2)(-sum34.x,sum34.y)*direction; // multiply by w2
local_data[l_addr+3] = diff12 + diff34;
global_addr += 4;
l_addr += 4;
}
float cosine, sine;
float2 wk;
for(int N2 = 4; N2 < points_per_item; N2 <<=1) {
l_addr = get_local_id(0) * points_per_item;
for(int fft_index = 0; fft_index < points_per_item; fft_index += 2*N2) {
x1 = local_data[l_addr];
local_data[l_addr] += local_data[l_addr + N2];
local_data[l_addr + N2] = x1 - local_data[l_addr + N2];
for(int i = 1; i < N2; i++) {
float param = (M_PI_F * i)/N2;
sine = sincos(param,&cosine);
wk = (float2) (
local_data[l_addr + N2 + i].s0 * cosine +
local_data[l_addr + N2 + i].s1 * sine,
local_data[l_addr + N2 + i].s1 * cosine -
local_data[l_addr + N2 + i].s0 * sine);
local_data[l_addr + N2 + i] = local_data[l_addr + i] - wk;
local_data[l_addr + i] += wk;
}
l_addr += 2*N2;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
uint start, angle, stage;
stage = 2;
for(int N2 = points_per_item; N2 < points_per_group; N2 <<=1) {
start = (get_local_id(0) + (get_local_id(0)/stage)*stage) * (points_per_item/2);
angle = start % (N2*2);
for(int i = start; i < start + points_per_item / 2; i++) {
float param = M_PI_F * angle/N2;
sine = sincos(param,&cosine);
sine = direction*sine;
wk = (float2) (
local_data[N2 + i].s0 * cosine +
local_data[N2 + i].s1 * sine,
local_data[N2 + i].s1 * cosine -
local_data[N2 + i].s0 * sine);
local_data[N2 + i] = local_data[i] - wk;
local_data[i] += wk;
angle++;
}
stage <<= 1;
barrier(CLK_LOCAL_MEM_FENCE);
}
// data put back into global memory
l_addr = get_local_id(0)*points_per_item; // address within one workitem
global_addr = get_group_id(0)*points_per_group + l_addr; // address within the hole workgroup
for (int i = 0; i < points_per_item; ++i) {
if (direction > 0) {
output[global_addr+i] = local_data[l_addr+i];
} else {
float2 value = local_data[l_addr+i];
double factor = (1/(double)N);
output[global_addr+i] = (float2)(factor*value.x,factor*value.y);
}
}
}
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
......@@ -80,7 +80,6 @@ public class Convolution {
return floatGaussian2DKernel(size, sigma, defaultNominator);
}
public static float[] generateDoubleDensityGaussianKernel(final int size, final float sigma) {
return floatGaussian2DKernel(size, sigma, densityNominator);
}
......@@ -93,6 +92,16 @@ public class Convolution {
return matrix;
}
public static float[] generdateInputMatrix(final int size, int seed) {
Random random = new Random(seed);
float[] matrix = new float[size];
for (int i = 0; i < size; i++) {
matrix[i] = (float) random.nextDouble();
}
return matrix;
}
public static float[] convolve(final float[] inMatrix,
final float[] kernelMatrix,
final int nWidth,
......
package org.vadere.util.opencl;
import org.apache.log4j.Logger;
import org.lwjgl.PointerBuffer;
import org.lwjgl.opencl.CLContextCallback;
import org.lwjgl.opencl.CLProgramCallback;
import org.lwjgl.system.MemoryStack;
import java.nio.IntBuffer;
import static org.lwjgl.opencl.CL10.*;
import static org.lwjgl.system.MemoryStack.stackPush;
import static org.lwjgl.system.MemoryUtil.NULL;
import static org.lwjgl.system.MemoryUtil.memUTF8;
public interface CLBase {
public void init() throws OpenCLException;
public void initCallbacks();
public abstract void initCL() throws OpenCLException;
public abstract void buildProgram() throws OpenCLException;
public abstract void clearMemory() throws OpenCLException;
public void clearCL() throws OpenCLException;
}
......@@ -14,6 +14,7 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.LongBuffer;
import static org.lwjgl.opencl.CL10.*;
import static org.lwjgl.system.MemoryStack.stackPush;
......@@ -63,6 +64,7 @@ public class CLConvolution {
private float[] kernel;
private KernelType type;
private boolean debug = false;
private boolean profiling = false;
public enum KernelType {
Separate,
......@@ -157,8 +159,12 @@ public class CLConvolution {
PointerBuffer ev = stack.callocPointer(1);
// run the kernel and read the result
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clKernelConvolveCol, 2, null, clGlobalWorkSizeEdges, null, null, null));
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clKernelConvolveRow, 2, null, clGlobalWorkSizeEdges, null, null, null));
//CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clKernelConvolveCol, 2, null, clGlobalWorkSizeEdges, null, null, null));
//CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clKernelConvolveRow, 2, null, clGlobalWorkSizeEdges, null, null, null));
int err_code = (int)enqueueNDRangeKernel("clKernelConvolveCol",clQueue, clKernelConvolveCol, 2, null, clGlobalWorkSizeEdges, null, null, null);
CLInfo.checkCLError(err_code);
err_code = (int) enqueueNDRangeKernel("clKernelConvolveRow",clQueue, clKernelConvolveRow, 2, null, clGlobalWorkSizeEdges, null, null, null);
CLInfo.checkCLError(err_code);
clFinish(clQueue);
}
}
......@@ -170,11 +176,43 @@ public class CLConvolution {
clGlobalWorkSizeEdges.put(1, matrixHeight);
// run the kernel and read the result
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clKernel, 2, null, clGlobalWorkSizeEdges, null, null, null));
//CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clKernel, 2, null, clGlobalWorkSizeEdges, null, null, null));
int err_code = (int) enqueueNDRangeKernel("convolve",clQueue, clKernel, 2, null, clGlobalWorkSizeEdges, null, null, null);
CLInfo.checkCLError(err_code);
CLInfo.checkCLError(clFinish(clQueue));
}
}
private PointerBuffer clEvent = MemoryUtil.memAllocPointer(1);
LongBuffer startTime;
LongBuffer endTime;
PointerBuffer retSize;
private long enqueueNDRangeKernel(final String name, long command_queue, long kernel, int work_dim, PointerBuffer global_work_offset, PointerBuffer global_work_size, PointerBuffer local_work_size, PointerBuffer event_wait_list, PointerBuffer event) {
if(profiling) {
try (MemoryStack stack = MemoryStack.stackPush()){
retSize = stack.mallocPointer(1);
startTime = stack.mallocLong(1);
endTime = stack.mallocLong(1);
long result = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, event_wait_list, clEvent);
clWaitForEvents(clEvent);
long eventAddr = clEvent.get();
clGetEventProfilingInfo(eventAddr, CL_PROFILING_COMMAND_START, startTime, retSize);
clGetEventProfilingInfo(eventAddr, CL_PROFILING_COMMAND_END, endTime, retSize);
clEvent.clear();
// in nanaSec
log.info(name + " " + (endTime.get() - startTime.get()));
endTime.clear();
startTime.clear();
return result;
}
}
else {
return clEnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, null, null);
}
}
private void setArguments(final long clKernel) throws OpenCLException {
try (MemoryStack stack = stackPush()) {
IntBuffer errcode_ret = stack.callocInt(1);
......@@ -300,7 +338,11 @@ public class CLConvolution {
clContext = clCreateContext(ctxProps, clDevice, contextCB, NULL, errcode_ret);
CLInfo.checkCLError(errcode_ret);
clQueue = clCreateCommandQueue(clContext, clDevice, 0, errcode_ret);
if (profiling) {
clQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, errcode_ret);
} else {
clQueue = clCreateCommandQueue(clContext, clDevice, 0, errcode_ret);
}
CLInfo.checkCLError(errcode_ret);
}
}
......
This diff is collapsed.
......@@ -88,6 +88,7 @@ public final class CLDemo {
//System.out.println("\tCL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = " + InfoUtils.getDeviceInfoPointer(device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE));
System.out.println("\tCL_DEVICE_MAX_WORK_ITEM_DIMENSIONS = " + CLInfo.getDeviceInfoInt(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
System.out.println("\tCL_DEVICE_MAX_WORK_GROUP_SIZE = " + CLInfo.getDeviceInfoPointer(device, CL_DEVICE_MAX_WORK_GROUP_SIZE));
System.out.println("\tCL_DEVICE_LOCAL_MEM_SIZE = " + CLInfo.getDeviceInfoPointer(device,CL_DEVICE_LOCAL_MEM_SIZE));
System.out.println("\tCL_DEVICE_MAX_CLOCK_FREQUENCY = " + CLInfo.getDeviceInfoInt(device, CL_DEVICE_MAX_CLOCK_FREQUENCY));
System.out.println("\tCL_DEVICE_ADDRESS_BITS = " + CLInfo.getDeviceInfoInt(device, CL_DEVICE_ADDRESS_BITS));
System.out.println("\tCL_DEVICE_AVAILABLE = " + (CLInfo.getDeviceInfoInt(device, CL_DEVICE_AVAILABLE) != 0));
......
This diff is collapsed.
......@@ -114,13 +114,26 @@ public final class CLInfo {
checkCLError(clGetProgramBuildInfo(cl_program_id, cl_device_id, param_name, (ByteBuffer)null, pp));
int bytes = (int)pp.get(0);
ByteBuffer buffer = stack.malloc(bytes);
ByteBuffer buffer = stack.malloc(bytes+1000);
checkCLError(clGetProgramBuildInfo(cl_program_id, cl_device_id, param_name, buffer, null));
return memASCII(buffer, bytes - 1);
}
}
public static String getKernelInfoStringASCII(long cl_kernel_id, int param_name) throws OpenCLException {
try (MemoryStack stack = stackPush()) {
PointerBuffer pp = stack.mallocPointer(1);
checkCLError(clGetKernelInfo(cl_kernel_id,param_name, (ByteBuffer)null,pp));
int bytes = (int)pp.get(0);
ByteBuffer buffer = stack.malloc(bytes);
checkCLError(clGetKernelInfo(cl_kernel_id,param_name,buffer,null));
return memASCII(buffer,bytes-1);
}
}
public static void checkCLError(IntBuffer errcode) throws OpenCLException {
checkCLError(errcode.get(errcode.position()));
}
......
......@@ -7,6 +7,7 @@ import org.lwjgl.system.MemoryUtil;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.DoubleBuffer;
import java.nio.FloatBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
......@@ -75,6 +76,18 @@ public class CLUtils {
return floatBuffer;
}
public static DoubleBuffer toDoubleBuffer(@NotNull final double[] doubles) {
DoubleBuffer doubleBuffer = MemoryUtil.memAllocDouble(doubles.length);
return toDoubleBuffer(doubles, doubleBuffer);
}
public static DoubleBuffer toDoubleBuffer(@NotNull final double[] doubles, @NotNull final DoubleBuffer doubleBuffer) {
for(int i = 0; i < doubles.length; i++) {
doubleBuffer.put(i, doubles[i]);
}
return doubleBuffer;
}
public static float[] toFloatArray(@NotNull final FloatBuffer floatBuffer, final int size) {
float[] result = new float[size];
for(int i = 0; i < size; i++) {
......@@ -83,6 +96,15 @@ public class CLUtils {
return result;
}
public static double[] toDoubleArray(@NotNull final DoubleBuffer doubleBuffer, final int size) {
double[] result = new double[size];
for(int i = 0; i < size; i++) {
result[i] = doubleBuffer.get(i);
}
return result;
}
private static ByteBuffer resizeBuffer(@NotNull final ByteBuffer buffer, int newCapacity) {
ByteBuffer newBuffer = MemoryUtil.memCalloc(newCapacity);
buffer.flip();
......
package org.vadere.util.math;
import org.junit.Test;
import org.vadere.util.opencl.CLDFT;
import org.vadere.util.opencl.OpenCLException;
import java.util.Arrays;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class TestCLDFT {
private float[] inMatrix = new float[] {
2, 1, 1, 1,
1, 1, 1, -1,
1, 1, 1, 1,
1, 1, 1, 0
};
private static final double EPS = Math.pow(10.0,-3.0);
@Test
public void testSimpleDFT() throws OpenCLException {
int N = 256;
float[] input = new float[2*N];
for (int i = 0; i < input.length; ++i) {
input[i] = i%2 == 0 ? 1 : 0;
}
System.out.println("Input: "+Arrays.toString(input));
CLDFT clfft = new CLDFT(input.length,N, CLDFT.TransformationType.SPACE2FREQUENCY);
float[] outputF = clfft.dft1Dim(input);
clfft.clearCL();
System.out.println("\n Frequency: " + Arrays.toString(outputF));
assertEquals("The first element should be N",input.length/2,outputF[0],EPS);
for (int i = 1; i < outputF.length; ++i) {
assertEquals("All other elements should be 0",0,outputF[i],EPS);
}
CLDFT clfftBackwards = new CLDFT(input.length,N,CLDFT.TransformationType.FREQUENCY2SPACE);
float[] outputS = clfftBackwards.dft1Dim(outputF);
clfftBackwards.clearCL();
System.out.println("Space: " + Arrays.toString(outputS));
assertArrayEquals("Back transformation must be equal to original input",input,outputS,(float)EPS);
}
}
package org.vadere.util.math;
import org.junit.Test;
import org.vadere.util.opencl.CLFFT;
import org.vadere.util.opencl.OpenCLException;
import java.util.Arrays;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
public class TestCLFFT {
private static final double EPS = Math.pow(10.0,-5.0);
@Test
public void testForwardAndBackwardFFT() throws OpenCLException {
// mock data
int N = 64;
float[] input = new float[2*N];
for (int i = 0; i < input.length; ++i) {
input[i] = i%2 == 0 ? 1 : 0;
}
// forward FFT
CLFFT clfftForward = new CLFFT(input.length,N,CLFFT.TransformationType.SPACE2FREQUENCY);
float[] outputF = clfftForward.fft1Dim(input);
clfftForward.clearCL();
assertEquals("The first element should be N",N,outputF[0],EPS);
for (int i = 1; i < outputF.length; ++i) {
assertEquals("All other elements should be 0",0,outputF[i],EPS);
}
// backward FFT
CLFFT clfftBackward = new CLFFT(input.length,N, CLFFT.TransformationType.FREQUENCY2SPACE);
float[] outputS = clfftBackward.fft1Dim(outputF);
clfftBackward.clearCL();
assertArrayEquals("Back transformation must be equal to original input",input,outputS,(float)EPS);
}
}
......@@ -21,4 +21,22 @@ public class TestDFT {
public void testDFT() throws IOException {
//TODO: reimplement!
}
public void testSimpleDFT() {
float[] kernel = new float[] {
1, 2, 3,
4, 5, 6,
7, 8, 9
};
float[] inMatrix = new float[] {
2, 1, 1, 1,
1, 1, 1, -1,
1, 1, 1, 1,
1, 1, 1, 0
};
}
}