Commit cff50ebe authored by Benedikt Zoennchen's avatar Benedikt Zoennchen
Browse files

+ correct in CLOptimalStepsModel the same memory errors that occurred in CLLinkedCell

+ reduce the stroke width of the contour i.e. CONTOUR_THINKNESS
parent 74c360d3
Pipeline #88678 passed with stages
in 101 minutes and 49 seconds
......@@ -26,7 +26,7 @@ public abstract class SimulationRenderer extends DefaultRenderer {
private static double MAX_POTENTIAL = 1000.0;
private static double CONTOUR_STEP = 2.0;
private static double CONTOUR_THINKNESS = 0.2;
private static double CONTOUR_THINKNESS = 0.1;
private SimulationModel model;
private BufferedImage obstacleDensity = null;
......@@ -206,7 +206,8 @@ public abstract class SimulationRenderer extends DefaultRenderer {
if (potential >= MAX_POTENTIAL) {
c = model.config.getObstacleColor();
} else if (potential % CONTOUR_STEP <= CONTOUR_THINKNESS) {
}
else if (potential % CONTOUR_STEP <= CONTOUR_THINKNESS) {
c = Color.BLACK;
} else {
c = colorHelper.numberToColor(potential % 100);
......
......@@ -2,6 +2,7 @@ package org.vadere.simulator.models.osm;
import org.jetbrains.annotations.NotNull;
import org.vadere.annotation.factories.models.ModelClass;
import org.vadere.meshing.mesh.inter.IMesh;
import org.vadere.simulator.control.factory.GroupSourceControllerFactory;
import org.vadere.simulator.control.factory.SingleSourceControllerFactory;
import org.vadere.simulator.control.factory.SourceControllerFactory;
......@@ -28,7 +29,10 @@ import org.vadere.simulator.models.potential.fields.IPotentialFieldTarget;
import org.vadere.simulator.models.potential.fields.IPotentialFieldTargetGrid;
import org.vadere.simulator.models.potential.fields.PotentialFieldAgent;
import org.vadere.simulator.models.potential.fields.PotentialFieldObstacle;
import org.vadere.simulator.models.potential.solver.calculators.EikonalSolver;
import org.vadere.simulator.models.potential.solver.calculators.cartesian.GridEikonalSolver;
import org.vadere.state.attributes.Attributes;
import org.vadere.state.attributes.models.AttributesFloorField;
import org.vadere.state.attributes.models.AttributesOSM;
import org.vadere.state.attributes.scenario.AttributesAgent;
import org.vadere.state.events.types.ElapsedTimeEvent;
......@@ -40,12 +44,18 @@ import org.vadere.state.scenario.Pedestrian;
import org.vadere.state.scenario.Topography;
import org.vadere.state.types.OptimizationType;
import org.vadere.state.types.UpdateType;
import org.vadere.util.data.cellgrid.CellGrid;
import org.vadere.util.data.cellgrid.CellState;
import org.vadere.util.data.cellgrid.IPotentialPoint;
import org.vadere.util.data.cellgrid.PathFindingTag;
import org.vadere.util.geometry.shapes.IPoint;
import org.vadere.util.geometry.shapes.VPoint;
import org.vadere.util.geometry.shapes.VShape;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.function.Function;
import java.util.stream.Collectors;
@ModelClass(isMainModel = true)
......@@ -153,9 +163,17 @@ public class OptimalStepsModel implements MainModel, PotentialFieldModel, Dynami
Model.findAttributes(attributesList, AttributesFloorField.class),
//3.0,
new EikonalSolver() {
new GridEikonalSolver() {
CellGrid cellGrid = null;
@Override
public CellGrid getCellGrid() {
if(cellGrid == null) {
initialize();
}
return cellGrid;
}
@Override
public void initialize() {
potentialFieldTarget.preLoop(0.4);
......@@ -163,24 +181,51 @@ public class OptimalStepsModel implements MainModel, PotentialFieldModel, Dynami
}
@Override
public CellGrid getPotentialField() {
public Function<IPoint, Double> getPotentialField() {
return getCellGrid().getInterpolationFunction();
}
@Override
public double getPotential(double x, double y) {
return getPotential(x, y, 0.1, 1.0);
}
},
new GridEikonalSolver() {
private CellGrid cellGrid = null;
@Override
public CellGrid getCellGrid() {
if(cellGrid == null) {
initialize();
}
return cellGrid;
}
},
new EikonalSolver() {
CellGrid cellGrid = topography.getDistanceFunctionApproximation(
Model.findAttributes(attributesList, AttributesFloorField.class).getPotentialFieldResolution()
);
@Override
public void initialize() {}
public void initialize() {
double resolution = Model.findAttributes(attributesList, AttributesFloorField.class).getPotentialFieldResolution();
cellGrid = new CellGrid(topography.getBounds().getWidth(), topography.getBounds().getHeight(), resolution, new CellState());
cellGrid.pointStream().forEach(p -> {
double distance = topography.distanceToObstacle(cellGrid.pointToCoord(p));
PathFindingTag tag = distance >= 0 ? PathFindingTag.Reached : PathFindingTag.Obstacle;
cellGrid.setValue(p, new CellState(distance, tag));
});
}
@Override
public CellGrid getPotentialField() {
return cellGrid;
public Function<IPoint, Double> getPotentialField() {
if(cellGrid == null) {
initialize();
}
return cellGrid.getInterpolationFunction();
}
@Override
public double getPotential(double x, double y) {
return cellGrid.getInterpolatedValueAt(x, y).getLeft();
}
}
);*/
......
......@@ -18,23 +18,31 @@ import org.vadere.util.opencl.CLInfo;
import org.vadere.util.opencl.CLUtils;
import org.vadere.util.opencl.OpenCLException;
import org.vadere.simulator.models.potential.solver.calculators.EikonalSolver;
import org.vadere.util.opencl.examples.InfoUtils;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.LongBuffer;
import java.util.List;
import java.util.Random;
import static org.lwjgl.opencl.CL10.CL_CONTEXT_PLATFORM;
import static org.lwjgl.opencl.CL10.CL_DEVICE_LOCAL_MEM_SIZE;
import static org.lwjgl.opencl.CL10.CL_DEVICE_MAX_WORK_GROUP_SIZE;
import static org.lwjgl.opencl.CL10.CL_DEVICE_NAME;
import static org.lwjgl.opencl.CL10.CL_DEVICE_TYPE_GPU;
import static org.lwjgl.opencl.CL10.CL_KERNEL_LOCAL_MEM_SIZE;
import static org.lwjgl.opencl.CL10.CL_KERNEL_WORK_GROUP_SIZE;
import static org.lwjgl.opencl.CL10.CL_MEM_ALLOC_HOST_PTR;
import static org.lwjgl.opencl.CL10.CL_MEM_COPY_HOST_PTR;
import static org.lwjgl.opencl.CL10.CL_MEM_READ_ONLY;
import static org.lwjgl.opencl.CL10.CL_MEM_READ_WRITE;
import static org.lwjgl.opencl.CL10.CL_PROFILING_COMMAND_END;
import static org.lwjgl.opencl.CL10.CL_PROFILING_COMMAND_START;
import static org.lwjgl.opencl.CL10.CL_PROGRAM_BUILD_STATUS;
import static org.lwjgl.opencl.CL10.CL_QUEUE_PROFILING_ENABLE;
import static org.lwjgl.opencl.CL10.CL_SUCCESS;
import static org.lwjgl.opencl.CL10.clBuildProgram;
import static org.lwjgl.opencl.CL10.clCreateBuffer;
......@@ -47,7 +55,8 @@ import static org.lwjgl.opencl.CL10.clEnqueueReadBuffer;
import static org.lwjgl.opencl.CL10.clEnqueueWriteBuffer;
import static org.lwjgl.opencl.CL10.clFinish;
import static org.lwjgl.opencl.CL10.clGetDeviceIDs;
import static org.lwjgl.opencl.CL10.clGetDeviceInfo;
import static org.lwjgl.opencl.CL10.clGetEventProfilingInfo;
import static org.lwjgl.opencl.CL10.clGetKernelWorkGroupInfo;
import static org.lwjgl.opencl.CL10.clGetPlatformIDs;
import static org.lwjgl.opencl.CL10.clReleaseCommandQueue;
import static org.lwjgl.opencl.CL10.clReleaseContext;
......@@ -58,6 +67,7 @@ import static org.lwjgl.opencl.CL10.clSetKernelArg;
import static org.lwjgl.opencl.CL10.clSetKernelArg1f;
import static org.lwjgl.opencl.CL10.clSetKernelArg1i;
import static org.lwjgl.opencl.CL10.clSetKernelArg1p;
import static org.lwjgl.opencl.CL10.clWaitForEvents;
import static org.lwjgl.system.MemoryStack.stackPush;
import static org.lwjgl.system.MemoryUtil.NULL;
import static org.lwjgl.system.MemoryUtil.memUTF8;
......@@ -137,6 +147,7 @@ public class CLOptimalStepsModel {
private int[] iGridSize;
private List<PedestrianOpenCL> pedestrianList;
private List<VPoint> circlePositionList;
private final int deviceType;
private final AttributesFloorField attributesFloorField;
private final AttributesOSM attributesOSM;
......@@ -149,9 +160,12 @@ public class CLOptimalStepsModel {
private static final Logger logger = Logger.getLogger(CLOptimalStepsModel.class);
private int max_work_group_size;
private long max_work_group_size;
private long max_local_memory_size;
private boolean debug = true;
// time measurement
private boolean debug = false;
private boolean profiling = false;
private int numberOfSortElements;
......@@ -164,6 +178,15 @@ public class CLOptimalStepsModel {
private int counter = 0;
/**
 * Convenience constructor that selects a GPU device.
 *
 * Delegates to the constructor taking an explicit device type and passes
 * {@code CL_DEVICE_TYPE_GPU} for it; all other arguments are forwarded unchanged.
 *
 * @param attributesOSM         forwarded unchanged to the full constructor
 * @param attributesFloorField  forwarded unchanged to the full constructor
 * @param bound                 forwarded unchanged to the full constructor
 * @param targetPotential       forwarded unchanged to the full constructor
 * @param obstaclePotential     forwarded unchanged to the full constructor
 * @throws OpenCLException      propagated from the full constructor
 */
public CLOptimalStepsModel(
@NotNull final AttributesOSM attributesOSM,
@NotNull final AttributesFloorField attributesFloorField,
@NotNull final VRectangle bound,
@NotNull final EikonalSolver targetPotential,
@NotNull final EikonalSolver obstaclePotential) throws OpenCLException {
this(attributesOSM, attributesFloorField, bound, targetPotential, obstaclePotential, CL_DEVICE_TYPE_GPU);
}
/**
* Default constructor.
*
......@@ -176,10 +199,12 @@ public class CLOptimalStepsModel {
@NotNull final AttributesFloorField attributesFloorField,
@NotNull final VRectangle bound,
@NotNull final EikonalSolver targetPotential,
@NotNull final EikonalSolver obstaclePotential) throws OpenCLException {
@NotNull final EikonalSolver obstaclePotential,
final int device) throws OpenCLException {
this.attributesOSM = attributesOSM;
this.attributesFloorField = attributesFloorField;
this.bound = bound;
this.deviceType = device;
//TODO: this should be done in mallocHostMemory().
this.targetPotentialField = generatePotentialFieldApproximation(targetPotential);
......@@ -364,7 +389,7 @@ public class CLOptimalStepsModel {
/*clearMemory();
clearCL();*/
counter++;
clearIterationMemory();
//clearIterationMemory();
while (pedestrians.size() > originalSize) {
pedestrians.remove(pedestrians.size()-1);
}
......@@ -373,6 +398,8 @@ public class CLOptimalStepsModel {
//clBitonicSort(clHashes, clIndices, clHashes, clIndices, numberOfElements, 1);
//clFindCellBoundsAndReorder(clCellStarts, clCellEnds, clReorderedPedestrians, clHashes, clIndices, clPedestrians, numberOfElements, numberOfGridCells);
} finally {
clearIterationMemory();
}
}
......@@ -566,7 +593,7 @@ public class CLOptimalStepsModel {
CLInfo.checkCLError(clSetKernelArg1i(clCalcHash, 6, numberOfElements));
clGlobalWorkSize.put(0, numberOfElements);
//TODO: local work size?
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clCalcHash, 1, null, clGlobalWorkSize, null, null, null));
CLInfo.checkCLError((int)enqueueNDRangeKernel("clCalcHash", clQueue, clCalcHash, 1, null, clGlobalWorkSize, null, null, null));
}
}
......@@ -601,6 +628,7 @@ public class CLOptimalStepsModel {
PointerBuffer clGlobalWorkSize = stack.callocPointer(1);
PointerBuffer clLocalWorkSize = stack.callocPointer(1);
IntBuffer errcode_ret = stack.callocInt(1);
long maxWorkGroupSize = getMaxWorkGroupSizeForKernel(clDevice, clNextPositions, 0); // local 4 byte (integer)
CLInfo.checkCLError(clSetKernelArg1p(clNextPositions, 0, clPedestrianNextPositions));
CLInfo.checkCLError(clSetKernelArg1p(clNextPositions, 1, clReorderedPedestrians));
......@@ -617,21 +645,21 @@ public class CLOptimalStepsModel {
CLInfo.checkCLError(clSetKernelArg1f(clNextPositions, 12, (float)attributesFloorField.getPotentialFieldResolution()));
CLInfo.checkCLError(clSetKernelArg1i(clNextPositions, 13, circlePositionList.size()));
int globalWorkSize;
int localWorkSize;
if(numberOfElements <= max_work_group_size){
long globalWorkSize;
long localWorkSize;
if(numberOfElements <= maxWorkGroupSize){
localWorkSize = numberOfElements;
globalWorkSize = numberOfElements;
}
else {
localWorkSize = max_work_group_size;
localWorkSize = maxWorkGroupSize;
globalWorkSize = multipleOf(numberOfElements, localWorkSize);
}
clGlobalWorkSize.put(0, globalWorkSize);
clLocalWorkSize.put(0, localWorkSize);
//TODO: local work size? + check 2^n constraint!
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clNextPositions, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError((int)enqueueNDRangeKernel("clNextPositions", clQueue, clNextPositions, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
}
}
......@@ -649,6 +677,7 @@ public class CLOptimalStepsModel {
PointerBuffer clGlobalWorkSize = stack.callocPointer(1);
PointerBuffer clLocalWorkSize = stack.callocPointer(1);
IntBuffer errcode_ret = stack.callocInt(1);
long maxWorkGroupSize = getMaxWorkGroupSizeForKernel(clDevice, clNextPositions, 0); // local 4 byte (integer)
CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 0, clCellStarts));
CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 1, clCellEnds));
......@@ -656,27 +685,72 @@ public class CLOptimalStepsModel {
CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 3, clHashes));
CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 4, clIndices));
CLInfo.checkCLError(clSetKernelArg1p(clFindCellBoundsAndReorder, 5, clPositions));
CLInfo.checkCLError(clSetKernelArg(clFindCellBoundsAndReorder, 6, (max_work_group_size+1) * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clFindCellBoundsAndReorder, 6, (Math.min(numberOfElements+1, maxWorkGroupSize)) * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg1i(clFindCellBoundsAndReorder, 7, numberOfElements));
int globalWorkSize;
int localWorkSize;
if(numberOfElements <= max_work_group_size){
localWorkSize = numberOfElements;
globalWorkSize = numberOfElements;
}
else {
localWorkSize = max_work_group_size;
globalWorkSize = multipleOf(numberOfElements, localWorkSize);
}
long globalWorkSize;
long localWorkSize;
if(numberOfElements+1 < maxWorkGroupSize){
localWorkSize = numberOfElements;
globalWorkSize = numberOfElements;
}
else {
localWorkSize = maxWorkGroupSize;
globalWorkSize = multipleOf(numberOfElements, localWorkSize);
}
clGlobalWorkSize.put(0, globalWorkSize);
clLocalWorkSize.put(0, localWorkSize);
//TODO: local work size? + check 2^n constraint!
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clFindCellBoundsAndReorder, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError((int)enqueueNDRangeKernel("clFindCellBoundsAndReorder", clQueue, clFindCellBoundsAndReorder, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
}
}
/**
 * Enqueues a kernel via {@code clEnqueueNDRangeKernel} and, if {@code profiling} is
 * enabled, additionally waits for the kernel to finish and logs its execution time.
 *
 * Without profiling this is a plain pass-through and the OpenCL status code is returned
 * unchecked. With profiling an internal event is attached to the launch, so the
 * caller-supplied {@code event} buffer is NOT filled in that mode; the launch and the
 * profiling queries are error-checked here and the internal event is released afterwards.
 *
 * @param name                  label used in the profiling log line
 * @param command_queue         OpenCL command queue handle
 * @param kernel                OpenCL kernel handle
 * @param work_dim              number of work dimensions
 * @param global_work_offset    global offset, may be {@code null}
 * @param global_work_size      global work size
 * @param local_work_size       local work size, may be {@code null}
 * @param event_wait_list       events to wait for before execution, may be {@code null}
 * @param event                 receives the launch event (ignored when profiling is enabled)
 * @return the status code returned by {@code clEnqueueNDRangeKernel}
 * @throws OpenCLException if profiling is enabled and the launch, the wait or a profiling query fails
 */
private long enqueueNDRangeKernel(final String name, long command_queue, long kernel, int work_dim, PointerBuffer global_work_offset, PointerBuffer global_work_size, PointerBuffer local_work_size, PointerBuffer event_wait_list, PointerBuffer event) throws OpenCLException {
    if (!profiling) {
        return clEnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, event_wait_list, event);
    }

    try (MemoryStack stack = stackPush()) {
        PointerBuffer clEvent = stack.mallocPointer(1);
        LongBuffer startTime = stack.mallocLong(1);
        LongBuffer endTime = stack.mallocLong(1);

        int result = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, event_wait_list, clEvent);
        // clEvent comes from mallocPointer (not zeroed): it only holds a valid handle if the
        // launch succeeded, so fail here instead of waiting on garbage.
        CLInfo.checkCLError(result);

        long eventAddr = clEvent.get(0);
        try {
            CLInfo.checkCLError(clWaitForEvents(clEvent));
            CLInfo.checkCLError(clGetEventProfilingInfo(eventAddr, CL_PROFILING_COMMAND_START, startTime, null));
            CLInfo.checkCLError(clGetEventProfilingInfo(eventAddr, CL_PROFILING_COMMAND_END, endTime, null));

            // profiling timestamps are in nanoseconds; report milliseconds
            double elapsedMs = (endTime.get(0) - startTime.get(0)) / 1_000_000.0;
            log.info(name + " event time " + "0x" + Long.toHexString(eventAddr) + ": " + elapsedMs + " [ms]");
        } finally {
            // the event was created by the launch above and is owned by this method;
            // fully qualified because no static import for it is visible in this file
            org.lwjgl.opencl.CL10.clReleaseEvent(eventAddr);
        }
        return result;
    }
}
/**
 * Determines the largest work-group size that may be used to launch the given kernel.
 *
 * The result is the minimum of three limits: the device limit stored in
 * {@code max_work_group_size}, the kernel limit reported for
 * {@code CL_KERNEL_WORK_GROUP_SIZE}, and a local-memory limit
 * {@code max_local_memory_size / workItemMem} (skipped when {@code workItemMem} is 0).
 *
 * @param clDevice      OpenCL device handle
 * @param clKernel      OpenCL kernel handle
 * @param workItemMem   local memory needed per work item (presumably in bytes - confirm
 *                      against callers); 0 if the kernel needs none
 * @return the maximal usable work-group size
 * @throws OpenCLException if one of the kernel work-group queries fails
 */
private long getMaxWorkGroupSizeForKernel(long clDevice, long clKernel, long workItemMem) throws OpenCLException {
    try (MemoryStack stack = stackPush()) {
        // The kernel's own local memory usage is queried (and error-checked), but the
        // value is currently not part of the limit computed below.
        LongBuffer kernelLocalMemory = stack.mallocLong(1);
        CLInfo.checkCLError(clGetKernelWorkGroupInfo(clKernel, clDevice, CL_KERNEL_LOCAL_MEM_SIZE, kernelLocalMemory, null));

        final long localMemoryLimit;
        if (workItemMem == 0) {
            // no local memory per item: local memory imposes no extra restriction
            localMemoryLimit = max_work_group_size;
        } else {
            localMemoryLimit = max_local_memory_size / workItemMem;
        }

        PointerBuffer kernelWorkGroupSize = stack.mallocPointer(1);
        CLInfo.checkCLError(clGetKernelWorkGroupInfo(clKernel, clDevice, CL_KERNEL_WORK_GROUP_SIZE, kernelWorkGroupSize, null));
        final long kernelLimit = kernelWorkGroupSize.get(0);
        logger.debug("CL_KERNEL_WORK_GROUP_SIZE (" + clKernel + ") = " + kernelLimit);

        final long tightestKernelSpecificLimit = Math.min(localMemoryLimit, kernelLimit);
        return Math.min(max_work_group_size, tightestKernelSpecificLimit);
    }
}
private int expOf(int value, int multiple) {
int result = 2;
while (result < value) {
......@@ -685,8 +759,8 @@ public class CLOptimalStepsModel {
return result;
}
private int multipleOf(int value, int multiple) {
int result = multiple;
private long multipleOf(long value, long multiple) {
long result = multiple;
while (result < value) {
result += multiple;
}
......@@ -706,9 +780,10 @@ public class CLOptimalStepsModel {
PointerBuffer clGlobalWorkSize = stack.callocPointer(1);
PointerBuffer clLocalWorkSize = stack.callocPointer(1);
IntBuffer errcode_ret = stack.callocInt(1);
long maxWorkGroupSize = getMaxWorkGroupSizeForKernel(clDevice, clBitonicMergeLocal, 8); // local memory for key and values (integer)
// small sorts
if (numberOfElements <= max_work_group_size) {
if (numberOfElements <= maxWorkGroupSize) {
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 0, clKeysOut));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 1, clValuesOut));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal, 2, clKeysIn));
......@@ -722,7 +797,7 @@ public class CLOptimalStepsModel {
clLocalWorkSize.put(0, numberOfElements / 2);
// run the kernel and read the result
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError((int)enqueueNDRangeKernel("clBitonicSortLocal", clQueue, clBitonicSortLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
} else {
//Launch bitonicSortLocal1
......@@ -730,20 +805,20 @@ public class CLOptimalStepsModel {
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 1, clValuesOut));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 2, clKeysIn));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicSortLocal1, 3, clValuesIn));
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal1, 4, max_work_group_size * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal1, 5, max_work_group_size * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal1, 4, maxWorkGroupSize * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicSortLocal1, 5, maxWorkGroupSize * 4)); // local memory
clGlobalWorkSize = stack.callocPointer(1);
clLocalWorkSize = stack.callocPointer(1);
clGlobalWorkSize.put(0, numberOfElements / 2);
clLocalWorkSize.put(0, max_work_group_size / 2);
clLocalWorkSize.put(0, maxWorkGroupSize / 2);
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicSortLocal1, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError((int)enqueueNDRangeKernel("clBitonicSortLocal", clQueue, clBitonicSortLocal1, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
for (int size = 2 * max_work_group_size; size <= numberOfElements; size <<= 1) {
for (int size = (int)(2 * maxWorkGroupSize); size <= numberOfElements; size <<= 1) {
for (int stride = size / 2; stride > 0; stride >>= 1) {
if (stride >= max_work_group_size) {
if (stride >= maxWorkGroupSize) {
//Launch bitonicMergeGlobal
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 0, clKeysOut));
CLInfo.checkCLError(clSetKernelArg1p(clBitonicMergeGlobal, 1, clValuesOut));
......@@ -758,9 +833,9 @@ public class CLOptimalStepsModel {
clGlobalWorkSize = stack.callocPointer(1);
clLocalWorkSize = stack.callocPointer(1);
clGlobalWorkSize.put(0, numberOfElements / 2);
clLocalWorkSize.put(0, max_work_group_size / 4);
clLocalWorkSize.put(0, maxWorkGroupSize / 4);
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeGlobal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError((int)enqueueNDRangeKernel("clBitonicMergeGlobal", clQueue, clBitonicMergeGlobal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
} else {
//Launch bitonicMergeLocal
......@@ -773,15 +848,15 @@ public class CLOptimalStepsModel {
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 5, stride));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 6, size));
CLInfo.checkCLError(clSetKernelArg1i(clBitonicMergeLocal, 7, dir));
CLInfo.checkCLError(clSetKernelArg(clBitonicMergeLocal, 8, max_work_group_size * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicMergeLocal, 9, max_work_group_size * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicMergeLocal, 8, maxWorkGroupSize * 4)); // local memory
CLInfo.checkCLError(clSetKernelArg(clBitonicMergeLocal, 9, maxWorkGroupSize * 4)); // local memory
clGlobalWorkSize = stack.callocPointer(1);
clLocalWorkSize = stack.callocPointer(1);
clGlobalWorkSize.put(0, numberOfElements / 2);
clLocalWorkSize.put(0, max_work_group_size / 2);
clLocalWorkSize.put(0, maxWorkGroupSize / 2);
CLInfo.checkCLError(clEnqueueNDRangeKernel(clQueue, clBitonicMergeLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError((int)enqueueNDRangeKernel("clBitonicMergeLocal", clQueue, clBitonicMergeLocal, 1, null, clGlobalWorkSize, clLocalWorkSize, null, null));
CLInfo.checkCLError(clFinish(clQueue));
break;
}
......@@ -867,6 +942,7 @@ public class CLOptimalStepsModel {
MemoryUtil.memFree(gridSize);
MemoryUtil.memFree(circlePositions);
MemoryUtil.memFree(potentialFieldGridSize);
MemoryUtil.memFree(source);
}
}
......@@ -931,7 +1007,13 @@ public class CLOptimalStepsModel {
clContext = clCreateContext(ctxProps, clDevice, contextCB, NULL, errcode_ret);
CLInfo.checkCLError(errcode_ret);
clQueue = clCreateCommandQueue(clContext, clDevice, 0, errcode_ret);
if(profiling) {
clQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, errcode_ret);
}
else {
clQueue = clCreateCommandQueue(clContext, clDevice, 0, errcode_ret);
}
CLInfo.checkCLError(errcode_ret);
}
}
......@@ -972,11 +1054,11 @@ public class CLOptimalStepsModel {
clFindCellBoundsAndReorder = clCreateKernel(clProgram, "findCellBoundsAndReorder", errcode_ret);
CLInfo.checkCLError(errcode_ret);
PointerBuffer pp = stack.mallocPointer(1);
clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, pp, null);
max_work_group_size = (int)pp.get(0);
max_work_group_size = InfoUtils.getDeviceInfoPointer(clDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE);
logger.debug("CL_DEVICE_MAX_WORK_GROUP_SIZE = " + max_work_group_size);
logger.info("CL_DEVICE_MAX_WORK_GROUP_SIZE = " + max_work_group_size);
max_local_memory_size = InfoUtils.getDeviceInfoLong(clDevice, CL_DEVICE_LOCAL_MEM_SIZE);
logger.debug("CL_DEVICE_LOCAL_MEM_SIZE = " + max_local_memory_size);
}
}
......
......@@ -44,6 +44,7 @@ public class UpdateSchemeCLParallel extends UpdateSchemeParallel {
@Override
public void update(double timeStepInSec, double currentTimeInSec) {
try {
clearStrides(topography);
movedPedestrians.clear();
List<PedestrianOSM> pedestrianOSMList = CollectionUtils.select(topography.getElements(Pedestrian.class), PedestrianOSM.class);
// CallMethod.SEEK runs on the GPU
......@@ -61,7 +62,10 @@ public class UpdateSchemeCLParallel extends UpdateSchemeParallel {
}
double cellSize = new AttributesPotentialCompact().getPedPotentialWidth() + maxStepSize;
long ms = System.currentTimeMillis();
List<CLOptimalStepsModel.PedestrianOpenCL> result = clOptimalStepsModel.getNextSteps(pedestrians, cellSize);
ms = System.currentTimeMillis() - ms;
logger.debug("runtime for next step computation = " + ms + " [ms]");
for(int i = 0; i < pedestrians.size(); i++) {
//logger.info("not equals for index = " + i + ": " + result.get(i).position + " -> " + result.get(i).newPosition);
......
......@@ -17,9 +17,11 @@ import org.vadere.state.scenario.Pedestrian;
import org.vadere.state.scenario.Topography;
import org.vadere.util.geometry.shapes.Vector2D;
import org.vadere.util.io.CollectionUtils;
import org.vadere.util.logging.Logger;
public class UpdateSchemeParallel implements UpdateSchemeOSM {
private static Logger logger = Logger.getLogger(UpdateSchemeParallel.class);
protected final ExecutorService executorService;
protected final Topography topography;
protected final Set<Pedestrian> movedPedestrians;
......@@ -39,12 +41,24 @@ public class UpdateSchemeParallel implements UpdateSchemeOSM {
List<Future<?>> futures;
for (CallMethod callMethod : callMethods) {
long ms = 0;
if(callMethod == CallMethod.SEEK) {
ms = System.currentTimeMillis();
}
futures = new LinkedList<>();
for (final PedestrianOSM pedestrian : CollectionUtils.select(topography.getElements(Pedestrian.class), PedestrianOSM.class)) {
Runnable worker = () -> update(pedestrian, timeStepInSec, currentTimeInSec, callMethod);
futures.add(executorService.submit(worker));
}
collectFutures(futures);
if(callMethod == CallMethod.SEEK) {