Commit 68e70553 authored by Nikola Dinev's avatar Nikola Dinev
Browse files

Minor improvements to GPU projectors

parent 4a22dac6
......@@ -113,9 +113,7 @@ namespace elsa
Timer<> timeGuard("JosephsMethodCUDA", "apply");
//transfer volume as texture
auto volumePair = copyTextureToGPU(x);
auto volumeTex = volumePair.first;
auto volume = volumePair.second;
auto [volumeTex, volume] = copyTextureToGPU(x);
// allocate memory for projections
cudaPitchedPtr dsinoPtr;
......@@ -235,9 +233,7 @@ namespace elsa
//transfer projections
if(_fast) {
auto sinoPair = copyTextureToGPU<cudaArrayLayered>(y);
auto sinoTex = sinoPair.first;
auto sino = sinoPair.second;
auto [sinoTex,sino] = copyTextureToGPU<cudaArrayLayered>(y);
cudaDeviceSynchronize();
if (numImgBlocks>0) {
......@@ -334,9 +330,7 @@ namespace elsa
if(_fast) {
//transfer projections
auto sinoPair = copyTextureToGPU<cudaArrayLayered>(y);
auto sinoTex = sinoPair.first;
auto sino = sinoPair.second;
auto [sinoTex,sino] = copyTextureToGPU<cudaArrayLayered>(y);
cudaDeviceSynchronize();
if (numImgBlocks>0) {
......
......@@ -21,12 +21,6 @@ namespace elsa
if (geometryList.empty()) {
throw std::logic_error("SiddonsMethodCUDA: geometry list was empty");
}
//transfer boundingBox
if(cudaMalloc(&_boxMax,dim*sizeof(uint32_t)) != cudaSuccess)
throw std::bad_alloc();
Eigen::Matrix<uint32_t,-1,1> boxMax = _boundingBox._max.template cast<uint32_t>();
if(cudaMemcpyAsync(_boxMax,boxMax.data(),dim*sizeof(uint32_t),cudaMemcpyHostToDevice)!=cudaSuccess)
throw std::logic_error("SiddonsMethodCUDA: Could not transfer bounding box to GPU.");
//allocate device memory and copy ray origins and the inverse of the significant part of projection matrices to device
cudaExtent extent = make_cudaExtent(dim*sizeof(real_t), dim, geometryList.size());
......@@ -66,9 +60,7 @@ namespace elsa
SiddonsMethodCUDA<data_t>::~SiddonsMethodCUDA()
{
//Free CUDA resources
if (cudaFree(_boxMax)!=cudaSuccess ||
cudaFree(_rayOrigins.ptr)!=cudaSuccess ||
cudaFree(_projInvMatrices.ptr)!=cudaSuccess)
if (cudaFree(_rayOrigins.ptr)!=cudaSuccess || cudaFree(_projInvMatrices.ptr)!=cudaSuccess)
Logger::get("SiddonsMethodCUDA")->error("Couldn't free GPU memory; This may cause problems later.");
}
......@@ -126,6 +118,11 @@ namespace elsa
int numImgBlocks = rangeDims(numDim-1)/_threadsPerBlock;
int remaining = rangeDims(numDim-1)%_threadsPerBlock;
if (numDim==3) {
typename TraverseSiddonsCUDA<data_t,3>::BoundingBox boxMax;
boxMax.max[0] = _boundingBox._max.template cast<uint32_t>()[0];
boxMax.max[1] = _boundingBox._max.template cast<uint32_t>()[1];
boxMax.max[2] = _boundingBox._max.template cast<uint32_t>()[2];
//transfer volume and sinogram
cudaExtent volExt = make_cudaExtent(coeffsPerDim[0]*sizeof(data_t),coeffsPerDim[1],coeffsPerDim[2]);
if(cudaMalloc3D(&dvolumePtr,volExt)!=cudaSuccess)
......@@ -152,9 +149,9 @@ namespace elsa
const dim3 grid(rangeDims(0),rangeDims(1),numImgBlocks);
const int threads = _threadsPerBlock;
if(adjoint)
TraverseSiddonsCUDA<data_t,3>::traverseAdjoint(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,(int8_t*)dsinoPtr.ptr, dsinoPtr.pitch,(int8_t*)_rayOrigins.ptr,_rayOrigins.pitch,(int8_t*)_projInvMatrices.ptr,_projInvMatrices.pitch,_boxMax);
TraverseSiddonsCUDA<data_t,3>::traverseAdjoint(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,(int8_t*)dsinoPtr.ptr, dsinoPtr.pitch,(int8_t*)_rayOrigins.ptr,_rayOrigins.pitch,(int8_t*)_projInvMatrices.ptr,_projInvMatrices.pitch,boxMax);
else
TraverseSiddonsCUDA<data_t,3>::traverseForward(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,(int8_t*)dsinoPtr.ptr, dsinoPtr.pitch,(int8_t*)_rayOrigins.ptr,_rayOrigins.pitch,(int8_t*)_projInvMatrices.ptr,_projInvMatrices.pitch,_boxMax);
TraverseSiddonsCUDA<data_t,3>::traverseForward(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,(int8_t*)dsinoPtr.ptr, dsinoPtr.pitch,(int8_t*)_rayOrigins.ptr,_rayOrigins.pitch,(int8_t*)_projInvMatrices.ptr,_projInvMatrices.pitch,boxMax);
}
if (remaining>0) {
......@@ -168,9 +165,9 @@ namespace elsa
int8_t* matrixPtr = (int8_t*)_projInvMatrices.ptr + numImgBlocks*_threadsPerBlock*_projInvMatrices.pitch*3;
if(adjoint)
TraverseSiddonsCUDA<data_t,3>::traverseAdjoint(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,imgPtr, dsinoPtr.pitch,rayPtr,_rayOrigins.pitch,matrixPtr,_projInvMatrices.pitch,_boxMax,remStream);
TraverseSiddonsCUDA<data_t,3>::traverseAdjoint(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,imgPtr, dsinoPtr.pitch,rayPtr,_rayOrigins.pitch,matrixPtr,_projInvMatrices.pitch,boxMax,remStream);
else
TraverseSiddonsCUDA<data_t,3>::traverseForward(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,imgPtr, dsinoPtr.pitch,rayPtr,_rayOrigins.pitch,matrixPtr,_projInvMatrices.pitch,_boxMax,remStream);
TraverseSiddonsCUDA<data_t,3>::traverseForward(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,imgPtr, dsinoPtr.pitch,rayPtr,_rayOrigins.pitch,matrixPtr,_projInvMatrices.pitch,boxMax,remStream);
if (cudaStreamDestroy(remStream)!=cudaSuccess)
Logger::get("SiddonsMethodCUDA")->error("Couldn't destroy GPU stream; This may cause problems later.");
......@@ -186,6 +183,10 @@ namespace elsa
}
else {
typename TraverseSiddonsCUDA<data_t,2>::BoundingBox boxMax;
boxMax.max[0] = _boundingBox._max.template cast<uint32_t>()[0];
boxMax.max[1] = _boundingBox._max.template cast<uint32_t>()[1];
//transfer volume and sinogram
if(cudaMallocPitch(&dvolumePtr.ptr,&dvolumePtr.pitch,coeffsPerDim[0]*sizeof(data_t),coeffsPerDim[1])!=cudaSuccess)
......@@ -217,9 +218,9 @@ namespace elsa
const dim3 grid(rangeDims(0),numImgBlocks);
const int threads = _threadsPerBlock;
if (adjoint)
TraverseSiddonsCUDA<data_t,2>::traverseAdjoint(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,(int8_t*)dsinoPtr.ptr, dsinoPtr.pitch,(int8_t*)_rayOrigins.ptr,_rayOrigins.pitch,(int8_t*)_projInvMatrices.ptr,_projInvMatrices.pitch,_boxMax);
TraverseSiddonsCUDA<data_t,2>::traverseAdjoint(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,(int8_t*)dsinoPtr.ptr, dsinoPtr.pitch,(int8_t*)_rayOrigins.ptr,_rayOrigins.pitch,(int8_t*)_projInvMatrices.ptr,_projInvMatrices.pitch,boxMax);
else
TraverseSiddonsCUDA<data_t,2>::traverseForward(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,(int8_t*)dsinoPtr.ptr, dsinoPtr.pitch,(int8_t*)_rayOrigins.ptr,_rayOrigins.pitch,(int8_t*)_projInvMatrices.ptr,_projInvMatrices.pitch,_boxMax);
TraverseSiddonsCUDA<data_t,2>::traverseForward(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,(int8_t*)dsinoPtr.ptr, dsinoPtr.pitch,(int8_t*)_rayOrigins.ptr,_rayOrigins.pitch,(int8_t*)_projInvMatrices.ptr,_projInvMatrices.pitch,boxMax);
}
if (remaining>0) {
......@@ -233,9 +234,9 @@ namespace elsa
int8_t* matrixPtr = (int8_t*)_projInvMatrices.ptr + numImgBlocks*_threadsPerBlock*_projInvMatrices.pitch*numDim;
if (adjoint)
TraverseSiddonsCUDA<data_t,2>::traverseAdjoint(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,imgPtr, dsinoPtr.pitch,rayPtr,_rayOrigins.pitch,matrixPtr,_projInvMatrices.pitch,_boxMax,remStream);
TraverseSiddonsCUDA<data_t,2>::traverseAdjoint(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,imgPtr, dsinoPtr.pitch,rayPtr,_rayOrigins.pitch,matrixPtr,_projInvMatrices.pitch,boxMax,remStream);
else
TraverseSiddonsCUDA<data_t,2>::traverseForward(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,imgPtr, dsinoPtr.pitch,rayPtr,_rayOrigins.pitch,matrixPtr,_projInvMatrices.pitch,_boxMax,remStream);
TraverseSiddonsCUDA<data_t,2>::traverseForward(grid, threads,(int8_t*)dvolumePtr.ptr, dvolumePtr.pitch,imgPtr, dsinoPtr.pitch,rayPtr,_rayOrigins.pitch,matrixPtr,_projInvMatrices.pitch,boxMax,remStream);
if (cudaStreamDestroy(remStream)!=cudaSuccess)
Logger::get("SiddonsMethodCUDA")->error("Couldn't destroy GPU stream; This may cause problems later.");
}
......
......@@ -812,7 +812,7 @@ void TraverseJosephsCUDA<data_t,dim>::traverseForward(const dim3 blocks, const i
const uint32_t originPitch,
const int8_t* const __restrict__ projInv,
const uint32_t projPitch,
const BoundingBox boxMax,
const BoundingBox& boxMax,
const cudaStream_t stream) {
traverseForwardKernel<data_t,dim><<<blocks,threads,0,stream>>>(volume,sinogram,sinogramPitch,rayOrigins,originPitch,projInv,projPitch,boxMax);
}
......@@ -827,7 +827,7 @@ void TraverseJosephsCUDA<data_t,dim>::traverseAdjoint(const dim3 blocks, const i
const uint32_t originPitch,
const int8_t* const __restrict__ projInv,
const uint32_t projPitch,
BoundingBox boxMax,
const BoundingBox& boxMax,
const cudaStream_t stream) {
traverseAdjointKernel<data_t,dim><<<blocks,threads,0,stream>>>(volume, volumePitch, sinogram, sinogramPitch, rayOrigins, originPitch, projInv, projPitch, boxMax);
}
......
......@@ -19,7 +19,7 @@ namespace elsa
template <typename data_t = elsa::real_t, uint dim = 3>
struct TraverseJosephsCUDA
{
const static uint32_t MAX_THREADS_PER_BLOCK = 64;
const static uint32_t MAX_THREADS_PER_BLOCK = 32;
/**
* Allows for the bounding box to be passed to the kernel by value.
......@@ -74,7 +74,7 @@ template <typename data_t = elsa::real_t, uint dim = 3>
const uint32_t originPitch,
const int8_t *const __restrict__ projInv,
const uint32_t projPitch,
const BoundingBox boxMax,
const BoundingBox& boxMax,
const cudaStream_t stream = 0);
/**
......@@ -113,7 +113,7 @@ template <typename data_t = elsa::real_t, uint dim = 3>
const uint32_t originPitch,
const int8_t *const __restrict__ projInv,
const uint32_t projPitch,
BoundingBox boxMax,
const BoundingBox& boxMax,
const cudaStream_t stream = 0);
/**
......
......@@ -236,7 +236,7 @@ __device__ __forceinline__ double atomicAdd(double* address, double val)
#endif
template <typename data_t, bool adjoint, uint32_t dim>
__global__ void
__global__ void __launch_bounds__(elsa::TraverseSiddonsCUDA<data_t,dim>::MAX_THREADS_PER_BLOCK)
traverseVolume(int8_t* const __restrict__ volume,
const uint64_t volumePitch,
int8_t* const __restrict__ sinogram,
......@@ -245,7 +245,7 @@ traverseVolume(int8_t* const __restrict__ volume,
const uint32_t originPitch,
const int8_t* const __restrict__ projInv,
const uint32_t projPitch,
const uint32_t* const __restrict__ boxMax)
const typename elsa::TraverseSiddonsCUDA<data_t,dim>::BoundingBox boxMax)
{
using namespace elsa;
const int8_t* const projInvPtr = dim==3 ? projInv + (blockIdx.z*blockDim.x + threadIdx.x)*projPitch*3 :
......@@ -273,18 +273,18 @@ traverseVolume(int8_t* const __restrict__ volume,
//find volume intersections
real_t tmin;
if(!box_intersect<real_t,dim>(rayOrigin,rd,boxMax,tmin))
if(!box_intersect<real_t,dim>(rayOrigin,rd,boxMax.max,tmin))
return;
real_t entryPoint[dim];
pointAt<real_t,dim>(rayOrigin,rd,tmin,entryPoint);
projectOntoBox<real_t,dim>(entryPoint,boxMax);
projectOntoBox<real_t,dim>(entryPoint,boxMax.max);
int stepDir[dim];
initStepDirection<real_t,dim>(rd,stepDir);
uint32_t currentVoxel[dim];
if(!closestVoxel<real_t,dim>(entryPoint, boxMax, currentVoxel, rd, stepDir))
if(!closestVoxel<real_t,dim>(entryPoint, boxMax.max, currentVoxel, rd, stepDir))
return;
real_t tdelta[dim], tmax[dim];
......@@ -306,7 +306,7 @@ traverseVolume(int8_t* const __restrict__ volume,
volumeXPtr = dim==3 ? (data_t*)(volume + (boxMax[1]*currentVoxel[2] + currentVoxel[1])*volumePitch) + currentVoxel[0]
: (data_t*)(volume + currentVoxel[1]*volumePitch) + currentVoxel[0];
} while(isVoxelInVolume<real_t,dim>(currentVoxel,boxMax,index));
} while(isVoxelInVolume<real_t,dim>(currentVoxel,boxMax.max,index));
if (!adjoint) {
......@@ -327,7 +327,7 @@ namespace elsa {
const uint32_t originPitch,
const int8_t* const __restrict__ projInv,
const uint32_t projPitch,
const uint32_t* const __restrict__ boxMax,
const BoundingBox& boxMax,
cudaStream_t stream) {
traverseVolume<data_t, false, dim><<<blocks,threads,0,stream>>>(volume,volumePitch,sinogram,sinogramPitch,rayOrigins,originPitch,projInv,projPitch,boxMax);
}
......@@ -342,7 +342,7 @@ namespace elsa {
const uint32_t originPitch,
const int8_t* const __restrict__ projInv,
const uint32_t projPitch,
const uint32_t* const __restrict__ boxMax,
const BoundingBox& boxMax,
cudaStream_t stream) {
traverseVolume<data_t, true, dim><<<blocks,threads,0,stream>>>(volume,volumePitch,sinogram,sinogramPitch,rayOrigins,originPitch,projInv,projPitch,boxMax);
}
......
......@@ -16,6 +16,27 @@ namespace elsa {
template <typename data_t = real_t, uint32_t dim = 3>
struct TraverseSiddonsCUDA {
const static uint32_t MAX_THREADS_PER_BLOCK = 64;
/**
* Allows for the bounding box to be passed to the kernel by value.
* Kernel arguments are stored in constant memory, and should generally
* provide faster access to the variables than via global memory.
*/
struct BoundingBox
{
//min is always 0
uint32_t max[dim];
__device__ __forceinline__ const uint32_t &operator[](const uint32_t idx) const
{
return max[idx];
}
__device__ __forceinline__ uint32_t &operator[](const uint32_t idx)
{
return max[idx];
}
};
/**
* \brief Forward projection using Siddon's method
*
......@@ -50,7 +71,7 @@ namespace elsa {
const uint32_t originPitch,
const int8_t* const __restrict__ projInv,
const uint32_t projPitch,
const uint32_t* const __restrict__ boxMax,
const BoundingBox& boxMax,
cudaStream_t stream = (cudaStream_t)0);
/**
......@@ -87,7 +108,7 @@ namespace elsa {
const uint32_t originPitch,
const int8_t* const __restrict__ projInv,
const uint32_t projPitch,
const uint32_t* const __restrict__ boxMax,
const BoundingBox& boxMax,
cudaStream_t stream = (cudaStream_t)0);
};
}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment