Commit b2bc49ec authored by Nikola Dinev's avatar Nikola Dinev
Browse files

Fix Joseph's padded constrained projection

parent dab0a149
Pipeline #393194 passed with stages
in 17 minutes and 15 seconds
...@@ -181,6 +181,8 @@ namespace elsa ...@@ -181,6 +181,8 @@ namespace elsa
CUDA_HOST operator cudaPitchedPtr&() { return *pitchedPtr; }; CUDA_HOST operator cudaPitchedPtr&() { return *pitchedPtr; };
CUDA_HOST cudaPitchedPtr* operator->() { return pitchedPtr.get(); };
private: private:
cudaPitchedPtr* allocate(IndexVector_t size) cudaPitchedPtr* allocate(IndexVector_t size)
{ {
......
...@@ -542,8 +542,10 @@ namespace elsa ...@@ -542,8 +542,10 @@ namespace elsa
if (aabb._dim == 3) { if (aabb._dim == 3) {
auto dataExt = make_cudaExtent(dataDims[0] * sizeof(data_t), dataDims[1], dataDims[2]); auto dataExt = make_cudaExtent(dataDims[0] * sizeof(data_t), dataDims[1], dataDims[2]);
copy3DDataContainer<cudaMemcpyDeviceToHost, true>((void*) hostData, gpuData, dataExt, auto gpuAdjusted =
stream); make_cudaPitchedPtr(gpuData.ptr, gpuData.pitch, dataExt.width, dataExt.height);
copy3DDataContainer<cudaMemcpyDeviceToHost, true>((void*) hostData, gpuAdjusted,
dataExt, stream);
} else { } else {
std::scoped_lock lock(deviceLock); std::scoped_lock lock(deviceLock);
gpuErrchk(cudaSetDevice(_device)); gpuErrchk(cudaSetDevice(_device));
......
...@@ -358,6 +358,14 @@ __device__ __forceinline__ uint sinogramSizeY() ...@@ -358,6 +358,14 @@ __device__ __forceinline__ uint sinogramSizeY()
return 0; return 0;
} }
template <uint32_t dim>
__device__ __forceinline__ bool inPaddingRegion(const uint3& coord,
const elsa::BoundingBoxCUDA<dim>& box)
{
return coord.x >= box._max[0] - box._min[0] || coord.y >= box._max[1] - box._min[1]
|| (dim == 3 && coord.z >= box._max[2] - box._min[2]) || (dim == 2 && coord.z > 0);
}
template <typename data_t, uint32_t dim, ExecutionConfigurationType config = Z_Y_X> template <typename data_t, uint32_t dim, ExecutionConfigurationType config = Z_Y_X>
__global__ void __launch_bounds__(elsa::TraverseJosephsCUDA<data_t, dim>::MAX_THREADS_PER_BLOCK) __global__ void __launch_bounds__(elsa::TraverseJosephsCUDA<data_t, dim>::MAX_THREADS_PER_BLOCK)
traverseForwardKernel(cudaTextureObject_t volume, int8_t* const __restrict__ sinogram, traverseForwardKernel(cudaTextureObject_t volume, int8_t* const __restrict__ sinogram,
...@@ -373,9 +381,7 @@ __global__ void __launch_bounds__(elsa::TraverseJosephsCUDA<data_t, dim>::MAX_TH ...@@ -373,9 +381,7 @@ __global__ void __launch_bounds__(elsa::TraverseJosephsCUDA<data_t, dim>::MAX_TH
auto coord = getCoordinates<config>(); auto coord = getCoordinates<config>();
// immediately return if in padding region // immediately return if in padding region
if (coord.x >= sinogramBoundingBox._max[0] - sinogramBoundingBox._min[0] if (inPaddingRegion(coord, sinogramBoundingBox))
|| coord.y >= sinogramBoundingBox._max[1] - sinogramBoundingBox._min[1]
|| coord.z >= sinogramBoundingBox._max[2] - sinogramBoundingBox._min[2])
return; return;
// homogenous pixel coordinates // homogenous pixel coordinates
...@@ -385,16 +391,23 @@ __global__ void __launch_bounds__(elsa::TraverseJosephsCUDA<data_t, dim>::MAX_TH ...@@ -385,16 +391,23 @@ __global__ void __launch_bounds__(elsa::TraverseJosephsCUDA<data_t, dim>::MAX_TH
if (dim == 3) if (dim == 3)
pixelCoord[1] = coord.y + sinogramBoundingBox._min[1] + 0.5f; pixelCoord[1] = coord.y + sinogramBoundingBox._min[1] + 0.5f;
const auto& poseIdx = dim == 3 ? coord.z : coord.y;
const int8_t* const projInvPtr = const int8_t* const projInvPtr =
projInv + (coord.z + static_cast<uint32_t>(sinogramBoundingBox._min[2])) * projPitch * dim; projInv
+ (poseIdx + static_cast<uint32_t>(sinogramBoundingBox._min[dim - 1])) * projPitch * dim;
const real_t* const rayOrigin = const real_t* const rayOrigin =
(real_t*) (rayOrigins (real_t*) (rayOrigins
+ (coord.z + static_cast<uint32_t>(sinogramBoundingBox._min[2])) * originPitch); + (poseIdx + static_cast<uint32_t>(sinogramBoundingBox._min[dim - 1]))
* originPitch);
data_t* sinogramPtr = data_t* sinogramPtr = ((data_t*) (sinogram
((data_t*) (sinogram + (coord.z * sinogramSizeY<config>() + coord.y) * sinogramPitch) + (coord.z
+ coord.x); * static_cast<uint32_t>(sinogramBoundingBox._max[1]
- sinogramBoundingBox._min[1])
+ coord.y)
* sinogramPitch)
+ coord.x);
__shared__ real_t currentPositionsShared[dim * MAX_THREADS_PER_BLOCK]; __shared__ real_t currentPositionsShared[dim * MAX_THREADS_PER_BLOCK];
__shared__ real_t rayDirectionsShared[dim * MAX_THREADS_PER_BLOCK]; __shared__ real_t rayDirectionsShared[dim * MAX_THREADS_PER_BLOCK];
...@@ -949,45 +962,42 @@ namespace elsa ...@@ -949,45 +962,42 @@ namespace elsa
template <uint32_t dim, ExecutionConfigurationType config> template <uint32_t dim, ExecutionConfigurationType config>
dim3 getGrid(const BoundingBoxCUDA<dim>& boundingBox, dim3 threadsPerBlock) dim3 getGrid(const BoundingBoxCUDA<dim>& boundingBox, dim3 threadsPerBlock)
{ {
auto numImages = auto sinogramSizeX = static_cast<uint32_t>(boundingBox._max[0] - boundingBox._min[0]);
static_cast<uint32_t>(boundingBox._max[dim - 1] - boundingBox._min[dim - 1]); auto sinogramSizeY = static_cast<uint32_t>(boundingBox._max[1] - boundingBox._min[1]);
auto sinogramSizeZ =
auto detectorSizeX = static_cast<uint32_t>(boundingBox._max[0] - boundingBox._min[0]); dim == 3 ? static_cast<uint32_t>(boundingBox._max[2] - boundingBox._min[2]) : 1;
auto detectorSizeY =
dim == 3 ? static_cast<uint32_t>(boundingBox._max[1] - boundingBox._min[1]) : 1;
dim3 grid; dim3 grid;
switch (config) { switch (config) {
case X_Y_Z: case X_Y_Z:
grid.x = detectorSizeX; grid.x = sinogramSizeX;
grid.y = detectorSizeY; grid.y = sinogramSizeY;
grid.z = numImages; grid.z = sinogramSizeZ;
break; break;
case X_Z_Y: case X_Z_Y:
grid.x = detectorSizeX; grid.x = sinogramSizeX;
grid.z = detectorSizeY; grid.z = sinogramSizeY;
grid.y = numImages; grid.y = sinogramSizeZ;
break; break;
case Y_X_Z: case Y_X_Z:
grid.y = detectorSizeX; grid.y = sinogramSizeX;
grid.x = detectorSizeY; grid.x = sinogramSizeY;
grid.z = numImages; grid.z = sinogramSizeZ;
break; break;
case Y_Z_X: case Y_Z_X:
grid.z = detectorSizeX; grid.z = sinogramSizeX;
grid.x = detectorSizeY; grid.x = sinogramSizeY;
grid.y = numImages; grid.y = sinogramSizeZ;
break; break;
case Z_X_Y: case Z_X_Y:
grid.y = detectorSizeX; grid.y = sinogramSizeX;
grid.z = detectorSizeY; grid.z = sinogramSizeY;
grid.x = numImages; grid.x = sinogramSizeZ;
break; break;
case Z_Y_X: case Z_Y_X:
grid.z = detectorSizeX; grid.z = sinogramSizeX;
grid.y = detectorSizeY; grid.y = sinogramSizeY;
grid.x = numImages; grid.x = sinogramSizeZ;
break; break;
} }
...@@ -1026,13 +1036,15 @@ namespace elsa ...@@ -1026,13 +1036,15 @@ namespace elsa
- _sinogramBoundingBox._min[1]) - _sinogramBoundingBox._min[1])
: 1; : 1;
auto numImageBlocks = detectorSizeX / _threadsPerBlock.z; auto tpb = _threadsPerBlock.x * _threadsPerBlock.y * _threadsPerBlock.z;
auto remaining = detectorSizeX % _threadsPerBlock.z;
auto offset = numImageBlocks * _threadsPerBlock.z; auto numImageBlocks = detectorSizeX / tpb;
auto remaining = detectorSizeX % tpb;
auto offset = numImageBlocks * tpb;
if (numImageBlocks > 0) { if (numImageBlocks > 0) {
dim3 grid(numImages, detectorSizeY, numImageBlocks); dim3 grid(numImages, detectorSizeY, numImageBlocks);
traverseAdjointKernel<data_t, dim><<<grid, _threadsPerBlock>>>( traverseAdjointKernel<data_t, dim><<<grid, tpb>>>(
(int8_t*) volume.ptr, volume.pitch, (const int8_t*) sinogram.ptr, sinogram.pitch, 0, (int8_t*) volume.ptr, volume.pitch, (const int8_t*) sinogram.ptr, sinogram.pitch, 0,
(const int8_t*) _rayOrigins.ptr, _rayOrigins.pitch, (const int8_t*) _rayOrigins.ptr, _rayOrigins.pitch,
(const int8_t*) _projInvMatrices.ptr, _projInvMatrices.pitch, _volumeBoundingBox); (const int8_t*) _projInvMatrices.ptr, _projInvMatrices.pitch, _volumeBoundingBox);
...@@ -1067,9 +1079,11 @@ namespace elsa ...@@ -1067,9 +1079,11 @@ namespace elsa
- _volumeBoundingBox._min[dim - 1]) - _volumeBoundingBox._min[dim - 1])
: 1; : 1;
uint32_t numImageBlocks = volumeSizeX / _threadsPerBlock.z; auto tpb = _threadsPerBlock.x * _threadsPerBlock.y * _threadsPerBlock.z;
uint32_t remaining = volumeSizeX % _threadsPerBlock.z;
uint32_t offset = numImageBlocks * _threadsPerBlock.z; auto numImageBlocks = volumeSizeX / tpb;
auto remaining = volumeSizeX % tpb;
auto offset = numImageBlocks * tpb;
uint32_t numAngles = uint32_t numAngles =
_sinogramBoundingBox._max[dim - 1] - _sinogramBoundingBox._min[dim - 1]; _sinogramBoundingBox._max[dim - 1] - _sinogramBoundingBox._min[dim - 1];
...@@ -1077,8 +1091,7 @@ namespace elsa ...@@ -1077,8 +1091,7 @@ namespace elsa
if (numImageBlocks > 0) { if (numImageBlocks > 0) {
traverseAdjointFastKernel<data_t, dim> traverseAdjointFastKernel<data_t, dim>
<<<grid, _threadsPerBlock, <<<grid, tpb, numImageBlocks * MAX_THREADS_PER_BLOCK * sizeof(data_t)>>>(
numImageBlocks * MAX_THREADS_PER_BLOCK * sizeof(data_t)>>>(
(int8_t*) volume.ptr, volume.pitch, 0, numImageBlocks, sinogram, (int8_t*) volume.ptr, volume.pitch, 0, numImageBlocks, sinogram,
(const int8_t*) _rayOrigins.ptr, _rayOrigins.pitch, (const int8_t*) _rayOrigins.ptr, _rayOrigins.pitch,
(const int8_t*) _projMatrices.ptr, _projMatrices.pitch, numAngles); (const int8_t*) _projMatrices.ptr, _projMatrices.pitch, numAngles);
...@@ -1092,7 +1105,7 @@ namespace elsa ...@@ -1092,7 +1105,7 @@ namespace elsa
"TraverseJosephsCUDA: Couldn't create stream for remaining images"); "TraverseJosephsCUDA: Couldn't create stream for remaining images");
traverseAdjointFastKernel<data_t, dim> traverseAdjointFastKernel<data_t, dim>
<<<grid, _threadsPerBlock, remaining * sizeof(data_t), remStream>>>( <<<grid, tpb, remaining * sizeof(data_t), remStream>>>(
(int8_t*) volume.ptr, volume.pitch, offset, 1, sinogram, (int8_t*) volume.ptr, volume.pitch, offset, 1, sinogram,
(const int8_t*) _rayOrigins.ptr, _rayOrigins.pitch, (const int8_t*) _rayOrigins.ptr, _rayOrigins.pitch,
(const int8_t*) _projMatrices.ptr, _projMatrices.pitch, numAngles); (const int8_t*) _projMatrices.ptr, _projMatrices.pitch, numAngles);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment