Commit 38bbacfd authored by Nikola Dinev's avatar Nikola Dinev
Browse files

Added comments, fixed include directories, removed dead code

parent ac48bd43
Pipeline #151957 passed with stages
in 51 seconds
......@@ -69,7 +69,8 @@ namespace elsa
template <typename data_t>
JosephsMethodCUDA<data_t>::~JosephsMethodCUDA() {
JosephsMethodCUDA<data_t>::~JosephsMethodCUDA()
{
//Free CUDA resources
if (cudaFree(_rayOrigins.ptr)!=cudaSuccess ||
cudaFree(_projInvMatrices.ptr)!=cudaSuccess)
......@@ -88,7 +89,8 @@ namespace elsa
}
template <typename data_t>
bool JosephsMethodCUDA<data_t>::isEqual(const LinearOperator<data_t>& other) const {
bool JosephsMethodCUDA<data_t>::isEqual(const LinearOperator<data_t>& other) const
{
if (!LinearOperator<data_t>::isEqual(other))
return false;
......@@ -161,7 +163,7 @@ namespace elsa
cudaDeviceSynchronize();
//retrieve results from GPU
copy3DDataContainer<cudaMemcpyDeviceToHost,false>((void*)&Ax[0] ,dsinoPtr,sinoExt);
copy3DDataContainer<cudaMemcpyDeviceToHost,false>((void*)&Ax[0],dsinoPtr,sinoExt);
}
else {
if (cudaMallocPitch(&dsinoPtr.ptr,&dsinoPtr.pitch,rangeDims[0]*sizeof(data_t),rangeDims[1])!=cudaSuccess)
......@@ -324,7 +326,7 @@ namespace elsa
}
//retrieve results from GPU
copy3DDataContainer<cudaMemcpyDeviceToHost,false>((void*)&Aty[0], dvolumePtr,volExt);
copy3DDataContainer<cudaMemcpyDeviceToHost,false>((void*)&Aty[0],dvolumePtr,volExt);
}
else {
if(cudaMallocPitch(&dvolumePtr.ptr,&dvolumePtr.pitch,coeffsPerDim[0]*sizeof(data_t),coeffsPerDim[1])!=cudaSuccess)
......@@ -431,7 +433,8 @@ namespace elsa
template <typename data_t>
template <cudaMemcpyKind direction, bool async>
void JosephsMethodCUDA<data_t>::copy3DDataContainer(void* hostData,const cudaPitchedPtr& gpuData, const cudaExtent& extent) const{
void JosephsMethodCUDA<data_t>::copy3DDataContainer(void* hostData,const cudaPitchedPtr& gpuData, const cudaExtent& extent) const
{
cudaMemcpy3DParms cpyParams = {0};
cpyParams.extent = extent;
cpyParams.kind = direction;
......@@ -468,7 +471,8 @@ namespace elsa
template <typename data_t>
template <unsigned int flags>
std::pair<cudaTextureObject_t, cudaArray*> JosephsMethodCUDA<data_t>::copyTextureToGPU(const DataContainer<data_t>& hostData) const{
std::pair<cudaTextureObject_t, cudaArray*> JosephsMethodCUDA<data_t>::copyTextureToGPU(const DataContainer<data_t>& hostData) const
{
//transfer volume as texture
auto coeffsPerDim = hostData.getDataDescriptor().getNumberOfCoefficientsPerDimension();
......
......@@ -11,47 +11,61 @@
#include "LogGuard.h"
#include "Timer.h"
#include "projector_kernels/TraverseJosephsCUDA.cuh"
#include "TraverseJosephsCUDA.cuh"
namespace elsa
{
/**
* \brief Wrapper for the CUDA projector based on Joseph's method
* \brief GPU-operator representing the discretized X-ray transform in 2d/3d using Joseph's method.
*
* \author Nikola Dinev
*
* The projector provides two implementations for the backprojection -
* a precise backprojection, using the exact adjoint of the forward projection operator,
* and a fast backprojection, which makes use of the GPU's hardware interpolation capabilities.
* Currently only utilizes a single GPU.
* Volume and images should both fit in device memory at the same time.
* \tparam data_t data type for the domain and range of the operator, defaulting to real_t
*
* \author Nikola Dinev (nikola.dinev@tum.de)
* The volume is traversed along the rays as specified by the Geometry. For interior voxels
* the sampling point is located in the middle of the two planes orthogonal to the main
* direction of the ray. For boundary voxels the sampling point is located at the center of the
* ray intersection with the voxel.
*
* The geometry is represented as a list of projection matrices (see class Geometry), one for each
* acquisition pose.
*
* Forward projection is accomplished using apply(), backward projection using applyAdjoint().
* The projector provides two implementations for the backward projection. The slow version is matched,
* while the fast one is not.
*
* Currently only utilizes a single GPU. Volume and images should both fit in device memory at the same time.
*
* \warning Hardware interpolation is only supported for JosephsMethodCUDA<float>
* \warning Hardware interpolation is significantly less accurate than the software interpolation
*/
template <typename data_t = real_t>
class JosephsMethodCUDA : public LinearOperator<data_t>
{
public:
JosephsMethodCUDA() = delete;
/**
* \brief Construct a new projector for the angles defined in the geometry vector
* \brief Constructor for Joseph's traversal method.
*
* \param domainDescriptor input descriptor
* \param rangeDescriptor output descriptor
* \param geom describes the projection angles
* \param fast uses fast backprojection if set, otherwise precise
* \param threadsPerBlock number of threads per block used for the kernel configuration
* \param[in] domainDescriptor describing the domain of the operator (the volume)
* \param[in] rangeDescriptor describing the range of the operator (the sinogram)
* \param[in] geometryList vector containing the geometries for the acquisition poses
* \param[in] fast performs fast backward projection if set, otherwise matched; forward projection is unaffected
*
* The domain is expected to be 2 or 3 dimensional (volSizeX, volSizeY, [volSizeZ]),
* the range is expected to be matching the domain (detSizeX, [detSizeY], acqPoses).
*/
JosephsMethodCUDA(const DataDescriptor &domainDescriptor, const DataDescriptor &rangeDescriptor,
const std::vector<Geometry> &geometryList, bool fast = true);
/// destructor
virtual ~JosephsMethodCUDA();
protected:
/// apply the binary method (i.e. forward projection)
/// apply Joseph's method (i.e. forward projection)
void _apply(const DataContainer<data_t>& x, DataContainer<data_t>& Ax) override;
/// apply the adjoint of the binary method (i.e. backward projection)
/// apply the adjoint of Joseph's method (i.e. backward projection)
void _applyAdjoint(const DataContainer<data_t>& y, DataContainer<data_t>& Aty) override;
/// implement the polymorphic clone operation
......@@ -61,30 +75,58 @@ namespace elsa
bool isEqual(const LinearOperator<data_t>& other) const override;
private:
const BoundingBox _boundingBox;
const std::vector<Geometry> _geometryList;
/// the bounding box of the volume
BoundingBox _boundingBox;
/// the geometry list
std::vector<Geometry> _geometryList;
/// threads per block used in the kernel execution configuration
const int _threadsPerBlock = TraverseJosephsCUDA<data_t>::MAX_THREADS_PER_BLOCK;
/// flag specifying which version of the backward projection should be used
const bool _fast;
// Inverse of of projection matrices; stored column-wise on GPU
/// inverse of projection matrices; stored column-wise on GPU
cudaPitchedPtr _projInvMatrices;
// Projection matrices; stored column-wise on GPU
/// projection matrices; stored column-wise on GPU
cudaPitchedPtr _projMatrices;
// Ray origin for each angle
/// ray origins for each acquisition angle
cudaPitchedPtr _rayOrigins;
/// convenience typedef for cuda array flags
using cudaArrayFlags = unsigned int;
/**
* \brief Copies contents of a 3D data container to and from the gpu
* \brief Copies contents of a 3D data container between GPU and host memory
*
* \tparam direction specifies the direction of the copy operation
* \tparam async whether the copy should be performed asynchronously wrt. the host
*
* \param hostData pointer to host data
* \param gpuData pointer to gpu data
* \param[in] extent specifies the amount of data to be copied
*
* Note that hostData is expected to be a pointer to a linear memory region with no padding between
* dimensions - e.g. the data in DataContainer is stored as a vector with no extra padding, and the
* pointer to the start of the memory region can be retrieved as follows:
*
* DataContainer x;
* void* hostData = (void*)&x[0];
*/
template <cudaMemcpyKind direction, bool async=true>
void copy3DDataContainer(void* hostData,const cudaPitchedPtr& gpuData, const cudaExtent& extent) const;
/**
* \brief Copies the entire contents of hostData to the GPU, returns the created texture object and its associated cudaArray
* \brief Copies the entire contents of DataContainer to the GPU texture memory
*
* \tparam cudaArrayFlags flags used for the creation of the cudaArray which will contain the data
*
* \param[in] hostData the host data container
*
* \returns a pair of the created texture object and its associated cudaArray
*/
template <cudaArrayFlags flags = 0U>
std::pair<cudaTextureObject_t, cudaArray*> copyTextureToGPU(const DataContainer<data_t>& hostData) const;
......
......@@ -63,7 +63,8 @@ namespace elsa
template <typename data_t>
SiddonsMethodCUDA<data_t>::~SiddonsMethodCUDA() {
SiddonsMethodCUDA<data_t>::~SiddonsMethodCUDA()
{
//Free CUDA resources
if (cudaFree(_boxMax)!=cudaSuccess ||
cudaFree(_rayOrigins.ptr)!=cudaSuccess ||
......@@ -259,7 +260,8 @@ namespace elsa
template <typename data_t>
template <cudaMemcpyKind direction, bool async>
void SiddonsMethodCUDA<data_t>::copy3DDataContainerGPU(void* hostData,const cudaPitchedPtr& gpuData, const cudaExtent& extent) const{
void SiddonsMethodCUDA<data_t>::copy3DDataContainerGPU(void* hostData,const cudaPitchedPtr& gpuData, const cudaExtent& extent) const
{
cudaMemcpy3DParms cpyParams = {0};
cpyParams.extent = extent;
cpyParams.kind = direction;
......
......@@ -9,38 +9,55 @@
#include "LogGuard.h"
#include "Timer.h"
#include "projector_kernels/TraverseSiddonsCUDA.cuh"
#include "TraverseSiddonsCUDA.cuh"
namespace elsa
{
/**
* \brief Wrapper for the CUDA projector based on Siddon's method
* \brief GPU-operator representing the discretized X-ray transform in 2d/3d using Siddon's method.
*
* \author Nikola Dinev
*
* \tparam data_t data type for the domain and range of the operator, defaulting to real_t
*
* The volume is traversed along the rays as specified by the Geometry. Each ray is traversed in a
contiguous fashion (i.e. along voxel borders, not diagonally) and each traversed voxel is
* counted as a hit with weight according to the length of the path of the ray through the voxel.
*
* The geometry is represented as a list of projection matrices (see class Geometry), one for each
* acquisition pose.
*
* Forward projection is accomplished using apply(), backward projection using applyAdjoint().
* This projector is matched.
*
* Currently only utilizes a single GPU.
* Volume and images should both fit in device memory at the same time.
*
* \author Nikola Dinev (nikola.dinev@tum.de)
* Currently only utilizes a single GPU. Volume and images should both fit in device memory at the same time.
*/
template <typename data_t = real_t>
class SiddonsMethodCUDA : public LinearOperator<data_t>
{
public:
SiddonsMethodCUDA() = delete;
/**
* \brief Construct a new projector for the angles defined in the geometry vector
*/
* \brief Constructor for Siddon's method traversal.
*
* \param[in] domainDescriptor describing the domain of the operator (the volume)
* \param[in] rangeDescriptor describing the range of the operator (the sinogram)
* \param[in] geometryList vector containing the geometries for the acquisition poses
*
* The domain is expected to be 2 or 3 dimensional (volSizeX, volSizeY, [volSizeZ]),
* the range is expected to be matching the domain (detSizeX, [detSizeY], acqPoses).
*/
SiddonsMethodCUDA(const DataDescriptor &domainDescriptor, const DataDescriptor &rangeDescriptor,
const std::vector<Geometry> &geom);
/// destructor
~SiddonsMethodCUDA();
protected:
/// apply the binary method (i.e. forward projection)
/// apply Siddon's method (i.e. forward projection)
void _apply(const DataContainer<data_t>& x, DataContainer<data_t>& Ax) override;
/// apply the adjoint of the binary method (i.e. backward projection)
/// apply the adjoint of Siddon's method (i.e. backward projection)
void _applyAdjoint(const DataContainer<data_t>& y, DataContainer<data_t>& Aty) override;
/// implement the polymorphic clone operation
......@@ -50,25 +67,45 @@ namespace elsa
bool isEqual(const LinearOperator<data_t>& other) const override;
private:
const BoundingBox _boundingBox;
const std::vector<Geometry> _geometryList;
/// the bounding box of the volume
BoundingBox _boundingBox;
/// the geometry list
std::vector<Geometry> _geometryList;
/// threads per block used in the kernel execution configuration
const int _threadsPerBlock = 64;
/**
* Inverse of significant part of projection matrices; stored column-wise
*/
/// inverse of projection matrices; stored column-wise on GPU
cudaPitchedPtr _projInvMatrices;
/**
* Ray origin for each direction
*/
/// ray origins for each acquisition angle
cudaPitchedPtr _rayOrigins;
/// pointer to device global memory storing _boundingBox.max
uint* _boxMax;
/// sets up and starts the kernel traversal routine (for both apply/applyAdjoint)
template<bool adjoint>
void traverseVolume(void* volumePtr, void* sinoPtr) const;
/**
* \brief Copies contents of a 3D data container between GPU and host memory
*
* \tparam direction specifies the direction of the copy operation
* \tparam async whether the copy should be performed asynchronously wrt. the host
*
* \param hostData pointer to host data
* \param gpuData pointer to gpu data
* \param[in] extent specifies the amount of data to be copied
*
* Note that hostData is expected to be a pointer to a linear memory region with no padding between
* dimensions - e.g. the data in DataContainer is stored as a vector with no extra padding, and the
* pointer to the start of the memory region can be retrieved as follows:
*
* DataContainer x;
* void* hostData = (void*)&x[0];
*/
template <cudaMemcpyKind direction, bool async=true>
void copy3DDataContainerGPU(void* hostData,const cudaPitchedPtr& gpuData, const cudaExtent& extent) const;
......
......@@ -96,12 +96,14 @@ add_library(${ELSA_MODULE_TARGET_NAME} ${MODULE_HEADERS} ${MODULE_SOURCES})
add_library(elsa::${ELSA_MODULE_NAME} ALIAS ${ELSA_MODULE_TARGET_NAME})
#CUDA doesn't support C++17, require C++14
target_compile_features(${ELSA_MODULE_TARGET_NAME} PUBLIC cxx_std_14)
set_target_properties(${ELSA_MODULE_TARGET_NAME} PROPERTIES CUDA_STANDARD 14 POSITION_INDEPENDENT_CODE ON)
#CUDA doesn't support linking against a library built under the C++17 standard,
#we only need the elsa.h header -> manually specify include directories
target_include_directories(${ELSA_MODULE_TARGET_NAME} PUBLIC
$<TARGET_PROPERTY:elsa_core,INTERFACE_INCLUDE_DIRECTORIES>)
$<TARGET_PROPERTY:elsa_core,INTERFACE_INCLUDE_DIRECTORIES>
$<INSTALL_INTERFACE:include/${ELSA_MODULE_NAME}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
# register the module
registerComponent(${ELSA_MODULE_NAME})
......
......@@ -29,6 +29,7 @@ __device__ __forceinline__ void gesqmv(const int8_t* const __restrict__ matrix,
}
}
/// normalizes a vector of length 2 or 3 using device inbuilt functions
template <typename real_t, uint32_t dim>
__device__ __forceinline__ void normalize(real_t* const __restrict__ vector)
{
......@@ -44,6 +45,7 @@ __device__ __forceinline__ void normalize(real_t* const __restrict__ vector)
}
}
/// calculates the point at a distance delta from the ray origin ro in direction rd
template <typename real_t, uint32_t dim>
__device__ __forceinline__ void pointAt(const real_t* const __restrict__ ro,
const real_t* const __restrict__ rd,
......@@ -55,6 +57,7 @@ __device__ __forceinline__ void pointAt(const real_t* const __restrict__ ro,
result[i] = delta*rd[i]+ro[i];
}
/// projects a point onto the bounding box by clipping (points inside the bounding box are unaffected)
template <typename real_t, uint32_t dim>
__device__ __forceinline__ void projectOntoBox(real_t* const __restrict__ point,
const real_t* const __restrict__ boxMax)
......@@ -67,6 +70,7 @@ __device__ __forceinline__ void projectOntoBox(real_t* const __restrict__ point,
}
/// determines the voxel that contains a point, if the point is on a border the voxel in the ray direction is favored
template <typename real_t, uint32_t dim>
__device__ __forceinline__ bool closestVoxel(const real_t* const __restrict__ point,
const real_t* const __restrict__ boxMax,
......@@ -85,6 +89,7 @@ __device__ __forceinline__ bool closestVoxel(const real_t* const __restrict__ po
return true;
}
/// initializes stepDir with the sign of rd
template <typename real_t, uint32_t dim>
__device__ __forceinline__ void initStepDirection(const real_t* const __restrict__ rd,
int* const __restrict__ stepDir)
......@@ -94,6 +99,7 @@ __device__ __forceinline__ void initStepDirection(const real_t* const __restrict
stepDir[i] = ((rd[i]>0.0f) - (rd[i]<0.0f));
}
/// initialize step sizes considering the ray direction
template <typename real_t, uint32_t dim>
__device__ __forceinline__ void initDelta(const real_t* const __restrict__ rd,
const int* const __restrict__ stepDir,
......@@ -105,20 +111,7 @@ __device__ __forceinline__ void initDelta(const real_t* const __restrict__ rd,
}
}
template <typename real_t, uint32_t dim>
__device__ __forceinline__ void initMax(const real_t* const __restrict__ rd,
const uint32_t* const __restrict__ currentVoxel,
const real_t* const __restrict__ point,
real_t* const __restrict__ tmax)
{
real_t nextBoundary;
#pragma unroll
for (int i=0;i<dim;i++) {
nextBoundary = rd[i]>0.0f ? currentVoxel[i] + 1 : currentVoxel[i];
tmax[i] = rd[i]>=-__FLT_EPSILON__ && rd[i]<=__FLT_EPSILON__ ? __FLT_MAX__ : (nextBoundary - point[i])/rd[i];
}
}
/// find intersection points of ray with AABB
template <typename real_t, uint32_t dim>
__device__ __forceinline__ bool box_intersect(const real_t* const __restrict__ ro,
const real_t* const __restrict__ rd,
......@@ -158,6 +151,7 @@ __device__ __forceinline__ bool box_intersect(const real_t* const __restrict__ r
return false;
}
/// returns the index of the smallest element in an array
template <typename real_t, uint32_t dim>
__device__ __forceinline__ uint32_t minIndex(const real_t* const __restrict__ array) {
uint32_t index = 0;
......@@ -173,6 +167,7 @@ __device__ __forceinline__ uint32_t minIndex(const real_t* const __restrict__ ar
return index;
}
/// return the index of the element with the maximum absolute value in array
template <typename real_t, uint32_t dim>
__device__ __forceinline__ uint32_t maxAbsIndex(const real_t* const __restrict__ array) {
uint32_t index = 0;
......@@ -188,6 +183,7 @@ __device__ __forceinline__ uint32_t maxAbsIndex(const real_t* const __restrict__
return index;
}
/// currentPosition is advanced by dist in direction rd
template <typename real_t, uint32_t dim>
__device__ __forceinline__ void updateTraverse(real_t* const __restrict__ currentPosition,
const real_t* const __restrict__ rd,
......@@ -198,6 +194,7 @@ __device__ __forceinline__ void updateTraverse(real_t* const __restrict__ curren
currentPosition[i]+=rd[i]*dist;
}
/// convenience function for texture fetching
template <typename data_t, uint dim>
__device__ __forceinline__ data_t tex(cudaTextureObject_t texObj, const elsa::real_t* const p) {
if(dim==3)
......@@ -206,11 +203,13 @@ __device__ __forceinline__ data_t tex(cudaTextureObject_t texObj, const elsa::re
return tex2D<data_t>(texObj, p[0],p[1]);
}
/// fetches double at position (x,y) from 2D texture
__device__ __forceinline__ double tex2Dd(cudaTextureObject_t texObj, const float x, const float y) {
uint2 rt = tex2D<uint2>(texObj, x, y);
return __hiloint2double(rt.y, rt.x);
}
/// template specialization for double texture fetches
template<>
__device__ __forceinline__ double tex<double,2>(cudaTextureObject_t texObj, const elsa::real_t* const p) {
elsa::real_t x = p[0] - 0.5f;
......@@ -231,11 +230,13 @@ __device__ __forceinline__ double tex<double,2>(cudaTextureObject_t texObj, cons
return (1-a)*(1-b)*T[0][0] + a*(1-b)*T[1][0] + (1-a)*b*T[0][1] + a*b*T[1][1];
}
/// fetches double at position (x,y,z) from 3D texture
__device__ __forceinline__ double tex3Dd(cudaTextureObject_t texObj, const float x, const float y, const elsa::real_t z) {
uint2 rt = tex3D<uint2>(texObj, x, y, z);
return __hiloint2double(rt.y, rt.x);
}
/// template specialization for double texture fetches
template<>
__device__ __forceinline__ double tex<double,3>(cudaTextureObject_t texObj, const elsa::real_t* const p) {
elsa::real_t x = p[0] - 0.5f;
......@@ -378,11 +379,13 @@ __global__ void __launch_bounds__(elsa::TraverseJosephsCUDA<data_t,dim>::MAX_THR
*sinogramPtr = pixelValue;
}
/// fetches double at position x, layer layer from a 1D layered texture
__device__ __forceinline__ double tex1DLayeredd(cudaTextureObject_t texObj, const float x, const int layer) {
uint2 rt = tex1DLayered<uint2>(texObj, x, layer);
return __hiloint2double(rt.y, rt.x);
}
/// template specialization for layered texture fetches
template <>
double tex1DLayered<double>(cudaTextureObject_t texObj, elsa::real_t x, const int layer) {
x = x - 0.5f;
......@@ -398,11 +401,13 @@ double tex1DLayered<double>(cudaTextureObject_t texObj, elsa::real_t x, const in
return (1-a)*T[0]+a*T[1];
}
/// fetches double at position (x,y), layer layer from a 2D layered texture
__device__ __forceinline__ double tex2DLayeredd(cudaTextureObject_t texObj, const float x, const float y, const int layer) {
uint2 rt = tex2DLayered<uint2>(texObj, x, y, layer);
return __hiloint2double(rt.y, rt.x);
}
/// template specialization for layered texture fetches
template <>
double tex2DLayered<double>(cudaTextureObject_t texObj, elsa::real_t x, elsa::real_t y, const int layer) {
x = x - 0.5f;
......@@ -510,6 +515,7 @@ __global__ void __launch_bounds__(elsa::TraverseJosephsCUDA<data_t,dim>::MAX_TH
}
#endif
/// backprojects the weighted sinogram value to a given pixel
template <typename data_t,uint dim>
__device__ __forceinline__ void backproject2(int8_t* const __restrict__ volume,
const uint64_t* const __restrict__ p,
......@@ -530,6 +536,7 @@ __device__ __forceinline__ void backproject2(int8_t* const __restrict__ volume,
atomicAdd(volumeXPtr, val);
}
/// backprojects the weighted sinogram value to a given voxel
template <typename data_t,uint dim>
__device__ __forceinline__ void backproject4(int8_t* const __restrict__ volume,
const uint64_t* const __restrict__ p,
......@@ -562,6 +569,7 @@ __device__ __forceinline__ void backproject4(int8_t* const __restrict__ volume,
atomicAdd(volumeXPtr, val);
}
/// convenience method for backprojecting to a given pixel/voxel
template <typename data_t, uint dim>
__device__ __forceinline__ void backproject(int8_t* const __restrict__ volume,
const uint64_t* const __restrict__ p,
......@@ -576,6 +584,7 @@ __device__ __forceinline__ void backproject(int8_t* const __restrict__ volume,
backproject2<data_t,dim>(volume,p,voxelCoord,voxelCoordf,boxMax,frac,weightedVal);
}
/// swaps the values of a and b
template <typename T>
__device__ __forceinline__ void swap(T& a, T& b) {
T c = a;
......
......@@ -16,11 +16,16 @@
namespace elsa
{
template <typename data_t = float, uint dim = 3>
template <typename data_t = elsa::real_t, uint dim = 3>
struct TraverseJosephsCUDA
{
const static uint32_t MAX_THREADS_PER_BLOCK = 64;
/**
* Allows for the bounding box to be passed to the kernel by value.
* Kernel arguments are stored in constant memory, and should generally
* provide faster access to the variables than via global memory.
*/
struct BoundingBox
{
//min is always 0
......@@ -36,6 +41,31 @@ template <typename data_t = float, uint dim = 3>
}
};
/**
* \brief Forward projection using Joseph's method
*
* \param[in] blocks specifies the grid used for kernel execution
* \param[in] threads specifies the number of threads for each block
* \param[in] volume handle to the texture object containing the volume data
* \param[out] sinogram pointer to output
* \param[in] sinogramPitch pitch (padded width in bytes) of sinogram
* \param[in] rayOrigins pointer to ray origins
* \param[in] originPitch pitch of ray origins
* \param[in] projInv pointer to inverse of projection matrices
* \param[in] projPitch pitch of inverse of projection matrices
* \param[in] boxMax specifies the size of the volume
* \param[in] stream handle to stream in which the kernel should be placed
*
* The variables blocks and threads should be picked based on the sinogram dimensions. To process all
* rays set blocks to (detectorSizeX, detectorSizeY, numAngles / threads), if numAngles is not a multiple of threads
* a second kernel call must be made to process the remaining rays with blocks = (detectorSizeX, detectorSizeY, 1)
* and threads = numAngles % threadsFirstCall. Sinogram, projection matrix, and ray origin pointers should be
* adjusted accordingly to point to the start of the (numAngles - numAngles % threadsFirstCall)-th element.
*
* threads should ideally be a multiple of the warp size (32 for all current GPUs).
*
* \warning Interpolation mode for texture should be set to cudaFilterModePoint when working with doubles.
*/
static void traverseForward(const dim3 blocks, const int threads,
cudaTextureObject_t volume,
int8_t *const __restrict__ sinogram,
......@@ -48,10 +78,32 @@ template <typename data_t = float, uint dim = 3>
const cudaStream_t stream = 0);
/**
* \brief Acts as the exact adjoint of the forward traversal operator.
*
* Volume has to be zero initialized to guarantee correct output.
*/
* \brief Backward projection using Joseph's method
*