37 #ifndef PCL_DEVICE_UTILS_WARP_HPP_ 38 #define PCL_DEVICE_UTILS_WARP_HPP_ 54 static __device__ __forceinline__
unsigned int laneId()
57 asm(
"mov.u32 %0, %laneid;" :
"=r"(ret) );
63 #if (__CUDA_ARCH__ >= 200) 65 asm(
"mov.u32 %0, %lanemask_le;" :
"=r"(ret) );
68 return 0xFFFFFFFF >> (31 -
laneId());
74 #if (__CUDA_ARCH__ >= 200) 76 asm(
"mov.u32 %0, %lanemask_lt;" :
"=r"(ret) );
79 return 0xFFFFFFFF >> (32 -
laneId());
82 static __device__ __forceinline__
unsigned int id()
84 int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
98 template<
typename It,
typename T>
99 static __device__ __forceinline__
void fill(It beg, It end,
const T& value)
105 template<
typename InIt,
typename OutIt>
106 static __device__ __forceinline__ OutIt
copy(InIt beg, InIt end, OutIt out)
108 unsigned int lane =
laneId();
110 OutIt o = out + lane;
117 template<
typename InIt,
typename OutIt,
class UnOp>
118 static __device__ __forceinline__ OutIt
transform(InIt beg, InIt end, OutIt out, UnOp op)
120 unsigned int lane =
laneId();
122 OutIt o = out + lane;
129 template<
typename InIt1,
typename InIt2,
typename OutIt,
class BinOp>
130 static __device__ __forceinline__ OutIt
transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
132 unsigned int lane =
laneId();
133 InIt1 t1 = beg1 + lane;
134 InIt2 t2 = beg2 + lane;
135 OutIt o = out + lane;
142 template<
typename OutIt,
typename T>
143 static __device__ __forceinline__
void yota(OutIt beg, OutIt end, T value)
145 unsigned int lane =
laneId();
148 for(OutIt t = beg + lane; t < end; t +=
STRIDE, value +=
STRIDE)
152 template<
typename T,
class BinOp>
153 static __device__ __forceinline__
void reduce(
volatile T* buffer, BinOp op)
155 unsigned int lane =
laneId();
156 T val = buffer[lane];
160 buffer[lane] = val = op(val, buffer[lane + 16]);
161 buffer[lane] = val = op(val, buffer[lane + 8]);
162 buffer[lane] = val = op(val, buffer[lane + 4]);
163 buffer[lane] = val = op(val, buffer[lane + 2]);
164 buffer[lane] = val = op(val, buffer[lane + 1]);
168 template<
typename T,
class BinOp>
169 static __device__ __forceinline__ T
reduce(
volatile T* buffer, T init, BinOp op)
171 unsigned int lane =
laneId();
172 T val = buffer[lane] = init;
176 buffer[lane] = val = op(val, buffer[lane + 16]);
177 buffer[lane] = val = op(val, buffer[lane + 8]);
178 buffer[lane] = val = op(val, buffer[lane + 4]);
179 buffer[lane] = val = op(val, buffer[lane + 2]);
180 buffer[lane] = val = op(val, buffer[lane + 1]);
static __device__ __forceinline__ void fill(It beg, It end, const T &value)
static __device__ __forceinline__ int laneMaskLe()
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
static __device__ __forceinline__ unsigned int id()
static __device__ __forceinline__ int laneMaskLt()
static __device__ __forceinline__ T reduce(volatile T *buffer, T init, BinOp op)
static __device__ __forceinline__ void reduce(volatile T *buffer, BinOp op)
static __device__ __forceinline__ int binaryInclScan(int ballot_mask)
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)