30 #include <type_traits>
40 #include <cuda_runtime.h>
50 static constexpr int64_t OPEN3D_PARFOR_BLOCK = 128;
51 static constexpr int64_t OPEN3D_PARFOR_THREAD = 4;
54 template <
int64_t block_size,
int64_t thread_size,
typename func_t>
55 __global__
void ElementWiseKernel_(int64_t n, func_t f) {
56 int64_t items_per_block = block_size * thread_size;
57 int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
59 for (int64_t i = 0; i < thread_size; ++i) {
68 template <
typename func_t>
69 void ParallelForCUDA_(
const Device& device, int64_t n,
const func_t& func) {
78 CUDAScopedDevice scoped_device(device);
79 int64_t items_per_block = OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
80 int64_t grid_size = (n + items_per_block - 1) / items_per_block;
82 ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
83 <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
91 template <
typename func_t>
101 #pragma omp parallel for num_threads(utility::EstimateMaxThreads())
102 for (int64_t i = 0; i < n; ++i) {
121 template <
typename func_t>
124 ParallelForCUDA_(device, n, func);
176 template <
typename vec_func_t,
typename func_t>
180 const vec_func_t& vec_func) {
181 #ifdef BUILD_ISPC_MODULE
184 ParallelForCUDA_(device, n, func);
188 int64_t start = n * i / num_threads;
189 int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
190 vec_func(start, end);
197 ParallelForCUDA_(device, n, func);
205 #ifdef BUILD_ISPC_MODULE
208 #define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
209 using namespace ispc; \
210 ISPCKernel(start, end, __VA_ARGS__);
215 #define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
217 "ISPC module disabled. Unable to call vectorized kernel {}", \
218 OPEN3D_STRINGIFY(ISPCKernel));
223 #define OPEN3D_OVERLOADED_LAMBDA_(T, ISPCKernel, ...) \
224 [&](T, int64_t start, int64_t end) { \
225 OPEN3D_CALL_ISPC_KERNEL_( \
226 OPEN3D_CONCAT(ISPCKernel, OPEN3D_CONCAT(_, T)), start, end, \
239 #define OPEN3D_VECTORIZED(ISPCKernel, ...) \
240 [&](int64_t start, int64_t end) { \
241 OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__); \
257 #define OPEN3D_TEMPLATE_VECTORIZED(T, ISPCKernel, ...) \
258 [&](int64_t start, int64_t end) { \
259 static_assert(std::is_arithmetic<T>::value, \
260 "Data type is not an arithmetic type"); \
262 OPEN3D_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__), \
263 OPEN3D_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, __VA_ARGS__), \
264 OPEN3D_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, __VA_ARGS__), \
265 OPEN3D_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, __VA_ARGS__), \
266 OPEN3D_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, __VA_ARGS__), \
267 OPEN3D_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, __VA_ARGS__), \
268 OPEN3D_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, __VA_ARGS__), \
269 OPEN3D_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, __VA_ARGS__), \
270 OPEN3D_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, __VA_ARGS__), \
271 OPEN3D_OVERLOADED_LAMBDA_(float, ISPCKernel, __VA_ARGS__), \
272 OPEN3D_OVERLOADED_LAMBDA_(double, ISPCKernel, __VA_ARGS__), \
273 [&](auto&& generic, int64_t start, int64_t end) { \
275 "Unsupported data type {} for calling " \
276 "vectorized kernel {}", \
277 typeid(generic).name(), \
278 OPEN3D_STRINGIFY(ISPCKernel)); \
279 })(T{}, start, end); \
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:67
#define LogError(...)
Definition: Logging.h:67
std::string ToString() const
Definition: Device.h:75
DeviceType GetType() const
Definition: Device.h:91
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:92
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:122
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:50
Definition: PinholeCameraIntrinsic.cpp:35