// Device-side implicit conversion to `unsigned`: yields this object's `index`
// member unchanged, so the object can be used wherever a plain index is expected.
// NOTE(review): the enclosing type is outside this view — presumably the
// key/index pair used by the min/max element reductions; confirm against the
// struct definition.
20 __device__
operator unsigned ()
const {
return index; }
24template <
typename P,
typename I,
typename U>
25void cuda_find_if_loop(P&& p, I input,
unsigned count,
unsigned* idx, U pred) {
34 auto B = (
count + E::nv - 1) / E::nv;
40 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (
auto tid,
auto bid) {
42 __shared__
unsigned shm_id;
50 auto tile = cuda_get_tile(bid, E::nv, count);
52 auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
53 input + tile.begin, tid, tile.count()
58 for(
unsigned i=0; i<E::vt; i++) {
59 auto j = E::nt*i + tid;
60 if(j < tile.count() && pred(x[i])) {
80 atomicMin(&shm_id,
id);
85 atomicMin(idx, shm_id);
92template <
typename P,
typename I,
typename O>
93void cuda_min_element_loop(
94 P&& p, I input,
unsigned count,
unsigned* idx, O op,
void* ptr
102 using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
104 cuda_uninitialized_reduce_loop(p,
105 cuda_make_load_iterator<T>([=]__device__(
auto i){
106 return T{*(input+i), i};
110 [=] __device__ (
const auto& a,
const auto& b) {
111 return op(a.key, b.key) ? a : b;
118template <
typename P,
typename I,
typename O>
119void cuda_max_element_loop(
120 P&& p, I input,
unsigned count,
unsigned* idx, O op,
void* ptr
128 using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
130 cuda_uninitialized_reduce_loop(p,
131 cuda_make_load_iterator<T>([=]__device__(
auto i){
132 return T{*(input+i), i};
136 [=] __device__ (
const auto& a,
const auto& b) {
137 return op(a.key, b.key) ? b : a;
180template <
typename P,
typename I,
typename U>
182 P&& p, I first, I last,
unsigned* idx, U op
184 detail::cuda_find_if_loop(p, first,
std::distance(first, last), idx, op);
192template <
typename I,
typename U>
196 cap.
find_if(first, last, idx, op);
201template <
typename I,
typename U>
205 cap.
find_if(first, last, idx, op);
214template <
typename I,
typename U>
216 return on([=](cudaStream_t stream)
mutable {
223template <
typename I,
typename U>
225 cudaTask task, I first, I last,
unsigned* idx, U op
227 on(task, [=](cudaStream_t stream)
mutable {
248template <
typename P,
typename T>
250 return cuda_reduce_buffer_size<P, detail::cudaFindPair<T>>(count);
287template <
typename P,
typename I,
typename O>
289 detail::cuda_min_element_loop(
299template <
typename I,
typename O>
304 auto bufsz = cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, T>(
308 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
309 (cudaStream_t stream)
mutable {
316template <
typename I,
typename O>
318 cudaTask task, I first, I last,
unsigned* idx, O op
323 auto bufsz = cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, T>(
327 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
328 (cudaStream_t stream)
mutable {
339template <
typename I,
typename O>
348template <
typename I,
typename O>
350 cudaTask task, I first, I last,
unsigned* idx, O op
373template <
typename P,
typename T>
375 return cuda_reduce_buffer_size<P, detail::cudaFindPair<T>>(count);
412template <
typename P,
typename I,
typename O>
414 detail::cuda_max_element_loop(
424template <
typename I,
typename O>
429 auto bufsz = cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, T>(
433 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
434 (cudaStream_t stream)
mutable {
441template <
typename I,
typename O>
443 cudaTask task, I first, I last,
unsigned* idx, O op
448 auto bufsz = cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, T>(
452 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
453 (cudaStream_t stream)
mutable {
464template <
typename I,
typename O>
473template <
typename I,
typename O>
475 cudaTask task, I first, I last,
unsigned* idx, O op
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range
Definition find.hpp:215
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:300
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:425
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range
Definition find.hpp:193
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:340
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:465
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
cuda parallel-iteration algorithms include file
taskflow namespace
Definition small_vector.hpp:27
unsigned cuda_max_element_buffer_size(unsigned count)
queries the buffer size in bytes needed to call tf::cuda_max_element
Definition find.hpp:374
void cuda_single_task(P &&p, C c)
runs a callable asynchronously using one kernel thread
Definition for_each.hpp:69
void cuda_max_element(P &&p, I first, I last, unsigned *idx, O op, void *buf)
finds the index of the maximum element in a range
Definition find.hpp:413
void cuda_min_element(P &&p, I first, I last, unsigned *idx, O op, void *buf)
finds the index of the minimum element in a range
Definition find.hpp:288
void cuda_find_if(P &&p, I first, I last, unsigned *idx, U op)
finds the index of the first element that satisfies the given criteria
Definition find.hpp:181
unsigned cuda_min_element_buffer_size(unsigned count)
queries the buffer size in bytes needed to call tf::cuda_min_element
Definition find.hpp:249
cuda reduce algorithms include file