Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
find.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "for_each.hpp"
4#include "reduce.hpp"
5
11namespace tf::detail {
12
14template <typename T>
15struct cudaFindPair {
16
17 T key;
18 unsigned index;
19
20 __device__ operator unsigned () const { return index; }
21};
22
24template <typename P, typename I, typename U>
25void cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred) {
26
27 if(count == 0) {
28 cuda_single_task(p, [=] __device__ () { *idx = 0; });
29 return;
30 }
31
32 using E = std::decay_t<P>;
33
34 auto B = (count + E::nv - 1) / E::nv;
35
36 // set the index to the maximum
37 cuda_single_task(p, [=] __device__ () { *idx = count; });
38
39 // launch the kernel to atomic-find the minimum
40 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
41
42 __shared__ unsigned shm_id;
43
44 if(!tid) {
45 shm_id = count;
46 }
47
48 __syncthreads();
49
50 auto tile = cuda_get_tile(bid, E::nv, count);
51
52 auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
53 input + tile.begin, tid, tile.count()
54 );
55
56 auto id = count;
57
58 for(unsigned i=0; i<E::vt; i++) {
59 auto j = E::nt*i + tid;
60 if(j < tile.count() && pred(x[i])) {
61 id = j + tile.begin;
62 break;
63 }
64 }
65
66 // Note: the reduce version is not faster though
67 // reduce to a scalar per block.
68 //__shared__ typename cudaBlockReduce<E::nt, unsigned>::Storage shm;
69
70 //id = cudaBlockReduce<E::nt, unsigned>()(
71 // tid,
72 // id,
73 // shm,
74 // (tile.count() < E::nt ? tile.count() : E::nt),
75 // cuda_minimum<unsigned>{},
76 // false
77 //);
78
79 // only need the minimum id
80 atomicMin(&shm_id, id);
81 __syncthreads();
82
83 // reduce all to the global memory
84 if(!tid) {
85 atomicMin(idx, shm_id);
86 //atomicMin(idx, id);
87 }
88 });
89}
90
92template <typename P, typename I, typename O>
93void cuda_min_element_loop(
94 P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
95) {
96
97 if(count == 0) {
98 cuda_single_task(p, [=] __device__ () { *idx = 0; });
99 return;
100 }
101
102 using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
103
104 cuda_uninitialized_reduce_loop(p,
105 cuda_make_load_iterator<T>([=]__device__(auto i){
106 return T{*(input+i), i};
107 }),
108 count,
109 idx,
110 [=] __device__ (const auto& a, const auto& b) {
111 return op(a.key, b.key) ? a : b;
112 },
113 ptr
114 );
115}
116
118template <typename P, typename I, typename O>
119void cuda_max_element_loop(
120 P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
121) {
122
123 if(count == 0) {
124 cuda_single_task(p, [=] __device__ () { *idx = 0; });
125 return;
126 }
127
128 using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
129
130 cuda_uninitialized_reduce_loop(p,
131 cuda_make_load_iterator<T>([=]__device__(auto i){
132 return T{*(input+i), i};
133 }),
134 count,
135 idx,
136 [=] __device__ (const auto& a, const auto& b) {
137 return op(a.key, b.key) ? b : a;
138 },
139 ptr
140 );
141}
142
143} // end of namespace tf::detail ---------------------------------------------
144
145namespace tf {
146
147
148// ----------------------------------------------------------------------------
149// cuda_find_if
150// ----------------------------------------------------------------------------
151
180template <typename P, typename I, typename U>
182 P&& p, I first, I last, unsigned* idx, U op
183) {
184 detail::cuda_find_if_loop(p, first, std::distance(first, last), idx, op);
185}
186
187// ----------------------------------------------------------------------------
188// cudaFlow
189// ----------------------------------------------------------------------------
190
191// Function: find_if
192template <typename I, typename U>
193cudaTask cudaFlow::find_if(I first, I last, unsigned* idx, U op) {
194 return capture([=](cudaFlowCapturer& cap){
196 cap.find_if(first, last, idx, op);
197 });
198}
199
200// Function: find_if
201template <typename I, typename U>
202void cudaFlow::find_if(cudaTask task, I first, I last, unsigned* idx, U op) {
203 capture(task, [=](cudaFlowCapturer& cap){
205 cap.find_if(first, last, idx, op);
206 });
207}
208
209// ----------------------------------------------------------------------------
210// cudaFlowCapturer
211// ----------------------------------------------------------------------------
212
213// Function: find_if
214template <typename I, typename U>
215cudaTask cudaFlowCapturer::find_if(I first, I last, unsigned* idx, U op) {
216 return on([=](cudaStream_t stream) mutable {
218 cuda_find_if(p, first, last, idx, op);
219 });
220}
221
222// Function: find_if
223template <typename I, typename U>
225 cudaTask task, I first, I last, unsigned* idx, U op
226) {
227 on(task, [=](cudaStream_t stream) mutable {
229 cuda_find_if(p, first, last, idx, op);
230 });
231}
232
233// ----------------------------------------------------------------------------
234// cuda_min_element
235// ----------------------------------------------------------------------------
236
248template <typename P, typename T>
249unsigned cuda_min_element_buffer_size(unsigned count) {
250 return cuda_reduce_buffer_size<P, detail::cudaFindPair<T>>(count);
251}
252
287template <typename P, typename I, typename O>
288void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
289 detail::cuda_min_element_loop(
290 p, first, std::distance(first, last), idx, op, buf
291 );
292}
293
294// ----------------------------------------------------------------------------
295// cudaFlowCapturer::min_element
296// ----------------------------------------------------------------------------
297
298// Function: min_element
299template <typename I, typename O>
300cudaTask cudaFlowCapturer::min_element(I first, I last, unsigned* idx, O op) {
301
302 using T = typename std::iterator_traits<I>::value_type;
303
304 auto bufsz = cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, T>(
305 std::distance(first, last)
306 );
307
308 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
309 (cudaStream_t stream) mutable {
311 cuda_min_element(p, first, last, idx, op, buf.get().data());
312 });
313}
314
315// Function: min_element
316template <typename I, typename O>
318 cudaTask task, I first, I last, unsigned* idx, O op
319) {
320
321 using T = typename std::iterator_traits<I>::value_type;
322
323 auto bufsz = cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, T>(
324 std::distance(first, last)
325 );
326
327 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
328 (cudaStream_t stream) mutable {
330 cuda_min_element(p, first, last, idx, op, buf.get().data());
331 });
332}
333
334// ----------------------------------------------------------------------------
335// cudaFlow::min_element
336// ----------------------------------------------------------------------------
337
338// Function: min_element
339template <typename I, typename O>
340cudaTask cudaFlow::min_element(I first, I last, unsigned* idx, O op) {
341 return capture([=](cudaFlowCapturer& cap){
343 cap.min_element(first, last, idx, op);
344 });
345}
346
347// Function: min_element
348template <typename I, typename O>
350 cudaTask task, I first, I last, unsigned* idx, O op
351) {
352 capture(task, [=](cudaFlowCapturer& cap){
354 cap.min_element(first, last, idx, op);
355 });
356}
357
358// ----------------------------------------------------------------------------
359// cuda_max_element
360// ----------------------------------------------------------------------------
361
373template <typename P, typename T>
374unsigned cuda_max_element_buffer_size(unsigned count) {
375 return cuda_reduce_buffer_size<P, detail::cudaFindPair<T>>(count);
376}
377
412template <typename P, typename I, typename O>
413void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
414 detail::cuda_max_element_loop(
415 p, first, std::distance(first, last), idx, op, buf
416 );
417}
418
419// ----------------------------------------------------------------------------
420// cudaFlowCapturer::max_element
421// ----------------------------------------------------------------------------
422
423// Function: max_element
424template <typename I, typename O>
425cudaTask cudaFlowCapturer::max_element(I first, I last, unsigned* idx, O op) {
426
427 using T = typename std::iterator_traits<I>::value_type;
428
429 auto bufsz = cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, T>(
430 std::distance(first, last)
431 );
432
433 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
434 (cudaStream_t stream) mutable {
436 cuda_max_element(p, first, last, idx, op, buf.get().data());
437 });
438}
439
440// Function: max_element
441template <typename I, typename O>
443 cudaTask task, I first, I last, unsigned* idx, O op
444) {
445
446 using T = typename std::iterator_traits<I>::value_type;
447
448 auto bufsz = cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, T>(
449 std::distance(first, last)
450 );
451
452 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
453 (cudaStream_t stream) mutable {
455 cuda_max_element(p, first, last, idx, op, buf.get().data());
456 });
457}
458
459// ----------------------------------------------------------------------------
460// cudaFlow::max_element
461// ----------------------------------------------------------------------------
462
463// Function: max_element
464template <typename I, typename O>
465cudaTask cudaFlow::max_element(I first, I last, unsigned* idx, O op) {
466 return capture([=](cudaFlowCapturer& cap){
468 cap.max_element(first, last, idx, op);
469 });
470}
471
472// Function: max_element
473template <typename I, typename O>
475 cudaTask task, I first, I last, unsigned* idx, O op
476) {
477 capture(task, [=](cudaFlowCapturer& cap){
479 cap.max_element(first, last, idx, op);
480 });
481}
482
483} // end of namespace tf -----------------------------------------------------
484
485
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range
Definition find.hpp:215
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:300
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:425
cudaTask on(C &&callable)
captures a sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range
Definition find.hpp:193
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:340
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:465
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
T count(T... args)
T distance(T... args)
cuda parallel-iteration algorithms include file
T forward(T... args)
taskflow namespace
Definition small_vector.hpp:27
unsigned cuda_max_element_buffer_size(unsigned count)
queries the buffer size in bytes needed to call tf::cuda_max_element
Definition find.hpp:374
void cuda_single_task(P &&p, C c)
runs a callable asynchronously using one kernel thread
Definition for_each.hpp:69
void cuda_max_element(P &&p, I first, I last, unsigned *idx, O op, void *buf)
finds the index of the maximum element in a range
Definition find.hpp:413
void cuda_min_element(P &&p, I first, I last, unsigned *idx, O op, void *buf)
finds the index of the minimum element in a range
Definition find.hpp:288
void cuda_find_if(P &&p, I first, I last, unsigned *idx, U op)
finds the index of the first element that satisfies the given criteria
Definition find.hpp:181
unsigned cuda_min_element_buffer_size(unsigned count)
queries the buffer size in bytes needed to call tf::cuda_min_element
Definition find.hpp:249
cuda reduce algorithms include file