Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
merge.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "../cudaflow.hpp"
4
10namespace tf::detail {
11
// Bound type used by merge-path searches: LOWER places equal keys from the
// a-stream first (stable lower-bound semantics), UPPER places them after the
// equal keys of the b-stream (upper-bound semantics).
enum class cudaMergeBoundType {
  LOWER,
  UPPER
};
20
// Per-thread result of a serial merge: `keys` holds the N merged key values
// and `indices` holds each key's source position.  Judging by the gather
// helpers below, an index < a_count refers to the a-stream and an index
// >= a_count refers to the b-stream (offset by a_count).
template<typename T, unsigned N>
struct cudaMergePair {
  cudaArray<T, N> keys;
  cudaArray<unsigned, N> indices;
};
27
// Half-open index ranges [a_begin, a_end) and [b_begin, b_end) describing the
// two sorted inputs of a merge step.  Depending on the caller, the indices
// address either global memory or the block's shared-memory staging buffer.
struct cudaMergeRange {
  unsigned a_begin, a_end, b_begin, b_end;

  // number of elements contributed by the a-range
  __device__ unsigned a_count() const { return a_end - a_begin; }
  // number of elements contributed by the b-range
  __device__ unsigned b_count() const { return b_end - b_begin; }
  // total number of elements the merge of both ranges produces
  __device__ unsigned total() const { return a_count() + b_count(); }

  __device__ cudaRange a_range() const {
    return cudaRange { a_begin, a_end };
  }
  __device__ cudaRange b_range() const {
    return cudaRange { b_begin, b_end };
  }

  // Rebases to local (staged) coordinates: a occupies [0, a_count())
  // and b occupies [a_count(), total()).
  __device__ cudaMergeRange to_local() const {
    return cudaMergeRange { 0, a_count(), a_count(), total() };
  }

  // Partition from mp to the end.
  __device__ cudaMergeRange partition(unsigned mp0, unsigned diag) const {
    return cudaMergeRange { a_begin + mp0, a_end, b_begin + diag - mp0, b_end };
  }

  // Partition from mp0 to mp1.
  __device__ cudaMergeRange partition(unsigned mp0, unsigned diag0,
    unsigned mp1, unsigned diag1) const {
    return cudaMergeRange {
      a_begin + mp0,
      a_begin + mp1,
      b_begin + diag0 - mp0,
      b_begin + diag1 - mp1
    };
  }

  // true while the a-range still has unconsumed elements
  __device__ bool a_valid() const {
    return a_begin < a_end;
  }
  // true while the b-range still has unconsumed elements
  __device__ bool b_valid() const {
    return b_begin < b_end;
  }
};
70
// Binary-searches the cross-diagonal `diag` of the merge matrix of two sorted
// sequences and returns how many elements of `a` lie before the partition
// point (the b-contribution is then diag - result).  `bounds` selects
// lower-bound or upper-bound placement of equal keys.
template<
  cudaMergeBoundType bounds = cudaMergeBoundType::LOWER,
  typename a_keys_it, typename b_keys_it, typename comp_t
>
__device__ auto cuda_merge_path(
  a_keys_it a_keys, unsigned a_count,
  b_keys_it b_keys, unsigned b_count,
  unsigned diag, comp_t comp
) {

  // feasible window of a-contributions along this diagonal
  unsigned lo = (diag > b_count) ? (diag - b_count) : 0;
  unsigned hi = (diag < a_count) ? diag : a_count;

  while(lo < hi) {
    auto mid = (lo + hi) / 2;
    auto a_key = a_keys[mid];
    auto b_key = b_keys[diag - 1 - mid];
    // decide whether the partition point lies to the right of mid
    bool advance;
    if(cudaMergeBoundType::UPPER == bounds) {
      advance = comp(a_key, b_key);
    }
    else {
      advance = !comp(b_key, a_key);
    }
    if(advance) {
      lo = mid + 1;
    }
    else {
      hi = mid;
    }
  }
  return lo;
}
98
// Convenience overload: runs the merge-path search on two sub-ranges of a
// single key array described by `range`.
template<cudaMergeBoundType bounds, typename keys_it, typename comp_t>
__device__ auto cuda_merge_path(
  keys_it keys, cudaMergeRange range, unsigned diag, comp_t comp
) {
  // delegate to the two-iterator overload using the sub-ranges of keys
  auto a_first = keys + range.a_begin;
  auto b_first = keys + range.b_begin;
  return cuda_merge_path<bounds>(
    a_first, range.a_count(), b_first, range.b_count(), diag, comp
  );
}
110
// Decides whether the next merged element is taken from the a-range (true)
// or the b-range (false).  With range_check enabled, an exhausted range
// forces the choice to the other side regardless of key comparison.
template<cudaMergeBoundType bounds, bool range_check, typename T, typename comp_t>
__device__ bool cuda_merge_predicate(
  T a_key, T b_key, cudaMergeRange range, comp_t comp
) {
  if(range_check && !range.a_valid()) {
    return false;   // a is exhausted: must take from b
  }
  if(range_check && !range.b_valid()) {
    return true;    // b is exhausted: must take from a
  }
  // both sides valid: compare keys according to the bound type
  return (cudaMergeBoundType::UPPER == bounds) ? comp(a_key, b_key)
                                               : !comp(b_key, a_key);
}
130
// Converts a tile's pair of merge-path results (mp0, mp1) into the global
// cudaMergeRange it must merge.  `partition` is the tile index and `spacing`
// the tile size; the last diagonal is clamped to the total element count.
inline __device__ auto cuda_compute_merge_range(
  unsigned a_count, unsigned b_count,
  unsigned partition, unsigned spacing,
  unsigned mp0, unsigned mp1
) {
  // diagonals bounding this tile of the merged output
  auto d0 = spacing * partition;
  auto d1 = min(a_count + b_count, d0 + spacing);

  // split each diagonal into its a- and b-components
  return cudaMergeRange { mp0, mp1, d0 - mp0, d1 - mp1 };
}
143
// Cooperatively loads a_count elements of `a` followed by b_count elements
// of `b` into per-thread registers, using a strided (coalesced) access
// pattern across the nt threads of the block.
template<unsigned nt, unsigned vt, typename T>
__device__ auto cuda_load_two_streams_reg(
  const T* a, unsigned a_count, const T* b, unsigned b_count, unsigned tid
) {
  // Rebase b so that a single flat index in [0, a_count + b_count)
  // addresses either stream: index < a_count hits a, otherwise b.
  b -= a_count;

  cudaArray<T, vt> loaded;
  cuda_strided_iterate<nt, vt>([&](auto i, auto index) {
    loaded[i] = (index < a_count) ? a[index] : b[index];
  }, tid, a_count + b_count);

  return loaded;
}
165
// Iterator variant of cuda_load_two_streams_reg for non-pointer iterators.
// NOTE(review): the SFINAE return type was truncated in this copy of the
// file (only "cudaArray<T, vt> >" survived); reconstructed below as the
// usual "not both raw pointers" constraint — confirm against upstream.
template<unsigned nt, unsigned vt, typename T, typename a_it, typename b_it>
__device__
std::enable_if_t<
  !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value),
  cudaArray<T, vt>
> load_two_streams_reg(a_it a, unsigned a_count, b_it b, unsigned b_count, unsigned tid) {
  // rebase b so one flat index addresses either stream (see pointer variant)
  b -= a_count;
  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt>([&](auto i, auto index) {
    x[i] = (index < a_count) ? a[index] : b[index];
  }, tid, a_count + b_count);
  return x;
}
180
// Loads two concatenated streams into the shared-memory array `shared`.
// When sync is true, the helper ends with a block-wide barrier so the data
// is visible to all threads afterwards.
template<unsigned nt, unsigned vt, typename A, typename B, typename T, unsigned S>
__device__ void cuda_load_two_streams_shared(A a, unsigned a_count,
  B b, unsigned b_count, unsigned tid, T (&shared)[S], bool sync = true
) {
  // Stage both streams into registers, then perform an unconditional
  // strided store into shared memory (barrier handled by the callee).
  auto staged = cuda_load_two_streams_reg<nt, vt, T>(a, a_count, b, b_count, tid);
  cuda_reg_to_shared_strided<nt>(staged, tid, shared, sync);
}
190
// Gathers elements from two streams through a per-thread index array:
// indices[i] < a_count selects a[indices[i]], larger indices select the
// corresponding element of b (offset by a_count).  Access is strided
// across the block's nt threads.
template<unsigned nt, unsigned vt, typename T>
__device__ auto cuda_gather_two_streams_strided(const T* a,
  unsigned a_count, const T* b, unsigned b_count, cudaArray<unsigned, vt> indices,
  unsigned tid) {

  // distance to add to an index >= a_count so that a[pos] lands inside b
  ptrdiff_t b_offset = b - a - a_count;
  auto total = a_count + b_count;

  cudaArray<T, vt> gathered;
  cuda_strided_iterate<nt, vt>([&](auto i, auto j) {
    ptrdiff_t pos = indices[i];
    if(pos >= a_count) {
      pos += b_offset;
    }
    gathered[i] = a[pos];
  }, tid, total);

  return gathered;
}
209
// Iterator variant of the two-stream gather for non-pointer iterators.
// NOTE(review): the SFINAE return type was truncated in this copy of the
// file; reconstructed as the "not both raw pointers" constraint used by
// the pointer/iterator overload pair above — confirm against upstream.
template<unsigned nt, unsigned vt, typename T, typename a_it, typename b_it>
__device__
std::enable_if_t<
  !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value),
  cudaArray<T, vt>
> cuda_gather_two_streams_strided(a_it a,
  unsigned a_count, b_it b, unsigned b_count, cudaArray<unsigned, vt> indices, unsigned tid) {

  // rebase b so a single index selects from either stream
  b -= a_count;
  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt>([&](auto i, auto j) {
    x[i] = (indices[i] < a_count) ? a[indices[i]] : b[indices[i]];
  }, tid, a_count + b_count);

  return x;
}
227
// Gathers values from two streams through `indices` and writes the merged
// result to the output iterator c in strided order.
// NOTE(review): the alias declaration for T was dropped from this copy of
// the file (T was used undeclared); restored from a_it's value type —
// confirm against upstream.
template<unsigned nt, unsigned vt, typename a_it, typename b_it, typename c_it>
__device__ void cuda_transfer_two_streams_strided(
  a_it a, unsigned a_count, b_it b, unsigned b_count,
  cudaArray<unsigned, vt> indices, unsigned tid, c_it c
) {

  using T = typename std::iterator_traits<a_it>::value_type;

  auto x = cuda_gather_two_streams_strided<nt, vt, T>(
    a, a_count, b, b_count, indices, tid
  );

  cuda_reg_to_mem_strided<nt>(x, tid, a_count + b_count, c);
}
242
243
// Sequentially merges vt keys from the two sorted shared-memory sub-ranges
// described by `range`, returning the merged keys plus the shared-memory
// index each key came from.  Ends with a block barrier unless sync is false.
template<cudaMergeBoundType bounds, unsigned vt, typename T, typename comp_t>
__device__ auto cuda_serial_merge(
  const T* keys_shared, cudaMergeRange range, comp_t comp, bool sync = true
) {

  // current head key of each sub-range
  auto a_key = keys_shared[range.a_begin];
  auto b_key = keys_shared[range.b_begin];

  cudaMergePair<T, vt> merge_pair;
  cuda_iterate<vt>([&](auto i) {
    // true: take from a; false: take from b (range_check handles exhaustion)
    bool p = cuda_merge_predicate<bounds, true>(a_key, b_key, range, comp);
    auto index = p ? range.a_begin : range.b_begin;

    merge_pair.keys[i] = p ? a_key : b_key;
    merge_pair.indices[i] = index;

    // Prefetch the successor key.  This may read one slot past the valid
    // data, which is why callers must provide nt * vt + 1 shared slots
    // (see the static_assert in block_merge_from_mem).
    T c_key = keys_shared[++index];
    // advance the chosen side's head key and begin index
    if(p) a_key = c_key, range.a_begin = index;
    else b_key = c_key, range.b_begin = index;
  });

  if(sync) __syncthreads();
  return merge_pair;
}
275
// Merges one tile of keys: stages the two global-memory sub-ranges in
// shared memory, merge-path partitions the tile across the block's nt
// threads, and serially merges vt keys per thread into registers.
// keys_shared must hold at least nt * vt + 1 elements (the extra slot
// absorbs cuda_serial_merge's one-past-the-end prefetch).
// Fix: removed a stray ';' that followed the function's closing brace.
template<cudaMergeBoundType bounds,
  unsigned nt, unsigned vt,
  typename a_it, typename b_it, typename T, typename comp_t, unsigned S
>
__device__ auto block_merge_from_mem(
  a_it a, b_it b, cudaMergeRange range_mem, unsigned tid, comp_t comp, T (&keys_shared)[S]
) {

  static_assert(S >= nt * vt + 1,
    "block_merge_from_mem requires temporary storage of at "
    "least nt * vt + 1 items");

  // Load the data into shared memory.
  cuda_load_two_streams_shared<nt, vt>(
    a + range_mem.a_begin, range_mem.a_count(),
    b + range_mem.b_begin, range_mem.b_count(),
    tid, keys_shared, true
  );

  // Run a merge path to find the start of the serial merge for each thread.
  auto range_local = range_mem.to_local();
  auto diag = vt * tid;
  auto mp = cuda_merge_path<bounds>(keys_shared, range_local, diag, comp);

  // Compute the ranges of the sources in shared memory. The end iterators
  // of the range are inaccurate, but still facilitate exact merging, because
  // only vt elements will be merged.
  auto merged = cuda_serial_merge<bounds, vt>(
    keys_shared, range_local.partition(mp, diag), comp
  );

  return merged;
}
314
// Launches a kernel on p.stream() that computes the merge-path partition
// point for every spacing-sized tile boundary of the merged output and
// stores them into buf.  buf must hold at least
// (a_count + b_count + spacing - 1) / spacing + 1 unsigned entries (see
// cuda_merge_buffer_size).  The launch is asynchronous; the caller is
// responsible for ordering subsequent reads of buf on the same stream.
template<cudaMergeBoundType bounds,
  typename P, typename a_keys_it, typename b_keys_it, typename comp_t
>
void cuda_merge_path_partitions(
  P&& p,
  a_keys_it a, unsigned a_count,
  b_keys_it b, unsigned b_count,
  unsigned spacing,
  comp_t comp,
  unsigned* buf
) {

  //int num_partitions = (int)div_up(a_count + b_count, spacing) + 1;

  // one partition point per tile boundary, plus the trailing boundary
  unsigned num_partitions = (a_count + b_count + spacing - 1) / spacing + 1;

  const unsigned nt = 128;
  const unsigned vt = 1;
  const unsigned nv = nt * vt;

  // number of blocks needed to produce all partition points
  unsigned B = (num_partitions + nv - 1) / nv; // nt = 128, vt = 1

  cuda_kernel<<<B, nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) {
    auto range = cuda_get_tile(bid, nt * vt, num_partitions);
    cuda_strided_iterate<nt, vt>([=](auto, auto j) {
      auto index = range.begin + j;
      // clamp the final diagonal to the total element count
      auto diag = min(spacing * index, a_count + b_count);
      buf[index] = cuda_merge_path<bounds>(a, a_count, b, b_count, diag, comp);
    }, tid, range.count());
  });
}
347
348//template<typename segments_it>
349//auto load_balance_partitions(int64_t dest_count, segments_it segments,
350// int num_segments, int spacing, context_t& context) ->
351// mem_t<typename std::iterator_traits<segments_it>::value_type> {
352//
353// typedef typename std::iterator_traits<segments_it>::value_type int_t;
354// return merge_path_partitions<bounds_upper>(counting_iterator_t<int_t>(0),
355// dest_count, segments, num_segments, spacing, less_t<int_t>(), context);
356//}
357
358//template<bounds_t bounds, typename keys_it>
359//mem_t<int> binary_search_partitions(keys_it keys, int count, int num_items,
360// int spacing, context_t& context) {
361//
362// int num_partitions = div_up(count, spacing) + 1;
363// mem_t<int> mem(num_partitions, context);
364// int* p = mem.data();
365// transform([=]MGPU_DEVICE(int index) {
366// int key = min(spacing * index, count);
367// p[index] = binary_search<bounds>(keys, num_items, key, less_t<int>());
368// }, num_partitions, context);
369// return mem;
370//}
371
// Core of the standalone merge: partitions the output into E::nv-sized
// tiles (one block per tile), merges each tile's keys, and — for key-value
// merges — gathers the matching values.  All kernels run on p.stream();
// ptr must point to cuda_merge_buffer_size bytes of device memory and stay
// alive until the stream is synchronized.
template<
  typename P,
  typename a_keys_it, typename a_vals_it,
  typename b_keys_it, typename b_vals_it,
  typename c_keys_it, typename c_vals_it,
  typename comp_t
>
void cuda_merge_loop(
  P&& p,
  a_keys_it a_keys, a_vals_it a_vals, unsigned a_count,
  b_keys_it b_keys, b_vals_it b_vals, unsigned b_count,
  c_keys_it c_keys, c_vals_it c_vals,
  comp_t comp,
  void* ptr
) {

  using E = std::decay_t<P>;
  // NOTE(review): the following two aliases were dropped from this copy of
  // the file (T and V were used undeclared below); reconstructed from the
  // key/value iterator types — confirm against upstream.
  using T = typename std::iterator_traits<a_keys_it>::value_type;
  using V = typename std::iterator_traits<a_vals_it>::value_type;

  auto buf = static_cast<unsigned*>(ptr);

  // key-only merges pass cudaEmpty value iterators
  auto has_values = !std::is_same<V, cudaEmpty>::value;

  // one merge-path partition point per tile boundary
  cuda_merge_path_partitions<cudaMergeBoundType::LOWER>(
    p, a_keys, a_count, b_keys, b_count, E::nv, comp, buf
  );

  unsigned B = (a_count + b_count + E::nv - 1)/ E::nv;

  // we use small kernel
  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {

    // keys and indices are never live simultaneously, so they share
    // storage; keys has one extra slot for cuda_serial_merge's prefetch
    __shared__ union {
      T keys[E::nv + 1];
      unsigned indices[E::nv];
    } shared;

    // Load the range for this CTA and merge the values into register.
    auto mp0 = buf[bid + 0];
    auto mp1 = buf[bid + 1];
    auto range = cuda_compute_merge_range(a_count, b_count, bid, E::nv, mp0, mp1);

    auto merge = block_merge_from_mem<cudaMergeBoundType::LOWER, E::nt, E::vt>(
      a_keys, b_keys, range, tid, comp, shared.keys
    );

    auto dest_offset = E::nv * bid;
    cuda_reg_to_mem_thread<E::nt>(
      merge.keys, tid, range.total(), c_keys + dest_offset, shared.keys
    );

    if(has_values) {
      // Transpose the indices from thread order to strided order.
      auto indices = cuda_reg_thread_to_strided<E::nt>(
        merge.indices, tid, shared.indices
      );

      // Gather the input values and merge into the output values.
      cuda_transfer_two_streams_strided<E::nt>(
        a_vals + range.a_begin, range.a_count(),
        b_vals + range.b_begin, range.b_count(), indices, tid,
        c_vals + dest_offset
      );
    }
  });
}
440
441} // end of namespace tf::detail ---------------------------------------------
442
443namespace tf {
444
445// ----------------------------------------------------------------------------
446// standalone merge algorithms
447// ----------------------------------------------------------------------------
448
// Returns the number of bytes of temporary device memory the merge kernels
// need for an (a_count, b_count) input under execution policy P: one
// unsigned partition point per E::nv-sized tile boundary, plus the
// trailing boundary.
template <typename P>
unsigned cuda_merge_buffer_size(unsigned a_count, unsigned b_count) {
  using E = std::decay_t<P>;
  const unsigned total = a_count + b_count;
  const unsigned num_partitions = (total + E::nv - 1) / E::nv + 1;
  return num_partitions * sizeof(unsigned);
}
465
466// ----------------------------------------------------------------------------
467// key-value merge
468// ----------------------------------------------------------------------------
469
470//template<
471// typename P,
472// typename a_keys_it, typename a_vals_it,
473// typename b_keys_it, typename b_vals_it,
474// typename c_keys_it, typename c_vals_it,
475// typename C
476//>
477//void cuda_merge(
478// P&& p,
479// a_keys_it a_keys_first, a_vals_it a_vals_first, a_keys_it a_keys_last,
480// b_keys_it b_keys_first, b_vals_it b_vals_first, b_keys_it b_keys_last,
481// c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
482//) {
483//
484// unsigned a_count = std::distance(a_keys_first, a_keys_last);
485// unsigned b_count = std::distance(b_keys_first, b_keys_last);
486//
487// if(a_count + b_count == 0) {
488// return;
489// }
490//
491// // allocate temporary buffer
492// cudaDeviceVector<std::byte> temp(cuda_merge_buffer_size<P>(a_count, b_count));
493//
494// detail::cuda_merge_loop(
495// p,
496// a_keys_first, a_vals_first, a_count,
497// b_keys_first, b_vals_first, b_count,
498// c_keys_first, c_vals_first, comp,
499// temp.data()
500// );
501//
502// // synchronize the execution
503// p.synchronize();
504//}
505
// Performs an asynchronous key-value merge of two sorted key ranges (with
// their associated value ranges) into c_keys/c_vals on the policy's stream.
// buf must point to at least cuda_merge_buffer_size<P>(a_count, b_count)
// bytes of device memory and remain alive until the stream is synchronized.
// NOTE(review): the declaration line was missing from this copy of the
// file; name and signature reconstructed from the file's own reference
// index — confirm against upstream.
template<
  typename P,
  typename a_keys_it, typename a_vals_it,
  typename b_keys_it, typename b_vals_it,
  typename c_keys_it, typename c_vals_it,
  typename C
>
void cuda_merge_by_key(
  P&& p,
  a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
  b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
  c_keys_it c_keys_first, c_vals_it c_vals_first, C comp,
  void* buf
) {

  unsigned a_count = std::distance(a_keys_first, a_keys_last);
  unsigned b_count = std::distance(b_keys_first, b_keys_last);

  // nothing to merge
  if(a_count + b_count == 0) {
    return;
  }

  detail::cuda_merge_loop(p,
    a_keys_first, a_vals_first, a_count,
    b_keys_first, b_vals_first, b_count,
    c_keys_first, c_vals_first, comp,
    buf
  );
}
582
583// ----------------------------------------------------------------------------
584// key-only merge
585// ----------------------------------------------------------------------------
586
587//template<typename P,
588// typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C
589//>
590//void cuda_merge(
591// P&& p,
592// a_keys_it a_keys_first, a_keys_it a_keys_last,
593// b_keys_it b_keys_first, b_keys_it b_keys_last,
594// c_keys_it c_keys_first,
595// C comp
596//) {
597// cuda_merge(
598// p,
599// a_keys_first, (const cudaEmpty*)nullptr, a_keys_last,
600// b_keys_first, (const cudaEmpty*)nullptr, b_keys_last,
601// c_keys_first, (cudaEmpty*)nullptr, comp
602// );
603//}
604
// Performs an asynchronous key-only merge by delegating to the key-value
// version with cudaEmpty value iterators (the value path is compiled out
// via the has_values check in cuda_merge_loop).
// NOTE(review): the declaration line and the callee's name line were
// missing from this copy of the file; reconstructed from the file's own
// reference index — confirm against upstream.
template<typename P,
  typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C
>
void cuda_merge(
  P&& p,
  a_keys_it a_keys_first, a_keys_it a_keys_last,
  b_keys_it b_keys_first, b_keys_it b_keys_last,
  c_keys_it c_keys_first,
  C comp,
  void* buf
) {
  cuda_merge_by_key(
    p,
    a_keys_first, a_keys_last, (const cudaEmpty*)nullptr,
    b_keys_first, b_keys_last, (const cudaEmpty*)nullptr,
    c_keys_first, (cudaEmpty*)nullptr, comp,
    buf
  );
}
645
646// ----------------------------------------------------------------------------
647// cudaFlow merge algorithms
648// ----------------------------------------------------------------------------
649
// Function: merge
// Creates a cudaFlow task that merges two sorted ranges via an internal
// capturer, using the linear-capturing optimizer.
// NOTE(review): the declaration line and the make_optimizer line were
// missing from this copy of the file; reconstructed from the file's own
// reference index — confirm against upstream.
template<typename A, typename B, typename C, typename Comp>
cudaTask cudaFlow::merge(
  A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
) {
  return capture([=](cudaFlowCapturer& cap){
    // captured kernels form a linear chain: use the linear optimizer
    cap.make_optimizer<cudaLinearCapturing>();
    cap.merge(a_first, a_last, b_first, b_last, c_first, comp);
  });
}
660
// Function: merge
// Rebuilds an existing merge task with new iterators/comparator.
// NOTE(review): the declaration line and the make_optimizer line were
// missing from this copy of the file; reconstructed from the file's own
// reference index — confirm against upstream.
template<typename A, typename B, typename C, typename Comp>
void cudaFlow::merge(
  cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
) {
  capture(task, [=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.merge(a_first, a_last, b_first, b_last, c_first, comp);
  });
}
671
// Function: merge_by_key
// Creates a cudaFlow task that performs a key-value merge via an internal
// capturer, using the linear-capturing optimizer.
// NOTE(review): the declaration line and the make_optimizer line were
// missing from this copy of the file; reconstructed from the file's own
// reference index — confirm against upstream.
template<
  typename a_keys_it, typename a_vals_it,
  typename b_keys_it, typename b_vals_it,
  typename c_keys_it, typename c_vals_it,
  typename C
>
cudaTask cudaFlow::merge_by_key(
  a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
  b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
  c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
) {
  return capture([=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.merge_by_key(
      a_keys_first, a_keys_last, a_vals_first,
      b_keys_first, b_keys_last, b_vals_first,
      c_keys_first, c_vals_first,
      comp
    );
  });
}
694
// Function: merge_by_key
// Rebuilds an existing merge_by_key task with new iterators/comparator.
// NOTE(review): the declaration line and the make_optimizer line were
// missing from this copy of the file; reconstructed from the file's own
// reference index — confirm against upstream.
template<
  typename a_keys_it, typename a_vals_it,
  typename b_keys_it, typename b_vals_it,
  typename c_keys_it, typename c_vals_it,
  typename C
>
void cudaFlow::merge_by_key(
  cudaTask task,
  a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
  b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
  c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
) {
  capture(task, [=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.merge_by_key(
      a_keys_first, a_keys_last, a_vals_first,
      b_keys_first, b_keys_last, b_vals_first,
      c_keys_first, c_vals_first,
      comp
    );
  });
}
718
719// ----------------------------------------------------------------------------
720// cudaFlowCapturer merge algorithms
721// ----------------------------------------------------------------------------
722
// Function: merge
// Captures kernels that merge two sorted ranges into c_first.  The
// temporary partition buffer is sized up front and moved into the captured
// callable (via MoC) so it lives as long as the graph node.
// NOTE(review): the declaration line and the inner call's opening line were
// missing from this copy of the file; reconstructed from the file's own
// reference index — confirm against upstream.
template<typename A, typename B, typename C, typename Comp>
cudaTask cudaFlowCapturer::merge(
  A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
) {
  // TODO
  auto bufsz = cuda_merge_buffer_size<cudaDefaultExecutionPolicy>(
    std::distance(a_first, a_last), std::distance(b_first, b_last)
  );

  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cuda_merge(cudaDefaultExecutionPolicy{stream},
      a_first, a_last, b_first, b_last, c_first, comp, buf.get().data()
    );
  });
}
740
// Procedure: merge (update)
// Rebuilds an existing captured merge task, re-sizing and re-owning the
// temporary buffer for the new input ranges.
// NOTE(review): the declaration line and the inner call's opening line were
// missing from this copy of the file; reconstructed from the file's own
// reference index — confirm against upstream.
template<typename A, typename B, typename C, typename Comp>
void cudaFlowCapturer::merge(
  cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
) {
  // TODO
  auto bufsz = cuda_merge_buffer_size<cudaDefaultExecutionPolicy>(
    std::distance(a_first, a_last), std::distance(b_first, b_last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cuda_merge(cudaDefaultExecutionPolicy{stream},
      a_first, a_last, b_first, b_last, c_first, comp, buf.get().data()
    );
  });
}
758
// Function: merge_by_key
// Captures kernels that perform a key-value merge; the temporary partition
// buffer is sized from the key ranges and owned by the captured callable.
// NOTE(review): the declaration line and the inner call's opening line were
// missing from this copy of the file; reconstructed from the file's own
// reference index — confirm against upstream.
template<
  typename a_keys_it, typename a_vals_it,
  typename b_keys_it, typename b_vals_it,
  typename c_keys_it, typename c_vals_it,
  typename C
>
cudaTask cudaFlowCapturer::merge_by_key(
  a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
  b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
  c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
) {

  auto bufsz = cuda_merge_buffer_size<cudaDefaultExecutionPolicy>(
    std::distance(a_keys_first, a_keys_last),
    std::distance(b_keys_first, b_keys_last)
  );

  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cuda_merge_by_key(cudaDefaultExecutionPolicy{stream},
      a_keys_first, a_keys_last, a_vals_first,
      b_keys_first, b_keys_last, b_vals_first,
      c_keys_first, c_vals_first,
      comp,
      buf.get().data()
    );
  });
}
788
// Function: merge_by_key (update)
// Rebuilds an existing captured merge_by_key task, re-sizing and re-owning
// the temporary buffer for the new input ranges.
// NOTE(review): the declaration line and the inner call's opening line were
// missing from this copy of the file; reconstructed from the file's own
// reference index — confirm against upstream.
template<
  typename a_keys_it, typename a_vals_it,
  typename b_keys_it, typename b_vals_it,
  typename c_keys_it, typename c_vals_it,
  typename C
>
void cudaFlowCapturer::merge_by_key(
  cudaTask task,
  a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
  b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
  c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
) {

  auto bufsz = cuda_merge_buffer_size<cudaDefaultExecutionPolicy>(
    std::distance(a_keys_first, a_keys_last),
    std::distance(b_keys_first, b_keys_last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cuda_merge_by_key(cudaDefaultExecutionPolicy{stream},
      a_keys_first, a_keys_last, a_vals_first,
      b_keys_first, b_keys_last, b_vals_first,
      c_keys_first, c_vals_first,
      comp,
      buf.get().data()
    );
  });
}
819
820
821
822} // end of namespace tf -----------------------------------------------------
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
captures kernels that perform parallel key-value merge
Definition merge.hpp:766
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
captures kernels that perform parallel merge on two sorted arrays
Definition merge.hpp:725
cudaTask on(C &&callable)
captures a sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
creates a task to perform parallel key-value merge
Definition merge.hpp:679
cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
creates a task to perform parallel merge on two sorted arrays
Definition merge.hpp:652
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
T count(T... args)
T distance(T... args)
T end(T... args)
T forward(T... args)
T merge(T... args)
T min(T... args)
taskflow namespace
Definition small_vector.hpp:27
void cuda_merge(P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void *buf)
performs asynchronous key-only merge over a range of keys
Definition merge.hpp:629
unsigned cuda_merge_buffer_size(unsigned a_count, unsigned b_count)
queries the buffer size in bytes needed to call merge kernels
Definition merge.hpp:460
void cuda_merge_by_key(P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void *buf)
performs asynchronous key-value merge over a range of keys and values
Definition merge.hpp:560
T partition(T... args)