Taskflow 3.2.0 (master branch)
sort.hpp
#pragma once

#include "merge.hpp"

namespace tf::detail {

// ----------------------------------------------------------------------------
// odd-even sort in register
// ----------------------------------------------------------------------------

// counts the number of leading zeros of a 32-bit integer (32 for x == 0)
constexpr int cuda_clz(int x) {
  for(int i = 31; i >= 0; --i) {
    if((1 << i) & x) {
      return 31 - i;
    }
  }
  return 32;
}

// finds floor(log2(x)), or ceil(log2(x)) when round_up is true
constexpr int cuda_find_log2(int x, bool round_up = false) {
  int a = 31 - cuda_clz(x);
  if(round_up) {
    a += !is_pow2(x);
  }
  return a;
}
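
// [editor's note] Illustrative compile-time checks, not part of the original
// header: both helpers above are constexpr (and is_pow2 from the included
// headers is constexpr, since it is already used in a static_assert below),
// so their behavior can be pinned down at compile time.
static_assert(cuda_clz(1) == 31, "bit 0 set leaves 31 leading zeros");
static_assert(cuda_clz(8) == 28, "bit 3 set leaves 28 leading zeros");
static_assert(cuda_find_log2(8) == 3, "8 is a power of two: floor == ceil == 3");
static_assert(cuda_find_log2(9, true) == 4, "rounding up bumps non-powers of two");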

// sorts the vt elements held in one thread's registers with an odd-even
// transposition network; a set bit in flags suppresses the compare-exchange
// at that position, which is used below to isolate out-of-range elements
template<typename T, unsigned vt, typename C>
__device__ auto cuda_odd_even_sort(
  cudaArray<T, vt> x, C comp, int flags = 0
) {
  cuda_iterate<vt>([&](auto I) {
    #pragma unroll
    for(auto i = 1 & I; i < vt - 1; i += 2) {
      if((0 == ((2 << i) & flags)) && comp(x[i + 1], x[i]))
        cuda_swap(x[i], x[i + 1]);
    }
  });
  return x;
}

// key-value variant of the in-register odd-even sort
template<typename K, typename V, unsigned vt, typename C>
__device__ auto cuda_odd_even_sort(
  cudaKVArray<K, V, vt> x, C comp, int flags = 0
) {
  cuda_iterate<vt>([&](auto I) {
    #pragma unroll
    for(auto i = 1 & I; i < vt - 1; i += 2) {
      if((0 == ((2 << i) & flags)) && comp(x.keys[i + 1], x.keys[i])) {
        cuda_swap(x.keys[i], x.keys[i + 1]);
        cuda_swap(x.vals[i], x.vals[i + 1]);
      }
    }
  });
  return x;
}
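
// [editor's note] A minimal host-side sketch of the same odd-even
// transposition idea, added for illustration only and not part of the
// original file (the function name is ours, and vt is assumed >= 1): pass p
// compares pairs starting at index p & 1, and n passes suffice to sort n
// elements.
template <typename T, typename C>
void odd_even_sort_host_example(T* a, int n, C comp) {
  for(int pass = 0; pass < n; ++pass) {
    // even passes compare (0,1), (2,3), ...; odd passes compare (1,2), (3,4), ...
    for(int i = pass & 1; i + 1 < n; i += 2) {
      if(comp(a[i + 1], a[i])) {
        T tmp = a[i];
        a[i] = a[i + 1];
        a[i + 1] = tmp;
      }
    }
  }
}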

// ----------------------------------------------------------------------------
// range check
// ----------------------------------------------------------------------------

// builds a bit mask whose set bits mark the slots of a thread's vt elements
// that fall outside [0, count), so the segmented odd-even sort above leaves
// those slots alone
__device__ inline int cuda_out_of_range_flags(int first, int vt, int count) {
  int out_of_range = min(vt, first + vt - count);
  int head_flags = 0;
  if(out_of_range > 0) {
    const int mask = (1 << vt) - 1;
    head_flags = mask & (~mask >> out_of_range);
  }
  return head_flags;
}
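
// [editor's note] Illustrative host-side analogue, not part of the original
// header (the helper name is ours; it mirrors the arithmetic above with min
// spelled out). Concretely: a thread owning items 6..9 of an 8-item tile with
// vt = 4 has its last two slots out of range, so bits 2 and 3 come back set.
constexpr int cuda_out_of_range_flags_host_example(int first, int vt, int count) {
  int out_of_range = (vt < first + vt - count) ? vt : (first + vt - count);
  int head_flags = 0;
  if(out_of_range > 0) {
    const int mask = (1 << vt) - 1;
    head_flags = mask & (~mask >> out_of_range);
  }
  return head_flags;
}
static_assert(cuda_out_of_range_flags_host_example(6, 4, 8) == 0b1100, "slots 2 and 3 are out of range");
static_assert(cuda_out_of_range_flags_host_example(0, 4, 8) == 0, "fully in-range threads get no flags");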

// computes the pair of equal-sized sublists (the "frame") that a partition of
// a cooperative merge works on; partition is rounded down to a multiple of
// coop, and the result is not yet clamped against the input size
__device__ inline auto cuda_compute_merge_sort_frame(
  unsigned partition, unsigned coop, unsigned spacing
) {

  unsigned size = spacing * (coop / 2);
  unsigned start = ~(coop - 1) & partition;
  unsigned a_begin = spacing * start;
  unsigned b_begin = spacing * start + size;

  return cudaMergeRange {
    a_begin,
    a_begin + size,
    b_begin,
    b_begin + size
  };
}
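
// [editor's note] Host-side analogue for illustration only (struct and
// function names are ours): with spacing = 256 and coop = 2, partition 5 is
// rounded down to tile 4, so it merges the runs [1024, 1280) and [1280, 1536).
struct cudaMergeRangeExample { unsigned a_begin, a_end, b_begin, b_end; };
constexpr cudaMergeRangeExample cuda_merge_sort_frame_host_example(
  unsigned partition, unsigned coop, unsigned spacing
) {
  unsigned size = spacing * (coop / 2);
  unsigned start = ~(coop - 1) & partition;
  unsigned a_begin = spacing * start;
  return { a_begin, a_begin + size, a_begin + size, a_begin + 2 * size };
}
static_assert(cuda_merge_sort_frame_host_example(5, 2, 256).a_begin == 1024, "A run starts at tile 4");
static_assert(cuda_merge_sort_frame_host_example(5, 2, 256).b_end == 1536, "B run ends after tile 5");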

// computes the merge range of a partition, clamped to the input size
__device__ inline auto cuda_compute_merge_sort_range(
  unsigned count, unsigned partition, unsigned coop, unsigned spacing
) {

  auto frame = cuda_compute_merge_sort_frame(partition, coop, spacing);

  return cudaMergeRange {
    frame.a_begin,
    min(count, frame.a_end),
    min(count, frame.b_begin),
    min(count, frame.b_end)
  };
}

// computes the merge range of a partition, further narrowed by the merge-path
// split points (mp0, mp1) found for this CTA and its right neighbor
__device__ inline auto cuda_compute_merge_sort_range(
  unsigned count, unsigned partition, unsigned coop, unsigned spacing,
  unsigned mp0, unsigned mp1
) {

  auto range = cuda_compute_merge_sort_range(count, partition, coop, spacing);

  // Locate the diagonal from the start of the A sublist.
  unsigned diag = spacing * partition - range.a_begin;

  // The end partition of the last CTA in each merge operation is computed
  // and stored as the begin partition of the subsequent merge, i.e., it is
  // the same partition but in the wrong coordinate system, so it is 0 when it
  // should be the list size. Correct that by checking whether this is the
  // last CTA in this merge operation.
  if(coop - 1 != ((coop - 1) & partition)) {
    range.a_end = range.a_begin + mp1;
    range.b_end = min(count, range.b_begin + diag + spacing - mp1);
  }

  range.a_begin = range.a_begin + mp0;
  range.b_begin = min(count, range.b_begin + diag - mp0);

  return range;
}

// cudaBlockSort sorts the nt*vt elements owned by one CTA: each thread first
// sorts its vt registers, then log2(nt) merge passes through shared memory
// combine the per-thread runs
template<unsigned nt, unsigned vt, typename K, typename V>
struct cudaBlockSort {

  static constexpr bool has_values = !std::is_same<V, cudaEmpty>::value;
  static constexpr unsigned num_passes = log2(nt);

  // shared-memory staging area, reused for keys and values
  union Storage {
    K keys[nt * vt + 1];
    V vals[nt * vt];
  };

  static_assert(is_pow2(nt), "cudaBlockSort requires pow2 number of threads");

  template<typename C>
  __device__ auto merge_pass(
    cudaKVArray<K, V, vt> x,
    unsigned tid, unsigned count, unsigned pass,
    C comp, Storage& storage
  ) const {

    // Divide the CTA's keys into lists.
    unsigned coop = 2 << pass;
    auto range = cuda_compute_merge_sort_range(count, tid, coop, vt);
    unsigned diag = vt * tid - range.a_begin;

    // Store the keys into shared memory for searching.
    cuda_reg_to_shared_thread<nt, vt>(x.keys, tid, storage.keys);

    // Search for the merge path for this thread within its list.
    auto mp = cuda_merge_path<cudaMergeBoundType::LOWER>(
      storage.keys, range, diag, comp
    );

    // Run a serial merge and return.
    auto merge = cuda_serial_merge<cudaMergeBoundType::LOWER, vt>(
      storage.keys, range.partition(mp, diag), comp
    );
    x.keys = merge.keys;

    if(has_values) {
      // Reorder values through shared memory.
      cuda_reg_to_shared_thread<nt, vt>(x.vals, tid, storage.vals);
      x.vals = cuda_shared_gather<nt, vt>(storage.vals, merge.indices);
    }

    return x;
  }

  template<typename C>
  __device__ auto block_sort(cudaKVArray<K, V, vt> x,
    unsigned tid, unsigned count, C comp, Storage& storage
  ) const {

    // Sort the inputs within each thread. If any threads have fewer than
    // vt items, use the segmented sort network to prevent out-of-range
    // elements from contaminating the sort.
    if(count < nt * vt) {
      auto head_flags = cuda_out_of_range_flags(vt * tid, vt, count);
      x = cuda_odd_even_sort(x, comp, head_flags);
    } else {
      x = cuda_odd_even_sort(x, comp);
    }

    // Merge threads starting with a pair until all values are merged.
    for(unsigned pass = 0; pass < num_passes; ++pass) {
      x = merge_pass(x, tid, count, pass, comp, storage);
    }

    return x;
  }
};
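
// [editor's note] The same two-phase pattern, sketched on the host for
// illustration only (the function name is ours, vt is assumed >= 1, and
// <algorithm> is assumed to be available through the library's other
// includes): sort fixed-size runs first, then repeatedly merge neighbouring
// runs while doubling the run length, which takes log2(#runs) passes for a
// power-of-two run count.
inline void block_sort_host_example(int* data, int n, int vt) {
  // phase 1: sort each vt-sized run (the in-register odd-even sort above)
  for(int i = 0; i < n; i += vt) {
    std::sort(data + i, data + std::min(n, i + vt));
  }
  // phase 2: bottom-up merge passes (merge_pass above), doubling the run size
  for(int run = vt; run < n; run *= 2) {
    for(int i = 0; i + run < n; i += 2 * run) {
      std::inplace_merge(data + i, data + i + run, data + std::min(n, i + 2 * run));
    }
  }
}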

// computes the merge-path split points (one per tile boundary) for the next
// merge pass and stores them in buf
template<typename P, typename K, typename C>
void cuda_merge_sort_partitions(
  P&& p, K keys, unsigned count,
  unsigned coop, unsigned spacing, C comp, unsigned* buf
) {

  // buffer size is the number of tiles plus one: ceil(count / spacing) + 1
  unsigned num_partitions = (count + spacing - 1) / spacing + 1;

  const unsigned nt = 128;
  const unsigned vt = 1;
  const unsigned nv = nt * vt;

  unsigned B = (num_partitions + nv - 1) / nv;  // nt = 128, vt = 1

  cuda_kernel<<<B, nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
    auto tile = cuda_get_tile(bid, nt * vt, num_partitions);
    cuda_strided_iterate<nt, vt>([=](auto, auto j) {
      auto index = j + tile.begin;
      auto range = cuda_compute_merge_sort_range(count, index, coop, spacing);
      auto diag = min(spacing * index, count) - range.a_begin;
      buf[index] = cuda_merge_path<cudaMergeBoundType::LOWER>(
        keys + range.a_begin, range.a_count(),
        keys + range.b_begin, range.b_count(),
        diag, comp
      );
    }, tid, tile.count());
  });
}
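
// [editor's note] Illustrative only, not part of the original header: with
// count = 1000 keys and spacing = 256 (one tile per CTA), the partitioning
// kernel writes ceil(1000 / 256) + 1 = 5 split points, one per tile boundary
// including both ends.
static_assert((1000u + 256u - 1u) / 256u + 1u == 5u, "one split point per tile boundary");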

// merge_sort_loop: block-sorts nv-sized tiles, then performs
// ceil(log2(#tiles)) global merge passes, ping-ponging between the input
// range and the user-provided buffer
template<typename P, typename K_it, typename V_it, typename C>
void merge_sort_loop(
  P&& p, K_it keys_input, V_it vals_input, unsigned count, C comp, void* buf
) {

  using K = typename std::iterator_traits<K_it>::value_type;
  using V = typename std::iterator_traits<V_it>::value_type;
  using E = std::decay_t<P>;

  const bool has_values = !std::is_same<V, cudaEmpty>::value;

  unsigned B = (count + E::nv - 1) / E::nv;
  unsigned R = cuda_find_log2(B, true);

  K* keys_output {nullptr};
  V* vals_output {nullptr};
  unsigned *mp_data {nullptr};

  if(R) {
    keys_output = (K*)(buf);
    if(has_values) {
      vals_output = (V*)(keys_output + count);
      mp_data = (unsigned*)(vals_output + count);
    }
    else {
      mp_data = (unsigned*)(keys_output + count);
    }
  }

  //cudaDeviceVector<K> keys_temp(R ? count : 0);
  //auto keys_output = keys_temp.data();

  //cudaDeviceVector<V> vals_temp((has_values && R) ? count : 0);
  //auto vals_output = vals_temp.data();
  //std::cout << "vals_output = " << vals_temp.size()*sizeof(V) << std::endl;

  // When R is odd, block-sort into the buffer so that the final merge pass
  // writes back into the caller's range.
  auto keys_blocksort = (1 & R) ? keys_output : keys_input;
  auto vals_blocksort = (1 & R) ? vals_output : vals_input;

  //printf("B=%u, R=%u\n", B, R);

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {

    using sort_t = cudaBlockSort<E::nt, E::vt, K, V>;

    __shared__ union {
      typename sort_t::Storage sort;
      K keys[E::nv];
      V vals[E::nv];
    } shared;

    auto tile = cuda_get_tile(bid, E::nv, count);

    // Load the keys and values.
    cudaKVArray<K, V, E::vt> unsorted;
    unsorted.keys = cuda_mem_to_reg_thread<E::nt, E::vt>(
      keys_input + tile.begin, tid, tile.count(), shared.keys
    );

    if(has_values) {
      unsorted.vals = cuda_mem_to_reg_thread<E::nt, E::vt>(
        vals_input + tile.begin, tid, tile.count(), shared.vals
      );
    }

    // Blocksort.
    auto sorted = sort_t().block_sort(unsorted, tid, tile.count(), comp, shared.sort);

    // Store the keys and values.
    cuda_reg_to_mem_thread<E::nt, E::vt>(
      sorted.keys, tid, tile.count(), keys_blocksort + tile.begin, shared.keys
    );

    if(has_values) {
      cuda_reg_to_mem_thread<E::nt, E::vt>(
        sorted.vals, tid, tile.count(), vals_blocksort + tile.begin, shared.vals
      );
    }
  });

  if(R == 0) {
    return;
  }

  // merge passes

  if(1 & R) {
    std::swap(keys_input, keys_output);
    std::swap(vals_input, vals_output);
  }

  // number of partitions
  //unsigned num_partitions = B + 1;
  //cudaDeviceVector<unsigned> mem(num_partitions);
  //auto mp_data = mem.data();
  //std::cout << "num_partitions = " << (B+1)*sizeof(unsigned) << std::endl;

  for(unsigned pass = 0; pass < R; ++pass) {

    unsigned coop = 2 << pass;

    cuda_merge_sort_partitions(
      p, keys_input, count, coop, E::nv, comp, mp_data
    );

    cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) {

      __shared__ union {
        K keys[E::nv + 1];
        unsigned indices[E::nv];
      } shared;

      auto tile = cuda_get_tile(bid, E::nv, count);

      // Load the range for this CTA and merge the values into register.
      auto range = cuda_compute_merge_sort_range(
        count, bid, coop, E::nv, mp_data[bid + 0], mp_data[bid + 1]
      );

      auto merge = block_merge_from_mem<cudaMergeBoundType::LOWER, E::nt, E::vt>(
        keys_input, keys_input, range, tid, comp, shared.keys
      );

      // Store merged values back out.
      cuda_reg_to_mem_thread<E::nt>(
        merge.keys, tid, tile.count(), keys_output + tile.begin, shared.keys
      );

      if(has_values) {
        // Transpose the indices from thread order to strided order.
        auto indices = cuda_reg_thread_to_strided<E::nt>(
          merge.indices, tid, shared.indices
        );

        // Gather the input values and merge into the output values.
        cuda_transfer_two_streams_strided<E::nt>(
          vals_input + range.a_begin, range.a_count(),
          vals_input + range.b_begin, range.b_count(),
          indices, tid, vals_output + tile.begin
        );
      }
    });

    std::swap(keys_input, keys_output);
    std::swap(vals_input, vals_output);
  }
}

} // end of namespace tf::detail ---------------------------------------------

namespace tf {

// queries the buffer size in bytes needed by the sort kernels for the given
// number of elements; the layout matches merge_sort_loop above: count keys,
// count values (key-value sort only), then B + 1 merge-path split points
template <typename P, typename K, typename V = cudaEmpty>
unsigned cuda_sort_buffer_size(unsigned count) {

  using E = std::decay_t<P>;

  const bool has_values = !std::is_same<V, cudaEmpty>::value;

  unsigned B = (count + E::nv - 1) / E::nv;
  unsigned R = detail::cuda_find_log2(B, true);

  return R ? (count * sizeof(K) + (has_values ? count*sizeof(V) : 0) +
              (B+1)*sizeof(unsigned)) : 0;
}

// ----------------------------------------------------------------------------
// key-value sort
// ----------------------------------------------------------------------------

// performs asynchronous key-value sort on a range of items
template<typename P, typename K_it, typename V_it, typename C>
void cuda_sort_by_key(
  P&& p, K_it k_first, K_it k_last, V_it v_first, C comp, void* buf
) {

  unsigned N = std::distance(k_first, k_last);

  if(N <= 1) {
    return;
  }

  detail::merge_sort_loop(p, k_first, v_first, N, comp, buf);
}

// ----------------------------------------------------------------------------
// key sort
// ----------------------------------------------------------------------------

// performs asynchronous key-only sort on a range of items
template<typename P, typename K_it, typename C>
void cuda_sort(P&& p, K_it k_first, K_it k_last, C comp, void* buf) {
  cuda_sort_by_key(p, k_first, k_last, (cudaEmpty*)nullptr, comp, buf);
}
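
// [editor's note] Illustrative usage of the free functions above, not part of
// the original header. The functor and function names are ours; d_data is
// assumed to already hold N unsorted ints in device memory, the plain CUDA
// runtime API is used for the scratch buffer, and error checking is omitted.
struct cudaSortExampleLess {
  __device__ bool operator()(int a, int b) const { return a < b; }
};

inline void cuda_sort_usage_example(int* d_data, unsigned N, cudaStream_t stream) {
  // query and allocate the scratch buffer required by the merge passes
  auto bufsz = cuda_sort_buffer_size<cudaDefaultExecutionPolicy, int>(N);
  void* d_buf {nullptr};
  cudaMalloc(&d_buf, bufsz);

  // launch the sort asynchronously on the given stream, then wait for it
  cuda_sort(cudaDefaultExecutionPolicy{stream},
    d_data, d_data + N, cudaSortExampleLess{}, d_buf
  );
  cudaStreamSynchronize(stream);
  cudaFree(d_buf);
}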

// ----------------------------------------------------------------------------
// cudaFlow
// ----------------------------------------------------------------------------

// Function: sort
template <typename I, typename C>
cudaTask cudaFlow::sort(I first, I last, C comp) {
  return capture([=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.sort(first, last, comp);
  });
}

// Function: sort
template <typename I, typename C>
void cudaFlow::sort(cudaTask task, I first, I last, C comp) {
  capture(task, [=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.sort(first, last, comp);
  });
}

// Function: sort_by_key
template <typename K_it, typename V_it, typename C>
cudaTask cudaFlow::sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp) {
  return capture([=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.sort_by_key(k_first, k_last, v_first, comp);
  });
}

// Function: sort_by_key
template <typename K_it, typename V_it, typename C>
void cudaFlow::sort_by_key(
  cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
) {
  capture(task, [=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.sort_by_key(k_first, k_last, v_first, comp);
  });
}
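
// [editor's note] Illustrative usage only, not part of the original header:
// a cudaFlow sort task is typically created from within a tf::Taskflow, e.g.
//
//   taskflow.emplace([=](tf::cudaFlow& cf){
//     cf.sort(d_data, d_data + N,
//       [] __device__ (int a, int b) { return a < b; }
//     );
//   });
//
// The comparator must be callable on the device (an extended __device__
// lambda, as sketched here, or a functor with a __device__ operator()).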

// ----------------------------------------------------------------------------
// cudaFlowCapturer
// ----------------------------------------------------------------------------

// Function: sort
template <typename I, typename C>
cudaTask cudaFlowCapturer::sort(I first, I last, C comp) {

  using K = typename std::iterator_traits<I>::value_type;

  auto bufsz = cuda_sort_buffer_size<cudaDefaultExecutionPolicy, K>(
    std::distance(first, last)
  );

  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cuda_sort(
      cudaDefaultExecutionPolicy{stream}, first, last, comp, buf.get().data()
    );
  });
}

// Function: sort
template <typename I, typename C>
void cudaFlowCapturer::sort(cudaTask task, I first, I last, C comp) {

  using K = typename std::iterator_traits<I>::value_type;

  auto bufsz = cuda_sort_buffer_size<cudaDefaultExecutionPolicy, K>(
    std::distance(first, last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cuda_sort(
      cudaDefaultExecutionPolicy{stream}, first, last, comp, buf.get().data()
    );
  });
}

// Function: sort_by_key
template <typename K_it, typename V_it, typename C>
cudaTask cudaFlowCapturer::sort_by_key(
  K_it k_first, K_it k_last, V_it v_first, C comp
) {

  using K = typename std::iterator_traits<K_it>::value_type;
  using V = typename std::iterator_traits<V_it>::value_type;

  auto bufsz = cuda_sort_buffer_size<cudaDefaultExecutionPolicy, K, V>(
    std::distance(k_first, k_last)
  );

  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cuda_sort_by_key(cudaDefaultExecutionPolicy{stream},
      k_first, k_last, v_first, comp, buf.get().data()
    );
  });
}

// Function: sort_by_key
template <typename K_it, typename V_it, typename C>
void cudaFlowCapturer::sort_by_key(
  cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
) {

  using K = typename std::iterator_traits<K_it>::value_type;
  using V = typename std::iterator_traits<V_it>::value_type;

  auto bufsz = cuda_sort_buffer_size<cudaDefaultExecutionPolicy, K, V>(
    std::distance(k_first, k_last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cuda_sort_by_key(cudaDefaultExecutionPolicy{stream},
      k_first, k_last, v_first, comp, buf.get().data()
    );
  });
}
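
// [editor's note] Illustrative usage only, not part of the original header:
// the capturer-based sorts are created the same way from a tf::Taskflow, and
// each call allocates its own scratch buffer internally (see bufsz above):
//
//   taskflow.emplace([=](tf::cudaFlowCapturer& cap){
//     cap.sort_by_key(d_keys, d_keys + N, d_vals,
//       [] __device__ (int a, int b) { return a < b; }
//     );
//   });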


} // end of namespace tf -----------------------------------------------------
