// Counts leading zeros in the 32-bit pattern of x (e.g., cuda_clz(1) == 31).
// Returns 32 when no bit is set (x == 0).
constexpr int cuda_clz(int x) {
  // scan from the most-significant bit down to bit 0
  for(int i = 31; i >= 0; --i) {
    if((1 << i) & x) {
      return 31 - i;
    }
  }
  return 32;
}
// Returns floor(log2(x)) for positive x; when round_up is true returns
// ceil(log2(x)) instead (relies on is_pow2 to detect exact powers of two).
constexpr int cuda_find_log2(int x, bool round_up = false) {
  int a = 31 - cuda_clz(x);
  if(round_up) {
    // bump by one unless x is already an exact power of two
    a += !is_pow2(x);
  }
  return a;
}
// Sorts the vt per-thread items in x with an odd-even transposition network.
// Bit (i + 1) of flags marks a segment boundary after element i, preventing
// the compare-exchange from crossing it (used for out-of-range padding).
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation.
template<typename T, unsigned vt, typename C>
__device__ auto cuda_odd_even_sort(
  cudaArray<T, vt> x, C comp, int flags = 0
) {
  cuda_iterate<vt>([&](auto I) {
    // alternate between even and odd phases based on pass parity (1 & I)
    for(auto i = 1 & I; i < vt - 1; i += 2) {
      // skip the exchange when a segment boundary is flagged after slot i
      if((0 == ((2<< i) & flags)) && comp(x[i + 1], x[i])) {
        cuda_swap(x[i], x[i + 1]);
      }
    }
  });
  return x;
}
// Key-value overload of the odd-even transposition sort: keys drive the
// comparisons and the paired values are swapped in lockstep with them.
// flags has the same segment-boundary semantics as the keys-only overload.
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation.
template<typename K, typename V, unsigned vt, typename C>
__device__ auto cuda_odd_even_sort(
  cudaKVArray<K, V, vt> x, C comp, int flags = 0
) {
  cuda_iterate<vt>([&](auto I) {
    // alternate between even and odd phases based on pass parity (1 & I)
    for(auto i = 1 & I; i < vt - 1; i += 2) {
      if((0 == ((2<< i) & flags)) && comp(x.keys[i + 1], x.keys[i])) {
        // keep values paired with their keys
        cuda_swap(x.keys[i], x.keys[i + 1]);
        cuda_swap(x.vals[i], x.vals[i + 1]);
      }
    }
  });
  return x;
}
// Computes a vt-bit field whose set bits mark the slots of a thread's
// register slice (starting at global index first) that fall outside the
// valid element range [0, count). Returns 0 when the whole slice is valid.
__device__
inline int cuda_out_of_range_flags(int first, int vt, int count) {
  // number of trailing out-of-range slots, clamped to vt
  int out_of_range = min(vt, first + vt - count);
  int head_flags = 0;
  if(out_of_range > 0) {
    const int mask = (1<< vt) - 1;
    // set the top out_of_range bits of the vt-bit mask
    head_flags = mask & (~mask>> out_of_range);
  }
  return head_flags;
}
// Computes the merge frame of the given partition within one merge level:
// coop partitions cooperate (coop is a power of two) and each of the two
// source lists spans spacing * (coop / 2) elements.
__device__
inline auto cuda_compute_merge_sort_frame(
  unsigned partition, unsigned coop, unsigned spacing
) {
  unsigned size = spacing * (coop / 2);
  // first partition of this cooperative group
  unsigned start = ~(coop - 1) & partition;
  unsigned a_begin = spacing * start;
  unsigned b_begin = spacing * start + size;
  // a = [a_begin, b_begin), b = [b_begin, b_begin + size)
  return cudaMergeRange {
    a_begin, b_begin, b_begin, b_begin + size
  };
}
// Computes the merge range of the given partition, clamping the frame
// boundaries to count so partial tiles at the end of the data are valid.
__device__
inline auto cuda_compute_merge_sort_range(
  unsigned count, unsigned partition, unsigned coop, unsigned spacing
) {
  auto frame = cuda_compute_merge_sort_frame(partition, coop, spacing);
  return cudaMergeRange {
    frame.a_begin,
    min(count, frame.a_end),
    min(count, frame.b_begin),
    min(count, frame.b_end)
  };
}
// Overload that additionally narrows the merge range using the two
// precomputed merge-path split points (mp0 at this partition's diagonal,
// mp1 at the next partition's diagonal).
__device__
inline auto cuda_compute_merge_sort_range(
  unsigned count, unsigned partition, unsigned coop, unsigned spacing,
  unsigned mp0, unsigned mp1
) {
  auto range = cuda_compute_merge_sort_range(count, partition, coop, spacing);

  // diagonal of this partition within its merge frame
  unsigned diag = spacing * partition - range.a_begin;

  // unless this is the last partition of its cooperative group, the upper
  // bounds come from the next partition's split point mp1
  if(coop - 1 != ((coop - 1) & partition)) {
    range.a_end = range.a_begin + mp1;
    range.b_end = min(count, range.b_begin + diag + spacing - mp1);
  }

  // lower bounds always come from this partition's split point mp0
  range.a_begin = range.a_begin + mp0;
  range.b_begin = min(count, range.b_begin + diag - mp0);

  return range;
}
// Block-wide merge sort of up to nt * vt (key, value) items held in
// registers, one cudaKVArray slice of vt items per thread. Pass
// V = cudaEmpty for a keys-only sort.
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation.
template<unsigned nt, unsigned vt, typename K, typename V>
struct cudaBlockSort {

  // cudaEmpty as the value type disables all value movement
  static constexpr bool has_values = !std::is_same<V, cudaEmpty>::value;

  // number of pairwise merge passes needed to merge nt sorted threads
  static constexpr unsigned num_passes = log2(nt);

  // shared-memory scratch; keys and values are staged in turn, never together
  union Storage {
    K keys[nt * vt + 1];
    V vals[nt * vt];
  };

  static_assert(is_pow2(nt), "cudaBlockSort requires pow2 number of threads");

  // One merge pass: threads cooperate in groups of coop = 2 << pass to merge
  // pairs of sorted runs produced by the previous pass.
  template<typename C>
  __device__ auto merge_pass(
    cudaKVArray<K, V, vt> x,
    unsigned tid, unsigned count, unsigned pass,
    C comp, Storage& storage
  ) const {

    // parameters of this thread's slice of the merge
    unsigned coop = 2 << pass;
    auto range = cuda_compute_merge_sort_range(count, tid, coop, vt);
    unsigned diag = vt * tid - range.a_begin;

    // stage the keys in shared memory so threads can read across slices
    cuda_reg_to_shared_thread<nt, vt>(x.keys, tid, storage.keys);

    // locate this thread's merge-path split point
    auto mp = cuda_merge_path<cudaMergeBoundType::LOWER>(
      storage.keys, range, diag, comp
    );

    // serially merge vt items starting at the split point
    auto merge = cuda_serial_merge<cudaMergeBoundType::LOWER, vt>(
      storage.keys, range.partition(mp, diag), comp
    );
    x.keys = merge.keys;

    if(has_values) {
      // permute the values through shared memory using the merge indices
      cuda_reg_to_shared_thread<nt, vt>(x.vals, tid, storage.vals);
      x.vals = cuda_shared_gather<nt, vt>(storage.vals, merge.indices);
    }

    return x;
  }

  // Sorts count items held across the block; count may be below nt * vt,
  // in which case out-of-range slots are flagged and kept at the end.
  template<typename C>
  __device__ auto block_sort(
    cudaKVArray<K, V, vt> x,
    unsigned tid, unsigned count, C comp, Storage& storage
  ) const {

    // sort each thread's vt items in registers first
    if(count < nt * vt) {
      auto head_flags = cuda_out_of_range_flags(vt * tid, vt, count);
      x = cuda_odd_even_sort(x, comp, head_flags);
    } else {
      x = cuda_odd_even_sort(x, comp);
    }

    // merge sorted per-thread runs pairwise until the whole block is sorted
    for(unsigned pass = 0; pass < num_passes; ++pass) {
      x = merge_pass(x, tid, count, pass, comp, storage);
    }

    return x;
  }
};
// Launches a kernel that computes the merge-path split point for every
// partition boundary of a merge pass (tile width = spacing, coop
// cooperating partitions) and stores them into buf.
// buf must hold at least (count + spacing - 1) / spacing + 1 entries.
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation.
template<typename P, typename K, typename C>
void cuda_merge_sort_partitions(
  P&& p, K keys, unsigned count,
  unsigned coop, unsigned spacing, C comp, unsigned* buf
) {

  // one boundary per spacing-sized tile, plus a terminating boundary
  unsigned num_partitions = (count + spacing - 1) / spacing + 1;

  const unsigned nt = 128;
  const unsigned vt = 1;
  const unsigned nv = nt * vt;

  unsigned B = (num_partitions + nv - 1) / nv;

  cuda_kernel<<<B, nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
    auto range = cuda_get_tile(bid, nt * vt, num_partitions);
    cuda_strided_iterate<nt, vt>([=](auto, auto j) {
      auto index = j + range.begin;
      auto range = cuda_compute_merge_sort_range(count, index, coop, spacing);
      // diagonal of this boundary within its merge frame
      auto diag = min(spacing * index, count) - range.a_begin;
      buf[index] = cuda_merge_path<cudaMergeBoundType::LOWER>(
        keys + range.a_begin, range.a_count(),
        keys + range.b_begin, range.b_count(),
        diag, comp
      );
    }, tid, range.count());
  });
}
// Host-side driver of the merge sort: block-sorts nv-sized tiles, then runs
// R global merge passes that ping-pong between the input arrays and the
// auxiliary buffer buf (sized by cuda_sort_buffer_size). R is chosen so the
// final result lands back in (keys_input, vals_input).
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation; K_it/V_it are assumed pointer-like.
template<typename P, typename K_it, typename V_it, typename C>
void merge_sort_loop(
  P&& p, K_it keys_input, V_it vals_input,
  unsigned count, C comp,
  void* buf
) {

  using K = typename std::iterator_traits<K_it>::value_type;
  using V = typename std::iterator_traits<V_it>::value_type;
  using E = std::decay_t<P>;

  const bool has_values = !std::is_same<V, cudaEmpty>::value;

  unsigned B = (count + E::nv - 1) / E::nv;
  unsigned R = cuda_find_log2(B, true);

  K* keys_output {nullptr};
  V* vals_output {nullptr};
  unsigned *mp_data {nullptr};

  // carve the auxiliary buffer: keys, then (optionally) values, then the
  // merge-path partition array
  if(R) {
    keys_output = (K*)(buf);
    if(has_values) {
      vals_output = (V*)(keys_output + count);
      mp_data = (unsigned*)(vals_output + count);
    }
    else {
      mp_data = (unsigned*)(keys_output + count);
    }
  }

  // when R is odd, block-sort into the auxiliary buffer so the final merge
  // pass writes back into the caller's arrays
  auto keys_blocksort = (1 & R) ? keys_output : keys_input;
  auto vals_blocksort = (1 & R) ? vals_output : vals_input;

  // phase 1: sort each nv-sized tile within a block
  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {

    using sort_t = cudaBlockSort<E::nt, E::vt, K, V>;

    __shared__ union {
      typename sort_t::Storage sort;
      K keys[E::nv];
      V vals[E::nv];
    } shared;

    auto tile = cuda_get_tile(bid, E::nv, count);

    // load the keys (and values) of this tile into registers
    cudaKVArray<K, V, E::vt> unsorted;
    unsorted.keys = cuda_mem_to_reg_thread<E::nt, E::vt>(
      keys_input + tile.begin, tid, tile.count(), shared.keys
    );
    if(has_values) {
      unsorted.vals = cuda_mem_to_reg_thread<E::nt, E::vt>(
        vals_input + tile.begin, tid, tile.count(), shared.vals
      );
    }

    // block-wide sort of the tile
    auto sorted = sort_t().block_sort(unsorted, tid, tile.count(), comp, shared.sort);

    // store the sorted tile
    cuda_reg_to_mem_thread<E::nt, E::vt>(
      sorted.keys, tid, tile.count(), keys_blocksort + tile.begin, shared.keys
    );
    if(has_values) {
      cuda_reg_to_mem_thread<E::nt, E::vt>(
        sorted.vals, tid, tile.count(), vals_blocksort + tile.begin, shared.vals
      );
    }
  });

  // the block-sorted data becomes the input of the first merge pass
  if(1 & R) {
    std::swap(keys_input, keys_output);
    std::swap(vals_input, vals_output);
  }

  // phase 2: R global merge passes, doubling the run width each pass
  for(unsigned pass = 0; pass < R; ++pass) {

    unsigned coop = 2 << pass;

    // compute the merge-path partitions for this pass
    cuda_merge_sort_partitions(
      p, keys_input, count, coop, E::nv, comp, mp_data
    );

    cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) {

      __shared__ union {
        K keys[E::nv + 1];
        unsigned indices[E::nv];
      } shared;

      auto tile = cuda_get_tile(bid, E::nv, count);

      // load the merge range of this block and merge the keys in register
      auto range = cuda_compute_merge_sort_range(
        count, bid, coop, E::nv, mp_data[bid + 0], mp_data[bid + 1]
      );

      auto merge = block_merge_from_mem<cudaMergeBoundType::LOWER, E::nt, E::vt>(
        keys_input, keys_input, range, tid, comp, shared.keys
      );

      // store the merged keys
      cuda_reg_to_mem_thread<E::nt>(
        merge.keys, tid, tile.count(), keys_output + tile.begin, shared.keys
      );

      if(has_values) {
        // transpose the merge indices from thread order to strided order
        auto indices = cuda_reg_thread_to_strided<E::nt>(
          merge.indices, tid, shared.indices
        );

        // gather the values through the indices into the output
        cuda_transfer_two_streams_strided<E::nt>(
          vals_input + range.a_begin, range.a_count(),
          vals_input + range.b_begin, range.b_count(),
          indices, tid, vals_output + tile.begin
        );
      }
    });

    // ping-pong the buffers for the next pass
    std::swap(keys_input, keys_output);
    std::swap(vals_input, vals_output);
  }
}
/**
@brief queries the buffer size in bytes needed to call sort kernels
       for the given number of elements

Returns 0 when no auxiliary buffer is needed (a single block-sorted tile,
i.e., no global merge passes). Otherwise the buffer holds a keys array,
an optional values array, and the merge-path partition array.
*/
template <typename P, typename K, typename V = cudaEmpty>
unsigned cuda_sort_buffer_size(unsigned count) {

  using E = std::decay_t<P>;

  // cudaEmpty as the value type selects a keys-only sort
  const bool has_values = !std::is_same<V, cudaEmpty>::value;

  unsigned B = (count + E::nv - 1) / E::nv;
  unsigned R = detail::cuda_find_log2(B, true);

  return R ? (count * sizeof(K) + (has_values ? count*sizeof(V) : 0) +
             (B+1)*sizeof(unsigned)) : 0;
}
/**
@brief performs asynchronous key-value sort on a range of items

Sorts [k_first, k_last) with comparator comp and applies the same
permutation to the values starting at v_first. buf must point to device
memory of at least cuda_sort_buffer_size<P, K, V>(N) bytes.
The call is asynchronous with respect to the stream of p.
*/
template<typename P, typename K_it, typename V_it, typename C>
void cuda_sort_by_key(
  P&& p, K_it k_first, K_it k_last, V_it v_first, C comp,
  void* buf
) {

  unsigned N = std::distance(k_first, k_last);

  // zero or one element is already sorted
  if(N <= 1) {
    return;
  }

  detail::merge_sort_loop(p, k_first, v_first, N, comp, buf);
}
/**
@brief performs asynchronous key-only sort on a range of items

Delegates to the key-value sort with cudaEmpty values, which disables all
value movement in the kernels.
*/
template<typename P, typename K_it, typename C>
void cuda_sort(P&& p, K_it k_first, K_it k_last, C comp,
void* buf) {
  cuda_sort_by_key(p, k_first, k_last, (cudaEmpty*)nullptr, comp, buf);
}
// Function: cudaFlow::sort
// creates a task to perform a parallel sort on an array, implemented by
// capturing the sort kernels through a linear-capturing subflow
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation
template <typename I, typename C>
cudaTask cudaFlow::sort(I first, I last, C comp) {
  return capture([=](cudaFlowCapturer& cap) mutable {
    // sort kernels run sequentially; a linear capture is optimal
    cap.make_optimizer<cudaLinearCapturing>();
    cap.sort(first, last, comp);
  });
}
// Function: cudaFlow::sort (update)
// rebinds an existing sort task to new parameters
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation
template <typename I, typename C>
void cudaFlow::sort(cudaTask task, I first, I last, C comp) {
  capture(task, [=](cudaFlowCapturer& cap) mutable {
    cap.make_optimizer<cudaLinearCapturing>();
    cap.sort(first, last, comp);
  });
}
// Function: cudaFlow::sort_by_key
// creates kernels that sort keys and apply the same permutation to the
// paired values, captured through a linear-capturing subflow
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation
template <typename K_it, typename V_it, typename C>
cudaTask cudaFlow::sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp) {
  return capture([=](cudaFlowCapturer& cap) mutable {
    cap.make_optimizer<cudaLinearCapturing>();
    cap.sort_by_key(k_first, k_last, v_first, comp);
  });
}
// Function: cudaFlow::sort_by_key (update)
// rebinds an existing sort_by_key task to new parameters
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation
template <typename K_it, typename V_it, typename C>
void cudaFlow::sort_by_key(
  cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
) {
  capture(task, [=](cudaFlowCapturer& cap) mutable {
    cap.make_optimizer<cudaLinearCapturing>();
    cap.sort_by_key(k_first, k_last, v_first, comp);
  });
}
// Function: cudaFlowCapturer::sort
// captures kernels that sort the given array on the capturer's stream;
// the auxiliary device buffer is owned by the captured callable
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation
template <typename I, typename C>
cudaTask cudaFlowCapturer::sort(I first, I last, C comp) {

  using K = typename std::iterator_traits<I>::value_type;

  // temporary device buffer required by the merge-sort kernels
  auto bufsz = cuda_sort_buffer_size<cudaDefaultExecutionPolicy, K>(
    std::distance(first, last)
  );

  // MoC moves the device vector into the copy-captured closure
  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_sort(p, first, last, comp, buf.get().data());
  });
}
// Function: cudaFlowCapturer::sort (update)
// rebinds an existing captured sort task to new parameters, reallocating
// the auxiliary buffer for the new range size
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation
template <typename I, typename C>
void cudaFlowCapturer::sort(cudaTask task, I first, I last, C comp) {

  using K = typename std::iterator_traits<I>::value_type;

  auto bufsz = cuda_sort_buffer_size<cudaDefaultExecutionPolicy, K>(
    std::distance(first, last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_sort(p, first, last, comp, buf.get().data());
  });
}
// Function: cudaFlowCapturer::sort_by_key
// captures kernels that sort keys and permute the paired values on the
// capturer's stream; the auxiliary buffer is owned by the callable
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation
template <typename K_it, typename V_it, typename C>
cudaTask cudaFlowCapturer::sort_by_key(
  K_it k_first, K_it k_last, V_it v_first, C comp
) {

  using K = typename std::iterator_traits<K_it>::value_type;
  using V = typename std::iterator_traits<V_it>::value_type;

  // key-value sort needs buffer space for keys, values, and partitions
  auto bufsz = cuda_sort_buffer_size<cudaDefaultExecutionPolicy, K, V>(
    std::distance(k_first, k_last)
  );

  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_sort_by_key(p,
      k_first, k_last, v_first, comp, buf.get().data()
    );
  });
}
// Function: cudaFlowCapturer::sort_by_key (update)
// rebinds an existing captured sort_by_key task to new parameters,
// reallocating the auxiliary buffer for the new range size
// NOTE(review): reconstructed from a truncated extraction — verify against
// the upstream implementation
template <typename K_it, typename V_it, typename C>
void cudaFlowCapturer::sort_by_key(
  cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
) {

  using K = typename std::iterator_traits<K_it>::value_type;
  using V = typename std::iterator_traits<V_it>::value_type;

  auto bufsz = cuda_sort_buffer_size<cudaDefaultExecutionPolicy, K, V>(
    std::distance(k_first, k_last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
  (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_sort_by_key(p,
      k_first, k_last, v_first, comp, buf.get().data()
    );
  });
}
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask sort(I first, I last, C comp)
captures kernels that sort the given array
Definition sort.hpp:557
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp)
captures kernels that sort the given array
Definition sort.hpp:593
cudaTask on(C &&callable)
captures sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp)
creates kernels that sort the given array
Definition sort.hpp:533
cudaTask sort(I first, I last, C comp)
creates a task to perform a parallel sort on an array
Definition sort.hpp:515
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
CUDA merge algorithm include file.
taskflow namespace
Definition small_vector.hpp:27
void cuda_sort(P &&p, K_it k_first, K_it k_last, C comp, void *buf)
performs asynchronous key-only sort on a range of items
Definition sort.hpp:505
void cuda_sort_by_key(P &&p, K_it k_first, K_it k_last, V_it v_first, C comp, void *buf)
performs asynchronous key-value sort on a range of items
Definition sort.hpp:471
unsigned cuda_sort_buffer_size(unsigned count)
queries the buffer size in bytes needed to call sort kernels for the given number of elements
Definition sort.hpp:421