// NOTE(review): this page is a text extraction of a rendered source listing —
// the digits fused onto line starts (17, 20, ...) are original source line
// numbers, and many interior lines are missing throughout. Code below is
// fragmentary; comments describe only what is visible.
// Tile counts above this threshold select the recursive multi-block scan path
// (see the `B > cudaScanRecursionThreshold` branch further down this file).
17inline constexpr unsigned cudaScanRecursionThreshold = 8;
// Scan flavor selector: EXCLUSIVE vs INCLUSIVE (enumerator list not visible
// in this extraction).
20enum class cudaScanType :
int {
// Result of a block scan: the per-thread scan value(s) plus (presumably) the
// tile-wide reduction — the `reduction` member is assigned later in this file
// but its declaration line is missing from this extraction.
// Primary template (vt == 0): scalar scan value per thread.
26template<
typename T,
unsigned vt = 0,
bool is_array = (vt > 0)>
27struct cudaScanResult {
// Specialization for vt > 0 (is_array == true): each thread carries an array
// of vt scanned values, used by the vectorized operator() below.
33template<
typename T,
unsigned vt>
34struct cudaScanResult<T, vt, true> {
35 cudaArray<T, vt> scan;
// Block-wide scan primitive: nt cooperating threads over element type T.
// (The `struct cudaBlockScan {` line itself is missing from this extraction.)
42template<
unsigned nt,
typename T>
// Storage geometry: one warp partial per CUDA_WARP_SIZE threads; num_passes
// up-sweep iterations; capacity covers thread slots plus warp slots.
// NOTE(review): `log2(nt)` here is presumably a constexpr helper, not the
// <cmath> function — TODO confirm against the missing surrounding code.
45 const static unsigned num_warps = nt / CUDA_WARP_SIZE;
46 const static unsigned num_passes = log2(nt);
47 const static unsigned capacity = nt + num_warps;
// Anonymous struct laying out per-thread and per-warp slots — likely one
// member of a shared-memory storage union; the enclosing declaration is not
// visible here.
52 struct { T threads[nt], warps[num_warps]; };
// Scalar overload: scans one value per thread; defaults to exclusive scan.
56 template<
typename op_t>
57 __device__ cudaScanResult<T> operator ()(
64 cudaScanType type = cudaScanType::EXCLUSIVE
// Vectorized overload: scans vt values per thread; optional carry-in value
// (use_carry_in defaults to false); defaults to exclusive scan.
69 template<
unsigned vt,
typename op_t>
70 __device__ cudaScanResult<T, vt> operator()(
75 bool use_carry_in =
false,
79 cudaScanType type = cudaScanType::EXCLUSIVE
// Out-of-class definition of the scalar block scan. Visible shape: stage the
// value in shared storage, run num_passes combine steps (a Hillis–Steele
// style inclusive scan), then derive the requested inclusive/exclusive
// per-thread result and the tile reduction.
84template <
unsigned nt,
typename T>
85template<
typename op_t>
86__device__ cudaScanResult<T> cudaBlockScan<nt, T>::operator () (
87 unsigned tid, T x, storage_t& storage,
unsigned count, op_t op,
88 T init, cudaScanType type
// Stage this thread's value. NOTE(review): `first` is defined on a line
// missing from this extraction — presumably a pass-dependent base offset
// into storage.data; confirm against the full source.
92 storage.data[first + tid] = x;
// Up-sweep: on pass p, combine with the partner `1<<p` slots back.
// Synchronization between passes is on lines not shown here.
95 cuda_iterate<num_passes>([&](
auto pass) {
96 if(
auto offset = 1<<pass; tid >= offset) {
97 x = op(storage.data[first + tid - offset], x);
100 storage.data[first + tid] = x;
// Tile reduction = last valid element. Per-thread scan value: inclusive x,
// or for exclusive scans the predecessor's slot (init for tid == 0);
// threads at or beyond `count` take the (not visible) else branch.
104 cudaScanResult<T> result;
105 result.reduction = storage.data[first +
count - 1];
106 result.scan = (tid <
count) ?
107 (cudaScanType::INCLUSIVE == type ? x :
108 (tid ? storage.data[first + tid - 1] : init)) :
// Vectorized block scan: (1) each thread sequentially scans its vt local
// values, (2) the per-thread totals are scanned exclusively across the block
// via the scalar overload, (3) a downsweep rebuilds the final per-element
// outputs y[i] from both.
116template <
unsigned nt,
typename T>
117template<
unsigned vt,
typename op_t>
118__device__ cudaScanResult<T, vt> cudaBlockScan<nt, T>::operator()(
124 unsigned count, op_t op,
// Full-tile fast path: all nt*vt slots valid — unguarded sequential scan.
130 if(count >= nt * vt) {
131 cuda_iterate<vt>([&](
auto i) {
132 x[i] = i ? op(x[i], x[i - 1]) : x[i];
// Partial tile: guard each element index against `count`; out-of-range
// slots propagate the running value or take `init`.
135 cuda_iterate<vt>([&](
auto i) {
136 auto index = vt * tid + i;
138 ((index <
count) ? op(x[i], x[i - 1]) : x[i - 1]) :
139 (x[i] = (index <
count) ? x[i] : init);
// Exclusive block scan of the per-thread totals (x[vt - 1]); the effective
// count is the number of participating threads, ceil(count / vt).
144 auto result = operator()(
145 tid, x[vt - 1], storage,
146 (count + vt - 1) / vt, op, init, cudaScanType::EXCLUSIVE
// Fold an external carry-in into the reduction and the thread offsets;
// thread 0's offset becomes the carry itself. After this, every thread
// except tid 0 has a valid carry for the downsweep.
152 result.reduction = op(carry_in, result.reduction);
153 result.scan = tid ? op(carry_in, result.scan) : carry_in;
155 use_carry_in = tid > 0;
// Downsweep: emit exclusive (predecessor + offset) or inclusive
// (local value + offset) results per element.
159 cuda_iterate<vt>([&](
auto i) {
160 if(cudaScanType::EXCLUSIVE == type) {
161 y[i] = i ? x[i - 1] : result.scan;
162 if(use_carry_in && i > 0) y[i] = op(result.scan, y[i]);
164 y[i] = use_carry_in ? op(x[i], result.scan) : x[i];
168 return cudaScanResult<T, vt> { y, result.reduction };
// Single-block scan: one CUDA block iterates over the whole input in
// E::nv-sized tiles, carrying the running reduction (`carry_in`, declared on
// a line not shown here) from tile to tile. Used as the recursion base case.
175template <
typename P,
typename I,
typename O,
typename C>
176void cuda_single_pass_scan(
178 cudaScanType scan_type,
// Launch exactly one block of E::nt threads on the policy's stream.
190 cuda_kernel<<<1, E::nt, 0, p.stream()>>>([=] __device__ (
auto tid,
auto bid) {
192 using scan_t = cudaBlockScan<E::nt, T>;
195 typename scan_t::storage_t scan;
// Tile loop: load tile to registers, block-scan it (carry-in applies to
// every tile after the first: `cur > 0`), store, propagate the reduction.
200 for(
unsigned cur = 0; cur <
count; cur += E::nv) {
202 auto count2 =
min(count - cur, E::nv);
204 auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(input + cur,
205 tid, count2, shared.values);
207 auto result = scan_t()(tid, x, shared.scan,
208 carry_in, cur > 0, count2, op, T(), scan_type);
211 cuda_reg_to_mem_thread<E::nt, E::vt>(result.scan, tid, count2,
212 output + cur, shared.values);
215 carry_in = result.reduction;
// Recursive scan driver (presumably cuda_scan_loop — the signature line,
// original line ~231, is missing from this extraction). Small inputs fall
// through to cuda_single_pass_scan; otherwise: per-tile upsweep reductions
// into `buffer`, a recursive EXCLUSIVE scan of those partials, then a
// downsweep pass using buffer[bid] as each tile's carry-in.
229template<
typename P,
typename I,
typename O,
typename C>
232 cudaScanType scan_type,
// Reinterpret the caller's scratch area as T; B = number of E::nv tiles.
244 T* buffer =
static_cast<T*
>(ptr);
247 unsigned B = (
count + E::nv - 1) / E::nv;
249 if(B > cudaScanRecursionThreshold) {
// Upsweep kernel: each of the B blocks reduces its tile into buffer[bid].
255 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (
auto tid,
auto bid) {
257 __shared__
typename cudaBlockReduce<E::nt, T>::Storage shm;
260 auto tile = cuda_get_tile(bid, E::nv, count);
261 auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
262 input + tile.begin, tid, tile.count()
// Thread-local sequential reduce of the strided-loaded values.
267 cuda_strided_iterate<E::nt, E::vt>(
268 [&] (
auto i,
auto j) { scalar = i ? op(scalar, x[i]) : x[0]; },
273 auto all_reduce = cudaBlockReduce<E::nt, T>()(
274 tid, scalar, shm, tile.count(), op
279 buffer[bid] = all_reduce;
// Recurse on the B tile partials (in-place, scratch continues at buffer+B).
// The call's name/line is missing here — presumably a self-recursion.
286 p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, buffer+B
// Downsweep kernel: rescan each tile, feeding buffer[bid] as carry-in for
// every block after the first (bid > 0), with the caller's scan_type.
291 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (
auto tid,
auto bid) {
293 using scan_t = cudaBlockScan<E::nt, T>;
296 typename scan_t::storage_t scan;
301 auto tile = cuda_get_tile(bid, E::nv, count);
302 auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(
303 input + tile.begin, tid, tile.count(), shared.values
307 auto y = scan_t()(tid, x, shared.scan,
308 buffer[bid], bid > 0, tile.count(), op, T(),
312 cuda_reg_to_mem_thread<E::nt, E::vt>(
313 y, tid, tile.count(), output + tile.begin, shared.values
// Base case: few enough tiles — run the single-block, single-pass scan.
319 cuda_single_pass_scan(p, scan_type, input, count, output, op);
// Scratch-size query (per the tooltip text later in this page: returns the
// buffer size in bytes needed by the scan kernels). Visible logic: start
// with B = ceil(count / E::nv) tile partials, and accumulate one level of
// partials per recursion step until the tile count drops to the threshold.
339template <
typename P,
typename T>
342 unsigned B = (count + E::nv - 1) / E::nv;
344 for(
auto b=B; b>detail::cudaScanRecursionThreshold; b=(b+E::nv-1)/E::nv) {
// Public entry point: asynchronous inclusive scan of [first, last) into
// output on execution policy p, using caller-provided scratch `buf`
// (sized via cuda_scan_buffer_size). Delegates to the detail scan driver.
393template<
typename P,
typename I,
typename O,
typename C>
395 P&& p, I first, I last, O output, C op,
void* buf
405 detail::cuda_scan_loop(
406 p, detail::cudaScanType::INCLUSIVE, first, count, output, op, buf
// Public entry point: inclusive scan over uop(*(first+i)) — the unary
// transform is fused into the load via a transforming load iterator, so no
// intermediate buffer of transformed values is materialized.
460template<
typename P,
typename I,
typename O,
typename C,
typename U>
462 P&& p, I first, I last, O output, C bop, U uop,
void* buf
474 detail::cuda_scan_loop(
475 p, detail::cudaScanType::INCLUSIVE,
476 cuda_make_load_iterator<T>([=]__device__(
auto i){
return uop(*(first+i)); }),
477 count, output, bop, buf
// Public entry point: asynchronous exclusive scan of [first, last) into
// output. Identical shape to cuda_inclusive_scan, differing only in the
// cudaScanType passed to the detail driver.
525template<
typename P,
typename I,
typename O,
typename C>
527 P&& p, I first, I last, O output, C op,
void* buf
537 detail::cuda_scan_loop(
538 p, detail::cudaScanType::EXCLUSIVE, first, count, output, op, buf
// Public entry point: exclusive scan over uop(*(first+i)), fusing the unary
// transform into the load iterator — the exclusive counterpart of
// cuda_transform_inclusive_scan above.
592template<
typename P,
typename I,
typename O,
typename C,
typename U>
594 P&& p, I first, I last, O output, C bop, U uop,
void* buf
606 detail::cuda_scan_loop(
607 p, detail::cudaScanType::EXCLUSIVE,
608 cuda_make_load_iterator<T>([=]__device__(
auto i){
return uop(*(first+i)); }),
609 count, output, bop, buf
// Member declarations (original lines 618-689); the function names and bodies
// were stripped by the extraction. Per the tooltip text later in this page
// these correspond to inclusive_scan / exclusive_scan (original lines 618-651,
// each in a creating variant and a cudaTask-updating variant) ...
618template <
typename I,
typename O,
typename C>
627template <
typename I,
typename O,
typename C>
636template <
typename I,
typename O,
typename C>
645template <
typename I,
typename O,
typename C>
// ... and transform_inclusive_scan / transform_exclusive_scan (original lines
// 654-689), which additionally take a binary op B and a unary transform U.
// The variants whose first parameter is `cudaTask task` rebind an existing
// task rather than creating a new one — TODO confirm against the full source.
654template <
typename I,
typename O,
typename B,
typename U>
656 I first, I last, O output, B bop, U uop
665template <
typename I,
typename O,
typename B,
typename U>
667 cudaTask task, I first, I last, O output, B bop, U uop
676template <
typename I,
typename O,
typename B,
typename U>
678 I first, I last, O output, B bop, U uop
687template <
typename I,
typename O,
typename B,
typename U>
689 cudaTask task, I first, I last, O output, B bop, U uop
// Capturer-side member definitions (original lines 702-852). Common shape in
// every fragment below: compute the scratch size with cuda_scan_buffer_size
// under the default execution policy, then capture a stream callable that
// owns a cudaDeviceVector<std::byte> scratch buffer (moved into the lambda
// via the MoC move-on-copy wrapper, hence `mutable`) and runs the
// corresponding cuda_*_scan free function on the given stream.
244template <
typename I,
typename O,
typename C>
707 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
711 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
712 (cudaStream_t stream)
mutable {
// Task-updating variant: same capture, rebinding an existing cudaTask.
719template <
typename I,
typename O,
typename C>
721 cudaTask task, I first, I last, O output, C op
726 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
730 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
731 (cudaStream_t stream)
mutable {
// Exclusive-scan pair (creating + task-updating), per the tooltip text.
738template <
typename I,
typename O,
typename C>
743 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
747 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
748 (cudaStream_t stream)
mutable {
755template <
typename I,
typename O,
typename C>
757 cudaTask task, I first, I last, O output, C op
762 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
766 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
767 (cudaStream_t stream)
mutable {
// Transform-inclusive pair: forwards bop/uop plus the owned scratch buffer.
774template <
typename I,
typename O,
typename B,
typename U>
776 I first, I last, O output, B bop, U uop
781 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
785 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
786 (cudaStream_t stream)
mutable {
789 p, first, last, output, bop, uop, buf.get().data()
795template <
typename I,
typename O,
typename B,
typename U>
797 cudaTask task, I first, I last, O output, B bop, U uop
802 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
806 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
807 (cudaStream_t stream)
mutable {
810 p, first, last, output, bop, uop, buf.get().data()
// Transform-exclusive pair: same structure as the inclusive pair above.
816template <
typename I,
typename O,
typename B,
typename U>
818 I first, I last, O output, B bop, U uop
823 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
827 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
828 (cudaStream_t stream)
mutable {
831 p, first, last, output, bop, uop, buf.get().data()
837template <
typename I,
typename O,
typename B,
typename U>
839 cudaTask task, I first, I last, O output, B bop, U uop
844 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
848 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
849 (cudaStream_t stream)
mutable {
852 p, first, last, output, bop, uop, buf.get().data()
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlowCapturer::inclusive_scan but excludes the first value
Definition scan.hpp:739
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
captures kernels that perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:775
cudaTask inclusive_scan(I first, I last, O output, C op)
captures kernels that perform parallel inclusive scan over a range of items
Definition scan.hpp:703
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value
Definition scan.hpp:817
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask on(C &&callable)
captures sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask inclusive_scan(I first, I last, O output, C op)
creates a task to perform parallel inclusive scan over a range of items
Definition scan.hpp:619
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
creates a task to perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:655
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlow::inclusive_scan but excludes the first value
Definition scan.hpp:637
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlow::transform_inclusive_scan but excludes the first value
Definition scan.hpp:677
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
taskflow namespace
Definition small_vector.hpp:27
void cuda_inclusive_scan(P &&p, I first, I last, O output, C op, void *buf)
performs asynchronous inclusive scan over a range of items
Definition scan.hpp:394
void cuda_transform_exclusive_scan(P &&p, I first, I last, O output, C bop, U uop, void *buf)
performs asynchronous exclusive scan over a range of transformed items
Definition scan.hpp:593
void cuda_exclusive_scan(P &&p, I first, I last, O output, C op, void *buf)
performs asynchronous exclusive scan over a range of items
Definition scan.hpp:526
void cuda_transform_inclusive_scan(P &&p, I first, I last, O output, C bop, U uop, void *buf)
performs asynchronous inclusive scan over a range of transformed items
Definition scan.hpp:461
unsigned cuda_scan_buffer_size(unsigned count)
queries the buffer size in bytes needed to call scan kernels
Definition scan.hpp:340
cuda reduce algorithms include file