hello-world/docs/cudaflow_8hpp_source.html

#pragma once


#include "../taskflow.hpp"

#include "cuda_task.hpp"

#include "cuda_capturer.hpp"


namespace tf {


// ----------------------------------------------------------------------------

// class definition: cudaFlow

// ----------------------------------------------------------------------------


// cudaFlow: class to create a cudaFlow task dependency graph on top of a
// native CUDA graph.
//
// A default-constructed cudaFlow stores its own cudaGraph inside _handle
// (External). A cudaFlow built through the private constructor (reachable by
// the friend class Executor) refers to a cudaGraph supplied by the caller
// (Internal).
class cudaFlow {

  friend class Executor;

  // Storage used by a standalone cudaFlow, which holds its own graph.
  struct External {
    cudaGraph graph;
  };

  // Tag used when the graph is supplied from outside; carries no data.
  struct Internal {
  };

  using handle_t = std::variant<External, Internal>;

  public:

    // Constructs a standalone cudaFlow.
    cudaFlow();

    // Destroys the executable graph (if any) and the native CUDA graph.
    ~cudaFlow();

    // Copying is disabled. The object holds a reference member (_graph) and a
    // raw executable handle that the destructor destroys; a member-wise copy
    // would alias both and destroy the same CUDA objects twice.
    cudaFlow(const cudaFlow&) = delete;
    cudaFlow& operator = (const cudaFlow&) = delete;

    // Queries the emptiness of the graph.
    bool empty() const;

    // Queries the number of tasks.
    size_t num_tasks() const;

    // Clears the cudaFlow object: destroys the executable and native graphs,
    // creates a fresh native graph, and removes all nodes.
    void clear();

    // Dumps the cudaFlow graph into a DOT format through an output stream.
    void dump(std::ostream& os) const;

    // Dumps the native CUDA graph into a DOT format through an output stream.
    void dump_native_graph(std::ostream& os) const;

    // ------------------------------------------------------------------------
    // Graph building routines
    //
    // Each operation comes in two overloads: one that creates a new task and
    // returns its handle, and one that takes an existing cudaTask as its first
    // argument and updates that task's parameters.
    // ------------------------------------------------------------------------

    // Creates a no-operation task.
    cudaTask noop();

    // Creates a host task that runs a callable on the host.
    template <typename C>
    cudaTask host(C&& callable);

    // Replaces the callable of an existing host task.
    template <typename C>
    void host(cudaTask task, C&& callable);

    // Creates a kernel task: g is the grid dimension, b the block dimension,
    // s the shared memory size in bytes, f the kernel, args its arguments.
    template <typename F, typename... ArgsT>
    cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT&&... args);

    // Updates the parameters of an existing kernel task.
    template <typename F, typename... ArgsT>
    void kernel(
      cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT&&... args
    );

    // Creates a memset task that fills untyped data with a byte value.
    cudaTask memset(void* dst, int v, size_t count);

    // Updates the parameters of an existing memset task.
    void memset(cudaTask task, void* dst, int ch, size_t count);

    // Creates a memcpy task that copies untyped data in bytes.
    cudaTask memcpy(void* tgt, const void* src, size_t bytes);

    // Updates the parameters of an existing memcpy task.
    void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes);

    // Creates a memset task that sets a typed memory block to zero.
    // Only enabled for POD types of size 1, 2, or 4 bytes.
    template <typename T, std::enable_if_t<
      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
    >
    cudaTask zero(T* dst, size_t count);

    // Updates an existing memset task with zero-fill parameters.
    template <typename T, std::enable_if_t<
      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
    >
    void zero(cudaTask task, T* dst, size_t count);

    // Creates a memset task that fills a typed memory block with a value.
    // Only enabled for POD types of size 1, 2, or 4 bytes.
    template <typename T, std::enable_if_t<
      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
    >
    cudaTask fill(T* dst, T value, size_t count);

    // Updates an existing memset task with fill parameters.
    template <typename T, std::enable_if_t<
      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
    >
    void fill(cudaTask task, T* dst, T value, size_t count);

    // Creates a memcpy task that copies typed data (T must not be void).
    template <typename T,
      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
    >
    cudaTask copy(T* tgt, const T* src, size_t num);

    // Updates an existing memcpy task with typed-copy parameters.
    template <typename T,
      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
    >
    void copy(cudaTask task, T* tgt, const T* src, size_t num);

    // ------------------------------------------------------------------------
    // offload methods
    // ------------------------------------------------------------------------

    // Offloads the cudaFlow onto a GPU and repeatedly runs it until the
    // predicate becomes true.
    template <typename P>
    void offload_until(P&& predicate);

    // Offloads the cudaFlow and executes it N times.
    void offload_n(size_t N);

    // Offloads the cudaFlow and executes it once.
    void offload();

    // ------------------------------------------------------------------------
    // generic algorithms
    //
    // These members are only declared here; their definitions are not part of
    // this header section. For every algorithm the second overload takes an
    // existing cudaTask handle as its first argument.
    // ------------------------------------------------------------------------

    // Runs a callable with only a single kernel thread.
    template <typename C>
    cudaTask single_task(C c);

    template <typename C>
    void single_task(cudaTask task, C c);

    // Applies a callable to each dereferenced element of the data array.
    template <typename I, typename C>
    cudaTask for_each(I first, I last, C callable);

    template <typename I, typename C>
    void for_each(cudaTask task, I first, I last, C callable);

    // Applies a callable to each index in the range with the step size.
    template <typename I, typename C>
    cudaTask for_each_index(I first, I last, I step, C callable);

    template <typename I, typename C>
    void for_each_index(
      cudaTask task, I first, I last, I step, C callable
    );

    // Applies a callable to a source range and stores the result in a target
    // range.
    template <typename I, typename O, typename C>
    cudaTask transform(I first, I last, O output, C op);

    template <typename I, typename O, typename C>
    void transform(cudaTask task, I first, I last, O output, C c);

    // Two-input-range form of transform.
    template <typename I1, typename I2, typename O, typename C>
    cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);

    template <typename I1, typename I2, typename O, typename C>
    void transform(
      cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
    );

    // Performs parallel reduction over a range of items.
    template <typename I, typename T, typename B>
    cudaTask reduce(I first, I last, T* result, B bop);

    template <typename I, typename T, typename C>
    void reduce(cudaTask task, I first, I last, T* result, C op);

    // Similar to reduce but does not assume any initial value to reduce.
    template <typename I, typename T, typename B>
    cudaTask uninitialized_reduce(I first, I last, T* result, B bop);

    template <typename I, typename T, typename C>
    void uninitialized_reduce(
      cudaTask task, I first, I last, T* result, C op
    );

    // Performs parallel reduction over a range of transformed items.
    template <typename I, typename T, typename B, typename U>
    cudaTask transform_reduce(I first, I last, T* result, B bop, U uop);

    template <typename I, typename T, typename B, typename U>
    void transform_reduce(cudaTask, I first, I last, T* result, B bop, U uop);

    // Similar to transform_reduce but does not assume any initial value to
    // reduce.
    template <typename I, typename T, typename B, typename U>
    cudaTask transform_uninitialized_reduce(
      I first, I last, T* result, B bop, U uop
    );

    template <typename I, typename T, typename B, typename U>
    void transform_uninitialized_reduce(
      cudaTask task, I first, I last, T* result, B bop, U uop
    );

    // Creates a task to perform parallel inclusive scan over a range of items.
    template <typename I, typename O, typename C>
    cudaTask inclusive_scan(I first, I last, O output, C op);

    template <typename I, typename O, typename C>
    void inclusive_scan(cudaTask task, I first, I last, O output, C op);

    // Similar to inclusive_scan but excludes the first value.
    template <typename I, typename O, typename C>
    cudaTask exclusive_scan(I first, I last, O output, C op);

    template <typename I, typename O, typename C>
    void exclusive_scan(cudaTask task, I first, I last, O output, C op);

    // Creates a task to perform parallel inclusive scan over a range of
    // transformed items.
    template <typename I, typename O, typename B, typename U>
    cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop);

    template <typename I, typename O, typename B, typename U>
    void transform_inclusive_scan(
      cudaTask task, I first, I last, O output, B bop, U uop
    );

    // Similar to transform_inclusive_scan but excludes the first value.
    template <typename I, typename O, typename B, typename U>
    cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop);

    template <typename I, typename O, typename B, typename U>
    void transform_exclusive_scan(
      cudaTask task, I first, I last, O output, B bop, U uop
    );

    // Creates a task to perform parallel merge on two sorted arrays.
    template <typename A, typename B, typename C, typename Comp>
    cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp);

    template <typename A, typename B, typename C, typename Comp>
    void merge(
      cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
    );

    // Creates a task to perform parallel sort on an array.
    template <typename I, typename C>
    cudaTask sort(I first, I last, C comp);

    template <typename I, typename C>
    void sort(cudaTask task, I first, I last, C comp);

    // Key-value form of sort: takes a key range and the start of a value range.
    template <typename K_it, typename V_it, typename C>
    cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp);

    template <typename K_it, typename V_it, typename C>
    void sort_by_key(
      cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
    );

    // Creates a task to perform parallel key-value merge.
    template<
      typename a_keys_it, typename a_vals_it,
      typename b_keys_it, typename b_vals_it,
      typename c_keys_it, typename c_vals_it,
      typename C
    >
    cudaTask merge_by_key(
      a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
      b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
      c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
    );

    template<
      typename a_keys_it, typename a_vals_it,
      typename b_keys_it, typename b_vals_it,
      typename c_keys_it, typename c_vals_it,
      typename C
    >
    void merge_by_key(
      cudaTask task,
      a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
      b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
      c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
    );

    // Creates a task to find the index of the first element in a range.
    template <typename I, typename U>
    cudaTask find_if(I first, I last, unsigned* idx, U op);

    template <typename I, typename U>
    void find_if(cudaTask task, I first, I last, unsigned* idx, U op);

    // Finds the index of the minimum element in a range.
    template <typename I, typename O>
    cudaTask min_element(I first, I last, unsigned* idx, O op);

    template <typename I, typename O>
    void min_element(cudaTask task, I first, I last, unsigned* idx, O op);

    // Finds the index of the maximum element in a range.
    template <typename I, typename O>
    cudaTask max_element(I first, I last, unsigned* idx, O op);

    template <typename I, typename O>
    void max_element(cudaTask task, I first, I last, unsigned* idx, O op);

    // ------------------------------------------------------------------------
    // subflow
    // ------------------------------------------------------------------------

    // Constructs a subflow graph through tf::cudaFlowCapturer and adds it as
    // a child-graph task.
    template <typename C>
    cudaTask capture(C&& callable);

    // Re-captures the subflow of an existing subflow task.
    template <typename C>
    void capture(cudaTask task, C callable);

  private:

    // NOTE: _handle must stay declared before _graph. The default constructor
    // binds _graph to the graph stored inside _handle, and members are
    // initialized in declaration order.
    handle_t _handle;

    // The graph this cudaFlow builds into (owned by _handle or by the caller).
    cudaGraph& _graph;

    // Executable graph instantiated by offload_until; nullptr until then.
    cudaGraphExec_t _executable {nullptr};

    // Constructs a cudaFlow over a caller-supplied graph (used by Executor).
    cudaFlow(cudaGraph&);
};


// Construct a standalone cudaFlow


// Default constructor: the handle is created in External mode and _graph is
// bound to the cudaGraph stored inside that handle. A native CUDA graph is
// then created for it; failure is reported through TF_CHECK_CUDA.
inline cudaFlow::cudaFlow() :
  _handle {std::in_place_type_t<External>{}},
  _graph  {std::get<External>(_handle).graph} {

  auto& native_graph = _graph._native_handle;

  TF_CHECK_CUDA(
    cudaGraphCreate(&native_graph, 0),
    "cudaFlow failed to create a native graph (external mode)"
  );
}


// Construct the cudaFlow from executor (internal graph)

// Internal-mode constructor: the cudaFlow refers to the caller's graph `g`
// instead of storing one. The graph must not already have a native handle
// (checked with assert); a fresh native CUDA graph is created for it.
inline cudaFlow::cudaFlow(cudaGraph& g) :
  _handle {std::in_place_type_t<Internal>{}},
  _graph  {g} {

  auto& native_graph = _graph._native_handle;

  assert(native_graph == nullptr);

  TF_CHECK_CUDA(
    cudaGraphCreate(&native_graph, 0),
    "failed to create a native graph (internal mode)"
  );
}


// Destructor


// Destroys the executable graph when one was instantiated, then the native
// graph, and resets the native handle to nullptr. The CUDA return codes are
// not inspected here.
inline cudaFlow::~cudaFlow() {

  if(_executable != nullptr) {
    cudaGraphExecDestroy(_executable);
  }

  auto& native_graph = _graph._native_handle;
  cudaGraphDestroy(native_graph);
  native_graph = nullptr;
}


// Procedure: clear


// Resets the cudaFlow to an empty state: destroys the executable graph (if
// any) and the native graph, drops all task nodes, and creates a fresh native
// graph. Any CUDA failure is reported through TF_CHECK_CUDA.
inline void cudaFlow::clear() {

  if(_executable) {
    TF_CHECK_CUDA(
      cudaGraphExecDestroy(_executable), "failed to destroy executable graph"
    );
    _executable = nullptr;
  }

  TF_CHECK_CUDA(
    cudaGraphDestroy(_graph._native_handle), "failed to destroy native graph"
  );

  // The native graph is gone at this point. Forget its handle and the nodes
  // that belonged to it BEFORE re-creating, so that a failing cudaGraphCreate
  // cannot leave a stale handle behind for the destructor to destroy a second
  // time.
  _graph._native_handle = nullptr;
  _graph._nodes.clear();

  TF_CHECK_CUDA(
    cudaGraphCreate(&_graph._native_handle, 0), "failed to create native graph"
  );
}


// Function: empty


// Returns true when the graph holds no task nodes.
inline bool cudaFlow::empty() const {
  const bool has_no_nodes = _graph._nodes.empty();
  return has_no_nodes;
}


// Function: num_tasks


// Returns the number of task nodes currently stored in the graph.
inline size_t cudaFlow::num_tasks() const {
  const auto& nodes = _graph._nodes;
  return nodes.size();
}


// Procedure: dump


inline void cudaFlow::dump(std::ostream& os) const {

  _graph.dump(os, nullptr, "");

}


// Procedure: dump_native_graph


// Writes the native CUDA graph to `os` through cuda_dump_graph.
inline void cudaFlow::dump_native_graph(std::ostream& os) const {
  const auto& native_graph = _graph._native_handle;
  cuda_dump_graph(os, native_graph);
}


// ----------------------------------------------------------------------------

// Graph building methods

// ----------------------------------------------------------------------------


// Function: noop


// Creates a no-operation task: registers an Empty node in the graph, adds a
// matching empty node to the native graph (no dependencies are passed), and
// returns a handle to it.
inline cudaTask cudaFlow::noop() {

  using empty_tag = std::in_place_type_t<cudaNode::Empty>;

  auto task_node = _graph.emplace_back(_graph, empty_tag{});

  TF_CHECK_CUDA(
    cudaGraphAddEmptyNode(
      &task_node->_native_handle, _graph._native_handle, nullptr, 0
    ),
    "failed to create a no-operation (empty) node"
  );

  return cudaTask(task_node);
}


// Function: host

// Creates a host task that runs a callable on the host.
//
// The callable is forwarded into a Host node; the native host node is set up
// with cudaNode::Host::callback as its function and a pointer to that Host
// record as its user data.
template <typename C>
cudaTask cudaFlow::host(C&& c) {

  using host_tag = std::in_place_type_t<cudaNode::Host>;

  auto task_node = _graph.emplace_back(
    _graph, host_tag{}, std::forward<C>(c)
  );

  cudaHostNodeParams params;
  params.userData = &std::get<cudaNode::Host>(task_node->_handle);
  params.fn = cudaNode::Host::callback;

  TF_CHECK_CUDA(
    cudaGraphAddHostNode(
      &task_node->_native_handle, _graph._native_handle, nullptr, 0, &params
    ),
    "failed to create a host node"
  );

  return cudaTask(task_node);
}


// Function: kernel

// Creates a kernel task.
//
//   g     grid dimension
//   b     block dimension
//   s     shared memory size in bytes
//   f     kernel function
//   args  kernel arguments; their addresses are handed to the CUDA graph API
//
// Returns a handle to the new task.
template <typename F, typename... ArgsT>
cudaTask cudaFlow::kernel(
  dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
) {

  auto node = _graph.emplace_back(
    _graph, std::in_place_type_t<cudaNode::Kernel>{}, (void*)f
  );

  // A kernel may take no arguments. An array of size zero is ill-formed in
  // ISO C++ (accepted only as a compiler extension), so keep at least one
  // slot; with an empty pack it is value-initialized to nullptr.
  constexpr size_t num_args = sizeof...(ArgsT);
  void* arguments[num_args == 0 ? 1 : num_args] = { (void*)(&args)... };

  cudaKernelNodeParams p;
  p.func = (void*)f;
  p.gridDim = g;
  p.blockDim = b;
  p.sharedMemBytes = s;
  p.kernelParams = arguments;
  p.extra = nullptr;

  TF_CHECK_CUDA(
    cudaGraphAddKernelNode(
      &node->_native_handle, _graph._native_handle, nullptr, 0, &p
    ),
    "failed to create a kernel task"
  );

  return cudaTask(node);
}


// Function: zero

// Creates a memset task that sets a typed memory block to zero. The memset
// parameters come from cuda_get_zero_parms(dst, count).
template <typename T, std::enable_if_t<
  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
>
cudaTask cudaFlow::zero(T* dst, size_t count) {

  using memset_tag = std::in_place_type_t<cudaNode::Memset>;

  auto task_node = _graph.emplace_back(_graph, memset_tag{});

  auto params = cuda_get_zero_parms(dst, count);

  TF_CHECK_CUDA(
    cudaGraphAddMemsetNode(
      &task_node->_native_handle, _graph._native_handle, nullptr, 0, &params
    ),
    "failed to create a memset (zero) task"
  );

  return cudaTask(task_node);
}


// Function: fill

// Creates a memset task that fills a typed memory block with a value. The
// memset parameters come from cuda_get_fill_parms(dst, value, count).
template <typename T, std::enable_if_t<
  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
>
cudaTask cudaFlow::fill(T* dst, T value, size_t count) {

  using memset_tag = std::in_place_type_t<cudaNode::Memset>;

  auto task_node = _graph.emplace_back(_graph, memset_tag{});

  auto params = cuda_get_fill_parms(dst, value, count);

  TF_CHECK_CUDA(
    cudaGraphAddMemsetNode(
      &task_node->_native_handle, _graph._native_handle, nullptr, 0, &params
    ),
    "failed to create a memset (fill) task"
  );

  return cudaTask(task_node);
}


// Function: copy

// Creates a memcpy task that copies typed data. The memcpy parameters come
// from cuda_get_copy_parms(tgt, src, num).
template <
  typename T,
  std::enable_if_t<!std::is_same_v<T, void>, void>*
>
cudaTask cudaFlow::copy(T* tgt, const T* src, size_t num) {

  using memcpy_tag = std::in_place_type_t<cudaNode::Memcpy>;

  auto task_node = _graph.emplace_back(_graph, memcpy_tag{});

  auto params = cuda_get_copy_parms(tgt, src, num);

  TF_CHECK_CUDA(
    cudaGraphAddMemcpyNode(
      &task_node->_native_handle, _graph._native_handle, nullptr, 0, &params
    ),
    "failed to create a memcpy (copy) task"
  );

  return cudaTask(task_node);
}


// Function: memset


// Creates a memset task that fills untyped data with a byte value. The memset
// parameters come from cuda_get_memset_parms(dst, ch, count).
inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) {

  using memset_tag = std::in_place_type_t<cudaNode::Memset>;

  auto task_node = _graph.emplace_back(_graph, memset_tag{});

  auto params = cuda_get_memset_parms(dst, ch, count);

  TF_CHECK_CUDA(
    cudaGraphAddMemsetNode(
      &task_node->_native_handle, _graph._native_handle, nullptr, 0, &params
    ),
    "failed to create a memset task"
  );

  return cudaTask(task_node);
}


// Function: memcpy


// Creates a memcpy task that copies untyped data in bytes. The memcpy
// parameters come from cuda_get_memcpy_parms(tgt, src, bytes).
inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) {

  using memcpy_tag = std::in_place_type_t<cudaNode::Memcpy>;

  auto task_node = _graph.emplace_back(_graph, memcpy_tag{});

  auto params = cuda_get_memcpy_parms(tgt, src, bytes);

  TF_CHECK_CUDA(
    cudaGraphAddMemcpyNode(
      &task_node->_native_handle, _graph._native_handle, nullptr, 0, &params
    ),
    "failed to create a memcpy task"
  );

  return cudaTask(task_node);
}


// ------------------------------------------------------------------------

// update methods

// ------------------------------------------------------------------------


// Function: host

// Replaces the callable stored in an existing host task.
// Throws through TF_THROW when `task` is not a host task.
template <typename C>
void cudaFlow::host(cudaTask task, C&& c) {

  const bool is_host = (task.type() == cudaTaskType::HOST);

  if(!is_host) {
    TF_THROW(task, " is not a host task");
  }

  // Only the stored function object is swapped; no CUDA call is made here.
  auto host_record = std::get_if<cudaNode::Host>(&task._node->_handle);
  host_record->func = std::forward<C>(c);
}


// Function: update kernel parameters

// Updates the parameters of an existing kernel task in the executable graph.
//
//   task  handle of the kernel task to update
//   g     grid dimension
//   b     block dimension
//   s     shared memory size in bytes
//   f     kernel function
//   args  kernel arguments; their addresses are handed to the CUDA graph API
//
// Throws through TF_THROW when `task` is not a kernel task; a CUDA failure is
// reported through TF_CHECK_CUDA.
template <typename F, typename... ArgsT>
void cudaFlow::kernel(
  cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
) {

  if(task.type() != cudaTaskType::KERNEL) {
    TF_THROW(task, " is not a kernel task");
  }

  // A kernel may take no arguments. An array of size zero is ill-formed in
  // ISO C++ (accepted only as a compiler extension), so keep at least one
  // slot; with an empty pack it is value-initialized to nullptr.
  constexpr size_t num_args = sizeof...(ArgsT);
  void* arguments[num_args == 0 ? 1 : num_args] = { (void*)(&args)... };

  cudaKernelNodeParams p;
  p.func = (void*)f;
  p.gridDim = g;
  p.blockDim = b;
  p.sharedMemBytes = s;
  p.kernelParams = arguments;
  p.extra = nullptr;

  TF_CHECK_CUDA(
    cudaGraphExecKernelNodeSetParams(
      _executable, task._node->_native_handle, &p
    ),
    "failed to update kernel parameters on ", task
  );
}


// Function: update copy parameters

// Updates an existing memcpy task in the executable graph with typed-copy
// parameters from cuda_get_copy_parms(tgt, src, num).
// Throws through TF_THROW when `task` is not a memcpy task.
template <
  typename T,
  std::enable_if_t<!std::is_same_v<T, void>, void>*
>
void cudaFlow::copy(cudaTask task, T* tgt, const T* src, size_t num) {

  const bool is_memcpy = (task.type() == cudaTaskType::MEMCPY);

  if(!is_memcpy) {
    TF_THROW(task, " is not a memcpy task");
  }

  auto new_params = cuda_get_copy_parms(tgt, src, num);

  TF_CHECK_CUDA(
    cudaGraphExecMemcpyNodeSetParams(
      _executable, task._node->_native_handle, &new_params
    ),
    "failed to update memcpy parameters on ", task
  );
}


// Function: update memcpy parameters


// Updates an existing memcpy task in the executable graph with untyped-copy
// parameters from cuda_get_memcpy_parms(tgt, src, bytes).
// Throws through TF_THROW when `task` is not a memcpy task.
inline void cudaFlow::memcpy(
  cudaTask task, void* tgt, const void* src, size_t bytes
) {

  const bool is_memcpy = (task.type() == cudaTaskType::MEMCPY);

  if(!is_memcpy) {
    TF_THROW(task, " is not a memcpy task");
  }

  auto new_params = cuda_get_memcpy_parms(tgt, src, bytes);

  TF_CHECK_CUDA(
    cudaGraphExecMemcpyNodeSetParams(
      _executable, task._node->_native_handle, &new_params
    ),
    "failed to update memcpy parameters on ", task
  );
}


// Procedure: memset

// Updates an existing memset task in the executable graph with parameters
// from cuda_get_memset_parms(dst, ch, count).
// Throws through TF_THROW when `task` is not a memset task.
inline
void cudaFlow::memset(cudaTask task, void* dst, int ch, size_t count) {

  const bool is_memset = (task.type() == cudaTaskType::MEMSET);

  if(!is_memset) {
    TF_THROW(task, " is not a memset task");
  }

  auto new_params = cuda_get_memset_parms(dst, ch, count);

  TF_CHECK_CUDA(
    cudaGraphExecMemsetNodeSetParams(
      _executable, task._node->_native_handle, &new_params
    ),
    "failed to update memset parameters on ", task
  );
}


// Procedure: fill

// Updates an existing memset task in the executable graph with fill
// parameters from cuda_get_fill_parms(dst, value, count).
// Throws through TF_THROW when `task` is not a memset task.
template <typename T, std::enable_if_t<
  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
>
void cudaFlow::fill(cudaTask task, T* dst, T value, size_t count) {

  const bool is_memset = (task.type() == cudaTaskType::MEMSET);

  if(!is_memset) {
    TF_THROW(task, " is not a memset task");
  }

  auto new_params = cuda_get_fill_parms(dst, value, count);

  TF_CHECK_CUDA(
    cudaGraphExecMemsetNodeSetParams(
      _executable, task._node->_native_handle, &new_params
    ),
    "failed to update memset parameters on ", task
  );
}


// Procedure: zero

// Updates an existing memset task in the executable graph with zero-fill
// parameters from cuda_get_zero_parms(dst, count).
// Throws through TF_THROW when `task` is not a memset task.
template <typename T, std::enable_if_t<
  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
>
void cudaFlow::zero(cudaTask task, T* dst, size_t count) {

  const bool is_memset = (task.type() == cudaTaskType::MEMSET);

  if(!is_memset) {
    TF_THROW(task, " is not a memset task");
  }

  auto new_params = cuda_get_zero_parms(dst, count);

  TF_CHECK_CUDA(
    cudaGraphExecMemsetNodeSetParams(
      _executable, task._node->_native_handle, &new_params
    ),
    "failed to update memset parameters on ", task
  );
}


// Function: capture

// Re-captures the subflow of an existing subflow task and installs the new
// child graph into the executable graph.
//
//   task  handle of the subflow task to update
//   c     callable invoked with a cudaFlowCapturer to describe the subflow
//
// Throws through TF_THROW when `task` is not a subflow task; a CUDA failure is
// reported through TF_CHECK_CUDA.
template <typename C>
void cudaFlow::capture(cudaTask task, C c) {

  if(task.type() != cudaTaskType::SUBFLOW) {
    TF_THROW(task, " is not a subflow task");
  }

  // reuse the subflow node of the task: reset its graph and let the callable
  // describe the new subflow through a capturer
  auto node_handle = std::get_if<cudaNode::Subflow>(&task._node->_handle);
  node_handle->graph.clear();

  cudaFlowCapturer capturer(node_handle->graph);

  c(capturer);

  // obtain the optimized captured graph; this function destroys it below
  auto captured = capturer._capture();
  //cuda_dump_graph(std::cout, captured);

  // Do not leak `captured` when the update fails: release it (best effort)
  // and let the original error propagate.
  // NOTE(review): assumes TF_CHECK_CUDA reports failure by throwing; if it
  // does not, the handler below is simply never reached.
  try {
    TF_CHECK_CUDA(
      cudaGraphExecChildGraphNodeSetParams(
        _executable, task._node->_native_handle, captured
      ),
      "failed to update a captured child graph"
    );
  }
  catch(...) {
    cudaGraphDestroy(captured);
    throw;
  }

  TF_CHECK_CUDA(cudaGraphDestroy(captured), "failed to destroy captured graph");
}


// ----------------------------------------------------------------------------

// captured flow

// ----------------------------------------------------------------------------


// Function: capture

// Constructs a subflow graph through tf::cudaFlowCapturer and adds it to this
// cudaFlow as a child-graph task.
//
//   c  callable invoked with a cudaFlowCapturer to describe the subflow
//
// Returns a handle to the new subflow task. A CUDA failure is reported
// through TF_CHECK_CUDA.
template <typename C>
cudaTask cudaFlow::capture(C&& c) {

  // insert a subflow node
  auto node = _graph.emplace_back(
    _graph, std::in_place_type_t<cudaNode::Subflow>{}
  );

  // construct a captured flow from the callable
  auto node_handle = std::get_if<cudaNode::Subflow>(&node->_handle);
  node_handle->graph.clear();
  cudaFlowCapturer capturer(node_handle->graph);

  c(capturer);

  // obtain the optimized captured graph; this function destroys it below
  auto captured = capturer._capture();
  //cuda_dump_graph(std::cout, captured);

  // Do not leak `captured` when adding the child node fails: release it
  // (best effort) and let the original error propagate.
  // NOTE(review): assumes TF_CHECK_CUDA reports failure by throwing; if it
  // does not, the handler below is simply never reached.
  try {
    TF_CHECK_CUDA(
      cudaGraphAddChildGraphNode(
        &node->_native_handle, _graph._native_handle, nullptr, 0, captured
      ),
      "failed to add a cudaFlow capturer task"
    );
  }
  catch(...) {
    cudaGraphDestroy(captured);
    throw;
  }

  TF_CHECK_CUDA(cudaGraphDestroy(captured), "failed to destroy captured graph");

  return cudaTask(node);
}


// ----------------------------------------------------------------------------

// Offload methods

// ----------------------------------------------------------------------------


// Procedure: offload_until

template <typename P>


void cudaFlow::offload_until(P&& predicate) {


  // transforms cudaFlow to a native cudaGraph under the specified device

  // and launches the graph through a given or an internal device stream

  if(_executable == nullptr) {

    TF_CHECK_CUDA(

      cudaGraphInstantiate(

        &_executable, _graph._native_handle, nullptr, nullptr, 0

      ),

      "failed to create an executable graph"

    );

    //cuda_dump_graph(std::cout, cf._graph._native_handle);

  }


  //cudaScopedPerThreadStream s;

  cudaStream s;


  while(!predicate()) {

    TF_CHECK_CUDA(

      cudaGraphLaunch(_executable, s), "failed to execute cudaFlow"

    );

    s.synchronize();

    //TF_CHECK_CUDA(

    //  cudaStreamSynchronize(s), "failed to synchronize cudaFlow execution"

    //);

  }


  _graph._state = cudaGraph::OFFLOADED;

}


// Procedure: offload_n


// Offloads the cudaFlow and executes it n times by handing offload_until a
// countdown predicate that reports true once the counter has reached zero.
inline void cudaFlow::offload_n(size_t n) {

  auto countdown = [remaining = n] () mutable {
    const bool finished = (remaining == 0);
    --remaining;
    return finished;
  };

  offload_until(countdown);
}


// Procedure: offload


// Offloads the cudaFlow and executes it once, using a countdown predicate
// that starts at one.
inline void cudaFlow::offload() {

  auto countdown = [remaining = 1] () mutable {
    const bool finished = (remaining == 0);
    --remaining;
    return finished;
  };

  offload_until(countdown);
}


// ############################################################################

// Forward declaration: FlowBuilder

// ############################################################################


// FlowBuilder::emplace_on

// Creates a cudaFlow task on the given device.
//
// The callable and the device are captured into a work function. When run, it
// switches to the device with a scoped cudaScopedDevice and forwards the
// callable to Executor::_invoke_cudaflow_task_entry. The node is created with
// a fresh cudaGraph.
template <typename C, typename D,
  std::enable_if_t<is_cudaflow_task_v<C>, void>*
>
Task FlowBuilder::emplace_on(C&& c, D&& d) {

  auto work = [callable=std::forward<C>(c), device=std::forward<D>(d)]
  (Executor& executor, Node* node) mutable {
    cudaScopedDevice ctx(device);
    executor._invoke_cudaflow_task_entry(node, callable);
  };

  auto created = _graph._emplace_back(
    std::in_place_type_t<Node::cudaFlow>{},
    std::move(work),
    std::make_unique<cudaGraph>()
  );

  return Task(created);
}


// FlowBuilder::emplace

// Creates a cudaFlow task on the device currently associated with the caller
// thread, as reported by tf::cuda_get_device().
template <typename C, std::enable_if_t<is_cudaflow_task_v<C>, void>*>
Task FlowBuilder::emplace(C&& c) {
  auto current_device = tf::cuda_get_device();
  return emplace_on(std::forward<C>(c), std::move(current_device));
}


// ############################################################################

// Forward declaration: Executor

// ############################################################################


// Procedure: _invoke_cudaflow_task_entry

// Runs a cudaFlow task entry.
//
// The flow type is cudaFlow when the callable is invocable with cudaFlow&,
// and cudaFlowCapturer otherwise. The node's graph is cleared, a flow object
// is built over it, and the callable is invoked with that flow. If the graph
// state does not have the OFFLOADED bit set afterwards, the flow is offloaded
// once here.
template <typename C, std::enable_if_t<is_cudaflow_task_v<C>, void>*>
void Executor::_invoke_cudaflow_task_entry(Node* node, C&& c) {

  constexpr bool takes_cudaflow = std::is_invocable_r_v<void, C, cudaFlow&>;

  using flow_t = std::conditional_t<takes_cudaflow, cudaFlow, cudaFlowCapturer>;

  auto handle = std::get_if<Node::cudaFlow>(&node->_handle);

  cudaGraph* graph = dynamic_cast<cudaGraph*>(handle->graph.get());

  graph->clear();

  flow_t flow(*graph);

  c(flow);

  // nothing left to do when the callable already offloaded the flow
  if(graph->_state & cudaGraph::OFFLOADED) {
    return;
  }

  flow.offload();
}


/*// Procedure: _invoke_cudaflow_task_entry (cudaFlow)

template <typename C,

  std::enable_if_t<std::is_invocable_r_v<void, C, cudaFlow&>, void>*

>

void Executor::_invoke_cudaflow_task_entry(Node* node, C&& c) {


  auto h = std::get_if<Node::cudaFlow>(&node->_handle);


  cudaGraph* g = dynamic_cast<cudaGraph*>(h->graph.get());


  g->clear();


  cudaFlow cf(*g);


  c(cf);


  if(cf._executable == nullptr) {

    cf.offload();

  }

}


// Procedure: _invoke_cudaflow_task_entry (cudaFlowCapturer)

template <typename C,

  std::enable_if_t<std::is_invocable_r_v<void, C, cudaFlowCapturer&>, void>*

>

void Executor::_invoke_cudaflow_task_entry(Node* node, C&& c) {


  auto h = std::get_if<Node::cudaFlow>(&node->_handle);


  cudaGraph* g = dynamic_cast<cudaGraph*>(h->graph.get());


  g->clear();


  cudaFlowCapturer fc(*g);


  c(fc);


  if(fc._executable == nullptr) {

    fc.offload();

  }

}*/


}  // end of namespace tf -----------------------------------------------------


std::ostream

tf::Executor
class to create an executor for running a taskflow graph
Definition executor.hpp:50

tf::FlowBuilder::emplace
Task emplace(C &&callable)
creates a static task
Definition flow_builder.hpp:742

tf::FlowBuilder::_graph
Graph & _graph
associated graph object
Definition flow_builder.hpp:727

tf::FlowBuilder::emplace_on
Task emplace_on(C &&callable, D &&device)
creates a cudaFlow task on the given device
Definition cudaflow.hpp:1666

tf::Task
class to create a task handle over a node in a taskflow graph
Definition task.hpp:187

tf::cudaFlowCapturer
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57

tf::cudaFlow
class to create a cudaFlow task dependency graph
Definition cudaflow.hpp:56

tf::cudaFlow::host
cudaTask host(C &&callable)
creates a host task that runs a callable on the host
Definition cudaflow.hpp:1248

tf::cudaFlow::inclusive_scan
cudaTask inclusive_scan(I first, I last, O output, C op)
creates a task to perform parallel inclusive scan over a range of items
Definition scan.hpp:619

tf::cudaFlow::memset
cudaTask memset(void *dst, int v, size_t count)
creates a memset task that fills untyped data with a byte value
Definition cudaflow.hpp:1367

tf::cudaFlow::empty
bool empty() const
queries the emptiness of the graph
Definition cudaflow.hpp:1206

tf::cudaFlow::~cudaFlow
~cudaFlow()
destroys the cudaFlow and its associated native CUDA graph and executable graph
Definition cudaflow.hpp:1176

tf::cudaFlow::for_each
cudaTask for_each(I first, I last, C callable)
applies a callable to each dereferenced element of the data array
Definition for_each.hpp:181

tf::cudaFlow::transform_reduce
cudaTask transform_reduce(I first, I last, T *result, B bop, U uop)
performs parallel reduction over a range of transformed items
Definition reduce.hpp:596

tf::cudaFlow::fill
cudaTask fill(T *dst, T value, size_t count)
creates a memset task that fills a typed memory block with a value
Definition cudaflow.hpp:1325

tf::cudaFlow::noop
cudaTask noop()
creates a no-operation task
Definition cudaflow.hpp:1230

tf::cudaFlow::for_each_index
cudaTask for_each_index(I first, I last, I step, C callable)
applies a callable to each index in the range with the step size
Definition for_each.hpp:190

tf::cudaFlow::uninitialized_reduce
cudaTask uninitialized_reduce(I first, I last, T *result, B bop)
similar to tf::cudaFlow::reduce but does not assume any initial value to reduce
Definition reduce.hpp:587

tf::cudaFlow::zero
cudaTask zero(T *dst, size_t count)
creates a memset task that sets a typed memory block to zero
Definition cudaflow.hpp:1303

tf::cudaFlow::dump_native_graph
void dump_native_graph(std::ostream &os) const
dumps the native CUDA graph into a DOT format through an output stream
Definition cudaflow.hpp:1221

tf::cudaFlow::find_if
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range
Definition find.hpp:193

tf::cudaFlow::transform_inclusive_scan
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
creates a task to perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:655

tf::cudaFlow::min_element
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:340

tf::cudaFlow::max_element
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:465

tf::cudaFlow::dump
void dump(std::ostream &os) const
dumps the cudaFlow graph into a DOT format through an output stream
Definition cudaflow.hpp:1216

tf::cudaFlow::offload
void offload()
offloads the cudaFlow and executes it once
Definition cudaflow.hpp:1654

tf::cudaFlow::capture
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582

tf::cudaFlow::exclusive_scan
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlow::inclusive_scan but excludes the first value
Definition scan.hpp:637

tf::cudaFlow::reduce
cudaTask reduce(I first, I last, T *result, B bop)
performs parallel reduction over a range of items
Definition reduce.hpp:578

tf::cudaFlow::transform_uninitialized_reduce
cudaTask transform_uninitialized_reduce(I first, I last, T *result, B bop, U uop)
similar to tf::cudaFlow::transform_reduce but does not assume any initial value to reduce
Definition reduce.hpp:605

tf::cudaFlow::sort_by_key
cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp)
creates kernels that sort the given array
Definition sort.hpp:533

tf::cudaFlow::offload_until
void offload_until(P &&predicate)
offloads the cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
Definition cudaflow.hpp:1618

tf::cudaFlow::merge_by_key
cudaTask merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
creates a task to perform parallel key-value merge
Definition merge.hpp:679

tf::cudaFlow::kernel
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
creates a kernel task
Definition cudaflow.hpp:1272

tf::cudaFlow::clear
void clear()
clears the cudaFlow object
Definition cudaflow.hpp:1185

tf::cudaFlow::offload_n
void offload_n(size_t N)
offloads the cudaFlow and executes it by the given times
Definition cudaflow.hpp:1649

tf::cudaFlow::single_task
cudaTask single_task(C c)
runs a callable with only a single kernel thread
Definition for_each.hpp:169

tf::cudaFlow::memcpy
cudaTask memcpy(void *tgt, const void *src, size_t bytes)
creates a memcpy task that copies untyped data in bytes
Definition cudaflow.hpp:1386

tf::cudaFlow::cudaFlow
cudaFlow()
constructs a standalone cudaFlow
Definition cudaflow.hpp:1152

tf::cudaFlow::sort
cudaTask sort(I first, I last, C comp)
creates a task to perform parallel sort on an array
Definition sort.hpp:515

tf::cudaFlow::num_tasks
size_t num_tasks() const
queries the number of tasks
Definition cudaflow.hpp:1211

tf::cudaFlow::transform_exclusive_scan
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlow::transform_inclusive_scan but excludes the first value
Definition scan.hpp:677

tf::cudaFlow::copy
cudaTask copy(T *tgt, const T *src, size_t num)
creates a memcpy task that copies typed data
Definition cudaflow.hpp:1348

tf::cudaFlow::transform
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:139

tf::cudaFlow::merge
cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
creates a task to perform parallel merge on two sorted arrays
Definition merge.hpp:652

tf::cudaScopedDevice
class to create an RAII-styled context switch
Definition cuda_device.hpp:293

tf::cudaStream
class that wraps a CUDA stream
Definition cuda_stream.hpp:174

tf::cudaStream::synchronize
void synchronize() const
synchronizes the associated stream
Definition cuda_stream.hpp:253

tf::cudaTask
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65

tf::cudaTask::type
cudaTaskType type() const
queries the task type
Definition cuda_task.hpp:221

cuda_capturer.hpp
cudaFlow capturer include file

cuda_task.hpp
cudaTask include file

std::forward
T forward(T... args)

std

tf
taskflow namespace
Definition small_vector.hpp:27

tf::cuda_get_device
int cuda_get_device()
gets the current device associated with the caller thread
Definition cuda_device.hpp:24

tf::cudaTaskType::KERNEL
@ KERNEL
kernel task type

tf::cudaTaskType::MEMSET
@ MEMSET
memory set task type

tf::cudaTaskType::SUBFLOW
@ SUBFLOW
subflow (child graph) task type

tf::cudaTaskType::HOST
@ HOST
host task type

tf::cudaTaskType::MEMCPY
@ MEMCPY
memory copy task type