hello-world/docs/cuda__capturer_8hpp_source.html

#pragma once


#include "cuda_task.hpp"

#include "cuda_optimizer.hpp"


namespace tf {


// ----------------------------------------------------------------------------

// class definition: cudaFlowCapturer

// ----------------------------------------------------------------------------


// Class to create a cudaFlow graph using stream capture.
//
// Most operations come in two forms: one that creates a new task and returns
// its cudaTask handle, and one that takes an existing cudaTask as the first
// argument. For the basic methods defined in this file, the second form
// replaces the work of that task (see on(cudaTask, C&&)); the generic
// algorithms are only declared here and are defined in other headers.
//
// NOTE(review): the class owns a cudaGraphExec_t (_executable) and holds a
// reference (_graph) that, for a standalone capturer, points into its own
// _handle. The implicitly generated copy constructor would copy both as-is -
// confirm that instances are never copied.
class cudaFlowCapturer {

  friend class Executor;
  friend class cudaFlow;

  // Handle alternative used by the default constructor: the capturer stores
  // the graph itself.
  struct External {
    cudaGraph graph;
  };

  // Handle alternative used by the cudaGraph& constructor: the graph lives
  // outside the capturer.
  struct Internal {
  };

  using handle_t = std::variant<External, Internal>;

  // The capturing algorithms that can be selected through make_optimizer;
  // the first alternative is the one a default-constructed variant holds.
  using Optimizer = std::variant<
    cudaRoundRobinCapturing,
    cudaSequentialCapturing,
    cudaLinearCapturing
  >;

  public:

    // constructs a standalone cudaFlowCapturer
    cudaFlowCapturer();

    // destructs the cudaFlowCapturer
    virtual ~cudaFlowCapturer();

    // queries the emptiness of the graph
    bool empty() const;

    // queries the number of tasks
    size_t num_tasks() const;

    // clears this cudaFlow capturer
    void clear();

    // dumps the capture graph into a DOT format through an output stream
    void dump(std::ostream& os) const;

    // selects a different optimization algorithm; args are forwarded to the
    // constructor of OPT and a reference to the new optimizer is returned
    template <typename OPT, typename... ArgsT>
    OPT& make_optimizer(ArgsT&&... args);

    // ------------------------------------------------------------------------
    // basic methods
    // ------------------------------------------------------------------------

    // captures a sequence of CUDA operations from the given callable, which
    // must be invocable with a cudaStream_t
    template <typename C, std::enable_if_t<
      std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr
    >
    cudaTask on(C&& callable);

    // replaces the callable of an existing capture task
    template <typename C, std::enable_if_t<
      std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr
    >
    void on(cudaTask task, C&& callable);

    // captures a no-operation task
    cudaTask noop();

    // turns an existing capture task into a no-operation task
    void noop(cudaTask task);

    // copies count bytes from src to dst asynchronously through a stream
    cudaTask memcpy(void* dst, const void* src, size_t count);

    // rebinds an existing capture task to a memcpy operation
    void memcpy(cudaTask task, void* dst, const void* src, size_t count);

    // captures a copy task of typed data (num items of type T, T != void)
    template <typename T,
      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
    >
    cudaTask copy(T* tgt, const T* src, size_t num);

    // rebinds an existing capture task to a typed copy operation
    template <typename T,
      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
    >
    void copy(cudaTask task, T* tgt, const T* src, size_t num);

    // initializes or sets GPU memory to the given value byte by byte
    cudaTask memset(void* ptr, int v, size_t n);

    // rebinds an existing capture task to a memset operation
    void memset(cudaTask task, void* ptr, int value, size_t n);

    // captures a kernel launched with grid g, block b and s bytes of
    // dynamic shared memory
    template <typename F, typename... ArgsT>
    cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT&&... args);

    // rebinds an existing capture task to a kernel launch
    template <typename F, typename... ArgsT>
    void kernel(
      cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
    );

    // ------------------------------------------------------------------------
    // generic algorithms (declared here, defined in other headers)
    // ------------------------------------------------------------------------

    // captures a kernel that runs the given callable with only one thread
    template <typename C>
    cudaTask single_task(C c);

    template <typename C>
    void single_task(cudaTask task, C c);

    // captures a kernel that applies a callable to each dereferenced element
    // of the data array
    template <typename I, typename C>
    cudaTask for_each(I first, I last, C callable);

    template <typename I, typename C>
    void for_each(cudaTask task, I first, I last, C callable);

    // captures a kernel that applies a callable to each index in the range
    // with the step size
    template <typename I, typename C>
    cudaTask for_each_index(I first, I last, I step, C callable);

    template <typename I, typename C>
    void for_each_index(
      cudaTask task, I first, I last, I step, C callable
    );

    // captures a kernel that transforms an input range to an output range
    template <typename I, typename O, typename C>
    cudaTask transform(I first, I last, O output, C op);

    template <typename I, typename O, typename C>
    void transform(cudaTask task, I first, I last, O output, C op);

    // transform overload taking a second input range that starts at first2
    template <typename I1, typename I2, typename O, typename C>
    cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);

    template <typename I1, typename I2, typename O, typename C>
    void transform(
      cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op
    );

    // captures kernels that perform parallel reduction over a range of items
    template <typename I, typename T, typename C>
    cudaTask reduce(I first, I last, T* result, C op);

    template <typename I, typename T, typename C>
    void reduce(cudaTask task, I first, I last, T* result, C op);

    // similar to reduce but does not assume any initial value to reduce
    template <typename I, typename T, typename C>
    cudaTask uninitialized_reduce(I first, I last, T* result, C op);

    template <typename I, typename T, typename C>
    void uninitialized_reduce(
      cudaTask task, I first, I last, T* result, C op
    );

    // captures kernels that perform parallel reduction over a range of
    // transformed items
    template <typename I, typename T, typename C, typename U>
    cudaTask transform_reduce(I first, I last, T* result, C bop, U uop);

    template <typename I, typename T, typename C, typename U>
    void transform_reduce(
      cudaTask task, I first, I last, T* result, C bop, U uop
    );

    // similar to transform_reduce but does not assume any initial value
    // to reduce
    template <typename I, typename T, typename C, typename U>
    cudaTask transform_uninitialized_reduce(I first, I last, T* result, C bop, U uop);

    template <typename I, typename T, typename C, typename U>
    void transform_uninitialized_reduce(
      cudaTask task, I first, I last, T* result, C bop, U uop
    );

    // captures kernels that perform parallel inclusive scan over a range
    // of items
    template <typename I, typename O, typename C>
    cudaTask inclusive_scan(I first, I last, O output, C op);

    template <typename I, typename O, typename C>
    void inclusive_scan(cudaTask task, I first, I last, O output, C op);

    // similar to inclusive_scan but excludes the first value
    template <typename I, typename O, typename C>
    cudaTask exclusive_scan(I first, I last, O output, C op);

    template <typename I, typename O, typename C>
    void exclusive_scan(cudaTask task, I first, I last, O output, C op);

    // captures kernels that perform parallel inclusive scan over a range of
    // transformed items
    template <typename I, typename O, typename B, typename U>
    cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop);

    template <typename I, typename O, typename B, typename U>
    void transform_inclusive_scan(
      cudaTask task, I first, I last, O output, B bop, U uop
    );

    // similar to transform_inclusive_scan but excludes the first value
    template <typename I, typename O, typename B, typename U>
    cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop);

    template <typename I, typename O, typename B, typename U>
    void transform_exclusive_scan(
      cudaTask task, I first, I last, O output, B bop, U uop
    );

    // captures kernels that perform parallel merge on two sorted arrays
    template <typename A, typename B, typename C, typename Comp>
    cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp);

    template <typename A, typename B, typename C, typename Comp>
    void merge(
      cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
    );

    // captures kernels that perform parallel key-value merge
    template<
      typename a_keys_it, typename a_vals_it,
      typename b_keys_it, typename b_vals_it,
      typename c_keys_it, typename c_vals_it,
      typename C
    >
    cudaTask merge_by_key(
      a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
      b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
      c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
    );

    template<
      typename a_keys_it, typename a_vals_it,
      typename b_keys_it, typename b_vals_it,
      typename c_keys_it, typename c_vals_it,
      typename C
    >
    void merge_by_key(
      cudaTask task,
      a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
      b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
      c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
    );

    // captures kernels that sort the given array
    template <typename I, typename C>
    cudaTask sort(I first, I last, C comp);

    template <typename I, typename C>
    void sort(cudaTask task, I first, I last, C comp);

    // captures kernels that sort the given array; takes a key range
    // [k_first, k_last) and a value iterator v_first
    template <typename K_it, typename V_it, typename C>
    cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp);

    template <typename K_it, typename V_it, typename C>
    void sort_by_key(
      cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
    );

    // creates a task to find the index of the first element in a range
    template <typename I, typename U>
    cudaTask find_if(I first, I last, unsigned* idx, U op);

    template <typename I, typename U>
    void find_if(cudaTask task, I first, I last, unsigned* idx, U op);

    // finds the index of the minimum element in a range
    template <typename I, typename O>
    cudaTask min_element(I first, I last, unsigned* idx, O op);

    template <typename I, typename O>
    void min_element(cudaTask task, I first, I last, unsigned* idx, O op);

    // finds the index of the maximum element in a range
    template <typename I, typename O>
    cudaTask max_element(I first, I last, unsigned* idx, O op);

    template <typename I, typename O>
    void max_element(cudaTask task, I first, I last, unsigned* idx, O op);

    // ------------------------------------------------------------------------
    // offload methods
    // ------------------------------------------------------------------------

    // offloads the captured cudaFlow onto a GPU and repeatedly runs it until
    // the predicate becomes true
    template <typename P>
    void offload_until(P&& predicate);

    // offloads the captured cudaFlow and executes it the given number of times
    void offload_n(size_t n);

    // offloads the captured cudaFlow and executes it once
    void offload();

  private:

    // NOTE: _handle must stay declared before _graph; the default constructor
    // binds _graph to the graph stored inside _handle.
    handle_t _handle;

    // the graph this capturer records into
    cudaGraph& _graph;

    // currently selected capturing algorithm
    Optimizer _optimizer;

    // instantiated executable graph; nullptr until the first offload
    cudaGraphExec_t _executable {nullptr};

    // constructs a capturer that records into an externally stored graph
    cudaFlowCapturer(cudaGraph&);

    // runs the selected optimizer over _graph and returns its native graph
    cudaGraph_t _capture();

    // destroys _executable (if any) and resets it to nullptr
    void _destroy_executable();

};


// Constructor: binds the capturer to a graph that is stored elsewhere.
// The handle is set to the Internal alternative and _graph refers to g.
inline cudaFlowCapturer::cudaFlowCapturer(cudaGraph& g) :
  _handle(std::in_place_type<Internal>),
  _graph(g) {
}


// Constructor: creates a standalone capturer.
// The handle holds the External alternative, whose graph member becomes the
// target of _graph. This relies on _handle being declared (and therefore
// initialized) before _graph.
inline cudaFlowCapturer::cudaFlowCapturer() :
  _handle(std::in_place_type<External>),
  _graph(std::get<External>(_handle).graph) {
}


// Destructor: releases the executable graph if one was instantiated.
inline cudaFlowCapturer::~cudaFlowCapturer() {

  // nothing was ever instantiated
  if(_executable == nullptr) {
    return;
  }

  // Unlike _destroy_executable, the returned status is not checked here
  // (presumably so the destructor never throws).
  cudaGraphExecDestroy(_executable);
}


// Function: empty
// Returns whatever the underlying graph reports for empty().
inline bool cudaFlowCapturer::empty() const {
  const bool no_tasks = _graph.empty();
  return no_tasks;
}


// Function: num_tasks
// Returns the number of nodes currently stored in the underlying graph.
inline size_t cudaFlowCapturer::num_tasks() const {
  const size_t count = _graph._nodes.size();
  return count;
}


// Procedure: clear


inline void cudaFlowCapturer::clear() {

  _destroy_executable();

  _graph._nodes.clear();

}


// Procedure: dump
// Writes the capture graph to the given output stream by delegating to
// cudaGraph::dump with a null second argument and an empty string.
inline void cudaFlowCapturer::dump(std::ostream& os) const {
  _graph.dump(
    os, nullptr, ""
  );
}


// Procedure: _destroy_executable
// Destroys the instantiated executable graph, if there is one, and resets
// the handle to nullptr. A failing destroy is reported through TF_CHECK_CUDA.
inline void cudaFlowCapturer::_destroy_executable() {

  // no executable to release
  if(_executable == nullptr) {
    return;
  }

  TF_CHECK_CUDA(
    cudaGraphExecDestroy(_executable), "failed to destroy executable graph"
  );

  _executable = nullptr;
}


// Function: on
// Appends a new Capture node that stores the given callable to the graph
// and returns a task handle to that node. The callable must be invocable
// with a cudaStream_t.
template <typename C, std::enable_if_t<
  std::is_invocable_r_v<void, C, cudaStream_t>, void>*
>
cudaTask cudaFlowCapturer::on(C&& callable) {
  return cudaTask(
    _graph.emplace_back(
      _graph,
      std::in_place_type_t<cudaNode::Capture>{},
      std::forward<C>(callable)
    )
  );
}


// Function: noop
// Captures a task whose callable does nothing with the stream.
inline cudaTask cudaFlowCapturer::noop() {
  auto do_nothing = [](cudaStream_t){};
  return on(std::move(do_nothing));
}


// Function: noop
// Replaces the work of an existing capture task with an empty callable.
inline void cudaFlowCapturer::noop(cudaTask task) {
  auto do_nothing = [](cudaStream_t){};
  on(task, std::move(do_nothing));
}


// Function: memcpy
// Captures a task that issues cudaMemcpyAsync for count bytes from src to
// dst on the capturing stream. The copy kind is cudaMemcpyDefault.
inline cudaTask cudaFlowCapturer::memcpy(
  void* dst, const void* src, size_t count
) {

  auto copy_bytes = [dst, src, count] (cudaStream_t stream) mutable {
    TF_CHECK_CUDA(
      cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
      "failed to capture memcpy"
    );
  };

  return on(std::move(copy_bytes));
}


// Function: copy
// Captures a task that issues cudaMemcpyAsync for num items of type T
// (sizeof(T)*num bytes) from src to tgt on the capturing stream.
template <typename T, std::enable_if_t<!std::is_same_v<T, void>, void>*>
cudaTask cudaFlowCapturer::copy(T* tgt, const T* src, size_t num) {

  auto copy_items = [tgt, src, num] (cudaStream_t stream) mutable {
    TF_CHECK_CUDA(
      cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream),
      "failed to capture copy"
    );
  };

  return on(std::move(copy_items));
}


// Function: memset
// Captures a task that issues cudaMemsetAsync(ptr, v, n) on the capturing
// stream.
inline cudaTask cudaFlowCapturer::memset(void* ptr, int v, size_t n) {

  auto fill_bytes = [ptr, v, n] (cudaStream_t stream) mutable {
    TF_CHECK_CUDA(
      cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset"
    );
  };

  return on(std::move(fill_bytes));
}


// Function: kernel
// Captures a task that launches f with grid g, block b and s bytes of
// dynamic shared memory on the capturing stream. The kernel arguments are
// copied into the stored callable.
template <typename F, typename... ArgsT>
cudaTask cudaFlowCapturer::kernel(
  dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
) {

  auto launch = [g, b, s, f, args...] (cudaStream_t stream) mutable {
    f<<<g, b, s, stream>>>(args...);
  };

  return on(std::move(launch));
}


// Function: _capture
// Runs whichever capturing algorithm is currently held by _optimizer over
// the graph and returns the native graph it produces.
inline cudaGraph_t cudaFlowCapturer::_capture() {
  auto run_optimizer = [this](auto&& opt){ return opt._optimize(_graph); };
  return std::visit(run_optimizer, _optimizer);
}


// Procedure: offload_until

template <typename P>


void cudaFlowCapturer::offload_until(P&& predicate) {


  // If the topology got changed, we need to destroy the executable

  // and create a new one

  if(_graph._state & cudaGraph::CHANGED) {


    _destroy_executable();


    auto g = _capture();

    TF_CHECK_CUDA(

      cudaGraphInstantiate(&_executable, g, nullptr, nullptr, 0),

      "failed to create an executable graph"

    );


    //cuda_dump_graph(std::cout, g);


    // TODO: store the native graph?

    TF_CHECK_CUDA(cudaGraphDestroy(g), "failed to destroy captured graph");

  }

  // if the graph is just updated (i.e., topology does not change),

  // we can skip part of the optimization and just update the executable

  // with the new captured graph

  else if(_graph._state & cudaGraph::UPDATED) {


    // TODO: skip part of the optimization (e.g., levelization)

    auto g = _capture();


    assert(_executable != nullptr);


    cudaGraphNode_t error_node;

    cudaGraphExecUpdateResult error_result;

    cudaGraphExecUpdate(_executable, g, &error_node, &error_result);


    if(error_result != cudaGraphExecUpdateSuccess) {

      _destroy_executable();

      TF_CHECK_CUDA(

        cudaGraphInstantiate(&_executable, g, nullptr, nullptr, 0),

        "failed to re-create an executable graph after updates fail"

      );

    }

    // TODO: store the native graph?

    TF_CHECK_CUDA(cudaGraphDestroy(g), "failed to destroy captured graph");

  }


  // offload the executable

  if(_executable) {

    //cudaScopedPerThreadStream s;

    cudaStream s;


    while(!predicate()) {

      TF_CHECK_CUDA(

        cudaGraphLaunch(_executable, s), "failed to launch the exec graph"

      );


      s.synchronize();


      //TF_CHECK_CUDA(cudaStreamSynchronize(s), "failed to synchronize stream");

    }

  }


  _graph._state = cudaGraph::OFFLOADED;

}


// Procedure: offload_n
// Offloads the captured graph and runs it n times by handing offload_until
// a countdown predicate that becomes true once n launches were requested.
inline void cudaFlowCapturer::offload_n(size_t n) {
  offload_until([remaining = n] () mutable {
    if(remaining == 0) {
      return true;
    }
    --remaining;
    return false;
  });
}


// Procedure: offload
// Offloads the captured graph and runs it exactly once; the predicate
// passed to offload_until is false for the first query and true afterwards.
inline void cudaFlowCapturer::offload() {
  offload_until([remaining = 1] () mutable {
    if(remaining == 0) {
      return true;
    }
    --remaining;
    return false;
  });
}


// Function: on
// Replaces the callable stored in an existing capture task and marks the
// graph as UPDATED. Raises an error through TF_THROW when the task is not
// of type CAPTURE.
template <typename C, std::enable_if_t<
  std::is_invocable_r_v<void, C, cudaStream_t>, void>*
>
void cudaFlowCapturer::on(cudaTask task, C&& callable) {

  const bool is_capture_task = (task.type() == cudaTaskType::CAPTURE);

  if(!is_capture_task) {
    TF_THROW("invalid cudaTask type (must be CAPTURE)");
  }

  _graph._state |= cudaGraph::UPDATED;

  // the type check above established that the node holds a Capture handle
  auto* capture = std::get_if<cudaNode::Capture>(&task._node->_handle);
  capture->work = std::forward<C>(callable);
}


// Function: memcpy
// Rebinds an existing capture task to a cudaMemcpyAsync of count bytes
// from src to dst (copy kind cudaMemcpyDefault).
inline void cudaFlowCapturer::memcpy(
  cudaTask task, void* dst, const void* src, size_t count
) {

  auto copy_bytes = [dst, src, count](cudaStream_t stream) mutable {
    TF_CHECK_CUDA(
      cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
      "failed to capture memcpy"
    );
  };

  on(task, std::move(copy_bytes));
}


// Function: copy
// Rebinds an existing capture task to a cudaMemcpyAsync of num items of
// type T (sizeof(T)*num bytes) from src to tgt.
template <typename T,
  std::enable_if_t<!std::is_same_v<T, void>, void>*
>
void cudaFlowCapturer::copy(
  cudaTask task, T* tgt, const T* src, size_t num
) {

  auto copy_items = [tgt, src, num] (cudaStream_t stream) mutable {
    TF_CHECK_CUDA(
      cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream),
      "failed to capture copy"
    );
  };

  on(task, std::move(copy_items));
}


// Function: memset
// Rebinds an existing capture task to cudaMemsetAsync(ptr, v, n).
inline void cudaFlowCapturer::memset(
  cudaTask task, void* ptr, int v, size_t n
) {

  auto fill_bytes = [ptr, v, n] (cudaStream_t stream) mutable {
    TF_CHECK_CUDA(
      cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset"
    );
  };

  on(task, std::move(fill_bytes));
}


// Function: kernel
// Rebinds an existing capture task to a launch of f with grid g, block b
// and s bytes of dynamic shared memory. The kernel arguments are copied
// into the stored callable.
template <typename F, typename... ArgsT>
void cudaFlowCapturer::kernel(
  cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
) {

  auto launch = [g, b, s, f, args...] (cudaStream_t stream) mutable {
    f<<<g, b, s, stream>>>(args...);
  };

  on(task, std::move(launch));
}


// Function: make_optimizer
// Replaces the current capturing algorithm with a newly constructed OPT
// (built from the forwarded arguments) and returns a reference to it.
template <typename OPT, typename ...ArgsT>
OPT& cudaFlowCapturer::make_optimizer(ArgsT&&... args) {
  OPT& selected = _optimizer.emplace<OPT>(std::forward<ArgsT>(args)...);
  return selected;
}


}  // end of namespace tf -----------------------------------------------------


std::ostream

tf::Executor
class to create an executor for running a taskflow graph
Definition executor.hpp:50

tf::cudaFlowCapturer
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57

tf::cudaFlowCapturer::transform_reduce
cudaTask transform_reduce(I first, I last, T *result, C bop, U uop)
captures kernels that perform parallel reduction over a range of transformed items
Definition reduce.hpp:459

tf::cudaFlowCapturer::clear
void clear()
clear this cudaFlow capturer
Definition cuda_capturer.hpp:1081

tf::cudaFlowCapturer::for_each
cudaTask for_each(I first, I last, C callable)
captures a kernel that applies a callable to each dereferenced element of the data array
Definition for_each.hpp:221

tf::cudaFlowCapturer::offload_n
void offload_n(size_t n)
offloads the captured cudaFlow and executes it the given number of times
Definition cuda_capturer.hpp:1237

tf::cudaFlowCapturer::memset
cudaTask memset(void *ptr, int v, size_t n)
initializes or sets GPU memory to the given value byte by byte
Definition cuda_capturer.hpp:1146

tf::cudaFlowCapturer::merge_by_key
cudaTask merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
captures kernels that perform parallel key-value merge
Definition merge.hpp:766

tf::cudaFlowCapturer::exclusive_scan
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlowCapturer::inclusive_scan but excludes the first value
Definition scan.hpp:739

tf::cudaFlowCapturer::reduce
cudaTask reduce(I first, I last, T *result, C op)
captures kernels that perform parallel reduction over a range of items
Definition reduce.hpp:427

tf::cudaFlowCapturer::empty
bool empty() const
queries the emptiness of the graph
Definition cuda_capturer.hpp:1071

tf::cudaFlowCapturer::sort
cudaTask sort(I first, I last, C comp)
captures kernels that sort the given array
Definition sort.hpp:557

tf::cudaFlowCapturer::transform_uninitialized_reduce
cudaTask transform_uninitialized_reduce(I first, I last, T *result, C bop, U uop)
similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce
Definition reduce.hpp:479

tf::cudaFlowCapturer::noop
cudaTask noop()
captures a no-operation task
Definition cuda_capturer.hpp:1113

tf::cudaFlowCapturer::offload
void offload()
offloads the captured cudaFlow and executes it once
Definition cuda_capturer.hpp:1242

tf::cudaFlowCapturer::transform_inclusive_scan
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
captures kernels that perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:775

tf::cudaFlowCapturer::inclusive_scan
cudaTask inclusive_scan(I first, I last, O output, C op)
captures kernels that perform parallel inclusive scan over a range of items
Definition scan.hpp:703

tf::cudaFlowCapturer::kernel
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
captures a kernel
Definition cuda_capturer.hpp:1156

tf::cudaFlowCapturer::find_if
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range
Definition find.hpp:215

tf::cudaFlowCapturer::offload_until
void offload_until(P &&predicate)
offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
Definition cuda_capturer.hpp:1173

tf::cudaFlowCapturer::min_element
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:300

tf::cudaFlowCapturer::dump
void dump(std::ostream &os) const
dumps the capture graph into a DOT format through an output stream
Definition cuda_capturer.hpp:1087

tf::cudaFlowCapturer::transform_exclusive_scan
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value
Definition scan.hpp:817

tf::cudaFlowCapturer::uninitialized_reduce
cudaTask uninitialized_reduce(I first, I last, T *result, C op)
similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce
Definition reduce.hpp:443

tf::cudaFlowCapturer::transform
cudaTask transform(I first, I last, O output, C op)
captures a kernel that transforms an input range to an output range
Definition transform.hpp:181

tf::cudaFlowCapturer::make_optimizer
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312

tf::cudaFlowCapturer::cudaFlowCapturer
cudaFlowCapturer()
constructs a standalone cudaFlowCapturer
Definition cuda_capturer.hpp:1058

tf::cudaFlowCapturer::~cudaFlowCapturer
virtual ~cudaFlowCapturer()
destructs the cudaFlowCapturer
Definition cuda_capturer.hpp:1063

tf::cudaFlowCapturer::copy
cudaTask copy(T *tgt, const T *src, size_t num)
captures a copy task of typed data
Definition cuda_capturer.hpp:1136

tf::cudaFlowCapturer::sort_by_key
cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp)
captures kernels that sort the given array
Definition sort.hpp:593

tf::cudaFlowCapturer::merge
cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
captures kernels that perform parallel merge on two sorted arrays
Definition merge.hpp:725

tf::cudaFlowCapturer::single_task
cudaTask single_task(C c)
captures a kernel that runs the given callable with only one thread
Definition for_each.hpp:259

tf::cudaFlowCapturer::max_element
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:425

tf::cudaFlowCapturer::on
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105

tf::cudaFlowCapturer::memcpy
cudaTask memcpy(void *dst, const void *src, size_t count)
copies data between host and device asynchronously through a stream
Definition cuda_capturer.hpp:1123

tf::cudaFlowCapturer::num_tasks
size_t num_tasks() const
queries the number of tasks
Definition cuda_capturer.hpp:1076

tf::cudaFlowCapturer::for_each_index
cudaTask for_each_index(I first, I last, I step, C callable)
captures a kernel that applies a callable to each index in the range with the step size
Definition for_each.hpp:230

tf::cudaFlow
class to create a cudaFlow task dependency graph
Definition cudaflow.hpp:56

tf::cudaLinearCapturing
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182

tf::cudaRoundRobinCapturing
class to capture a CUDA graph using a round-robin algorithm
Definition cuda_optimizer.hpp:243

tf::cudaSequentialCapturing
class to capture a CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:134

tf::cudaStream
class that wraps a native CUDA stream
Definition cuda_stream.hpp:174

tf::cudaStream::synchronize
void synchronize() const
synchronizes the associated stream
Definition cuda_stream.hpp:253

tf::cudaTask
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65

tf::cudaTask::type
cudaTaskType type() const
queries the task type
Definition cuda_task.hpp:221

cuda_optimizer.hpp
cudaFlow capturing algorithms include file

cuda_task.hpp
cudaTask include file

std::forward
T forward(T... args)

std

tf
taskflow namespace
Definition small_vector.hpp:27

tf::cudaTaskType::CAPTURE
@ CAPTURE
capture task type