hello-world/docs/for__each_8hpp_source.html

#pragma once


#include "../cudaflow.hpp"


namespace tf {


namespace detail {


// Launches `cuda_kernel` on the policy's stream and invokes `c` on the
// dereferenced item at every offset produced by the tile/iterate helpers.
//
//   p     - execution policy; its decayed type provides nt, vt and nv, and
//           p.stream() is the stream the kernel is launched on
//   first - iterator to the first item
//   count - number of items handed to cuda_get_tile
//   c     - callable invoked as c(*(first + offset))
template <typename P, typename I, typename C>
void cuda_for_each_loop(P&& p, I first, unsigned count, C c) {

  using policy_t = std::decay_t<P>;

  // count / nv, rounded up
  const unsigned num_blocks = (count + policy_t::nv - 1) / policy_t::nv;

  cuda_kernel<<<num_blocks, policy_t::nt, 0, p.stream()>>>(
    [=] __device__ (auto tid, auto bid) {
      // the tile selected for this block id
      auto tile = cuda_get_tile(bid, policy_t::nv, count);
      cuda_strided_iterate<policy_t::nt, policy_t::vt>(
        [=](auto, auto offset) {
          c(*(first + tile.begin + offset));
        },
        tid, tile.count()
      );
    }
  );
}


// Launches `cuda_kernel` on the policy's stream and invokes `c` on the index
// `first + inc * k` for every offset k produced by the tile/iterate helpers.
//
//   p     - execution policy; its decayed type provides nt, vt and nv, and
//           p.stream() is the stream the kernel is launched on
//   first - first index of the range
//   inc   - step between consecutive indices
//   count - number of indices handed to cuda_get_tile
//   c     - callable invoked with each computed index
template <typename P, typename I, typename C>
void cuda_for_each_index_loop(
  P&& p, I first, I inc, unsigned count, C c
) {

  using E = std::decay_t<P>;

  // count / nv, rounded up
  unsigned B = (count + E::nv - 1) / E::nv;

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>(
  [=]__device__(auto tid, auto bid) {
    auto tile = cuda_get_tile(bid, E::nv, count);
    cuda_strided_iterate<E::nt, E::vt>([=]__device__(auto, auto j) {
      // Convert the offset to the index type before doing the arithmetic.
      // Mixing a signed I with an unsigned offset would otherwise convert the
      // whole expression to unsigned, so a negative index would reach the
      // callable as a huge positive value.
      // NOTE(review): assumes tile.begin and j are unsigned like `count`;
      // their declarations are not visible in this file - confirm.
      c(first + inc * static_cast<I>(tile.begin + j));
    }, tid, tile.count());
  });
}


}  // end of namespace detail -------------------------------------------------


// ----------------------------------------------------------------------------

// cuda standard algorithms: single_task/for_each/for_each_index

// ----------------------------------------------------------------------------


// Runs `c` once on the device: launches `cuda_kernel` with one block of one
// thread on p.stream(); the kernel's two arguments are ignored.
//
//   p - execution policy supplying the stream
//   c - callable invoked with no arguments
template <typename P, typename C>
void cuda_single_task(P&& p, C c) {
  cuda_kernel<<<1, 1, 0, p.stream()>>>(
    // mutable so that a callable with a non-const call operator can be invoked
    [=] __device__ (auto, auto) mutable {
      c();
    }
  );
}


// Applies `c` to each dereferenced item in [first, last) by delegating to
// detail::cuda_for_each_loop with the given execution policy.
//
//   p     - execution policy forwarded to the loop launcher
//   first - iterator to the beginning of the range
//   last  - iterator to the end of the range
//   c     - callable applied to each dereferenced item
//
// Nothing is launched when the range is empty or reversed.
template <typename P, typename I, typename C>
void cuda_for_each(P&& p, I first, I last, C c) {

  // Keep the signed distance: storing it directly into an unsigned would turn
  // a reversed range (negative distance) into a huge item count.
  const auto dist = std::distance(first, last);

  if(dist <= 0) {
    return;
  }

  detail::cuda_for_each_loop(p, first, static_cast<unsigned>(dist), c);
}


// Applies `c` to each index of the range [first, last) stepped by `inc`, by
// delegating to detail::cuda_for_each_index_loop.
//
//   p     - execution policy forwarded to the loop launcher
//   first - first index
//   last  - end of the index range
//   inc   - step size
//   c     - callable applied to each index
//
// Raises through TF_THROW when is_range_invalid(first, last, inc) holds.
// Nothing is launched when the computed number of indices is zero.
template <typename P, typename I, typename C>
void cuda_for_each_index(P&& p, I first, I last, I inc, C c) {

  if(is_range_invalid(first, last, inc)) {
    TF_THROW("invalid range [", first, ", ", last, ") with inc size ", inc);
  }

  const unsigned num_indices = distance(first, last, inc);

  if(num_indices != 0) {
    detail::cuda_for_each_index_loop(p, first, inc, num_indices, c);
  }
}


// ----------------------------------------------------------------------------

// single_task

// ----------------------------------------------------------------------------


// Kernel entry point that simply invokes the given callable once per
// launched thread; callers in this file launch it with a 1x1 configuration.
//
//   callable - object invoked with no arguments
template <typename C>
__global__ void cuda_single_task(C callable) { callable(); }


// ----------------------------------------------------------------------------

// cudaFlow

// ----------------------------------------------------------------------------


// Function: single_task

// Creates a kernel task that runs `c` through the cuda_single_task<C> kernel
// with a grid of 1, a block of 1 and no shared memory.
// Returns the task handle produced by kernel().
template <typename C>
cudaTask cudaFlow::single_task(C c) {
  return kernel(
    1, 1, 0,               // grid, block, shared-memory size
    cuda_single_task<C>,   // kernel function
    c                      // kernel argument
  );
}


// Function: single_task

// Updates the existing kernel task `task` so that it runs `c` through the
// cuda_single_task<C> kernel with a grid of 1, a block of 1 and no shared
// memory.
template <typename C>
void cudaFlow::single_task(cudaTask task, C c) {
  kernel(
    task,
    1, 1, 0,               // grid, block, shared-memory size
    cuda_single_task<C>,   // kernel function
    c                      // kernel argument
  );
}


// Function: for_each

// Creates a captured task: inside the capturer, selects the
// cudaLinearCapturing optimizer and records a for_each over [first, last)
// with callable `c`. Returns the task handle produced by capture().
template <typename I, typename C>
cudaTask cudaFlow::for_each(I first, I last, C c) {
  return capture(
    [first, last, c](cudaFlowCapturer& capturer) mutable {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.for_each(first, last, c);
    }
  );
}


// Function: for_each_index

// Creates a captured task: inside the capturer, selects the
// cudaLinearCapturing optimizer and records a for_each_index over the range
// [first, last) with step `inc` and callable `c`. Returns the task handle
// produced by capture().
template <typename I, typename C>
cudaTask cudaFlow::for_each_index(I first, I last, I inc, C c) {
  return capture(
    [first, last, inc, c](cudaFlowCapturer& capturer) mutable {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.for_each_index(first, last, inc, c);
    }
  );
}


// Function: for_each

// Updates the existing captured task `task`: inside the capturer, selects the
// cudaLinearCapturing optimizer and records a for_each over [first, last)
// with callable `c`.
template <typename I, typename C>
void cudaFlow::for_each(cudaTask task, I first, I last, C c) {
  capture(
    task,
    [first, last, c](cudaFlowCapturer& capturer) mutable {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.for_each(first, last, c);
    }
  );
}


// Function: for_each_index

// Updates the existing captured task `task`: inside the capturer, selects the
// cudaLinearCapturing optimizer and records a for_each_index over the range
// [first, last) with step `inc` and callable `c`.
template <typename I, typename C>
void cudaFlow::for_each_index(cudaTask task, I first, I last, I inc, C c) {
  capture(
    task,
    [first, last, inc, c](cudaFlowCapturer& capturer) mutable {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.for_each_index(first, last, inc, c);
    }
  );
}


// ----------------------------------------------------------------------------

// cudaFlowCapturer

// ----------------------------------------------------------------------------


// Function: for_each

// Captures a task that, given the capture stream, builds a
// cudaDefaultExecutionPolicy on it and calls cuda_for_each over
// [first, last) with callable `c`. Returns the task handle produced by on().
template <typename I, typename C>
cudaTask cudaFlowCapturer::for_each(I first, I last, C c) {
  return on(
    [first, last, c](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_for_each(policy, first, last, c);
    }
  );
}


// Function: for_each_index

// Captures a task that, given the capture stream, builds a
// cudaDefaultExecutionPolicy on it and calls cuda_for_each_index over
// [beg, end) with step `inc` and callable `c`. Returns the task handle
// produced by on().
template <typename I, typename C>
cudaTask cudaFlowCapturer::for_each_index(I beg, I end, I inc, C c) {
  return on(
    [beg, end, inc, c](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_for_each_index(policy, beg, end, inc, c);
    }
  );
}


// Function: for_each

// Updates the existing task `task` so that, given the capture stream, it
// builds a cudaDefaultExecutionPolicy on it and calls cuda_for_each over
// [first, last) with callable `c`.
template <typename I, typename C>
void cudaFlowCapturer::for_each(cudaTask task, I first, I last, C c) {
  on(
    task,
    [first, last, c](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_for_each(policy, first, last, c);
    }
  );
}


// Function: for_each_index

// Updates the existing task `task` so that, given the capture stream, it
// builds a cudaDefaultExecutionPolicy on it and calls cuda_for_each_index
// over [beg, end) with step `inc` and callable `c`.
template <typename I, typename C>
void cudaFlowCapturer::for_each_index(cudaTask task, I beg, I end, I inc, C c) {
  on(
    task,
    [beg, end, inc, c](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_for_each_index(policy, beg, end, inc, c);
    }
  );
}


// Function: single_task

// Captures a task that, given the capture stream, builds a
// cudaDefaultExecutionPolicy on it and calls cuda_single_task with
// `callable`. Returns the task handle produced by on().
template <typename C>
cudaTask cudaFlowCapturer::single_task(C callable) {
  return on(
    [callable](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_single_task(policy, callable);
    }
  );
}


// Function: single_task

// Updates the existing task `task` so that, given the capture stream, it
// builds a cudaDefaultExecutionPolicy on it and calls cuda_single_task with
// `callable`.
template <typename C>
void cudaFlowCapturer::single_task(cudaTask task, C callable) {
  on(
    task,
    [callable](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_single_task(policy, callable);
    }
  );
}


}  // end of namespace tf -----------------------------------------------------


tf::cudaExecutionPolicy
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29

tf::cudaFlowCapturer
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57

tf::cudaFlowCapturer::for_each
cudaTask for_each(I first, I last, C callable)
captures a kernel that applies a callable to each dereferenced element of the data array
Definition for_each.hpp:221

tf::cudaFlowCapturer::make_optimizer
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312

tf::cudaFlowCapturer::single_task
cudaTask single_task(C c)
captures a kernel that runs the given callable with only one thread
Definition for_each.hpp:259

tf::cudaFlowCapturer::on
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105

tf::cudaFlowCapturer::for_each_index
cudaTask for_each_index(I first, I last, I step, C callable)
captures a kernel that applies a callable to each index in the range with the step size
Definition for_each.hpp:230

tf::cudaFlow::for_each
cudaTask for_each(I first, I last, C callable)
applies a callable to each dereferenced element of the data array
Definition for_each.hpp:181

tf::cudaFlow::for_each_index
cudaTask for_each_index(I first, I last, I step, C callable)
applies a callable to each index in the range with the step size
Definition for_each.hpp:190

tf::cudaFlow::capture
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582

tf::cudaFlow::kernel
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
creates a kernel task
Definition cudaflow.hpp:1272

tf::cudaFlow::single_task
cudaTask single_task(C c)
runs a callable with only a single kernel thread
Definition for_each.hpp:169

tf::cudaLinearCapturing
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182

tf::cudaTask
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65

std::count
T count(T... args)

std::distance
T distance(T... args)

std::forward
T forward(T... args)

tf
taskflow namespace
Definition small_vector.hpp:27

tf::cuda_for_each_index
void cuda_for_each_index(P &&p, I first, I last, I inc, C c)
performs asynchronous parallel iterations over an index-based range of items
Definition for_each.hpp:138

tf::cuda_single_task
void cuda_single_task(P &&p, C c)
runs a callable asynchronously using one kernel thread
Definition for_each.hpp:69

tf::cuda_for_each
void cuda_for_each(P &&p, I first, I last, C c)
performs asynchronous parallel iterations over a range of items
Definition for_each.hpp:97