hello-world/docs/transform_8hpp_source.html

#pragma once


#include "../cudaflow.hpp"


namespace tf {


// ----------------------------------------------------------------------------

// transform

// ----------------------------------------------------------------------------


namespace detail {


// Function: cuda_transform_loop (unary)
//
// Enqueues one kernel on p.stream() that, for every offset in [0, count),
// stores op(*(first + offset)) into *(output + offset).
//
//   p      - execution policy; its decayed type E supplies the compile-time
//            constants E::nt (threads per block in the launch below),
//            E::vt and E::nv (items per tile as passed to cuda_get_tile)
//   first  - beginning of the input range
//   count  - number of items to transform
//   output - beginning of the output range
//   op     - unary operator, invoked inside the device lambda
//
// Nothing in this function waits for the kernel to finish.
template <typename P, typename I, typename O, typename C>
void cuda_transform_loop(P&& p, I first, unsigned count, O output, C op) {

  using E = std::decay_t<P>;

  // An empty range would ask for a grid of zero blocks, which is not a valid
  // launch configuration, so there is nothing to enqueue.
  if(count == 0) {
    return;
  }

  // Number of blocks = ceil(count / E::nv). Written as quotient plus a
  // remainder test because the usual (count + E::nv - 1) / E::nv form can
  // wrap around when count is close to the largest unsigned value.
  unsigned B = count / E::nv + (count % E::nv != 0 ? 1u : 0u);

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) {
    // NOTE(review): cuda_get_tile presumably returns the slice of
    // [0, count) assigned to block bid - confirm against its definition.
    auto tile = cuda_get_tile(bid, E::nv, count);
    // NOTE(review): cuda_strided_iterate presumably calls the functor once
    // per in-tile index j owned by thread tid - confirm.
    cuda_strided_iterate<E::nt, E::vt>([=]__device__(auto, auto j) {
      // j is relative to the tile; shift it to an offset into the ranges.
      auto offset = j + tile.begin;
      *(output + offset) = op(*(first+offset));
    }, tid, tile.count());
  });
}


// Function: cuda_transform_loop (binary)
//
// Enqueues one kernel on p.stream() that, for every offset in [0, count),
// stores op(*(first1 + offset), *(first2 + offset)) into *(output + offset).
//
//   p      - execution policy; its decayed type E supplies the compile-time
//            constants E::nt (threads per block in the launch below),
//            E::vt and E::nv (items per tile as passed to cuda_get_tile)
//   first1 - beginning of the first input range
//   first2 - beginning of the second input range
//   count  - number of item pairs to transform
//   output - beginning of the output range
//   op     - binary operator, invoked inside the device lambda
//
// Nothing in this function waits for the kernel to finish.
template <typename P, typename I1, typename I2, typename O, typename C>
void cuda_transform_loop(
  P&& p, I1 first1, I2 first2, unsigned count, O output, C op
) {

  using E = std::decay_t<P>;

  // An empty range would ask for a grid of zero blocks, which is not a valid
  // launch configuration, so there is nothing to enqueue.
  if(count == 0) {
    return;
  }

  // Number of blocks = ceil(count / E::nv). Written as quotient plus a
  // remainder test because the usual (count + E::nv - 1) / E::nv form can
  // wrap around when count is close to the largest unsigned value.
  unsigned B = count / E::nv + (count % E::nv != 0 ? 1u : 0u);

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) {
    // NOTE(review): cuda_get_tile presumably returns the slice of
    // [0, count) assigned to block bid - confirm against its definition.
    auto tile = cuda_get_tile(bid, E::nv, count);
    // NOTE(review): cuda_strided_iterate presumably calls the functor once
    // per in-tile index j owned by thread tid - confirm.
    cuda_strided_iterate<E::nt, E::vt>([=]__device__(auto, auto j) {
      // j is relative to the tile; shift it to an offset into the ranges.
      auto offset = j + tile.begin;
      *(output + offset) = op(*(first1+offset), *(first2+offset));
    }, tid, tile.count());
  });
}


}  // end of namespace detail -------------------------------------------------


// ----------------------------------------------------------------------------

// CUDA standard algorithms: transform

// ----------------------------------------------------------------------------


// Function: cuda_transform (unary)
//
// Applies op to every item of [first, last) and stores the results in the
// range beginning at output, by handing the work to
// detail::cuda_transform_loop with the given execution policy.
//
//   p      - execution policy forwarded to the loop implementation
//   first  - beginning of the input range
//   last   - end of the input range
//   output - beginning of the output range
//   op     - unary operator applied to each input item
//
// An empty or reversed range (last not after first) is a no-op.
template <typename P, typename I, typename O, typename C>
void cuda_transform(P&& p, I first, I last, O output, C op) {

  // Keep the distance in its own (signed) type: assigning a negative value
  // straight to unsigned would turn a reversed range into a huge item count.
  const auto length = std::distance(first, last);

  if(length <= 0) {
    return;
  }

  // NOTE: the loop implementation takes an unsigned count, so lengths beyond
  // the range of unsigned are still truncated by this cast.
  detail::cuda_transform_loop(
    p, first, static_cast<unsigned>(length), output, op
  );
}


// Function: cuda_transform (binary)
//
// Applies op to each pair of items taken from [first1, last1) and from the
// range beginning at first2, and stores the results in the range beginning at
// output, by handing the work to detail::cuda_transform_loop with the given
// execution policy.
//
//   p      - execution policy forwarded to the loop implementation
//   first1 - beginning of the first input range
//   last1  - end of the first input range
//   first2 - beginning of the second input range
//   output - beginning of the output range
//   op     - binary operator applied to each pair of input items
//
// An empty or reversed first range (last1 not after first1) is a no-op.
template <typename P, typename I1, typename I2, typename O, typename C>
void cuda_transform(
  P&& p, I1 first1, I1 last1, I2 first2, O output, C op
) {

  // Keep the distance in its own (signed) type: assigning a negative value
  // straight to unsigned would turn a reversed range into a huge item count.
  const auto length = std::distance(first1, last1);

  if(length <= 0) {
    return;
  }

  // NOTE: the loop implementation takes an unsigned count, so lengths beyond
  // the range of unsigned are still truncated by this cast.
  detail::cuda_transform_loop(
    p, first1, first2, static_cast<unsigned>(length), output, op
  );
}


// ----------------------------------------------------------------------------

// cudaFlow

// ----------------------------------------------------------------------------


// Function: transform (unary)
//
// Creates a task through capture(): the captured callable selects the
// cudaLinearCapturing optimizer on the capturer and records a unary
// transform of [first, last) into the range beginning at output.
// Returns the task handle produced by capture().
template <typename I, typename O, typename C>
cudaTask cudaFlow::transform(I first, I last, O output, C c) {
  return capture(
    [first, last, output, c](cudaFlowCapturer& capturer) mutable {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.transform(first, last, output, c);
    }
  );
}


// Function: transform (binary)
//
// Creates a task through capture(): the captured callable selects the
// cudaLinearCapturing optimizer on the capturer and records a binary
// transform of [first1, last1) and the range beginning at first2 into the
// range beginning at output.
// Returns the task handle produced by capture().
template <typename I1, typename I2, typename O, typename C>
cudaTask cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C c) {
  return capture(
    [first1, last1, first2, output, c](cudaFlowCapturer& capturer) mutable {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.transform(first1, last1, first2, output, c);
    }
  );
}


// Function: update transform (unary)
//
// Re-captures an existing task with new arguments: passes `task` to
// capture() together with a callable that selects the cudaLinearCapturing
// optimizer and records a unary transform of [first, last) into the range
// beginning at output.
template <typename I, typename O, typename C>
void cudaFlow::transform(cudaTask task, I first, I last, O output, C c) {
  capture(
    task,
    [first, last, output, c](cudaFlowCapturer& capturer) mutable {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.transform(first, last, output, c);
    }
  );
}


// Function: update transform (binary)
//
// Re-captures an existing task with new arguments: passes `task` to
// capture() together with a callable that selects the cudaLinearCapturing
// optimizer and records a binary transform of [first1, last1) and the range
// beginning at first2 into the range beginning at output.
template <typename I1, typename I2, typename O, typename C>
void cudaFlow::transform(
  cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
) {
  capture(
    task,
    [first1, last1, first2, output, c](cudaFlowCapturer& capturer) mutable {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.transform(first1, last1, first2, output, c);
    }
  );
}


// ----------------------------------------------------------------------------

// cudaFlowCapturer

// ----------------------------------------------------------------------------


// Function: transform (unary)
//
// Registers, through on(), a callable that receives a stream, wraps it in a
// cudaDefaultExecutionPolicy, and calls cuda_transform on [first, last) with
// the range beginning at output as the destination.
// Returns the task handle produced by on().
template <typename I, typename O, typename C>
cudaTask cudaFlowCapturer::transform(I first, I last, O output, C op) {
  return on(
    [first, last, output, op](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_transform(policy, first, last, output, op);
    }
  );
}


// Function: transform (binary)
//
// Registers, through on(), a callable that receives a stream, wraps it in a
// cudaDefaultExecutionPolicy, and calls the binary cuda_transform on
// [first1, last1) and the range beginning at first2, with the range beginning
// at output as the destination.
// Returns the task handle produced by on().
template <typename I1, typename I2, typename O, typename C>
cudaTask cudaFlowCapturer::transform(
  I1 first1, I1 last1, I2 first2, O output, C op
) {
  return on(
    [first1, last1, first2, output, op](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_transform(policy, first1, last1, first2, output, op);
    }
  );
}


// Function: update transform (unary)
//
// Rebinds an existing task: passes `task` to on() together with a callable
// that receives a stream, wraps it in a cudaDefaultExecutionPolicy, and calls
// cuda_transform on [first, last) with the range beginning at output as the
// destination.
template <typename I, typename O, typename C>
void cudaFlowCapturer::transform(
  cudaTask task, I first, I last, O output, C op
) {
  on(
    task,
    [first, last, output, op](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_transform(policy, first, last, output, op);
    }
  );
}


// Function: update transform (binary)
//
// Rebinds an existing task: passes `task` to on() together with a callable
// that receives a stream, wraps it in a cudaDefaultExecutionPolicy, and calls
// the binary cuda_transform on [first1, last1) and the range beginning at
// first2, with the range beginning at output as the destination.
template <typename I1, typename I2, typename O, typename C>
void cudaFlowCapturer::transform(
  cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op
) {
  on(
    task,
    [first1, last1, first2, output, op](cudaStream_t s) mutable {
      cudaDefaultExecutionPolicy policy(s);
      cuda_transform(policy, first1, last1, first2, output, op);
    }
  );
}


}  // end of namespace tf -----------------------------------------------------


tf::cudaExecutionPolicy
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29

tf::cudaFlowCapturer
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57

tf::cudaFlowCapturer::transform
cudaTask transform(I first, I last, O output, C op)
captures a kernel that transforms an input range to an output range
Definition transform.hpp:181

tf::cudaFlowCapturer::make_optimizer
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312

tf::cudaFlowCapturer::on
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105

tf::cudaFlow::capture
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582

tf::cudaFlow::transform
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:139

tf::cudaLinearCapturing
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182

tf::cudaTask
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65

std::count
T count(T... args)

std::distance
T distance(T... args)

std::forward
T forward(T... args)

tf
taskflow namespace
Definition small_vector.hpp:27

tf::cuda_transform
void cuda_transform(P &&p, I first, I last, O output, C op)
performs asynchronous parallel transforms over a range of items
Definition transform.hpp:84