hello-world/docs/find_8hpp_source.html

#pragma once


#include "for_each.hpp"

#include "reduce.hpp"


namespace tf::detail {


// Pairs an element value with its position. The min/max element loops in this
// file build one instance per input position as `T{*(input+i), i}`, so the
// member order (key first, index second) is relied on by that positional
// brace-initialisation.
template <typename T>

struct cudaFindPair {


  // the element value read from the input range
  T key;

  // the position at which `key` was read
  unsigned index;


  // Implicit device-side conversion that yields only the index.
  // NOTE(review): presumably this is what lets the reduction store a reduced
  // pair through the `unsigned* idx` result pointer - confirm in reduce.hpp.
  __device__ operator unsigned () const { return index; }

};


// Writes to *idx the smallest position in [0, count) whose element satisfies
// `pred`. If no element matches, *idx ends up equal to `count`; for an empty
// range *idx is set to 0 (which is also `count`).
//
//   p      execution policy; provides the stream (p.stream()) and the
//          compile-time constants nt, vt and nv of its decayed type
//   input  iterator to the first element
//   count  number of elements to inspect
//   idx    result location, written from device code
//   pred   unary predicate invoked from device code
//
// All work is enqueued on p.stream(); nothing here waits for completion.
template <typename P, typename I, typename U>
void cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred) {

  // Empty range: publish 0 and skip the search kernel entirely.
  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  }

  using Policy = std::decay_t<P>;

  // Number of blocks: ceil(count / nv), i.e. one block per tile of nv items.
  auto num_blocks = (count + Policy::nv - 1) / Policy::nv;

  // Seed the result with `count`, the value that means "nothing found", so
  // the atomicMin calls below can only lower it.
  cuda_single_task(p, [=] __device__ () { *idx = count; });

  // Search kernel: every block atomically folds its smallest hit into *idx.
  cuda_kernel<<<num_blocks, Policy::nt, 0, p.stream()>>>(
    [=] __device__ (auto tid, auto bid) {

      // Smallest matching position seen by this block.
      __shared__ unsigned block_best;

      if(tid == 0) {
        block_best = count;
      }

      // block_best must be initialised before any thread updates it.
      __syncthreads();

      auto tile = cuda_get_tile(bid, Policy::nv, count);

      auto regs = cuda_mem_to_reg_strided<Policy::nt, Policy::vt>(
        input + tile.begin, tid, tile.count()
      );

      // This thread's candidate; stays at `count` if it finds no match.
      auto found = count;

      // Register k of thread tid corresponds to tile offset nt*k + tid.
      // Offsets grow with k, so the first hit is this thread's smallest one.
      for(unsigned k = 0; k < Policy::vt; k++) {
        auto offset = Policy::nt * k + tid;
        if(offset < tile.count()) {
          if(pred(regs[k])) {
            found = offset + tile.begin;
            break;
          }
        }
      }

      // A cudaBlockReduce-based reduction with cuda_minimum<unsigned> was
      // tried here instead of the shared-memory atomic; per the original
      // author's note it was not faster.

      // Fold every thread's candidate into the block-level minimum.
      atomicMin(&block_best, found);

      // All candidates must be folded in before thread 0 reads the result.
      __syncthreads();

      // One global atomic per block.
      if(tid == 0) {
        atomicMin(idx, block_best);
      }
    }
  );
}


// Enqueues a reduction that selects one (value, position) pair out of
// [input, input + count) according to `op` and passes `idx` as the
// destination of cuda_uninitialized_reduce_loop. For an empty range *idx is
// set to 0 and no reduction is launched.
//
//   p      execution policy forwarded to the helpers
//   input  iterator to the first element
//   count  number of elements
//   idx    result location
//   op     binary comparator invoked on two element values from device code
//   ptr    scratch buffer handed through to cuda_uninitialized_reduce_loop
//
// Selection rule of the reducer: given pairs a and b it keeps a when
// op(a.key, b.key) is true and b otherwise, so b is kept whenever the
// comparator returns false (including when the two keys compare equal).
template <typename P, typename I, typename O>
void cuda_min_element_loop(
  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
) {

  // Empty range: publish 0 and return.
  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  }

  using value_t = typename std::iterator_traits<I>::value_type;
  using pair_t  = cudaFindPair<value_t>;

  cuda_uninitialized_reduce_loop(
    p,
    // load iterator: position i -> pair of (element at i, i)
    cuda_make_load_iterator<pair_t>(
      [=] __device__ (auto i) { return pair_t{*(input+i), i}; }
    ),
    count,
    idx,
    // reducer: see the selection rule in the header comment
    [=] __device__ (const auto& a, const auto& b) {
      return op(a.key, b.key) ? a : b;
    },
    ptr
  );
}


// Enqueues a reduction that selects one (value, position) pair out of
// [input, input + count) according to `op` and passes `idx` as the
// destination of cuda_uninitialized_reduce_loop. For an empty range *idx is
// set to 0 and no reduction is launched.
//
//   p      execution policy forwarded to the helpers
//   input  iterator to the first element
//   count  number of elements
//   idx    result location
//   op     binary comparator invoked on two element values from device code
//   ptr    scratch buffer handed through to cuda_uninitialized_reduce_loop
//
// Selection rule of the reducer: given pairs a and b it keeps b when
// op(a.key, b.key) is true and a otherwise, so a is kept whenever the
// comparator returns false (including when the two keys compare equal).
template <typename P, typename I, typename O>
void cuda_max_element_loop(
  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
) {

  // Empty range: publish 0 and return.
  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  }

  using value_t = typename std::iterator_traits<I>::value_type;
  using pair_t  = cudaFindPair<value_t>;

  cuda_uninitialized_reduce_loop(
    p,
    // load iterator: position i -> pair of (element at i, i)
    cuda_make_load_iterator<pair_t>(
      [=] __device__ (auto i) { return pair_t{*(input+i), i}; }
    ),
    count,
    idx,
    // reducer: see the selection rule in the header comment
    [=] __device__ (const auto& a, const auto& b) {
      return op(a.key, b.key) ? b : a;
    },
    ptr
  );
}


}  // end of namespace tf::detail ---------------------------------------------


namespace tf {


// ----------------------------------------------------------------------------

// cuda_find_if

// ----------------------------------------------------------------------------


// Finds the index of the first element in [first, last) that satisfies `op`
// and stores it in *idx. Thin front end: measures the range with
// std::distance and hands everything to detail::cuda_find_if_loop.
//
//   p      execution policy
//   first  iterator to the beginning of the range
//   last   iterator to the end of the range
//   idx    result location
//   op     unary predicate
template <typename P, typename I, typename U>
void cuda_find_if(P&& p, I first, I last, unsigned* idx, U op) {
  const auto length = std::distance(first, last);
  detail::cuda_find_if_loop(p, first, length, idx, op);
}


// ----------------------------------------------------------------------------

// cudaFlow

// ----------------------------------------------------------------------------


// Function: find_if

// Creates a find_if task by going through a cudaFlowCapturer: the callable
// given to capture() selects cudaLinearCapturing as the capturer's optimizer
// and then issues the capturer's own find_if with the same arguments.
// Returns the cudaTask produced by capture().
template <typename I, typename U>
cudaTask cudaFlow::find_if(I first, I last, unsigned* idx, U op) {
  return capture(
    [=](cudaFlowCapturer& capturer) {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.find_if(first, last, idx, op);
    }
  );
}


// Function: find_if

// Overload taking an existing task: passes `task` to capture() together with
// a callable that selects cudaLinearCapturing as the capturer's optimizer and
// issues the capturer's find_if with the given arguments.
// NOTE(review): presumably this re-targets `task` to the new parameters -
// confirm against the capture(task, callable) overload.
template <typename I, typename U>
void cudaFlow::find_if(cudaTask task, I first, I last, unsigned* idx, U op) {
  capture(
    task,
    [=](cudaFlowCapturer& capturer) {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.find_if(first, last, idx, op);
    }
  );
}


// ----------------------------------------------------------------------------

// cudaFlowCapturer

// ----------------------------------------------------------------------------


// Function: find_if

// Creates a task through on(): the stream callable builds a
// cudaDefaultExecutionPolicy on the stream it receives and runs
// tf::cuda_find_if on it with the captured arguments.
// Returns the cudaTask produced by on().
template <typename I, typename U>
cudaTask cudaFlowCapturer::find_if(I first, I last, unsigned* idx, U op) {
  return on(
    [=](cudaStream_t stream) mutable {
      cudaDefaultExecutionPolicy policy(stream);
      cuda_find_if(policy, first, last, idx, op);
    }
  );
}


// Function: find_if

// Overload taking an existing task: passes `task` to on() together with a
// stream callable that builds a cudaDefaultExecutionPolicy on the stream it
// receives and runs tf::cuda_find_if with the captured arguments.
// NOTE(review): presumably this re-targets `task` to the new parameters -
// confirm against the on(task, callable) overload.
template <typename I, typename U>
void cudaFlowCapturer::find_if(
  cudaTask task, I first, I last, unsigned* idx, U op
) {
  on(
    task,
    [=](cudaStream_t stream) mutable {
      cudaDefaultExecutionPolicy policy(stream);
      cuda_find_if(policy, first, last, idx, op);
    }
  );
}


// ----------------------------------------------------------------------------

// cuda_min_element

// ----------------------------------------------------------------------------


// Queries the buffer size in bytes needed to call tf::cuda_min_element on
// `count` elements of type T under execution policy P. The answer is what
// cuda_reduce_buffer_size reports for a reduction over
// detail::cudaFindPair<T> items.
template <typename P, typename T>
unsigned cuda_min_element_buffer_size(unsigned count) {
  using pair_t = detail::cudaFindPair<T>;
  return cuda_reduce_buffer_size<P, pair_t>(count);
}


// Finds the index of the minimum element in [first, last) and stores it in
// *idx. Thin front end: measures the range with std::distance and hands
// everything to detail::cuda_min_element_loop.
//
//   p      execution policy
//   first  iterator to the beginning of the range
//   last   iterator to the end of the range
//   idx    result location
//   op     binary comparator applied to element values
//   buf    scratch buffer; see tf::cuda_min_element_buffer_size for its size
template <typename P, typename I, typename O>
void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
  const auto length = std::distance(first, last);
  detail::cuda_min_element_loop(p, first, length, idx, op, buf);
}


// ----------------------------------------------------------------------------

// cudaFlowCapturer::min_element

// ----------------------------------------------------------------------------


// Function: min_element

// Creates a min_element task through on().
//
// The scratch buffer is sized with cuda_min_element_buffer_size for the
// default execution policy and constructed right here, while the callable is
// being built (the init-capture is evaluated in this function, not when the
// callable runs). It travels with the callable inside a MoC wrapper (defined
// elsewhere). The callable itself builds a cudaDefaultExecutionPolicy on the
// stream it receives and runs tf::cuda_min_element with that buffer.
// Returns the cudaTask produced by on().
template <typename I, typename O>
cudaTask cudaFlowCapturer::min_element(I first, I last, unsigned* idx, O op) {

  using value_t = typename std::iterator_traits<I>::value_type;

  auto scratch_bytes =
    cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, value_t>(
      std::distance(first, last)
    );

  return on(
    [=, scratch=MoC{cudaDeviceVector<std::byte>(scratch_bytes)}]
    (cudaStream_t stream) mutable {
      cudaDefaultExecutionPolicy policy(stream);
      cuda_min_element(policy, first, last, idx, op, scratch.get().data());
    }
  );
}


// Function: min_element

// Overload taking an existing task: same construction as the task-creating
// min_element, but the callable is handed to on(task, ...).
//
// The scratch buffer is sized with cuda_min_element_buffer_size for the
// default execution policy and constructed right here, while the callable is
// being built; it travels with the callable inside a MoC wrapper (defined
// elsewhere). The callable builds a cudaDefaultExecutionPolicy on the stream
// it receives and runs tf::cuda_min_element with that buffer.
// NOTE(review): presumably this re-targets `task` to the new parameters -
// confirm against the on(task, callable) overload.
template <typename I, typename O>
void cudaFlowCapturer::min_element(
  cudaTask task, I first, I last, unsigned* idx, O op
) {

  using value_t = typename std::iterator_traits<I>::value_type;

  auto scratch_bytes =
    cuda_min_element_buffer_size<cudaDefaultExecutionPolicy, value_t>(
      std::distance(first, last)
    );

  on(
    task,
    [=, scratch=MoC{cudaDeviceVector<std::byte>(scratch_bytes)}]
    (cudaStream_t stream) mutable {
      cudaDefaultExecutionPolicy policy(stream);
      cuda_min_element(policy, first, last, idx, op, scratch.get().data());
    }
  );
}


// ----------------------------------------------------------------------------

// cudaFlow::min_element

// ----------------------------------------------------------------------------


// Function: min_element

// Creates a min_element task by going through a cudaFlowCapturer: the
// callable given to capture() selects cudaLinearCapturing as the capturer's
// optimizer and then issues the capturer's own min_element with the same
// arguments. Returns the cudaTask produced by capture().
template <typename I, typename O>
cudaTask cudaFlow::min_element(I first, I last, unsigned* idx, O op) {
  return capture(
    [=](cudaFlowCapturer& capturer) {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.min_element(first, last, idx, op);
    }
  );
}


// Function: min_element

// Overload taking an existing task: passes `task` to capture() together with
// a callable that selects cudaLinearCapturing as the capturer's optimizer and
// issues the capturer's min_element with the given arguments.
// NOTE(review): presumably this re-targets `task` to the new parameters -
// confirm against the capture(task, callable) overload.
template <typename I, typename O>
void cudaFlow::min_element(
  cudaTask task, I first, I last, unsigned* idx, O op
) {
  capture(
    task,
    [=](cudaFlowCapturer& capturer) {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.min_element(first, last, idx, op);
    }
  );
}


// ----------------------------------------------------------------------------

// cuda_max_element

// ----------------------------------------------------------------------------


// Queries the buffer size in bytes needed to call tf::cuda_max_element on
// `count` elements of type T under execution policy P. The answer is what
// cuda_reduce_buffer_size reports for a reduction over
// detail::cudaFindPair<T> items.
template <typename P, typename T>
unsigned cuda_max_element_buffer_size(unsigned count) {
  using pair_t = detail::cudaFindPair<T>;
  return cuda_reduce_buffer_size<P, pair_t>(count);
}


// Finds the index of the maximum element in [first, last) and stores it in
// *idx. Thin front end: measures the range with std::distance and hands
// everything to detail::cuda_max_element_loop.
//
//   p      execution policy
//   first  iterator to the beginning of the range
//   last   iterator to the end of the range
//   idx    result location
//   op     binary comparator applied to element values
//   buf    scratch buffer; see tf::cuda_max_element_buffer_size for its size
template <typename P, typename I, typename O>
void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
  const auto length = std::distance(first, last);
  detail::cuda_max_element_loop(p, first, length, idx, op, buf);
}


// ----------------------------------------------------------------------------

// cudaFlowCapturer::max_element

// ----------------------------------------------------------------------------


// Function: max_element

// Creates a max_element task through on().
//
// The scratch buffer is sized with cuda_max_element_buffer_size for the
// default execution policy and constructed right here, while the callable is
// being built (the init-capture is evaluated in this function, not when the
// callable runs). It travels with the callable inside a MoC wrapper (defined
// elsewhere). The callable itself builds a cudaDefaultExecutionPolicy on the
// stream it receives and runs tf::cuda_max_element with that buffer.
// Returns the cudaTask produced by on().
template <typename I, typename O>
cudaTask cudaFlowCapturer::max_element(I first, I last, unsigned* idx, O op) {

  using value_t = typename std::iterator_traits<I>::value_type;

  auto scratch_bytes =
    cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, value_t>(
      std::distance(first, last)
    );

  return on(
    [=, scratch=MoC{cudaDeviceVector<std::byte>(scratch_bytes)}]
    (cudaStream_t stream) mutable {
      cudaDefaultExecutionPolicy policy(stream);
      cuda_max_element(policy, first, last, idx, op, scratch.get().data());
    }
  );
}


// Function: max_element

// Overload taking an existing task: same construction as the task-creating
// max_element, but the callable is handed to on(task, ...).
//
// The scratch buffer is sized with cuda_max_element_buffer_size for the
// default execution policy and constructed right here, while the callable is
// being built; it travels with the callable inside a MoC wrapper (defined
// elsewhere). The callable builds a cudaDefaultExecutionPolicy on the stream
// it receives and runs tf::cuda_max_element with that buffer.
// NOTE(review): presumably this re-targets `task` to the new parameters -
// confirm against the on(task, callable) overload.
template <typename I, typename O>
void cudaFlowCapturer::max_element(
  cudaTask task, I first, I last, unsigned* idx, O op
) {

  using value_t = typename std::iterator_traits<I>::value_type;

  auto scratch_bytes =
    cuda_max_element_buffer_size<cudaDefaultExecutionPolicy, value_t>(
      std::distance(first, last)
    );

  on(
    task,
    [=, scratch=MoC{cudaDeviceVector<std::byte>(scratch_bytes)}]
    (cudaStream_t stream) mutable {
      cudaDefaultExecutionPolicy policy(stream);
      cuda_max_element(policy, first, last, idx, op, scratch.get().data());
    }
  );
}


// ----------------------------------------------------------------------------

// cudaFlow::max_element

// ----------------------------------------------------------------------------


// Function: max_element

// Creates a max_element task by going through a cudaFlowCapturer: the
// callable given to capture() selects cudaLinearCapturing as the capturer's
// optimizer and then issues the capturer's own max_element with the same
// arguments. Returns the cudaTask produced by capture().
template <typename I, typename O>
cudaTask cudaFlow::max_element(I first, I last, unsigned* idx, O op) {
  return capture(
    [=](cudaFlowCapturer& capturer) {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.max_element(first, last, idx, op);
    }
  );
}


// Function: max_element

// Overload taking an existing task: passes `task` to capture() together with
// a callable that selects cudaLinearCapturing as the capturer's optimizer and
// issues the capturer's max_element with the given arguments.
// NOTE(review): presumably this re-targets `task` to the new parameters -
// confirm against the capture(task, callable) overload.
template <typename I, typename O>
void cudaFlow::max_element(
  cudaTask task, I first, I last, unsigned* idx, O op
) {
  capture(
    task,
    [=](cudaFlowCapturer& capturer) {
      capturer.make_optimizer<cudaLinearCapturing>();
      capturer.max_element(first, last, idx, op);
    }
  );
}


}  // end of namespace tf -----------------------------------------------------


tf::cudaExecutionPolicy
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29

tf::cudaFlowCapturer
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57

tf::cudaFlowCapturer::find_if
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range that satisfies the given criteria
Definition find.hpp:215

tf::cudaFlowCapturer::min_element
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:300

tf::cudaFlowCapturer::make_optimizer
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312

tf::cudaFlowCapturer::max_element
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:425

tf::cudaFlowCapturer::on
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105

tf::cudaFlow::find_if
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range that satisfies the given criteria
Definition find.hpp:193

tf::cudaFlow::min_element
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:340

tf::cudaFlow::max_element
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:465

tf::cudaFlow::capture
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582

tf::cudaLinearCapturing
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182

tf::cudaTask
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65

std::count
T count(T... args)

std::distance
T distance(T... args)

for_each.hpp
cuda parallel-iteration algorithms include file

std::forward
T forward(T... args)

std::iterator_traits

tf
taskflow namespace
Definition small_vector.hpp:27

tf::cuda_max_element_buffer_size
unsigned cuda_max_element_buffer_size(unsigned count)
queries the buffer size in bytes needed to call tf::cuda_max_element
Definition find.hpp:374

tf::cuda_single_task
void cuda_single_task(P &&p, C c)
runs a callable asynchronously using one kernel thread
Definition for_each.hpp:69

tf::cuda_max_element
void cuda_max_element(P &&p, I first, I last, unsigned *idx, O op, void *buf)
finds the index of the maximum element in a range
Definition find.hpp:413

tf::cuda_min_element
void cuda_min_element(P &&p, I first, I last, unsigned *idx, O op, void *buf)
finds the index of the minimum element in a range
Definition find.hpp:288

tf::cuda_find_if
void cuda_find_if(P &&p, I first, I last, unsigned *idx, U op)
finds the index of the first element that satisfies the given criteria
Definition find.hpp:181

tf::cuda_min_element_buffer_size
unsigned cuda_min_element_buffer_size(unsigned count)
queries the buffer size in bytes needed to call tf::cuda_min_element
Definition find.hpp:249

reduce.hpp
cuda reduce algorithms include file