Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit e9aa5f0

Browse files
committed
WIP porting AgentLauncher to use cub::detail::ptx_dispatch.
1 parent f65c374 commit e9aa5f0

7 files changed

Lines changed: 883 additions & 1646 deletions

File tree

thrust/system/cuda/detail/adjacent_difference.h

Lines changed: 114 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@
3333
#include <thrust/detail/cstdint.h>
3434
#include <thrust/detail/temporary_array.h>
3535
#include <thrust/system/cuda/detail/util.h>
36-
#include <cub/device/device_select.cuh>
37-
#include <cub/block/block_adjacent_difference.cuh>
3836
#include <thrust/system/cuda/detail/core/agent_launcher.h>
3937
#include <thrust/system/cuda/detail/par_to_seq.h>
4038
#include <thrust/system/cuda/detail/dispatch.h>
@@ -43,6 +41,9 @@
4341
#include <thrust/detail/mpl/math.h>
4442
#include <thrust/detail/minmax.h>
4543

44+
#include <cub/block/block_adjacent_difference.cuh>
45+
#include <cub/detail/ptx_dispatch.cuh>
46+
#include <cub/device/device_select.cuh>
4647
#include <cub/util_math.cuh>
4748

4849
namespace thrust
@@ -70,71 +71,42 @@ namespace __adjacent_difference {
7071
cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT>
7172
struct PtxPolicy
7273
{
73-
enum
74-
{
75-
BLOCK_THREADS = _BLOCK_THREADS,
76-
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
77-
ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD
78-
};
79-
80-
static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
81-
static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
82-
static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
74+
static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
75+
static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
76+
static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
77+
78+
static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
79+
static constexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
80+
static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
8381
};
8482

8583
template<int INPUT_SIZE, int NOMINAL_4B_ITEMS_PER_THREAD>
8684
struct items_per_thread
8785
{
88-
enum
89-
{
90-
value = (INPUT_SIZE <= 8)
91-
? NOMINAL_4B_ITEMS_PER_THREAD
92-
: mpl::min<
93-
int,
94-
NOMINAL_4B_ITEMS_PER_THREAD,
95-
mpl::max<int,
96-
1,
97-
((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
98-
INPUT_SIZE - 1) /
99-
INPUT_SIZE>::value>::value
100-
};
86+
static constexpr int value =
87+
(INPUT_SIZE <= 8)
88+
? NOMINAL_4B_ITEMS_PER_THREAD
89+
: mpl::min<int,
90+
NOMINAL_4B_ITEMS_PER_THREAD,
91+
mpl::max<int,
92+
1,
93+
((NOMINAL_4B_ITEMS_PER_THREAD * 8) + INPUT_SIZE - 1) /
94+
INPUT_SIZE>::value>::value;
10195
};
10296

103-
template<class Arch, class T>
104-
struct Tuning;
105-
10697
template <class T>
107-
struct Tuning<sm30, T>
98+
struct Tuning350 : cub::detail::ptx_base<350>
10899
{
109-
enum
110-
{
111-
INPUT_SIZE = sizeof(T),
112-
NOMINAL_4B_ITEMS_PER_THREAD = 7,
113-
ITEMS_PER_THREAD = items_per_thread<INPUT_SIZE,
114-
NOMINAL_4B_ITEMS_PER_THREAD>::value
115-
};
116-
typedef PtxPolicy<128,
117-
ITEMS_PER_THREAD,
118-
cub::BLOCK_LOAD_WARP_TRANSPOSE,
119-
cub::LOAD_DEFAULT,
120-
cub::BLOCK_STORE_WARP_TRANSPOSE>
121-
type;
122-
};
123-
template <class T>
124-
struct Tuning<sm35, T> : Tuning<sm30,T>
125-
{
126-
enum
127-
{
128-
NOMINAL_4B_ITEMS_PER_THREAD = 7,
129-
ITEMS_PER_THREAD = items_per_thread<Tuning::INPUT_SIZE,
130-
NOMINAL_4B_ITEMS_PER_THREAD>::value
131-
};
132-
typedef PtxPolicy<128,
133-
ITEMS_PER_THREAD,
134-
cub::BLOCK_LOAD_WARP_TRANSPOSE,
135-
cub::LOAD_LDG,
136-
cub::BLOCK_STORE_WARP_TRANSPOSE>
137-
type;
100+
static constexpr int INPUT_SIZE = static_cast<int>(sizeof(T));
101+
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 7;
102+
static constexpr int ITEMS_PER_THREAD =
103+
items_per_thread<INPUT_SIZE, NOMINAL_4B_ITEMS_PER_THREAD>::value;
104+
105+
using policy = PtxPolicy<128,
106+
ITEMS_PER_THREAD,
107+
cub::BLOCK_LOAD_WARP_TRANSPOSE,
108+
cub::LOAD_LDG,
109+
cub::BLOCK_STORE_WARP_TRANSPOSE>;
138110
};
139111

140112
template <class InputIt,
@@ -148,11 +120,12 @@ namespace __adjacent_difference {
148120
// XXX output type must be result of BinaryOp(input_type,input_type);
149121
typedef input_type output_type;
150122

151-
template<class Arch>
152-
struct PtxPlan : Tuning<Arch,input_type>::type
153-
{
154-
typedef Tuning<Arch,input_type> tuning;
123+
// List tunings in reverse order:
124+
using Tunings = cub::detail::type_list<Tuning350<input_type>>;
155125

126+
template<class Tuning>
127+
struct PtxPlan : Tuning::policy
128+
{
156129
typedef typename core::LoadIterator<PtxPlan, InputIt>::type LoadIt;
157130
typedef typename core::BlockLoad<PtxPlan, LoadIt>::type BlockLoad;
158131

@@ -163,7 +136,7 @@ namespace __adjacent_difference {
163136
PtxPlan::BLOCK_THREADS,
164137
1,
165138
1,
166-
Arch::ver>
139+
Tuning::ptx_arch>
167140
BlockAdjacentDifference;
168141

169142
union TempStorage
@@ -174,24 +147,18 @@ namespace __adjacent_difference {
174147
}; // union TempStorage
175148
}; // struct PtxPlan
176149

177-
typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
178-
179-
typedef typename ptx_plan::LoadIt LoadIt;
180-
typedef typename ptx_plan::BlockLoad BlockLoad;
181-
typedef typename ptx_plan::BlockStore BlockStore;
182-
typedef typename ptx_plan::BlockAdjacentDifference BlockAdjacentDifference;
183-
typedef typename ptx_plan::TempStorage TempStorage;
184-
185-
186-
enum
187-
{
188-
ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
189-
BLOCK_THREADS = ptx_plan::BLOCK_THREADS,
190-
ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE,
191-
};
192-
150+
template <typename ActivePtxPlan>
193151
struct impl
194152
{
153+
using BlockAdjacentDifference =
154+
typename ActivePtxPlan::BlockAdjacentDifference;
155+
using BlockLoad = typename ActivePtxPlan::BlockLoad;
156+
using BlockStore = typename ActivePtxPlan::BlockStore;
157+
using LoadIt = typename ActivePtxPlan::LoadIt;
158+
using TempStorage = typename ActivePtxPlan::TempStorage;
159+
160+
static constexpr int ITEMS_PER_THREAD = ActivePtxPlan::ITEMS_PER_THREAD;
161+
static constexpr int ITEMS_PER_TILE = ActivePtxPlan::ITEMS_PER_TILE;
195162

196163
//---------------------------------------------------------------------
197164
// Per-thread fields
@@ -308,7 +275,7 @@ namespace __adjacent_difference {
308275
BinaryOp binary_op_,
309276
Size num_items)
310277
: temp_storage(temp_storage_),
311-
load_it(core::make_load_iterator(ptx_plan(), input_it_)),
278+
load_it(core::make_load_iterator(ActivePtxPlan{}, input_it_)),
312279
first_tile_previous(first_tile_previous_),
313280
output_it(result_),
314281
binary_op(binary_op_)
@@ -321,15 +288,19 @@ namespace __adjacent_difference {
321288
// Agent entry point
322289
//---------------------------------------------------------------------
323290

291+
template <typename ActivePtxPlan>
324292
THRUST_AGENT_ENTRY(InputIt first,
325293
input_type *first_element,
326294
OutputIt result,
327295
BinaryOp binary_op,
328296
Size num_items,
329297
char * shmem)
330298
{
331-
TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
332-
impl(storage, first, first_element, result, binary_op, num_items);
299+
using temp_storage_t = typename ActivePtxPlan::TempStorage;
300+
auto &storage = *reinterpret_cast<temp_storage_t *>(shmem);
301+
302+
using impl_t = impl<ActivePtxPlan>;
303+
impl_t{storage, first, first_element, result, binary_op, num_items};
333304
}
334305
}; // struct AdjacentDifferenceAgent
335306

@@ -338,14 +309,13 @@ namespace __adjacent_difference {
338309
class Size>
339310
struct InitAgent
340311
{
341-
template <class Arch>
342312
struct PtxPlan : PtxPolicy<128> {};
343-
typedef core::specialize_plan<PtxPlan> ptx_plan;
344313

345314
//---------------------------------------------------------------------
346315
// Agent entry point
347316
//---------------------------------------------------------------------
348317

318+
template <typename /*ActivePtxPlan*/>
349319
THRUST_AGENT_ENTRY(InputIt first,
350320
OutputIt result,
351321
Size num_tiles,
@@ -373,63 +343,89 @@ namespace __adjacent_difference {
373343
cudaStream_t stream,
374344
bool debug_sync)
375345
{
376-
if (num_items == 0)
377-
return cudaSuccess;
346+
cudaError_t status = cudaSuccess;
378347

379-
using core::AgentPlan;
380-
using core::AgentLauncher;
348+
if (!d_temp_storage)
349+
{ // Initialize this for early return.
350+
temp_storage_bytes = 0;
351+
}
381352

382-
cudaError_t status = cudaSuccess;
353+
if (num_items == 0)
354+
{
355+
return status;
356+
}
383357

384-
typedef AgentLauncher<
385-
AdjacentDifferenceAgent<InputIt,
386-
OutputIt,
387-
Size,
388-
BinaryOp> >
389-
difference_agent;
358+
// Declare type aliases for agents, etc:
359+
using adj_diff_agent_t =
360+
AdjacentDifferenceAgent<InputIt, OutputIt, Size, BinaryOp>;
390361

391-
typedef typename iterator_traits<InputIt>::value_type input_type;
392-
typedef AgentLauncher<InitAgent<InputIt, input_type *, Size> > init_agent;
362+
using input_t = typename iterator_traits<InputIt>::value_type;
363+
using init_agent_t = InitAgent<InputIt, input_t *, Size>;
393364

394-
AgentPlan difference_plan = difference_agent::get_plan(stream);
395-
AgentPlan init_plan = init_agent::get_plan();
365+
// Create PtxPlans and AgentPlans:
366+
const auto init_ptx_plan = typename init_agent_t::PtxPlan{};
367+
const thrust::cuda_cub::core::AgentPlan init_agent_plan{init_ptx_plan};
396368

369+
const auto adj_diff_agent_plan =
370+
core::AgentPlanFromTunings<adj_diff_agent_t>::get();
397371

398-
Size tile_size = difference_plan.items_per_tile;
399-
Size num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
372+
// Work out shmem requirements:
373+
const Size tile_size = adj_diff_agent_plan.items_per_tile;
374+
const Size num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
400375

401-
size_t tmp1 = num_tiles * sizeof(input_type);
402-
size_t vshmem_size = core::vshmem_size(difference_plan.shared_memory_size,
403-
num_tiles);
376+
const std::size_t tmp1 = num_tiles * sizeof(input_t);
377+
const std::size_t vshmem_size =
378+
core::vshmem_size(adj_diff_agent_plan.shared_memory_size, num_tiles);
404379

405-
size_t allocation_sizes[2] = {tmp1, vshmem_size};
406-
void * allocations[2] = {NULL, NULL};
380+
std::size_t allocation_sizes[2] = {tmp1, vshmem_size};
381+
void *allocations[2] = {nullptr, nullptr};
407382

408383
status = core::alias_storage(d_temp_storage,
409384
temp_storage_bytes,
410385
allocations,
411386
allocation_sizes);
412387
CUDA_CUB_RET_IF_FAIL(status);
413388

414-
if (d_temp_storage == NULL)
389+
if (d_temp_storage == nullptr)
415390
{
416391
return status;
417392
}
418393

419-
input_type *first_tile_previous = (input_type *)allocations[0];
420-
char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
421-
422-
init_agent ia(init_plan, num_tiles, stream, "adjacent_difference::init_agent", debug_sync);
423-
ia.launch(first, first_tile_previous, num_tiles, tile_size);
394+
input_t *first_tile_previous = reinterpret_cast<input_t *>(allocations[0]);
395+
char *vshmem_ptr = vshmem_size > 0
396+
? reinterpret_cast<char *>(allocations[1])
397+
: nullptr;
398+
399+
// Launch init kernel:
400+
using init_agent_launcher_t = core::AgentLauncher<init_agent_t>;
401+
init_agent_launcher_t ia{init_agent_plan,
402+
num_tiles,
403+
stream,
404+
"adjacent_difference::init_agent",
405+
debug_sync};
406+
ia.launch_ptx_plan(init_ptx_plan,
407+
first,
408+
first_tile_previous,
409+
num_tiles,
410+
tile_size);
424411
CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
425412

426-
difference_agent da(difference_plan, num_items, stream, vshmem_ptr, "adjacent_difference::difference_agent", debug_sync);
427-
da.launch(first,
428-
first_tile_previous,
429-
result,
430-
binary_op,
431-
num_items);
413+
// Launch adjacent difference kernel:
414+
using adj_diff_agent_launcher_t = core::AgentLauncher<adj_diff_agent_t>;
415+
adj_diff_agent_launcher_t da{adj_diff_agent_plan,
416+
num_items,
417+
stream,
418+
vshmem_ptr,
419+
"adjacent_difference::difference_agent",
420+
debug_sync};
421+
da.launch_ptx_dispatch(typename adj_diff_agent_t::Tunings{},
422+
first,
423+
first_tile_previous,
424+
result,
425+
binary_op,
426+
num_items);
432427
CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
428+
433429
return status;
434430
}
435431

0 commit comments

Comments
 (0)