3333#include < thrust/detail/cstdint.h>
3434#include < thrust/detail/temporary_array.h>
3535#include < thrust/system/cuda/detail/util.h>
36- #include < cub/device/device_select.cuh>
37- #include < cub/block/block_adjacent_difference.cuh>
3836#include < thrust/system/cuda/detail/core/agent_launcher.h>
3937#include < thrust/system/cuda/detail/par_to_seq.h>
4038#include < thrust/system/cuda/detail/dispatch.h>
4341#include < thrust/detail/mpl/math.h>
4442#include < thrust/detail/minmax.h>
4543
44+ #include < cub/block/block_adjacent_difference.cuh>
45+ #include < cub/detail/ptx_dispatch.cuh>
46+ #include < cub/device/device_select.cuh>
4647#include < cub/util_math.cuh>
4748
4849namespace thrust
@@ -70,71 +71,42 @@ namespace __adjacent_difference {
7071 cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT>
7172 struct PtxPolicy
7273 {
73- enum
74- {
75- BLOCK_THREADS = _BLOCK_THREADS,
76- ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
77- ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD
78- };
79-
80- static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
81- static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
82- static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
74+ static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
75+ static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
76+ static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
77+
78+ static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
79+ static constexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
80+ static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
8381 };
8482
8583 template <int INPUT_SIZE, int NOMINAL_4B_ITEMS_PER_THREAD>
8684 struct items_per_thread
8785 {
88- enum
89- {
90- value = (INPUT_SIZE <= 8 )
91- ? NOMINAL_4B_ITEMS_PER_THREAD
92- : mpl::min<
93- int ,
94- NOMINAL_4B_ITEMS_PER_THREAD,
95- mpl::max<int ,
96- 1 ,
97- ((NOMINAL_4B_ITEMS_PER_THREAD * 8 ) +
98- INPUT_SIZE - 1 ) /
99- INPUT_SIZE>::value>::value
100- };
86+ static constexpr int value =
87+ (INPUT_SIZE <= 8 )
88+ ? NOMINAL_4B_ITEMS_PER_THREAD
89+ : mpl::min<int ,
90+ NOMINAL_4B_ITEMS_PER_THREAD,
91+ mpl::max<int ,
92+ 1 ,
93+ ((NOMINAL_4B_ITEMS_PER_THREAD * 8 ) + INPUT_SIZE - 1 ) /
94+ INPUT_SIZE>::value>::value;
10195 };
10296
103- template <class Arch , class T >
104- struct Tuning ;
105-
10697 template <class T >
107- struct Tuning <sm30, T >
98+ struct Tuning350 : cub::detail::ptx_base< 350 >
10899 {
109- enum
110- {
111- INPUT_SIZE = sizeof (T),
112- NOMINAL_4B_ITEMS_PER_THREAD = 7 ,
113- ITEMS_PER_THREAD = items_per_thread<INPUT_SIZE,
114- NOMINAL_4B_ITEMS_PER_THREAD>::value
115- };
116- typedef PtxPolicy<128 ,
117- ITEMS_PER_THREAD,
118- cub::BLOCK_LOAD_WARP_TRANSPOSE,
119- cub::LOAD_DEFAULT,
120- cub::BLOCK_STORE_WARP_TRANSPOSE>
121- type;
122- };
123- template <class T >
124- struct Tuning <sm35, T> : Tuning<sm30,T>
125- {
126- enum
127- {
128- NOMINAL_4B_ITEMS_PER_THREAD = 7 ,
129- ITEMS_PER_THREAD = items_per_thread<Tuning::INPUT_SIZE,
130- NOMINAL_4B_ITEMS_PER_THREAD>::value
131- };
132- typedef PtxPolicy<128 ,
133- ITEMS_PER_THREAD,
134- cub::BLOCK_LOAD_WARP_TRANSPOSE,
135- cub::LOAD_LDG,
136- cub::BLOCK_STORE_WARP_TRANSPOSE>
137- type;
100+ static constexpr int INPUT_SIZE = static_cast <int >(sizeof (T));
101+ static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 7 ;
102+ static constexpr int ITEMS_PER_THREAD =
103+ items_per_thread<INPUT_SIZE, NOMINAL_4B_ITEMS_PER_THREAD>::value;
104+
105+ using policy = PtxPolicy<128 ,
106+ ITEMS_PER_THREAD,
107+ cub::BLOCK_LOAD_WARP_TRANSPOSE,
108+ cub::LOAD_LDG,
109+ cub::BLOCK_STORE_WARP_TRANSPOSE>;
138110 };
139111
140112 template <class InputIt ,
@@ -148,11 +120,12 @@ namespace __adjacent_difference {
148120 // XXX output type must be result of BinaryOp(input_type,input_type);
149121 typedef input_type output_type;
150122
151- template <class Arch >
152- struct PtxPlan : Tuning<Arch,input_type>::type
153- {
154- typedef Tuning<Arch,input_type> tuning;
123+ // List tunings in reverse order:
124+ using Tunings = cub::detail::type_list<Tuning350<input_type>>;
155125
126+ template <class Tuning >
127+ struct PtxPlan : Tuning::policy
128+ {
156129 typedef typename core::LoadIterator<PtxPlan, InputIt>::type LoadIt;
157130 typedef typename core::BlockLoad<PtxPlan, LoadIt>::type BlockLoad;
158131
@@ -163,7 +136,7 @@ namespace __adjacent_difference {
163136 PtxPlan::BLOCK_THREADS,
164137 1 ,
165138 1 ,
166- Arch::ver >
139+ Tuning::ptx_arch >
167140 BlockAdjacentDifference;
168141
169142 union TempStorage
@@ -174,24 +147,18 @@ namespace __adjacent_difference {
174147 }; // union TempStorage
175148 }; // struct PtxPlan
176149
177- typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
178-
179- typedef typename ptx_plan::LoadIt LoadIt;
180- typedef typename ptx_plan::BlockLoad BlockLoad;
181- typedef typename ptx_plan::BlockStore BlockStore;
182- typedef typename ptx_plan::BlockAdjacentDifference BlockAdjacentDifference;
183- typedef typename ptx_plan::TempStorage TempStorage;
184-
185-
186- enum
187- {
188- ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
189- BLOCK_THREADS = ptx_plan::BLOCK_THREADS,
190- ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE,
191- };
192-
150+ template <typename ActivePtxPlan>
193151 struct impl
194152 {
153+ using BlockAdjacentDifference =
154+ typename ActivePtxPlan::BlockAdjacentDifference;
155+ using BlockLoad = typename ActivePtxPlan::BlockLoad;
156+ using BlockStore = typename ActivePtxPlan::BlockStore;
157+ using LoadIt = typename ActivePtxPlan::LoadIt;
158+ using TempStorage = typename ActivePtxPlan::TempStorage;
159+
160+ static constexpr int ITEMS_PER_THREAD = ActivePtxPlan::ITEMS_PER_THREAD;
161+ static constexpr int ITEMS_PER_TILE = ActivePtxPlan::ITEMS_PER_TILE;
195162
196163 // ---------------------------------------------------------------------
197164 // Per-thread fields
@@ -308,7 +275,7 @@ namespace __adjacent_difference {
308275 BinaryOp binary_op_,
309276 Size num_items)
310277 : temp_storage(temp_storage_),
311- load_it (core::make_load_iterator(ptx_plan() , input_it_)),
278+ load_it (core::make_load_iterator(ActivePtxPlan{} , input_it_)),
312279 first_tile_previous(first_tile_previous_),
313280 output_it(result_),
314281 binary_op(binary_op_)
@@ -321,15 +288,19 @@ namespace __adjacent_difference {
321288 // Agent entry point
322289 // ---------------------------------------------------------------------
323290
291+ template <typename ActivePtxPlan>
324292 THRUST_AGENT_ENTRY (InputIt first,
325293 input_type *first_element,
326294 OutputIt result,
327295 BinaryOp binary_op,
328296 Size num_items,
329297 char * shmem)
330298 {
331- TempStorage &storage = *reinterpret_cast <TempStorage *>(shmem);
332- impl (storage, first, first_element, result, binary_op, num_items);
299+ using temp_storage_t = typename ActivePtxPlan::TempStorage;
300+ auto &storage = *reinterpret_cast <temp_storage_t *>(shmem);
301+
302+ using impl_t = impl<ActivePtxPlan>;
303+ impl_t {storage, first, first_element, result, binary_op, num_items};
333304 }
334305 }; // struct AdjacentDifferenceAgent
335306
@@ -338,14 +309,13 @@ namespace __adjacent_difference {
338309 class Size >
339310 struct InitAgent
340311 {
341- template <class Arch >
342312 struct PtxPlan : PtxPolicy<128 > {};
343- typedef core::specialize_plan<PtxPlan> ptx_plan;
344313
345314 // ---------------------------------------------------------------------
346315 // Agent entry point
347316 // ---------------------------------------------------------------------
348317
318+ template <typename /* ActivePtxPlan*/ >
349319 THRUST_AGENT_ENTRY (InputIt first,
350320 OutputIt result,
351321 Size num_tiles,
@@ -373,63 +343,89 @@ namespace __adjacent_difference {
373343 cudaStream_t stream,
374344 bool debug_sync)
375345 {
376- if (num_items == 0 )
377- return cudaSuccess;
346+ cudaError_t status = cudaSuccess;
378347
379- using core::AgentPlan;
380- using core::AgentLauncher;
348+ if (!d_temp_storage)
349+ { // Initialize this for early return.
350+ temp_storage_bytes = 0 ;
351+ }
381352
382- cudaError_t status = cudaSuccess;
353+ if (num_items == 0 )
354+ {
355+ return status;
356+ }
383357
384- typedef AgentLauncher<
385- AdjacentDifferenceAgent<InputIt,
386- OutputIt,
387- Size,
388- BinaryOp> >
389- difference_agent;
358+ // Declare type aliases for agents, etc:
359+ using adj_diff_agent_t =
360+ AdjacentDifferenceAgent<InputIt, OutputIt, Size, BinaryOp>;
390361
391- typedef typename iterator_traits<InputIt>::value_type input_type ;
392- typedef AgentLauncher< InitAgent<InputIt, input_type *, Size> > init_agent ;
362+ using input_t = typename iterator_traits<InputIt>::value_type;
363+ using init_agent_t = InitAgent<InputIt, input_t *, Size>;
393364
394- AgentPlan difference_plan = difference_agent::get_plan (stream);
395- AgentPlan init_plan = init_agent::get_plan ();
365+ // Create PtxPlans and AgentPlans:
366+ const auto init_ptx_plan = typename init_agent_t ::PtxPlan{};
367+ const thrust::cuda_cub::core::AgentPlan init_agent_plan{init_ptx_plan};
396368
369+ const auto adj_diff_agent_plan =
370+ core::AgentPlanFromTunings<adj_diff_agent_t >::get ();
397371
398- Size tile_size = difference_plan.items_per_tile ;
399- Size num_tiles = cub::DivideAndRoundUp (num_items, tile_size);
372+ // Work out shmem requirements:
373+ const Size tile_size = adj_diff_agent_plan.items_per_tile ;
374+ const Size num_tiles = cub::DivideAndRoundUp (num_items, tile_size);
400375
401- size_t tmp1 = num_tiles * sizeof (input_type );
402- size_t vshmem_size = core::vshmem_size (difference_plan. shared_memory_size ,
403- num_tiles);
376+ const std:: size_t tmp1 = num_tiles * sizeof (input_t );
377+ const std:: size_t vshmem_size =
378+ core::vshmem_size (adj_diff_agent_plan. shared_memory_size , num_tiles);
404379
405- size_t allocation_sizes[2 ] = {tmp1, vshmem_size};
406- void * allocations[2 ] = {NULL , NULL };
380+ std:: size_t allocation_sizes[2 ] = {tmp1, vshmem_size};
381+ void *allocations[2 ] = {nullptr , nullptr };
407382
408383 status = core::alias_storage (d_temp_storage,
409384 temp_storage_bytes,
410385 allocations,
411386 allocation_sizes);
412387 CUDA_CUB_RET_IF_FAIL (status);
413388
414- if (d_temp_storage == NULL )
389+ if (d_temp_storage == nullptr )
415390 {
416391 return status;
417392 }
418393
419- input_type *first_tile_previous = (input_type *)allocations[0 ];
420- char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1 ] : NULL ;
421-
422- init_agent ia (init_plan, num_tiles, stream, " adjacent_difference::init_agent" , debug_sync);
423- ia.launch (first, first_tile_previous, num_tiles, tile_size);
394+ input_t *first_tile_previous = reinterpret_cast <input_t *>(allocations[0 ]);
395+ char *vshmem_ptr = vshmem_size > 0
396+ ? reinterpret_cast <char *>(allocations[1 ])
397+ : nullptr ;
398+
399+ // Launch init kernel:
400+ using init_agent_launcher_t = core::AgentLauncher<init_agent_t >;
401+ init_agent_launcher_t ia{init_agent_plan,
402+ num_tiles,
403+ stream,
404+ " adjacent_difference::init_agent" ,
405+ debug_sync};
406+ ia.launch_ptx_plan (init_ptx_plan,
407+ first,
408+ first_tile_previous,
409+ num_tiles,
410+ tile_size);
424411 CUDA_CUB_RET_IF_FAIL (cudaPeekAtLastError ());
425412
426- difference_agent da (difference_plan, num_items, stream, vshmem_ptr, " adjacent_difference::difference_agent" , debug_sync);
427- da.launch (first,
428- first_tile_previous,
429- result,
430- binary_op,
431- num_items);
413+ // Launch adjacent difference kernel:
414+ using adj_diff_agent_launcher_t = core::AgentLauncher<adj_diff_agent_t >;
415+ adj_diff_agent_launcher_t da{adj_diff_agent_plan,
416+ num_items,
417+ stream,
418+ vshmem_ptr,
419+ " adjacent_difference::difference_agent" ,
420+ debug_sync};
421+ da.launch_ptx_dispatch (typename adj_diff_agent_t ::Tunings{},
422+ first,
423+ first_tile_previous,
424+ result,
425+ binary_op,
426+ num_items);
432427 CUDA_CUB_RET_IF_FAIL (cudaPeekAtLastError ());
428+
433429 return status;
434430 }
435431
0 commit comments