Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dependencies/cub
Submodule cub updated 83 files
+14 −1 cmake/CubCudaConfig.cmake
+1 −6 cub/agent/agent_histogram.cuh
+1 −1 cub/agent/agent_rle.cuh
+1 −2 cub/agent/agent_segment_fixup.cuh
+3 −2 cub/agent/agent_spmv_orig.cuh
+24 −9 cub/agent/agent_sub_warp_merge_sort.cuh
+2 −2 cub/agent/single_pass_scan_operators.cuh
+3 −4 cub/block/block_adjacent_difference.cuh
+2 −2 cub/block/block_discontinuity.cuh
+4 −5 cub/block/block_exchange.cuh
+4 −16 cub/block/block_histogram.cuh
+7 −7 cub/block/block_load.cuh
+8 −10 cub/block/block_radix_rank.cuh
+7 −9 cub/block/block_radix_sort.cuh
+5 −5 cub/block/block_raking_layout.cuh
+5 −5 cub/block/block_reduce.cuh
+5 −5 cub/block/block_scan.cuh
+3 −3 cub/block/block_shuffle.cuh
+7 −7 cub/block/block_store.cuh
+4 −6 cub/block/specializations/block_histogram_sort.cuh
+3 −3 cub/block/specializations/block_reduce_raking.cuh
+5 −5 cub/block/specializations/block_reduce_raking_commutative_only.cuh
+3 −3 cub/block/specializations/block_reduce_warp_reductions.cuh
+3 −3 cub/block/specializations/block_scan_raking.cuh
+4 −4 cub/block/specializations/block_scan_warp_scans.cuh
+4 −4 cub/block/specializations/block_scan_warp_scans2.cuh
+4 −4 cub/block/specializations/block_scan_warp_scans3.cuh
+6 −0 cub/cmake/cub-config.cmake
+83 −0 cub/detail/cdp_dispatch.cuh
+74 −0 cub/detail/detect_cuda_runtime.cuh
+92 −0 cub/detail/device_algorithm_dispatch_invoker.cuh
+16 −16 cub/detail/device_synchronize.cuh
+69 −0 cub/detail/kernel_macros.cuh
+94 −0 cub/detail/nv/detail/__preprocessor
+388 −0 cub/detail/nv/detail/__target_macros
+158 −0 cub/detail/nv/target
+174 −0 cub/detail/ptx_dispatch.cuh
+520 −0 cub/detail/ptx_dispatch_impl.cuh
+75 −0 cub/detail/ptx_targets.cuh
+30 −0 cub/detail/target.cuh
+69 −0 cub/detail/type_list.cuh
+79 −0 cub/detail/type_list_impl.cuh
+4 −5 cub/detail/type_traits.cuh
+43 −0 cub/detail/type_wrapper.cuh
+279 −232 cub/device/dispatch/dispatch_histogram.cuh
+53 −67 cub/device/dispatch/dispatch_merge_sort.cuh
+134 −160 cub/device/dispatch/dispatch_radix_sort.cuh
+100 −162 cub/device/dispatch/dispatch_reduce.cuh
+108 −139 cub/device/dispatch/dispatch_reduce_by_key.cuh
+91 −114 cub/device/dispatch/dispatch_rle.cuh
+70 −92 cub/device/dispatch/dispatch_scan.cuh
+62 −87 cub/device/dispatch/dispatch_scan_by_key.cuh
+77 −92 cub/device/dispatch/dispatch_segmented_sort.cuh
+91 −123 cub/device/dispatch/dispatch_select_if.cuh
+162 −221 cub/device/dispatch/dispatch_spmv_orig.cuh
+99 −113 cub/device/dispatch/dispatch_three_way_partition.cuh
+45 −48 cub/grid/grid_queue.cuh
+24 −31 cub/iterator/tex_obj_input_iterator.cuh
+6 −11 cub/iterator/tex_ref_input_iterator.cuh
+5 −18 cub/thread/thread_load.cuh
+4 −12 cub/thread/thread_store.cuh
+18 −76 cub/util_arch.cuh
+92 −66 cub/util_debug.cuh
+104 −217 cub/util_device.cuh
+4 −6 cub/util_ptx.cuh
+0 −15 cub/util_type.cuh
+5 −5 cub/warp/specializations/warp_reduce_shfl.cuh
+4 −5 cub/warp/specializations/warp_reduce_smem.cuh
+4 −4 cub/warp/specializations/warp_scan_shfl.cuh
+3 −4 cub/warp/specializations/warp_scan_smem.cuh
+5 −6 cub/warp/warp_exchange.cuh
+5 −6 cub/warp/warp_load.cuh
+7 −7 cub/warp/warp_merge_sort.cuh
+6 −6 cub/warp/warp_reduce.cuh
+5 −5 cub/warp/warp_scan.cuh
+5 −6 cub/warp/warp_store.cuh
+0 −12 experimental/defunct/example_coo_spmv.cu
+31 −83 experimental/defunct/test_device_seg_reduce.cu
+7 −20 test/test_device_reduce.cu
+10 −8 test/test_device_segmented_sort.cu
+127 −145 test/test_util.h
+1 −2 test/test_warp_mask.cu
+5 −5 test/test_warp_reduce.cu
12 changes: 8 additions & 4 deletions testing/allocator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
#include <thrust/detail/config.h>
#include <thrust/device_malloc_allocator.h>
#include <thrust/system/cpp/vector.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#include <cub/detail/target.cuh>
#endif

#include <memory>

template <typename T>
Expand Down Expand Up @@ -80,7 +85,9 @@ struct my_allocator_with_custom_destroy
__host__ __device__
void destroy(T *)
{
#if !__CUDA_ARCH__
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
NV_IF_TARGET(NV_IS_HOST, (g_state = true;), ());
#else
g_state = true;
#endif
}
Expand Down Expand Up @@ -203,7 +210,6 @@ void TestAllocatorTraitsRebind()
}
DECLARE_UNITTEST(TestAllocatorTraitsRebind);

#if THRUST_CPP_DIALECT >= 2011
void TestAllocatorTraitsRebindCpp11()
{
ASSERT_EQUAL(
Expand Down Expand Up @@ -251,5 +257,3 @@ void TestAllocatorTraitsRebindCpp11()
);
}
DECLARE_UNITTEST(TestAllocatorTraitsRebindCpp11);
#endif // C++11

4 changes: 0 additions & 4 deletions testing/cuda/pair_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,8 @@ template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
__global__
void stable_sort_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 is_supported)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::stable_sort(exec, first, last);
#else
*is_supported = false;
#endif
}


Expand Down
4 changes: 0 additions & 4 deletions testing/cuda/pair_sort_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,8 @@ template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typen
__global__
void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 is_supported)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first);
#else
*is_supported = false;
#endif
}


Expand Down
8 changes: 0 additions & 8 deletions testing/cuda/partition.cu
Original file line number Diff line number Diff line change
Expand Up @@ -290,12 +290,8 @@ template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typen
__global__
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result, Iterator3 is_supported)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
*result = thrust::stable_partition(exec, first, last, pred);
#else
*is_supported = false;
#endif
}


Expand Down Expand Up @@ -359,12 +355,8 @@ template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typen
__global__
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result, Iterator4 is_supported)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
*result = thrust::stable_partition(exec, first, last, stencil_first, pred);
#else
*is_supported = false;
#endif
}


Expand Down
4 changes: 0 additions & 4 deletions testing/cuda/sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,8 @@ template<typename ExecutionPolicy, typename Iterator, typename Compare, typename
__global__
void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp, Iterator2 is_supported)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::sort(exec, first, last, comp);
#else
*is_supported = false;
#endif
}


Expand Down
4 changes: 0 additions & 4 deletions testing/cuda/sort_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,8 @@ template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typen
__global__
void sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Compare comp, Iterator3 is_supported)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::sort_by_key(exec, keys_first, keys_last, values_first, comp);
#else
*is_supported = false;
#endif
}


Expand Down
14 changes: 10 additions & 4 deletions testing/device_delete.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
#include <thrust/device_new.h>
#include <thrust/device_delete.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#include <cub/detail/target.cuh>
#endif

struct Foo
{
__host__ __device__
Expand All @@ -14,10 +18,12 @@ struct Foo
__host__ __device__
~Foo(void)
{
#ifdef __CUDA_ARCH__
// __device__ overload
if(set_me_upon_destruction != 0)
*set_me_upon_destruction = true;
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
NV_IF_TARGET(NV_IS_DEVICE, (
if (set_me_upon_destruction != 0)
{
*set_me_upon_destruction = true;
}), ());
#endif
}

Expand Down
16 changes: 12 additions & 4 deletions testing/uninitialized_copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
#include <thrust/device_malloc_allocator.h>
#include <thrust/iterator/retag.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#include <cub/detail/target.cuh>
#endif

template<typename InputIterator, typename ForwardIterator>
ForwardIterator uninitialized_copy(my_system &system,
Expand Down Expand Up @@ -147,12 +150,17 @@ struct CopyConstructTest
__host__ __device__
CopyConstructTest(const CopyConstructTest &)
{
#if __CUDA_ARCH__
copy_constructed_on_device = true;
copy_constructed_on_host = false;
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
NV_IF_TARGET(NV_IS_DEVICE, (
copy_constructed_on_device = true;
copy_constructed_on_host = false;
), (
copy_constructed_on_device = false;
copy_constructed_on_host = true;
));
#else
copy_constructed_on_device = false;
copy_constructed_on_device = true;
copy_constructed_on_host = true;
#endif
}

Expand Down
14 changes: 11 additions & 3 deletions testing/uninitialized_fill.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
#include <thrust/device_malloc_allocator.h>
#include <thrust/iterator/retag.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#include <cub/detail/target.cuh>
#endif

template<typename ForwardIterator, typename T>
void uninitialized_fill(my_system &system,
Expand Down Expand Up @@ -156,9 +159,14 @@ struct CopyConstructTest
__host__ __device__
CopyConstructTest(const CopyConstructTest &)
{
#if __CUDA_ARCH__
copy_constructed_on_device = true;
copy_constructed_on_host = false;
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
NV_IF_TARGET(NV_IS_DEVICE, (
copy_constructed_on_device = true;
copy_constructed_on_host = false;
), (
copy_constructed_on_device = false;
copy_constructed_on_host = true;
));
#else
copy_constructed_on_device = false;
copy_constructed_on_host = true;
Expand Down
1 change: 0 additions & 1 deletion testing/unittest/cuda/testframework.cu
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg
{
std::cout << "--verbose and --concise cannot be used together" << std::endl;
exit(EXIT_FAILURE);
return false;
}

// check error status before doing anything
Expand Down
8 changes: 6 additions & 2 deletions testing/unittest/runtime_static_assert.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ namespace unittest

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA

#include <cub/detail/target.cuh>

#define ASSERT_STATIC_ASSERT(X) \
{ \
bool triggered = false; \
Expand Down Expand Up @@ -86,8 +88,10 @@ namespace unittest
{
static_assert_exception ex(filename, lineno);

#ifdef __CUDA_ARCH__
*detail::device_exception = ex;
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
NV_IF_TARGET(NV_IS_DEVICE,
(*detail::device_exception = ex;),
(throw ex;));
#else
throw ex;
#endif
Expand Down
21 changes: 18 additions & 3 deletions thrust/detail/allocator/no_throw_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@

#include <thrust/detail/config.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#include <cub/detail/target.cuh>
#endif

THRUST_NAMESPACE_BEGIN
namespace detail
{
Expand All @@ -43,7 +47,20 @@ template<typename BaseAllocator>
__host__ __device__
void deallocate(typename super_t::pointer p, typename super_t::size_type n)
{
#ifndef __CUDA_ARCH__
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
NV_IF_TARGET(NV_IS_HOST, (
try
{
super_t::deallocate(p, n);
} // end try
catch(...)
{
// catch anything
} // end catch
), (
super_t::deallocate(p, n);
));
#else
try
{
super_t::deallocate(p, n);
Expand All @@ -52,8 +69,6 @@ template<typename BaseAllocator>
{
// catch anything
} // end catch
#else
super_t::deallocate(p, n);
#endif
} // end deallocate()

Expand Down
26 changes: 14 additions & 12 deletions thrust/detail/allocator/temporary_allocator.inl
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@
#include <thrust/system/detail/bad_alloc.h>
#include <cassert>

#if (defined(_NVHPC_CUDA) || defined(__CUDA_ARCH__)) && \
THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#include <cub/detail/target.cuh>
#if (defined(_NVHPC_CUDA) || defined(__CUDA_ARCH__))
#include <thrust/system/cuda/detail/terminate.h>
#endif
#endif // NVCC device pass or NVC++
#endif // CUDA

THRUST_NAMESPACE_BEGIN
namespace detail
Expand All @@ -45,15 +47,15 @@ __host__ __device__
// note that we pass cnt to deallocate, not a value derived from result.second
deallocate(result.first, cnt);

if (THRUST_IS_HOST_CODE) {
#if THRUST_INCLUDE_HOST_CODE
throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
#endif
} else {
#if THRUST_INCLUDE_DEVICE_CODE && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
#endif
}
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
NV_IF_TARGET(NV_IS_HOST, (
throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
), ( // NV_IS_DEVICE
thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
));
#else
throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
#endif
} // end if

return result.first;
Expand Down
18 changes: 0 additions & 18 deletions thrust/detail/config/cpp_compatibility.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,21 +64,3 @@
# define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static const
# endif
#endif

#if defined(_NVHPC_CUDA)
# define THRUST_IS_DEVICE_CODE __builtin_is_device_code()
# define THRUST_IS_HOST_CODE (!__builtin_is_device_code())
# define THRUST_INCLUDE_DEVICE_CODE 1
# define THRUST_INCLUDE_HOST_CODE 1
#elif defined(__CUDA_ARCH__)
# define THRUST_IS_DEVICE_CODE 1
# define THRUST_IS_HOST_CODE 0
# define THRUST_INCLUDE_DEVICE_CODE 1
# define THRUST_INCLUDE_HOST_CODE 0
#else
# define THRUST_IS_DEVICE_CODE 0
# define THRUST_IS_HOST_CODE 1
# define THRUST_INCLUDE_DEVICE_CODE 0
# define THRUST_INCLUDE_HOST_CODE 1
#endif

32 changes: 20 additions & 12 deletions thrust/detail/contiguous_storage.inl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
#include <thrust/detail/allocator/destroy_range.h>
#include <thrust/detail/allocator/fill_construct_range.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
#include <cub/detail/target.cuh>
#endif

#include <stdexcept> // for std::runtime_error
#include <utility> // for use of std::swap in the WAR below

Expand Down Expand Up @@ -432,19 +436,23 @@ __host__ __device__
void contiguous_storage<T,Alloc>
::swap_allocators(false_type, Alloc &other)
{
if (THRUST_IS_DEVICE_CODE) {
#if THRUST_INCLUDE_DEVICE_CODE
// allocators must be equal when swapping containers with allocators that propagate on swap
assert(!is_allocator_not_equal(other));
#endif
} else {
#if THRUST_INCLUDE_HOST_CODE
if (is_allocator_not_equal(other))
{
throw allocator_mismatch_on_swap();
}
#endif
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
NV_IF_TARGET(NV_IS_DEVICE, (
// allocators must be equal when swapping containers with allocators that propagate on swap
assert(!is_allocator_not_equal(other));
), (
if (is_allocator_not_equal(other))
{
throw allocator_mismatch_on_swap();
}
));
#else
if (is_allocator_not_equal(other))
{
throw allocator_mismatch_on_swap();
}
#endif

thrust::swap(m_allocator, other);
} // end contiguous_storage::swap_allocators()

Expand Down
Loading