Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "cub"]
path = dependencies/cub
url = ../cub.git
[submodule "libcudacxx"]
path = dependencies/libcudacxx
url = ../libcudacxx.git
50 changes: 29 additions & 21 deletions cmake/ThrustInstallRules.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,35 @@ configure_file("${Thrust_SOURCE_DIR}/thrust/cmake/thrust-header-search.cmake.in"
install(FILES "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake"
DESTINATION "${install_location}")

# Depending on how Thrust is configured, CUB's CMake scripts may or may not be
# included, so maintain a set of CUB install rules in both projects. By default
# CUB headers are installed alongside Thrust -- this may be disabled by turning
# off THRUST_INSTALL_CUB_HEADERS.
option(THRUST_INSTALL_CUB_HEADERS "Include cub headers when installing." ON)
# Depending on how Thrust is configured, libcudacxx and CUB's CMake scripts may
# or may not be include()'d, so force include their install rules when requested.
# By default, these projects are installed alongside Thrust. This is controlled by
# THRUST_INSTALL_CUB_HEADERS and THRUST_INSTALL_LIBCUDACXX_HEADERS.
option(THRUST_INSTALL_CUB_HEADERS "Include CUB headers when installing." ON)
if (THRUST_INSTALL_CUB_HEADERS)
install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
FILES_MATCHING
PATTERN "*.cuh"
)
# Use a function to limit scope of the CUB_*_DIR vars:
function(_thrust_install_cub_headers)
# Fake these for the logic in CUBInstallRules.cmake:
set(CUB_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/cub/")
set(CUB_BINARY_DIR "${Thrust_BINARY_DIR}/cub-config/")
set(CUB_ENABLE_INSTALL_RULES ON)
set(CUB_IN_THRUST OFF)
include("${Thrust_SOURCE_DIR}/dependencies/cub/cmake/CubInstallRules.cmake")
endfunction()

# Need to configure a file to store THRUST_INSTALL_HEADER_INFIX
install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake/"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub"
PATTERN cub-header-search EXCLUDE
)
set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/cub")
configure_file("${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake.in"
"${Thrust_BINARY_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake"
@ONLY)
install(FILES "${Thrust_BINARY_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake"
DESTINATION "${install_location}")
_thrust_install_cub_headers()
endif()

option(THRUST_INSTALL_LIBCUDACXX_HEADERS "Include libcudacxx headers when installing." ON)
if (THRUST_INSTALL_LIBCUDACXX_HEADERS)
# Use a function to limit scope of the libcudacxx_*_DIR vars:
function(_thrust_install_libcudacxx_headers)
# Fake these for the logic in libcudacxxInstallRules.cmake:
set(libcudacxx_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/libcudacxx/")
set(libcudacxx_BINARY_DIR "${Thrust_BINARY_DIR}/libcudacxx-config/")
set(libcudacxx_ENABLE_INSTALL_RULES ON)
include("${Thrust_SOURCE_DIR}/dependencies/libcudacxx/cmake/libcudacxxInstallRules.cmake")
endfunction()

_thrust_install_libcudacxx_headers()
endif()
2 changes: 1 addition & 1 deletion dependencies/cub
Submodule cub updated 63 files
+1 −6 cub/agent/agent_histogram.cuh
+1 −1 cub/agent/agent_rle.cuh
+1 −2 cub/agent/agent_segment_fixup.cuh
+1 −1 cub/agent/agent_spmv_orig.cuh
+24 −9 cub/agent/agent_sub_warp_merge_sort.cuh
+2 −2 cub/agent/single_pass_scan_operators.cuh
+3 −4 cub/block/block_adjacent_difference.cuh
+2 −2 cub/block/block_discontinuity.cuh
+4 −5 cub/block/block_exchange.cuh
+4 −16 cub/block/block_histogram.cuh
+7 −7 cub/block/block_load.cuh
+8 −10 cub/block/block_radix_rank.cuh
+7 −9 cub/block/block_radix_sort.cuh
+5 −5 cub/block/block_raking_layout.cuh
+5 −5 cub/block/block_reduce.cuh
+5 −5 cub/block/block_scan.cuh
+3 −3 cub/block/block_shuffle.cuh
+7 −7 cub/block/block_store.cuh
+4 −6 cub/block/specializations/block_histogram_sort.cuh
+3 −3 cub/block/specializations/block_reduce_raking.cuh
+5 −5 cub/block/specializations/block_reduce_raking_commutative_only.cuh
+3 −3 cub/block/specializations/block_reduce_warp_reductions.cuh
+3 −3 cub/block/specializations/block_scan_raking.cuh
+4 −4 cub/block/specializations/block_scan_warp_scans.cuh
+4 −4 cub/block/specializations/block_scan_warp_scans2.cuh
+4 −4 cub/block/specializations/block_scan_warp_scans3.cuh
+59 −8 cub/cmake/cub-config.cmake
+15 −16 cub/detail/device_synchronize.cuh
+36 −35 cub/device/dispatch/dispatch_histogram.cuh
+35 −18 cub/device/dispatch/dispatch_radix_sort.cuh
+1 −1 cub/device/dispatch/dispatch_reduce.cuh
+22 −29 cub/device/dispatch/dispatch_reduce_by_key.cuh
+23 −28 cub/device/dispatch/dispatch_rle.cuh
+88 −84 cub/device/dispatch/dispatch_segmented_sort.cuh
+24 −28 cub/device/dispatch/dispatch_select_if.cuh
+42 −47 cub/device/dispatch/dispatch_spmv_orig.cuh
+14 −19 cub/device/dispatch/dispatch_three_way_partition.cuh
+48 −50 cub/grid/grid_queue.cuh
+31 −37 cub/iterator/tex_obj_input_iterator.cuh
+5 −18 cub/thread/thread_load.cuh
+4 −12 cub/thread/thread_store.cuh
+41 −49 cub/util_arch.cuh
+96 −67 cub/util_debug.cuh
+109 −138 cub/util_device.cuh
+4 −6 cub/util_ptx.cuh
+0 −15 cub/util_type.cuh
+5 −5 cub/warp/specializations/warp_reduce_shfl.cuh
+4 −5 cub/warp/specializations/warp_reduce_smem.cuh
+4 −4 cub/warp/specializations/warp_scan_shfl.cuh
+3 −4 cub/warp/specializations/warp_scan_smem.cuh
+5 −6 cub/warp/warp_exchange.cuh
+5 −6 cub/warp/warp_load.cuh
+7 −7 cub/warp/warp_merge_sort.cuh
+6 −6 cub/warp/warp_reduce.cuh
+5 −5 cub/warp/warp_scan.cuh
+5 −6 cub/warp/warp_store.cuh
+3 −23 examples/block/example_block_reduce_dyn_smem.cu
+24 −6 test/test_device_merge_sort.cu
+53 −55 test/test_device_segmented_sort.cu
+4 −8 test/test_iterator.cu
+126 −140 test/test_util.h
+1 −2 test/test_warp_mask.cu
+6 −6 test/test_warp_reduce.cu
1 change: 1 addition & 0 deletions dependencies/libcudacxx
Submodule libcudacxx added at 05d48a
25 changes: 14 additions & 11 deletions testing/allocator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
#include <thrust/detail/config.h>
#include <thrust/device_malloc_allocator.h>
#include <thrust/system/cpp/vector.h>

#include <nv/target>

#include <memory>

template <typename T>
Expand Down Expand Up @@ -60,9 +63,12 @@ DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomCopyConstruct);
template <typename T>
struct my_allocator_with_custom_destroy
{
typedef T value_type;
typedef T & reference;
typedef const T & const_reference;
// This is only used with thrust::cpp::vector:
using system_type = thrust::cpp::tag;

using value_type = T;
using reference = T &;
using const_reference = const T &;

static bool g_state;

Expand All @@ -80,9 +86,7 @@ struct my_allocator_with_custom_destroy
__host__ __device__
void destroy(T *)
{
#if !__CUDA_ARCH__
g_state = true;
#endif
NV_IF_TARGET(NV_IS_HOST, (g_state = true;));
}

value_type *allocate(std::ptrdiff_t n)
Expand Down Expand Up @@ -119,12 +123,14 @@ bool my_allocator_with_custom_destroy<T>::g_state = false;
template <typename T>
void TestAllocatorCustomDestroy(size_t n)
{
my_allocator_with_custom_destroy<T>::g_state = false;

{
thrust::cpp::vector<T, my_allocator_with_custom_destroy<T> > vec(n);
} // destroy everything

if (0 < n)
ASSERT_EQUAL(true, my_allocator_with_custom_destroy<T>::g_state);
// state should only be true when there are values to destroy:
ASSERT_EQUAL(n > 0, my_allocator_with_custom_destroy<T>::g_state);
}
DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomDestroy);

Expand Down Expand Up @@ -203,7 +209,6 @@ void TestAllocatorTraitsRebind()
}
DECLARE_UNITTEST(TestAllocatorTraitsRebind);

#if THRUST_CPP_DIALECT >= 2011
void TestAllocatorTraitsRebindCpp11()
{
ASSERT_EQUAL(
Expand Down Expand Up @@ -251,5 +256,3 @@ void TestAllocatorTraitsRebindCpp11()
);
}
DECLARE_UNITTEST(TestAllocatorTraitsRebindCpp11);
#endif // C++11

22 changes: 6 additions & 16 deletions testing/cuda/pair_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,11 @@
#include <thrust/execution_policy.h>


template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
template<typename ExecutionPolicy, typename Iterator>
__global__
void stable_sort_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 is_supported)
void stable_sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::stable_sort(exec, first, last);
#else
*is_supported = false;
#endif
}


Expand Down Expand Up @@ -43,19 +38,14 @@ void TestPairStableSortDevice(ExecutionPolicy exec)

thrust::device_vector<P> d_pairs = h_pairs;

thrust::device_vector<bool> is_supported(1);

stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), is_supported.begin());
stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end());
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);

if(is_supported[0])
{
// sort on the host
thrust::stable_sort(h_pairs.begin(), h_pairs.end());
// sort on the host
thrust::stable_sort(h_pairs.begin(), h_pairs.end());

ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
}
ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
};


Expand Down
24 changes: 7 additions & 17 deletions testing/cuda/pair_sort_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,11 @@
#include <thrust/execution_policy.h>


template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
__global__
void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 is_supported)
void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first);
#else
*is_supported = false;
#endif
}


Expand Down Expand Up @@ -51,21 +46,16 @@ void TestPairStableSortByKeyDevice(ExecutionPolicy exec)
thrust::device_vector<P> d_pairs = h_pairs;
thrust::device_vector<int> d_values = h_values;

thrust::device_vector<bool> is_supported(1);

// sort on the device
stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin(), is_supported.begin());
stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin());
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);

if(is_supported[0])
{
// sort on the host
thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());
// sort on the host
thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());

ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
ASSERT_EQUAL(h_values, d_values);
}
ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
ASSERT_EQUAL(h_values, d_values);
};


Expand Down
66 changes: 24 additions & 42 deletions testing/cuda/partition.cu
Original file line number Diff line number Diff line change
Expand Up @@ -286,16 +286,11 @@ void TestPartitionCopyStencilDeviceNoSync()
DECLARE_UNITTEST(TestPartitionCopyStencilDeviceNoSync);


template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2, typename Iterator3>
template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
__global__
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result, Iterator3 is_supported)
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
*result = thrust::stable_partition(exec, first, last, pred);
#else
*is_supported = false;
#endif
}


Expand All @@ -313,24 +308,20 @@ void TestStablePartitionDevice(ExecutionPolicy exec)
data[4] = 2;

thrust::device_vector<iterator> result(1);
thrust::device_vector<bool> is_supported(1);

stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin(), is_supported.begin());

stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin());
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);

if(is_supported[0])
{
thrust::device_vector<T> ref(5);
ref[0] = 2;
ref[1] = 2;
ref[2] = 1;
ref[3] = 1;
ref[4] = 1;
thrust::device_vector<T> ref(5);
ref[0] = 2;
ref[1] = 2;
ref[2] = 1;
ref[3] = 1;
ref[4] = 1;

ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
ASSERT_EQUAL(ref, data);
}
ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
ASSERT_EQUAL(ref, data);
}


Expand All @@ -355,16 +346,11 @@ void TestStablePartitionDeviceNoSync()
DECLARE_UNITTEST(TestStablePartitionDeviceNoSync);


template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3, typename Iterator4>
template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
__global__
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result, Iterator4 is_supported)
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
*result = thrust::stable_partition(exec, first, last, stencil_first, pred);
#else
*is_supported = false;
#endif
}


Expand All @@ -389,24 +375,20 @@ void TestStablePartitionStencilDevice(ExecutionPolicy exec)
stencil[4] = 2;

thrust::device_vector<iterator> result(1);
thrust::device_vector<bool> is_supported(1);

stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin(), is_supported.begin());

stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin());
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);

if(is_supported[0])
{
thrust::device_vector<T> ref(5);
ref[0] = 1;
ref[1] = 1;
ref[2] = 0;
ref[3] = 0;
ref[4] = 0;
thrust::device_vector<T> ref(5);
ref[0] = 1;
ref[1] = 1;
ref[2] = 0;
ref[3] = 0;
ref[4] = 0;

ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
ASSERT_EQUAL(ref, data);
}
ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
ASSERT_EQUAL(ref, data);
}


Expand Down
24 changes: 7 additions & 17 deletions testing/cuda/sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,11 @@
#include <thrust/execution_policy.h>


template<typename ExecutionPolicy, typename Iterator, typename Compare, typename Iterator2>
template<typename ExecutionPolicy, typename Iterator, typename Compare>
__global__
void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp, Iterator2 is_supported)
void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::sort(exec, first, last, comp);
#else
*is_supported = false;
#endif
}


Expand All @@ -34,19 +29,14 @@ void TestComparisonSortDevice(ExecutionPolicy exec, const size_t n, Compare comp
thrust::host_vector<T> h_data = unittest::random_integers<T>(n);
thrust::device_vector<T> d_data = h_data;

thrust::device_vector<bool> is_supported(1);

sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp, is_supported.begin());
sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp);
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);


if(is_supported[0])
{
thrust::sort(h_data.begin(), h_data.end(), comp);

ASSERT_EQUAL(h_data, d_data);
}
thrust::sort(h_data.begin(), h_data.end(), comp);

ASSERT_EQUAL(h_data, d_data);
};


Expand Down Expand Up @@ -163,7 +153,7 @@ void TestComparisonSortCudaStreams()
cudaStreamSynchronize(s);

ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end(), my_less<int>()));

cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestComparisonSortCudaStreams);
Expand Down
Loading