From 95ab4c234fcf039868ad14d5f639e66c58e97fe0 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Wed, 29 Jun 2022 14:30:13 -0400 Subject: [PATCH 1/6] Skip device synchronization on unsupported arches. --- cub/detail/detect_cuda_runtime.cuh | 9 ++++++ cub/detail/device_synchronize.cuh | 18 ++++++++---- cub/util_device.cuh | 47 +++++++++++++++++++++--------- 3 files changed, 56 insertions(+), 18 deletions(-) diff --git a/cub/detail/detect_cuda_runtime.cuh b/cub/detail/detect_cuda_runtime.cuh index 7ac947277d..e8c16c658f 100644 --- a/cub/detail/detect_cuda_runtime.cuh +++ b/cub/detail/detect_cuda_runtime.cuh @@ -94,6 +94,15 @@ namespace detail #endif // CUB_RUNTIME_FUNCTION predefined +#ifdef CUB_RDC_ENABLED +// Detect available version of CDP: +#if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED) +#define CUB_DETAIL_CDPv1 +#else +#define CUB_DETAIL_CDPv2 +#endif +#endif + #endif // Do not document } // namespace detail diff --git a/cub/detail/device_synchronize.cuh b/cub/detail/device_synchronize.cuh index 1a868ff637..9da0a361af 100644 --- a/cub/detail/device_synchronize.cuh +++ b/cub/detail/device_synchronize.cuh @@ -37,18 +37,26 @@ namespace detail CUB_EXEC_CHECK_DISABLE CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() { - cudaError_t result = cudaErrorUnknown; + cudaError_t result = cudaErrorNotSupported; -#if defined(__CUDACC__) && \ - ((__CUDACC_VER_MAJOR__ > 11) || \ - ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6))) + // Device-side sync is only available under CDPv1: +#if defined(CUB_DETAIL_CDPv1) + +#if ((__CUDACC_VER_MAJOR__ > 11) || \ + ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6))) // CUDA >= 11.6 #define CUB_TMP_DEVICE_SYNC_IMPL \ result = __cudaDeviceSynchronizeDeprecationAvoidance(); -#else // CUDA < 11.6 +#else // CUDA < 11.6: #define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize(); #endif +#else // CDPv2 or no CDP: + +#define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */ + +#endif // CDP version + NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), (CUB_TMP_DEVICE_SYNC_IMPL)); diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 0965f3d654..909c2bc5da 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -502,7 +502,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int &sm_version, */ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { - cudaError_t result = cudaErrorUnknown; + cudaError_t result = cudaErrorNotSupported; NV_IF_TARGET(NV_IS_HOST, (result = CubDebug(cudaStreamSynchronize(stream));), @@ -532,21 +532,42 @@ namespace detail CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) { #ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC - (void)stream; - return cudaSuccess; -#else -#if 1 // All valid targets currently support device-side synchronization - _CubLog("%s\n", "Synchronizing..."); - return SyncStream(stream); -#else (void)stream; - _CubLog("%s\n", - "WARNING: Skipping CUB `debug_synchronous` synchronization " - "(unsupported target)."); return cudaSuccess; -#endif -#endif + +#else // CUB_DETAIL_DEBUG_ENABLE_SYNC: + +#define CUB_TMP_SYNC_AVAILABLE \ + _CubLog("%s\n", "Synchronizing..."); \ + return SyncStream(stream) + +#define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \ + (void)stream; \ + _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \ + "device-side sync requires Date: Mon, 15 Aug 2022 12:08:06 -0400 Subject: [PATCH 2/6] Add release notes for 1.17.1. --- CHANGELOG.md | 12 ++++++++++++ README.md | 1 + 2 files changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e860fff341..357748a85d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +# CUB 1.17.1 + +## Summary + +CUB 1.17.1 is a minor bugfix release. + +- NVIDIA/cub#508: Ensure that `temp_storage_bytes` is properly set in + the `AdjacentDifferenceCopy` device algorithms. +- NVIDIA/cub#508: Remove excessive calls to the binary operator given to + the `AdjacentDifferenceCopy` device algorithms. +- Fix device-side debug synchronous behavior in `DeviceSegmentedSort`. + # CUB 1.17.0 ## Summary diff --git a/README.md b/README.md index cc6c7e207d..14aa849f18 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ See the [changelog](CHANGELOG.md) for details about specific releases. | CUB Release | Included In | | ------------------------- | --------------------------------------- | +| 1.17.1 | TBD | | 1.17.0 | TBD | | 1.16.0 | TBD | | 1.15.0 | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6 | From e6dee61b80729dde18d55b326e9f156f84b45e7b Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Mon, 15 Aug 2022 12:42:54 -0400 Subject: [PATCH 3/6] Add CUB 2.0.0 changelog. --- CHANGELOG.md | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 1 + 2 files changed, 155 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 357748a85d..d44c052ef0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,157 @@ +# CUB 2.0.0 + +## Summary + +The CUB 2.0.0 major release adds a dependency on libcu++ and contains several +breaking changes. These include new diagnostics when inspecting device-only +lambdas from the host, an updated method of determining accumulator types for +algorithms like Reduce and Scan, and a compile-time replacement for the +runtime `debug_synchronous` debugging flags. + +This release also includes several new features. `DeviceHistogram` now +supports `__half` and better handles various edge cases. `WarpReduce` now +performs correctly when restricted to a single-thread “warp”, and will use +the `__reduce_add_sync` accelerated intrinsic (introduced with Ampere) when +appropriate. `DeviceRadixSort` learned to handle the case +where `begin_bit == end_bit`. + +Several algorithms also have updated documentation, with a particular focus on +clarifying which operations can and cannot be performed in-place. + +## Breaking Changes + +- NVIDIA/cub#448 Add libcu++ dependency (v1.8.0+). +- NVIDIA/cub#448: The following macros are no longer defined by default. They + can be re-enabled by defining `CUB_PROVIDE_LEGACY_ARCH_MACROS`. These will be + completely removed in a future release. + - `CUB_IS_HOST_CODE`: Replace with `NV_IF_TARGET`. + - `CUB_IS_DEVICE_CODE`: Replace with `NV_IF_TARGET`. + - `CUB_INCLUDE_HOST_CODE`: Replace with `NV_IF_TARGET`. + - `CUB_INCLUDE_DEVICE_CODE`: Replace with `NV_IF_TARGET`. +- NVIDIA/cub#486: CUB’s CUDA Runtime support macros have been updated to + support `NV_IF_TARGET`. They are now defined consistently across all + host/device compilation passes. This should not affect most usages of these + macros, but may require changes for some edge cases. + - `CUB_RUNTIME_FUNCTION`: Execution space annotations for functions that + invoke CUDA Runtime APIs. + - Old behavior: + - RDC enabled: Defined to `__host__ __device__` + - RDC not enabled: + - NVCC host pass: Defined to `__host__ __device__` + - NVCC device pass: Defined to `__host__` + - New behavior: + - RDC enabled: Defined to `__host__ __device__` + - RDC not enabled: Defined to `__host__` + - `CUB_RUNTIME_ENABLED`: No change in behavior, but no longer used in CUB. + Provided for legacy support only. Legacy behavior: + - RDC enabled: Macro is defined. + - RDC not enabled: + - NVCC host pass: Macro is defined. + - NVCC device pass: Macro is not defined. + - `CUB_RDC_ENABLED`: New macro, may be combined with `NV_IF_TARGET` to replace + most usages of `CUB_RUNTIME_ENABLED`. Behavior: + - RDC enabled: Macro is defined. + - RDC not enabled: Macro is not defined. +- NVIDIA/cub#509: A compile-time error is now emitted when a `__device__`-only + lambda’s return type is queried from host code (requires libcu++ ≥ 1.9.0). + - Due to limitations in the CUDA programming model, the result of this query + is unreliable, and will silently return an incorrect result. This leads to + difficult to debug errors. + - When using libcu++ 1.9.0, an error will be emitted with information about + work-arounds: + - Use a named function object with a `__device__`-only implementation + of `operator()`. + - Use a `__host__ __device__` lambda. + - Use `cuda::proclaim_return_type` (Added in libcu++ 1.9.0) +- NVIDIA/cub#509: Use the result type of the binary reduction operator for + accumulating intermediate results in the `DeviceReduce` algorithm, following + guidance from http://wg21.link/P2322R6. + - This change requires host-side introspection of the binary operator’s + signature, and device-only extended lambda functions can no longer be used. + - In addition to the behavioral changes, the interfaces for + the `Dispatch*Reduce` layer have changed: + - `DispatchReduce`: + - Now accepts accumulator type as last parameter. + - Now accepts initializer type instead of output iterator value type. + - Constructor now accepts `init` as initial type instead of output + iterator value type. + - `DispatchSegmentedReduce`: + - Accepts accumulator type as last parameter. + - Accepts initializer type instead of output iterator value type. + - Thread operators now accept parameters using different types: `Equality` + , `Inequality`, `InequalityWrapper`, `Sum`, `Difference`, `Division`, `Max` + , `ArgMax`, `Min`, `ArgMin`. + - `ThreadReduce` now accepts accumulator type and uses a different type + for `prefix`. +- NVIDIA/cub#511: Use the result type of the binary operator for accumulating + intermediate results in the `DeviceScan`, `DeviceScanByKey`, + and `DeviceReduceByKey` algorithms, following guidance + from http://wg21.link/P2322R6. + - This change requires host-side introspection of the binary operator’s + signature, and device-only extended lambda functions can no longer be used. + - In addition to the behavioral changes, the interfaces for the `Dispatch` + layer have changed: + - `DispatchScan`now accepts accumulator type as a template parameter. + - `DispatchScanByKey`now accepts accumulator type as a template parameter. + - `DispatchReduceByKey`now accepts accumulator type as the last template + parameter. +- NVIDIA/cub#527: Deprecate the `debug_synchronous` flags on device algorithms. + - This flag no longer has any effect. Define `CUB_DEBUG_SYNC` during + compilation to enable these checks. + - Moving this option from run-time to compile-time avoids the compilation + overhead of unused debugging paths in production code. + +## New Features + +- NVIDIA/cub#514: Support `__half` in `DeviceHistogram`. +- NVIDIA/cub#516: Add support for single-threaded invocations of `WarpReduce`. +- NVIDIA/cub#516: Use `__reduce_add_sync` hardware acceleration for `WarpReduce` + on supported architectures. + +## Bug Fixes + +- NVIDIA/cub#481: Fix the device-wide radix sort implementations to simply copy + the input to the output when `begin_bit == end_bit`. +- NVIDIA/cub#487: Fix `DeviceHistogram::Even` for a variety of edge cases: + - Bin ids are now correctly computed when mixing different types for `SampleT` + and `LevelT`. + - Bin ids are now correctly computed when `LevelT` is an integral type and the + number of levels does not evenly divide the level range. +- NVIDIA/cub#508: Ensure that `temp_storage_bytes` is properly set in + the `AdjacentDifferenceCopy` device algorithms. +- NVIDIA/cub#508: Remove excessive calls to the binary operator given to + the `AdjacentDifferenceCopy` device algorithms. +- NVIDIA/cub#533: Fix debugging utilities when RDC is disabled. + +## Other Enhancements + +- NVIDIA/cub#448: Removed special case code for unsupported CUDA architectures. +- NVIDIA/cub#448: Replace several usages of `__CUDA_ARCH__` with `` + to handle host/device code divergence. +- NVIDIA/cub#448: Mark unused PTX arch parameters as legacy. +- NVIDIA/cub#476: Enabled additional debug logging for the onesweep radix sort + implementation. Thanks to @canonizer for this contribution. +- NVIDIA/cub#480: Add `CUB_DISABLE_BF16_SUPPORT` to avoid including + the `cuda_bf16.h` header or using the `__nv_bfloat16` type. +- NVIDIA/cub#486: Add debug log messages for post-kernel debug synchronizations. +- NVIDIA/cub#490: Clarify documentation for in-place usage of `DeviceScan` + algorithms. +- NVIDIA/cub#494: Clarify documentation for in-place usage of `DeviceHistogram` + algorithms. +- NVIDIA/cub#495: Clarify documentation for in-place usage of `DevicePartition` + algorithms. +- NVIDIA/cub#499: Clarify documentation for in-place usage of `Device*Sort` + algorithms. +- NVIDIA/cub#500: Clarify documentation for in-place usage of `DeviceReduce` + algorithms. +- NVIDIA/cub#501: Clarify documentation for in-place usage + of `DeviceRunLengthEncode` algorithms. +- NVIDIA/cub#503: Clarify documentation for in-place usage of `DeviceSelect` + algorithms. +- NVIDIA/cub#518: Fix typo in `WarpMergeSort` documentation. +- NVIDIA/cub#519: Clarify segmented sort documentation regarding the handling of + elements that are not included in any segment. + # CUB 1.17.1 ## Summary diff --git a/README.md b/README.md index 14aa849f18..2ce2fc7b0a 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ See the [changelog](CHANGELOG.md) for details about specific releases. | CUB Release | Included In | | ------------------------- | --------------------------------------- | +| 2.0.0 | TBD | | 1.17.1 | TBD | | 1.17.0 | TBD | | 1.16.0 | TBD | From dfb3a472394d76618a523def293645c9a1a33588 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 16 Aug 2022 12:02:27 -0400 Subject: [PATCH 4/6] Bump version to 2.0.1 for CTK12 --- cub/version.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/version.cuh b/cub/version.cuh index ee0f42b68e..eeb92bdba0 100644 --- a/cub/version.cuh +++ b/cub/version.cuh @@ -43,7 +43,7 @@ * CUB_VERSION / 100 % 1000 is the minor version. * CUB_VERSION / 100000 is the major version. */ -#define CUB_VERSION 200000 +#define CUB_VERSION 200001 /*! \def CUB_MAJOR_VERSION * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the From ae6fc1d42f0cadabae96983e792001a7228662db Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 16 Aug 2022 16:00:43 -0400 Subject: [PATCH 5/6] Add sm90 option to CMake builds. --- cmake/CubCudaConfig.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CubCudaConfig.cmake b/cmake/CubCudaConfig.cmake index 8c96b083ca..3cbe0771ca 100644 --- a/cmake/CubCudaConfig.cmake +++ b/cmake/CubCudaConfig.cmake @@ -4,7 +4,7 @@ enable_language(CUDA) # Architecture options: # -set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80 86) +set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80 86 90) set(arch_message "CUB: Explicitly enabled compute architectures:") # Thrust sets up the architecture flags in CMAKE_CUDA_FLAGS already. Just From f31d7123b4039c418269c2df207f23abf919b20b Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Thu, 8 Sep 2022 20:23:21 +0400 Subject: [PATCH 6/6] Merge pull request #571 from senior-zero/main-fix/github/char_tests Don't use char in tests (cherry picked from commit f403031cb0d22742bbe7e95fac22af93b6660637) --- test/test_block_load_store.cu | 2 +- test/test_block_radix_sort.cu | 4 ++-- test/test_block_reduce.cu | 2 +- test/test_device_radix_sort.cu | 2 +- test/test_device_reduce.cu | 4 ++-- test/test_device_reduce_by_key.cu | 4 ++-- test/test_device_run_length_encode.cu | 2 +- test/test_device_scan.cu | 2 +- test/test_device_scan_by_key.cu | 2 +- test/test_device_spmv.cu | 2 +- test/test_iterator.cu | 2 +- test/test_iterator_deprecated.cu | 2 +- test/test_util.h | 2 +- test/test_warp_reduce.cu | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/test/test_block_load_store.cu b/test/test_block_load_store.cu index 49bb5ec0f0..b262b25122 100644 --- a/test/test_block_load_store.cu +++ b/test/test_block_load_store.cu @@ -517,7 +517,7 @@ int main(int argc, char** argv) // Compile/run thorough tests #if TEST_VALUE_TYPES == 0 - TestThreads(2, 0.8f); + TestThreads(2, 0.8f); TestThreads(2, 0.8f); TestThreads(2, 0.8f); #elif TEST_VALUE_TYPES == 1 diff --git a/test/test_block_radix_sort.cu b/test/test_block_radix_sort.cu index 440d1419d4..923087ccb5 100644 --- a/test/test_block_radix_sort.cu +++ b/test/test_block_radix_sort.cu @@ -531,7 +531,7 @@ template < void TestKeysAndPairs() { // Test pairs sorting with only 4-byte configs - Test(); // With small-values + Test(); // With small-values Test(); // With same-values Test(); // With large values } @@ -560,7 +560,7 @@ void Test() #elif TEST_VALUE_TYPES == 2 // Test signed and fp types with paired values - TestKeysAndPairs(); + TestKeysAndPairs(); TestKeysAndPairs(); TestKeysAndPairs(); #elif TEST_VALUE_TYPES == 3 diff --git a/test/test_block_reduce.cu b/test/test_block_reduce.cu index 2850ce56f2..423b8593d8 100644 --- a/test/test_block_reduce.cu +++ b/test/test_block_reduce.cu @@ -734,7 +734,7 @@ int main(int argc, char** argv) // primitives #if TEST_VALUE_TYPES == 0 - Test(); + Test(); Test(); Test(); Test(); diff --git a/test/test_device_radix_sort.cu b/test/test_device_radix_sort.cu index 0c40fceeae..a6993fccb6 100644 --- a/test/test_device_radix_sort.cu +++ b/test/test_device_radix_sort.cu @@ -1946,7 +1946,7 @@ int main(int argc, char** argv) // Compile/run thorough tests #if TEST_KEY_BYTES == 1 - TestGen (num_items, num_segments); + TestGen (num_items, num_segments); #ifdef TEST_EXTENDED_KEY_TYPES TestGen (num_items, num_segments); diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu index 73ff4f26f5..1042cf2a0c 100644 --- a/test/test_device_reduce.cu +++ b/test/test_device_reduce.cu @@ -1473,9 +1473,9 @@ int main(int argc, char** argv) // %PARAM% TEST_TYPES types 0:1:2:3 #if TEST_TYPES == 0 - TestType(max_items, max_segments); + TestType(max_items, max_segments); TestType(max_items, max_segments); - TestType(max_items, max_segments); + TestType(max_items, max_segments); #elif TEST_TYPES == 1 TestType(max_items, max_segments); TestType(max_items, max_segments); diff --git a/test/test_device_reduce_by_key.cu b/test/test_device_reduce_by_key.cu index 98a460d669..797b038231 100644 --- a/test/test_device_reduce_by_key.cu +++ b/test/test_device_reduce_by_key.cu @@ -719,7 +719,7 @@ int main(int argc, char** argv) // %PARAM% TEST_CDP cdp 0:1 // Test different input types - TestOp(num_items); + TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); @@ -735,7 +735,7 @@ int main(int argc, char** argv) TestOp(num_items); TestOp(num_items); - TestOp(num_items); + TestOp(num_items); TestOp(num_items); TestOp(num_items); TestOp(num_items); diff --git a/test/test_device_run_length_encode.cu b/test/test_device_run_length_encode.cu index 2e8d350822..c16d559389 100644 --- a/test/test_device_run_length_encode.cu +++ b/test/test_device_run_length_encode.cu @@ -749,7 +749,7 @@ int main(int argc, char** argv) // %PARAM% TEST_CDP cdp 0:1 // Test different input types - TestSize(num_items); + TestSize(num_items); TestSize(num_items); TestSize(num_items); TestSize(num_items); diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu index 6197bda2db..6f876f3999 100644 --- a/test/test_device_scan.cu +++ b/test/test_device_scan.cu @@ -1221,7 +1221,7 @@ int main(int argc, char** argv) // Test same input+output data types TestSize(num_items, (unsigned char)0, (unsigned char)99); - TestSize(num_items, (char)0, (char)99); + TestSize(num_items, (char)0, (char)99); TestSize(num_items, (unsigned short)0, (unsigned short)99); TestSize(num_items, (unsigned int)0, (unsigned int)99); TestSize(num_items, diff --git a/test/test_device_scan_by_key.cu b/test/test_device_scan_by_key.cu index f3bf841763..b56975f11c 100644 --- a/test/test_device_scan_by_key.cu +++ b/test/test_device_scan_by_key.cu @@ -1036,7 +1036,7 @@ int main(int argc, char** argv) // Test same input+output data types TestSize(num_items, (unsigned char)0, (unsigned char)99); - TestSize(num_items, (char)0, (char)99); + TestSize(num_items, (char)0, (char)99); #elif TEST_VALUE_TYPES == 1 diff --git a/test/test_device_spmv.cu b/test/test_device_spmv.cu index 94b9fbfac6..2e8187417d 100644 --- a/test/test_device_spmv.cu +++ b/test/test_device_spmv.cu @@ -566,7 +566,7 @@ void test_types() { test_type(); test_type(); - test_type(); + test_type(); test_type(); test_type(); } diff --git a/test/test_iterator.cu b/test/test_iterator.cu index f042816a37..32e5b66973 100644 --- a/test/test_iterator.cu +++ b/test/test_iterator.cu @@ -500,7 +500,7 @@ int main(int argc, char** argv) CubDebugExit(PtxVersion(ptx_version)); // Evaluate different data types - Test(); + Test(); Test(); Test(); Test(); diff --git a/test/test_iterator_deprecated.cu b/test/test_iterator_deprecated.cu index b42febe51b..862af1d2c5 100644 --- a/test/test_iterator_deprecated.cu +++ b/test/test_iterator_deprecated.cu @@ -264,7 +264,7 @@ int main(int argc, char** argv) CubDebugExit(args.DeviceInit()); // Evaluate different data types - Test(); + Test(); Test(); Test(); Test(); diff --git a/test/test_util.h b/test/test_util.h index 8d00e435a2..db3ce8806e 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -1134,7 +1134,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< /** * Define for types */ -CUB_VEC_OVERLOAD(char, char) +CUB_VEC_OVERLOAD(char, signed char) CUB_VEC_OVERLOAD(short, short) CUB_VEC_OVERLOAD(int, int) CUB_VEC_OVERLOAD(long, long) diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu index ea2e665183..4b422a08fe 100644 --- a/test/test_warp_reduce.cu +++ b/test/test_warp_reduce.cu @@ -721,7 +721,7 @@ template < void Test(GenMode gen_mode) { // primitive - Test( gen_mode, Sum()); + Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum()); Test( gen_mode, Sum());