From 95ab4c234fcf039868ad14d5f639e66c58e97fe0 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Wed, 29 Jun 2022 14:30:13 -0400
Subject: [PATCH 1/6] Skip device synchronization on unsupported arches.

---
 cub/detail/detect_cuda_runtime.cuh |  9 ++++++
 cub/detail/device_synchronize.cuh  | 18 ++++++++----
 cub/util_device.cuh                | 47 +++++++++++++++++++++---------
 3 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/cub/detail/detect_cuda_runtime.cuh b/cub/detail/detect_cuda_runtime.cuh
index 7ac947277d..e8c16c658f 100644
--- a/cub/detail/detect_cuda_runtime.cuh
+++ b/cub/detail/detect_cuda_runtime.cuh
@@ -94,6 +94,15 @@ namespace detail
 
 #endif // CUB_RUNTIME_FUNCTION predefined
 
+#ifdef CUB_RDC_ENABLED
+// Detect available version of CDP:
+#if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
+#define CUB_DETAIL_CDPv1
+#else
+#define CUB_DETAIL_CDPv2
+#endif
+#endif
+
 #endif // Do not document
 
 } // namespace detail
diff --git a/cub/detail/device_synchronize.cuh b/cub/detail/device_synchronize.cuh
index 1a868ff637..9da0a361af 100644
--- a/cub/detail/device_synchronize.cuh
+++ b/cub/detail/device_synchronize.cuh
@@ -37,18 +37,26 @@ namespace detail
 CUB_EXEC_CHECK_DISABLE
 CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
 {
-  cudaError_t result = cudaErrorUnknown;
+  cudaError_t result = cudaErrorNotSupported;
 
-#if defined(__CUDACC__) &&                                                     \
-  ((__CUDACC_VER_MAJOR__ > 11) ||                                              \
-   ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
+  // Device-side sync is only available under CDPv1:
+#if defined(CUB_DETAIL_CDPv1)
+
+#if ((__CUDACC_VER_MAJOR__ > 11) ||                                            \
+     ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
   // CUDA >= 11.6
 #define CUB_TMP_DEVICE_SYNC_IMPL                                               \
   result = __cudaDeviceSynchronizeDeprecationAvoidance();
-#else // CUDA < 11.6
+#else // CUDA < 11.6:
 #define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize();
 #endif
 
+#else // CDPv2 or no CDP:
+
+#define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */
+
+#endif // CDP version
+
   NV_IF_TARGET(NV_IS_HOST,
                (result = cudaDeviceSynchronize();),
                (CUB_TMP_DEVICE_SYNC_IMPL));
diff --git a/cub/util_device.cuh b/cub/util_device.cuh
index 0965f3d654..909c2bc5da 100644
--- a/cub/util_device.cuh
+++ b/cub/util_device.cuh
@@ -502,7 +502,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int &sm_version,
  */
 CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream)
 {
-  cudaError_t result = cudaErrorUnknown;
+  cudaError_t result = cudaErrorNotSupported;
 
   NV_IF_TARGET(NV_IS_HOST,
                (result = CubDebug(cudaStreamSynchronize(stream));),
@@ -532,21 +532,42 @@ namespace detail
 CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream)
 {
 #ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC
-  (void)stream;
 
-  return cudaSuccess;
-#else
-#if 1 // All valid targets currently support device-side synchronization
-  _CubLog("%s\n", "Synchronizing...");
-  return SyncStream(stream);
-#else
   (void)stream;
-  _CubLog("%s\n",
-          "WARNING: Skipping CUB `debug_synchronous` synchronization "
-          "(unsupported target).");
   return cudaSuccess;
-#endif
-#endif
+
+#else // CUB_DETAIL_DEBUG_ENABLE_SYNC:
+
+#define CUB_TMP_SYNC_AVAILABLE                                                 \
+  _CubLog("%s\n", "Synchronizing...");                                         \
+  return SyncStream(stream)
+
+#define CUB_TMP_DEVICE_SYNC_UNAVAILABLE                                        \
+  (void)stream;                                                                \
+  _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \
+          "device-side sync requires <sm_90, RDC, and CDPv1");                 \
+  return cudaSuccess
+
+#ifdef CUB_DETAIL_CDPv1
+
+  // Can sync everywhere but SM_90+
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (CUB_TMP_DEVICE_SYNC_UNAVAILABLE;),
+               (CUB_TMP_SYNC_AVAILABLE;));
+
+#else // CDPv2 or no CDP:
+
+  // Can only sync on host
+  NV_IF_TARGET(NV_IS_HOST,
+               (CUB_TMP_SYNC_AVAILABLE;),
+               (CUB_TMP_DEVICE_SYNC_UNAVAILABLE;));
+
+#endif // CDP version
+
+#undef CUB_TMP_DEVICE_SYNC_UNAVAILABLE
+#undef CUB_TMP_SYNC_AVAILABLE
+
+#endif // CUB_DETAIL_DEBUG_ENABLE_SYNC
 }
 
 } // namespace detail

From 41710fe383483a8ce8cb38c244a47a34e9a94cd9 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Mon, 15 Aug 2022 12:08:06 -0400
Subject: [PATCH 2/6] Add release notes for 1.17.1.

---
 CHANGELOG.md | 12 ++++++++++++
 README.md    |  1 +
 2 files changed, 13 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e860fff341..357748a85d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,15 @@
+# CUB 1.17.1
+
+## Summary
+
+CUB 1.17.1 is a minor bugfix release.
+
+- NVIDIA/cub#508: Ensure that `temp_storage_bytes` is properly set in
+  the `AdjacentDifferenceCopy` device algorithms.
+- NVIDIA/cub#508: Remove excessive calls to the binary operator given to
+  the `AdjacentDifferenceCopy` device algorithms.
+- Fix device-side debug synchronous behavior in `DeviceSegmentedSort`.
+
 # CUB 1.17.0
 
 ## Summary
diff --git a/README.md b/README.md
index cc6c7e207d..14aa849f18 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,7 @@ See the [changelog](CHANGELOG.md) for details about specific releases.
 
 | CUB Release               | Included In                             |
 | ------------------------- | --------------------------------------- |
+| 1.17.1                    | TBD                                     |
 | 1.17.0                    | TBD                                     |
 | 1.16.0                    | TBD                                     |
 | 1.15.0                    | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6 |

From e6dee61b80729dde18d55b326e9f156f84b45e7b Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Mon, 15 Aug 2022 12:42:54 -0400
Subject: [PATCH 3/6] Add CUB 2.0.0 changelog.

---
 CHANGELOG.md | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++
 README.md    |   1 +
 2 files changed, 155 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 357748a85d..d44c052ef0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,157 @@
+# CUB 2.0.0
+
+## Summary
+
+The CUB 2.0.0 major release adds a dependency on libcu++ and contains several
+breaking changes. These include new diagnostics when inspecting device-only
+lambdas from the host, an updated method of determining accumulator types for
+algorithms like Reduce and Scan, and a compile-time replacement for the
+runtime `debug_synchronous` debugging flags.
+
+This release also includes several new features. `DeviceHistogram` now
+supports `__half` and better handles various edge cases. `WarpReduce` now
+performs correctly when restricted to a single-thread “warp”, and will use
+the `__reduce_add_sync` accelerated intrinsic (introduced with Ampere) when
+appropriate. `DeviceRadixSort` learned to handle the case
+where `begin_bit == end_bit`.
+
+Several algorithms also have updated documentation, with a particular focus on
+clarifying which operations can and cannot be performed in-place.
+
+## Breaking Changes
+
+- NVIDIA/cub#448 Add libcu++ dependency (v1.8.0+).
+- NVIDIA/cub#448: The following macros are no longer defined by default. They
+  can be re-enabled by defining `CUB_PROVIDE_LEGACY_ARCH_MACROS`. These will be
+  completely removed in a future release.
+  - `CUB_IS_HOST_CODE`: Replace with `NV_IF_TARGET`.
+  - `CUB_IS_DEVICE_CODE`: Replace with `NV_IF_TARGET`.
+  - `CUB_INCLUDE_HOST_CODE`: Replace with `NV_IF_TARGET`.
+  - `CUB_INCLUDE_DEVICE_CODE`: Replace with `NV_IF_TARGET`.
+- NVIDIA/cub#486: CUB’s CUDA Runtime support macros have been updated to
+  support `NV_IF_TARGET`. They are now defined consistently across all
+  host/device compilation passes. This should not affect most usages of these
+  macros, but may require changes for some edge cases.
+  - `CUB_RUNTIME_FUNCTION`: Execution space annotations for functions that
+    invoke CUDA Runtime APIs.
+    - Old behavior:
+      - RDC enabled: Defined to `__host__ __device__`
+      - RDC not enabled:
+        - NVCC host pass: Defined to `__host__ __device__`
+        - NVCC device pass: Defined to `__host__`
+    - New behavior:
+      - RDC enabled: Defined to `__host__ __device__`
+      - RDC not enabled: Defined to `__host__`
+  - `CUB_RUNTIME_ENABLED`: No change in behavior, but no longer used in CUB.
+    Provided for legacy support only. Legacy behavior:
+    - RDC enabled: Macro is defined.
+    - RDC not enabled:
+      - NVCC host pass: Macro is defined.
+      - NVCC device pass: Macro is not defined.
+  - `CUB_RDC_ENABLED`: New macro, may be combined with `NV_IF_TARGET` to replace
+    most usages of `CUB_RUNTIME_ENABLED`. Behavior:
+    - RDC enabled: Macro is defined.
+    - RDC not enabled: Macro is not defined.
+- NVIDIA/cub#509: A compile-time error is now emitted when a `__device__`-only
+  lambda’s return type is queried from host code (requires libcu++ ≥ 1.9.0).
+  - Due to limitations in the CUDA programming model, the result of this query
+    is unreliable, and will silently return an incorrect result. This leads to
+    difficult to debug errors.
+  - When using libcu++ 1.9.0, an error will be emitted with information about
+    work-arounds:
+    - Use a named function object with a `__device__`-only implementation
+      of `operator()`.
+    - Use a `__host__ __device__` lambda.
+    - Use `cuda::proclaim_return_type` (Added in libcu++ 1.9.0)
+- NVIDIA/cub#509: Use the result type of the binary reduction operator for
+  accumulating intermediate results in the `DeviceReduce` algorithm, following
+  guidance from http://wg21.link/P2322R6.
+  - This change requires host-side introspection of the binary operator’s
+    signature, and device-only extended lambda functions can no longer be used.
+  - In addition to the behavioral changes, the interfaces for
+    the `Dispatch*Reduce` layer have changed:
+    - `DispatchReduce`:
+      - Now accepts accumulator type as last parameter.
+      - Now accepts initializer type instead of output iterator value type.
+      - Constructor now accepts `init` as initial type instead of output
+        iterator value type.
+    - `DispatchSegmentedReduce`:
+      - Accepts accumulator type as last parameter.
+      - Accepts initializer type instead of output iterator value type.
+  - Thread operators now accept parameters using different types: `Equality`
+    , `Inequality`, `InequalityWrapper`, `Sum`, `Difference`, `Division`, `Max`
+    , `ArgMax`, `Min`, `ArgMin`.
+  - `ThreadReduce` now accepts accumulator type and uses a different type
+    for `prefix`.
+- NVIDIA/cub#511: Use the result type of the binary operator for accumulating
+  intermediate results in the `DeviceScan`, `DeviceScanByKey`,
+  and `DeviceReduceByKey` algorithms, following guidance
+  from http://wg21.link/P2322R6.
+  - This change requires host-side introspection of the binary operator’s
+    signature, and device-only extended lambda functions can no longer be used.
+  - In addition to the behavioral changes, the interfaces for the `Dispatch`
+    layer have changed:
+    - `DispatchScan`now accepts accumulator type as a template parameter.
+    - `DispatchScanByKey`now accepts accumulator type as a template parameter.
+    - `DispatchReduceByKey`now accepts accumulator type as the last template
+      parameter.
+- NVIDIA/cub#527: Deprecate the `debug_synchronous` flags on device algorithms.
+  - This flag no longer has any effect. Define `CUB_DEBUG_SYNC` during
+    compilation to enable these checks.
+  - Moving this option from run-time to compile-time avoids the compilation
+    overhead of unused debugging paths in production code.
+
+## New Features
+
+- NVIDIA/cub#514: Support `__half` in `DeviceHistogram`.
+- NVIDIA/cub#516: Add support for single-threaded invocations of `WarpReduce`.
+- NVIDIA/cub#516: Use `__reduce_add_sync` hardware acceleration for `WarpReduce`
+  on supported architectures.
+
+## Bug Fixes
+
+- NVIDIA/cub#481: Fix the device-wide radix sort implementations to simply copy
+  the input to the output when `begin_bit == end_bit`.
+- NVIDIA/cub#487: Fix `DeviceHistogram::Even` for a variety of edge cases:
+  - Bin ids are now correctly computed when mixing different types for `SampleT`
+    and `LevelT`.
+  - Bin ids are now correctly computed when `LevelT` is an integral type and the
+    number of levels does not evenly divide the level range.
+- NVIDIA/cub#508: Ensure that `temp_storage_bytes` is properly set in
+  the `AdjacentDifferenceCopy` device algorithms.
+- NVIDIA/cub#508: Remove excessive calls to the binary operator given to
+  the `AdjacentDifferenceCopy` device algorithms.
+- NVIDIA/cub#533: Fix debugging utilities when RDC is disabled.
+
+## Other Enhancements
+
+- NVIDIA/cub#448: Removed special case code for unsupported CUDA architectures.
+- NVIDIA/cub#448: Replace several usages of `__CUDA_ARCH__` with `<nv/target>`
+  to handle host/device code divergence.
+- NVIDIA/cub#448: Mark unused PTX arch parameters as legacy.
+- NVIDIA/cub#476: Enabled additional debug logging for the onesweep radix sort
+  implementation. Thanks to @canonizer for this contribution.
+- NVIDIA/cub#480: Add `CUB_DISABLE_BF16_SUPPORT` to avoid including
+  the `cuda_bf16.h` header or using the `__nv_bfloat16` type.
+- NVIDIA/cub#486: Add debug log messages for post-kernel debug synchronizations.
+- NVIDIA/cub#490: Clarify documentation for in-place usage of `DeviceScan`
+  algorithms.
+- NVIDIA/cub#494: Clarify documentation for in-place usage of `DeviceHistogram`
+  algorithms.
+- NVIDIA/cub#495: Clarify documentation for in-place usage of `DevicePartition`
+  algorithms.
+- NVIDIA/cub#499: Clarify documentation for in-place usage of `Device*Sort`
+  algorithms.
+- NVIDIA/cub#500: Clarify documentation for in-place usage of `DeviceReduce`
+  algorithms.
+- NVIDIA/cub#501: Clarify documentation for in-place usage
+  of `DeviceRunLengthEncode` algorithms.
+- NVIDIA/cub#503: Clarify documentation for in-place usage of `DeviceSelect`
+  algorithms.
+- NVIDIA/cub#518: Fix typo in `WarpMergeSort` documentation.
+- NVIDIA/cub#519: Clarify segmented sort documentation regarding the handling of
+  elements that are not included in any segment.
+
 # CUB 1.17.1
 
 ## Summary
diff --git a/README.md b/README.md
index 14aa849f18..2ce2fc7b0a 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,7 @@ See the [changelog](CHANGELOG.md) for details about specific releases.
 
 | CUB Release               | Included In                             |
 | ------------------------- | --------------------------------------- |
+| 2.0.0                     | TBD                                     |
 | 1.17.1                    | TBD                                     |
 | 1.17.0                    | TBD                                     |
 | 1.16.0                    | TBD                                     |

From dfb3a472394d76618a523def293645c9a1a33588 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Tue, 16 Aug 2022 12:02:27 -0400
Subject: [PATCH 4/6] Bump version to 2.0.1 for CTK12

---
 cub/version.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/version.cuh b/cub/version.cuh
index ee0f42b68e..eeb92bdba0 100644
--- a/cub/version.cuh
+++ b/cub/version.cuh
@@ -43,7 +43,7 @@
  *         <tt>CUB_VERSION / 100 % 1000</tt> is the minor version.
  *         <tt>CUB_VERSION / 100000</tt> is the major version.
  */
-#define CUB_VERSION 200000
+#define CUB_VERSION 200001
 
 /*! \def CUB_MAJOR_VERSION
  *  \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the

From ae6fc1d42f0cadabae96983e792001a7228662db Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Tue, 16 Aug 2022 16:00:43 -0400
Subject: [PATCH 5/6] Add sm90 option to CMake builds.

---
 cmake/CubCudaConfig.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/CubCudaConfig.cmake b/cmake/CubCudaConfig.cmake
index 8c96b083ca..3cbe0771ca 100644
--- a/cmake/CubCudaConfig.cmake
+++ b/cmake/CubCudaConfig.cmake
@@ -4,7 +4,7 @@ enable_language(CUDA)
 # Architecture options:
 #
 
-set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80 86)
+set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80 86 90)
 set(arch_message "CUB: Explicitly enabled compute architectures:")
 
 # Thrust sets up the architecture flags in CMAKE_CUDA_FLAGS already. Just

From f31d7123b4039c418269c2df207f23abf919b20b Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 8 Sep 2022 20:23:21 +0400
Subject: [PATCH 6/6] Merge pull request #571 from
 senior-zero/main-fix/github/char_tests

Don't use char in tests

(cherry picked from commit f403031cb0d22742bbe7e95fac22af93b6660637)
---
 test/test_block_load_store.cu         | 2 +-
 test/test_block_radix_sort.cu         | 4 ++--
 test/test_block_reduce.cu             | 2 +-
 test/test_device_radix_sort.cu        | 2 +-
 test/test_device_reduce.cu            | 4 ++--
 test/test_device_reduce_by_key.cu     | 4 ++--
 test/test_device_run_length_encode.cu | 2 +-
 test/test_device_scan.cu              | 2 +-
 test/test_device_scan_by_key.cu       | 2 +-
 test/test_device_spmv.cu              | 2 +-
 test/test_iterator.cu                 | 2 +-
 test/test_iterator_deprecated.cu      | 2 +-
 test/test_util.h                      | 2 +-
 test/test_warp_reduce.cu              | 2 +-
 14 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/test/test_block_load_store.cu b/test/test_block_load_store.cu
index 49bb5ec0f0..b262b25122 100644
--- a/test/test_block_load_store.cu
+++ b/test/test_block_load_store.cu
@@ -517,7 +517,7 @@ int main(int argc, char** argv)
 
     // Compile/run thorough tests
 #if TEST_VALUE_TYPES == 0
-    TestThreads<char>(2, 0.8f);
+    TestThreads<signed char>(2, 0.8f);
     TestThreads<int>(2, 0.8f);
     TestThreads<long>(2, 0.8f);
 #elif TEST_VALUE_TYPES == 1
diff --git a/test/test_block_radix_sort.cu b/test/test_block_radix_sort.cu
index 440d1419d4..923087ccb5 100644
--- a/test/test_block_radix_sort.cu
+++ b/test/test_block_radix_sort.cu
@@ -531,7 +531,7 @@ template <
 void TestKeysAndPairs()
 {
     // Test pairs sorting with only 4-byte configs
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, char>();        // With small-values
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, signed char>(); // With small-values
     Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, Key>();         // With same-values
     Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, TestFoo>();     // With large values
 }
@@ -560,7 +560,7 @@ void Test()
 
 #elif TEST_VALUE_TYPES == 2
     // Test signed and fp types with paired values
-    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, char>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, signed char>();
     TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, short>();
     TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, int>();
 #elif TEST_VALUE_TYPES == 3
diff --git a/test/test_block_reduce.cu b/test/test_block_reduce.cu
index 2850ce56f2..423b8593d8 100644
--- a/test/test_block_reduce.cu
+++ b/test/test_block_reduce.cu
@@ -734,7 +734,7 @@ int main(int argc, char** argv)
 
     // primitives
 #if TEST_VALUE_TYPES == 0
-    Test<char>();
+    Test<signed char>();
     Test<short>();
     Test<int>();
     Test<long long>();
diff --git a/test/test_device_radix_sort.cu b/test/test_device_radix_sort.cu
index 0c40fceeae..a6993fccb6 100644
--- a/test/test_device_radix_sort.cu
+++ b/test/test_device_radix_sort.cu
@@ -1946,7 +1946,7 @@ int main(int argc, char** argv)
     // Compile/run thorough tests
 #if TEST_KEY_BYTES == 1
 
-    TestGen<char, true>               (num_items, num_segments);
+    TestGen<signed char, true>        (num_items, num_segments);
 
 #ifdef TEST_EXTENDED_KEY_TYPES
     TestGen<bool, false>              (num_items, num_segments);
diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu
index 73ff4f26f5..1042cf2a0c 100644
--- a/test/test_device_reduce.cu
+++ b/test/test_device_reduce.cu
@@ -1473,9 +1473,9 @@ int main(int argc, char** argv)
     // %PARAM% TEST_TYPES types 0:1:2:3
 
 #if TEST_TYPES == 0
-    TestType<char, char>(max_items, max_segments);
+    TestType<signed char, signed char>(max_items, max_segments);
     TestType<unsigned char, unsigned char>(max_items, max_segments);
-    TestType<char, int>(max_items, max_segments);
+    TestType<signed char, int>(max_items, max_segments);
 #elif TEST_TYPES == 1
     TestType<short, short>(max_items, max_segments);
     TestType<int, int>(max_items, max_segments);
diff --git a/test/test_device_reduce_by_key.cu b/test/test_device_reduce_by_key.cu
index 98a460d669..797b038231 100644
--- a/test/test_device_reduce_by_key.cu
+++ b/test/test_device_reduce_by_key.cu
@@ -719,7 +719,7 @@ int main(int argc, char** argv)
     // %PARAM% TEST_CDP cdp 0:1
 
     // Test different input types
-    TestOp<int, char>(num_items);
+    TestOp<int, signed char>(num_items);
     TestOp<int, short>(num_items);
     TestOp<int, int>(num_items);
     TestOp<int, long>(num_items);
@@ -735,7 +735,7 @@ int main(int argc, char** argv)
     TestOp<int, TestFoo>(num_items);
     TestOp<int, TestBar>(num_items);
 
-    TestOp<char, int>(num_items);
+    TestOp<signed char, int>(num_items);
     TestOp<long long, int>(num_items);
     TestOp<TestFoo, int>(num_items);
     TestOp<TestBar, int>(num_items);
diff --git a/test/test_device_run_length_encode.cu b/test/test_device_run_length_encode.cu
index 2e8d350822..c16d559389 100644
--- a/test/test_device_run_length_encode.cu
+++ b/test/test_device_run_length_encode.cu
@@ -749,7 +749,7 @@ int main(int argc, char** argv)
     // %PARAM% TEST_CDP cdp 0:1
 
     // Test different input types
-    TestSize<char, int, int>(num_items);
+    TestSize<signed char, int, int>(num_items);
     TestSize<short, int, int>(num_items);
     TestSize<int, int, int>(num_items);
     TestSize<long, int, int>(num_items);
diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu
index 6197bda2db..6f876f3999 100644
--- a/test/test_device_scan.cu
+++ b/test/test_device_scan.cu
@@ -1221,7 +1221,7 @@ int main(int argc, char** argv)
 
     // Test same input+output data types
     TestSize<unsigned char>(num_items, (unsigned char)0, (unsigned char)99);
-    TestSize<char>(num_items, (char)0, (char)99);
+    TestSize<signed char>(num_items, (char)0, (char)99);
     TestSize<unsigned short>(num_items, (unsigned short)0, (unsigned short)99);
     TestSize<unsigned int>(num_items, (unsigned int)0, (unsigned int)99);
     TestSize<unsigned long long>(num_items,
diff --git a/test/test_device_scan_by_key.cu b/test/test_device_scan_by_key.cu
index f3bf841763..b56975f11c 100644
--- a/test/test_device_scan_by_key.cu
+++ b/test/test_device_scan_by_key.cu
@@ -1036,7 +1036,7 @@ int main(int argc, char** argv)
 
     // Test same input+output data types
     TestSize<unsigned char>(num_items, (unsigned char)0, (unsigned char)99);
-    TestSize<char>(num_items, (char)0, (char)99);
+    TestSize<signed char>(num_items, (char)0, (char)99);
 
 #elif TEST_VALUE_TYPES == 1
 
diff --git a/test/test_device_spmv.cu b/test/test_device_spmv.cu
index 94b9fbfac6..2e8187417d 100644
--- a/test/test_device_spmv.cu
+++ b/test/test_device_spmv.cu
@@ -566,7 +566,7 @@ void test_types()
 {
   test_type<float>();
   test_type<double>();
-  test_type<char>();
+  test_type<signed char>();
   test_type<int>();
   test_type<unsigned long long>();
 }
diff --git a/test/test_iterator.cu b/test/test_iterator.cu
index f042816a37..32e5b66973 100644
--- a/test/test_iterator.cu
+++ b/test/test_iterator.cu
@@ -500,7 +500,7 @@ int main(int argc, char** argv)
     CubDebugExit(PtxVersion(ptx_version));
 
     // Evaluate different data types
-    Test<char>();
+    Test<signed char>();
     Test<short>();
     Test<int>();
     Test<long>();
diff --git a/test/test_iterator_deprecated.cu b/test/test_iterator_deprecated.cu
index b42febe51b..862af1d2c5 100644
--- a/test/test_iterator_deprecated.cu
+++ b/test/test_iterator_deprecated.cu
@@ -264,7 +264,7 @@ int main(int argc, char** argv)
     CubDebugExit(args.DeviceInit());
 
     // Evaluate different data types
-    Test<char>();
+    Test<signed char>();
     Test<short>();
     Test<int>();
     Test<long>();
diff --git a/test/test_util.h b/test/test_util.h
index 8d00e435a2..db3ce8806e 100644
--- a/test/test_util.h
+++ b/test/test_util.h
@@ -1134,7 +1134,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
 /**
  * Define for types
  */
-CUB_VEC_OVERLOAD(char, char)
+CUB_VEC_OVERLOAD(char, signed char)
 CUB_VEC_OVERLOAD(short, short)
 CUB_VEC_OVERLOAD(int, int)
 CUB_VEC_OVERLOAD(long, long)
diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu
index ea2e665183..4b422a08fe 100644
--- a/test/test_warp_reduce.cu
+++ b/test/test_warp_reduce.cu
@@ -721,7 +721,7 @@ template <
 void Test(GenMode gen_mode)
 {
     // primitive
-    Test<WARPS, LOGICAL_WARP_THREADS, char>(                gen_mode, Sum());
+    Test<WARPS, LOGICAL_WARP_THREADS, signed char>(         gen_mode, Sum());
     Test<WARPS, LOGICAL_WARP_THREADS, short>(               gen_mode, Sum());
     Test<WARPS, LOGICAL_WARP_THREADS, int>(                 gen_mode, Sum());
     Test<WARPS, LOGICAL_WARP_THREADS, long long>(           gen_mode, Sum());