This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Revamp samples and benchmarks #31
Merged

Changes from all commits (29 commits)
cc3e866  Move and rename benchmark to its separate benchmark directory. (jrhemstad)
25d155c  Use CUDA as project language. (jrhemstad)
e8cc0bb  Revamp benchmark cmake script. (jrhemstad)
3c32ceb  typo (jrhemstad)
60e7fb3  tense. (jrhemstad)
325513e  Changed path to text files. (jrhemstad)
50ad605  Add error checking to opening files. (jrhemstad)
afd58a7  Add initial cmake for building trie_st. (jrhemstad)
cd8ef16  Update txt file paths and add error checking to trie_mt. (jrhemstad)
e642ce1  Add cmake config for trie_mt. (jrhemstad)
e504db4  Update paths and error checking for trie.cu. (jrhemstad)
acf3db3  Only download books if they don't already exist. (jrhemstad)
f2e0eaa  Add config for trie. (jrhemstad)
9ca2303  Add cmake config for trie.cu. (jrhemstad)
32ae0d7  Delete old scripts. (jrhemstad)
bbde1ee  Update include paths. (jrhemstad)
39a0645  Remove hardcoded paths in nvrtc example. (jrhemstad)
f2ecd55  Add cmake config for nvrtc example. (jrhemstad)
e6688a4  Remove unused cmake stuff. (jrhemstad)
218a033  Doc. (jrhemstad)
ec2a4b6  Format. (jrhemstad)
20a702c  Replace simt namespace with cuda. (jrhemstad)
dff74e8  Point nvrtc include path to relative location of libcu++. (jrhemstad)
e291950  Reorganize cmake file. (jrhemstad)
e53370b  Add concurrent hash table example. (brycelelbach)
69e3dc3  Merge remote-tracking branch 'origin/main' into reorganize_samples (jrhemstad)
ba3538e  rename samples -> examples. (jrhemstad)
818d2b1  Add hash map to cmake file. (jrhemstad)
1b716a1  Docs: Update link in README.md to point to the new examples directory. (brycelelbach)
New file (diff hunk `@@ -0,0 +1,38 @@`): the CMake configuration for the benchmarks.

```cmake
cmake_minimum_required(VERSION 3.18)

project(libcudacxx-benchmarks LANGUAGES CXX CUDA)

set(CMAKE_BUILD_TYPE "RelWithDebInfo")

set(GPU_ARCHS "70;72;75;80" CACHE STRING "List of GPU architectures to compile for.")

message(STATUS "Enabled CUDA architectures:${GPU_ARCHS}")

find_package(Threads REQUIRED)
find_package(OpenMP)

function(ConfigureHostBench BENCH_NAME BENCH_SRC)
    add_executable("${BENCH_NAME}" "${BENCH_SRC}")
    target_link_libraries("${BENCH_NAME}" PRIVATE Threads::Threads)

    # TODO: Link against libcudaxx interface target instead
    target_include_directories("${BENCH_NAME}" PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}/../include")

    if(OpenMP_CXX_FOUND)
        target_link_libraries("${BENCH_NAME}" PRIVATE OpenMP::OpenMP_CXX)
    endif()
endfunction(ConfigureHostBench)

function(ConfigureDeviceBench BENCH_NAME BENCH_SRC)
    add_executable("${BENCH_NAME}" "${BENCH_SRC}")
    set_property(TARGET "${BENCH_NAME}" PROPERTY CUDA_ARCHITECTURES "${GPU_ARCHS}")

    # TODO: Link against libcudaxx interface target instead
    target_include_directories("${BENCH_NAME}" PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}/../include")

    target_compile_options("${BENCH_NAME}" PRIVATE --expt-extended-lambda --expt-relaxed-constexpr)
endfunction(ConfigureDeviceBench)

ConfigureHostBench(concurrency_host concurrency.cpp)

ConfigureDeviceBench(concurrency_device concurrency.cu)
```
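The two Configure* helpers above only wire up include paths, threading, OpenMP, and NVCC flags; the benchmark sources themselves (concurrency.cpp and concurrency.cu) are not part of this diff view. Purely as a hypothetical illustration, not code from this PR, here is a minimal host-side sketch of the kind of contention measurement such a concurrency benchmark might perform with libcu++'s heterogeneous cuda::std::atomic (which is why the helpers add the ../include path):

```cpp
// Hypothetical sketch only; the real concurrency.cpp is not shown in this PR view.
// Measures wall-clock time for several host threads hammering one atomic counter.
#include <cuda/std/atomic>

#include <chrono>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  cuda::std::atomic<unsigned long long> counter{0};
  constexpr int threads = 8;
  constexpr unsigned long long iterations = 1 << 20;

  auto const start = std::chrono::steady_clock::now();
  {
    std::vector<std::thread> pool;
    for (int t = 0; t < threads; ++t)
      pool.emplace_back([&] {
        for (unsigned long long i = 0; i < iterations; ++i)
          counter.fetch_add(1, cuda::std::memory_order_relaxed);
      });
    for (auto& th : pool) th.join();
  }
  auto const stop = std::chrono::steady_clock::now();

  std::chrono::duration<double> const elapsed = stop - start;
  std::printf("total: %llu, seconds: %f\n", counter.load(), elapsed.count());
}
```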
File renamed without changes.
File renamed without changes.
New file (diff hunk `@@ -0,0 +1,53 @@`): the CMake configuration for the examples.

```cmake
cmake_minimum_required(VERSION 3.18)

project(libcudacxx-examples LANGUAGES CXX CUDA)

set(CMAKE_BUILD_TYPE "RelWithDebInfo")

find_package(CUDAToolkit REQUIRED)
find_package(Threads REQUIRED)
find_package(OpenMP)

# Download input files for the trie examples.
if(NOT (EXISTS books))
    execute_process(COMMAND mkdir books)
    file(DOWNLOAD https://www.gutenberg.org/files/2600/2600-0.txt books/2600-0.txt SHOW_PROGRESS)
    file(DOWNLOAD http://www.gutenberg.org/cache/epub/996/pg996.txt books/pg996.txt SHOW_PROGRESS)
    file(DOWNLOAD http://www.gutenberg.org/cache/epub/55/pg55.txt books/pg55.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/8800/8800.txt books/8800.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/84/84-0.txt books/84-0.txt SHOW_PROGRESS)
    file(DOWNLOAD http://www.gutenberg.org/cache/epub/6130/pg6130.txt books/pg6130.txt SHOW_PROGRESS)
    file(DOWNLOAD http://www.gutenberg.org/cache/epub/1727/pg1727.txt books/pg1727.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/2701/2701-0.txt books/2701-0.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/35/35-0.txt books/35-0.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/1342/1342-0.txt books/1342-0.txt SHOW_PROGRESS)
endif()

add_executable(trie_st trie_st.cpp)
target_compile_features(trie_st PRIVATE cxx_std_11)

add_executable(trie_mt trie_mt.cpp)
target_compile_features(trie_mt PRIVATE cxx_std_11)
target_link_libraries(trie_mt Threads::Threads)

if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.1)
    add_executable(trie_cuda trie.cu)
    target_compile_features(trie_cuda PRIVATE cxx_std_11 cuda_std_11)
    target_compile_options(trie_cuda PRIVATE --expt-relaxed-constexpr)
    set_property(TARGET trie_cuda PROPERTY CUDA_ARCHITECTURES 70)
else()
    message(STATUS "Insufficient CUDA version. Skipping trie.cu example.")
endif()

if(CUDAToolkit_VERSION VERSION_GREATER 10.2)
    add_executable(rtc rtc_example.cpp)
    target_link_libraries(rtc CUDA::nvrtc)
    target_compile_features(rtc PRIVATE cxx_std_11)
else()
    message(STATUS "Insufficient CUDA version. Skipping rtc_example.cpp example.")
endif()

add_executable(hash_map concurrent_hash_table.cu)
target_compile_features(hash_map PRIVATE cxx_std_14 cuda_std_14)
set_property(TARGET hash_map PROPERTY CUDA_ARCHITECTURES 70)
target_compile_options(hash_map PRIVATE --expt-extended-lambda)
```
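The rtc target links against CUDA::nvrtc, and a later commit in this PR points the NVRTC include path at libcu++'s location in the repository. Since rtc_example.cpp itself is not shown in this diff view, the following is a hedged, hypothetical sketch of the general NVRTC flow it implies; the kernel string, names, and include path are placeholders, not the PR's actual code:

```cpp
// Hypothetical sketch only; the real rtc_example.cpp is not shown in this PR view.
// Compiles a small kernel with NVRTC and prints the compile log. The -I option
// is the key detail: NVRTC must be told where the libcu++ headers live so that
// the runtime-compiled kernel can #include <cuda/std/atomic>.
#include <nvrtc.h>

#include <cstdio>
#include <string>
#include <vector>

int main() {
  char const* kernel =
    "#include <cuda/std/atomic>\n"
    "__global__ void touch(cuda::std::atomic<int>* a) { a->fetch_add(1); }\n";

  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, kernel, "touch.cu", 0, nullptr, nullptr);

  // Placeholder path; the PR points this at libcu++'s headers relative to the repo.
  std::vector<char const*> opts = {"--std=c++11", "-I/path/to/libcudacxx/include"};
  nvrtcResult result = nvrtcCompileProgram(prog, (int)opts.size(), opts.data());

  // Print the compile log regardless of success or failure.
  size_t log_size;
  nvrtcGetProgramLogSize(prog, &log_size);
  std::string log(log_size, '\0');
  nvrtcGetProgramLog(prog, &log[0]);
  std::printf("%s\n", log.c_str());

  nvrtcDestroyProgram(&prog);
  return result == NVRTC_SUCCESS ? 0 : 1;
}
```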
New file (diff hunk `@@ -0,0 +1,260 @@`): the concurrent hash table example, concurrent_hash_table.cu.

```cuda
// Copyright (c) 2018-2020 NVIDIA Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Released under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.

#include <cuda/std/cstdint>
#include <cuda/std/atomic>

// TODO: It would be great if this example could NOT depend on Thrust.
#include <thrust/pair.h>
#include <thrust/functional.h>
#include <thrust/allocate_unique.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>

#include <cassert>
#include <random>

#include <iostream>
#include <cstdio>

// TODO: This should be upstreamed and then removed.
namespace thrust {

using universal_raw_memory_resource =
  thrust::system::cuda::detail::cuda_memory_resource<
    thrust::system::cuda::detail::cudaMallocManaged, cudaFree, void*
  >;

template <typename T>
using universal_allocator =
  thrust::mr::stateless_resource_allocator<T, universal_raw_memory_resource>;

template <typename T>
using universal_vector = thrust::device_vector<T, universal_allocator<T>>;

} // thrust

template <
  typename Key, typename Value,
  typename Hash           = thrust::identity<Key>,
  typename KeyEqual       = thrust::equal_to<Key>,
  typename MemoryResource = thrust::universal_raw_memory_resource
>
struct concurrent_hash_table {
  // Elements transition from state_empty -> state_reserved ->
  // state_filled; no other transitions are allowed.
  enum state_type {
    state_empty, state_reserved, state_filled
  };

  using key_type    = Key;
  using mapped_type = Value;
  using size_type   = cuda::std::uint64_t;

  using key_allocator = thrust::mr::stateless_resource_allocator<
    key_type, MemoryResource
  >;
  using mapped_allocator = thrust::mr::stateless_resource_allocator<
    mapped_type, MemoryResource
  >;
  using state_allocator = thrust::mr::stateless_resource_allocator<
    cuda::std::atomic<state_type>, MemoryResource
  >;

  using key_iterator   = typename key_allocator::pointer;
  using value_iterator = typename mapped_allocator::pointer;
  using state_iterator = typename state_allocator::pointer;

  // This whole thing is silly and should be a lambda, or at least a private
  // nested class, but alas, NVCC doesn't like that.
  struct element_destroyer {
  private:
    size_type      capacity_;
    key_iterator   keys_;
    value_iterator values_;
    state_iterator states_;

  public:
    __host__ __device__
    element_destroyer(size_type capacity,
                      key_iterator keys,
                      value_iterator values,
                      state_iterator states)
      : capacity_(capacity), keys_(keys), values_(values), states_(states)
    {}

    element_destroyer(element_destroyer const&) = default;

    __host__ __device__
    void operator()(size_type i) {
      if (state_empty != states_[i]) {
        (keys_ + i)->~key_type();
        (values_ + i)->~mapped_type();
      }
    }
  };

private:
  size_type      capacity_;
  key_iterator   keys_;
  value_iterator values_;
  state_iterator states_;
  Hash           hash_;
  KeyEqual       key_equal_;

public:
  __host__
  concurrent_hash_table(size_type capacity,
                        Hash hash = Hash(),
                        KeyEqual key_equal = KeyEqual())
    : capacity_(capacity)
    , keys_(key_allocator{}.allocate(capacity_))
    , values_(mapped_allocator{}.allocate(capacity_))
    , states_(state_allocator{}.allocate(capacity_))
    , hash_(std::move(hash))
    , key_equal_(std::move(key_equal))
  {
    thrust::uninitialized_fill(thrust::device,
                               states_, states_ + capacity_,
                               state_empty);
  }

  __host__
  ~concurrent_hash_table()
  {
    thrust::for_each(thrust::device,
                     thrust::counting_iterator<size_type>(0),
                     thrust::counting_iterator<size_type>(capacity_),
                     element_destroyer(capacity_, keys_, values_, states_));
  }

  // TODO: Change return type to an enum with three possible values: succeeded,
  // exists, and full.
  template <typename UKey, typename... Args>
  __host__ __device__
  thrust::pair<value_iterator, bool>
  try_emplace(UKey&& key, Args&&... args) {
    auto index{hash_(key) % capacity_};
    // Linearly probe the storage space up to `capacity_` times; if we haven't
    // succeeded by then, the container is full.
    for (size_type i = 0; i < capacity_; ++i) {
      state_type old = states_[index].load(cuda::std::memory_order_acquire);
      while (old == state_empty) {
        // As long as the state of this element is empty, attempt to set it to
        // reserved.
        if (states_[index].compare_exchange_weak(
              old, state_reserved, cuda::std::memory_order_acq_rel))
        {
          // We succeeded; the element is now "locked" as reserved.
          new (keys_ + index) key_type(std::forward<UKey>(key));
          new (values_ + index) mapped_type(std::forward<Args>(args)...);
          states_[index].store(state_filled, cuda::std::memory_order_release);
          return thrust::make_pair(values_ + index, true);
        }
      }
      // If we are here, the element we are probing is not empty and we didn't
      // fill it, so we need to wait for it to be filled.
      while (state_filled != states_[index].load(cuda::std::memory_order_acquire))
        ;
      // Now we know that the element we are probing has been filled by someone
      // else, so we check if our key is equal to it.
      if (key_equal_(keys_[index], key))
        // It is, so the element already exists.
        return thrust::make_pair(values_ + index, false);
      // Otherwise, the element isn't a match, so move on to the next element.
      index = (index + 1) % capacity_;
    }
    // If we are here, the container is full.
    return thrust::make_pair(value_iterator{}, false);
  }

  __host__ __device__
  mapped_type& operator[](key_type const& key) {
    return (*try_emplace(key).first);
  }
  __host__ __device__
  mapped_type& operator[](key_type&& key) {
    return (*try_emplace(std::move(key)).first);
  }
};

template <typename T>
struct identity_modulo {
private:
  T const modulo_;

public:
  __host__ __device__
  identity_modulo(T modulo) : modulo_(std::move(modulo)) {}

  identity_modulo(identity_modulo const&) = default;

  __host__ __device__
  T operator()(T i) { return i % modulo_; }
};

int main() {
  {
    using table = concurrent_hash_table<int, cuda::std::atomic<int>>;

    auto freq = thrust::allocate_unique<table>(thrust::universal_allocator<table>{}, 8);

    thrust::universal_vector<int> input = [] {
      thrust::universal_vector<int> v(2048);
      std::mt19937 gen(1337);
      std::uniform_int_distribution<long> dis(0, 7);
      thrust::generate(v.begin(), v.end(), [&] { return dis(gen); });
      return v;
    }();

    thrust::for_each(thrust::device, input.begin(), input.end(),
      [freq = freq.get()] __device__ (int i) {
        (*freq)[i].fetch_add(1, cuda::std::memory_order_relaxed);
      }
    );

    thrust::host_vector<int> gold(8);
    thrust::for_each(input.begin(), input.end(), [&] (int i) { ++gold[i]; });

    for (cuda::std::uint64_t i = 0; i < 8; ++i)
      std::cout << "i: " << i
                << " gold: " << gold[i]
                << " observed: " << (*freq)[i] << "\n";

    assert(cudaSuccess == cudaDeviceSynchronize());
  }
  {
    using table = concurrent_hash_table<int, cuda::std::atomic<int>, identity_modulo<int>>;

    auto freq = thrust::allocate_unique<table>(thrust::universal_allocator<table>{}, 8, identity_modulo<int>(4));

    thrust::universal_vector<int> input = [] {
      thrust::universal_vector<int> v(2048);
      std::mt19937 gen(1337);
      std::uniform_int_distribution<long> dis(0, 7);
      thrust::generate(v.begin(), v.end(), [&] { return dis(gen); });
      return v;
    }();

    thrust::for_each(thrust::device, input.begin(), input.end(),
      [freq = freq.get()] __device__ (int i) {
        (*freq)[i].fetch_add(1, cuda::std::memory_order_relaxed);
      }
    );

    thrust::host_vector<int> gold(8);
    thrust::for_each(input.begin(), input.end(), [&] (int i) { ++gold[i]; });

    for (cuda::std::uint64_t i = 0; i < 8; ++i)
      std::cout << "i: " << i
                << " gold: " << gold[i]
                << " observed: " << (*freq)[i] << "\n";

    assert(cudaSuccess == cudaDeviceSynchronize());
  }
}
```
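The correctness of try_emplace hinges on the empty -> reserved -> filled state machine and its acquire/release pairing: the CAS to reserved "locks" the slot, the key and value are constructed while it is reserved, and the release store of filled publishes them, so any thread that later observes filled with an acquire load is guaranteed to see the constructed key and value. As a supplement (not part of the PR), here is a minimal host-only sketch that isolates that handshake with two std::thread workers:

```cpp
// Minimal host-only sketch, not from the PR, isolating the
// empty -> reserved -> filled publication protocol used by try_emplace.
#include <cuda/std/atomic>

#include <cassert>
#include <thread>

enum state_type { state_empty, state_reserved, state_filled };

int main() {
  cuda::std::atomic<state_type> state{state_empty};
  int payload = 0;  // Stands in for the key/value constructed in the slot.

  std::thread writer([&] {
    state_type expected = state_empty;
    // "Lock" the slot: only one thread can win this CAS.
    if (state.compare_exchange_strong(expected, state_reserved,
                                      cuda::std::memory_order_acq_rel)) {
      payload = 42;  // Safe: the slot is reserved, no one else touches it.
      // Publish: the release store makes the payload write visible to
      // any thread that observes state_filled with an acquire load.
      state.store(state_filled, cuda::std::memory_order_release);
    }
  });

  std::thread reader([&] {
    while (state_filled != state.load(cuda::std::memory_order_acquire))
      ;  // Spin until the writer publishes, as try_emplace does.
    assert(payload == 42);  // Ordered by the release/acquire pair.
  });

  writer.join();
  reader.join();
}
```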