This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Revamp samples and benchmarks #31
Merged

Changes from all commits (29 commits)
cc3e866  Move and rename benchmark to its separate benchmark directory. (jrhemstad)
25d155c  Use CUDA as project language. (jrhemstad)
e8cc0bb  Revamp benchmark cmake script. (jrhemstad)
3c32ceb  typo (jrhemstad)
60e7fb3  tense. (jrhemstad)
325513e  Changed path to text files. (jrhemstad)
50ad605  Add error checking to opening files. (jrhemstad)
afd58a7  Add initial cmake for building trie_st. (jrhemstad)
cd8ef16  Update txt file paths and add error checking to trie_mt. (jrhemstad)
e642ce1  Add cmake config for trie_mt. (jrhemstad)
e504db4  Update paths and error checking for trie.cu. (jrhemstad)
acf3db3  Only download books if they don't already exist. (jrhemstad)
f2e0eaa  Add config for trie. (jrhemstad)
9ca2303  Add cmake config for trie.cu. (jrhemstad)
32ae0d7  Delete old scripts. (jrhemstad)
bbde1ee  Update include paths. (jrhemstad)
39a0645  Remove hardcoded paths in nvrtc example. (jrhemstad)
f2ecd55  Add cmake config for nvrtc example. (jrhemstad)
e6688a4  Remove unused cmake stuff. (jrhemstad)
218a033  Doc. (jrhemstad)
ec2a4b6  Format. (jrhemstad)
20a702c  Replace simt namespace with cuda. (jrhemstad)
dff74e8  Point nvrtc include path to relative location of libcu++. (jrhemstad)
e291950  Reorganize cmake file. (jrhemstad)
e53370b  Add concurrent hash table example. (brycelelbach)
69e3dc3  Merge remote-tracking branch 'origin/main' into reorganize_samples (jrhemstad)
ba3538e  rename samples -> examples. (jrhemstad)
818d2b1  Add hash map to cmake file. (jrhemstad)
1b716a1  Docs: Update link in README.md to point to the new examples directory. (brycelelbach)
New file (diff hunk `@@ -0,0 +1,38 @@`): the CMake configuration for the benchmarks.

```cmake
cmake_minimum_required(VERSION 3.18)

project(libcudacxx-benchmarks LANGUAGES CXX CUDA)

set(CMAKE_BUILD_TYPE "RelWithDebInfo")

set(GPU_ARCHS "70;72;75;80" CACHE STRING "List of GPU architectures to compile for.")

message(STATUS "Enabled CUDA architectures:${GPU_ARCHS}")

find_package(Threads REQUIRED)
find_package(OpenMP)

function(ConfigureHostBench BENCH_NAME BENCH_SRC)
    add_executable("${BENCH_NAME}" "${BENCH_SRC}")
    target_link_libraries("${BENCH_NAME}" PRIVATE Threads::Threads)

    # TODO: Link against libcudaxx interface target instead
    target_include_directories("${BENCH_NAME}" PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}/../include")

    if(OpenMP_CXX_FOUND)
        target_link_libraries("${BENCH_NAME}" PRIVATE OpenMP::OpenMP_CXX)
    endif()
endfunction(ConfigureHostBench)

function(ConfigureDeviceBench BENCH_NAME BENCH_SRC)
    add_executable("${BENCH_NAME}" "${BENCH_SRC}")
    set_property(TARGET "${BENCH_NAME}" PROPERTY CUDA_ARCHITECTURES "${GPU_ARCHS}")

    # TODO: Link against libcudaxx interface target instead
    target_include_directories("${BENCH_NAME}" PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}/../include")

    target_compile_options("${BENCH_NAME}" PRIVATE --expt-extended-lambda --expt-relaxed-constexpr)
endfunction(ConfigureDeviceBench)

ConfigureHostBench(concurrency_host concurrency.cpp)

ConfigureDeviceBench(concurrency_device concurrency.cu)
```
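The two Configure* helpers above only wire up include paths, threading, OpenMP, and NVCC flags; the benchmark sources themselves (concurrency.cpp and concurrency.cu) are not part of this diff view. Purely as a hypothetical illustration, not code from this PR, here is a minimal host-side sketch of the kind of contention measurement such a concurrency benchmark might perform with libcu++'s heterogeneous cuda::std::atomic (which is why the helpers add the ../include path):

```cpp
// Hypothetical sketch only; the real concurrency.cpp is not shown in this PR view.
// Measures wall-clock time for several host threads hammering one atomic counter.
#include <cuda/std/atomic>

#include <chrono>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  cuda::std::atomic<unsigned long long> counter{0};
  constexpr int threads = 8;
  constexpr unsigned long long iterations = 1 << 20;

  auto const start = std::chrono::steady_clock::now();
  {
    std::vector<std::thread> pool;
    for (int t = 0; t < threads; ++t)
      pool.emplace_back([&] {
        for (unsigned long long i = 0; i < iterations; ++i)
          counter.fetch_add(1, cuda::std::memory_order_relaxed);
      });
    for (auto& th : pool) th.join();
  }
  auto const stop = std::chrono::steady_clock::now();

  std::chrono::duration<double> const elapsed = stop - start;
  std::printf("total: %llu, seconds: %f\n", counter.load(), elapsed.count());
}
```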
File renamed without changes.
File renamed without changes.
New file (diff hunk `@@ -0,0 +1,53 @@`): the CMake configuration for the examples.

```cmake
cmake_minimum_required(VERSION 3.18)

project(libcudacxx-examples LANGUAGES CXX CUDA)

set(CMAKE_BUILD_TYPE "RelWithDebInfo")

find_package(CUDAToolkit REQUIRED)
find_package(Threads REQUIRED)
find_package(OpenMP)

# Download input files for the trie examples.
if(NOT (EXISTS books))
    execute_process(COMMAND mkdir books)
    file(DOWNLOAD https://www.gutenberg.org/files/2600/2600-0.txt books/2600-0.txt SHOW_PROGRESS)
    file(DOWNLOAD http://www.gutenberg.org/cache/epub/996/pg996.txt books/pg996.txt SHOW_PROGRESS)
    file(DOWNLOAD http://www.gutenberg.org/cache/epub/55/pg55.txt books/pg55.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/8800/8800.txt books/8800.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/84/84-0.txt books/84-0.txt SHOW_PROGRESS)
    file(DOWNLOAD http://www.gutenberg.org/cache/epub/6130/pg6130.txt books/pg6130.txt SHOW_PROGRESS)
    file(DOWNLOAD http://www.gutenberg.org/cache/epub/1727/pg1727.txt books/pg1727.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/2701/2701-0.txt books/2701-0.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/35/35-0.txt books/35-0.txt SHOW_PROGRESS)
    file(DOWNLOAD https://www.gutenberg.org/files/1342/1342-0.txt books/1342-0.txt SHOW_PROGRESS)
endif()

add_executable(trie_st trie_st.cpp)
target_compile_features(trie_st PRIVATE cxx_std_11)

add_executable(trie_mt trie_mt.cpp)
target_compile_features(trie_mt PRIVATE cxx_std_11)
target_link_libraries(trie_mt Threads::Threads)

if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.1)
    add_executable(trie_cuda trie.cu)
    target_compile_features(trie_cuda PRIVATE cxx_std_11 cuda_std_11)
    target_compile_options(trie_cuda PRIVATE --expt-relaxed-constexpr)
    set_property(TARGET trie_cuda PROPERTY CUDA_ARCHITECTURES 70)
else()
    message(STATUS "Insufficient CUDA version. Skipping trie.cu example.")
endif()

if(CUDAToolkit_VERSION VERSION_GREATER 10.2)
    add_executable(rtc rtc_example.cpp)
    target_link_libraries(rtc CUDA::nvrtc)
    target_compile_features(rtc PRIVATE cxx_std_11)
else()
    message(STATUS "Insufficient CUDA version. Skipping rtc_example.cpp example.")
endif()

add_executable(hash_map concurrent_hash_table.cu)
target_compile_features(hash_map PRIVATE cxx_std_14 cuda_std_14)
set_property(TARGET hash_map PROPERTY CUDA_ARCHITECTURES 70)
target_compile_options(hash_map PRIVATE --expt-extended-lambda)
```
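The rtc target links against CUDA::nvrtc, and a later commit in this PR points the NVRTC include path at libcu++'s location in the repository. Since rtc_example.cpp itself is not shown in this diff view, the following is a hedged, hypothetical sketch of the general NVRTC flow it implies; the kernel string, names, and include path are placeholders, not the PR's actual code:

```cpp
// Hypothetical sketch only; the real rtc_example.cpp is not shown in this PR view.
// Compiles a small kernel with NVRTC and prints the compile log. The -I option
// is the key detail: NVRTC must be told where the libcu++ headers live so that
// the runtime-compiled kernel can #include <cuda/std/atomic>.
#include <nvrtc.h>

#include <cstdio>
#include <string>
#include <vector>

int main() {
  char const* kernel =
    "#include <cuda/std/atomic>\n"
    "__global__ void touch(cuda::std::atomic<int>* a) { a->fetch_add(1); }\n";

  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, kernel, "touch.cu", 0, nullptr, nullptr);

  // Placeholder path; the PR points this at libcu++'s headers relative to the repo.
  std::vector<char const*> opts = {"--std=c++11", "-I/path/to/libcudacxx/include"};
  nvrtcResult result = nvrtcCompileProgram(prog, (int)opts.size(), opts.data());

  // Print the compile log regardless of success or failure.
  size_t log_size;
  nvrtcGetProgramLogSize(prog, &log_size);
  std::string log(log_size, '\0');
  nvrtcGetProgramLog(prog, &log[0]);
  std::printf("%s\n", log.c_str());

  nvrtcDestroyProgram(&prog);
  return result == NVRTC_SUCCESS ? 0 : 1;
}
```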
New file (diff hunk `@@ -0,0 +1,260 @@`): the concurrent hash table example, concurrent_hash_table.cu.

```cuda
// Copyright (c) 2018-2020 NVIDIA Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Released under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.

#include <cuda/std/cstdint>
#include <cuda/std/atomic>

// TODO: It would be great if this example could NOT depend on Thrust.
#include <thrust/pair.h>
#include <thrust/functional.h>
#include <thrust/allocate_unique.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>

#include <cassert>
#include <random>

#include <iostream>
#include <cstdio>

// TODO: This should be upstreamed and then removed.
namespace thrust {

using universal_raw_memory_resource =
  thrust::system::cuda::detail::cuda_memory_resource<
    thrust::system::cuda::detail::cudaMallocManaged, cudaFree, void*
  >;

template <typename T>
using universal_allocator =
  thrust::mr::stateless_resource_allocator<T, universal_raw_memory_resource>;

template <typename T>
using universal_vector = thrust::device_vector<T, universal_allocator<T>>;

} // thrust

template <
  typename Key, typename Value,
  typename Hash           = thrust::identity<Key>,
  typename KeyEqual       = thrust::equal_to<Key>,
  typename MemoryResource = thrust::universal_raw_memory_resource
>
struct concurrent_hash_table {
  // Elements transition from state_empty -> state_reserved ->
  // state_filled; no other transitions are allowed.
  enum state_type {
    state_empty, state_reserved, state_filled
  };

  using key_type    = Key;
  using mapped_type = Value;
  using size_type   = cuda::std::uint64_t;

  using key_allocator = thrust::mr::stateless_resource_allocator<
    key_type, MemoryResource
  >;
  using mapped_allocator = thrust::mr::stateless_resource_allocator<
    mapped_type, MemoryResource
  >;
  using state_allocator = thrust::mr::stateless_resource_allocator<
    cuda::std::atomic<state_type>, MemoryResource
  >;

  using key_iterator   = typename key_allocator::pointer;
  using value_iterator = typename mapped_allocator::pointer;
  using state_iterator = typename state_allocator::pointer;

  // This whole thing is silly and should be a lambda, or at least a private
  // nested class, but alas, NVCC doesn't like that.
  struct element_destroyer {
  private:
    size_type      capacity_;
    key_iterator   keys_;
    value_iterator values_;
    state_iterator states_;

  public:
    __host__ __device__
    element_destroyer(size_type capacity,
                      key_iterator keys,
                      value_iterator values,
                      state_iterator states)
      : capacity_(capacity), keys_(keys), values_(values), states_(states)
    {}

    element_destroyer(element_destroyer const&) = default;

    __host__ __device__
    void operator()(size_type i) {
      if (state_empty != states_[i]) {
        (keys_ + i)->~key_type();
        (values_ + i)->~mapped_type();
      }
    }
  };

private:
  size_type      capacity_;
  key_iterator   keys_;
  value_iterator values_;
  state_iterator states_;
  Hash           hash_;
  KeyEqual       key_equal_;

public:
  __host__
  concurrent_hash_table(size_type capacity,
                        Hash hash = Hash(),
                        KeyEqual key_equal = KeyEqual())
    : capacity_(capacity)
    , keys_(key_allocator{}.allocate(capacity_))
    , values_(mapped_allocator{}.allocate(capacity_))
    , states_(state_allocator{}.allocate(capacity_))
    , hash_(std::move(hash))
    , key_equal_(std::move(key_equal))
  {
    thrust::uninitialized_fill(thrust::device,
                               states_, states_ + capacity_,
                               state_empty);
  }

  __host__
  ~concurrent_hash_table()
  {
    thrust::for_each(thrust::device,
                     thrust::counting_iterator<size_type>(0),
                     thrust::counting_iterator<size_type>(capacity_),
                     element_destroyer(capacity_, keys_, values_, states_));
  }

  // TODO: Change return type to an enum with three possible values: succeeded,
  // exists, and full.
  template <typename UKey, typename... Args>
  __host__ __device__
  thrust::pair<value_iterator, bool>
  try_emplace(UKey&& key, Args&&... args) {
    auto index{hash_(key) % capacity_};
    // Linearly probe the storage space up to `capacity_` times; if we haven't
    // succeeded by then, the container is full.
    for (size_type i = 0; i < capacity_; ++i) {
      state_type old = states_[index].load(cuda::std::memory_order_acquire);
      while (old == state_empty) {
        // As long as the state of this element is empty, attempt to set it to
        // reserved.
        if (states_[index].compare_exchange_weak(
              old, state_reserved, cuda::std::memory_order_acq_rel))
        {
          // We succeeded; the element is now "locked" as reserved.
          new (keys_ + index) key_type(std::forward<UKey>(key));
          new (values_ + index) mapped_type(std::forward<Args>(args)...);
          states_[index].store(state_filled, cuda::std::memory_order_release);
          return thrust::make_pair(values_ + index, true);
        }
      }
      // If we are here, the element we are probing is not empty and we didn't
      // fill it, so we need to wait for it to be filled.
      while (state_filled != states_[index].load(cuda::std::memory_order_acquire))
        ;
      // Now we know that the element we are probing has been filled by someone
      // else, so we check if our key is equal to it.
      if (key_equal_(keys_[index], key))
        // It is, so the element already exists.
        return thrust::make_pair(values_ + index, false);
      // Otherwise, the element isn't a match, so move on to the next element.
      index = (index + 1) % capacity_;
    }
    // If we are here, the container is full.
    return thrust::make_pair(value_iterator{}, false);
  }

  __host__ __device__
  mapped_type& operator[](key_type const& key) {
    return (*try_emplace(key).first);
  }
  __host__ __device__
  mapped_type& operator[](key_type&& key) {
    return (*try_emplace(std::move(key)).first);
  }
};

template <typename T>
struct identity_modulo {
private:
  T const modulo_;

public:
  __host__ __device__
  identity_modulo(T modulo) : modulo_(std::move(modulo)) {}

  identity_modulo(identity_modulo const&) = default;

  __host__ __device__
  T operator()(T i) { return i % modulo_; }
};

int main() {
  {
    using table = concurrent_hash_table<int, cuda::std::atomic<int>>;

    auto freq = thrust::allocate_unique<table>(thrust::universal_allocator<table>{}, 8);

    thrust::universal_vector<int> input = [] {
      thrust::universal_vector<int> v(2048);
      std::mt19937 gen(1337);
      std::uniform_int_distribution<long> dis(0, 7);
      thrust::generate(v.begin(), v.end(), [&] { return dis(gen); });
      return v;
    }();

    thrust::for_each(thrust::device, input.begin(), input.end(),
      [freq = freq.get()] __device__ (int i) {
        (*freq)[i].fetch_add(1, cuda::std::memory_order_relaxed);
      }
    );

    thrust::host_vector<int> gold(8);
    thrust::for_each(input.begin(), input.end(), [&] (int i) { ++gold[i]; });

    for (cuda::std::uint64_t i = 0; i < 8; ++i)
      std::cout << "i: " << i
                << " gold: " << gold[i]
                << " observed: " << (*freq)[i] << "\n";

    assert(cudaSuccess == cudaDeviceSynchronize());
  }
  {
    using table = concurrent_hash_table<int, cuda::std::atomic<int>, identity_modulo<int>>;

    auto freq = thrust::allocate_unique<table>(thrust::universal_allocator<table>{}, 8, identity_modulo<int>(4));

    thrust::universal_vector<int> input = [] {
      thrust::universal_vector<int> v(2048);
      std::mt19937 gen(1337);
      std::uniform_int_distribution<long> dis(0, 7);
      thrust::generate(v.begin(), v.end(), [&] { return dis(gen); });
      return v;
    }();

    thrust::for_each(thrust::device, input.begin(), input.end(),
      [freq = freq.get()] __device__ (int i) {
        (*freq)[i].fetch_add(1, cuda::std::memory_order_relaxed);
      }
    );

    thrust::host_vector<int> gold(8);
    thrust::for_each(input.begin(), input.end(), [&] (int i) { ++gold[i]; });

    for (cuda::std::uint64_t i = 0; i < 8; ++i)
      std::cout << "i: " << i
                << " gold: " << gold[i]
                << " observed: " << (*freq)[i] << "\n";

    assert(cudaSuccess == cudaDeviceSynchronize());
  }
}
```
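The correctness of try_emplace hinges on the empty -> reserved -> filled state machine and its acquire/release pairing: the CAS to reserved "locks" the slot, the key and value are constructed while it is reserved, and the release store of filled publishes them, so any thread that later observes filled with an acquire load is guaranteed to see the constructed key and value. As a supplement (not part of the PR), here is a minimal host-only sketch that isolates that handshake with two std::thread workers:

```cpp
// Minimal host-only sketch, not from the PR, isolating the
// empty -> reserved -> filled publication protocol used by try_emplace.
#include <cuda/std/atomic>

#include <cassert>
#include <thread>

enum state_type { state_empty, state_reserved, state_filled };

int main() {
  cuda::std::atomic<state_type> state{state_empty};
  int payload = 0;  // Stands in for the key/value constructed in the slot.

  std::thread writer([&] {
    state_type expected = state_empty;
    // "Lock" the slot: only one thread can win this CAS.
    if (state.compare_exchange_strong(expected, state_reserved,
                                      cuda::std::memory_order_acq_rel)) {
      payload = 42;  // Safe: the slot is reserved, no one else touches it.
      // Publish: the release store makes the payload write visible to
      // any thread that observes state_filled with an acquire load.
      state.store(state_filled, cuda::std::memory_order_release);
    }
  });

  std::thread reader([&] {
    while (state_filled != state.load(cuda::std::memory_order_acquire))
      ;  // Spin until the writer publishes, as try_emplace does.
    assert(payload == 42);  // Ordered by the release/acquire pair.
  });

  writer.join();
  reader.join();
}
```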