[https://nvbugs/5498967][fix] Downgrade NCCL #7556
@@ -35,13 +35,22 @@ struct UBBuffer
     void* addr;
     int handle;
     size_t size;
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 27, 0)
     ncclWindow_t window;
+#endif

-    UBBuffer(void* a = nullptr, int h = -1, size_t s = 0, ncclWindow_t w = nullptr)
+    UBBuffer(void* a = nullptr, int h = -1, size_t s = 0
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 27, 0)
+        ,
+        ncclWindow_t w = nullptr
+#endif
+        )
         : addr(a)
         , handle(h)
         , size(s)
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 27, 0)
+        , window(w)
+#endif
     {
     }
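For context, nccl.h defines NCCL_VERSION_CODE (the version of the headers being compiled against) and the NCCL_VERSION(major, minor, patch) comparison macro, so guards like the ones above select code paths entirely at compile time. A minimal standalone sketch of the same gating pattern; the printed strings are illustrative only:

    #include <cstdio>
    #include <nccl.h> // provides NCCL_VERSION_CODE and NCCL_VERSION(X, Y, Z)

    int main()
    {
    #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 27, 0)
        // Headers are 2.27+: symbols such as ncclWindow_t exist and may be referenced.
        std::printf("built against NCCL >= 2.27; window paths compiled in\n");
    #else
        // Older headers: 2.27-only symbols are undeclared, so they must be compiled out.
        std::printf("built against NCCL < 2.27; window paths compiled out\n");
    #endif
        return 0;
    }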
@@ -77,6 +86,8 @@ class UserBufferAllocator
     tensorrt_llm::runtime::WorldConfig mWorldConfig;
 };

+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 27, 0)
+
 class NCCLHelper
 {
 public:
@@ -125,6 +136,13 @@ class NCCLUserBufferAllocator : public UserBufferAllocator
     std::shared_ptr<ncclComm_t> mComm;
Contributor

shared_ptr<ncclComm_t> is incorrect and dangerous. ncclComm_t is itself a pointer type, so std::shared_ptr<ncclComm_t> manages a ncclComm_t* (a double pointer) and, with the default deleter, calls delete on it: undefined behavior for anything not allocated with new, and the communicator itself is never released via ncclCommDestroy. Use a raw handle or a smart pointer with a proper deleter. Apply one of:

Option A (simplest; recommended unless shared ownership is required):

-    std::shared_ptr<ncclComm_t> mComm;
+    ncclComm_t mComm{}; // raw NCCL handle; destroy via ncclCommDestroy in the .cpp

Option B (owned handle with an RAII deleter):

-    std::shared_ptr<ncclComm_t> mComm;
+    using NcclCommHandle = std::unique_ptr<std::remove_pointer_t<ncclComm_t>, ncclResult_t (*)(ncclComm_t)>;
+    NcclCommHandle mComm{nullptr, ncclCommDestroy};

Note: Option B requires #include <memory> and #include <type_traits>, and the deleter's function-pointer type must match ncclCommDestroy, which returns ncclResult_t rather than void.
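A minimal end-to-end sketch of Option B follows; makeComm and its parameters are hypothetical stand-ins for however the allocator actually creates its communicator, not the PR's real wiring:

    #include <memory>
    #include <type_traits>
    #include <nccl.h>

    // Owned NCCL communicator: ncclCommDestroy runs automatically on destruction.
    using NcclCommHandle
        = std::unique_ptr<std::remove_pointer_t<ncclComm_t>, ncclResult_t (*)(ncclComm_t)>;

    // Hypothetical factory: initialize a communicator, then hand it to the RAII wrapper.
    NcclCommHandle makeComm(ncclUniqueId id, int nRanks, int rank)
    {
        ncclComm_t raw = nullptr;
        if (ncclCommInitRank(&raw, nRanks, id, rank) != ncclSuccess)
        {
            return NcclCommHandle{nullptr, ncclCommDestroy}; // empty handle on failure
        }
        return NcclCommHandle{raw, ncclCommDestroy};
    }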
     static std::unique_ptr<NCCLHelper> mNCCLHelper;
 };
+#else
+class NCCLUserBufferAllocator : public UserBufferAllocator
+{
+public:
+    void initialize(tensorrt_llm::runtime::WorldConfig const& world_config) override;
+};
+#endif

 #else
 using communicator = void;
@@ -184,6 +184,9 @@ def func(input, residual, norm_weight, eps, enable_fusion):
 def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, strategy,
                                          fusion):

+    if strategy == AllReduceStrategy.NCCL_SYMMETRIC:
+        pytest.skip("NCCL symmetric is not supported for nccl version < 2.27.")
+
Comment on lines +187 to +189

Contributor
Make the skip conditional on NCCL version instead of unconditional. Right now this always skips NCCL_SYMMETRIC; gate it on the runtime NCCL version being older than 2.27 instead:

-    if strategy == AllReduceStrategy.NCCL_SYMMETRIC:
-        pytest.skip("NCCL symmetric is not supported for nccl version < 2.27.")
+    if strategy == AllReduceStrategy.NCCL_SYMMETRIC:
+        try:
+            import torch.cuda.nccl as nccl
+            nccl_ver = nccl.version()  # tuple such as (2, 27, 3) on recent PyTorch
+        except Exception:
+            nccl_ver = (0, 0, 0)
+        if not isinstance(nccl_ver, tuple):
+            nccl_ver = (0, 0, 0)  # very old builds return an encoded int; treat as too old
+        if nccl_ver < (2, 27, 0):
+            pytest.skip("NCCL symmetric is not supported for nccl version < 2.27.")
     torch.manual_seed(42)
     tensor_parallel_size = 2
🛠️ Refactor suggestion

Missing platform headers for dynamic loading.

dlopen/dlsym/dlclose and LoadLibraryA/GetProcAddress require including <dlfcn.h> or <windows.h>, respectively. Add these near the top of the file, outside this hunk.
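As a rough illustration of the include pattern this comment asks for, here is a self-contained sketch; loadNcclSymbol and the library names are hypothetical stand-ins, not the helper's real API:

    #ifdef _WIN32
    #include <windows.h> // LoadLibraryA / GetProcAddress
    #else
    #include <dlfcn.h> // dlopen / dlsym / dlclose
    #endif

    // Hypothetical helper: resolve one symbol from the NCCL shared library.
    // The library handle is deliberately kept open for the process lifetime.
    void* loadNcclSymbol(char const* symbolName)
    {
    #ifdef _WIN32
        HMODULE lib = LoadLibraryA("nccl.dll");
        return lib ? reinterpret_cast<void*>(GetProcAddress(lib, symbolName)) : nullptr;
    #else
        void* lib = dlopen("libnccl.so.2", RTLD_NOW | RTLD_GLOBAL);
        return lib ? dlsym(lib, symbolName) : nullptr;
    #endif
    }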