Merge remote-tracking branch 'nccl/master' into develop

Bu işleme şunda yer alıyor:
Marzieh Berenjkoub
2026-01-20 13:01:49 -06:00
işleme 858b4e76eb
240 değiştirilmiş dosya ile 16266 ekleme ve 3578 silme
+27 -17
Dosyayı Görüntüle
@@ -143,7 +143,12 @@ private:
protected:
void SetUp() override {
ncclResult_t result = ncclNetSocket.init(nullptr, nullptr);
void* ctx = nullptr;
uint64_t commId = 0;
ncclNetCommConfig_t config = {};
ncclDebugLogger_t logFunction = nullptr;
ncclProfilerCallback_t profFunction = nullptr;
ncclResult_t result = ncclNetSocket.init(&ctx, commId, &config, logFunction, profFunction);
ASSERT_EQ(result, ncclSuccess) << "Failed to initialize ncclNetSocket. "
<< "Error code: " << result
<< ". Ensure RCCL networking is properly configured.";
@@ -256,8 +261,9 @@ protected:
// Increased attempts and longer total timeout for reliability
for (int attempt = 0; attempt < 100 && !shouldStop.load(); attempt++) {
ncclResult_t connectResult = ncclNetSocket.connect(
0, &config, handle, &tempSendComm, &sendDevComm);
void* ctx = nullptr;
int dev = 0;
ncclResult_t connectResult = ncclNetSocket.connect(ctx, dev, handle, &tempSendComm, &sendDevComm);
if (connectResult == ncclSuccess && tempSendComm != nullptr) {
sendGuard.reset(tempSendComm);
connectCompleted = true;
@@ -702,7 +708,7 @@ protected:
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
ncclResult_t result = ncclNetSocket.listen(nullptr, 0, handle, &listenComm);
ASSERT_EQ(result, ncclSuccess) << "Failed to establish listening socket for test execution. "
<< "ncclNetSocket.listen() returned error code: " << result
<< ". Verify network device availability and port accessibility.";
@@ -791,7 +797,9 @@ TEST_F(NetSocketTests, TestConcurrentOperationsTaskCreationDefault) {
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
void* ctx = nullptr;
int dev = 0;
ncclResult_t result = ncclNetSocket.listen(ctx, dev, handle, &listenComm);
ASSERT_EQ(result, ncclSuccess) << "Failed to establish listening socket for test execution. "
<< "ncclNetSocket.listen() returned error code: " << result
<< ". Verify network device availability and port accessibility.";
@@ -895,7 +903,9 @@ TEST_F(NetSocketTests, TestInvalidDeviceIndexListen) {
void *listenComm = nullptr;
// Test with negative device index
ncclResult_t result = ncclNetSocket.listen(-1, handle, &listenComm);
void* ctx = nullptr;
int dev = -1;
ncclResult_t result = ncclNetSocket.listen(ctx, dev, handle, &listenComm);
INFO(NCCL_LOG_INFO, "Listen with dev=-1 returned: %d", result);
EXPECT_EQ(result, ncclInternalError)
<< "Listen should fail with negative device index. "
@@ -904,7 +914,7 @@ TEST_F(NetSocketTests, TestInvalidDeviceIndexListen) {
// Test with device index greater than available devices
int invalidDev = ndev + 10;
result = ncclNetSocket.listen(invalidDev, handle, &listenComm);
result = ncclNetSocket.listen(ctx, invalidDev, handle, &listenComm);
INFO(NCCL_LOG_INFO, "Listen with dev=%d (> ndev=%d) returned: %d", invalidDev,
ndev, result);
EXPECT_EQ(result, ncclInternalError)
@@ -926,8 +936,9 @@ TEST_F(NetSocketTests, TestInvalidDeviceIndexConnect) {
ncclNetDeviceHandle_t *sendDevComm = nullptr;
// Test with negative device index
ncclResult_t result =
ncclNetSocket.connect(-1, &config, handle, &sendComm, &sendDevComm);
void* ctx = nullptr;
int dev = -1;
ncclResult_t result = ncclNetSocket.connect(ctx, dev, handle, &sendComm, &sendDevComm);
INFO(NCCL_LOG_INFO, "Connect with dev=-1 returned: %d", result);
EXPECT_EQ(result, ncclInternalError)
<< "Connect should fail with negative device index. "
@@ -936,8 +947,7 @@ TEST_F(NetSocketTests, TestInvalidDeviceIndexConnect) {
// Test with device index greater than available devices
int invalidDev = ndev + 10;
result = ncclNetSocket.connect(invalidDev, &config, handle, &sendComm,
&sendDevComm);
result = ncclNetSocket.connect(ctx, invalidDev, handle, &sendComm, &sendDevComm);
INFO(NCCL_LOG_INFO, "Connect with dev=%d (> ndev=%d) returned: %d",
invalidDev, ndev, result);
EXPECT_EQ(result, ncclInternalError)
@@ -973,7 +983,7 @@ TEST_F(NetSocketTests, TestInvalidArraySizeIrecv) {
// Setup a dummy communicator first
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
ncclResult_t result = ncclNetSocket.listen(nullptr, 0, handle, &listenComm);
if (result == ncclSuccess && listenComm) {
void *sendComm = nullptr;
@@ -1039,7 +1049,7 @@ TEST_F(NetSocketTests, TestNonHostMemoryRegMr) {
// Setup a dummy communicator first
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
ncclResult_t result = ncclNetSocket.listen(nullptr, 0, handle, &listenComm);
if (result == ncclSuccess && listenComm) {
void *sendComm = nullptr;
@@ -1189,7 +1199,7 @@ TEST_F(NetSocketTests, TestExcessiveThreadConfig) {
// Initialize to trigger the warning logic
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
ncclResult_t result = ncclNetSocket.listen(nullptr, 0, handle, &listenComm);
if (result == ncclSuccess && listenComm) {
// The implementation should have limited the threads to
@@ -1308,7 +1318,7 @@ TEST_F(NetSocketTests, TestExcessiveSocketConfig) {
// Initialize to trigger the warning logic
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
ncclResult_t result = ncclNetSocket.listen(nullptr, 0, handle, &listenComm);
if (result == ncclSuccess && listenComm) {
// The implementation should have limited the sockets to
@@ -1337,7 +1347,7 @@ TEST_F(NetSocketTests, TestRequestAllocationFailure) {
// Setup communication
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
ncclResult_t result = ncclNetSocket.listen(nullptr, 0, handle, &listenComm);
if (result == ncclSuccess && listenComm) {
void *sendComm = nullptr;
@@ -1441,7 +1451,7 @@ TEST_F(NetSocketTests, TestMessageSizeMismatch) {
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
ncclResult_t result = ncclNetSocket.listen(nullptr, 0, handle, &listenComm);
if (result == ncclSuccess && listenComm) {
void *sendComm = nullptr;