From cd256748c01be2c38b8bcb7b4fe2215ca051fe3f Mon Sep 17 00:00:00 2001 From: Pedram Alizadeh Date: Thu, 2 Mar 2023 11:05:25 -0500 Subject: [PATCH 1/4] Adding -pthread flag for linking issues into CMakeLists.txt and src/Makefile (#31) [ROCm/rccl-tests commit: 255750b094265f89256102b3b68c57e72e3a0e45] --- projects/rccl-tests/CMakeLists.txt | 2 ++ projects/rccl-tests/src/Makefile | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/rccl-tests/CMakeLists.txt b/projects/rccl-tests/CMakeLists.txt index 539a1eae2b..d950565e2f 100644 --- a/projects/rccl-tests/CMakeLists.txt +++ b/projects/rccl-tests/CMakeLists.txt @@ -1,6 +1,8 @@ # ######################################################################## # Copyright 2022 Advanced Micro Devices, Inc. # ######################################################################## +#Adding pthread flag for linking +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR) diff --git a/projects/rccl-tests/src/Makefile b/projects/rccl-tests/src/Makefile index 3dbd41ff9a..f01e7b3850 100644 --- a/projects/rccl-tests/src/Makefile +++ b/projects/rccl-tests/src/Makefile @@ -27,7 +27,7 @@ HIPCUFLAGS += -I$(ROCM_PATH)/include HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt +HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread ifeq ($(DEBUG), 0) HIPCUFLAGS += -O3 From 39f83d5bb7a2b9edd253832294ae575ea90ec964 Mon Sep 17 00:00:00 2001 From: Pedram Alizadeh Date: Mon, 3 Apr 2023 11:37:13 -0400 Subject: [PATCH 2/4] fixing the error message for mpirun when number of requested GPUs exceeds the limits (#33) [ROCm/rccl-tests commit: e146460810a88b1a47b22308be63702485994fce] --- projects/rccl-tests/src/common.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 332cc3f272..eb0743a52f 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -1127,11 +1127,13 @@ int main(int argc, char* argv[]) { } HIPCHECK(hipGetDeviceCount(&numDevices)); +#ifndef MPI_SUPPORT if (nGpus > numDevices) { fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices); return testNcclError; } +#endif if (minBytes > maxBytes) { fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", (unsigned long long)minBytes, @@ -1154,7 +1156,14 @@ int main(int argc, char* argv[]) { return -1; } #ifdef MPI_SUPPORT + int nProcs = 1; MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + if (nGpus * nProcs > numDevices) + { + fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus*nProcs, numDevices); + return testNcclError; + } #endif TESTCHECK(run()); return 0; From b7054b500a5ae83e8f8ac9bd1f1bee40e3d03f94 Mon Sep 17 00:00:00 2001 From: Pedram Alizadeh Date: Tue, 25 Apr 2023 13:44:43 -0400 Subject: [PATCH 3/4] Revert "fixing the error message for mpirun when number of requested GPUs exceeds the limits (#33)" (#36) This reverts commit 39f83d5bb7a2b9edd253832294ae575ea90ec964. 
[ROCm/rccl-tests commit: e856fa720ff45d319225ef1ee4c3f7467e90e823] --- projects/rccl-tests/src/common.cu | 9 --------- 1 file changed, 9 deletions(-) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index eb0743a52f..332cc3f272 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -1127,13 +1127,11 @@ int main(int argc, char* argv[]) { } HIPCHECK(hipGetDeviceCount(&numDevices)); -#ifndef MPI_SUPPORT if (nGpus > numDevices) { fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices); return testNcclError; } -#endif if (minBytes > maxBytes) { fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", (unsigned long long)minBytes, @@ -1156,14 +1154,7 @@ int main(int argc, char* argv[]) { return -1; } #ifdef MPI_SUPPORT - int nProcs = 1; MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nProcs); - if (nGpus * nProcs > numDevices) - { - fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus*nProcs, numDevices); - return testNcclError; - } #endif TESTCHECK(run()); return 0; From aa5f75aa9c24837b8490774588cc6dcaca2a3f60 Mon Sep 17 00:00:00 2001 From: Pedram Alizadeh Date: Thu, 27 Apr 2023 14:06:17 -0400 Subject: [PATCH 4/4] fixing the error message for mpirun when number of requested GPUs exceeds the limits (#37) [ROCm/rccl-tests commit: d16d1fb16b2abe1c1c88464097e6f1d8070d1116] --- projects/rccl-tests/src/common.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 332cc3f272..7107d8b4e6 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -1127,11 +1127,13 @@ int main(int argc, char* argv[]) { } HIPCHECK(hipGetDeviceCount(&numDevices)); +#ifndef MPI_SUPPORT if (nGpus > numDevices) { fprintf(stderr, "[ERROR] The number 
of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices); return testNcclError; } +#endif if (minBytes > maxBytes) { fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", (unsigned long long)minBytes, @@ -1163,6 +1165,7 @@ int main(int argc, char* argv[]) { testResult_t run() { int nProcs = 1, proc = 0; int localRank = 0; + int localSize = 0; char hostname[1024]; getHostName(hostname, 1024); @@ -1176,6 +1179,14 @@ testResult_t run() { if (p == proc) break; if (hostHashs[p] == hostHashs[proc]) localRank++; } + for (int p=0; p<nProcs; p++) { + if (hostHashs[p] == hostHashs[proc]) localSize++; + } + if (nGpus * localSize > numDevices) + { + fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d) on node (%s)\n", nGpus*localSize, numDevices, hostname); + return testNcclError; + } #endif is_main_thread = (proc == 0) ? 1 : 0;