Merge pull request #38 from ROCmSoftwarePlatform/develop_merge

Merge master branch into develop

[ROCm/rccl-tests commit: d5201418a9]
This commit is contained in:
Wenkai Du
2023-06-14 08:12:30 -07:00
Committed by GitHub
Commit e07d7ec1b5
@@ -966,11 +966,13 @@ int main(int argc, char* argv[]) {
}
HIPCHECK(hipGetDeviceCount(&numDevices));
#ifndef MPI_SUPPORT
if (nGpus > numDevices)
{
fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices);
return testNcclError;
}
#endif
if (minBytes > maxBytes) {
fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n",
(unsigned long long)minBytes,
@@ -1002,6 +1004,7 @@ int main(int argc, char* argv[]) {
testResult_t run() {
int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
int localRank = 0;
int localSize = 0;
char hostname[1024];
getHostName(hostname, 1024);
@@ -1023,6 +1026,15 @@ testResult_t run() {
MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
MPI_Comm_size(mpi_comm, &ncclProcs);
MPI_Comm_rank(mpi_comm, &ncclProc);
for (int p=0; p<nProcs; p++) {
if (hostHashs[p] == hostHashs[proc]) localSize++;
}
if (nGpus * localSize > numDevices)
{
fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d) on node (%s)\n", nGpus*localSize, numDevices, hostname);
return testNcclError;
}
#endif
is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;