rccl_prim_test: Generalize ring topology and duplications

Allow a user-specified ring topology to be given on the command line; the
specified rings are duplicated to cover the requested number of workgroups:
./rccl_prim_test -w 12 -p copy -r "0 1 2 3|3 2 1 0|0 2 1 3|3 1 2 0|0 2 3 1|1 3 2 0"
This commit is contained in:
Wenkai Du
2019-11-11 14:08:31 -08:00
förälder 277c72a638
incheckning 07bb6fce8f
+38 -14
Visa fil
@@ -243,11 +243,9 @@ static void setupRings(uint32_t *info, int *ring_0, int *ring_1) {
printf("\n");
}
findConnect(info, ring_0, deviceCnt);
printRing(0, ring_0, deviceCnt);
ring_1[0] =0;
for (int i = 1; i < deviceCnt; i++)
ring_1[i] = ring_0[deviceCnt-i];
printRing(1, ring_1, deviceCnt);
}
char* getCmdOption(char ** begin, char ** end, const std::string & option) {
@@ -270,7 +268,7 @@ static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};
int main(int argc,char* argv[])
{
if (cmdOptionExists(argv, argv + argc, "-h")) {
printf("./rccl_prim_test -w num_workgroups -p copy|localcopy|doublecopy|reduce|reducecopy|all -i iterations -n bytes -s 0|1\n");
printf("./rccl_prim_test -w num_workgroups -p copy|localcopy|doublecopy|reduce|reducecopy|all -i iterations -n bytes -s 0|1 -r \"0 1 2 3|3 2 1 0\"\n");
exit(0);
}
@@ -299,6 +297,9 @@ int main(int argc,char* argv[])
sync = atol(s);
if (sync) printf("Sync all GPUs before operation\n");
char *r = getCmdOption(argv, argv + argc, "-r");
if (r) printf("User specified ring topology: %s\n", r);
const char *ops[] = {"copy", "localcopy", "doublecopy", "reduce", "reducecopy", "read", "all"};
char *prim = getCmdOption(argv, argv + argc, "-p");
int op = NUM_OPS, begin_op, end_op;
@@ -320,9 +321,34 @@ int main(int argc,char* argv[])
// Enable peer access
setupPeers(connection_info);
// clockwise and counter clockwise rings
int ring_0[MAX_GPU] = {-1, -1, -1, -1,-1, -1, -1, -1};
int ring_1[MAX_GPU] = {-1, -1, -1, -1,-1, -1, -1, -1};
setupRings(connection_info, ring_0, ring_1);
int ring[MAX_WORKGROUPS][MAX_GPU];
for (int i = 0; i < MAX_WORKGROUPS; i++)
for (int j = 0; j <MAX_GPU; j++)
ring[i][j] = -1;
int num_rings = 0;
if (r) {
int j = 0, n = 0;
do {
if (r[n] == ' ') continue;
if (r[n] == '|') {
num_rings ++;
j = 0;
continue;
}
ring[num_rings][j++] = r[n] - '0';
} while (r[n++] != 0x0);
num_rings ++;
} else {
setupRings(connection_info, ring[0], ring[1]);
num_rings = 2;
}
// duplicate rings
for (int i = num_rings; i < MAX_WORKGROUPS; i++) {
for (int j = 0; j <MAX_GPU; j++)
ring[i][j] = ring[i%num_rings][j];
}
// data buffers
float *buff[MAX_GPU*MAX_WORKGROUPS], *buff_coarse[MAX_GPU*MAX_WORKGROUPS];
@@ -336,6 +362,10 @@ int main(int argc,char* argv[])
HIPCHECK(hipHostMalloc((void**)&remOpCount, sizeof(uint64_t)*MAX_GPU, hipHostMallocMapped));
HIPCHECK(hipHostGetDevicePointer((void**)&d_remOpCount, (void*)remOpCount, 0));
// print rings
for (int i = 0; i < workgroups; i++) {
printRing(i, ring[i], nGpu);
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
@@ -371,10 +401,7 @@ int main(int argc,char* argv[])
for (int i = 0; i < nGpu; i ++) {
for (int j = 0; j < workgroups; j++) {
int next_gpu;
if (j%2)
next_gpu = findNextGpu(ring_1, i, nGpu);
else
next_gpu = findNextGpu(ring_0, i, nGpu);
next_gpu = findNextGpu(ring[j], i, nGpu);
//printf("GPU %d Ring %d -> Next GPU %d\n", i, j, next_gpu);
h_transfer_data[i].dest0[j] = buff[next_gpu*MAX_WORKGROUPS+j] + N;
h_transfer_data[i].dest1[j] = buff_coarse[i*MAX_WORKGROUPS+j] + N;
@@ -467,10 +494,7 @@ int main(int argc,char* argv[])
HIPCHECK(hipGetDeviceProperties(&prop, i));
for (int j = 0; j < workgroups; j++) {
int next_gpu;
if (j%2)
next_gpu = findNextGpu(ring_1, i, nGpu);
else
next_gpu = findNextGpu(ring_0, i, nGpu);
next_gpu = findNextGpu(ring[j], i, nGpu);
uint32_t linktype;
uint32_t hopcount;