Merge pull request #79 from saurabhAMD/rotating_tensor

Rotating tensor option -R (default: off)

[ROCm/rccl-tests commit: 073d56f6e2]
This commit is contained in:
saurabhAMD
2024-06-04 17:51:26 -05:00
committed by GitHub
2 changed files with 24 additions and 5 deletions
+23 -5
View File
@@ -25,6 +25,7 @@
int test_ncclVersion = 0; // init'd with ncclGetVersion()
// Assigned deviceProps.multiProcessorCount * 60 on one option-handling path in main().
int32_t gpu_block3;
// Size in bytes (192 * 1024 * 1024) used by the rotating tensor mode: AllocateBuffs adds
// it to recvBytes and nbytes, and startColl uses cache_bytes * (iter % 2) as the shift,
// so the shift alternates between 0 and cache_bytes from one iteration to the next.
size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
#if NCCL_MAJOR >= 2
ncclDataType_t test_types[ncclNumTypes] = {
@@ -105,6 +106,7 @@ static int numDevices = 1;
// File-local option flags with their default values.
// NOTE(review): presumably set by -q/--delay (microseconds between out-of-place and
// in-place runs, per the usage text) -- the parsing case is not visible here; confirm.
static int delay_inout_place = 0;
// Copied into each thread's args in run(); presumably controlled by -O/--out_of_place <0/1>.
static int enable_out_of_place = 1;
// Copied into each thread's args in run(); presumably controlled by -F/--cache_flush.
static int enable_cache_flush = 0;
// Set from -R/--rotating_tensor via strtol(); 0 (off) by default. When non-zero,
// startColl shifts by cache_bytes * (iter % 2) instead of the totalnbytes-based shift,
// and AllocateBuffs grows recvBytes and nbytes by cache_bytes.
static int enable_rotating_tensor = 0;
// NOTE(review): meaning of this block count is not shown in the visible code; confirm at use sites.
#define NUM_BLOCKS 32
@@ -423,10 +425,16 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
size_t count = args->nbytes / wordSize(type);
// Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
size_t shift = totalnbytes * (iter % steps);
size_t shift = 0;
if(enable_rotating_tensor) {
shift = cache_bytes * (iter % 2);
}
else {
size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
shift = totalnbytes * (iter % steps);
}
if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
for (int i = 0; i < args->nGpus; i++) {
#ifndef NCCL_MAJOR
@@ -852,6 +860,10 @@ testResult_t threadLaunch(struct testThread* thread) {
}
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
if(enable_rotating_tensor) {
recvBytes = recvBytes + cache_bytes;
nbytes = nbytes + cache_bytes;
}
if (memorytype == ncclFine) {
CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
@@ -939,6 +951,7 @@ int main(int argc, char* argv[]) {
{"average", required_argument, 0, 'a'},
{"out_of_place", required_argument, 0, 'O'},
{"cache_flush", required_argument, 0, 'F'},
{"rotating_tensor", required_argument, 0, 'R'},
{"help", no_argument, 0, 'h'},
{}
};
@@ -946,7 +959,7 @@ int main(int argc, char* argv[]) {
while(1) {
int c;
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:a:y:s:u:h:q:", longopts, &longindex);
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:R:a:y:s:u:h:q:", longopts, &longindex);
if (c == -1)
break;
@@ -1054,6 +1067,9 @@ int main(int argc, char* argv[]) {
gpu_block3 = deviceProps.multiProcessorCount * 60;
}
break;
case 'R':
enable_rotating_tensor = strtol(optarg, NULL, 0);
break;
case 'a':
average = (int)strtol(optarg, NULL, 0);
break;
@@ -1094,6 +1110,7 @@ int main(int argc, char* argv[]) {
"[-C,--report_cputime <0/1>] \n\t"
"[-O,--out_of_place <0/1>] \n\t"
"[-F,--cache_flush <number of iterations between instruction cache flush>] \n\t"
"[-R,--rotating_tensor <0/1>] \n\t"
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
"[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
"[-h,--help]\n",
@@ -1305,6 +1322,7 @@ testResult_t run() {
threads[t].args.streams=streams+t*nGpus;
threads[t].args.enable_out_of_place=enable_out_of_place;
threads[t].args.enable_cache_flush = enable_cache_flush;
threads[t].args.enable_rotating_tensor = enable_rotating_tensor;
threads[t].args.errors=errors+t;
threads[t].args.bw=bw+t;
threads[t].args.bw_count=bw_count+t;
+1
View File
@@ -128,6 +128,7 @@ struct threadArgs {
int localRank;
int enable_out_of_place;
int enable_cache_flush;
int enable_rotating_tensor;
void** sendbuffs;
size_t sendBytes;
size_t sendInplaceOffset;