rccl-prim-test: minor update (#718)

This commit is contained in:
Wenkai Du
2023-04-03 07:30:04 -07:00
committato da GitHub
parent c8e33b1232
commit addbf4bd90
+11 -34
Vedi File
@@ -44,7 +44,7 @@ THE SOFTWARE.
// Unroll settings for the double-copy-local and reduce-copy code paths
// (presumably loop unroll factors; the use sites are not visible here -- TODO confirm).
#define DOUBLECOPYLOCAL_UNROLL 2
#define REDUCECOPY_UNROLL 2
// Checked with #ifdef (only whether it is defined matters, not its value):
// when defined, the profiling report code in main() executes `break` right
// after printing its "Total" summary line, leaving the enclosing loop.
#define PRINT_GPU0_ONLY 1
// ANSI escape sequence that resets terminal text attributes.
#define RST "\x1B[0m"
// ANSI escape sequence that selects a blue foreground color.
#define KBLU "\x1B[34m"
@@ -52,11 +52,6 @@ THE SOFTWARE.
// Wraps x in the ANSI bold escape sequence followed by RST. This relies on
// adjacent string-literal concatenation, so x must expand to string literal(s).
#define BOLD(x) "\x1B[1m" x RST
// Divisor applied in main() to the profiled write_cycles counters to obtain a
// time value (presumably RTC ticks per second, i.e. Hz, on Vega20 -- TODO confirm).
#define RTC_CLOCK_FREQ_VEGA20 2.5E7
//For now the Arcturus RTC frequency is kept the same as Vega20
//because the actual Arcturus frequency is not known to us; once we know it,
//we will update this value.
#define RTC_CLOCK_FREQ_ARCTURUS 2.5E7
// Divisor used in main()'s else branch, i.e. when prop.gcnArch is neither 906 nor 908.
#define RTC_CLOCK_FREQ_DEFAULT 2.7E7
struct transfer_data_t {
float *dest0[MAX_WORKGROUPS]; //remote fine grain
@@ -613,10 +608,9 @@ int main(int argc,char* argv[])
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemsetAsync(d_profiling_data[i], 0, sizeof(struct profiling_data_t), stream[i]));
HIPCHECK(hipStreamSynchronize(stream[i]));
HIPCHECK(hipMemset(d_profiling_data[i], 0, sizeof(struct profiling_data_t)));
}
HIPCHECK(hipStreamSynchronize(NULL));
auto start = std::chrono::high_resolution_clock::now();
for (int j = 0; j < iters; j ++) {
for (int i = 0; i < nGpu; i ++) {
@@ -676,38 +670,21 @@ int main(int argc,char* argv[])
uint32_t hopcount;
HIPCHECK(hipExtGetLinkTypeAndHopCount(i, next_gpu , &linktype, &hopcount));
if(prop.gcnArch == 906) {
write_cycle = write_cycle + profiling_data[i]->write_cycles[j];
bytes_transferred = bytes_transferred + profiling_data[i]->bytes_transferred[j];
double t0 = (double)profiling_data[i]->write_cycles[j]/((double)RTC_CLOCK_FREQ_VEGA20);
fprintf(stderr, "%-20d %-d->%-10d %-13d %-13s %-13.4f %-20lu %-.2f\n",
i,i, next_gpu,j,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred[j], (double)profiling_data[i]->bytes_transferred[j]/(t0*1.0E9));
} else if (prop.gcnArch == 908) {
write_cycle = write_cycle + profiling_data[i]->write_cycles[j];
bytes_transferred = bytes_transferred + profiling_data[i]->bytes_transferred[j];
double t0 = (double)profiling_data[i]->write_cycles[j]/((double)RTC_CLOCK_FREQ_ARCTURUS);
fprintf(stderr, "%-20d %-d->%-10d %-13d %-13s %-13.4f %-20lu %-.2f\n",
i,i, next_gpu,j,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred[j], (double)profiling_data[i]->bytes_transferred[j]/(t0*1.0E9));
} else {
write_cycle = write_cycle + profiling_data[i]->write_cycles[j];
bytes_transferred = bytes_transferred + profiling_data[i]->bytes_transferred[j];
double t0 = (double)profiling_data[i]->write_cycles[j]/((double)RTC_CLOCK_FREQ_DEFAULT);
fprintf(stderr, "%-20d %-d->%-10d %-13d %-13s %-13.4f %-20lu %-.2f\n",
i,i, next_gpu,j,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred[j], (double)profiling_data[i]->bytes_transferred[j]/(t0*1.0E9));
}
write_cycle = write_cycle + profiling_data[i]->write_cycles[j];
bytes_transferred = bytes_transferred + profiling_data[i]->bytes_transferred[j];
double t0 = (double)profiling_data[i]->write_cycles[j]/((double)RTC_CLOCK_FREQ_VEGA20);
fprintf(stderr, "%-20d %-d->%-10d %-13d %-13s %-13.4f %-20lu %-.2f\n",
i,i, next_gpu,j,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred[j], (double)profiling_data[i]->bytes_transferred[j]/(t0*1.0E9));
}
print_table_summary_line();
double total = 0;
if(prop.gcnArch == 906 ) {
total = (double)write_cycle/((double)RTC_CLOCK_FREQ_VEGA20)/(double)workgroups;
}else if (prop.gcnArch == 908 ){
total = (double)write_cycle/((double)RTC_CLOCK_FREQ_ARCTURUS)/(double)workgroups;
} else {
total = (double)write_cycle/((double)RTC_CLOCK_FREQ_DEFAULT)/(double)workgroups;
}
total = (double)write_cycle/((double)RTC_CLOCK_FREQ_VEGA20)/(double)workgroups;
fprintf(stderr, " %-61s %-13.4f %-20lu %-.2f\n",
"Total" , total, bytes_transferred, (double)bytes_transferred/(total*1.0E9));
print_table_summary_line();
#ifdef PRINT_GPU0_ONLY
break;
#endif
}
std::cout << BOLD(FBLU("[Application Level Transfer Profiling Data]"))<<std::endl;
uint64_t total_bytes_transferred = profiling_data[0]->bytes_transferred[0] * workgroups ;