diff --git a/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp b/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp index d57700474b..2820d64ef4 100644 --- a/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp +++ b/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp @@ -42,6 +42,13 @@ THE SOFTWARE. #define DOUBLECOPY_UNROLL 2 #define REDUCECOPY_UNROLL 2 + + +#define RST "\x1B[0m" +#define KBLU "\x1B[34m" +#define FBLU(x) KBLU x RST +#define BOLD(x) "\x1B[1m" x RST + #define RTC_CLOCK_FREQ_VEGA20 2.7E07 //Right now kept the MI100 RTC frequency same as Vega20 //as we are not aware of MI100 frequency, once we we come to know about it @@ -61,14 +68,24 @@ struct transfer_data_t { }; struct profiling_data_t { - uint64_t write_cycles; - uint64_t bytes_transferred; + uint64_t write_cycles[MAX_WORKGROUPS]; + uint64_t bytes_transferred[MAX_WORKGROUPS]; }; #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) +void print_table_header(void) { + fprintf(stderr, "%120s","=================================================================================================================================\n"); + fprintf(stderr, "%-20s %-13s %-13s %-13s %-13s %-20s %-20s\n","[Originating GPU]", "[Directions]", "[WorkGroup]", "[linktype]", "[time(sec)]" , "[bytes_transferred]", "[kernel throughput(GB/s)]"); + fprintf(stderr, "%120s","=================================================================================================================================\n"); +} + +void print_table_summary_line(void) { + fprintf(stderr, "%120s","---------------------------------------------------------------------------------------------------------------------------------\n"); +} + enum Ops { OP_COPY, OP_LOCALCOPY, @@ -110,8 +127,8 @@ __global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct p __syncthreads(); if (idx == 0) { next_time = clock64(); - __atomic_fetch_add(&(profiling_data->write_cycles), next_time - curr_time, __ATOMIC_SEQ_CST); - __atomic_fetch_add(&(profiling_data->bytes_transferred), n * sizeof(float), __ATOMIC_SEQ_CST); + __atomic_fetch_add(&(profiling_data->write_cycles[bid]), next_time - curr_time, __ATOMIC_SEQ_CST); + __atomic_fetch_add(&(profiling_data->bytes_transferred[bid]), n * sizeof(float), __ATOMIC_SEQ_CST); } } @@ -376,9 +393,9 @@ int main(int argc,char* argv[]) uint64_t opCount = 0; for (int op = begin_op; op < end_op; op ++) { const char *OpsName[] = {"Copy", "Local Copy", "Double Copy", "Reduce", "ReduceCopy"}; - printf("[Testing %s]: \n", OpsName[op]); - // 2 warm up cycles - for (int i = 0; i < 2; i ++) { + printf("\n[Testing %s]: \n", OpsName[op]); + // 4 warm up cycles + for (int i = 0; i < 4; i ++) { for (int i = 0; i < nGpu; i ++) { HIPCHECK(hipSetDevice(i)); //launch the kernel @@ -420,38 +437,71 @@ int main(int argc,char* argv[]) auto delta = std::chrono::high_resolution_clock::now() - start; double deltaSec = std::chrono::duration_cast>(delta).count(); - - std::cout<<"***GPU to GPU Transfer Profiling Data***"<write_cycles[j]; + bytes_transferred = bytes_transferred + profiling_data[i]->bytes_transferred[j]; + double t0 = (double)profiling_data[i]->write_cycles[j]/((double)RTC_CLOCK_FREQ_VEGA20); + fprintf(stderr, "%-20d %-d->%-10d %-13d %-13s %-13.4f %-20lu %-.2f\n", + i,i, next_gpu,j,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred[j], (double)profiling_data[i]->bytes_transferred[j]/(t0*1.0E9)); + } else if (prop.gcnArch == 908 ){ + write_cycle = write_cycle + profiling_data[i]->write_cycles[j]; + bytes_transferred = bytes_transferred + profiling_data[i]->bytes_transferred[j]; + double t0 = (double)profiling_data[i]->write_cycles[j]/((double)RTC_CLOCK_FREQ_MI100); + fprintf(stderr, "%-20d %-d->%-10d %-13d %-13s %-13.4f %-20lu %-.2f\n", + i,i, next_gpu,j,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred[j], (double)profiling_data[i]->bytes_transferred[j]/(t0*1.0E9)); + } else { + write_cycle = write_cycle + profiling_data[i]->write_cycles[j]; + bytes_transferred = bytes_transferred + profiling_data[i]->bytes_transferred[j]; + double t0 = (double)profiling_data[i]->write_cycles[j]/((double)RTC_CLOCK_FREQ_DEFAULT); + fprintf(stderr, "%-20d %-d->%-10d %-13d %-13s %-13.4f %-20lu %-.2f\n", + i,i, next_gpu,j,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred[j], (double)profiling_data[i]->bytes_transferred[j]/(t0*1.0E9)); + } + } + print_table_summary_line(); + double total = 0; if(prop.gcnArch == 906 ) { - double t0 = (double)profiling_data[i]->write_cycles/((double)RTC_CLOCK_FREQ_VEGA20)/(double)workgroups; - fprintf(stderr, "[GPU %d -> GPU %d][%s]:time %.4fs bytes_transferred %lu kernel throughput %.2f GB/s\n", - i, next_gpu,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred, (double)profiling_data[i]->bytes_transferred/(t0*1.0E9)); - } else if (prop.gcnArch == 908 ){ - double t0 = (double)profiling_data[i]->write_cycles/((double)RTC_CLOCK_FREQ_MI100)/(double)workgroups; - fprintf(stderr, "[GPU %d -> GPU %d][%s]:time %.4fs bytes_transferred %lu kernel throughput %.2f GB/s\n", - i, next_gpu,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred, (double)profiling_data[i]->bytes_transferred/(t0*1.0E9)); - } else { - double t0 = (double)profiling_data[i]->write_cycles/((double)RTC_CLOCK_FREQ_DEFAULT)/(double)workgroups; - fprintf(stderr, "[GPU %d -> GPU %d][%s]:time %.4fs bytes_transferred %lu kernel throughput %.2f GB/s\n", - i, next_gpu,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred, (double)profiling_data[i]->bytes_transferred/(t0*1.0E9)); - } + total = (double)write_cycle/((double)RTC_CLOCK_FREQ_VEGA20)/(double)workgroups; + }else if (prop.gcnArch == 908 ){ + total = (double)write_cycle/((double)RTC_CLOCK_FREQ_MI100)/(double)workgroups; + } else { + total = (double)write_cycle/((double)RTC_CLOCK_FREQ_DEFAULT)/(double)workgroups; + } + fprintf(stderr, " %-61s %-13.4f %-20lu %-.2f\n", + "Total" , total, bytes_transferred, (double)bytes_transferred/(total*1.0E9)); + print_table_summary_line(); } - std::cout<<"***Application Level Transfer Profiling Data***"<bytes_transferred) / (deltaSec*1.0E9); - printf("Transfered %lu bytes in %f s. Throughput %f GB/s\n", profiling_data[0]->bytes_transferred, deltaSec, speed); + std::cout << BOLD(FBLU("[Application Level Transfer Profiling Data]"))<bytes_transferred[0] * workgroups ; + print_table_summary_line(); + fprintf(stderr, " %-61s %-13.4f %-20lu %-.2f\n", + "Total" , deltaSec, total_bytes_transferred, (double)total_bytes_transferred/(deltaSec*1.0E9)); + print_table_summary_line(); } for (int i = 0; i < nGpu; i ++) {