diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index a7ebcad5b1..c1d328d9f6 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -268,75 +268,20 @@ class ncclFunction { #endif }; -#ifdef ENABLE_COLLTRACE #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) - #define traceColl(launch_type) { \ - uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ - struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ - collTrace->timeStamp = wall_clock64(); \ - collTrace->bid = blockIdx.x; \ - collTrace->funcIndex = ncclShmem.work.header.funcIndex; \ - if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \ - struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \ - collTrace->p2p[0].connIndex = 0; \ - collTrace->p2pOpCount[0] = p2pElems[0].opCount; \ - collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \ - collTrace->p2p[0].nWarps = p2pElems[0].nWarps; \ - collTrace->p2p[0].warpStart = p2pElems[0].warpStart; \ - collTrace->p2p[0].peer = p2pElems[0].p2pType == ncclWorkP2pTypeRecv ? (uint16_t)(p2pElems[0].peer) : -1; \ - collTrace->p2p[1].connIndex = 0; \ - collTrace->p2pOpCount[1] = p2pElems[1].opCount; \ - collTrace->p2p[1].ngroups = p2pElems[1].ngroups; \ - collTrace->p2p[1].nWarps = p2pElems[1].nWarps; \ - collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \ - collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \ - collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \ - } else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \ - struct ncclWorkElem *elems = ncclShmem.work.elems; \ - collTrace->opCount = elems[0].opCount; \ - collTrace->coll.nWarps = elems[0].nWarps; \ - collTrace->coll.bid = elems[0].bid; \ - collTrace->coll.nChannels = elems[0].nChannels; \ - collTrace->type = (launch_type) | ncclCollTraceCollElemType; \ - } \ - } - - #define traceKernelLaunch(firstLaunch) { \ - traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \ - } - #define traceKernelEnd() { \ - uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ - struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ - collTrace->timeStamp = wall_clock64(); \ - collTrace->bid = blockIdx.x; \ - collTrace->type = ncclCollTraceKernelEndType; \ - } - #define traceAbort() { \ - uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ - struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ - collTrace->timeStamp = wall_clock64(); \ - collTrace->bid = blockIdx.x; \ - collTrace->type = ncclCollTraceAbortType; \ - } - #define traceData(data2, data4, data8_0, data8_1) { \ - uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ - struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ - collTrace->bid = blockIdx.x; \ - collTrace->timeStamp = wall_clock64(); \ - collTrace->funcIndex = data2; \ - collTrace->data_0 = data4; \ - collTrace->opCount = data8_0; \ - collTrace->data_1 = data8_1; \ - collTrace->type = ncclCollTraceDataType; \ - } +#define __trace_hwreg() #else +#define __trace_hwreg() \ + asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); +#endif +#ifdef ENABLE_COLLTRACE #define traceColl(launch_type) { \ uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ - collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ + collTrace->timeStamp = wall_clock64(); \ collTrace->bid = blockIdx.x; \ collTrace->funcIndex = ncclShmem.work.header.funcIndex; \ - asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \ + __trace_hwreg()\ if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \ struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \ collTrace->p2p[0].connIndex = 0; \ @@ -361,20 +306,21 @@ class ncclFunction { collTrace->type = (launch_type) | ncclCollTraceCollElemType; \ } \ } +// #endif #define traceKernelLaunch(firstLaunch) { \ traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \ } #define traceKernelEnd() { \ uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ - collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ + collTrace->timeStamp = wall_clock64(); \ collTrace->bid = blockIdx.x; \ collTrace->type = ncclCollTraceKernelEndType; \ } #define traceAbort() { \ uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ - collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ + collTrace->timeStamp = wall_clock64(); \ collTrace->bid = blockIdx.x; \ collTrace->type = ncclCollTraceAbortType; \ } @@ -382,14 +328,13 @@ class ncclFunction { uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ collTrace->bid = blockIdx.x; \ - collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ + collTrace->timeStamp = wall_clock64(); \ collTrace->funcIndex = data2; \ collTrace->data_0 = data4; \ collTrace->opCount = data8_0; \ collTrace->data_1 = data8_1; \ collTrace->type = ncclCollTraceDataType; \ } -#endif #else #define traceData(data2, data4, data8_0, data8_1) #endif @@ -433,7 +378,7 @@ __device__ inline void* ncclScratchForWarp(int warp) { #define __insert_timestamp(line_num) do { \ if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \ ncclShmem.prof.elem[ncclShmem.prof.count].line = line_num; \ - ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \ + ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = wall_clock64(); \ ncclShmem.prof.count++; \ } \ } while(0); diff --git a/src/include/npkit/npkit.h b/src/include/npkit/npkit.h index 924dc71f85..69304eb553 100644 --- a/src/include/npkit/npkit.h +++ b/src/include/npkit/npkit.h @@ -14,11 +14,7 @@ #include "npkit/npkit_event.h" #include "npkit/npkit_struct.h" -#if defined(__GFX9__) || defined(__GFX10__) -#define NPKIT_GET_GPU_TIMESTAMP __builtin_amdgcn_s_memrealtime -#else -#define NPKIT_GET_GPU_TIMESTAMP clock64 -#endif +#define NPKIT_GET_GPU_TIMESTAMP wall_clock64 class NpKit { public: diff --git a/tools/ib-test/ib_test.cpp b/tools/ib-test/ib_test.cpp index 640b4f4328..b0f94af188 100755 --- a/tools/ib-test/ib_test.cpp +++ b/tools/ib-test/ib_test.cpp @@ -85,7 +85,7 @@ __device__ inline __attribute((always_inline)) long long int __rtc64() { #if __HIP__ - return (long long int) __builtin_amdgcn_s_memrealtime(); + return (long long int) wall_clock64(); #else return (long long int) __clock_u64(); #endif diff --git a/tools/rccl-prim-test/rccl_prim_test.cpp b/tools/rccl-prim-test/rccl_prim_test.cpp index 931f0a3ec1..4bc4ad3a52 100644 --- a/tools/rccl-prim-test/rccl_prim_test.cpp +++ b/tools/rccl-prim-test/rccl_prim_test.cpp @@ -117,7 +117,7 @@ __global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct p __syncthreads(); if (tid == 0) - curr_time = __builtin_amdgcn_s_memrealtime(); + curr_time = wall_clock64(); if (op == OP_COPY) { srcs[0] = transfer_data->src0[bid]; diff --git a/tools/scripts/npkit_trace_generator.py b/tools/scripts/npkit_trace_generator.py index 7cfc021c27..3c4ba90420 100644 --- a/tools/scripts/npkit_trace_generator.py +++ b/tools/scripts/npkit_trace_generator.py @@ -39,11 +39,6 @@ def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path): den = float(f.read()) return den / num / 1e6 -def parse_clock_calibration_info(clock_calibration_file_path): - with open(clock_calibration_file_path, 'r') as f: - num = float(f.read()) - return num - def parse_gpu_event(event_bytes): return { 'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False), @@ -60,13 +55,13 @@ def parse_cpu_event(event_bytes): 'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False) } -def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, gpu_time_cpu, gpu_time_gpu, dictionary_of_stats, warmup_runs=5): +def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, dictionary_of_stats, warmup_runs=5): gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx)) stats_key = 'gpu_rank_%d' % (rank) channel_stats = {} raw_event_size = 16 - cpu_base_time = gpu_time_cpu / cpu_clock_scale - gpu_base_time = gpu_time_gpu / gpu_clock_scale + curr_cpu_base_time = None + curr_gpu_base_time = None gpu_events = [] event_type_to_seq = {} unfiltered_events = [] @@ -78,64 +73,77 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo if raw_content_size > 0 and stats_key not in dictionary_of_stats: dictionary_of_stats[stats_key] = {} warmup_raw_content_idx = 0 - parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) - unfiltered_events.append(parsed_gpu_event) - start_event_id = parsed_gpu_event['id'] # start event id while warmup_runs != 0 and warmup_raw_content_idx < raw_content_size: #warmup run cleanup - warmup_raw_content_idx += raw_event_size parsed_gpu_event = parse_gpu_event(raw_content[warmup_raw_content_idx : warmup_raw_content_idx + raw_event_size]) - unfiltered_events.append(parsed_gpu_event) + unfiltered_events.insert(0, parsed_gpu_event) + if start_event_id == 0: + decoded_id = npkit_event_def['id_to_type'][parsed_gpu_event['id']] + if decoded_id == 'NPKIT_EVENT_TIME_SYNC_CPU' or decoded_id == 'NPKIT_EVENT_TIME_SYNC_GPU': + warmup_raw_content_idx += raw_event_size + continue + else: + start_event_id = parsed_gpu_event['id'] + + warmup_raw_content_idx += raw_event_size if parsed_gpu_event['id'] == (start_event_id + 1): warmup_runs -= 1 - warmup_raw_content_idx += raw_event_size raw_content_idx = warmup_raw_content_idx - while raw_content_idx < raw_content_size: parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size]) - unfiltered_events.append(parsed_gpu_event) - event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] - phase = 'B' if event_type.endswith('_ENTRY') else 'E' - gpu_events.append({ - 'ph': phase, - 'ts': cpu_base_time + ((parsed_gpu_event['timestamp'] / gpu_clock_scale) - gpu_base_time), - 'pid': rank, - 'tid': buf_idx + 1 - }) - if phase == 'B': - if event_type not in event_type_to_seq: - event_type_to_seq[event_type] = 0 - gpu_events[-1].update({ - 'name': event_type, - 'cat': 'GPU', - 'args': { - 'rank': rank, - 'buf_idx': buf_idx, - 'seq': event_type_to_seq[event_type], - 'rsvd_0': parsed_gpu_event['rsvd'], - 'size_0': parsed_gpu_event['size'] - } - }) - event_type_to_seq[event_type] += 1 + unfiltered_events.insert(0, parsed_gpu_event) + if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_CPU': + curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale + curr_gpu_base_time = None + elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU': + if curr_gpu_base_time is None: + curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale else: - gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} - current_id = parsed_gpu_event['id'] - gpu_events_reverse = unfiltered_events[::-1] - for i in gpu_events_reverse: - if i['id'] == (current_id-1): - event_start_ts = cpu_base_time + ((i['timestamp'] / gpu_clock_scale) - gpu_base_time) - break - delta_time = max(0.001, gpu_events[-1]['ts'] - event_start_ts) # delta needs to take the last begin - bandwidth = gpu_events[-1]['args']['size'] / delta_time / 1e3 - if (current_id,parsed_gpu_event['size']) in channel_stats: - temp_size = channel_stats[(current_id,parsed_gpu_event['size'])][1]+1 - temp = channel_stats[(current_id,parsed_gpu_event['size'])][0] * (temp_size - 1 )/ (temp_size) - temp_delta = channel_stats[(current_id,parsed_gpu_event['size'])][2] * (temp_size - 1 )/ (temp_size) - channel_stats[(current_id,parsed_gpu_event['size'])][0] = bandwidth / (temp_size) + temp - channel_stats[(current_id,parsed_gpu_event['size'])][1] = temp_size - channel_stats[(current_id,parsed_gpu_event['size'])][2] = delta_time / (temp_size) + temp_delta + if curr_gpu_base_time is None: + curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale + event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']] + phase = 'B' if event_type.endswith('_ENTRY') else 'E' + gpu_events.append({ + 'ph': phase, + 'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time, + 'pid': rank, + 'tid': buf_idx + 1 + + }) + if phase == 'B': + if event_type not in event_type_to_seq: + event_type_to_seq[event_type] = 0 + gpu_events[-1].update({ + 'name': event_type, + 'cat': 'GPU', + 'args': { + 'rank': rank, + 'buf_idx': buf_idx, + 'seq': event_type_to_seq[event_type], + 'rsvd_0': parsed_gpu_event['rsvd'], + 'size_0': parsed_gpu_event['size'] + } + }) + event_type_to_seq[event_type] += 1 else: - channel_stats[(current_id,parsed_gpu_event['size'])] = [bandwidth, 1, delta_time] - gpu_events[-1]['args']['bw (GB/s)'] = bandwidth + gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']} + current_id = parsed_gpu_event['id'] + + for i in unfiltered_events: + if i['id'] == (current_id-1): + event_start_ts = curr_cpu_base_time + i['timestamp'] / gpu_clock_scale - curr_gpu_base_time + break + delta_time = max(0.001, gpu_events[-1]['ts'] - event_start_ts) # delta needs to take the last begin + bandwidth = gpu_events[-1]['args']['size'] / delta_time / 1e3 + if (current_id,parsed_gpu_event['size']) in channel_stats: + temp_size = channel_stats[(current_id,parsed_gpu_event['size'])][1]+1 + temp = channel_stats[(current_id,parsed_gpu_event['size'])][0] * (temp_size - 1 )/ (temp_size) + temp_delta = channel_stats[(current_id,parsed_gpu_event['size'])][2] * (temp_size - 1 )/ (temp_size) + channel_stats[(current_id,parsed_gpu_event['size'])][0] = bandwidth / (temp_size) + temp + channel_stats[(current_id,parsed_gpu_event['size'])][1] = temp_size + channel_stats[(current_id,parsed_gpu_event['size'])][2] = delta_time / (temp_size) + temp_delta + else: + channel_stats[(current_id,parsed_gpu_event['size'])] = [bandwidth, 1, delta_time] + gpu_events[-1]['args']['bw (GB/s)'] = bandwidth raw_content_idx += raw_event_size @@ -150,11 +158,9 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo dictionary_of_stats[stats_key][key][2] = new_avg_time else: dictionary_of_stats[stats_key][key] = channel_stats[key] - - breakpoint() return gpu_events -def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale, cpu_time_global, cpu_time_local): +def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale): cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel)) raw_event_size = 16 cpu_events = [] @@ -179,7 +185,7 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo phase = 'B' if event_type.endswith('_ENTRY') else 'E' cpu_events.append({ 'ph': phase, - 'ts': (cpu_time_global + (parsed_cpu_event['timestamp'] - cpu_time_local)) / cpu_clock_scale, + 'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale, 'pid': rank }) slot = parsed_cpu_event['slot'] @@ -228,6 +234,8 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo raw_content_idx += raw_event_size return cpu_events + + def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def, gpu_statistics): files_in_dump_dir = next(os.walk(npkit_dump_dir))[2] gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')] @@ -247,17 +255,12 @@ def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def, gpu gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank) gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path) - cpu_time_global = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_cpu_global_rank_%d' % rank)) - cpu_time_local = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_cpu_local_rank_%d' % rank)) - gpu_time_cpu = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_gpu_cpu_rank_%d' % rank)) - gpu_time_gpu = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_gpu_gpu_rank_%d' % rank)) - for buf_idx in buf_indices: - gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, gpu_time_cpu, gpu_time_gpu, dictionary_of_stats) + gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, dictionary_of_stats) trace['traceEvents'].extend(gpu_events) for channel in channels: - cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale, cpu_time_global, cpu_time_local) + cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale) trace['traceEvents'].extend(cpu_events) trace['traceEvents'].sort(key=lambda x : x['ts'])