Wall clock update and npkit trace script Update (#771)

* changed builtin clock to wall_clock64
* updated npkit_Trace_generator to the new version of npkit
Этот коммит содержится в:
akolliasAMD
2023-06-07 17:47:10 -06:00
коммит произвёл GitHub
родитель c3f47853bd
Коммит 9cdac774ea
5 изменённых файлов: 86 добавлений и 142 удалений
+12 -67
Просмотреть файл
@@ -268,75 +268,20 @@ class ncclFunction {
#endif
};
#ifdef ENABLE_COLLTRACE
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
#define traceColl(launch_type) { \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x; \
collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
collTrace->p2p[0].connIndex = 0; \
collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
collTrace->p2p[0].nWarps = p2pElems[0].nWarps; \
collTrace->p2p[0].warpStart = p2pElems[0].warpStart; \
collTrace->p2p[0].peer = p2pElems[0].p2pType == ncclWorkP2pTypeRecv ? (uint16_t)(p2pElems[0].peer) : -1; \
collTrace->p2p[1].connIndex = 0; \
collTrace->p2pOpCount[1] = p2pElems[1].opCount; \
collTrace->p2p[1].ngroups = p2pElems[1].ngroups; \
collTrace->p2p[1].nWarps = p2pElems[1].nWarps; \
collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
} else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \
struct ncclWorkElem *elems = ncclShmem.work.elems; \
collTrace->opCount = elems[0].opCount; \
collTrace->coll.nWarps = elems[0].nWarps; \
collTrace->coll.bid = elems[0].bid; \
collTrace->coll.nChannels = elems[0].nChannels; \
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
} \
}
#define traceKernelLaunch(firstLaunch) { \
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
}
#define traceKernelEnd() { \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x; \
collTrace->type = ncclCollTraceKernelEndType; \
}
#define traceAbort() { \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x; \
collTrace->type = ncclCollTraceAbortType; \
}
#define traceData(data2, data4, data8_0, data8_1) { \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->bid = blockIdx.x; \
collTrace->timeStamp = wall_clock64(); \
collTrace->funcIndex = data2; \
collTrace->data_0 = data4; \
collTrace->opCount = data8_0; \
collTrace->data_1 = data8_1; \
collTrace->type = ncclCollTraceDataType; \
}
#define __trace_hwreg()
#else
#define __trace_hwreg() \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0));
#endif
#ifdef ENABLE_COLLTRACE
#define traceColl(launch_type) { \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x; \
collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
__trace_hwreg()\
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
collTrace->p2p[0].connIndex = 0; \
@@ -361,20 +306,21 @@ class ncclFunction {
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
} \
}
// #endif
#define traceKernelLaunch(firstLaunch) { \
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
}
#define traceKernelEnd() { \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x; \
collTrace->type = ncclCollTraceKernelEndType; \
}
#define traceAbort() { \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x; \
collTrace->type = ncclCollTraceAbortType; \
}
@@ -382,14 +328,13 @@ class ncclFunction {
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->bid = blockIdx.x; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->timeStamp = wall_clock64(); \
collTrace->funcIndex = data2; \
collTrace->data_0 = data4; \
collTrace->opCount = data8_0; \
collTrace->data_1 = data8_1; \
collTrace->type = ncclCollTraceDataType; \
}
#endif
#else
#define traceData(data2, data4, data8_0, data8_1)
#endif
@@ -433,7 +378,7 @@ __device__ inline void* ncclScratchForWarp(int warp) {
#define __insert_timestamp(line_num) do { \
if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \
ncclShmem.prof.elem[ncclShmem.prof.count].line = line_num; \
ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = wall_clock64(); \
ncclShmem.prof.count++; \
} \
} while(0);
+1 -5
Просмотреть файл
@@ -14,11 +14,7 @@
#include "npkit/npkit_event.h"
#include "npkit/npkit_struct.h"
#if defined(__GFX9__) || defined(__GFX10__)
#define NPKIT_GET_GPU_TIMESTAMP __builtin_amdgcn_s_memrealtime
#else
#define NPKIT_GET_GPU_TIMESTAMP clock64
#endif
#define NPKIT_GET_GPU_TIMESTAMP wall_clock64
class NpKit {
public:
+1 -1
Просмотреть файл
@@ -85,7 +85,7 @@ __device__
inline __attribute((always_inline))
long long int __rtc64() {
#if __HIP__
return (long long int) __builtin_amdgcn_s_memrealtime();
return (long long int) wall_clock64();
#else
return (long long int) __clock_u64();
#endif
+1 -1
Просмотреть файл
@@ -117,7 +117,7 @@ __global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct p
__syncthreads();
if (tid == 0)
curr_time = __builtin_amdgcn_s_memrealtime();
curr_time = wall_clock64();
if (op == OP_COPY) {
srcs[0] = transfer_data->src0[bid];
+71 -68
Просмотреть файл
@@ -39,11 +39,6 @@ def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path):
den = float(f.read())
return den / num / 1e6
def parse_clock_calibration_info(clock_calibration_file_path):
with open(clock_calibration_file_path, 'r') as f:
num = float(f.read())
return num
def parse_gpu_event(event_bytes):
return {
'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False),
@@ -60,13 +55,13 @@ def parse_cpu_event(event_bytes):
'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False)
}
def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, gpu_time_cpu, gpu_time_gpu, dictionary_of_stats, warmup_runs=5):
def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, dictionary_of_stats, warmup_runs=5):
gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx))
stats_key = 'gpu_rank_%d' % (rank)
channel_stats = {}
raw_event_size = 16
cpu_base_time = gpu_time_cpu / cpu_clock_scale
gpu_base_time = gpu_time_gpu / gpu_clock_scale
curr_cpu_base_time = None
curr_gpu_base_time = None
gpu_events = []
event_type_to_seq = {}
unfiltered_events = []
@@ -78,64 +73,77 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo
if raw_content_size > 0 and stats_key not in dictionary_of_stats:
dictionary_of_stats[stats_key] = {}
warmup_raw_content_idx = 0
parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size])
unfiltered_events.append(parsed_gpu_event)
start_event_id = parsed_gpu_event['id'] # start event id
while warmup_runs != 0 and warmup_raw_content_idx < raw_content_size: #warmup run cleanup
warmup_raw_content_idx += raw_event_size
parsed_gpu_event = parse_gpu_event(raw_content[warmup_raw_content_idx : warmup_raw_content_idx + raw_event_size])
unfiltered_events.append(parsed_gpu_event)
unfiltered_events.insert(0, parsed_gpu_event)
if start_event_id == 0:
decoded_id = npkit_event_def['id_to_type'][parsed_gpu_event['id']]
if decoded_id == 'NPKIT_EVENT_TIME_SYNC_CPU' or decoded_id == 'NPKIT_EVENT_TIME_SYNC_GPU':
warmup_raw_content_idx += raw_event_size
continue
else:
start_event_id = parsed_gpu_event['id']
warmup_raw_content_idx += raw_event_size
if parsed_gpu_event['id'] == (start_event_id + 1):
warmup_runs -= 1
warmup_raw_content_idx += raw_event_size
raw_content_idx = warmup_raw_content_idx
while raw_content_idx < raw_content_size:
parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size])
unfiltered_events.append(parsed_gpu_event)
event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']]
phase = 'B' if event_type.endswith('_ENTRY') else 'E'
gpu_events.append({
'ph': phase,
'ts': cpu_base_time + ((parsed_gpu_event['timestamp'] / gpu_clock_scale) - gpu_base_time),
'pid': rank,
'tid': buf_idx + 1
})
if phase == 'B':
if event_type not in event_type_to_seq:
event_type_to_seq[event_type] = 0
gpu_events[-1].update({
'name': event_type,
'cat': 'GPU',
'args': {
'rank': rank,
'buf_idx': buf_idx,
'seq': event_type_to_seq[event_type],
'rsvd_0': parsed_gpu_event['rsvd'],
'size_0': parsed_gpu_event['size']
}
})
event_type_to_seq[event_type] += 1
unfiltered_events.insert(0, parsed_gpu_event)
if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_CPU':
curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale
curr_gpu_base_time = None
elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU':
if curr_gpu_base_time is None:
curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale
else:
gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']}
current_id = parsed_gpu_event['id']
gpu_events_reverse = unfiltered_events[::-1]
for i in gpu_events_reverse:
if i['id'] == (current_id-1):
event_start_ts = cpu_base_time + ((i['timestamp'] / gpu_clock_scale) - gpu_base_time)
break
delta_time = max(0.001, gpu_events[-1]['ts'] - event_start_ts) # delta needs to take the last begin
bandwidth = gpu_events[-1]['args']['size'] / delta_time / 1e3
if (current_id,parsed_gpu_event['size']) in channel_stats:
temp_size = channel_stats[(current_id,parsed_gpu_event['size'])][1]+1
temp = channel_stats[(current_id,parsed_gpu_event['size'])][0] * (temp_size - 1 )/ (temp_size)
temp_delta = channel_stats[(current_id,parsed_gpu_event['size'])][2] * (temp_size - 1 )/ (temp_size)
channel_stats[(current_id,parsed_gpu_event['size'])][0] = bandwidth / (temp_size) + temp
channel_stats[(current_id,parsed_gpu_event['size'])][1] = temp_size
channel_stats[(current_id,parsed_gpu_event['size'])][2] = delta_time / (temp_size) + temp_delta
if curr_gpu_base_time is None:
curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale
event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']]
phase = 'B' if event_type.endswith('_ENTRY') else 'E'
gpu_events.append({
'ph': phase,
'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time,
'pid': rank,
'tid': buf_idx + 1
})
if phase == 'B':
if event_type not in event_type_to_seq:
event_type_to_seq[event_type] = 0
gpu_events[-1].update({
'name': event_type,
'cat': 'GPU',
'args': {
'rank': rank,
'buf_idx': buf_idx,
'seq': event_type_to_seq[event_type],
'rsvd_0': parsed_gpu_event['rsvd'],
'size_0': parsed_gpu_event['size']
}
})
event_type_to_seq[event_type] += 1
else:
channel_stats[(current_id,parsed_gpu_event['size'])] = [bandwidth, 1, delta_time]
gpu_events[-1]['args']['bw (GB/s)'] = bandwidth
gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']}
current_id = parsed_gpu_event['id']
for i in unfiltered_events:
if i['id'] == (current_id-1):
event_start_ts = curr_cpu_base_time + i['timestamp'] / gpu_clock_scale - curr_gpu_base_time
break
delta_time = max(0.001, gpu_events[-1]['ts'] - event_start_ts) # delta needs to take the last begin
bandwidth = gpu_events[-1]['args']['size'] / delta_time / 1e3
if (current_id,parsed_gpu_event['size']) in channel_stats:
temp_size = channel_stats[(current_id,parsed_gpu_event['size'])][1]+1
temp = channel_stats[(current_id,parsed_gpu_event['size'])][0] * (temp_size - 1 )/ (temp_size)
temp_delta = channel_stats[(current_id,parsed_gpu_event['size'])][2] * (temp_size - 1 )/ (temp_size)
channel_stats[(current_id,parsed_gpu_event['size'])][0] = bandwidth / (temp_size) + temp
channel_stats[(current_id,parsed_gpu_event['size'])][1] = temp_size
channel_stats[(current_id,parsed_gpu_event['size'])][2] = delta_time / (temp_size) + temp_delta
else:
channel_stats[(current_id,parsed_gpu_event['size'])] = [bandwidth, 1, delta_time]
gpu_events[-1]['args']['bw (GB/s)'] = bandwidth
raw_content_idx += raw_event_size
@@ -150,11 +158,9 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo
dictionary_of_stats[stats_key][key][2] = new_avg_time
else:
dictionary_of_stats[stats_key][key] = channel_stats[key]
breakpoint()
return gpu_events
def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale, cpu_time_global, cpu_time_local):
def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale):
cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel))
raw_event_size = 16
cpu_events = []
@@ -179,7 +185,7 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo
phase = 'B' if event_type.endswith('_ENTRY') else 'E'
cpu_events.append({
'ph': phase,
'ts': (cpu_time_global + (parsed_cpu_event['timestamp'] - cpu_time_local)) / cpu_clock_scale,
'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale,
'pid': rank
})
slot = parsed_cpu_event['slot']
@@ -228,6 +234,8 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo
raw_content_idx += raw_event_size
return cpu_events
def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def, gpu_statistics):
files_in_dump_dir = next(os.walk(npkit_dump_dir))[2]
gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')]
@@ -247,17 +255,12 @@ def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def, gpu
gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank)
gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path)
cpu_time_global = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_cpu_global_rank_%d' % rank))
cpu_time_local = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_cpu_local_rank_%d' % rank))
gpu_time_cpu = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_gpu_cpu_rank_%d' % rank))
gpu_time_gpu = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_gpu_gpu_rank_%d' % rank))
for buf_idx in buf_indices:
gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, gpu_time_cpu, gpu_time_gpu, dictionary_of_stats)
gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, dictionary_of_stats)
trace['traceEvents'].extend(gpu_events)
for channel in channels:
cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale, cpu_time_global, cpu_time_local)
cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale)
trace['traceEvents'].extend(cpu_events)
trace['traceEvents'].sort(key=lambda x : x['ts'])