Wall clock update and npkit trace script Update (#771)
* changed builtin clock to wall_clock64 * updated npkit_Trace_generator to the new version of npkit
Этот коммит содержится в:
@@ -268,75 +268,20 @@ class ncclFunction {
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
|
||||
#define traceColl(launch_type) { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
|
||||
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
|
||||
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
|
||||
collTrace->p2p[0].connIndex = 0; \
|
||||
collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
|
||||
collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
|
||||
collTrace->p2p[0].nWarps = p2pElems[0].nWarps; \
|
||||
collTrace->p2p[0].warpStart = p2pElems[0].warpStart; \
|
||||
collTrace->p2p[0].peer = p2pElems[0].p2pType == ncclWorkP2pTypeRecv ? (uint16_t)(p2pElems[0].peer) : -1; \
|
||||
collTrace->p2p[1].connIndex = 0; \
|
||||
collTrace->p2pOpCount[1] = p2pElems[1].opCount; \
|
||||
collTrace->p2p[1].ngroups = p2pElems[1].ngroups; \
|
||||
collTrace->p2p[1].nWarps = p2pElems[1].nWarps; \
|
||||
collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
|
||||
collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
|
||||
} else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \
|
||||
struct ncclWorkElem *elems = ncclShmem.work.elems; \
|
||||
collTrace->opCount = elems[0].opCount; \
|
||||
collTrace->coll.nWarps = elems[0].nWarps; \
|
||||
collTrace->coll.bid = elems[0].bid; \
|
||||
collTrace->coll.nChannels = elems[0].nChannels; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define traceKernelLaunch(firstLaunch) { \
|
||||
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
|
||||
}
|
||||
#define traceKernelEnd() { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceKernelEndType; \
|
||||
}
|
||||
#define traceAbort() { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceAbortType; \
|
||||
}
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->funcIndex = data2; \
|
||||
collTrace->data_0 = data4; \
|
||||
collTrace->opCount = data8_0; \
|
||||
collTrace->data_1 = data8_1; \
|
||||
collTrace->type = ncclCollTraceDataType; \
|
||||
}
|
||||
#define __trace_hwreg()
|
||||
#else
|
||||
#define __trace_hwreg() \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0));
|
||||
#endif
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
#define traceColl(launch_type) { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
|
||||
__trace_hwreg()\
|
||||
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
|
||||
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
|
||||
collTrace->p2p[0].connIndex = 0; \
|
||||
@@ -361,20 +306,21 @@ class ncclFunction {
|
||||
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
|
||||
} \
|
||||
}
|
||||
// #endif
|
||||
#define traceKernelLaunch(firstLaunch) { \
|
||||
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
|
||||
}
|
||||
#define traceKernelEnd() { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceKernelEndType; \
|
||||
}
|
||||
#define traceAbort() { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceAbortType; \
|
||||
}
|
||||
@@ -382,14 +328,13 @@ class ncclFunction {
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->funcIndex = data2; \
|
||||
collTrace->data_0 = data4; \
|
||||
collTrace->opCount = data8_0; \
|
||||
collTrace->data_1 = data8_1; \
|
||||
collTrace->type = ncclCollTraceDataType; \
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#define traceData(data2, data4, data8_0, data8_1)
|
||||
#endif
|
||||
@@ -433,7 +378,7 @@ __device__ inline void* ncclScratchForWarp(int warp) {
|
||||
#define __insert_timestamp(line_num) do { \
|
||||
if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \
|
||||
ncclShmem.prof.elem[ncclShmem.prof.count].line = line_num; \
|
||||
ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = wall_clock64(); \
|
||||
ncclShmem.prof.count++; \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
@@ -14,11 +14,7 @@
|
||||
#include "npkit/npkit_event.h"
|
||||
#include "npkit/npkit_struct.h"
|
||||
|
||||
#if defined(__GFX9__) || defined(__GFX10__)
|
||||
#define NPKIT_GET_GPU_TIMESTAMP __builtin_amdgcn_s_memrealtime
|
||||
#else
|
||||
#define NPKIT_GET_GPU_TIMESTAMP clock64
|
||||
#endif
|
||||
#define NPKIT_GET_GPU_TIMESTAMP wall_clock64
|
||||
|
||||
class NpKit {
|
||||
public:
|
||||
|
||||
@@ -85,7 +85,7 @@ __device__
|
||||
inline __attribute((always_inline))
|
||||
long long int __rtc64() {
|
||||
#if __HIP__
|
||||
return (long long int) __builtin_amdgcn_s_memrealtime();
|
||||
return (long long int) wall_clock64();
|
||||
#else
|
||||
return (long long int) __clock_u64();
|
||||
#endif
|
||||
|
||||
@@ -117,7 +117,7 @@ __global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct p
|
||||
__syncthreads();
|
||||
|
||||
if (tid == 0)
|
||||
curr_time = __builtin_amdgcn_s_memrealtime();
|
||||
curr_time = wall_clock64();
|
||||
|
||||
if (op == OP_COPY) {
|
||||
srcs[0] = transfer_data->src0[bid];
|
||||
|
||||
@@ -39,11 +39,6 @@ def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path):
|
||||
den = float(f.read())
|
||||
return den / num / 1e6
|
||||
|
||||
def parse_clock_calibration_info(clock_calibration_file_path):
|
||||
with open(clock_calibration_file_path, 'r') as f:
|
||||
num = float(f.read())
|
||||
return num
|
||||
|
||||
def parse_gpu_event(event_bytes):
|
||||
return {
|
||||
'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False),
|
||||
@@ -60,13 +55,13 @@ def parse_cpu_event(event_bytes):
|
||||
'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False)
|
||||
}
|
||||
|
||||
def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, gpu_time_cpu, gpu_time_gpu, dictionary_of_stats, warmup_runs=5):
|
||||
def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, dictionary_of_stats, warmup_runs=5):
|
||||
gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx))
|
||||
stats_key = 'gpu_rank_%d' % (rank)
|
||||
channel_stats = {}
|
||||
raw_event_size = 16
|
||||
cpu_base_time = gpu_time_cpu / cpu_clock_scale
|
||||
gpu_base_time = gpu_time_gpu / gpu_clock_scale
|
||||
curr_cpu_base_time = None
|
||||
curr_gpu_base_time = None
|
||||
gpu_events = []
|
||||
event_type_to_seq = {}
|
||||
unfiltered_events = []
|
||||
@@ -78,64 +73,77 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo
|
||||
if raw_content_size > 0 and stats_key not in dictionary_of_stats:
|
||||
dictionary_of_stats[stats_key] = {}
|
||||
warmup_raw_content_idx = 0
|
||||
parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size])
|
||||
unfiltered_events.append(parsed_gpu_event)
|
||||
start_event_id = parsed_gpu_event['id'] # start event id
|
||||
while warmup_runs != 0 and warmup_raw_content_idx < raw_content_size: #warmup run cleanup
|
||||
warmup_raw_content_idx += raw_event_size
|
||||
parsed_gpu_event = parse_gpu_event(raw_content[warmup_raw_content_idx : warmup_raw_content_idx + raw_event_size])
|
||||
unfiltered_events.append(parsed_gpu_event)
|
||||
unfiltered_events.insert(0, parsed_gpu_event)
|
||||
if start_event_id == 0:
|
||||
decoded_id = npkit_event_def['id_to_type'][parsed_gpu_event['id']]
|
||||
if decoded_id == 'NPKIT_EVENT_TIME_SYNC_CPU' or decoded_id == 'NPKIT_EVENT_TIME_SYNC_GPU':
|
||||
warmup_raw_content_idx += raw_event_size
|
||||
continue
|
||||
else:
|
||||
start_event_id = parsed_gpu_event['id']
|
||||
|
||||
warmup_raw_content_idx += raw_event_size
|
||||
if parsed_gpu_event['id'] == (start_event_id + 1):
|
||||
warmup_runs -= 1
|
||||
warmup_raw_content_idx += raw_event_size
|
||||
raw_content_idx = warmup_raw_content_idx
|
||||
|
||||
while raw_content_idx < raw_content_size:
|
||||
parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size])
|
||||
unfiltered_events.append(parsed_gpu_event)
|
||||
event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']]
|
||||
phase = 'B' if event_type.endswith('_ENTRY') else 'E'
|
||||
gpu_events.append({
|
||||
'ph': phase,
|
||||
'ts': cpu_base_time + ((parsed_gpu_event['timestamp'] / gpu_clock_scale) - gpu_base_time),
|
||||
'pid': rank,
|
||||
'tid': buf_idx + 1
|
||||
})
|
||||
if phase == 'B':
|
||||
if event_type not in event_type_to_seq:
|
||||
event_type_to_seq[event_type] = 0
|
||||
gpu_events[-1].update({
|
||||
'name': event_type,
|
||||
'cat': 'GPU',
|
||||
'args': {
|
||||
'rank': rank,
|
||||
'buf_idx': buf_idx,
|
||||
'seq': event_type_to_seq[event_type],
|
||||
'rsvd_0': parsed_gpu_event['rsvd'],
|
||||
'size_0': parsed_gpu_event['size']
|
||||
}
|
||||
})
|
||||
event_type_to_seq[event_type] += 1
|
||||
unfiltered_events.insert(0, parsed_gpu_event)
|
||||
if npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_CPU':
|
||||
curr_cpu_base_time = parsed_gpu_event['timestamp'] / cpu_clock_scale
|
||||
curr_gpu_base_time = None
|
||||
elif npkit_event_def['id_to_type'][parsed_gpu_event['id']] == 'NPKIT_EVENT_TIME_SYNC_GPU':
|
||||
if curr_gpu_base_time is None:
|
||||
curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale
|
||||
else:
|
||||
gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']}
|
||||
current_id = parsed_gpu_event['id']
|
||||
gpu_events_reverse = unfiltered_events[::-1]
|
||||
for i in gpu_events_reverse:
|
||||
if i['id'] == (current_id-1):
|
||||
event_start_ts = cpu_base_time + ((i['timestamp'] / gpu_clock_scale) - gpu_base_time)
|
||||
break
|
||||
delta_time = max(0.001, gpu_events[-1]['ts'] - event_start_ts) # delta needs to take the last begin
|
||||
bandwidth = gpu_events[-1]['args']['size'] / delta_time / 1e3
|
||||
if (current_id,parsed_gpu_event['size']) in channel_stats:
|
||||
temp_size = channel_stats[(current_id,parsed_gpu_event['size'])][1]+1
|
||||
temp = channel_stats[(current_id,parsed_gpu_event['size'])][0] * (temp_size - 1 )/ (temp_size)
|
||||
temp_delta = channel_stats[(current_id,parsed_gpu_event['size'])][2] * (temp_size - 1 )/ (temp_size)
|
||||
channel_stats[(current_id,parsed_gpu_event['size'])][0] = bandwidth / (temp_size) + temp
|
||||
channel_stats[(current_id,parsed_gpu_event['size'])][1] = temp_size
|
||||
channel_stats[(current_id,parsed_gpu_event['size'])][2] = delta_time / (temp_size) + temp_delta
|
||||
if curr_gpu_base_time is None:
|
||||
curr_gpu_base_time = parsed_gpu_event['timestamp'] / gpu_clock_scale
|
||||
event_type = npkit_event_def['id_to_type'][parsed_gpu_event['id']]
|
||||
phase = 'B' if event_type.endswith('_ENTRY') else 'E'
|
||||
gpu_events.append({
|
||||
'ph': phase,
|
||||
'ts': curr_cpu_base_time + parsed_gpu_event['timestamp'] / gpu_clock_scale - curr_gpu_base_time,
|
||||
'pid': rank,
|
||||
'tid': buf_idx + 1
|
||||
|
||||
})
|
||||
if phase == 'B':
|
||||
if event_type not in event_type_to_seq:
|
||||
event_type_to_seq[event_type] = 0
|
||||
gpu_events[-1].update({
|
||||
'name': event_type,
|
||||
'cat': 'GPU',
|
||||
'args': {
|
||||
'rank': rank,
|
||||
'buf_idx': buf_idx,
|
||||
'seq': event_type_to_seq[event_type],
|
||||
'rsvd_0': parsed_gpu_event['rsvd'],
|
||||
'size_0': parsed_gpu_event['size']
|
||||
}
|
||||
})
|
||||
event_type_to_seq[event_type] += 1
|
||||
else:
|
||||
channel_stats[(current_id,parsed_gpu_event['size'])] = [bandwidth, 1, delta_time]
|
||||
gpu_events[-1]['args']['bw (GB/s)'] = bandwidth
|
||||
gpu_events[-1]['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']}
|
||||
current_id = parsed_gpu_event['id']
|
||||
|
||||
for i in unfiltered_events:
|
||||
if i['id'] == (current_id-1):
|
||||
event_start_ts = curr_cpu_base_time + i['timestamp'] / gpu_clock_scale - curr_gpu_base_time
|
||||
break
|
||||
delta_time = max(0.001, gpu_events[-1]['ts'] - event_start_ts) # delta needs to take the last begin
|
||||
bandwidth = gpu_events[-1]['args']['size'] / delta_time / 1e3
|
||||
if (current_id,parsed_gpu_event['size']) in channel_stats:
|
||||
temp_size = channel_stats[(current_id,parsed_gpu_event['size'])][1]+1
|
||||
temp = channel_stats[(current_id,parsed_gpu_event['size'])][0] * (temp_size - 1 )/ (temp_size)
|
||||
temp_delta = channel_stats[(current_id,parsed_gpu_event['size'])][2] * (temp_size - 1 )/ (temp_size)
|
||||
channel_stats[(current_id,parsed_gpu_event['size'])][0] = bandwidth / (temp_size) + temp
|
||||
channel_stats[(current_id,parsed_gpu_event['size'])][1] = temp_size
|
||||
channel_stats[(current_id,parsed_gpu_event['size'])][2] = delta_time / (temp_size) + temp_delta
|
||||
else:
|
||||
channel_stats[(current_id,parsed_gpu_event['size'])] = [bandwidth, 1, delta_time]
|
||||
gpu_events[-1]['args']['bw (GB/s)'] = bandwidth
|
||||
|
||||
raw_content_idx += raw_event_size
|
||||
|
||||
@@ -150,11 +158,9 @@ def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clo
|
||||
dictionary_of_stats[stats_key][key][2] = new_avg_time
|
||||
else:
|
||||
dictionary_of_stats[stats_key][key] = channel_stats[key]
|
||||
|
||||
breakpoint()
|
||||
return gpu_events
|
||||
|
||||
def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale, cpu_time_global, cpu_time_local):
|
||||
def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale):
|
||||
cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel))
|
||||
raw_event_size = 16
|
||||
cpu_events = []
|
||||
@@ -179,7 +185,7 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo
|
||||
phase = 'B' if event_type.endswith('_ENTRY') else 'E'
|
||||
cpu_events.append({
|
||||
'ph': phase,
|
||||
'ts': (cpu_time_global + (parsed_cpu_event['timestamp'] - cpu_time_local)) / cpu_clock_scale,
|
||||
'ts': parsed_cpu_event['timestamp'] / cpu_clock_scale,
|
||||
'pid': rank
|
||||
})
|
||||
slot = parsed_cpu_event['slot']
|
||||
@@ -228,6 +234,8 @@ def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clo
|
||||
raw_content_idx += raw_event_size
|
||||
return cpu_events
|
||||
|
||||
|
||||
|
||||
def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def, gpu_statistics):
|
||||
files_in_dump_dir = next(os.walk(npkit_dump_dir))[2]
|
||||
gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')]
|
||||
@@ -247,17 +255,12 @@ def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def, gpu
|
||||
gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank)
|
||||
gpu_clock_scale = parse_gpu_clock_scale(gpu_clock_file_path)
|
||||
|
||||
cpu_time_global = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_cpu_global_rank_%d' % rank))
|
||||
cpu_time_local = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_cpu_local_rank_%d' % rank))
|
||||
gpu_time_cpu = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_gpu_cpu_rank_%d' % rank))
|
||||
gpu_time_gpu = parse_clock_calibration_info(os.path.join(npkit_dump_dir, 'clock_calibration_gpu_gpu_rank_%d' % rank))
|
||||
|
||||
for buf_idx in buf_indices:
|
||||
gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, gpu_time_cpu, gpu_time_gpu, dictionary_of_stats)
|
||||
gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale, dictionary_of_stats)
|
||||
trace['traceEvents'].extend(gpu_events)
|
||||
|
||||
for channel in channels:
|
||||
cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale, cpu_time_global, cpu_time_local)
|
||||
cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale)
|
||||
trace['traceEvents'].extend(cpu_events)
|
||||
|
||||
trace['traceEvents'].sort(key=lambda x : x['ts'])
|
||||
|
||||
Ссылка в новой задаче
Block a user