f078db5998
script for parsing json trace, generating heatmap, throughput series, etc.
145 line
4.3 KiB
Python
145 line
4.3 KiB
Python
# Copyright (c) Microsoft Corporation.
|
|
# Modifications Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
|
|
# Licensed under the MIT License.
|
|
|
|
# Having run npkit_trace_generator.py, use functions in this script (via import * from script, for example)
|
|
# to parse and dump data from the raw json trace files. Always run parse() first with correct json file and desired event name
|
|
|
|
import csv
|
|
import json
|
|
import sys
|
|
from operator import itemgetter
|
|
|
|
events = dict()
|
|
|
|
# collect all occurences of a certain event from trace file
|
|
def parse(file, event):
|
|
events.clear()
|
|
|
|
# load json as dict
|
|
f = open(file, 'rb')
|
|
raw_content = f.read()
|
|
json_data = json.loads(raw_content.decode('utf-8'))
|
|
trace = json_data['traceEvents']
|
|
|
|
B = dict()
|
|
|
|
for entry in trace:
|
|
id_pair = (entry['pid'], entry['tid'])
|
|
|
|
if entry['ph'] == 'B':
|
|
if id_pair not in B:
|
|
B[id_pair] = []
|
|
B[id_pair].append(entry) #stack from end
|
|
else:
|
|
b = B[id_pair].pop() #pop from end
|
|
if b['name'] == event:
|
|
dur = entry['ts'] - b['ts']
|
|
#adding to results
|
|
if id_pair not in events:
|
|
events[id_pair] = dict()
|
|
events[id_pair][b['ts']] = (dur, entry['args']['bw (GB/s)'], entry['args']['size'], entry['ts'])
|
|
# channel : {start time: [duration, bw, size, end time], ... : ..., ...}
|
|
|
|
return events
|
|
|
|
def size():
|
|
return len(events)
|
|
|
|
# return top i longest events within a certain (process,thread) pair, where default is all processes/threads
|
|
def longest_events(i, process = None, thread = None):
|
|
if process == None and thread != None:
|
|
raise RuntimeError("makes no sense to compare a thread id across all processes")
|
|
flatten_list = []
|
|
|
|
for id_pair in events:
|
|
if process == None or id_pair[0] == process:
|
|
if thread == None or id_pair[1] == thread:
|
|
for ts in events[id_pair]:
|
|
dur = events[id_pair][ts][0]
|
|
flatten_list.append( (id_pair, ts, dur) )
|
|
|
|
return sorted(flatten_list,key=itemgetter(2))[-i:]
|
|
|
|
# calculate total bandwidth of a channel aggregated through all events
|
|
def aggregate(channel):
|
|
us = 0
|
|
byte = 0
|
|
timeline = events[channel]
|
|
for i in timeline:
|
|
us += timeline[i][0]
|
|
byte += timeline[i][2]
|
|
|
|
return byte/us/1e3
|
|
|
|
# total throughput of all channels on a gpu (process) in every <interval> us
|
|
# tested on proxy channel events (e.g. NPKIT_EVENT_NET_TEST_ENTRY), think twice for other events
|
|
def thruput_series(gpu, interval = 100):
|
|
early = sys.maxsize
|
|
late = 0
|
|
a = events
|
|
|
|
# determine earliest and latest happening events
|
|
for i in a:
|
|
for j in a[i]:
|
|
if j < early:
|
|
early = j
|
|
if j > late:
|
|
late = j
|
|
|
|
# round up for interval length
|
|
late_r = late - (late % -interval)
|
|
early_r = early - (early % interval)
|
|
early = int(early_r)
|
|
late = int(late_r)
|
|
|
|
# aggregate all bytes transferred in a given interval
|
|
series = []
|
|
for ts in range(early,late, interval):
|
|
totalbyte = 0
|
|
for i in a:
|
|
if i[0] == gpu:
|
|
for j in a[i]:
|
|
start = j
|
|
end = a[i][j][3]
|
|
size = a[i][j][2] # total bytes transferred of this event
|
|
duration = a[i][j][0] # total duration of this event
|
|
if start <= ts and end > ts:
|
|
end = min(end, ts+interval)
|
|
# assume constant bw across time for an event, we only add bytes proportional
|
|
# to this event's presence in this interval over its total duration
|
|
totalbyte += (size / 1e6) * ( (end-start) / duration)
|
|
elif start < (ts + interval) and end >= ts: #>= for 0 dur case
|
|
start = max(start, ts)
|
|
end = min(end, ts+interval)
|
|
# sometimes there are 0 time events, probably a bug in npkit or trace generation
|
|
if duration == 0:
|
|
assert end-start == 0
|
|
totalbyte += (size / 1e6)
|
|
continue
|
|
totalbyte += (size / 1e6) * ( (end-start) / duration)
|
|
if totalbyte < 0:
|
|
print(i, j, start, end, ts)
|
|
print(totalbyte, size, ( (end-start) / duration))
|
|
raise RuntimeError("an error with time interval")
|
|
|
|
series.append(totalbyte * 1000 / interval)
|
|
return series
|
|
|
|
# export the bw of all events as csv, used for producing heatmap later
|
|
# only used and tested for CU level events like NPKIT_EVENT_ALL_REDUCE_RING_ENTRY
|
|
def export_csv(name):
|
|
a = events
|
|
matrix = []
|
|
for i in a :
|
|
l = [i[0],i[1]]
|
|
for j in a[i]:
|
|
l.append(a[i][j][1]) #bw
|
|
matrix.append(l)
|
|
|
|
file = open(name, 'w')
|
|
csvwriter = csv.writer(file)
|
|
for i in matrix:
|
|
csvwriter.writerow(i)
|
|
file.close()
|