SWDEV-402866: Added browser file mode. Fixed caching. Changed ATT buffer size. Added MAX_ATT environment variable. Updated README/Changelog.

Change-Id: I36a6093bb85bf4ef179b59df676fc2e4cbdb5288
This commit is contained in:
Giovanni LB
2023-06-01 21:45:16 -03:00
committed by Giovanni Baraldi
szülő 44e787957b
commit 59ee21f7d1
9 fájl változott, egészen pontosan 106 új sor hozzáadva és 26 régi sor törölve
+18 -2
Fájl megtekintése
@@ -209,18 +209,26 @@ The resulting `a.out` will depend on
### Optimized
- Improved Test Suite
### Changed
- ATT analysis will not run by default. For ATT to have the same behaviour as 5.5, use --plugin att <as.s> --mode network
### Added
- 'end_time' needs to be disabled in roctx_trace.txt
- support for hsa_amd_memory_async_copy_on_engine API function trace
### Fixed
- rocprof in ROCm/5.4.0 gpu selector broken.
- rocprof in ROCm/5.4.1 fails to generate kernel info.
- rocprof clobbers LD_PRELOAD.
## ROCprofiler for rocm 5.7.0
### Navi support
Rocprofiler for ROCm 5.7 added support for counter collection (PMC) and advanced thread tracing (ATT) for Navi21 and Navi31 GPUs.
- On Navi, especially Navi31, counter collection requires the GPU to be in a stable power state. See README.md for instructions.
- Navi does not support streaming SQ counters and ATT at the same time, unlike GFX9.
- On Navi ATT, "att: target_cu" indexes the WGP and the SIMD_MASK parameter is actually the SIMD_ID, in the range [0,3].
- HIP RT in ATT not yet supported.
### Changed
- ATT analysis will not run by default. For ATT to have the same behaviour as 5.5, use --plugin att <as.s> --mode network
### Optimized
- ATT json filesizes
### Added
- Every API trace in V2 reported synchronously will have two records, one for the Enter phase and one for the Exit phase
- File Plugin now reports the HSA OPS operation kind as part of the output text
@@ -230,9 +238,17 @@ The resulting `a.out` will depend on
- MI300 individual XCC counters dumped per-xcc as separate records but with same record-id and kernel dispatch info
- Naming for MPI ranks. Filenames containing "%rank" are replaced by variables "MPI_RANK", "OMPI_COMM_WORLD_RANK" or "MV2_COMM_WORLD_RANK".
- MPI Rank will appear in perfetto track names.
- SE_MASK parameter in ATT, a binary mask specifying for which shader engines to run ATT.
On GFX9, SEs are masked out completely. On Navi only part of the data is masked.
Using SE_MASK=0x1 is strongly encouraged to avoid packet loss events.
- "--mode file" option in ATT, which allows for parsed files to be stored. Run python3 httpserver.py from within ./ui/ to view files locally.
- "ROCPROFILER_MAX_ATT_PROFILES" environment variable can be set. Previously fixed at 16, now the default is 1.
- Increased ATT buffer size per collection to 1GB.
### Fixed
- Samples are fixed to show the new usage of phases.
- Plugin option validates the plugin names.
- Fixing rocsys, for rocsys options, rocsys -h can be called
- "--output-file" option ignored when no output folder was specified.
- Perfetto crash when using ROCTX and/or no output file specified.
- Parsing of the getpc, setpc and swappc instructions with registers loaded from scratch space.
- Some browsers caching ATT data from older kernels.
+4
Fájl megtekintése
@@ -381,3 +381,7 @@ samples can be run as independent executables once installed
Please report in the Github Issues
## Limitations
- Navi requires a stable power state for counter collection. Currently this state needs to be set by the user.
To do so, set "power_dpm_force_performance_level" to be writeable for non-root users with chmod, then:
echo profile_standard >> /sys/class/drm/card0/device/power_dpm_force_performance_level
Recommended: "auto" or "high" for ATT and "profile_standard" for PMC. Use rocm-smi to verify the current power state.
+1 -2
Fájl megtekintése
@@ -57,11 +57,10 @@ install(TARGETS att_plugin
configure_file(att.py att/att.py COPYONLY)
configure_file(trace_view.py att/trace_view.py COPYONLY)
# configure_file(t.db att/t.db COPYONLY)
configure_file(ui/index.html att/ui/index.html COPYONLY)
configure_file(ui/logo.svg att/ui/logo.svg COPYONLY)
configure_file(ui/styles.css att/ui/styles.css COPYONLY)
# configure_file(ui/trace.json att/ui/trace.json COPYONLY)
configure_file(ui/httpserver.py att/ui/httpserver.py COPYONLY)
install(
DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/att
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/rocprofiler
+8 -10
Fájl megtekintése
@@ -19,7 +19,7 @@ from io import BytesIO
class FileBytesIO:
def __init__(self, iobytes):
self.iobytes = iobytes
self.iobytes = deepcopy(iobytes)
self.seek = 0
def __len__(self):
@@ -27,9 +27,9 @@ class FileBytesIO:
def read(self, length=0):
if length<=0:
return bytes(self.getbuffer())
return bytes(self.iobytes.getbuffer())
else:
if self.seek >= len(self):
if self.seek >= self.iobytes.getbuffer().nbytes:
self.seek = 0
return None
response = self.iobytes.getbuffer()[self.seek:self.seek+length]
@@ -382,14 +382,14 @@ def draw_wave_states(selections, normalize):
if normalize:
timelines = np.array(timelines) / np.maximum(np.sum(timelines,0)*1E-2,1E-7)
kernsize = maxtime//150+1
trim = max(maxtime//5000,1)
cycles = np.arange(timelines[0].size)[::trim]
cycles = np.arange(0, timelines[0].size//trim, 1)*trim
timelines = [time[:trim*(time.size//trim)].reshape((-1, trim)).mean(-1) if len(time) > 0 else cycles*0 for time in timelines]
kernsize = 21
kernel = np.asarray([np.exp(-abs(10*k/kernsize)) for k in range(-kernsize//2,kernsize//2+1)])
kernel /= np.sum(kernel)
timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2][::trim] if len(time) > 0 else cycles*0 for time in timelines]
timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2] for time in timelines]
[plt.plot(cycles, t, label='State '+s, linewidth=1.1, color=c)
for t, s, c, sel in zip(timelines, STATES, colors, selections) if sel]
@@ -456,8 +456,6 @@ if __name__ == "__main__":
for line in lines:
if 'PERFCOUNTER=' in line:
EVENT_NAMES += [clean(line).split('SQ_')[1].lower()]
if len(EVENT_NAMES) == 0:
EVENT_NAMES = ['SPI', 'Vdata', 'Sdata', 'LDS']
if args.target_cu is None:
args.target_cu = 1
@@ -546,7 +544,7 @@ if __name__ == "__main__":
tuples3 = [(0,df['begin_time'][T]-min_event_time)]+[(int(t[0]),int(t[1])) for t in tuples2]
for state in tuples3:
if state[1] > 50E6:
if state[1] > 1E8:
print('Warning: Time limit reached for ',state[0], state[1])
break
if time_acc+state[1] > TIMELINES[state[0]].size:
+3 -6
Fájl megtekintése
@@ -798,10 +798,7 @@ def view_trace(args, code, jumps, dbnames, att_filenames, bReturnLoc, pic_callba
print("Exitting.")
else:
os.makedirs('ui', exist_ok=True)
os.system('cp ' + os.path.join(os.path.abspath(os.path.dirname(__file__)),'ui') + '/* ui/' )
for k, v in JSON_GLOBAL_DICTIONARY.items():
if '.json' in k:
try:
with open(os.path.join('ui',k), 'w') as f:
f.write(v.read())
except:
pass
with open(os.path.join('ui',k), 'w' if '.json' in k else 'wb') as f:
f.write(v.read())
+64
Fájl megtekintése
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
import sys
if sys.version_info[0] < 3:
raise Exception("Must be using Python 3")
import http.server
import socketserver
import socket
import os
import sys
class NoCacheHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
    """Static-file request handler that adds cache-disabling headers to every response."""

    def end_headers(self):
        """Append the no-cache headers, then let the base class finish the header block."""
        self.send_my_headers()
        super().end_headers()

    def send_my_headers(self):
        """Emit the three headers that instruct clients not to cache the response."""
        no_cache_headers = (
            ("Cache-Control", "no-cache, no-store, must-revalidate"),
            ("Pragma", "no-cache"),
            ("Expires", "0"),
        )
        for header_name, header_value in no_cache_headers:
            self.send_header(header_name, header_value)

    def do_GET(self):
        """Serve a GET request, mapping any 'timeline.png?...' request to 'timeline.png'."""
        # A timeline image requested with a query string is served from the bare file name.
        if self.path.find('timeline.png?') != -1:
            self.path = 'timeline.png'
        super().do_GET()
class RocTCPServer(socketserver.TCPServer):
    """TCP server whose listening socket is bound with SO_REUSEADDR enabled.

    The option is requested through the standard ``allow_reuse_address``
    class attribute instead of a hand-written ``server_bind`` override. The
    inherited ``server_bind`` sets SO_REUSEADDR, binds the socket and then
    refreshes ``self.server_address`` from the bound socket, so the reported
    address is correct even when port 0 (pick any free port) is requested.
    """

    # Honoured by socketserver.TCPServer.server_bind before it calls bind().
    allow_reuse_address = True
def run_server():
    """Serve this script's directory over HTTP until interrupted.

    Changes the working directory to the directory containing this file,
    then serves on (IPAddr, PORT) using NoCacheHTTPRequestHandler. A
    KeyboardInterrupt raised while serving is swallowed, so the function
    returns normally.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(script_dir, '.'))
    try:
        httpd = RocTCPServer((IPAddr, PORT), NoCacheHTTPRequestHandler)
        with httpd:
            httpd.serve_forever()
    except KeyboardInterrupt:
        pass
def get_ip():
    """Return the IPv4 address the server should bind to.

    Resolves the local hostname to an address and connects a datagram socket
    to it (presumably as a check that the address is usable -- confirm with
    the author). If any step fails, the loopback address '127.0.0.1' is
    returned instead.

    Returns:
        str: the resolved host address, or '127.0.0.1' on any failure.
    """
    try:
        IPAddr = socket.gethostbyname(socket.gethostname())
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.settimeout(0)
            # The host must be a string; the previous set literal ({IPAddr})
            # always raised TypeError and forced the loopback fallback.
            s.connect((IPAddr, 1))
        finally:
            # Release the probe socket whether or not the connect succeeded.
            s.close()
    except Exception:
        # Deliberate best-effort: any resolution/connect problem means loopback.
        return '127.0.0.1'
    return IPAddr
# Address and port used by run_server(); the port may be overridden by the
# first command-line argument.
IPAddr = get_ip()
PORT = int(sys.argv[1]) if len(sys.argv) > 1 else 8000
print('serving at port: {0}'.format(PORT))
try:
    run_server()
except KeyboardInterrupt:
    print("Exitting.")
+4 -4
Fájl megtekintése
@@ -206,7 +206,7 @@
var current_WV = 0
var filename_data = {}
fetch("filenames.json").then(response => response.json()).then(data => {
fetch("filenames.json", {cache: "no-store"}).then(response => response.json()).then(data => {
filename_data = data.wave_filenames
wave_cu_index = {};
@@ -272,7 +272,7 @@
document.getElementById("minimap").innerHTML = HTML_MINI
document.getElementById("Images").innerHTML = HTML_IMAG
fetch("counters.json").then(response => response.json()).then(data => {
fetch("counters.json", {cache: "no-store"}).then(response => response.json()).then(data => {
var html_gh = '<input type="checkbox" id="btn_norm" onclick="UpdImageSrc()" checked=true>Normalize\t'
for(var key in data.counters) {
console.log(key, data.counters[key])
@@ -294,14 +294,14 @@
//document.getElementById('what').innerHTML = ""
d3.select('nav').style('visibility', 'hidden')
fetch(file_to_gather)
fetch(file_to_gather, {cache: "no-store"})
.then(response => response.json())
.then(data => {
code_data_file = file_to_gather.split('_sm')[0]+'_code.json'
console.log("Requestd:", file_to_gather)
console.log("Request code:", code_data_file)
fetch(code_data_file)
fetch(code_data_file, {cache: "no-store"})
.then(response => response.json())
.then(code_data => {
const SP = '\u00A0'
@@ -468,7 +468,7 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
}
// ATT
uint32_t g_output_buffer_size = 0x8000000; // 128M x 16 = 2GB
uint32_t g_output_buffer_size = 0x40000000; // 1GB
bool g_output_buffer_local = true;
// Allocate system memory accessible by both CPU and GPU
+3 -1
Fájl megtekintése
@@ -50,7 +50,6 @@
} while (0)
#define __NR_gettid 186
#define MAX_ATT_PROFILES 16
std::mutex sessions_pending_signal_lock;
@@ -664,6 +663,9 @@ std::atomic<uint32_t> WRITER_ID{0};
*/
void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt_index, void* data,
hsa_amd_queue_intercept_packet_writer writer) {
static const char* env_MAX_ATT_PROFILES = getenv("ROCPROFILER_MAX_ATT_PROFILES");
static int MAX_ATT_PROFILES = env_MAX_ATT_PROFILES ? atoi(env_MAX_ATT_PROFILES) : 1;
const Packet::packet_t* packets_arr = reinterpret_cast<const Packet::packet_t*>(packets);
std::vector<Packet::packet_t> transformed_packets;