From 59ee21f7d1503bbfd57a92bbdcec830b245856f7 Mon Sep 17 00:00:00 2001 From: Giovanni LB Date: Thu, 1 Jun 2023 21:45:16 -0300 Subject: [PATCH] SWDEV-402866: Added browser file mode. Fixed caching. Changed ATT buffer size. Added MAX_ATT environment variable. Updated README/Changelog. Change-Id: I36a6093bb85bf4ef179b59df676fc2e4cbdb5288 --- CHANGELOG.md | 20 ++++++- README.md | 4 ++ plugin/att/CMakeLists.txt | 3 +- plugin/att/att.py | 18 +++--- plugin/att/trace_view.py | 9 +-- plugin/att/ui/httpserver.py | 64 ++++++++++++++++++++++ plugin/att/ui/index.html | 8 +-- src/core/hsa/packets/packets_generator.cpp | 2 +- src/core/hsa/queues/queue.cpp | 4 +- 9 files changed, 106 insertions(+), 26 deletions(-) create mode 100644 plugin/att/ui/httpserver.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 065f56210a..f0552d4417 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -209,18 +209,26 @@ The resulting `a.out` will depend on ### Optimized - Improved Test Suite -### Changed -- ATT analysis will not run by default. For ATT to have the same behaviour as 5.5, use --plugin att --mode network ### Added - 'end_time' need to be disabled in roctx_trace.txt - support for hsa_amd_memory_async_copy_on_engine API function trace + ### Fixed - rocprof in ROcm/5.4.0 gpu selector broken. - rocprof in ROCm/5.4.1 fails to generate kernel info. - rocprof clobbers LD_PRELOAD. ## ROCprofiler for rocm 5.7.0 +### Navi support +Rocprofiler for ROCm 5.7 added support for counter collection (PMC) and advanced thread tracing (ATT) for Navi21 and Navi31 GPUs. +- On Navi, especially Navi31, counter collection requires the GPU to be in a stable power state. See README.md for instructions. +- Navi does not support streaming SQ counters and ATT at the same time, unlike GFX9. +- On Navi ATT, "att: target_cu" indexes the WGP and the SIMD_MASK parameter is actually the SIMD_ID, in the range [0,3]. +- HIP RT in ATT not yet supported. +### Changed +- ATT analysis will not run by default. 
For ATT to have the same behaviour as 5.5, use --plugin att --mode network ### Optimized +- ATT JSON file sizes ### Added - Every API trace in V2 reported synchronously will have two records, one for Enter phase and for Exit phase - File Plugin now reports the HSA OPS operation kind as part of the output text @@ -230,9 +238,17 @@ The resulting `a.out` will depend on - MI300 individual XCC counters dumped per-xcc as separate records but with same record-id and kernel dispatch info - Naming for MPI ranks. Filenames containing "%rank" are replaced by variables "MPI_RANK", "OMPI_COMM_WORLD_RANK" or "MV2_COMM_WORLD_RANK". - MPI Rank will appear in perfetto track names. +- SE_MASK parameter in ATT, a binary mask specifying for which shader engines to run ATT. + On GFX9, SEs are masked out completely. On Navi only part of the data is masked. + The use of SE_MASK=0x1 is heavily encouraged to avoid packet loss events. +- "--mode file" option in ATT, which allows for parsed files to be stored. Run python3 httpserver.py from within ./ui/ to view files locally. +- "ROCPROFILER_MAX_ATT_PROFILES" environment variable can be set. Previously fixed at 16, now the default is 1. +- Increased ATT buffer size per collection to 1GB. ### Fixed - Samples are fixed to show the new usage of phases. - Plugin option validates the plugin names. - Fixing rocsys, for rocsys options, rocsys -h can be called - "--output-file" option ignored when no output folder was specified. - Perfetto crash when using ROCTX and/or no output file specified. +- Parsing of the getpc, setpc and swappc instructions with registers loaded from scratch space. +- Some browsers caching ATT data from older kernels. diff --git a/README.md b/README.md index ac4ddeb4b3..76895463c8 100644 --- a/README.md +++ b/README.md @@ -381,3 +381,7 @@ samples can be run as independent executables once installed Please report in the Github Issues ## Limitations +- Navi requires a stable power state for counter collection. 
Currently this state needs to be set by the user. + To do so, set "power_dpm_force_performance_level" to be writeable for non-root users with chmod, then: + echo profile_standard >> /sys/class/drm/card0/device/power_dpm_force_performance_level + Recommended: "auto" or "high" for ATT and "profile_standard" for PMC. Use rocm-smi to verify the current power state. diff --git a/plugin/att/CMakeLists.txt b/plugin/att/CMakeLists.txt index 5ae4c86c8d..199b2b474b 100644 --- a/plugin/att/CMakeLists.txt +++ b/plugin/att/CMakeLists.txt @@ -57,11 +57,10 @@ install(TARGETS att_plugin configure_file(att.py att/att.py COPYONLY) configure_file(trace_view.py att/trace_view.py COPYONLY) -# configure_file(t.db att/t.db COPYONLY) configure_file(ui/index.html att/ui/index.html COPYONLY) configure_file(ui/logo.svg att/ui/logo.svg COPYONLY) configure_file(ui/styles.css att/ui/styles.css COPYONLY) -# configure_file(ui/trace.json att/ui/trace.json COPYONLY) +configure_file(ui/httpserver.py att/ui/httpserver.py COPYONLY) install( DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/att DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/rocprofiler diff --git a/plugin/att/att.py b/plugin/att/att.py index 901dd83d41..f5e7a0a36e 100755 --- a/plugin/att/att.py +++ b/plugin/att/att.py @@ -19,7 +19,7 @@ from io import BytesIO class FileBytesIO: def __init__(self, iobytes): - self.iobytes = iobytes + self.iobytes = deepcopy(iobytes) self.seek = 0 def __len__(self): @@ -27,9 +27,9 @@ class FileBytesIO: def read(self, length=0): if length<=0: - return bytes(self.getbuffer()) + return bytes(self.iobytes.getbuffer()) else: - if self.seek >= len(self): + if self.seek >= self.iobytes.getbuffer().nbytes: self.seek = 0 return None response = self.iobytes.getbuffer()[self.seek:self.seek+length] @@ -382,14 +382,14 @@ def draw_wave_states(selections, normalize): if normalize: timelines = np.array(timelines) / np.maximum(np.sum(timelines,0)*1E-2,1E-7) - kernsize = maxtime//150+1 trim = max(maxtime//5000,1) - cycles = 
np.arange(timelines[0].size)[::trim] - + cycles = np.arange(0, timelines[0].size//trim, 1)*trim + timelines = [time[:trim*(time.size//trim)].reshape((-1, trim)).mean(-1) if len(time) > 0 else cycles*0 for time in timelines] + kernsize = 21 kernel = np.asarray([np.exp(-abs(10*k/kernsize)) for k in range(-kernsize//2,kernsize//2+1)]) kernel /= np.sum(kernel) - timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2][::trim] if len(time) > 0 else cycles*0 for time in timelines] + timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2] for time in timelines] [plt.plot(cycles, t, label='State '+s, linewidth=1.1, color=c) for t, s, c, sel in zip(timelines, STATES, colors, selections) if sel] @@ -456,8 +456,6 @@ if __name__ == "__main__": for line in lines: if 'PERFCOUNTER=' in line: EVENT_NAMES += [clean(line).split('SQ_')[1].lower()] - if len(EVENT_NAMES) == 0: - EVENT_NAMES = ['SPI', 'Vdata', 'Sdata', 'LDS'] if args.target_cu is None: args.target_cu = 1 @@ -546,7 +544,7 @@ if __name__ == "__main__": tuples3 = [(0,df['begin_time'][T]-min_event_time)]+[(int(t[0]),int(t[1])) for t in tuples2] for state in tuples3: - if state[1] > 50E6: + if state[1] > 1E8: print('Warning: Time limit reached for ',state[0], state[1]) break if time_acc+state[1] > TIMELINES[state[0]].size: diff --git a/plugin/att/trace_view.py b/plugin/att/trace_view.py index fc1fd751a4..91277f930c 100755 --- a/plugin/att/trace_view.py +++ b/plugin/att/trace_view.py @@ -798,10 +798,7 @@ def view_trace(args, code, jumps, dbnames, att_filenames, bReturnLoc, pic_callba print("Exitting.") else: os.makedirs('ui', exist_ok=True) + os.system('cp ' + os.path.join(os.path.abspath(os.path.dirname(__file__)),'ui') + '/* ui/' ) for k, v in JSON_GLOBAL_DICTIONARY.items(): - if '.json' in k: - try: - with open(os.path.join('ui',k), 'w') as f: - f.write(v.read()) - except: - pass + with open(os.path.join('ui',k), 'w' if '.json' in k else 'wb') as f: + f.write(v.read()) diff --git 
a/plugin/att/ui/httpserver.py b/plugin/att/ui/httpserver.py new file mode 100644 index 0000000000..b8a821b4cc --- /dev/null +++ b/plugin/att/ui/httpserver.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import sys +if sys.version_info[0] < 3: + raise Exception("Must be using Python 3") + +import http.server +import socketserver +import socket +import os +import sys + +class NoCacheHTTPRequestHandler(http.server.SimpleHTTPRequestHandler): + def end_headers(self): + self.send_my_headers() + http.server.SimpleHTTPRequestHandler.end_headers(self) + + def send_my_headers(self): + self.send_header("Cache-Control", "no-cache, no-store, must-revalidate") + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + + def do_GET(self): + if 'timeline.png?' in self.path: + self.path = 'timeline.png' + + http.server.SimpleHTTPRequestHandler.do_GET(self) + +class RocTCPServer(socketserver.TCPServer): + def server_bind(self): + self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.socket.bind(self.server_address) + +def run_server(): + Handler = NoCacheHTTPRequestHandler + os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),'.')) + try: + with RocTCPServer((IPAddr, PORT), Handler) as httpd: + httpd.serve_forever() + except KeyboardInterrupt: + pass + +def get_ip(): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.settimeout(0) + try: + hostname = socket.gethostname() + IPAddr = socket.gethostbyname(hostname) + s.connect(({IPAddr}, 1)) + except Exception: + IPAddr = '127.0.0.1' + finally: + return IPAddr + +IPAddr = get_ip() +PORT = 8000 + +if len(sys.argv) > 1: + PORT = int(sys.argv[1]) +print('serving at port: {0}'.format(PORT)) + +try: + run_server() +except KeyboardInterrupt: + print("Exitting.") diff --git a/plugin/att/ui/index.html b/plugin/att/ui/index.html index f3fb714d1e..5faf62919e 100644 --- a/plugin/att/ui/index.html +++ b/plugin/att/ui/index.html @@ -206,7 +206,7 @@ var current_WV = 0 var filename_data = {} - 
fetch("filenames.json").then(response => response.json()).then(data => { + fetch("filenames.json", {cache: "no-store"}).then(response => response.json()).then(data => { filename_data = data.wave_filenames wave_cu_index = {}; @@ -272,7 +272,7 @@ document.getElementById("minimap").innerHTML = HTML_MINI document.getElementById("Images").innerHTML = HTML_IMAG - fetch("counters.json").then(response => response.json()).then(data => { + fetch("counters.json", {cache: "no-store"}).then(response => response.json()).then(data => { var html_gh = 'Normalize\t' for(var key in data.counters) { console.log(key, data.counters[key]) @@ -294,14 +294,14 @@ //document.getElementById('what').innerHTML = "" d3.select('nav').style('visibility', 'hidden') - fetch(file_to_gather) + fetch(file_to_gather, {cache: "no-store"}) .then(response => response.json()) .then(data => { code_data_file = file_to_gather.split('_sm')[0]+'_code.json' console.log("Requestd:", file_to_gather) console.log("Request code:", code_data_file) - fetch(code_data_file) + fetch(code_data_file, {cache: "no-store"}) .then(response => response.json()) .then(code_data => { const SP = '\u00A0' diff --git a/src/core/hsa/packets/packets_generator.cpp b/src/core/hsa/packets/packets_generator.cpp index 5434c25263..92702a1159 100644 --- a/src/core/hsa/packets/packets_generator.cpp +++ b/src/core/hsa/packets/packets_generator.cpp @@ -468,7 +468,7 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets( } // ATT -uint32_t g_output_buffer_size = 0x8000000; // 128M x 16 = 2GB +uint32_t g_output_buffer_size = 0x40000000; // 1GB bool g_output_buffer_local = true; // Allocate system memory accessible by both CPU and GPU diff --git a/src/core/hsa/queues/queue.cpp b/src/core/hsa/queues/queue.cpp index cbdc14dff6..8d58c5d426 100644 --- a/src/core/hsa/queues/queue.cpp +++ b/src/core/hsa/queues/queue.cpp @@ -50,7 +50,6 @@ } while (0) #define __NR_gettid 186 -#define MAX_ATT_PROFILES 16 std::mutex 
sessions_pending_signal_lock; @@ -664,6 +663,9 @@ std::atomic WRITER_ID{0}; */ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt_index, void* data, hsa_amd_queue_intercept_packet_writer writer) { + static const char* env_MAX_ATT_PROFILES = getenv("ROCPROFILER_MAX_ATT_PROFILES"); + static int MAX_ATT_PROFILES = env_MAX_ATT_PROFILES ? atoi(env_MAX_ATT_PROFILES) : 1; + const Packet::packet_t* packets_arr = reinterpret_cast(packets); std::vector transformed_packets;