SWDEV-402866: Added browser file mode. Fixed caching. Changed ATT buffer size. Added MAX_ATT environment variable. Updated README/Changelog.
Change-Id: I36a6093bb85bf4ef179b59df676fc2e4cbdb5288
This commit is contained in:
committed by
Giovanni Baraldi
szülő
44e787957b
commit
59ee21f7d1
+18
-2
@@ -209,18 +209,26 @@ The resulting `a.out` will depend on
|
||||
|
||||
### Optimized
|
||||
- Improved Test Suite
|
||||
### Changed
|
||||
- ATT analysis will not run by default. For ATT to have the same behaviour as 5.5, use --plugin att <as.s> --mode network
|
||||
### Added
|
||||
- 'end_time' needs to be disabled in roctx_trace.txt
|
||||
- support for hsa_amd_memory_async_copy_on_engine API function trace
|
||||
|
||||
### Fixed
|
||||
- rocprof in ROCm/5.4.0 GPU selector broken.
|
||||
- rocprof in ROCm/5.4.1 fails to generate kernel info.
|
||||
- rocprof clobbers LD_PRELOAD.
|
||||
|
||||
## ROCprofiler for ROCm 5.7.0
|
||||
### Navi support
|
||||
Rocprofiler for ROCm 5.7 added support for counter collection (PMC) and advanced thread tracing (ATT) for Navi21 and Navi31 GPUs.
|
||||
- On Navi, especially Navi31, counter collection requires the GPU to be in a stable power state. See README.md for instructions.
|
||||
- Navi does not support streaming SQ counters and ATT at the same time, unlike GFX9.
|
||||
- On Navi ATT, "att: target_cu" indexes the WGP and the SIMD_MASK parameter is actually the SIMD_ID, in the range [0,3].
|
||||
- HIP RT in ATT not yet supported.
|
||||
### Changed
|
||||
- ATT analysis will not run by default. For ATT to have the same behaviour as 5.5, use --plugin att <as.s> --mode network
|
||||
### Optimized
|
||||
- ATT json filesizes
|
||||
### Added
|
||||
- Every API trace in V2 reported synchronously will have two records, one for the Enter phase and one for the Exit phase
|
||||
- File Plugin now reports the HSA OPS operation kind as part of the output text
|
||||
@@ -230,9 +238,17 @@ The resulting `a.out` will depend on
|
||||
- MI300 individual XCC counters dumped per-xcc as separate records but with same record-id and kernel dispatch info
|
||||
- Naming for MPI ranks. Filenames containing "%rank" are replaced by variables "MPI_RANK", "OMPI_COMM_WORLD_RANK" or "MV2_COMM_WORLD_RANK".
|
||||
- MPI Rank will appear in perfetto track names.
|
||||
- SE_MASK parameter in ATT, a binary mask specifying for which shader engines to run ATT.
|
||||
On GFX9, SEs are masked out completely. On Navi only part of the data is masked.
|
||||
The use of SE_MASK=0x1 is heavily encouraged to avoid packet lost events.
|
||||
- "--mode file" option in ATT, which allows for parsed files to be stored. Run python3 httpserver.py from within ./UI/ to view files locally.
|
||||
- "ROCPROFILER_MAX_ATT_PROFILES" environment variable can be set. Previously fixed at 16, now the default is 1.
|
||||
- Increased ATT buffer size per collection to 1GB.
|
||||
### Fixed
|
||||
- Samples are fixed to show the new usage of phases.
|
||||
- Plugin option validates the plugin names.
|
||||
- Fixed rocsys: "rocsys -h" can now be called to list the rocsys options
|
||||
- "--output-file" option ignored when no output folder was specified.
|
||||
- Perfetto crash when using ROCTX and/or no output file specified.
|
||||
- Parsing of the getpc, setpc and swappc instructions with registers loaded from scratch space.
|
||||
- Some browsers caching ATT data from older kernels.
|
||||
|
||||
@@ -381,3 +381,7 @@ samples can be run as independent executables once installed
|
||||
Please report in the Github Issues
|
||||
|
||||
## Limitations
|
||||
- Navi requires a stable power state for counter collection. Currently this state needs to be set by the user.
|
||||
To do so, set "power_dpm_force_performance_level" to be writeable for non-root users with chmod, then:
|
||||
echo profile_standard >> /sys/class/drm/card0/device/power_dpm_force_performance_level
|
||||
Recommended: "auto" or "high" for ATT and "profile_standard" for PMC. Use rocm-smi to verify the current power state.
|
||||
|
||||
@@ -57,11 +57,10 @@ install(TARGETS att_plugin
|
||||
|
||||
configure_file(att.py att/att.py COPYONLY)
|
||||
configure_file(trace_view.py att/trace_view.py COPYONLY)
|
||||
# configure_file(t.db att/t.db COPYONLY)
|
||||
configure_file(ui/index.html att/ui/index.html COPYONLY)
|
||||
configure_file(ui/logo.svg att/ui/logo.svg COPYONLY)
|
||||
configure_file(ui/styles.css att/ui/styles.css COPYONLY)
|
||||
# configure_file(ui/trace.json att/ui/trace.json COPYONLY)
|
||||
configure_file(ui/httpserver.py att/ui/httpserver.py COPYONLY)
|
||||
install(
|
||||
DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/att
|
||||
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/rocprofiler
|
||||
|
||||
@@ -19,7 +19,7 @@ from io import BytesIO
|
||||
|
||||
class FileBytesIO:
|
||||
def __init__(self, iobytes):
|
||||
self.iobytes = iobytes
|
||||
self.iobytes = deepcopy(iobytes)
|
||||
self.seek = 0
|
||||
|
||||
def __len__(self):
|
||||
@@ -27,9 +27,9 @@ class FileBytesIO:
|
||||
|
||||
def read(self, length=0):
|
||||
if length<=0:
|
||||
return bytes(self.getbuffer())
|
||||
return bytes(self.iobytes.getbuffer())
|
||||
else:
|
||||
if self.seek >= len(self):
|
||||
if self.seek >= self.iobytes.getbuffer().nbytes:
|
||||
self.seek = 0
|
||||
return None
|
||||
response = self.iobytes.getbuffer()[self.seek:self.seek+length]
|
||||
@@ -382,14 +382,14 @@ def draw_wave_states(selections, normalize):
|
||||
if normalize:
|
||||
timelines = np.array(timelines) / np.maximum(np.sum(timelines,0)*1E-2,1E-7)
|
||||
|
||||
kernsize = maxtime//150+1
|
||||
trim = max(maxtime//5000,1)
|
||||
cycles = np.arange(timelines[0].size)[::trim]
|
||||
|
||||
cycles = np.arange(0, timelines[0].size//trim, 1)*trim
|
||||
timelines = [time[:trim*(time.size//trim)].reshape((-1, trim)).mean(-1) if len(time) > 0 else cycles*0 for time in timelines]
|
||||
kernsize = 21
|
||||
kernel = np.asarray([np.exp(-abs(10*k/kernsize)) for k in range(-kernsize//2,kernsize//2+1)])
|
||||
kernel /= np.sum(kernel)
|
||||
|
||||
timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2][::trim] if len(time) > 0 else cycles*0 for time in timelines]
|
||||
timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2] for time in timelines]
|
||||
|
||||
[plt.plot(cycles, t, label='State '+s, linewidth=1.1, color=c)
|
||||
for t, s, c, sel in zip(timelines, STATES, colors, selections) if sel]
|
||||
@@ -456,8 +456,6 @@ if __name__ == "__main__":
|
||||
for line in lines:
|
||||
if 'PERFCOUNTER=' in line:
|
||||
EVENT_NAMES += [clean(line).split('SQ_')[1].lower()]
|
||||
if len(EVENT_NAMES) == 0:
|
||||
EVENT_NAMES = ['SPI', 'Vdata', 'Sdata', 'LDS']
|
||||
if args.target_cu is None:
|
||||
args.target_cu = 1
|
||||
|
||||
@@ -546,7 +544,7 @@ if __name__ == "__main__":
|
||||
tuples3 = [(0,df['begin_time'][T]-min_event_time)]+[(int(t[0]),int(t[1])) for t in tuples2]
|
||||
|
||||
for state in tuples3:
|
||||
if state[1] > 50E6:
|
||||
if state[1] > 1E8:
|
||||
print('Warning: Time limit reached for ',state[0], state[1])
|
||||
break
|
||||
if time_acc+state[1] > TIMELINES[state[0]].size:
|
||||
|
||||
@@ -798,10 +798,7 @@ def view_trace(args, code, jumps, dbnames, att_filenames, bReturnLoc, pic_callba
|
||||
print("Exitting.")
|
||||
else:
|
||||
os.makedirs('ui', exist_ok=True)
|
||||
os.system('cp ' + os.path.join(os.path.abspath(os.path.dirname(__file__)),'ui') + '/* ui/' )
|
||||
for k, v in JSON_GLOBAL_DICTIONARY.items():
|
||||
if '.json' in k:
|
||||
try:
|
||||
with open(os.path.join('ui',k), 'w') as f:
|
||||
f.write(v.read())
|
||||
except:
|
||||
pass
|
||||
with open(os.path.join('ui',k), 'w' if '.json' in k else 'wb') as f:
|
||||
f.write(v.read())
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
if sys.version_info[0] < 3:
|
||||
raise Exception("Must be using Python 3")
|
||||
|
||||
import http.server
|
||||
import socketserver
|
||||
import socket
|
||||
import os
|
||||
import sys
|
||||
|
||||
class NoCacheHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
    """Static-file request handler that tells clients not to cache responses."""

    def end_headers(self):
        """Append the no-cache headers, then finish the header block as usual."""
        self.send_my_headers()
        super().end_headers()

    def send_my_headers(self):
        """Emit the three headers that disable caching on the client side."""
        no_cache_headers = (
            ("Cache-Control", "no-cache, no-store, must-revalidate"),
            ("Pragma", "no-cache"),
            ("Expires", "0"),
        )
        for header_name, header_value in no_cache_headers:
            self.send_header(header_name, header_value)

    def do_GET(self):
        """Serve a GET request, mapping any 'timeline.png?...' URL to 'timeline.png'."""
        # Any request path containing 'timeline.png?' is rewritten to the bare
        # file name before the base class resolves it.
        if 'timeline.png?' in self.path:
            self.path = 'timeline.png'

        super().do_GET()
|
||||
|
||||
class RocTCPServer(socketserver.TCPServer):
    """TCP server that sets SO_REUSEADDR on its listening socket before binding.

    The previous hand-written server_bind() override set the option and bound
    the socket, but never refreshed ``server_address`` from the bound socket,
    so a server created with port 0 kept reporting port 0. Setting the
    documented ``allow_reuse_address`` flag makes the base-class server_bind()
    apply SO_REUSEADDR, bind, and record the actual bound address.
    """

    allow_reuse_address = True
|
||||
|
||||
def run_server():
    """Serve the directory containing this script over HTTP until interrupted.

    Binds a RocTCPServer to the module-level (IPAddr, PORT) pair and handles
    requests with NoCacheHTTPRequestHandler. A KeyboardInterrupt raised while
    serving is swallowed so the function returns normally.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Files are served relative to the current working directory.
    os.chdir(os.path.join(script_dir, '.'))

    bind_address = (IPAddr, PORT)
    try:
        with RocTCPServer(bind_address, NoCacheHTTPRequestHandler) as httpd:
            httpd.serve_forever()
    except KeyboardInterrupt:
        pass
|
||||
|
||||
def get_ip():
    """Return the IPv4 address the server should bind to.

    Resolves this machine's hostname and checks that the resulting address is
    usable by connecting a non-blocking UDP socket to it (no data is sent).

    Returns:
        str: the resolved address, or '127.0.0.1' if resolution or the probe
        connection fails.

    Fixes relative to the previous version:
      * the probe called connect(({IPAddr}, 1)), passing a set instead of a
        string, which raised TypeError on every call and therefore always
        fell back to '127.0.0.1';
      * the probe socket was never closed;
      * 'return' inside 'finally' silently discarded every exception,
        including KeyboardInterrupt.
    """
    try:
        ip_addr = socket.gethostbyname(socket.gethostname())
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
            probe.settimeout(0)
            probe.connect((ip_addr, 1))
    except (OSError, UnicodeError):
        # Best effort: an unresolvable hostname or unusable address falls
        # back to loopback.
        return '127.0.0.1'
    return ip_addr
|
||||
|
||||
# Script entry: pick the bind address and port, then serve until interrupted.
# IPAddr and PORT are module-level names that run_server() reads.
# NOTE(review): indentation was lost in this rendering, so it is unclear
# whether the print below belongs to the `if` — confirm against the real file.
IPAddr = get_ip()
|
||||
# Default port; replaced by the first command-line argument when one is given.
PORT = 8000
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
PORT = int(sys.argv[1])
|
||||
print('serving at port: {0}'.format(PORT))
|
||||
|
||||
# Start serving; a KeyboardInterrupt that reaches this level prints a message.
try:
|
||||
run_server()
|
||||
except KeyboardInterrupt:
|
||||
print("Exitting.")
|
||||
@@ -206,7 +206,7 @@
|
||||
var current_WV = 0
|
||||
var filename_data = {}
|
||||
|
||||
fetch("filenames.json").then(response => response.json()).then(data => {
|
||||
fetch("filenames.json", {cache: "no-store"}).then(response => response.json()).then(data => {
|
||||
filename_data = data.wave_filenames
|
||||
|
||||
wave_cu_index = {};
|
||||
@@ -272,7 +272,7 @@
|
||||
document.getElementById("minimap").innerHTML = HTML_MINI
|
||||
document.getElementById("Images").innerHTML = HTML_IMAG
|
||||
|
||||
fetch("counters.json").then(response => response.json()).then(data => {
|
||||
fetch("counters.json", {cache: "no-store"}).then(response => response.json()).then(data => {
|
||||
var html_gh = '<input type="checkbox" id="btn_norm" onclick="UpdImageSrc()" checked=true>Normalize\t'
|
||||
for(var key in data.counters) {
|
||||
console.log(key, data.counters[key])
|
||||
@@ -294,14 +294,14 @@
|
||||
//document.getElementById('what').innerHTML = ""
|
||||
|
||||
d3.select('nav').style('visibility', 'hidden')
|
||||
fetch(file_to_gather)
|
||||
fetch(file_to_gather, {cache: "no-store"})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
code_data_file = file_to_gather.split('_sm')[0]+'_code.json'
|
||||
console.log("Requestd:", file_to_gather)
|
||||
console.log("Request code:", code_data_file)
|
||||
|
||||
fetch(code_data_file)
|
||||
fetch(code_data_file, {cache: "no-store"})
|
||||
.then(response => response.json())
|
||||
.then(code_data => {
|
||||
const SP = '\u00A0'
|
||||
|
||||
@@ -468,7 +468,7 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
|
||||
}
|
||||
|
||||
// ATT
|
||||
uint32_t g_output_buffer_size = 0x8000000; // 128M x 16 = 2GB
|
||||
uint32_t g_output_buffer_size = 0x40000000; // 1GB
|
||||
bool g_output_buffer_local = true;
|
||||
|
||||
// Allocate system memory accessible by both CPU and GPU
|
||||
|
||||
@@ -50,7 +50,6 @@
|
||||
} while (0)
|
||||
|
||||
#define __NR_gettid 186
|
||||
#define MAX_ATT_PROFILES 16
|
||||
|
||||
std::mutex sessions_pending_signal_lock;
|
||||
|
||||
@@ -664,6 +663,9 @@ std::atomic<uint32_t> WRITER_ID{0};
|
||||
*/
|
||||
void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt_index, void* data,
|
||||
hsa_amd_queue_intercept_packet_writer writer) {
|
||||
static const char* env_MAX_ATT_PROFILES = getenv("ROCPROFILER_MAX_ATT_PROFILES");
|
||||
static int MAX_ATT_PROFILES = env_MAX_ATT_PROFILES ? atoi(env_MAX_ATT_PROFILES) : 1;
|
||||
|
||||
const Packet::packet_t* packets_arr = reinterpret_cast<const Packet::packet_t*>(packets);
|
||||
std::vector<Packet::packet_t> transformed_packets;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user