From 675e1b9d38ed6d6783be95ea528d590400debe3b Mon Sep 17 00:00:00 2001 From: Giovanni LB Date: Mon, 25 Sep 2023 14:54:00 -0300 Subject: [PATCH] SWDEV-423898: Fixing issues with parallel kernels Change-Id: I6726f3003af6036ba041c2b4bc5227dd08691090 --- bin/rocprofv2 | 6 +- plugin/att/att.py | 267 ++++++++++------------------------- plugin/att/code_printing.cpp | 54 ------- plugin/att/disassembly.cpp | 26 ++-- plugin/att/drawing.py | 129 ++++++++++++----- plugin/att/stitch.py | 110 ++------------- plugin/att/trace_view.py | 81 ++++------- plugin/att/ui/index.html | 1 + 8 files changed, 225 insertions(+), 449 deletions(-) diff --git a/bin/rocprofv2 b/bin/rocprofv2 index a2a86f66bd..4012f10c39 100755 --- a/bin/rocprofv2 +++ b/bin/rocprofv2 @@ -194,11 +194,7 @@ while [ 1 ]; do ATT_ARGV="$ATT_ARGV $3 \"$4\"" shift shift - elif [[ "$3" = "--mpi" ]]; then - ATT_PYTHON3_ARG="mpirun -np $4 python3 " - shift - shift - elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" = "--genasm" || "$3" == "--att_kernel" || "$3" == "--depth" ]]; then + elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" == "--att_kernel" ]]; then ATT_ARGV="$ATT_ARGV $3 $4" shift shift diff --git a/plugin/att/att.py b/plugin/att/att.py index 7c9251e756..20ad9413c0 100755 --- a/plugin/att/att.py +++ b/plugin/att/att.py @@ -16,14 +16,7 @@ import glob import numpy as np from stitch import stitch import gc - -try: - from mpi4py import MPI - - MPI_IMPORTED = True -except: - MPI_IMPORTED = False - +from collections import defaultdict class PerfEvent(ctypes.Structure): _fields_ = [ @@ -130,6 +123,8 @@ class ReturnInfo(ctypes.Structure): ("occupancy", POINTER(ctypes.c_uint64)), ("num_occupancy", ctypes.c_uint64), ("flags", ctypes.c_uint64), + ("kernel_id_addr", POINTER(ctypes.c_uint64)), + ("num_kernel_ids", ctypes.c_uint64), ] @@ -162,10 +157,15 @@ def parse_binary(filename, kernel=None): info = SO.wrapped_parse_binary(str(filename).encode("utf-8"), kernel) code = [] + kernel_addr = defaultdict(lambda : "Unknown") + last_known_function = "Unknown" for k in range(info.code_len): code_entry = info.code[k] line = deepcopy(code_entry.line.decode("utf-8")) + if "; Begin " in line: + last_known_function = line.split("; Begin ")[1] + loc = deepcopy(code_entry.loc.decode("utf-8")) to_line = int(code_entry.to_line) if (code_entry.to_line >= 0) else None @@ -175,31 +175,31 @@ def parse_binary(filename, kernel=None): code.append([line, int(code_entry.value), to_line, loc, int(code_entry.index), int(code_entry.line_num), int(code_entry.addr), 0, 0]) + if code[-1][-3] != 0 and len(code) > 1: + kernel_addr[code[-1][-3]] = last_known_function + jumps = {} for k in range(info.jumps_len): jumps[info.jumps[k].key] = info.jumps[k].value - return code, jumps + return code, jumps, kernel_addr -def getWaves_binary(name, shader_engine_data_dict, target_cu, depth): +def getWaves_binary(name, shader_engine_data_dict, target_cu): filename = os.path.abspath(str(name)) info = SO.AnalyseBinary(filename.encode("utf-8"), target_cu, False) + kernel_addr = [int(info.kernel_id_addr[k]) for k in range(info.num_kernel_ids)] + waves = [info.wavedata[k] for k in range(info.num_waves)] events = [deepcopy(info.perfevents[k]) for k in range(info.num_events)] occupancy = [int(info.occupancy[k]) for k in range(int(info.num_occupancy))] flags = "navi" if (info.flags & 0x1) else "vega" - wave_slot_count = [[0 for k in range(20)] for j in range(4)] waves_python = [] for wave in waves: - if ( - wave_slot_count[wave.simd][wave.wave_id] >= depth - or wave.instructions_size == 0 - ): + if wave.instructions_size < 2: continue - wave_slot_count[wave.simd][wave.wave_id] += 1 pwave = PythonWave(wave) pwave.timeline = [ (wave.timeline_array[2 * k], wave.timeline_array[2 * k + 1]) @@ -210,16 +210,16 @@ def getWaves_binary(name, shader_engine_data_dict, target_cu, depth): for k in range(wave.instructions_size) ] waves_python.append(pwave) - shader_engine_data_dict[name] = (waves_python, events, occupancy, flags) + shader_engine_data_dict[name] = (waves_python, events, occupancy, flags, kernel_addr) def getWaves_stitch(SIMD, code, jumps, flags, latency_map, hitcount_map, bIsAuto): for pwave in SIMD: pwave.instructions = stitch(pwave.instructions, code, jumps, flags, bIsAuto) - - for inst in pwave.instructions[0]: - hitcount_map[inst[-1]] += 1 - latency_map[inst[-1]] += inst[3] + if pwave.instructions is not None: + for inst in pwave.instructions[0]: + hitcount_map[inst[-1]] += 1 + latency_map[inst[-1]] += inst[3] def persist(trace_file, SIMD): @@ -232,6 +232,8 @@ def persist(trace_file, SIMD): smem_ins, smem_stalls, br_ins, br_taken_ins, br_stalls = [], [], [], [], [] for wave in SIMD: + if wave.instructions is None: + continue simds.append(wave.simd) waves.append(wave.wave_id) begin_time.append(wave.begin_time) @@ -344,50 +346,30 @@ def insert_waitcnt(flight_count, assembly_code): return assembly_code -def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): - for n, occ in enumerate(OCCUPANCY): - OCCUPANCY[n] = [ - max(min(int((u >> 16) - min_event_time) << 16, 2**42), 0) | (u & 0xFFFFF) - for u in occ - ] - for perf in EVENTS: - for p in perf: - p.time -= min_event_time - +def gen_timelines(DBFILES): + TIMELINES = [np.zeros(int(1E6), dtype=np.float32) for k in range(5)] + TIME_RESOLUTION = 16 for df in DBFILES: for T in range(len(df["timeline"])): timeline = df["timeline"][T] time_acc = 0 - tuples3 = [(0, df["begin_time"][T] - min_event_time)] + [ - (int(t[0]), int(t[1])) for t in timeline - ] + tuples3 = [(0, df["begin_time"][T])] + [(int(t[0]), int(t[1])) for t in timeline] for state in tuples3: - if state[1] > 1e8: + t_end = (time_acc + state[1])//TIME_RESOLUTION + if t_end > 1E8: print("Warning: Time limit reached for ", state[0], state[1]) break - if time_acc + state[1] > TIMELINES[state[0]].size: + elif t_end > TIMELINES[state[0]].size: TIMELINES[state[0]] = np.hstack( [TIMELINES[state[0]], np.zeros_like(TIMELINES[state[0]])] ) - TIMELINES[state[0]][time_acc : time_acc + state[1]] += 1 + TIMELINES[state[0]][time_acc//TIME_RESOLUTION : t_end] += 1 time_acc += state[1] + return TIMELINES if __name__ == "__main__": - comm = None - mpi_root = True - if MPI_IMPORTED: - try: - comm = MPI.COMM_WORLD - if comm.Get_size() < 2: - comm = None - else: - mpi_root = comm.Get_rank() == 0 - except: - print("Could not load MPI") - comm = None - pathenv = os.getenv("OUTPUT_PATH") if pathenv is None: pathenv = "." @@ -395,9 +377,6 @@ if __name__ == "__main__": parser.add_argument( "assembly_code", help="Path to the assembly code. Must be the first parameter." ) - parser.add_argument( - "--depth", help="Maximum number of parsed waves per slot", default=100, type=int - ) parser.add_argument( "--trace_file", help="Filter for trace files", default=None, type=str ) @@ -405,12 +384,6 @@ if __name__ == "__main__": "--att_kernel", help="Kernel file", type=str, default=pathenv + "/*_kernel.txt" ) parser.add_argument("--ports", help="Server and websocket ports, default: 8000,18000") - parser.add_argument( - "--genasm", - help="Generate post-processed asm file at this path", - type=str, - default="", - ) parser.add_argument( "--mode", help="""ATT analysis modes:\n @@ -455,22 +428,19 @@ if __name__ == "__main__": print("Could not find att output kernel:", args.att_kernel) exit(1) elif len(att_kernel) > 1: - if mpi_root: - print("Found multiple kernel matching given filters:") - for n, k in enumerate(att_kernel): - print("\t", n, "->", k) + print("Found multiple kernel matching given filters:") + for n, k in enumerate(att_kernel): + print("\t", n, "->", k) - bValid = False - while bValid == False: - try: - args.att_kernel = att_kernel[int(input("Please select number: "))] - bValid = True - except KeyboardInterrupt: - exit(0) - except: - print("Invalid option.") - if comm is not None: - args.att_kernel = comm.bcast(args.att_kernel, root=0) + bValid = False + while bValid == False: + try: + args.att_kernel = att_kernel[int(input("Please select number: "))] + bValid = True + except KeyboardInterrupt: + exit(0) + except: + print("Invalid option.") else: args.att_kernel = att_kernel[0] @@ -491,38 +461,31 @@ if __name__ == "__main__": filenames = glob.glob(args.trace_file) assert len(filenames) > 0 - if comm is not None: - filenames = filenames[comm.Get_rank() :: comm.Get_size()] - - code = jumps = None - if mpi_root: - print('Att kernel:', args.att_kernel) - code, jumps = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel) + print('Att kernel:', args.att_kernel) + code, jumps, kern_addr = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel) DBFILES = [] - TIMELINES = [np.zeros(int(1e4), dtype=np.int16) for k in range(5)] EVENTS = [] OCCUPANCY = [] GFXV = [] analysed_filenames = [] occupancy_filenames = [] - + dispatch_kernel_names = {} shader_engine_data_dict = {} for name in filenames: - getWaves_binary(name, shader_engine_data_dict, args.target_cu, args.depth) - - if comm is not None: - code = comm.bcast(code, root=0) - jumps = comm.bcast(jumps, root=0) + getWaves_binary(name, shader_engine_data_dict, args.target_cu) gc.collect() latency_map = np.zeros((len(code)), dtype=np.int64) hitcount_map = np.zeros((len(code)), dtype=np.int32) for name in filenames: - SIMD, perfevents, occupancy, gfxv = shader_engine_data_dict[name] - if len(occupancy) > 0: + SIMD, perfevents, occupancy, gfxv, addrs = shader_engine_data_dict[name] + + for id, addr in enumerate(addrs): + dispatch_kernel_names[id] = kern_addr[addr] + if len(occupancy) > 16: OCCUPANCY.append( occupancy ) - occupancy_filenames.append( name ) + occupancy_filenames.append(name) if np.sum([0]+[len(s.instructions) for s in SIMD]) == 0: print("No waves from", name) continue @@ -534,117 +497,33 @@ if __name__ == "__main__": GFXV.append(gfxv) gc.collect() - min_event_time = 2**62 - for df in DBFILES: - if len(df["begin_time"]) > 0: - min_event_time = min(min_event_time, np.min(df["begin_time"])) - for perf in EVENTS: - for p in perf: - min_event_time = min(min_event_time, p.time) - for occ in OCCUPANCY: - min_event_time = min(min_event_time, np.min(np.array(occ) >> 16)) - - gc.collect() - min_event_time = max(0, min_event_time - 32) - if comm is not None: - min_event_time = comm.reduce(min_event_time, op=MPI.MIN) - min_event_time = comm.bcast(min_event_time, root=0) - - apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES) - - GFXV = comm.gather(GFXV, root=0) - EVENTS = comm.gather(EVENTS, root=0) - OCCUPANCY = comm.gather(OCCUPANCY, root=0) - TIMELINES = comm.gather(TIMELINES, root=0) - gather_latency_map = comm.gather(latency_map, root=0) - gather_hitcount_map = comm.gather(hitcount_map, root=0) - gathered_filenames = comm.gather(occupancy_filenames, root=0) - - if mpi_root: - latency_map *= 0 - hitcount_map *= 0 - for hit, lat in zip(gather_hitcount_map, gather_latency_map): - hitcount_map += hit - latency_map += lat - EVENTS = [e for elem in EVENTS for e in elem] - OCCUPANCY = [e for elem in OCCUPANCY for e in elem] - gathered_filenames = [e for elem in gathered_filenames for e in elem] - gfxv = [e for elem in GFXV for e in elem][0] - - TIMELINES_GATHER = TIMELINES - TIMELINES = [ - np.zeros((np.max([len(tm[k]) for tm in TIMELINES])), np.int16) - for k in range(5) - ] - for gather in TIMELINES_GATHER: - for t, m in zip(TIMELINES, gather): - t[: len(m)] += m - del TIMELINES_GATHER - else: # free up memory - TIMELINES = [] - OCCUPANCY = [] - EVENTS = [] - else: - apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES) - gathered_filenames = occupancy_filenames - - if mpi_root: - for k in range(len(code)): - code[k][-2] = int(hitcount_map[k]) - code[k][-1] = int(latency_map[k]) + for k in range(len(code)): + code[k][-2] = int(hitcount_map[k]) + code[k][-1] = int(latency_map[k]) if CSV_MODE: - if mpi_root: - from att_to_csv import dump_csv - dump_csv(code) + from att_to_csv import dump_csv + dump_csv(code) quit() + gc.collect() - print("Min time:", min_event_time) drawinfo = { - "TIMELINES": TIMELINES, + "TIMELINES": gen_timelines(DBFILES), "EVENTS": EVENTS, "EVENT_NAMES": EVENT_NAMES, "OCCUPANCY": OCCUPANCY, - "ShaderNames": gathered_filenames, + "ShaderNames": occupancy_filenames, + "DispatchNames": dispatch_kernel_names, } - if args.genasm and len(args.genasm) > 0: - flight_count = view_trace( - args, - code, - DBFILES, - analysed_filenames, - True, - OCCUPANCY, - args.dumpfiles, - min_event_time, - gfxv, - drawinfo, - comm, - mpi_root, - ) - with open(args.assembly_code, "r") as file: - lines = file.readlines() - assembly_code = {l + 1.0: lines[l][:-1] for l in range(len(lines))} - assembly_code = insert_waitcnt(flight_count, assembly_code) - - with open(args.genasm, "w") as file: - keys = sorted(assembly_code.keys()) - for k in keys: - file.write(assembly_code[k] + "\n") - else: - view_trace( - args, - code, - DBFILES, - analysed_filenames, - False, - OCCUPANCY, - args.dumpfiles, - min_event_time, - gfxv, - drawinfo, - comm, - mpi_root, - ) + view_trace( + args, + code, + DBFILES, + analysed_filenames, + args.dumpfiles, + 0, + gfxv, + drawinfo + ) diff --git a/plugin/att/code_printing.cpp b/plugin/att/code_printing.cpp index aff5259a99..ed5db7ab31 100644 --- a/plugin/att/code_printing.cpp +++ b/plugin/att/code_printing.cpp @@ -136,55 +136,6 @@ std::optional code_object_decoder_t::find_ return {}; } -/* -void code_object_decoder_t::load_symbol_map() { - std::unique_ptr elf ( - elf_begin(m_fd, ELF_C_READ, nullptr), - [](Elf *elf){ elf_end(elf); }); - - if (!elf) { - rocprofiler::warning("Error opening ELF!\n"); - return; - } - - Elf64_Ehdr *ehdr = elf64_getehdr(elf.get()); - if (!ehdr) { - printf("elf64_getehdr failed\n"); - return; - } - - // Slurp the symbol table. - Elf_Scn *scn = nullptr; - while ((scn = elf_nextscn(elf.get(), scn)) != nullptr) { - GElf_Shdr shdr_mem; - GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_mem); - if (shdr->sh_type != SHT_SYMTAB && shdr->sh_type != SHT_DYNSYM) { - continue; - } - - Elf_Data *data = elf_getdata(scn, nullptr); - if (!data) continue; - - size_t symbol_count = data->d_size / gelf_fsize(elf.get(), ELF_T_SYM, 1, EV_CURRENT); - for (size_t j = 0; j < symbol_count; ++j) { - GElf_Sym sym_mem; - GElf_Sym *sym = gelf_getsym(data, j, &sym_mem); - - if (GELF_ST_TYPE(sym->st_info) != STT_FUNC || sym->st_shndx == SHN_UNDEF) continue; - - std::string symbol_name{ elf_strptr(elf.get(), shdr->sh_link, sym->st_name) }; - auto symbol_pair = std::make_pair(symbol_name, sym->st_size); - - auto [it, success] = m_symbol_map.emplace(sym->st_value, symbol_pair); - - // If there already was a symbol defined at this address, but this - // new symbol covers a larger address range, replace the old symbol - // with this new one. - if (!success && sym->st_size > it->second.second) it->second = symbol_pair; - } - } -} */ - void code_object_decoder_t::disassemble_kernel(uint64_t addr) { auto symbol = find_symbol(addr); @@ -193,9 +144,6 @@ void code_object_decoder_t::disassemble_kernel(uint64_t addr) { return; } - // if (symbol->m_name.find("__amd_rocclr_") == 0) - // return; - std::cout << "Dumping ISA for " << symbol->m_name << std::endl; uint64_t end_addr = addr + symbol->m_size; @@ -218,8 +166,6 @@ void code_object_decoder_t::disassemble_kernel(uint64_t addr) { void code_object_decoder_t::disassemble_kernels() { disassembly = std::make_unique(*this); - - // if (m_symbol_map.begin() == m_symbol_map.end()) m_symbol_map = disassembly->GetKernelMap(); for (auto& [k, v] : m_symbol_map) disassemble_kernel(k); diff --git a/plugin/att/disassembly.cpp b/plugin/att/disassembly.cpp index d564719ed2..d94e22931a 100644 --- a/plugin/att/disassembly.cpp +++ b/plugin/att/disassembly.cpp @@ -57,8 +57,9 @@ if (amd_comgr_status_s status = call) { \ const char* reason = ""; \ amd_comgr_status_string(status, &reason); \ + std::cerr << __LINE__ << " code: " << status << std::endl; \ std::cerr << __LINE__ << " failed: " << reason << std::endl; \ - return; \ + exit(1); \ } CodeObjectBinary::CodeObjectBinary(const std::string& uri) : m_uri(uri) { @@ -156,12 +157,12 @@ DisassemblyInstance::DisassemblyInstance(code_object_decoder_t& decoder) : buffer(reinterpret_cast(decoder.buffer.data())), size(decoder.buffer.size()), instructions(decoder.instructions) { - amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &data); - amd_comgr_set_data(data, size, decoder.buffer.data()); + CHECK_COMGR(amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &data)); + CHECK_COMGR(amd_comgr_set_data(data, size, decoder.buffer.data())); char isa_name[128]; size_t isa_size = sizeof(isa_name); - amd_comgr_get_data_isa_name(data, &isa_size, isa_name); + CHECK_COMGR(amd_comgr_get_data_isa_name(data, &isa_size, isa_name)); CHECK_COMGR(amd_comgr_create_disassembly_info( isa_name, //"amdgcn-amd-amdhsa--gfx1100", @@ -172,24 +173,24 @@ DisassemblyInstance::DisassemblyInstance(code_object_decoder_t& decoder) amd_comgr_status_t DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbol, void* user_data) { amd_comgr_symbol_type_t type; - amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type); + CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type)); if (type != AMD_COMGR_SYMBOL_TYPE_FUNC && type != AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL) return AMD_COMGR_STATUS_SUCCESS; uint64_t addr; - amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &addr); + CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &addr)); uint64_t mem_size; - amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size); + CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size)); uint64_t name_size; - amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size); + CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size)); std::string name; name.resize(name_size); - amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data()); + CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data())); static_cast(user_data)->symbol_map[addr] = {name, mem_size}; return AMD_COMGR_STATUS_SUCCESS; @@ -197,18 +198,19 @@ amd_comgr_status_t DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbo std::map>& DisassemblyInstance::GetKernelMap() { symbol_map = std::map>{}; - amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this); + CHECK_COMGR(amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this)); return symbol_map; } DisassemblyInstance::~DisassemblyInstance() { - amd_comgr_release_data(data); + CHECK_COMGR(amd_comgr_release_data(data)); CHECK_COMGR(amd_comgr_destroy_disassembly_info(info)); } uint64_t DisassemblyInstance::ReadInstruction(uint64_t addr, const char* cpp_line) { uint64_t size_read; - amd_comgr_disassemble_instruction(info, buffer + addr, (void*)this, &size_read); + CHECK_COMGR(amd_comgr_disassemble_instruction(info, buffer + addr, (void*)this, &size_read)); + assert(instructions.size() != 0); instructions.back().address = addr; instructions.back().cpp_reference = cpp_line; return size_read; diff --git a/plugin/att/drawing.py b/plugin/att/drawing.py index b6a5e62a24..945ab30e50 100644 --- a/plugin/att/drawing.py +++ b/plugin/att/drawing.py @@ -153,6 +153,7 @@ def draw_wave_states(selections, normalize, TIMELINES): plt.figure(figsize=(15, 4)) + maxtime = max([np.max((TIMELINES[k]!=0)*np.arange(0,TIMELINES[k].size)) for k in plot_indices]) maxtime = max(maxtime, 1) timelines = [deepcopy(TIMELINES[k][:maxtime]) for k in plot_indices] @@ -169,21 +170,18 @@ def draw_wave_states(selections, normalize, TIMELINES): else cycles * 0 for time in timelines ] - kernsize = 21 - kernel = np.asarray( - [ - np.exp(-abs(10 * k / kernsize)) - for k in range(-kernsize // 2, kernsize // 2 + 1) - ] - ) + kernsize = 15 + kernel = np.asarray([ + np.exp(-abs(10 * k / kernsize)) for k in range(-kernsize // 2, kernsize // 2 + 1) + ]) kernel /= np.sum(kernel) timelines = [ np.convolve(time, kernel)[kernsize // 2 : -kernsize // 2] - for time in timelines - if len(time) > 0 + for time in timelines if len(time) > 0 ] - + maxtime *= 16 + cycles *= 16 [ plt.plot(cycles, t, label="State " + s, linewidth=1.1, color=c) for t, s, c, sel in zip(timelines, STATES, colors, selections) @@ -204,48 +202,113 @@ def draw_wave_states(selections, normalize, TIMELINES): return STATES, FileBytesIO(figure_bytes) -def draw_occupancy(selections, normalize, OCCUPANCY, shadernames): +def draw_occupancy_per_dispatch(selections, normalize, OCCUPANCY, dispatchnames): + plt.figure(figsize=(15, 4)) + maxtime = 1 + delta = 1 + + for k in range(len(OCCUPANCY)): + if len(OCCUPANCY[k]) <= 16: + continue + OCCUPANCY[k] = [(16*int(u>>23), (u>>12) & 0x7F, (u>>19) & 0xF, u&0xFFF) for u in OCCUPANCY[k]] + maxtime = max(maxtime, OCCUPANCY[k][-1][0]) + + NUM_DOTS = 1600 + delta = max(1, maxtime // NUM_DOTS) + chart = np.zeros((len(dispatchnames), maxtime // delta + 2), dtype=np.float32) + + for occ in OCCUPANCY: + if len(occ) <= 16: + continue + small_chart = np.zeros_like(chart) + norm_fact = np.zeros_like(chart) + norm_fact += 1E-6 + + current_occ = [[0 for m in range(16)] for k in range(len(dispatchnames))] + current_occ[0] = [m[1] for m in occ[:16]] + current_time = [0 for k in range(len(dispatchnames))] + total_value = [0 for k in range(len(dispatchnames))] + total_value[0] = np.sum(current_occ[0]) + + for time, value, cu, kid in occ: + b = current_time[kid] + e = max(b + 1, time // delta) + small_chart[kid][b:e] += total_value[kid] + norm_fact[kid][b:e] += 1 + + total_value[kid] += value - current_occ[kid][cu] + current_occ[kid][cu] = value + current_time[kid] = time // delta + for small, norm, time, value in zip(small_chart, norm_fact, current_time, total_value): + small[time] += value + norm[time] += value + + chart += small_chart/norm_fact + + for (id, name), occ in zip(dispatchnames.items(), chart): + plt.plot(np.arange(occ.size) * delta, occ, label=str(id)+'#'+name, linewidth=1.1) + + plt.legend() + if normalize: + plt.ylabel("Occupancy %") + else: + plt.ylabel("Occupancy total") + plt.xlabel("Cycle") + plt.ylim(-1) + plt.xlim(-maxtime // 200, maxtime + maxtime // 200 + delta + 1) + plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1) + figure_bytes = BytesIO() + plt.savefig(figure_bytes, dpi=150) + return dispatchnames, FileBytesIO(figure_bytes) + + +def draw_occupancy(selections, normalize, OCCUPANCY, shadernames, numdispatchid): plt.figure(figsize=(15, 4)) names = [] if len(OCCUPANCY) == 1: # If single SE, do occupancy per CU/WGP - OCCUPANCY = [[u for u in OCCUPANCY[0] if u&0xFF==k] for k in range(16)] - shadernames = ['CU'+str(k) for k in range(16) if len(OCCUPANCY[k]) > 0] - OCCUPANCY = [occ for occ in OCCUPANCY if len(occ) > 0] + percu = [[u for u in OCCUPANCY[0] if (u>>19) & 0xF == k] for k in range(16)] + shadernames = shadernames + [['CU'+str(k),''] for k in range(16) if len(percu[k]) > 0] + OCCUPANCY = OCCUPANCY + [occ for occ in percu if len(occ) > 0] - maxtime = 1 - delta = 1 for name, occ in zip(shadernames, OCCUPANCY): - occ_values = [0] - occ_times = [0] - occ = [(int(u >> 16), (u >> 8) & 0xFF, u & 0xFF) for u in occ] - current_occ = [0 for k in range(16)] + if len(occ) <= 16: + continue + maxtime = 1 + delta = 1 + occ = [(16*int(u >> 23), (u >> 12) & 0x7F, (u>>19) & 0xF, u&0xFFF) for u in occ] + current_occ = [[0 for m in range(16)] for k in range(numdispatchid)] + current_occ[0] = [m[1] for m in occ[:16]] - for time, value, cu in occ: + occ_values = [np.sum(current_occ[0])] + occ_times = [0] + + for time, value, cu, kid in occ: occ_times.append(time) - occ_values.append(occ_values[-1] + value - current_occ[cu]) - current_occ[cu] = value + occ_values.append(occ_values[-1] + value - current_occ[kid][cu]) + current_occ[kid][cu] = value try: - names.append('SE'+name.split('.att')[0].split('_se')[-1]) + names.append('SE'+name.split('_se')[1].split('.att')[0]) except: names.append(name) NUM_DOTS = 1500 - maxtime = np.max(occ_times) + maxtime = occ_times[-1]+1 delta = max(1, maxtime // NUM_DOTS) chart = np.zeros((maxtime // delta + 1), dtype=np.float32) norm_fact = np.zeros_like(chart) + norm_fact += 1E-6 - for i, t in enumerate(occ_times[:-1]): - b = t // delta + for i in range(len(occ_times)-1): + b = occ_times[i] // delta e = max(b + 1, occ_times[i + 1] // delta) chart[b:e] += occ_values[i] norm_fact[b:e] += 1 - chart /= np.maximum(norm_fact, 1) + chart /= norm_fact if normalize: chart /= max(chart.max(), 1e-6) - plt.plot(np.arange(chart.size) * delta, chart, label=name, linewidth=1.1) + plt.plot(np.arange(chart.size) * delta, chart, label=names[-1], linewidth=1.1) plt.legend() if normalize: @@ -267,12 +330,14 @@ def GeneratePIC(drawinfo, selections=[True for k in range(16)], normalize=False) response = {} figures = {} - states, figure = draw_occupancy( - selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"] - ) + states, figure = draw_occupancy(selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"], len(drawinfo["DispatchNames"])) response["occupancy.png"] = states figures["occupancy.png"] = figure + states, figure = draw_occupancy_per_dispatch(selections, normalize, drawinfo["OCCUPANCY"], drawinfo["DispatchNames"]) + response["dispatches.png"] = states + figures["dispatches.png"] = figure + states, figure = draw_wave_states(selections, normalize, drawinfo["TIMELINES"]) response["timeline.png"] = states figures["timeline.png"] = figure diff --git a/plugin/att/stitch.py b/plugin/att/stitch.py index 0cd03bd55a..f67485b282 100644 --- a/plugin/att/stitch.py +++ b/plugin/att/stitch.py @@ -193,104 +193,11 @@ def try_match_swapped(insts, code, i, line): return insts[i + 1][1] == code[line][1] and insts[i][1] == code[line + 1][1] -FORK_NAMES = 1 -# A successful parsed instruction -class CachedInst: - def __init__(self, inst, as_line): - self.inst_type = inst - self.as_line = as_line - self.forks = None - -# A branch of the parsing tree -class Fork: - def __init__(self): - global FORK_NAMES - self.insts = [] - self.data = None - self.name = FORK_NAMES - FORK_NAMES += 1 - # print('Created new fork: ', self.name) - -# Try to match sequence "insts" with the branch "fork", starting at position "i" -def move_down_fork(fork, insts, i): #(fork : Fork, insts : list, i : int): - N = min(len(insts), len(fork.insts)) - - while i < N: - if insts[i][1] == fork.insts[i].inst_type: - i += 1 - elif i= len(cur_fork.insts): - return False, cur_fork - - last_inst = cur_fork.insts[i] - if last_inst.forks is None: - last_inst.forks = [] - - bMatchFork = False - for fork in last_inst.forks: - if fork.insts[0].inst_type == insts[0][1]: - cur_fork = fork - bMatchFork = True - break - if not bMatchFork: - cur_fork = Fork() - last_inst.forks.append(cur_fork) - return False, cur_fork - - print("Warning: Reached end of loop!") - return False, cur_fork - - def stitch(insts, raw_code, jumps, gfxv, bIsAuto): bGFX9 = gfxv == 'vega' - # Try from cached result from a previous wave that have already been parsed - dict_sucess, current_fork = fromDict(insts) - if dict_sucess: - result, loopCount, mem_unroll, flight_count, maxline, pcsequence = current_fork.data - # Check if the sequence of measured PC values are equal for cached and new wave - if len(pcsequence) > 0: - pcs = [r[2] for r in insts if r[1] == PCINFO] - if len(pcs) != len(pcsequence): - dict_sucess = False - for pc1, pc2 in zip(pcs, pcsequence): - if pc1 != pc2: - dict_sucess = False - - # If successful, use resulting assembly from cache - if dict_sucess: - result = [r+(asm[-1],) for r, asm in zip(insts, result)] - return result, loopCount, mem_unroll, flight_count, maxline, len(result) - result, i, line, loopCount, N = [], 0, 0, defaultdict(int), len(insts) - SMEM_INST = [] # scalar memory VLMEM_INST = [] # vector memory load VSMEM_INST = [] # vector memory store @@ -310,10 +217,6 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): # Clean the code and remove comments code = [raw_code[0]] for c in raw_code[1:]: - if bIsAuto and '; Begin ' == c[0][:len('; Begin ')]: - if '; Begin ' in c[0]: - line = len(code) - print('Begin at:', line, c) c = list(c) c[0] = c[0].split(";")[0].split("//")[0].strip() @@ -339,7 +242,16 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): loops = 0 maxline = 0 - watchlist = RegisterWatchList(labels=labels) if not bIsAuto else PCTranslator(code, insts) + if bIsAuto and len(insts) and insts[0][1] == PCINFO: + try: + watchlist = PCTranslator(code, insts) + line = watchlist.addrmap[insts[0][2]] + result.append((insts[0][0], PCINFO, 0, 0, 0)) + i = 1 + except: + return None + else: + watchlist = RegisterWatchList(labels=labels) pcsequence = [] while i < N: @@ -534,7 +446,5 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): break line += 1 - current_fork.insts = [CachedInst(inst[1], inst[-1]) for inst in result] - current_fork.data = result, loopCount, mem_unroll, flight_count, maxline, pcsequence result = [r for r in result if r[1] != PCINFO] return result, loopCount, mem_unroll, flight_count, maxline, len(result) if i == N else N diff --git a/plugin/att/trace_view.py b/plugin/att/trace_view.py index 806a90bbeb..134cc7a4e0 100755 --- a/plugin/att/trace_view.py +++ b/plugin/att/trace_view.py @@ -296,25 +296,25 @@ def view_trace( code, dbnames, att_filenames, - bReturnLoc, - OCCUPANCY, bDumpOnly, se_time_begin, gfxv, - drawinfo, - MPI_COMM, - mpi_root, + drawinfo ): global JSON_GLOBAL_DICTIONARY pic_thread = None - if mpi_root: - manager = Manager() - return_dict = manager.dict() - JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable( - {str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))} - ) - pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo)) - pic_thread.start() + + manager = Manager() + return_dict = manager.dict() + occ_dict = {str(k): drawinfo["OCCUPANCY"][k] for k in range(len(drawinfo["OCCUPANCY"]))} + occ_dict['dispatches'] = {} + for id, name in drawinfo['DispatchNames'].items(): + occ_dict['dispatches'][id] = name + occ_dict['names'] = drawinfo['ShaderNames'] + + JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable(occ_dict) + pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo)) + pic_thread.start() att_filenames = [Path(f).name for f in att_filenames] se_numbers = [int(a.split("_se")[1].split(".att")[0]) for a in att_filenames] @@ -337,9 +337,8 @@ def view_trace( flight_count.append(count) simd_wave_filenames[se_number] = wv_filenames - if mpi_root: - code_sel = [c[:-3]+c[-2:] for c in code[:allse_maxline+16]] - JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)}) + code_sel = [c[:-3]+c[-2:] for c in code[:allse_maxline+16]] + JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)}) for key in simd_wave_filenames.keys(): wv_array = [ @@ -367,42 +366,21 @@ def view_trace( simd_wave_filenames[key] = wv_dict - if MPI_COMM is not None: - se_filenames = MPI_COMM.gather(se_filenames, root=0) - simd_wave_filenames = MPI_COMM.gather(simd_wave_filenames, root=0) - if mpi_root: - se_filenames = [e for elem in se_filenames for e in elem] - simd_wave_filenames = { - k: v for smf in simd_wave_filenames for k, v in smf.items() - } - - if mpi_root: - JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable( - { - "wave_filenames": simd_wave_filenames, - "se_filenames": se_filenames, - "global_begin_time": int(se_time_begin), - "gfxv": gfxv, - } - ) + JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable( + { + "wave_filenames": simd_wave_filenames, + "se_filenames": se_filenames, + "global_begin_time": int(se_time_begin), + "gfxv": gfxv, + } + ) if pic_thread is not None: pic_thread.join() for k, v in return_dict.items(): JSON_GLOBAL_DICTIONARY[k] = v - if bReturnLoc: - return flight_count - if bDumpOnly == False: - if MPI_COMM is not None: - JSON_GLOBAL_DICTIONARY = MPI_COMM.gather(JSON_GLOBAL_DICTIONARY, root=0) - if not mpi_root: - quit() - JSON_GLOBAL_DICTIONARY = { - k: v for smf in JSON_GLOBAL_DICTIONARY for k, v in smf.items() - } - JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 1}) if args.ports: assign_ports(args.ports) @@ -420,13 +398,12 @@ def view_trace( print("Exitting.") else: os.makedirs("ui/", exist_ok=True) - if mpi_root: - JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0}) - os.system( - "cp " - + os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui") - + "/* ui/" - ) + JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0}) + os.system( + "cp " + + os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui") + + "/* ui/" + ) for k, v in JSON_GLOBAL_DICTIONARY.items(): with open(os.path.join("ui", k), "w" if ".json" in k else "wb") as f: f.write(v.read()) diff --git a/plugin/att/ui/index.html b/plugin/att/ui/index.html index c6418511ef..185b3b5bc3 100644 --- a/plugin/att/ui/index.html +++ b/plugin/att/ui/index.html @@ -14,6 +14,7 @@
+