From 675e1b9d38ed6d6783be95ea528d590400debe3b Mon Sep 17 00:00:00 2001
From: Giovanni  LB <gbaraldi@amd.com>
Date: Mon, 25 Sep 2023 14:54:00 -0300
Subject: [PATCH] SWDEV-423898: Fixing issues with parallel kernels

Change-Id: I6726f3003af6036ba041c2b4bc5227dd08691090
---
 bin/rocprofv2                |   6 +-
 plugin/att/att.py            | 267 ++++++++++-------------------------
 plugin/att/code_printing.cpp |  54 -------
 plugin/att/disassembly.cpp   |  26 ++--
 plugin/att/drawing.py        | 129 ++++++++++++-----
 plugin/att/stitch.py         | 110 ++-------------
 plugin/att/trace_view.py     |  81 ++++-------
 plugin/att/ui/index.html     |   1 +
 8 files changed, 225 insertions(+), 449 deletions(-)

diff --git a/bin/rocprofv2 b/bin/rocprofv2
index a2a86f66bd..4012f10c39 100755
--- a/bin/rocprofv2
+++ b/bin/rocprofv2
@@ -194,11 +194,7 @@ while [ 1 ]; do
           ATT_ARGV="$ATT_ARGV $3 \"$4\""
           shift
           shift
-        elif [[ "$3" = "--mpi" ]]; then
-          ATT_PYTHON3_ARG="mpirun -np $4 python3 "
-          shift
-          shift
-        elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" = "--genasm" || "$3" == "--att_kernel" || "$3" == "--depth" ]]; then
+        elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" == "--att_kernel" ]]; then
           ATT_ARGV="$ATT_ARGV $3 $4"
           shift
           shift
diff --git a/plugin/att/att.py b/plugin/att/att.py
index 7c9251e756..20ad9413c0 100755
--- a/plugin/att/att.py
+++ b/plugin/att/att.py
@@ -16,14 +16,7 @@ import glob
 import numpy as np
 from stitch import stitch
 import gc
-
-try:
-    from mpi4py import MPI
-
-    MPI_IMPORTED = True
-except:
-    MPI_IMPORTED = False
-
+from collections import defaultdict
 
 class PerfEvent(ctypes.Structure):
     _fields_ = [
@@ -130,6 +123,8 @@ class ReturnInfo(ctypes.Structure):
         ("occupancy", POINTER(ctypes.c_uint64)),
         ("num_occupancy", ctypes.c_uint64),
         ("flags", ctypes.c_uint64),
+        ("kernel_id_addr", POINTER(ctypes.c_uint64)),
+        ("num_kernel_ids", ctypes.c_uint64),
     ]
 
 
@@ -162,10 +157,15 @@ def parse_binary(filename, kernel=None):
     info = SO.wrapped_parse_binary(str(filename).encode("utf-8"), kernel)
 
     code = []
+    kernel_addr = defaultdict(lambda : "Unknown")
+    last_known_function = "Unknown"
     for k in range(info.code_len):
         code_entry = info.code[k]
 
         line = deepcopy(code_entry.line.decode("utf-8"))
+        if "; Begin " in line:
+            last_known_function = line.split("; Begin ")[1]
+
         loc = deepcopy(code_entry.loc.decode("utf-8"))
 
         to_line = int(code_entry.to_line) if (code_entry.to_line >= 0) else None
@@ -175,31 +175,31 @@ def parse_binary(filename, kernel=None):
         code.append([line, int(code_entry.value), to_line, loc, int(code_entry.index),
                     int(code_entry.line_num), int(code_entry.addr), 0, 0])
 
+        if code[-1][-3] != 0 and len(code) > 1:
+            kernel_addr[code[-1][-3]] = last_known_function
+
     jumps = {}
     for k in range(info.jumps_len):
         jumps[info.jumps[k].key] = info.jumps[k].value
 
-    return code, jumps
+    return code, jumps, kernel_addr
 
 
-def getWaves_binary(name, shader_engine_data_dict, target_cu, depth):
+def getWaves_binary(name, shader_engine_data_dict, target_cu):
     filename = os.path.abspath(str(name))
     info = SO.AnalyseBinary(filename.encode("utf-8"), target_cu, False)
 
+    kernel_addr = [int(info.kernel_id_addr[k]) for k in range(info.num_kernel_ids)]
+
     waves = [info.wavedata[k] for k in range(info.num_waves)]
     events = [deepcopy(info.perfevents[k]) for k in range(info.num_events)]
     occupancy = [int(info.occupancy[k]) for k in range(int(info.num_occupancy))]
     flags = "navi" if (info.flags & 0x1) else "vega"
 
-    wave_slot_count = [[0 for k in range(20)] for j in range(4)]
     waves_python = []
     for wave in waves:
-        if (
-            wave_slot_count[wave.simd][wave.wave_id] >= depth
-            or wave.instructions_size == 0
-        ):
+        if wave.instructions_size < 2:
             continue
-        wave_slot_count[wave.simd][wave.wave_id] += 1
         pwave = PythonWave(wave)
         pwave.timeline = [
             (wave.timeline_array[2 * k], wave.timeline_array[2 * k + 1])
@@ -210,16 +210,16 @@ def getWaves_binary(name, shader_engine_data_dict, target_cu, depth):
             for k in range(wave.instructions_size)
         ]
         waves_python.append(pwave)
-    shader_engine_data_dict[name] = (waves_python, events, occupancy, flags)
+    shader_engine_data_dict[name] = (waves_python, events, occupancy, flags, kernel_addr)
 
 
 def getWaves_stitch(SIMD, code, jumps, flags, latency_map, hitcount_map, bIsAuto):
     for pwave in SIMD:
         pwave.instructions = stitch(pwave.instructions, code, jumps, flags, bIsAuto)
-
-        for inst in pwave.instructions[0]:
-            hitcount_map[inst[-1]] += 1
-            latency_map[inst[-1]] += inst[3]
+        if pwave.instructions is not None:  # NOTE(review): presumably stitch() returns None for a wave it cannot match - confirm in stitch.py
+            for inst in pwave.instructions[0]:
+                hitcount_map[inst[-1]] += 1  # inst[-1] indexes the len(code)-sized maps built by the caller
+                latency_map[inst[-1]] += inst[3]  # inst[3] is accumulated as that line's latency
 
 
 def persist(trace_file, SIMD):
@@ -232,6 +232,8 @@ def persist(trace_file, SIMD):
     smem_ins, smem_stalls, br_ins, br_taken_ins, br_stalls = [], [], [], [], []
 
     for wave in SIMD:
+        if wave.instructions is None:
+            continue
         simds.append(wave.simd)
         waves.append(wave.wave_id)
         begin_time.append(wave.begin_time)
@@ -344,50 +346,30 @@ def insert_waitcnt(flight_count, assembly_code):
     return assembly_code
 
 
-def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES):
-    for n, occ in enumerate(OCCUPANCY):
-        OCCUPANCY[n] = [
-            max(min(int((u >> 16) - min_event_time) << 16, 2**42), 0) | (u & 0xFFFFF)
-            for u in occ
-        ]
-    for perf in EVENTS:
-        for p in perf:
-            p.time -= min_event_time
-
+def gen_timelines(DBFILES):  # Count, per state index and per time bin, how many waves are in that state.
+    TIMELINES = [np.zeros(int(1E6), dtype=np.float32) for k in range(5)]  # one histogram per state index 0-4
+    TIME_RESOLUTION = 16  # timestamp units merged into one bin (drawing.py scales the axis back by 16)
     for df in DBFILES:
         for T in range(len(df["timeline"])):
             timeline = df["timeline"][T]
             time_acc = 0
-            tuples3 = [(0, df["begin_time"][T] - min_event_time)] + [
-                (int(t[0]), int(t[1])) for t in timeline
-            ]
+            tuples3 = [(0, df["begin_time"][T])] + [(int(t[0]), int(t[1])) for t in timeline]  # (state, duration) pairs
 
             for state in tuples3:
-                if state[1] > 1e8:
+                t_end = (time_acc + state[1])//TIME_RESOLUTION  # exclusive end bin of this state interval
+                if t_end > 1E8:  # hard cap on histogram length; also bounds the growth loop below
                     print("Warning: Time limit reached for ", state[0], state[1])
                     break
-                if time_acc + state[1] > TIMELINES[state[0]].size:
+                while t_end > TIMELINES[state[0]].size:  # keep doubling: one doubling may not reach t_end, and NumPy would silently clip the slice
                     TIMELINES[state[0]] = np.hstack(
                         [TIMELINES[state[0]], np.zeros_like(TIMELINES[state[0]])]
                     )
-                TIMELINES[state[0]][time_acc : time_acc + state[1]] += 1
+                TIMELINES[state[0]][time_acc//TIME_RESOLUTION : t_end] += 1  # one more wave in this state over these bins
                 time_acc += state[1]
+    return TIMELINES
 
 
 if __name__ == "__main__":
-    comm = None
-    mpi_root = True
-    if MPI_IMPORTED:
-        try:
-            comm = MPI.COMM_WORLD
-            if comm.Get_size() < 2:
-                comm = None
-            else:
-                mpi_root = comm.Get_rank() == 0
-        except:
-            print("Could not load MPI")
-            comm = None
-
     pathenv = os.getenv("OUTPUT_PATH")
     if pathenv is None:
         pathenv = "."
@@ -395,9 +377,6 @@ if __name__ == "__main__":
     parser.add_argument(
         "assembly_code", help="Path to the assembly code. Must be the first parameter."
     )
-    parser.add_argument(
-        "--depth", help="Maximum number of parsed waves per slot", default=100, type=int
-    )
     parser.add_argument(
         "--trace_file", help="Filter for trace files", default=None, type=str
     )
@@ -405,12 +384,6 @@ if __name__ == "__main__":
         "--att_kernel", help="Kernel file", type=str, default=pathenv + "/*_kernel.txt"
     )
     parser.add_argument("--ports", help="Server and websocket ports, default: 8000,18000")
-    parser.add_argument(
-        "--genasm",
-        help="Generate post-processed asm file at this path",
-        type=str,
-        default="",
-    )
     parser.add_argument(
         "--mode",
         help="""ATT analysis modes:\n
@@ -455,22 +428,19 @@ if __name__ == "__main__":
         print("Could not find att output kernel:", args.att_kernel)
         exit(1)
     elif len(att_kernel) > 1:
-        if mpi_root:
-            print("Found multiple kernel matching given filters:")
-            for n, k in enumerate(att_kernel):
-                print("\t", n, "->", k)
+        print("Found multiple kernel matching given filters:")
+        for n, k in enumerate(att_kernel):
+            print("\t", n, "->", k)
 
-            bValid = False
-            while bValid == False:
-                try:
-                    args.att_kernel = att_kernel[int(input("Please select number: "))]
-                    bValid = True
-                except KeyboardInterrupt:
-                    exit(0)
-                except:
-                    print("Invalid option.")
-        if comm is not None:
-            args.att_kernel = comm.bcast(args.att_kernel, root=0)
+        bValid = False
+        while bValid == False:
+            try:
+                args.att_kernel = att_kernel[int(input("Please select number: "))]
+                bValid = True
+            except KeyboardInterrupt:
+                exit(0)
+            except:
+                print("Invalid option.")
     else:
         args.att_kernel = att_kernel[0]
 
@@ -491,38 +461,31 @@ if __name__ == "__main__":
         filenames = glob.glob(args.trace_file)
     assert len(filenames) > 0
 
-    if comm is not None:
-        filenames = filenames[comm.Get_rank() :: comm.Get_size()]
-
-    code = jumps = None
-    if mpi_root:
-        print('Att kernel:', args.att_kernel)
-        code, jumps = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel)
+    print('Att kernel:', args.att_kernel)
+    code, jumps, kern_addr = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel)
 
     DBFILES = []
-    TIMELINES = [np.zeros(int(1e4), dtype=np.int16) for k in range(5)]
     EVENTS = []
     OCCUPANCY = []
     GFXV = []
     analysed_filenames = []
     occupancy_filenames = []
-
+    dispatch_kernel_names = {}
     shader_engine_data_dict = {}
     for name in filenames:
-        getWaves_binary(name, shader_engine_data_dict, args.target_cu, args.depth)
-
-    if comm is not None:
-        code = comm.bcast(code, root=0)
-        jumps = comm.bcast(jumps, root=0)
+        getWaves_binary(name, shader_engine_data_dict, args.target_cu)
 
     gc.collect()
     latency_map = np.zeros((len(code)), dtype=np.int64)
     hitcount_map = np.zeros((len(code)), dtype=np.int32)
     for name in filenames:
-        SIMD, perfevents, occupancy, gfxv = shader_engine_data_dict[name]
-        if len(occupancy) > 0:
+        SIMD, perfevents, occupancy, gfxv, addrs = shader_engine_data_dict[name]
+
+        for id, addr in enumerate(addrs):
+            dispatch_kernel_names[id] = kern_addr[addr]
+        if len(occupancy) > 16:
             OCCUPANCY.append( occupancy )
-            occupancy_filenames.append( name )
+            occupancy_filenames.append(name)
         if np.sum([0]+[len(s.instructions) for s in SIMD]) == 0:
             print("No waves from", name)
             continue
@@ -534,117 +497,33 @@ if __name__ == "__main__":
         GFXV.append(gfxv)
 
     gc.collect()
-    min_event_time = 2**62
-    for df in DBFILES:
-        if len(df["begin_time"]) > 0:
-            min_event_time = min(min_event_time, np.min(df["begin_time"]))
-    for perf in EVENTS:
-        for p in perf:
-            min_event_time = min(min_event_time, p.time)
-    for occ in OCCUPANCY:
-        min_event_time = min(min_event_time, np.min(np.array(occ) >> 16))
-
-    gc.collect()
-    min_event_time = max(0, min_event_time - 32)
-    if comm is not None:
-        min_event_time = comm.reduce(min_event_time, op=MPI.MIN)
-        min_event_time = comm.bcast(min_event_time, root=0)
-
-        apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES)
-
-        GFXV = comm.gather(GFXV, root=0)
-        EVENTS = comm.gather(EVENTS, root=0)
-        OCCUPANCY = comm.gather(OCCUPANCY, root=0)
-        TIMELINES = comm.gather(TIMELINES, root=0)
-        gather_latency_map = comm.gather(latency_map, root=0)
-        gather_hitcount_map = comm.gather(hitcount_map, root=0)
-        gathered_filenames = comm.gather(occupancy_filenames, root=0)
-
-        if mpi_root:
-            latency_map *= 0
-            hitcount_map *= 0
-            for hit, lat in zip(gather_hitcount_map, gather_latency_map):
-                hitcount_map += hit
-                latency_map += lat
-            EVENTS = [e for elem in EVENTS for e in elem]
-            OCCUPANCY = [e for elem in OCCUPANCY for e in elem]
-            gathered_filenames = [e for elem in gathered_filenames for e in elem]
-            gfxv = [e for elem in GFXV for e in elem][0]
-
-            TIMELINES_GATHER = TIMELINES
-            TIMELINES = [
-                np.zeros((np.max([len(tm[k]) for tm in TIMELINES])), np.int16)
-                for k in range(5)
-            ]
-            for gather in TIMELINES_GATHER:
-                for t, m in zip(TIMELINES, gather):
-                    t[: len(m)] += m
-            del TIMELINES_GATHER
-        else:  # free up memory
-            TIMELINES = []
-            OCCUPANCY = []
-            EVENTS = []
-    else:
-        apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES)
-        gathered_filenames = occupancy_filenames
-
-    if mpi_root:
-        for k in range(len(code)):
-            code[k][-2] = int(hitcount_map[k])
-            code[k][-1] = int(latency_map[k])
+    for k in range(len(code)):
+        code[k][-2] = int(hitcount_map[k])
+        code[k][-1] = int(latency_map[k])
 
     if CSV_MODE:
-        if mpi_root:
-            from att_to_csv import dump_csv
-            dump_csv(code)
+        from att_to_csv import dump_csv
+        dump_csv(code)
         quit()
 
+
     gc.collect()
-    print("Min time:", min_event_time)
 
     drawinfo = {
-        "TIMELINES": TIMELINES,
+        "TIMELINES": gen_timelines(DBFILES),
         "EVENTS": EVENTS,
         "EVENT_NAMES": EVENT_NAMES,
         "OCCUPANCY": OCCUPANCY,
-        "ShaderNames": gathered_filenames,
+        "ShaderNames": occupancy_filenames,
+        "DispatchNames": dispatch_kernel_names,
     }
-    if args.genasm and len(args.genasm) > 0:
-        flight_count = view_trace(
-            args,
-            code,
-            DBFILES,
-            analysed_filenames,
-            True,
-            OCCUPANCY,
-            args.dumpfiles,
-            min_event_time,
-            gfxv,
-            drawinfo,
-            comm,
-            mpi_root,
-        )
-        with open(args.assembly_code, "r") as file:
-            lines = file.readlines()
-        assembly_code = {l + 1.0: lines[l][:-1] for l in range(len(lines))}
-        assembly_code = insert_waitcnt(flight_count, assembly_code)
-
-        with open(args.genasm, "w") as file:
-            keys = sorted(assembly_code.keys())
-            for k in keys:
-                file.write(assembly_code[k] + "\n")
-    else:
-        view_trace(
-            args,
-            code,
-            DBFILES,
-            analysed_filenames,
-            False,
-            OCCUPANCY,
-            args.dumpfiles,
-            min_event_time,
-            gfxv,
-            drawinfo,
-            comm,
-            mpi_root,
-        )
+    view_trace(
+        args,
+        code,
+        DBFILES,
+        analysed_filenames,
+        args.dumpfiles,
+        0,
+        gfxv,
+        drawinfo
+    )
diff --git a/plugin/att/code_printing.cpp b/plugin/att/code_printing.cpp
index aff5259a99..ed5db7ab31 100644
--- a/plugin/att/code_printing.cpp
+++ b/plugin/att/code_printing.cpp
@@ -136,55 +136,6 @@ std::optional<code_object_decoder_t::symbol_info_t> code_object_decoder_t::find_
   return {};
 }
 
-/*
-void code_object_decoder_t::load_symbol_map() {
-  std::unique_ptr<Elf, void (*)(Elf *)> elf (
-      elf_begin(m_fd, ELF_C_READ, nullptr),
-      [](Elf *elf){ elf_end(elf); });
-
-  if (!elf) {
-    rocprofiler::warning("Error opening ELF!\n");
-    return;
-  }
-
-  Elf64_Ehdr *ehdr = elf64_getehdr(elf.get());
-  if (!ehdr) {
-    printf("elf64_getehdr failed\n");
-    return;
-  }
-
-  // Slurp the symbol table.
-  Elf_Scn *scn = nullptr;
-  while ((scn = elf_nextscn(elf.get(), scn)) != nullptr) {
-    GElf_Shdr shdr_mem;
-    GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_mem);
-    if (shdr->sh_type != SHT_SYMTAB && shdr->sh_type != SHT_DYNSYM) {
-      continue;
-    }
-
-    Elf_Data *data = elf_getdata(scn, nullptr);
-    if (!data) continue;
-
-    size_t symbol_count = data->d_size / gelf_fsize(elf.get(), ELF_T_SYM, 1, EV_CURRENT);
-    for (size_t j = 0; j < symbol_count; ++j) {
-      GElf_Sym sym_mem;
-      GElf_Sym *sym = gelf_getsym(data, j, &sym_mem);
-
-      if (GELF_ST_TYPE(sym->st_info) != STT_FUNC || sym->st_shndx == SHN_UNDEF) continue;
-
-      std::string symbol_name{ elf_strptr(elf.get(), shdr->sh_link, sym->st_name) };
-      auto symbol_pair = std::make_pair(symbol_name, sym->st_size);
-
-      auto [it, success] = m_symbol_map.emplace(sym->st_value, symbol_pair);
-
-      // If there already was a symbol defined at this address, but this
-      // new symbol covers a larger address range, replace the old symbol
-      //         with this new one.
-      if (!success && sym->st_size > it->second.second) it->second = symbol_pair;
-    }
-  }
-} */
-
 void code_object_decoder_t::disassemble_kernel(uint64_t addr) {
   auto symbol = find_symbol(addr);
 
@@ -193,9 +144,6 @@ void code_object_decoder_t::disassemble_kernel(uint64_t addr) {
     return;
   }
 
-  // if (symbol->m_name.find("__amd_rocclr_") == 0)
-  //  return;
-
   std::cout << "Dumping ISA for " << symbol->m_name << std::endl;
 
   uint64_t end_addr = addr + symbol->m_size;
@@ -218,8 +166,6 @@ void code_object_decoder_t::disassemble_kernel(uint64_t addr) {
 
 void code_object_decoder_t::disassemble_kernels() {
   disassembly = std::make_unique<DisassemblyInstance>(*this);
-
-  // if (m_symbol_map.begin() == m_symbol_map.end())
   m_symbol_map = disassembly->GetKernelMap();
 
   for (auto& [k, v] : m_symbol_map) disassemble_kernel(k);
diff --git a/plugin/att/disassembly.cpp b/plugin/att/disassembly.cpp
index d564719ed2..d94e22931a 100644
--- a/plugin/att/disassembly.cpp
+++ b/plugin/att/disassembly.cpp
@@ -57,8 +57,9 @@
   if (amd_comgr_status_s status = call) {                                                          \
     const char* reason = "";                                                                       \
     amd_comgr_status_string(status, &reason);                                                      \
+    std::cerr << __LINE__ << " code: " << status << std::endl;                                     \
     std::cerr << __LINE__ << " failed: " << reason << std::endl;                                   \
-    return;                                                                                        \
+    exit(1);                                                                                       \
   }
 
 CodeObjectBinary::CodeObjectBinary(const std::string& uri) : m_uri(uri) {
@@ -156,12 +157,12 @@ DisassemblyInstance::DisassemblyInstance(code_object_decoder_t& decoder)
     : buffer(reinterpret_cast<int64_t>(decoder.buffer.data())),
       size(decoder.buffer.size()),
       instructions(decoder.instructions) {
-  amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &data);
-  amd_comgr_set_data(data, size, decoder.buffer.data());
+  CHECK_COMGR(amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &data));
+  CHECK_COMGR(amd_comgr_set_data(data, size, decoder.buffer.data()));
 
   char isa_name[128];
   size_t isa_size = sizeof(isa_name);
-  amd_comgr_get_data_isa_name(data, &isa_size, isa_name);
+  CHECK_COMGR(amd_comgr_get_data_isa_name(data, &isa_size, isa_name));
 
   CHECK_COMGR(amd_comgr_create_disassembly_info(
       isa_name,  //"amdgcn-amd-amdhsa--gfx1100",
@@ -172,24 +173,24 @@ DisassemblyInstance::DisassemblyInstance(code_object_decoder_t& decoder)
 amd_comgr_status_t DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbol,
                                                         void* user_data) {
   amd_comgr_symbol_type_t type;
-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type);
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type));
 
   if (type != AMD_COMGR_SYMBOL_TYPE_FUNC && type != AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL)
     return AMD_COMGR_STATUS_SUCCESS;
 
   uint64_t addr;
-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &addr);
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &addr));
 
   uint64_t mem_size;
-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size);
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size));
 
   uint64_t name_size;
-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size);
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size));
 
   std::string name;
   name.resize(name_size);
 
-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data());
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data()));
 
   static_cast<DisassemblyInstance*>(user_data)->symbol_map[addr] = {name, mem_size};
   return AMD_COMGR_STATUS_SUCCESS;
@@ -197,18 +198,19 @@ amd_comgr_status_t DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbo
 
 std::map<uint64_t, std::pair<std::string, uint64_t>>& DisassemblyInstance::GetKernelMap() {
   symbol_map = std::map<uint64_t, std::pair<std::string, uint64_t>>{};
-  amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this);
+  CHECK_COMGR(amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this));
   return symbol_map;
 }
 
 DisassemblyInstance::~DisassemblyInstance() {
-  amd_comgr_release_data(data);
+  CHECK_COMGR(amd_comgr_release_data(data));
   CHECK_COMGR(amd_comgr_destroy_disassembly_info(info));
 }
 
 uint64_t DisassemblyInstance::ReadInstruction(uint64_t addr, const char* cpp_line) {
   uint64_t size_read;
-  amd_comgr_disassemble_instruction(info, buffer + addr, (void*)this, &size_read);
+  CHECK_COMGR(amd_comgr_disassemble_instruction(info, buffer + addr, (void*)this, &size_read));
+  assert(instructions.size() != 0);
   instructions.back().address = addr;
   instructions.back().cpp_reference = cpp_line;
   return size_read;
diff --git a/plugin/att/drawing.py b/plugin/att/drawing.py
index b6a5e62a24..945ab30e50 100644
--- a/plugin/att/drawing.py
+++ b/plugin/att/drawing.py
@@ -153,6 +153,7 @@ def draw_wave_states(selections, normalize, TIMELINES):
 
     plt.figure(figsize=(15, 4))
 
+
     maxtime = max([np.max((TIMELINES[k]!=0)*np.arange(0,TIMELINES[k].size)) for k in plot_indices])
     maxtime = max(maxtime, 1)
     timelines = [deepcopy(TIMELINES[k][:maxtime]) for k in plot_indices]
@@ -169,21 +170,18 @@ def draw_wave_states(selections, normalize, TIMELINES):
         else cycles * 0
         for time in timelines
     ]
-    kernsize = 21
-    kernel = np.asarray(
-        [
-            np.exp(-abs(10 * k / kernsize))
-            for k in range(-kernsize // 2, kernsize // 2 + 1)
-        ]
-    )
+    kernsize = 15
+    kernel = np.asarray([
+        np.exp(-abs(10 * k / kernsize)) for k in range(-kernsize // 2, kernsize // 2 + 1)
+    ])
     kernel /= np.sum(kernel)
 
     timelines = [
         np.convolve(time, kernel)[kernsize // 2 : -kernsize // 2]
-        for time in timelines
-        if len(time) > 0
+        for time in timelines if len(time) > 0
     ]
-
+    maxtime *= 16
+    cycles *= 16
     [
         plt.plot(cycles, t, label="State " + s, linewidth=1.1, color=c)
         for t, s, c, sel in zip(timelines, STATES, colors, selections)
@@ -204,48 +202,113 @@ def draw_wave_states(selections, normalize, TIMELINES):
     return STATES, FileBytesIO(figure_bytes)
 
 
-def draw_occupancy(selections, normalize, OCCUPANCY, shadernames):
+def draw_occupancy_per_dispatch(selections, normalize, OCCUPANCY, dispatchnames):
+    """Plot occupancy over time, one curve per dispatch id; return (dispatchnames, FileBytesIO of the figure)."""
+    plt.figure(figsize=(15, 4))
+    maxtime = 1
+
+    # Decode into a local list: OCCUPANCY is also handed to draw_occupancy(), which bit-shifts the raw ints.
+    decoded = []
+    for raw in OCCUPANCY:
+        if len(raw) > 16:  # entries with 16 or fewer records are not plotted
+            decoded.append([(16*int(u>>23), (u>>12) & 0x7F, (u>>19) & 0xF, u&0xFFF) for u in raw])
+            maxtime = max(maxtime, decoded[-1][-1][0])
+
+    NUM_DOTS = 1600
+    delta = max(1, maxtime // NUM_DOTS)
+    chart = np.zeros((len(dispatchnames), maxtime // delta + 2), dtype=np.float32)
+
+    for occ in decoded:
+        small_chart = np.zeros_like(chart)
+        norm_fact = np.zeros_like(chart)
+        norm_fact += 1E-6  # keeps the division below finite for bins that received no sample
+
+        current_occ = [[0 for m in range(16)] for k in range(len(dispatchnames))]
+        current_occ[0] = [m[1] for m in occ[:16]]
+        current_time = [0 for k in range(len(dispatchnames))]
+        total_value = [0 for k in range(len(dispatchnames))]
+        total_value[0] = np.sum(current_occ[0])
+
+        for time, value, cu, kid in occ:
+            b = current_time[kid]
+            e = max(b + 1, time // delta)
+            small_chart[kid][b:e] += total_value[kid]
+            norm_fact[kid][b:e] += 1
+
+            total_value[kid] += value - current_occ[kid][cu]
+            current_occ[kid][cu] = value
+            current_time[kid] = time // delta
+        # Flush the total still active after each dispatch's last event into its final bin.
+        for small, norm, time, value in zip(small_chart, norm_fact, current_time, total_value):
+            small[time] += value
+            norm[time] += 1  # count one sample, as the loop above does (was "+= value")
+
+        chart += small_chart/norm_fact
+
+    for (disp_id, name), occ in zip(dispatchnames.items(), chart):
+        plt.plot(np.arange(occ.size) * delta, occ, label=str(disp_id)+'#'+name, linewidth=1.1)
+
+    plt.legend()
+    if normalize:
+        plt.ylabel("Occupancy %")
+    else:
+        plt.ylabel("Occupancy total")
+    plt.xlabel("Cycle")
+    plt.ylim(-1)
+    plt.xlim(-maxtime // 200, maxtime + maxtime // 200 + delta + 1)
+    plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1)
+    figure_bytes = BytesIO()
+    plt.savefig(figure_bytes, dpi=150)
+    return dispatchnames, FileBytesIO(figure_bytes)
+
+
+def draw_occupancy(selections, normalize, OCCUPANCY, shadernames, numdispatchid):
     plt.figure(figsize=(15, 4))
     names = []
     if len(OCCUPANCY) == 1: # If single SE, do occupancy per CU/WGP
-        OCCUPANCY = [[u for u in OCCUPANCY[0] if u&0xFF==k] for k in range(16)]
-        shadernames = ['CU'+str(k) for k in range(16) if len(OCCUPANCY[k]) > 0]
-        OCCUPANCY = [occ for occ in OCCUPANCY if len(occ) > 0]
+        percu = [[u for u in OCCUPANCY[0] if (u>>19) & 0xF == k] for k in range(16)]
+        shadernames = shadernames + [['CU'+str(k),''] for k in range(16) if len(percu[k]) > 0]
+        OCCUPANCY = OCCUPANCY + [occ for occ in percu if len(occ) > 0]
 
-    maxtime = 1
-    delta = 1
     for name, occ in zip(shadernames, OCCUPANCY):
-        occ_values = [0]
-        occ_times = [0]
-        occ = [(int(u >> 16), (u >> 8) & 0xFF, u & 0xFF) for u in occ]
-        current_occ = [0 for k in range(16)]
+        if len(occ) <= 16:
+            continue
+        maxtime = 1
+        delta = 1
+        occ = [(16*int(u >> 23), (u >> 12) & 0x7F, (u>>19) & 0xF, u&0xFFF) for u in occ]
+        current_occ = [[0 for m in range(16)] for k in range(numdispatchid)]
+        current_occ[0] = [m[1] for m in occ[:16]]
 
-        for time, value, cu in occ:
+        occ_values = [np.sum(current_occ[0])]
+        occ_times = [0]
+
+        for time, value, cu, kid in occ:
             occ_times.append(time)
-            occ_values.append(occ_values[-1] + value - current_occ[cu])
-            current_occ[cu] = value
+            occ_values.append(occ_values[-1] + value - current_occ[kid][cu])
+            current_occ[kid][cu] = value
         try:
-            names.append('SE'+name.split('.att')[0].split('_se')[-1])
+            names.append('SE'+name.split('_se')[1].split('.att')[0])
         except:
             names.append(name)
 
         NUM_DOTS = 1500
-        maxtime = np.max(occ_times)
+        maxtime = occ_times[-1]+1
         delta = max(1, maxtime // NUM_DOTS)
         chart = np.zeros((maxtime // delta + 1), dtype=np.float32)
         norm_fact = np.zeros_like(chart)
+        norm_fact += 1E-6
 
-        for i, t in enumerate(occ_times[:-1]):
-            b = t // delta
+        for i in range(len(occ_times)-1):
+            b = occ_times[i] // delta
             e = max(b + 1, occ_times[i + 1] // delta)
             chart[b:e] += occ_values[i]
             norm_fact[b:e] += 1
 
-        chart /= np.maximum(norm_fact, 1)
+        chart /= norm_fact
         if normalize:
             chart /= max(chart.max(), 1e-6)
 
-        plt.plot(np.arange(chart.size) * delta, chart, label=name, linewidth=1.1)
+        plt.plot(np.arange(chart.size) * delta, chart, label=names[-1], linewidth=1.1)
 
     plt.legend()
     if normalize:
@@ -267,12 +330,14 @@ def GeneratePIC(drawinfo, selections=[True for k in range(16)], normalize=False)
     response = {}
     figures = {}
 
-    states, figure = draw_occupancy(
-        selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"]
-    )
+    states, figure = draw_occupancy(selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"], len(drawinfo["DispatchNames"]))
     response["occupancy.png"] = states
     figures["occupancy.png"] = figure
 
+    states, figure = draw_occupancy_per_dispatch(selections, normalize, drawinfo["OCCUPANCY"], drawinfo["DispatchNames"])
+    response["dispatches.png"] = states
+    figures["dispatches.png"] = figure
+
     states, figure = draw_wave_states(selections, normalize, drawinfo["TIMELINES"])
     response["timeline.png"] = states
     figures["timeline.png"] = figure
diff --git a/plugin/att/stitch.py b/plugin/att/stitch.py
index 0cd03bd55a..f67485b282 100644
--- a/plugin/att/stitch.py
+++ b/plugin/att/stitch.py
@@ -193,104 +193,11 @@ def try_match_swapped(insts, code, i, line):
     return insts[i + 1][1] == code[line][1] and insts[i][1] == code[line + 1][1]
 
 
-FORK_NAMES = 1
-# A successful parsed instruction
-class CachedInst:
-    def __init__(self, inst, as_line):
-        self.inst_type = inst
-        self.as_line = as_line
-        self.forks = None
-
-# A branch of the parsing tree
-class Fork:
-    def __init__(self):
-        global FORK_NAMES
-        self.insts = []
-        self.data = None
-        self.name = FORK_NAMES
-        FORK_NAMES += 1
-        # print('Created new fork: ', self.name)
-
-# Try to match sequence "insts" with the branch "fork", starting at position "i"
-def move_down_fork(fork, insts, i): #(fork : Fork, insts : list, i : int):
-    N = min(len(insts), len(fork.insts))
-
-    while i < N:
-        if insts[i][1] == fork.insts[i].inst_type:
-            i += 1
-        elif i<N-1  and insts[i+1][1] == fork.insts[i].inst_type \
-                    and insts[i][1] == fork.insts[i+1].inst_type:
-            i += 2
-        else:
-            return False, i
-
-    if len(fork.insts) != len(insts):
-        return False, i
-
-    return True, i
-
-
-FORK_TREE = Fork()
-
-# Check if there exists a previous wave with the same sequence of instructions executed
-def fromDict(insts):
-    i = 0
-    N = len(insts)
-    cur_fork = FORK_TREE
-    while i < N:
-        tillEnd, final_pos = move_down_fork(cur_fork, insts, i)
-        if tillEnd:
-            # print('Reached end')
-            return True, cur_fork
-
-        i += final_pos
-
-        if i >= len(cur_fork.insts):
-            return False, cur_fork
-
-        last_inst = cur_fork.insts[i]
-        if last_inst.forks is None:
-            last_inst.forks = []
-
-        bMatchFork = False
-        for fork in last_inst.forks:
-            if fork.insts[0].inst_type == insts[0][1]:
-                cur_fork = fork
-                bMatchFork = True
-                break
-        if not bMatchFork:
-            cur_fork = Fork()
-            last_inst.forks.append(cur_fork)
-            return False, cur_fork
-
-    print("Warning: Reached end of loop!")
-    return False, cur_fork
-
-
 def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
     bGFX9 = gfxv == 'vega'
 
-    # Try from cached result from a previous wave that have already been parsed
-    dict_sucess, current_fork = fromDict(insts)
-    if dict_sucess:
-        result, loopCount, mem_unroll, flight_count, maxline, pcsequence = current_fork.data
-        # Check if the sequence of measured PC values are equal for cached and new wave
-        if len(pcsequence) > 0:
-            pcs = [r[2] for r in insts if r[1] == PCINFO]
-            if len(pcs) != len(pcsequence):
-                dict_sucess = False
-            for pc1, pc2 in zip(pcs, pcsequence):
-                if pc1 != pc2:
-                    dict_sucess = False
-
-    # If successful, use resulting assembly from cache
-    if dict_sucess:
-        result = [r+(asm[-1],) for r, asm in zip(insts, result)]
-        return result, loopCount, mem_unroll, flight_count, maxline, len(result)
-
     result, i, line, loopCount, N = [], 0, 0, defaultdict(int), len(insts)
 
-
     SMEM_INST = []  # scalar memory
     VLMEM_INST = []  # vector memory load
     VSMEM_INST = []  # vector memory store
@@ -310,10 +217,6 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
     # Clean the code and remove comments
     code = [raw_code[0]]
     for c in raw_code[1:]:
-        if bIsAuto and '; Begin ' == c[0][:len('; Begin ')]:
-            if '; Begin <Kernel>' in c[0]:
-                line = len(code)
-                print('Begin at:', line, c)
         c = list(c)
         c[0] = c[0].split(";")[0].split("//")[0].strip()
 
@@ -339,7 +242,16 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
     loops = 0
     maxline = 0
 
-    watchlist = RegisterWatchList(labels=labels) if not bIsAuto else PCTranslator(code, insts)
+    if bIsAuto and len(insts) and insts[0][1] == PCINFO:
+        try:
+            watchlist = PCTranslator(code, insts)
+            line = watchlist.addrmap[insts[0][2]]
+            result.append((insts[0][0], PCINFO, 0, 0, 0))
+            i = 1
+        except Exception:  # best-effort: any failure here aborts stitching (None), but Ctrl-C/SystemExit still propagate
+            return None
+    else:
+        watchlist = RegisterWatchList(labels=labels)
 
     pcsequence = []
     while i < N:
@@ -534,7 +446,5 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
                 break
             line += 1
 
-    current_fork.insts = [CachedInst(inst[1], inst[-1]) for inst in result]
-    current_fork.data = result, loopCount, mem_unroll, flight_count, maxline, pcsequence
     result = [r for r in result if r[1] != PCINFO]
     return result, loopCount, mem_unroll, flight_count, maxline, len(result) if i == N else N
diff --git a/plugin/att/trace_view.py b/plugin/att/trace_view.py
index 806a90bbeb..134cc7a4e0 100755
--- a/plugin/att/trace_view.py
+++ b/plugin/att/trace_view.py
@@ -296,25 +296,25 @@ def view_trace(
     code,
     dbnames,
     att_filenames,
-    bReturnLoc,
-    OCCUPANCY,
     bDumpOnly,
     se_time_begin,
     gfxv,
-    drawinfo,
-    MPI_COMM,
-    mpi_root,
+    drawinfo
 ):
     global JSON_GLOBAL_DICTIONARY
     pic_thread = None
-    if mpi_root:
-        manager = Manager()
-        return_dict = manager.dict()
-        JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable(
-            {str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))}
-        )
-        pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo))
-        pic_thread.start()
+
+    # return_dict is handed to the picture process; its entries are merged into JSON_GLOBAL_DICTIONARY after join().
+    manager = Manager()
+    return_dict = manager.dict()
+    occ_dict = {str(k): drawinfo["OCCUPANCY"][k] for k in range(len(drawinfo["OCCUPANCY"]))}
+    # Plain copy of the DispatchNames mapping (same keys/values; avoids shadowing the builtin `id`).
+    occ_dict['dispatches'] = dict(drawinfo['DispatchNames'])
+    occ_dict['names'] = drawinfo['ShaderNames']
+
+    JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable(occ_dict)
+    pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo))
+    pic_thread.start()
 
     att_filenames = [Path(f).name for f in att_filenames]
     se_numbers = [int(a.split("_se")[1].split(".att")[0]) for a in att_filenames]
@@ -337,9 +337,8 @@ def view_trace(
             flight_count.append(count)
             simd_wave_filenames[se_number] = wv_filenames
 
-    if mpi_root:
-        code_sel = [c[:-3]+c[-2:] for c in code[:allse_maxline+16]]
-        JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)})
+    code_sel = [c[:-3]+c[-2:] for c in code[:allse_maxline+16]]
+    JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)})
 
     for key in simd_wave_filenames.keys():
         wv_array = [
@@ -367,42 +366,21 @@ def view_trace(
 
         simd_wave_filenames[key] = wv_dict
 
-    if MPI_COMM is not None:
-        se_filenames = MPI_COMM.gather(se_filenames, root=0)
-        simd_wave_filenames = MPI_COMM.gather(simd_wave_filenames, root=0)
-        if mpi_root:
-            se_filenames = [e for elem in se_filenames for e in elem]
-            simd_wave_filenames = {
-                k: v for smf in simd_wave_filenames for k, v in smf.items()
-            }
-
-    if mpi_root:
-        JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable(
-            {
-                "wave_filenames": simd_wave_filenames,
-                "se_filenames": se_filenames,
-                "global_begin_time": int(se_time_begin),
-                "gfxv": gfxv,
-            }
-        )
+    JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable(
+        {
+            "wave_filenames": simd_wave_filenames,
+            "se_filenames": se_filenames,
+            "global_begin_time": int(se_time_begin),
+            "gfxv": gfxv,
+        }
+    )
 
     if pic_thread is not None:
         pic_thread.join()
         for k, v in return_dict.items():
             JSON_GLOBAL_DICTIONARY[k] = v
 
-    if bReturnLoc:
-        return flight_count
-
     if bDumpOnly == False:
-        if MPI_COMM is not None:
-            JSON_GLOBAL_DICTIONARY = MPI_COMM.gather(JSON_GLOBAL_DICTIONARY, root=0)
-            if not mpi_root:
-                quit()
-            JSON_GLOBAL_DICTIONARY = {
-                k: v for smf in JSON_GLOBAL_DICTIONARY for k, v in smf.items()
-            }
-
         JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 1})
         if args.ports:
             assign_ports(args.ports)
@@ -420,13 +398,12 @@ def view_trace(
             print("Exitting.")
     else:
         os.makedirs("ui/", exist_ok=True)
-        if mpi_root:
-            JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0})
-            os.system(
-                "cp "
-                + os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui")
-                + "/* ui/"
-            )
+        JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0})
+        os.system(
+            "cp "
+            + os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui")
+            + "/* ui/"
+        )
         for k, v in JSON_GLOBAL_DICTIONARY.items():
             with open(os.path.join("ui", k), "w" if ".json" in k else "wb") as f:
                 f.write(v.read())
diff --git a/plugin/att/ui/index.html b/plugin/att/ui/index.html
index c6418511ef..185b3b5bc3 100644
--- a/plugin/att/ui/index.html
+++ b/plugin/att/ui/index.html
@@ -14,6 +14,7 @@
 			<div class="tab">
 				<button class="tablinks" onclick="showImage('timeline.png')">Wave States</button>
 				<button class="tablinks" onclick="showImage('occupancy.png')">Occupancy</button>
+				<button class="tablinks" onclick="showImage('dispatches.png')">Dispatches</button>
 				<button class="tablinks" onclick="showImage('counters.png')" id="counterspng_button">Counters</button>
 			</div>
 			<img id="GraphImage" src=timeline.png width=100%>