SWDEV-423898: Fixing issues with parallel kernels

Change-Id: I6726f3003af6036ba041c2b4bc5227dd08691090
2023-09-25 14:54:00 -03:00
parent 7418c52cc8
commit 675e1b9d38
@@ -194,11 +194,7 @@ while [ 1 ]; do
          ATT_ARGV="$ATT_ARGV $3 \"$4\""
          shift
          shift
-        elif [[ "$3" = "--mpi" ]]; then
-          ATT_PYTHON3_ARG="mpirun -np $4 python3 "
-          shift
-          shift
-        elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" = "--genasm" || "$3" == "--att_kernel" || "$3" == "--depth" ]]; then
+        elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" == "--att_kernel" ]]; then
          ATT_ARGV="$ATT_ARGV $3 $4"
          shift
          shift
@@ -16,14 +16,7 @@ import glob
 import numpy as np
 from stitch import stitch
 import gc
-
-try:
-    from mpi4py import MPI
-
-    MPI_IMPORTED = True
-except:
-    MPI_IMPORTED = False
-
+from collections import defaultdict

 class PerfEvent(ctypes.Structure):
    _fields_ = [
@@ -130,6 +123,8 @@ class ReturnInfo(ctypes.Structure):
        ("occupancy", POINTER(ctypes.c_uint64)),
        ("num_occupancy", ctypes.c_uint64),
        ("flags", ctypes.c_uint64),
+        ("kernel_id_addr", POINTER(ctypes.c_uint64)),
+        ("num_kernel_ids", ctypes.c_uint64),
    ]


@@ -162,10 +157,15 @@ def parse_binary(filename, kernel=None):
    info = SO.wrapped_parse_binary(str(filename).encode("utf-8"), kernel)

    code = []
+    kernel_addr = defaultdict(lambda : "Unknown")
+    last_known_function = "Unknown"
    for k in range(info.code_len):
        code_entry = info.code[k]

        line = deepcopy(code_entry.line.decode("utf-8"))
+        if "; Begin " in line:
+            last_known_function = line.split("; Begin ")[1]
+
        loc = deepcopy(code_entry.loc.decode("utf-8"))

        to_line = int(code_entry.to_line) if (code_entry.to_line >= 0) else None
@@ -175,31 +175,31 @@ def parse_binary(filename, kernel=None):
        code.append([line, int(code_entry.value), to_line, loc, int(code_entry.index),
                    int(code_entry.line_num), int(code_entry.addr), 0, 0])

+        if code[-1][-3] != 0 and len(code) > 1:
+            kernel_addr[code[-1][-3]] = last_known_function
+
    jumps = {}
    for k in range(info.jumps_len):
        jumps[info.jumps[k].key] = info.jumps[k].value

-    return code, jumps
+    return code, jumps, kernel_addr


-def getWaves_binary(name, shader_engine_data_dict, target_cu, depth):
+def getWaves_binary(name, shader_engine_data_dict, target_cu):
    filename = os.path.abspath(str(name))
    info = SO.AnalyseBinary(filename.encode("utf-8"), target_cu, False)

+    kernel_addr = [int(info.kernel_id_addr[k]) for k in range(info.num_kernel_ids)]
+
    waves = [info.wavedata[k] for k in range(info.num_waves)]
    events = [deepcopy(info.perfevents[k]) for k in range(info.num_events)]
    occupancy = [int(info.occupancy[k]) for k in range(int(info.num_occupancy))]
    flags = "navi" if (info.flags & 0x1) else "vega"

-    wave_slot_count = [[0 for k in range(20)] for j in range(4)]
    waves_python = []
    for wave in waves:
-        if (
-            wave_slot_count[wave.simd][wave.wave_id] >= depth
-            or wave.instructions_size == 0
-        ):
+        if wave.instructions_size < 2:
            continue
-        wave_slot_count[wave.simd][wave.wave_id] += 1
        pwave = PythonWave(wave)
        pwave.timeline = [
            (wave.timeline_array[2 * k], wave.timeline_array[2 * k + 1])
@@ -210,16 +210,16 @@ def getWaves_binary(name, shader_engine_data_dict, target_cu, depth):
            for k in range(wave.instructions_size)
        ]
        waves_python.append(pwave)
-    shader_engine_data_dict[name] = (waves_python, events, occupancy, flags)
+    shader_engine_data_dict[name] = (waves_python, events, occupancy, flags, kernel_addr)


 def getWaves_stitch(SIMD, code, jumps, flags, latency_map, hitcount_map, bIsAuto):
    for pwave in SIMD:
        pwave.instructions = stitch(pwave.instructions, code, jumps, flags, bIsAuto)
-
-        for inst in pwave.instructions[0]:
-            hitcount_map[inst[-1]] += 1
-            latency_map[inst[-1]] += inst[3]
+        if pwave.instructions is not None:  # stitch() may return None; such waves add nothing
+            for inst in pwave.instructions[0]:
+                hitcount_map[inst[-1]] += 1  # inst[-1] is the index used into both per-line maps
+                latency_map[inst[-1]] += inst[3]


 def persist(trace_file, SIMD):
@@ -232,6 +232,8 @@ def persist(trace_file, SIMD):
    smem_ins, smem_stalls, br_ins, br_taken_ins, br_stalls = [], [], [], [], []

    for wave in SIMD:
+        if wave.instructions is None:
+            continue
        simds.append(wave.simd)
        waves.append(wave.wave_id)
        begin_time.append(wave.begin_time)
@@ -344,50 +346,30 @@ def insert_waitcnt(flight_count, assembly_code):
    return assembly_code


-def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES):
-    for n, occ in enumerate(OCCUPANCY):
-        OCCUPANCY[n] = [
-            max(min(int((u >> 16) - min_event_time) << 16, 2**42), 0) | (u & 0xFFFFF)
-            for u in occ
-        ]
-    for perf in EVENTS:
-        for p in perf:
-            p.time -= min_event_time
-
+def gen_timelines(DBFILES):
+    TIMELINES = [np.zeros(int(1E6), dtype=np.float32) for k in range(5)]  # one histogram per state index 0..4
+    TIME_RESOLUTION = 16  # accumulated times are divided by this to get a histogram bin
    for df in DBFILES:
        for T in range(len(df["timeline"])):
            timeline = df["timeline"][T]
            time_acc = 0
-            tuples3 = [(0, df["begin_time"][T] - min_event_time)] + [
-                (int(t[0]), int(t[1])) for t in timeline
-            ]
+            tuples3 = [(0, df["begin_time"][T])] + [(int(t[0]), int(t[1])) for t in timeline]

            for state in tuples3:
-                if state[1] > 1e8:
+                t_end = (time_acc + state[1])//TIME_RESOLUTION
+                if t_end > 1E8:
                    print("Warning: Time limit reached for ", state[0], state[1])
                    break
-                if time_acc + state[1] > TIMELINES[state[0]].size:
+                while t_end > TIMELINES[state[0]].size:  # one doubling may not be enough; a short array would clip the slice below
                    TIMELINES[state[0]] = np.hstack(
                        [TIMELINES[state[0]], np.zeros_like(TIMELINES[state[0]])]
                    )
-                TIMELINES[state[0]][time_acc : time_acc + state[1]] += 1
+                TIMELINES[state[0]][time_acc//TIME_RESOLUTION : t_end] += 1
                time_acc += state[1]
+    return TIMELINES


 if __name__ == "__main__":
-    comm = None
-    mpi_root = True
-    if MPI_IMPORTED:
-        try:
-            comm = MPI.COMM_WORLD
-            if comm.Get_size() < 2:
-                comm = None
-            else:
-                mpi_root = comm.Get_rank() == 0
-        except:
-            print("Could not load MPI")
-            comm = None
-
    pathenv = os.getenv("OUTPUT_PATH")
    if pathenv is None:
        pathenv = "."
@@ -395,9 +377,6 @@ if __name__ == "__main__":
    parser.add_argument(
        "assembly_code", help="Path to the assembly code. Must be the first parameter."
    )
-    parser.add_argument(
-        "--depth", help="Maximum number of parsed waves per slot", default=100, type=int
-    )
    parser.add_argument(
        "--trace_file", help="Filter for trace files", default=None, type=str
    )
@@ -405,12 +384,6 @@ if __name__ == "__main__":
        "--att_kernel", help="Kernel file", type=str, default=pathenv + "/*_kernel.txt"
    )
    parser.add_argument("--ports", help="Server and websocket ports, default: 8000,18000")
-    parser.add_argument(
-        "--genasm",
-        help="Generate post-processed asm file at this path",
-        type=str,
-        default="",
-    )
    parser.add_argument(
        "--mode",
        help="""ATT analysis modes:\n
@@ -455,22 +428,19 @@ if __name__ == "__main__":
        print("Could not find att output kernel:", args.att_kernel)
        exit(1)
    elif len(att_kernel) > 1:
-        if mpi_root:
-            print("Found multiple kernel matching given filters:")
-            for n, k in enumerate(att_kernel):
-                print("\t", n, "->", k)
+        print("Found multiple kernel matching given filters:")
+        for n, k in enumerate(att_kernel):
+            print("\t", n, "->", k)

-            bValid = False
-            while bValid == False:
-                try:
-                    args.att_kernel = att_kernel[int(input("Please select number: "))]
-                    bValid = True
-                except KeyboardInterrupt:
-                    exit(0)
-                except:
-                    print("Invalid option.")
-        if comm is not None:
-            args.att_kernel = comm.bcast(args.att_kernel, root=0)
+        bValid = False
+        while bValid == False:
+            try:
+                args.att_kernel = att_kernel[int(input("Please select number: "))]
+                bValid = True
+            except KeyboardInterrupt:
+                exit(0)
+            except:
+                print("Invalid option.")
    else:
        args.att_kernel = att_kernel[0]

@@ -491,38 +461,31 @@ if __name__ == "__main__":
        filenames = glob.glob(args.trace_file)
    assert len(filenames) > 0

-    if comm is not None:
-        filenames = filenames[comm.Get_rank() :: comm.Get_size()]
-
-    code = jumps = None
-    if mpi_root:
-        print('Att kernel:', args.att_kernel)
-        code, jumps = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel)
+    print('Att kernel:', args.att_kernel)
+    code, jumps, kern_addr = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel)

    DBFILES = []
-    TIMELINES = [np.zeros(int(1e4), dtype=np.int16) for k in range(5)]
    EVENTS = []
    OCCUPANCY = []
    GFXV = []
    analysed_filenames = []
    occupancy_filenames = []
-
+    dispatch_kernel_names = {}
    shader_engine_data_dict = {}
    for name in filenames:
-        getWaves_binary(name, shader_engine_data_dict, args.target_cu, args.depth)
-
-    if comm is not None:
-        code = comm.bcast(code, root=0)
-        jumps = comm.bcast(jumps, root=0)
+        getWaves_binary(name, shader_engine_data_dict, args.target_cu)

    gc.collect()
    latency_map = np.zeros((len(code)), dtype=np.int64)
    hitcount_map = np.zeros((len(code)), dtype=np.int32)
    for name in filenames:
-        SIMD, perfevents, occupancy, gfxv = shader_engine_data_dict[name]
-        if len(occupancy) > 0:
+        SIMD, perfevents, occupancy, gfxv, addrs = shader_engine_data_dict[name]
+
+        for id, addr in enumerate(addrs):
+            dispatch_kernel_names[id] = kern_addr[addr]
+        if len(occupancy) > 16:
            OCCUPANCY.append( occupancy )
-            occupancy_filenames.append( name )
+            occupancy_filenames.append(name)
        if np.sum([0]+[len(s.instructions) for s in SIMD]) == 0:
            print("No waves from", name)
            continue
@@ -534,117 +497,33 @@ if __name__ == "__main__":
        GFXV.append(gfxv)

    gc.collect()
-    min_event_time = 2**62
-    for df in DBFILES:
-        if len(df["begin_time"]) > 0:
-            min_event_time = min(min_event_time, np.min(df["begin_time"]))
-    for perf in EVENTS:
-        for p in perf:
-            min_event_time = min(min_event_time, p.time)
-    for occ in OCCUPANCY:
-        min_event_time = min(min_event_time, np.min(np.array(occ) >> 16))
-
-    gc.collect()
-    min_event_time = max(0, min_event_time - 32)
-    if comm is not None:
-        min_event_time = comm.reduce(min_event_time, op=MPI.MIN)
-        min_event_time = comm.bcast(min_event_time, root=0)
-
-        apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES)
-
-        GFXV = comm.gather(GFXV, root=0)
-        EVENTS = comm.gather(EVENTS, root=0)
-        OCCUPANCY = comm.gather(OCCUPANCY, root=0)
-        TIMELINES = comm.gather(TIMELINES, root=0)
-        gather_latency_map = comm.gather(latency_map, root=0)
-        gather_hitcount_map = comm.gather(hitcount_map, root=0)
-        gathered_filenames = comm.gather(occupancy_filenames, root=0)
-
-        if mpi_root:
-            latency_map *= 0
-            hitcount_map *= 0
-            for hit, lat in zip(gather_hitcount_map, gather_latency_map):
-                hitcount_map += hit
-                latency_map += lat
-            EVENTS = [e for elem in EVENTS for e in elem]
-            OCCUPANCY = [e for elem in OCCUPANCY for e in elem]
-            gathered_filenames = [e for elem in gathered_filenames for e in elem]
-            gfxv = [e for elem in GFXV for e in elem][0]
-
-            TIMELINES_GATHER = TIMELINES
-            TIMELINES = [
-                np.zeros((np.max([len(tm[k]) for tm in TIMELINES])), np.int16)
-                for k in range(5)
-            ]
-            for gather in TIMELINES_GATHER:
-                for t, m in zip(TIMELINES, gather):
-                    t[: len(m)] += m
-            del TIMELINES_GATHER
-        else:  # free up memory
-            TIMELINES = []
-            OCCUPANCY = []
-            EVENTS = []
-    else:
-        apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES)
-        gathered_filenames = occupancy_filenames
-
-    if mpi_root:
-        for k in range(len(code)):
-            code[k][-2] = int(hitcount_map[k])
-            code[k][-1] = int(latency_map[k])
+    for k in range(len(code)):
+        code[k][-2] = int(hitcount_map[k])
+        code[k][-1] = int(latency_map[k])

    if CSV_MODE:
-        if mpi_root:
-            from att_to_csv import dump_csv
-            dump_csv(code)
+        from att_to_csv import dump_csv
+        dump_csv(code)
        quit()

+
    gc.collect()
-    print("Min time:", min_event_time)

    drawinfo = {
-        "TIMELINES": TIMELINES,
+        "TIMELINES": gen_timelines(DBFILES),
        "EVENTS": EVENTS,
        "EVENT_NAMES": EVENT_NAMES,
        "OCCUPANCY": OCCUPANCY,
-        "ShaderNames": gathered_filenames,
+        "ShaderNames": occupancy_filenames,
+        "DispatchNames": dispatch_kernel_names,
    }
-    if args.genasm and len(args.genasm) > 0:
-        flight_count = view_trace(
-            args,
-            code,
-            DBFILES,
-            analysed_filenames,
-            True,
-            OCCUPANCY,
-            args.dumpfiles,
-            min_event_time,
-            gfxv,
-            drawinfo,
-            comm,
-            mpi_root,
-        )
-        with open(args.assembly_code, "r") as file:
-            lines = file.readlines()
-        assembly_code = {l + 1.0: lines[l][:-1] for l in range(len(lines))}
-        assembly_code = insert_waitcnt(flight_count, assembly_code)
-
-        with open(args.genasm, "w") as file:
-            keys = sorted(assembly_code.keys())
-            for k in keys:
-                file.write(assembly_code[k] + "\n")
-    else:
-        view_trace(
-            args,
-            code,
-            DBFILES,
-            analysed_filenames,
-            False,
-            OCCUPANCY,
-            args.dumpfiles,
-            min_event_time,
-            gfxv,
-            drawinfo,
-            comm,
-            mpi_root,
-        )
+    view_trace(
+        args,
+        code,
+        DBFILES,
+        analysed_filenames,
+        args.dumpfiles,
+        0,
+        gfxv,
+        drawinfo
+    )
@@ -136,55 +136,6 @@ std::optional<code_object_decoder_t::symbol_info_t> code_object_decoder_t::find_
  return {};
 }

-/*
-void code_object_decoder_t::load_symbol_map() {
-  std::unique_ptr<Elf, void (*)(Elf *)> elf (
-      elf_begin(m_fd, ELF_C_READ, nullptr),
-      [](Elf *elf){ elf_end(elf); });
-
-  if (!elf) {
-    rocprofiler::warning("Error opening ELF!\n");
-    return;
-  }
-
-  Elf64_Ehdr *ehdr = elf64_getehdr(elf.get());
-  if (!ehdr) {
-    printf("elf64_getehdr failed\n");
-    return;
-  }
-
-  // Slurp the symbol table.
-  Elf_Scn *scn = nullptr;
-  while ((scn = elf_nextscn(elf.get(), scn)) != nullptr) {
-    GElf_Shdr shdr_mem;
-    GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_mem);
-    if (shdr->sh_type != SHT_SYMTAB && shdr->sh_type != SHT_DYNSYM) {
-      continue;
-    }
-
-    Elf_Data *data = elf_getdata(scn, nullptr);
-    if (!data) continue;
-
-    size_t symbol_count = data->d_size / gelf_fsize(elf.get(), ELF_T_SYM, 1, EV_CURRENT);
-    for (size_t j = 0; j < symbol_count; ++j) {
-      GElf_Sym sym_mem;
-      GElf_Sym *sym = gelf_getsym(data, j, &sym_mem);
-
-      if (GELF_ST_TYPE(sym->st_info) != STT_FUNC || sym->st_shndx == SHN_UNDEF) continue;
-
-      std::string symbol_name{ elf_strptr(elf.get(), shdr->sh_link, sym->st_name) };
-      auto symbol_pair = std::make_pair(symbol_name, sym->st_size);
-
-      auto [it, success] = m_symbol_map.emplace(sym->st_value, symbol_pair);
-
-      // If there already was a symbol defined at this address, but this
-      // new symbol covers a larger address range, replace the old symbol
-      //         with this new one.
-      if (!success && sym->st_size > it->second.second) it->second = symbol_pair;
-    }
-  }
-} */
-
 void code_object_decoder_t::disassemble_kernel(uint64_t addr) {
  auto symbol = find_symbol(addr);

@@ -193,9 +144,6 @@ void code_object_decoder_t::disassemble_kernel(uint64_t addr) {
    return;
  }

-  // if (symbol->m_name.find("__amd_rocclr_") == 0)
-  //  return;
-
  std::cout << "Dumping ISA for " << symbol->m_name << std::endl;

  uint64_t end_addr = addr + symbol->m_size;
@@ -218,8 +166,6 @@ void code_object_decoder_t::disassemble_kernel(uint64_t addr) {

 void code_object_decoder_t::disassemble_kernels() {
  disassembly = std::make_unique<DisassemblyInstance>(*this);
-
-  // if (m_symbol_map.begin() == m_symbol_map.end())
  m_symbol_map = disassembly->GetKernelMap();

  for (auto& [k, v] : m_symbol_map) disassemble_kernel(k);
@@ -57,8 +57,9 @@
  if (amd_comgr_status_s status = call) {                                                          \
    const char* reason = "";                                                                       \
    amd_comgr_status_string(status, &reason);                                                      \
+    std::cerr << __LINE__ << " code: " << status << std::endl;                                     \
    std::cerr << __LINE__ << " failed: " << reason << std::endl;                                   \
-    return;                                                                                        \
+    exit(1);                                                                                       \
  }

 CodeObjectBinary::CodeObjectBinary(const std::string& uri) : m_uri(uri) {
@@ -156,12 +157,12 @@ DisassemblyInstance::DisassemblyInstance(code_object_decoder_t& decoder)
    : buffer(reinterpret_cast<int64_t>(decoder.buffer.data())),
      size(decoder.buffer.size()),
      instructions(decoder.instructions) {
-  amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &data);
-  amd_comgr_set_data(data, size, decoder.buffer.data());
+  CHECK_COMGR(amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &data));
+  CHECK_COMGR(amd_comgr_set_data(data, size, decoder.buffer.data()));

  char isa_name[128];
  size_t isa_size = sizeof(isa_name);
-  amd_comgr_get_data_isa_name(data, &isa_size, isa_name);
+  CHECK_COMGR(amd_comgr_get_data_isa_name(data, &isa_size, isa_name));

  CHECK_COMGR(amd_comgr_create_disassembly_info(
      isa_name,  //"amdgcn-amd-amdhsa--gfx1100",
@@ -172,24 +173,24 @@ DisassemblyInstance::DisassemblyInstance(code_object_decoder_t& decoder)
 amd_comgr_status_t DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbol,
                                                        void* user_data) {
  amd_comgr_symbol_type_t type;
-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type);
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type));

  if (type != AMD_COMGR_SYMBOL_TYPE_FUNC && type != AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL)
    return AMD_COMGR_STATUS_SUCCESS;

  uint64_t addr;
-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &addr);
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &addr));

  uint64_t mem_size;
-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size);
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size));

  uint64_t name_size;
-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size);
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size));

  std::string name;
  name.resize(name_size);

-  amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data());
+  CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data()));

  static_cast<DisassemblyInstance*>(user_data)->symbol_map[addr] = {name, mem_size};
  return AMD_COMGR_STATUS_SUCCESS;
@@ -197,18 +198,19 @@ amd_comgr_status_t DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbo

 std::map<uint64_t, std::pair<std::string, uint64_t>>& DisassemblyInstance::GetKernelMap() {
  symbol_map = std::map<uint64_t, std::pair<std::string, uint64_t>>{};
-  amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this);
+  CHECK_COMGR(amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this));  // symbol_callback refills symbol_map
  return symbol_map;
 }

 DisassemblyInstance::~DisassemblyInstance() {
-  amd_comgr_release_data(data);
+  CHECK_COMGR(amd_comgr_release_data(data));  // NOTE(review): the check now ends in exit(1); confirm exiting from a destructor is intended
  CHECK_COMGR(amd_comgr_destroy_disassembly_info(info));
 }

 uint64_t DisassemblyInstance::ReadInstruction(uint64_t addr, const char* cpp_line) {
  uint64_t size_read;
-  amd_comgr_disassemble_instruction(info, buffer + addr, (void*)this, &size_read);
+  CHECK_COMGR(amd_comgr_disassemble_instruction(info, buffer + addr, (void*)this, &size_read));
+  assert(instructions.size() != 0);
  instructions.back().address = addr;
  instructions.back().cpp_reference = cpp_line;
  return size_read;
@@ -153,6 +153,7 @@ def draw_wave_states(selections, normalize, TIMELINES):

    plt.figure(figsize=(15, 4))

+
    maxtime = max([np.max((TIMELINES[k]!=0)*np.arange(0,TIMELINES[k].size)) for k in plot_indices])
    maxtime = max(maxtime, 1)
    timelines = [deepcopy(TIMELINES[k][:maxtime]) for k in plot_indices]
@@ -169,21 +170,18 @@ def draw_wave_states(selections, normalize, TIMELINES):
        else cycles * 0
        for time in timelines
    ]
-    kernsize = 21
-    kernel = np.asarray(
-        [
-            np.exp(-abs(10 * k / kernsize))
-            for k in range(-kernsize // 2, kernsize // 2 + 1)
-        ]
-    )
+    kernsize = 15
+    kernel = np.asarray([
+        np.exp(-abs(10 * k / kernsize)) for k in range(-kernsize // 2, kernsize // 2 + 1)
+    ])
    kernel /= np.sum(kernel)

    timelines = [
        np.convolve(time, kernel)[kernsize // 2 : -kernsize // 2]
-        for time in timelines
-        if len(time) > 0
+        for time in timelines if len(time) > 0
    ]
-
+    maxtime *= 16
+    cycles *= 16
    [
        plt.plot(cycles, t, label="State " + s, linewidth=1.1, color=c)
        for t, s, c, sel in zip(timelines, STATES, colors, selections)
@@ -204,48 +202,113 @@ def draw_wave_states(selections, normalize, TIMELINES):
    return STATES, FileBytesIO(figure_bytes)


-def draw_occupancy(selections, normalize, OCCUPANCY, shadernames):
+def draw_occupancy_per_dispatch(selections, normalize, OCCUPANCY, dispatchnames):
+    """Plot occupancy over time, one curve per entry of dispatchnames; returns (names, figure).
+    OCCUPANCY is only read: records are unpacked into a local list, so redrawing is safe.
+    """
+    plt.figure(figsize=(15, 4))
+    maxtime = 1
+    # Unpack each record into (time, value, cu, kid); lists of <= 16 records are skipped.
+    decoded = []
+    for raw in OCCUPANCY:
+        if len(raw) > 16:
+            decoded.append([(16*int(u>>23), (u>>12) & 0x7F, (u>>19) & 0xF, u&0xFFF) for u in raw])
+            maxtime = max(maxtime, decoded[-1][-1][0])
+
+    NUM_DOTS = 1600
+    delta = max(1, maxtime // NUM_DOTS)
+    chart = np.zeros((len(dispatchnames), maxtime // delta + 2), dtype=np.float32)
+
+    for occ in decoded:
+        small_chart = np.zeros_like(chart)
+        norm_fact = np.zeros_like(chart)
+        norm_fact += 1E-6
+
+        current_occ = [[0 for m in range(16)] for k in range(len(dispatchnames))]
+        current_occ[0] = [m[1] for m in occ[:16]]
+        current_time = [0 for k in range(len(dispatchnames))]
+        total_value = [0 for k in range(len(dispatchnames))]
+        total_value[0] = np.sum(current_occ[0])
+
+        for time, value, cu, kid in occ:
+            b = current_time[kid]
+            e = max(b + 1, time // delta)
+            small_chart[kid][b:e] += total_value[kid]
+            norm_fact[kid][b:e] += 1
+            total_value[kid] += value - current_occ[kid][cu]
+            current_occ[kid][cu] = value
+            current_time[kid] = time // delta
+        # Close every curve with its final value, counted as one sample like the bins above.
+        for small, norm, time, value in zip(small_chart, norm_fact, current_time, total_value):
+            small[time] += value
+            norm[time] += 1
+
+        chart += small_chart/norm_fact
+
+    for (dispatch_id, name), occ in zip(dispatchnames.items(), chart):
+        plt.plot(np.arange(occ.size) * delta, occ, label=str(dispatch_id)+'#'+name, linewidth=1.1)
+
+    plt.legend()
+    if normalize:
+        plt.ylabel("Occupancy %")
+    else:
+        plt.ylabel("Occupancy total")
+    plt.xlabel("Cycle")
+    plt.ylim(-1)
+    plt.xlim(-maxtime // 200, maxtime + maxtime // 200 + delta + 1)
+    plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1)
+    figure_bytes = BytesIO()
+    plt.savefig(figure_bytes, dpi=150)
+    return dispatchnames, FileBytesIO(figure_bytes)
+
+
+def draw_occupancy(selections, normalize, OCCUPANCY, shadernames, numdispatchid):
    plt.figure(figsize=(15, 4))
    names = []
    if len(OCCUPANCY) == 1: # If single SE, do occupancy per CU/WGP
-        OCCUPANCY = [[u for u in OCCUPANCY[0] if u&0xFF==k] for k in range(16)]
-        shadernames = ['CU'+str(k) for k in range(16) if len(OCCUPANCY[k]) > 0]
-        OCCUPANCY = [occ for occ in OCCUPANCY if len(occ) > 0]
+        percu = [[u for u in OCCUPANCY[0] if (u>>19) & 0xF == k] for k in range(16)]
+        shadernames = shadernames + [['CU'+str(k),''] for k in range(16) if len(percu[k]) > 0]
+        OCCUPANCY = OCCUPANCY + [occ for occ in percu if len(occ) > 0]

-    maxtime = 1
-    delta = 1
    for name, occ in zip(shadernames, OCCUPANCY):
-        occ_values = [0]
-        occ_times = [0]
-        occ = [(int(u >> 16), (u >> 8) & 0xFF, u & 0xFF) for u in occ]
-        current_occ = [0 for k in range(16)]
+        if len(occ) <= 16:
+            continue
+        maxtime = 1
+        delta = 1
+        occ = [(16*int(u >> 23), (u >> 12) & 0x7F, (u>>19) & 0xF, u&0xFFF) for u in occ]
+        current_occ = [[0 for m in range(16)] for k in range(numdispatchid)]
+        current_occ[0] = [m[1] for m in occ[:16]]

-        for time, value, cu in occ:
+        occ_values = [np.sum(current_occ[0])]
+        occ_times = [0]
+
+        for time, value, cu, kid in occ:
            occ_times.append(time)
-            occ_values.append(occ_values[-1] + value - current_occ[cu])
-            current_occ[cu] = value
+            occ_values.append(occ_values[-1] + value - current_occ[kid][cu])
+            current_occ[kid][cu] = value
        try:
-            names.append('SE'+name.split('.att')[0].split('_se')[-1])
+            names.append('SE'+name.split('_se')[1].split('.att')[0])
        except:
            names.append(name)

        NUM_DOTS = 1500
-        maxtime = np.max(occ_times)
+        maxtime = occ_times[-1]+1
        delta = max(1, maxtime // NUM_DOTS)
        chart = np.zeros((maxtime // delta + 1), dtype=np.float32)
        norm_fact = np.zeros_like(chart)
+        norm_fact += 1E-6

-        for i, t in enumerate(occ_times[:-1]):
-            b = t // delta
+        for i in range(len(occ_times)-1):
+            b = occ_times[i] // delta
            e = max(b + 1, occ_times[i + 1] // delta)
            chart[b:e] += occ_values[i]
            norm_fact[b:e] += 1

-        chart /= np.maximum(norm_fact, 1)
+        chart /= norm_fact
        if normalize:
            chart /= max(chart.max(), 1e-6)

-        plt.plot(np.arange(chart.size) * delta, chart, label=name, linewidth=1.1)
+        plt.plot(np.arange(chart.size) * delta, chart, label=names[-1], linewidth=1.1)

    plt.legend()
    if normalize:
@@ -267,12 +330,14 @@ def GeneratePIC(drawinfo, selections=[True for k in range(16)], normalize=False)
    response = {}
    figures = {}

-    states, figure = draw_occupancy(
-        selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"]
-    )
+    states, figure = draw_occupancy(selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"], len(drawinfo["DispatchNames"]))
    response["occupancy.png"] = states
    figures["occupancy.png"] = figure

+    states, figure = draw_occupancy_per_dispatch(selections, normalize, drawinfo["OCCUPANCY"], drawinfo["DispatchNames"])
+    response["dispatches.png"] = states
+    figures["dispatches.png"] = figure
+
    states, figure = draw_wave_states(selections, normalize, drawinfo["TIMELINES"])
    response["timeline.png"] = states
    figures["timeline.png"] = figure
@@ -193,104 +193,11 @@ def try_match_swapped(insts, code, i, line):
    return insts[i + 1][1] == code[line][1] and insts[i][1] == code[line + 1][1]


-FORK_NAMES = 1
-# A successful parsed instruction
-class CachedInst:
-    def __init__(self, inst, as_line):
-        self.inst_type = inst
-        self.as_line = as_line
-        self.forks = None
-
-# A branch of the parsing tree
-class Fork:
-    def __init__(self):
-        global FORK_NAMES
-        self.insts = []
-        self.data = None
-        self.name = FORK_NAMES
-        FORK_NAMES += 1
-        # print('Created new fork: ', self.name)
-
-# Try to match sequence "insts" with the branch "fork", starting at position "i"
-def move_down_fork(fork, insts, i): #(fork : Fork, insts : list, i : int):
-    N = min(len(insts), len(fork.insts))
-
-    while i < N:
-        if insts[i][1] == fork.insts[i].inst_type:
-            i += 1
-        elif i<N-1  and insts[i+1][1] == fork.insts[i].inst_type \
-                    and insts[i][1] == fork.insts[i+1].inst_type:
-            i += 2
-        else:
-            return False, i
-
-    if len(fork.insts) != len(insts):
-        return False, i
-
-    return True, i
-
-
-FORK_TREE = Fork()
-
-# Check if there exists a previous wave with the same sequence of instructions executed
-def fromDict(insts):
-    i = 0
-    N = len(insts)
-    cur_fork = FORK_TREE
-    while i < N:
-        tillEnd, final_pos = move_down_fork(cur_fork, insts, i)
-        if tillEnd:
-            # print('Reached end')
-            return True, cur_fork
-
-        i += final_pos
-
-        if i >= len(cur_fork.insts):
-            return False, cur_fork
-
-        last_inst = cur_fork.insts[i]
-        if last_inst.forks is None:
-            last_inst.forks = []
-
-        bMatchFork = False
-        for fork in last_inst.forks:
-            if fork.insts[0].inst_type == insts[0][1]:
-                cur_fork = fork
-                bMatchFork = True
-                break
-        if not bMatchFork:
-            cur_fork = Fork()
-            last_inst.forks.append(cur_fork)
-            return False, cur_fork
-
-    print("Warning: Reached end of loop!")
-    return False, cur_fork
-
-
 def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
    bGFX9 = gfxv == 'vega'

-    # Try from cached result from a previous wave that have already been parsed
-    dict_sucess, current_fork = fromDict(insts)
-    if dict_sucess:
-        result, loopCount, mem_unroll, flight_count, maxline, pcsequence = current_fork.data
-        # Check if the sequence of measured PC values are equal for cached and new wave
-        if len(pcsequence) > 0:
-            pcs = [r[2] for r in insts if r[1] == PCINFO]
-            if len(pcs) != len(pcsequence):
-                dict_sucess = False
-            for pc1, pc2 in zip(pcs, pcsequence):
-                if pc1 != pc2:
-                    dict_sucess = False
-
-    # If successful, use resulting assembly from cache
-    if dict_sucess:
-        result = [r+(asm[-1],) for r, asm in zip(insts, result)]
-        return result, loopCount, mem_unroll, flight_count, maxline, len(result)
-
    result, i, line, loopCount, N = [], 0, 0, defaultdict(int), len(insts)

-
    SMEM_INST = []  # scalar memory
    VLMEM_INST = []  # vector memory load
    VSMEM_INST = []  # vector memory store
@@ -310,10 +217,6 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
    # Clean the code and remove comments
    code = [raw_code[0]]
    for c in raw_code[1:]:
-        if bIsAuto and '; Begin ' == c[0][:len('; Begin ')]:
-            if '; Begin <Kernel>' in c[0]:
-                line = len(code)
-                print('Begin at:', line, c)
        c = list(c)
        c[0] = c[0].split(";")[0].split("//")[0].strip()

@@ -339,7 +242,16 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
    loops = 0
    maxline = 0

-    watchlist = RegisterWatchList(labels=labels) if not bIsAuto else PCTranslator(code, insts)
+    if bIsAuto and len(insts) and insts[0][1] == PCINFO:
+        try:
+            watchlist = PCTranslator(code, insts)
+            line = watchlist.addrmap[insts[0][2]]
+            result.append((insts[0][0], PCINFO, 0, 0, 0))
+            i = 1
+        except:
+            return None
+    else:
+        watchlist = RegisterWatchList(labels=labels)

    pcsequence = []
    while i < N:
@@ -534,7 +446,5 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
                break
            line += 1

-    current_fork.insts = [CachedInst(inst[1], inst[-1]) for inst in result]
-    current_fork.data = result, loopCount, mem_unroll, flight_count, maxline, pcsequence
    result = [r for r in result if r[1] != PCINFO]
    return result, loopCount, mem_unroll, flight_count, maxline, len(result) if i == N else N
@@ -296,25 +296,25 @@ def view_trace(
    code,
    dbnames,
    att_filenames,
-    bReturnLoc,
-    OCCUPANCY,
    bDumpOnly,
    se_time_begin,
    gfxv,
-    drawinfo,
-    MPI_COMM,
-    mpi_root,
+    drawinfo
 ):
    global JSON_GLOBAL_DICTIONARY
    pic_thread = None
-    if mpi_root:
-        manager = Manager()
-        return_dict = manager.dict()
-        JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable(
-            {str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))}
-        )
-        pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo))
-        pic_thread.start()
+
+    manager = Manager()
+    return_dict = manager.dict()
+    occ_dict = {str(k): drawinfo["OCCUPANCY"][k] for k in range(len(drawinfo["OCCUPANCY"]))}
+    occ_dict['dispatches'] = {}
+    for id, name in drawinfo['DispatchNames'].items():
+        occ_dict['dispatches'][id] = name
+    occ_dict['names'] = drawinfo['ShaderNames']
+
+    JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable(occ_dict)
+    pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo))
+    pic_thread.start()

    att_filenames = [Path(f).name for f in att_filenames]
    se_numbers = [int(a.split("_se")[1].split(".att")[0]) for a in att_filenames]
@@ -337,9 +337,8 @@ def view_trace(
            flight_count.append(count)
            simd_wave_filenames[se_number] = wv_filenames

-    if mpi_root:
-        code_sel = [c[:-3]+c[-2:] for c in code[:allse_maxline+16]]
-        JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)})
+    code_sel = [c[:-3]+c[-2:] for c in code[:allse_maxline+16]]
+    JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)})

    for key in simd_wave_filenames.keys():
        wv_array = [
@@ -367,42 +366,21 @@ def view_trace(

        simd_wave_filenames[key] = wv_dict

-    if MPI_COMM is not None:
-        se_filenames = MPI_COMM.gather(se_filenames, root=0)
-        simd_wave_filenames = MPI_COMM.gather(simd_wave_filenames, root=0)
-        if mpi_root:
-            se_filenames = [e for elem in se_filenames for e in elem]
-            simd_wave_filenames = {
-                k: v for smf in simd_wave_filenames for k, v in smf.items()
-            }
-
-    if mpi_root:
-        JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable(
-            {
-                "wave_filenames": simd_wave_filenames,
-                "se_filenames": se_filenames,
-                "global_begin_time": int(se_time_begin),
-                "gfxv": gfxv,
-            }
-        )
+    JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable(
+        {
+            "wave_filenames": simd_wave_filenames,
+            "se_filenames": se_filenames,
+            "global_begin_time": int(se_time_begin),
+            "gfxv": gfxv,
+        }
+    )

    if pic_thread is not None:
        pic_thread.join()
        for k, v in return_dict.items():
            JSON_GLOBAL_DICTIONARY[k] = v

-    if bReturnLoc:
-        return flight_count
-
    if bDumpOnly == False:
-        if MPI_COMM is not None:
-            JSON_GLOBAL_DICTIONARY = MPI_COMM.gather(JSON_GLOBAL_DICTIONARY, root=0)
-            if not mpi_root:
-                quit()
-            JSON_GLOBAL_DICTIONARY = {
-                k: v for smf in JSON_GLOBAL_DICTIONARY for k, v in smf.items()
-            }
-
        JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 1})
        if args.ports:
            assign_ports(args.ports)
@@ -420,13 +398,12 @@ def view_trace(
            print("Exitting.")
    else:
        os.makedirs("ui/", exist_ok=True)
-        if mpi_root:
-            JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0})
-            os.system(
-                "cp "
-                + os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui")
-                + "/* ui/"
-            )
+        JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0})
+        os.system(
+            "cp "
+            + os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui")
+            + "/* ui/"
+        )
        for k, v in JSON_GLOBAL_DICTIONARY.items():
            with open(os.path.join("ui", k), "w" if ".json" in k else "wb") as f:
                f.write(v.read())
@@ -14,6 +14,7 @@
 			<div class="tab">
 				<button class="tablinks" onclick="showImage('timeline.png')">Wave States</button>
 				<button class="tablinks" onclick="showImage('occupancy.png')">Occupancy</button>
+				<button class="tablinks" onclick="showImage('dispatches.png')">Dispatches</button>
 				<button class="tablinks" onclick="showImage('counters.png')" id="counterspng_button">Counters</button>
 			</div>
 			<img id="GraphImage" src=timeline.png width=100%>