diff --git a/bin/rocprofv2 b/bin/rocprofv2
index 0698062aa0..b808262435 100755
--- a/bin/rocprofv2
+++ b/bin/rocprofv2
@@ -238,12 +238,17 @@ while [ 1 ] ; do
       shift
 
       ATT_OPTIONS="Not done"
+      ATT_PYTHON3_ARG="python3 "
       while [ "$ATT_OPTIONS" = "Not done" ]; do
         if [[ "$3" = "--trace_file" ]]; then
           ATT_ARGV="$ATT_ARGV $3 \"$4\""
           shift
           shift
-        elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" = "--genasm" || "$3" == "--att_kernel" ]]; then
+        elif [[ "$3" = "--mpi" ]]; then
+          ATT_PYTHON3_ARG="mpirun -np $4 python3 "
+          shift
+          shift
+        elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" = "--genasm" || "$3" == "--att_kernel"  || "$3" == "--depth" ]]; then
           ATT_ARGV="$ATT_ARGV $3 $4"
           shift
           shift
@@ -327,7 +332,7 @@ get_pmc_results_txt_path(){
 
 if [ -n "$ATT_PATH" ]; then
   if [ -n "$ATT_ARGV" ]; then
-    eval "python3 $ATT_PATH $ATT_ARGV"
+    eval "$ATT_PYTHON3_ARG $ATT_PATH $ATT_ARGV"
   elif [ ! -n "$PMC_LINES" ]; then
     echo "ATT File  is required!"
   fi
diff --git a/plugin/att/CMakeLists.txt b/plugin/att/CMakeLists.txt
index d35ef63dc0..05c0ce8774 100644
--- a/plugin/att/CMakeLists.txt
+++ b/plugin/att/CMakeLists.txt
@@ -57,6 +57,8 @@ install(TARGETS att_plugin
 
 configure_file(att.py att/att.py COPYONLY)
 configure_file(trace_view.py att/trace_view.py COPYONLY)
+configure_file(stitch.py att/stitch.py COPYONLY)
+configure_file(drawing.py att/drawing.py COPYONLY)
 configure_file(ui/index.html att/ui/index.html COPYONLY)
 configure_file(ui/logo.svg att/ui/logo.svg COPYONLY)
 configure_file(ui/styles.css att/ui/styles.css COPYONLY)
diff --git a/plugin/att/att.py b/plugin/att/att.py
index f5e7a0a36e..087c944252 100755
--- a/plugin/att/att.py
+++ b/plugin/att/att.py
@@ -6,38 +6,20 @@ if sys.version_info[0] < 3:
 import os
 import argparse
 from pathlib import Path
-from struct import *
 from ctypes import *
 import ctypes
 from copy import deepcopy
-from trace_view import view_trace, Readable
+from trace_view import view_trace
 import sys
 import glob
 import numpy as np
-import matplotlib.pyplot as plt
-from io import BytesIO
+from stitch import stitch
+import gc
 
-class FileBytesIO:
-    def __init__(self, iobytes):
-        self.iobytes = deepcopy(iobytes)
-        self.seek = 0
-
-    def __len__(self):
-        return self.iobytes.getbuffer().nbytes
-
-    def read(self, length=0):
-        if length<=0:
-            return bytes(self.iobytes.getbuffer())
-        else:
-            if self.seek >= self.iobytes.getbuffer().nbytes:
-                self.seek = 0
-                return None
-            response =  self.iobytes.getbuffer()[self.seek:self.seek+length]
-            self.seek += length
-            return bytes(response)
-
-
-COUNTERS_MAX_CAPTURES = 1<<12
+try:
+    from mpi4py import MPI
+except:
+    pass
 
 class PerfEvent(ctypes.Structure):
     _fields_ = [
@@ -114,8 +96,17 @@ class Wave(ctypes.Structure):
         ('num_branch_taken_instrs', ctypes.c_uint64),
         ('num_branch_stalls', ctypes.c_uint64),
 
-        ('timeline_string', ctypes.c_char_p),
-        ('instructions_string', ctypes.c_char_p)]
+        ('timeline_array', POINTER(ctypes.c_int64)),
+        ('instructions_array', POINTER(ctypes.c_int64)),
+        ('timeline_size', ctypes.c_uint64),
+        ('instructions_size', ctypes.c_uint64)]
+
+class PythonWave:
+    """Plain-Python copy of a ctypes Wave whose raw array pointers are dropped."""
+    def __init__(self, source_wave):
+        for field_name, _ctype in Wave._fields_:
+            setattr(self, field_name, getattr(source_wave, field_name))
+        self.timeline_array = self.instructions_array = None
 
 # Flags :
 #   IS_NAVI = 0x1
@@ -154,16 +145,14 @@ def parse_binary(filename, kernel=None):
     for k in range(info.code_len):
         code_entry = info.code[k]
 
-        # copy string memory from C++
         line = deepcopy(code_entry.line.decode("utf-8"))
         loc = deepcopy(code_entry.loc.decode("utf-8"))
 
-        # Transform empty entries back to python's None
         to_line = int(code_entry.to_line) if (code_entry.to_line >= 0) else None
         loc = loc if len(loc) > 0 else None
 
-        code.append((line, int(code_entry.value), to_line, loc,
-                    int(code_entry.index), int(code_entry.line_num)))
+        code.append([line, int(code_entry.value), to_line, loc,
+                    int(code_entry.index), int(code_entry.line_num), 0, 0]) # hitcount + cycles
 
     jumps = {}
     for k in range(info.jumps_len):
@@ -172,19 +161,35 @@ def parse_binary(filename, kernel=None):
     return code, jumps
 
 
-def getWaves(filename, target_cu, verbose):
-    filename = os.path.abspath(str(filename))
-    info = SO.AnalyseBinary(filename.encode('utf-8'), target_cu, verbose)
+def getWaves_binary(name, shader_engine_data_dict, target_cu, depth):  # fills shader_engine_data_dict[name]; returns nothing
+    filename = os.path.abspath(str(name))
+    info = SO.AnalyseBinary(filename.encode('utf-8'), target_cu, False)  # third argument used to be the caller's 'verbose' flag
 
     waves = [info.wavedata[k] for k in range(info.num_waves)]
     events = [deepcopy(info.perfevents[k]) for k in range(info.num_events)]
     occupancy = [int(info.occupancy[k]) for k in range(int(info.num_occupancy))]
+    flags = 'navi' if (info.flags & 0x1) else 'vega'  # bit 0 of info.flags selects the name
 
+    wave_slot_count = [[0 for k in range(20)] for j in range(4)]  # waves kept so far, indexed [simd][wave_id]; table is 4 x 20
+    waves_python = []
     for wave in waves:
-        wave.timeline = deepcopy(wave.timeline_string.decode("utf-8"))
-        wave.instructions = deepcopy(wave.instructions_string.decode("utf-8"))
+        if wave_slot_count[wave.simd][wave.wave_id] >= depth:  # keep at most `depth` waves per (simd, wave_id)
+            continue
+        wave_slot_count[wave.simd][wave.wave_id] += 1
+        pwave = PythonWave(wave)
+        pwave.timeline = [(wave.timeline_array[2*k], wave.timeline_array[2*k+1]) for k in range(wave.timeline_size)]  # flat array read as pairs
+        pwave.instructions = [tuple([wave.instructions_array[4*k+m] for m in range(4)]) for k in range(wave.instructions_size)]  # flat array read as groups of 4
+        waves_python.append( pwave )
+    shader_engine_data_dict[name] = (waves_python, events, occupancy, flags)  # keyed by the name exactly as passed in
 
-    return waves, events, occupancy, 'navi' if (info.flags & 0x1) else 'vega'
+
+def getWaves_stitch(SIMD, code, jumps, flags, latency_map, hitcount_map):
+    """Stitch each wave's instructions in place and add its hits and latencies to the two maps."""
+    for wave in SIMD:
+        wave.instructions = stitched = stitch(wave.instructions, code, jumps, flags)
+        for token in stitched[0]:  # both maps are indexed by the token's last field
+            hitcount_map[token[-1]] += 1
+            latency_map[token[-1]] += token[3]
 
 
 def persist(trace_file, SIMD):
@@ -221,7 +226,6 @@ def persist(trace_file, SIMD):
         timeline.append(wave.timeline)
         instructions.append(wave.instructions)
 
-    #df = pd.DataFrame({
     df = {
         'name': [trace for _ in range(len(begin_time))],
         'id': [i for i in range(len(begin_time))],
@@ -248,8 +252,7 @@ def persist(trace_file, SIMD):
         'br_stalls': br_stalls,
         'timeline': timeline,
         'instructions': instructions,
-    }#)
-    #[print(d) for c, d in df.iterrows()]; quit()
+    }
     return df
 
 
@@ -299,128 +302,50 @@ def insert_waitcnt(flight_count, assembly_code):
     return assembly_code
 
 
-def get_delta_time(events):
-    try:
-        CUS = [[e.time for e in events if e.cu==k and e.bank==0] for k in range(16)]
-        CUS = [np.asarray(c).astype(np.int64) for c in CUS if len(c) > 2]
-        return np.min([np.min(abs(c[1:]-c[:-1])) for c in CUS])
-    except:
-        return 1
+def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES):
+    for n, occ in enumerate(OCCUPANCY):
+        OCCUPANCY[n] = [max(min(int((u>>16)-min_event_time)<<16,2**42),0) | (u&0xFFFFF) for u in occ]
+    for perf in EVENTS:
+        for p in perf:
+            p.time -= min_event_time
 
-def draw_wave_metrics(selections, normalize):
-    global TIMELINES
-    global EVENTS
-    global EVENT_NAMES
-
-    response = Readable({"counters": EVENT_NAMES})
-
-    plt.figure(figsize=(15,3))
-
-    delta_step = 8
-    quad_delta_time = max(delta_step,int(0.5+np.min([get_delta_time(events) for events in EVENTS])))
-    maxtime = np.max([np.max([e.time for e in events]) for events in EVENTS])/quad_delta_time+1
-
-    if maxtime*delta_step >= COUNTERS_MAX_CAPTURES:
-        delta_step = 1
-    while maxtime >= COUNTERS_MAX_CAPTURES:
-        quad_delta_time *= 2
-        maxtime /= 2
-
-    maxtime = int(min(maxtime*delta_step, COUNTERS_MAX_CAPTURES))
-    event_timeline = np.zeros((16, maxtime), dtype=np.int32)
-    print('Delta:', quad_delta_time)
-    print('Max_cycles:', maxtime*quad_delta_time*4//delta_step)
-
-    cycles = 4*quad_delta_time//delta_step*np.arange(maxtime)
-    kernel = len(EVENTS)*quad_delta_time
-
-    for events in EVENTS:
-        for e in range(len(events)-1):
-            bk = events[e].bank*4
-            start = events[e].time // (quad_delta_time//delta_step)
-            end = start+delta_step
-            event_timeline[bk:bk+4, start:end] += np.asarray(events[e].toTuple()[1:5])[:, None]
-        start = events[-1].time
-        event_timeline[bk:bk+4, start:start+delta_step] += \
-            np.asarray(events[-1].toTuple()[1:5])[:, None]
-
-    event_timeline = [np.convolve(e, [kernel for k in range(3)])[1:-1] for e in event_timeline]
-    #event_timeline = [e/kernel for e in event_timeline]
-
-    if normalize:
-        event_timeline = [100*e/max(e.max(), 1E-5) for e in event_timeline]
-
-    colors = ['blue', 'green', 'gray', 'red', 'orange', 'cyan', 'black', 'darkviolet',
-                'yellow', 'darkred', 'pink', 'lime', 'gold', 'tan', 'aqua', 'olive']
-    [plt.plot(cycles, e, '-', label=n, color=c)
-        for e, n, c, sel in zip(event_timeline, EVENT_NAMES, colors, selections) if sel]
-
-    plt.legend()
-    if normalize:
-        plt.ylabel('As % of maximum')
-    else:
-        plt.ylabel('Value')
-    plt.subplots_adjust(left=0.05, right=1, top=1, bottom=0.07)
-
-    figure_bytes = BytesIO()
-    plt.savefig(figure_bytes, dpi=150)
-    return response, FileBytesIO(figure_bytes), TIMELINES, EVENTS
-
-
-def draw_wave_states(selections, normalize):
-    global TIMELINES
-    plot_indices = [1, 2, 3, 4]
-    STATES = [['Empty', 'Idle', 'Exec', 'Wait', 'Stall'][k] for k in plot_indices]
-    colors = [['gray', 'orange', 'green', 'red', 'blue'][k] for k in plot_indices]
-
-    plt.figure(figsize=(15,3))
-
-    maxtime = max([np.max((TIMELINES[k]!=0)*np.arange(0,TIMELINES[k].size)) for k in plot_indices])
-    timelines = [deepcopy(TIMELINES[k][:maxtime]) for k in plot_indices]
-    timelines = [np.pad(t, [0, maxtime-t.size]) for t in timelines]
-
-    if normalize:
-        timelines = np.array(timelines) / np.maximum(np.sum(timelines,0)*1E-2,1E-7)
-
-    trim = max(maxtime//5000,1)
-    cycles = np.arange(0, timelines[0].size//trim, 1)*trim
-    timelines = [time[:trim*(time.size//trim)].reshape((-1, trim)).mean(-1) if len(time) > 0 else cycles*0 for time in timelines]
-    kernsize = 21
-    kernel = np.asarray([np.exp(-abs(10*k/kernsize)) for k in range(-kernsize//2,kernsize//2+1)])
-    kernel /= np.sum(kernel)
-
-    timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2] for time in timelines]
-
-    [plt.plot(cycles, t, label='State '+s, linewidth=1.1, color=c)
-        for t, s, c, sel in zip(timelines, STATES, colors, selections) if sel]
-
-    plt.legend()
-    if normalize:
-        plt.ylabel('Waves state %')
-    else:
-        plt.ylabel('Waves state total')
-    plt.ylim(-1)
-    plt.xlim(-maxtime//200, maxtime+maxtime//200+1)
-    plt.subplots_adjust(left=0.05, right=1, top=1, bottom=0.07)
-    figure_bytes = BytesIO()
-    plt.savefig(figure_bytes, dpi=150)
-    response = Readable({"counters": STATES})
-    return response, FileBytesIO(figure_bytes), TIMELINES, []
-
-
-def GeneratePIC(selections=[True for k in range(16)], normalize=True, bScounter=True):
-    if bScounter and len(EVENTS) > 0 and np.sum([len(e) for e in EVENTS]) > 32:
-        return draw_wave_metrics(selections, normalize)
-    else:
-        return draw_wave_states(selections, normalize)
+    for df in DBFILES:  # add every wave's states to the per-state, per-cycle counters in TIMELINES
+        for T in range(len(df['timeline'])):
+            timeline = df['timeline'][T]
+            time_acc = 0
+            tuples3 = [(0,df['begin_time'][T]-min_event_time)]+[(int(t[0]),int(t[1])) for t in timeline]  # state 0 up to the wave's start, then its (state, duration) pairs
 
+            for state in tuples3:
+                if state[1] > 1E8:  # implausibly long state: give up on the rest of this wave
+                    print('Warning: Time limit reached for ',state[0], state[1])
+                    break
+                while time_acc+state[1] > TIMELINES[state[0]].size:  # keep doubling: one doubling can still be too short, and the slice below would then silently drop counts
+                    TIMELINES[state[0]] = np.hstack([
+                        TIMELINES[state[0]],
+                        np.zeros_like(TIMELINES[state[0]])
+                    ])
+                TIMELINES[state[0]][time_acc:time_acc+state[1]] += 1
+                time_acc += state[1]
 
 if __name__ == "__main__":
+    comm = None
+    mpi_root = True
+    try:
+        comm = MPI.COMM_WORLD
+        if comm.Get_size() < 2:
+            comm = None
+        else:
+            mpi_root = comm.Get_rank() == 0
+    except:
+        print('Could not load MPI')
+        comm = None
+
     pathenv = os.getenv('OUTPUT_PATH')
     if pathenv is None:
         pathenv = "."
     parser = argparse.ArgumentParser()
     parser.add_argument("assembly_code", help="Path of the assembly code")
+    parser.add_argument("--depth", help="Maximum number of parsed waves per slot", default=100, type=int)
     parser.add_argument("--trace_file", help="Filter for trace files", default=None, type=str)
     parser.add_argument("--att_kernel", help="Kernel file",
                         type=str, default=pathenv+'/*_kernel.txt')
@@ -441,7 +366,6 @@ if __name__ == "__main__":
         print('Skipping analysis.')
         quit()
 
-    global EVENT_NAMES
     with open(os.getenv("COUNTERS_PATH"), 'r') as f:
         lines = [l.split('//')[0] for l in f.readlines()]
 
@@ -452,7 +376,6 @@ if __name__ == "__main__":
                 EVENT_NAMES += ['id: '+clean(line)]
             elif 'att: TARGET_CU' in line:
                 args.target_cu = int(clean(line))
-                print('Target CU set to:', args.target_cu)
         for line in lines:
             if 'PERFCOUNTER=' in line:
                 EVENT_NAMES += [clean(line).split('SQ_')[1].lower()]
@@ -471,45 +394,61 @@ if __name__ == "__main__":
         print('Could not find att output kernel:', args.att_kernel)
         exit(1)
     elif len(att_kernel) > 1:
-        print('Found multiple kernel matching given filters:')
-        for n, k in enumerate(att_kernel):
-            print('\t', n, '->', k)
+        if mpi_root:
+            print('Found multiple kernel matching given filters:')
+            for n, k in enumerate(att_kernel):
+                print('\t', n, '->', k)
 
-        bValid = False
-        while bValid == False:
-            try:
-                args.att_kernel = att_kernel[int(input("Please select number: "))]
-                bValid = True
-            except KeyboardInterrupt:
-                exit(0)
-            except:
-                print('Invalid option.')
+            bValid = False
+            while bValid == False:
+                try:
+                    args.att_kernel = att_kernel[int(input("Please select number: "))]
+                    bValid = True
+                except KeyboardInterrupt:
+                    exit(0)
+                except:
+                    print('Invalid option.')
+        if comm is not None:
+            args.att_kernel = comm.bcast(args.att_kernel, root=0)
     else:
         args.att_kernel = att_kernel[0]
 
-    print('Att kernel:', args.att_kernel)
-    code, jumps = parse_binary(args.assembly_code, args.att_kernel)
-
     # Trace Parsing
     if args.trace_file is None:
         filenames = glob.glob(args.att_kernel.split('_kernel.txt')[0]+'_*.att')
-        assert(len(filenames) > 0)
     else:
         filenames = glob.glob(args.trace_file)
+    assert(len(filenames) > 0)
 
-    print('Trace filenames:', filenames)
+    if comm is not None:
+        filenames = filenames[comm.Get_rank()::comm.Get_size()]
+
+    code = jumps = None
+    if mpi_root:
+        print('Att kernel:', args.att_kernel)
+        code, jumps = parse_binary(args.assembly_code, args.att_kernel)
 
     DBFILES = []
-    global TIMELINES
-    global EVENTS
     TIMELINES = [np.zeros(int(1E4),dtype=np.int16) for k in range(5)]
     EVENTS = []
     OCCUPANCY = []
-
+    GFXV = []
     analysed_filenames = []
-    SIMD_list = []
+
+    shader_engine_data_dict = {}
     for name in filenames:
-        SIMD, perfevents, occupancy, gfxv = getWaves(name, args.target_cu, False)
+        getWaves_binary(name, shader_engine_data_dict, args.target_cu, args.depth)
+
+    if comm is not None:
+        code = comm.bcast(code, root=0)
+        jumps = comm.bcast(jumps, root=0)
+
+    gc.collect()
+    latency_map = np.zeros((len(code)), dtype=np.int64)
+    hitcount_map = np.zeros((len(code)), dtype=np.int32)
+    for name in filenames:
+        SIMD, perfevents, occupancy, gfxv = shader_engine_data_dict[name]
+        getWaves_stitch(SIMD, code, jumps, gfxv, latency_map, hitcount_map)
         if len(SIMD) == 0:
             print("Error parsing ", name)
             continue
@@ -517,8 +456,9 @@ if __name__ == "__main__":
         EVENTS.append(perfevents)
         DBFILES.append( persist(name, SIMD) )
         OCCUPANCY.append( occupancy )
-        SIMD_list.append( SIMD )
+        GFXV.append(gfxv)
 
+    gc.collect()
     min_event_time = 2**62
     for df in DBFILES:
         if len(df['begin_time']) > 0:
@@ -528,36 +468,59 @@ if __name__ == "__main__":
             min_event_time = min(min_event_time, p.time)
     for occ in OCCUPANCY:
         min_event_time = min(min_event_time, np.min(np.array(occ)>>16))
+
+    gc.collect()
+    min_event_time = max(0, min_event_time-32)
+    if comm is not None:
+        min_event_time = comm.allreduce(min_event_time, op=MPI.MIN)  # global minimum, delivered to every rank
+
+        apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES)
+
+        GFXV = comm.gather(GFXV, root=0)
+        EVENTS = comm.gather(EVENTS, root=0)
+        OCCUPANCY = comm.gather(OCCUPANCY, root=0)
+        TIMELINES = comm.gather(TIMELINES, root=0)
+        gather_latency_map = comm.gather(latency_map, root=0)
+        gather_hitcount_map = comm.gather(hitcount_map, root=0)
+        gathered_filenames = comm.gather(analysed_filenames, root=0)
+
+        if mpi_root:
+            latency_map *= 0  # the gathered lists already hold the root's own share
+            hitcount_map *= 0
+            for hit, lat in zip(gather_hitcount_map, gather_latency_map):
+                hitcount_map += hit
+                latency_map += lat
+            EVENTS = [e for elem in EVENTS for e in elem]
+            OCCUPANCY = [e for elem in OCCUPANCY for e in elem]
+            gathered_filenames = [e for elem in gathered_filenames for e in elem]
+            gfxv = [e for elem in GFXV for e in elem][0]
+            TIMELINES_GATHER = TIMELINES
+            TIMELINES = [np.zeros((np.max([len(tm[k]) for tm in TIMELINES])), np.int16) for k in range(5)]
+            for gather in TIMELINES_GATHER:  # sum the per-rank timelines into arrays of the longest length
+                for t, m in zip(TIMELINES, gather):
+                    t[:len(m)] += m
+            del(TIMELINES_GATHER)
+        else: # free up memory
+            TIMELINES = []
+            OCCUPANCY = []
+            EVENTS = []
+        # A rank that was handed no trace file never assigned gfxv; use the root's value on every rank.
+        gfxv = comm.bcast(gfxv if mpi_root else None, root=0)
+    else:
+        apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES)
+        gathered_filenames = analysed_filenames
+
+    if mpi_root:
+        for k in range(len(code)):
+            code[k][-2] = int(hitcount_map[k])
+            code[k][-1] = int(latency_map[k])
+
+    gc.collect()
     print("Min time:", min_event_time)
-    for perf in EVENTS:
-        for p in perf:
-            p.time -= min_event_time
-
-    OCCUPANCY = [[max(min(int((u>>16)-min_event_time)<<16,2**42),0) | (u&0xFFFFF) for u in occ] for occ in OCCUPANCY]
-
-    for df in DBFILES:
-        for T in range(len(df['timeline'])):
-            timeline = df['timeline'][T]
-            time_acc = 0
-            tuples1 = timeline.split('(')
-            tuples2 = [t.split(')')[0].split(',') for t in tuples1 if t != '']
-            tuples3 = [(0,df['begin_time'][T]-min_event_time)]+[(int(t[0]),int(t[1])) for t in tuples2]
-
-            for state in tuples3:
-                if state[1] > 1E8:
-                    print('Warning: Time limit reached for ',state[0], state[1])
-                    break
-                if time_acc+state[1] > TIMELINES[state[0]].size:
-                    TIMELINES[state[0]] = np.hstack([
-                        TIMELINES[state[0]],
-                        np.zeros_like(TIMELINES[state[0]])
-                    ])
-                TIMELINES[state[0]][time_acc:time_acc+state[1]] += 1
-                time_acc += state[1]
 
+    drawinfo = {'TIMELINES':TIMELINES, 'EVENTS':EVENTS, 'EVENT_NAMES':EVENT_NAMES, 'OCCUPANCY': OCCUPANCY, 'ShaderNames': gathered_filenames}
     if args.genasm and len(args.genasm) > 0:
-        flight_count = view_trace(args, code, jumps, DBFILES, analysed_filenames, True, None, OCCUPANCY, args.dumpfiles, min_event_time, gfxv)
-
+        flight_count = view_trace(args, code, DBFILES, analysed_filenames, True, OCCUPANCY, args.dumpfiles, min_event_time, gfxv, drawinfo, comm, mpi_root)
         with open(args.assembly_code, 'r') as file:
             lines = file.readlines()
         assembly_code = {l+1.0: lines[l][:-1] for l in range(len(lines))}
@@ -568,4 +531,4 @@ if __name__ == "__main__":
             for k in keys:
                 file.write(assembly_code[k]+'\n')
     else:
-        view_trace(args, code, jumps, DBFILES, analysed_filenames, False, GeneratePIC, OCCUPANCY, args.dumpfiles, min_event_time, gfxv)
+        view_trace(args, code, DBFILES, analysed_filenames, False, OCCUPANCY, args.dumpfiles, min_event_time, gfxv, drawinfo, comm, mpi_root)
diff --git a/plugin/att/drawing.py b/plugin/att/drawing.py
new file mode 100644
index 0000000000..cd44fc3219
--- /dev/null
+++ b/plugin/att/drawing.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+import sys
+if sys.version_info[0] < 3:
+    raise Exception("Must be using Python 3")
+
+import numpy as np
+from io import BytesIO
+import matplotlib.pyplot as plt
+from copy import deepcopy
+import json
+
+COUNTERS_MAX_CAPTURES = 1<<12
+
+class Readable:
+    """File-like wrapper serving the JSON encoding of an object, whole or in chunks."""
+    def __init__(self, jsonstring):
+        self.jsonstr = json.dumps(jsonstring)
+        self.seek = 0
+
+    def __len__(self):
+        return len(self.jsonstr)
+
+    def read(self, length=0):
+        if length <= 0:
+            return self.jsonstr  # whole payload, returned as str
+        if self.seek >= len(self.jsonstr):
+            self.seek = 0  # rewind so the next chunked read starts over
+            return None
+        begin = self.seek
+        self.seek = begin + length
+        return self.jsonstr[begin:begin+length].encode('utf-8')
+
+class FileBytesIO:
+    """File-like wrapper serving a private copy of a BytesIO, whole or in chunks."""
+    def __init__(self, iobytes):
+        self.iobytes = deepcopy(iobytes)
+        self.seek = 0
+
+    def __len__(self):
+        return self.iobytes.getbuffer().nbytes
+
+    def read(self, length=0):
+        view = self.iobytes.getbuffer()
+        if length <= 0:
+            return bytes(view)  # whole payload
+        if self.seek >= view.nbytes:
+            self.seek = 0  # rewind so the next chunked read starts over
+            return None
+        begin, self.seek = self.seek, self.seek + length
+        return bytes(view[begin:begin+length])
+
+def get_delta_time(events):
+    """Return the smallest gap between consecutive bank-0 event times of any CU, or 1 if unavailable."""
+    try:
+        times = [np.asarray([e.time for e in events if e.cu==k and e.bank==0], dtype=np.int64) for k in range(16)]
+        return np.min([np.min(abs(t[1:]-t[:-1])) for t in times if len(t) > 2])
+    except Exception:  # e.g. no CU has more than two samples, so the outer np.min gets an empty list
+        return 1
+
+def draw_wave_metrics(selections, normalize, TIMELINES, EVENTS, EVENT_NAMES):
+    """Plot the selected perf counters over time; return (EVENT_NAMES, PNG as FileBytesIO)."""
+    fig = plt.figure(figsize=(15,4))
+
+    delta_step = 8
+    quad_delta_time = max(delta_step,int(0.5+np.min([get_delta_time(events) for events in EVENTS])))
+    maxtime = np.max([e.time for events in EVENTS for e in events])/quad_delta_time+1
+
+    # Coarsen the time resolution until the chart fits in COUNTERS_MAX_CAPTURES columns.
+    if maxtime*delta_step >= COUNTERS_MAX_CAPTURES:
+        delta_step = 1
+    while maxtime >= COUNTERS_MAX_CAPTURES:
+        quad_delta_time *= 2
+        maxtime /= 2
+
+    maxtime = int(min(maxtime*delta_step, COUNTERS_MAX_CAPTURES))
+    event_timeline = np.zeros((16, maxtime), dtype=np.int32)
+    print('Delta:', quad_delta_time)
+    print('Max_cycles:', maxtime*quad_delta_time*4//delta_step)
+
+    cycles = 4*quad_delta_time//delta_step*np.arange(maxtime)
+    kernel = len(EVENTS)*quad_delta_time
+    bin_width = quad_delta_time//delta_step
+
+    for events in EVENTS:
+        # Rows bank*4 .. bank*4+3 receive the four values toTuple()[1:5] of each event.
+        # The last event of a trace is binned like every other one, in its own bank.
+        for event in events:
+            bk = event.bank*4
+            start = event.time // bin_width
+            event_timeline[bk:bk+4, start:start+delta_step] += np.asarray(event.toTuple()[1:5])[:, None]
+
+    # 3-tap filter with every tap equal to `kernel`; [1:-1] keeps the input length.
+    event_timeline = [np.convolve(e, [kernel for k in range(3)])[1:-1] for e in event_timeline]
+
+    if normalize:
+        event_timeline = [100*e/max(e.max(), 1E-5) for e in event_timeline]
+
+    colors = ['blue', 'green', 'gray', 'red', 'orange', 'cyan', 'black', 'darkviolet',
+                'yellow', 'darkred', 'pink', 'lime', 'gold', 'tan', 'aqua', 'olive']
+    for e, n, c, sel in zip(event_timeline, EVENT_NAMES, colors, selections):
+        if sel:
+            plt.plot(cycles, e, '-', label=n, color=c)
+
+    plt.legend()
+    plt.ylabel('As % of maximum' if normalize else 'Value')
+    plt.xlabel('Cycle')
+    plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1)
+
+    figure_bytes = BytesIO()
+    plt.savefig(figure_bytes, dpi=150)
+    plt.close(fig)  # pyplot keeps every figure alive until it is closed
+    return EVENT_NAMES, FileBytesIO(figure_bytes)
+
+
+def draw_wave_states(selections, normalize, TIMELINES):
+    """Plot the number of waves in each state per cycle; return (state names, PNG as FileBytesIO)."""
+    plot_indices = [1, 2, 3, 4]  # index 0 ('Empty') is not drawn
+    STATES = [['Empty', 'Idle', 'Exec', 'Wait', 'Stall'][k] for k in plot_indices]
+    colors = [['gray', 'orange', 'green', 'red', 'blue'][k] for k in plot_indices]
+
+    fig = plt.figure(figsize=(15,4))
+    # maxtime is the last index at which any plotted state is non-zero; series are cut or padded to it.
+    maxtime = max([np.max((TIMELINES[k]!=0)*np.arange(0,TIMELINES[k].size)) for k in plot_indices])
+    timelines = [deepcopy(TIMELINES[k][:maxtime]) for k in plot_indices]
+    timelines = [np.pad(t, [0, maxtime-t.size]) for t in timelines]
+
+    if normalize:
+        timelines = np.array(timelines) / np.maximum(np.sum(timelines,0)*1E-2,1E-7)
+
+    trim = max(maxtime//5000,1)  # average runs of `trim` cycles so roughly 5000 points at most are drawn
+    cycles = np.arange(0, timelines[0].size//trim, 1)*trim
+    timelines = [time[:trim*(time.size//trim)].reshape((-1, trim)).mean(-1) if len(time) > 0 else cycles*0 for time in timelines]
+    kernsize = 21
+    kernel = np.asarray([np.exp(-abs(10*k/kernsize)) for k in range(-kernsize//2,kernsize//2+1)])
+    kernel /= np.sum(kernel)
+
+    timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2] for time in timelines]
+
+    for t, s, c, sel in zip(timelines, STATES, colors, selections):
+        if sel:
+            plt.plot(cycles, t, label='State '+s, linewidth=1.1, color=c)
+
+    plt.legend()
+    plt.ylabel('Waves state %' if normalize else 'Waves state total')
+    plt.xlabel('Cycle')
+    plt.ylim(-1)
+    plt.xlim(-maxtime//200, maxtime+maxtime//200+1)
+    plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1)
+    figure_bytes = BytesIO()
+    plt.savefig(figure_bytes, dpi=150)
+    plt.close(fig)  # pyplot keeps every figure alive until it is closed
+    return STATES, FileBytesIO(figure_bytes)
+
+
+def draw_occupancy(selections, normalize, OCCUPANCY, shadernames):
+    """Plot occupancy over time, one curve per trace; return (curve labels, PNG as FileBytesIO)."""
+    fig = plt.figure(figsize=(15,4))
+    names = []
+    NUM_DOTS = 1500
+    xmax, xstep = 0, 1  # extent of the longest trace; defined even when there is nothing to plot
+    for name, occ in zip(shadernames, OCCUPANCY):
+        occ_values, occ_times = [0], [0]
+        # Each entry packs time (bits 16 and up), value (bits 8-15) and cu (bits 0-7).
+        occ = [(int(u>>16), (u>>8)&0xFF, u&0xFF) for u in occ]
+        current_occ = [0 for k in range(16)]
+        for time, value, cu in occ:
+            occ_times.append(time)
+            # Running total: swap this cu's previous value for its new one.
+            occ_values.append(occ_values[-1] + value - current_occ[cu])
+            current_occ[cu] = value
+        try:
+            name = 'SE'+name.split('.att')[0].split('_se')[-1]
+        except (AttributeError, TypeError):
+            pass  # not a plain string: keep the label as given
+        names.append(name)
+
+        maxtime = int(np.max(occ_times))
+        delta = max(1, maxtime//NUM_DOTS)
+        chart = np.zeros((maxtime//delta+1), dtype=np.float32)
+        norm_fact = np.zeros_like(chart)
+        for i, t in enumerate(occ_times[:-1]):
+            b = t//delta
+            e = max(b+1,occ_times[i+1]//delta)
+            chart[b:e] += occ_values[i]
+            norm_fact[b:e] += 1
+
+        chart /= np.maximum(norm_fact,1)
+        if normalize:
+            chart *= 100/max(chart.max(),1E-6)  # percent of this trace's peak, as the axis label says
+        plt.plot(np.arange(chart.size)*delta, chart, label=name, linewidth=1.1)
+        if maxtime >= xmax:
+            xmax, xstep = maxtime, delta
+
+    plt.legend()
+    plt.ylabel('Occupancy %' if normalize else 'Occupancy total')
+    plt.xlabel('Cycle')
+    plt.ylim(-1)
+    plt.xlim(-xmax//200, xmax+xmax//200+xstep+1)
+    plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1)
+    figure_bytes = BytesIO()
+    plt.savefig(figure_bytes, dpi=150)
+    plt.close(fig)  # pyplot keeps every figure alive until it is closed
+    return names, FileBytesIO(figure_bytes)
+
+
+def GeneratePIC(drawinfo, selections=None, normalize=False):
+    """Render every available chart; return (Readable of {file: legend names}, {file: FileBytesIO})."""
+    if selections is None:  # fresh list per call instead of one shared mutable default
+        selections = [True for k in range(16)]
+    EVENTS = drawinfo['EVENTS']
+    response, figures = {}, {}
+
+    states, figure = draw_occupancy(selections, normalize, drawinfo['OCCUPANCY'], drawinfo['ShaderNames'])
+    response['occupancy.png'], figures['occupancy.png'] = states, figure
+
+    states, figure = draw_wave_states(selections, normalize, drawinfo['TIMELINES'])
+    response['timeline.png'], figures['timeline.png'] = states, figure
+
+    # The counters chart is drawn only when more than 32 perf events were captured in total.
+    if len(EVENTS) > 0 and np.sum([len(e) for e in EVENTS]) > 32:
+        EVENT_NAMES, figure = draw_wave_metrics(selections, normalize, drawinfo['TIMELINES'], EVENTS, drawinfo['EVENT_NAMES'])
+        response['counters.png'] = EVENT_NAMES
+        figures['counters.png'] = figure
+
+    return Readable(response), figures
diff --git a/plugin/att/stitch.py b/plugin/att/stitch.py
new file mode 100644
index 0000000000..1893ba27ab
--- /dev/null
+++ b/plugin/att/stitch.py
@@ -0,0 +1,441 @@
+#!/usr/bin/env python3
+import sys
+if sys.version_info[0] < 3:
+    raise Exception("Must be using Python 3")
+
+from collections import defaultdict
+from copy import deepcopy
+
+MAX_STITCHED_TOKENS = 10000000
+MAX_FAILED_STITCHES = 256
+STACK_SIZE_LIMIT = 64
+
+UNKNOWN = 0
+SMEM = 1
+SALU = 2
+VMEM = 3
+FLAT = 4
+LDS = 5
+VALU = 6
+JUMP = 7
+NEXT = 8
+IMMED = 9
+BRANCH = 10
+GETPC = 11
+SETPC = 12
+SWAPPC = 13
+LANEIO = 14
+DONT_KNOW = 100
+
+WaveInstCategory = {
+    UNKNOWN: "UNKNOWN",
+    SMEM: "SMEM",
+    SALU: "SALU",
+    VMEM: "VMEM",
+    FLAT: "FLAT",
+    LDS: "LDS",
+    VALU: "VALU",
+    JUMP: "JUMP",
+    NEXT: "NEXT",
+    IMMED: "IMMED",
+    JUMP: "JUMP",
+    NEXT: "NEXT",
+    IMMED: "IMMED",
+    BRANCH: "BRANCH",
+    GETPC: "GETPC",
+    SETPC: "SETPC",
+    SWAPPC: "SWAPPC",
+    LANEIO: "LANEIO",
+    DONT_KNOW: "DONT_KNOW",
+}
+
+
+class RegisterWatchList:
+    def __init__(self, labels):
+        self.registers = {'v'+str(k): [[] for m in range(64)] for k in range(64)}
+        for k in range(64):
+            self.registers['s'+str(k)] = []
+        self.labels = labels
+
+    def try_translate(self, tok):
+        if tok[0] in ['s']:
+            return self.registers[self.range(tok)[0]]
+        elif '@' in tok:
+            return self.labels[tok.split('@')[0]]+1
+
+    def range(self, r):
+        reg = r.split(':')
+        if len(reg) == 1:
+            return reg
+        else:
+            r0 = reg[0].split('[')
+            return [r0[0]+str(k) for k in range(int(r0[1]), int(reg[1][:-1])+1)]
+
+    def tokenize(self, line):
+        return [u for u in [t.split(',')[0].strip() for t in line.split(' ')] if len(u) > 0]
+
+    def getpc(self, line, next_line):
+        #print('Get pc:', line)
+        try:
+            dst = line.split(' ')[1].strip()
+            label_dest = next_line.split(', ')[-1].split('@')[0]
+            for reg in self.range(dst):
+                self.registers[reg].append(deepcopy(self.labels[label_dest]))
+        except:
+            pass
+
+    def swappc(self, line, line_num):
+        try:
+            tokens = self.tokenize(line)
+            dst = tokens[1]
+            src = tokens[2]
+
+            popped = self.registers[self.range(src)[0]][-1]
+            self.registers[self.range(src)[0]] = self.registers[self.range(src)[0]][:-1]
+            self.registers[self.range(dst)[0]].append(line_num+1)
+            return popped
+        except:
+            return 0
+
+    def setpc(self, line):
+        try:
+            src = line.split(' ')[1].strip()
+            #print('Going to:', self.registers[self.range(src)[0]], src)
+            popped = self.registers[self.range(src)[0]][-1]
+            self.registers[self.range(src)[0]] = self.registers[self.range(src)[0]][:-1]
+            return popped
+        except:
+            return 0
+
+    def scratch(self, line):
+        try:
+            tokens = self.tokenize(line)
+            if '_load' in tokens[0]:
+                dst = tokens[1]
+                src = tokens[3]+tokens[4]
+            else:
+                src = tokens[2]
+                dst = tokens[3]+tokens[4]
+            self.registers[dst] = self.registers[src]
+        except:
+            pass
+
+    def move(self, line):
+        try:
+            tokens = self.tokenize(line)
+            if tokens[2][0] in ['s', 'd'] and tokens[1][0] in ['s', 'd']:
+                self.registers[self.range(tokens[1])[0]] = deepcopy(self.registers[self.range(tokens[2])[0]])
+        except:
+            pass
+
+    def updatelane(self, line):
+        tokens = self.tokenize(line)
+        try:
+            if 'v_readlane' in tokens[0]:
+                self.registers[tokens[1]].append(self.registers[tokens[2]][int(tokens[3])][-1])
+                self.registers[tokens[2]][int(tokens[3])] = self.registers[tokens[2]][int(tokens[3])][:-1]
+            elif 'v_writelane' in tokens[0]:
+                self.registers[tokens[1]][int(tokens[3])].append(self.registers[tokens[2]][-1])
+                self.registers[tokens[2]] = self.registers[tokens[2]][-STACK_SIZE_LIMIT:]
+        except Exception as e:
+            pass
+
+def try_match_swapped(insts, code, i, line):  # True when tokens i, i+1 match code rows line, line+1 in swapped order
+    return insts[i+1][1] == code[line][1] and insts[i][1] == code[line+1][1]
+
+FORK_NAMES = 1
+class CachedInst:
+    def __init__(self, inst, as_line):
+        self.inst_type = inst
+        self.as_line = as_line
+        self.forks = None
+
+class Fork:
+    def __init__(self):
+        global FORK_NAMES
+        self.insts = []
+        self.data = None
+        self.name = FORK_NAMES
+        FORK_NAMES += 1
+        #print('Created new fork: ', self.name)
+
+def move_down_fork(fork, insts, i): #def move_down_fork(fork : Fork, insts : list, i : int):
+    N = min(len(insts), len(fork.insts))
+
+    while i < N:
+        if insts[i][1] == fork.insts[i].inst_type:
+            i += 1
+        elif i<N-1 and insts[i+1][1] == fork.insts[i].inst_type and insts[i][1] == fork.insts[i+1].inst_type:
+            i += 2
+        else:
+            #print('Failed at', i, insts[i])
+            return False, i
+
+    if len(fork.insts) < len(insts):
+        #print('Failed at the end at', i, insts[i])
+        return False, i
+
+    #print('Reached end of ', fork.name)
+    return True, i
+
+FORK_TREE = Fork()  # root of the module-wide cache searched by fromDict() and filled by stitch()
+
+def fromDict(insts):
+    i = 0
+    N = len(insts)
+    cur_fork = FORK_TREE
+    #print('Getting from dict')
+    while i < N:
+        tillEnd, final_pos = move_down_fork(cur_fork, insts, i)
+        if tillEnd:
+            #print('Reached end')
+            return True, cur_fork
+
+        i += final_pos
+        #print('Got fpos:', i, 'of', len(insts))
+
+        if i >= len(cur_fork.insts):
+            return False, cur_fork
+
+        last_inst = cur_fork.insts[i]
+        if last_inst.forks is None:
+            last_inst.forks = []
+
+        bMatchFork = False
+        for fork in last_inst.forks:
+            if fork.insts[0].inst_type == insts[0][1]:
+                #print('Found match fork', fork.name)
+                cur_fork = fork
+                bMatchFork = True
+                break
+        if not bMatchFork:
+            cur_fork = Fork()
+            last_inst.forks.append(cur_fork)
+            return False, cur_fork
+
+    print('Warning: Reached end of loop!')
+    return False, cur_fork
+
+
+def stitch(insts, raw_code, jumps, gfxv):  # pair each trace token in insts with a row of raw_code
+    bGFX9 = gfxv == 'vega'
+    result, i, line, loopCount, N = [], 0, 0, defaultdict(int), len(insts)
+    # Returns (result, loopCount, mem_unroll, flight_count, maxline, len(insts)); result rows are inst + (source line,).
+    SMEM_INST = []  # scalar memory
+    VLMEM_INST = [] # vector memory load
+    VSMEM_INST = [] # vector memory store
+    FLAT_INST = []
+    NUM_SMEM = 0
+    NUM_VLMEM = 0
+    NUM_VSMEM = 0
+    NUM_FLAT = 0
+    skipped_immed = 0  # IMMED tokens accepted without a matching code row (non-GFX9 path below)
+
+    mem_unroll = []  # [source line, pending memory ops taken off the lists there]
+    flight_count = []  # [as_line[-1], ops in flight, wait count] per s_waitcnt counter handled
+
+    labels = {}  # label name -> index in code of the next row appended after the label
+    jump_map = [0]  # raw_code index -> index in code of the last row kept so far
+    code = [raw_code[0]]  # raw_code with comments stripped and category-100 rows removed
+    for c in raw_code[1:]:
+        c = list(c)
+        c[0] = c[0].split(';')[0].split('//')[0].strip()  # drop trailing ';' and '//' comments
+
+        if c[1] != 100:  # 100 is DONT_KNOW
+            code.append(c)
+        elif ':' in c[0]:
+            labels[c[0].split(':')[0]] = len(code)
+        jump_map.append(len(code)-1)
+
+    reverse_map = []  # index in code -> first raw_code index that maps to it
+    for k, v in enumerate(jump_map):
+        if v >= len(reverse_map):
+            reverse_map.append(k)
+
+    jumps = {jump_map[j]+1: j for j in jumps}  # key: code index just past raw row j; value: j
+
+    smem_ordering = 0
+    vlmem_ordering = 0
+    vsmem_ordering = 0
+
+    watchlist = RegisterWatchList(labels=labels)
+
+    num_failed_stitches = 0
+    loops = 0
+    maxline = 0
+
+    dict_sucess, current_fork = fromDict(insts)  # reuse a cached result for an already-seen token sequence
+    if dict_sucess:
+        result, loopCount, mem_unroll, flight_count, maxline = current_fork.data
+        result = [r+(asm[-1],) for r, asm in zip(insts, result)]  # attach the cached source line to each current token
+        return result, loopCount, mem_unroll, flight_count, maxline, len(insts)
+
+    while i < N:
+        loops += 1
+        if line >= len(code) or loops > MAX_STITCHED_TOKENS or num_failed_stitches > MAX_FAILED_STITCHES:
+            break
+
+        maxline = max(reverse_map[line], maxline)
+        inst = insts[i]
+        as_line = code[line]
+
+        matched = True
+        next = line+1  # NOTE: shadows the builtin next() inside this function
+
+        if '_mov_' in as_line[0]:
+            watchlist.move(as_line[0])
+        elif 'scratch_' in as_line[0]:
+            watchlist.scratch(as_line[0])
+
+        if as_line[1] == GETPC:
+            watchlist.getpc(as_line[0], code[line+1][0])  # NOTE(review): code[line+1] is out of range on the last row
+            matched = inst[1] in [SALU, JUMP]
+        elif as_line[1] == LANEIO:
+            watchlist.updatelane(as_line[0])
+            matched = inst[1] == VALU
+        elif as_line[1] == SETPC:
+            next = watchlist.setpc(as_line[0])  # setpc() returns 0 when it cannot resolve the target
+            matched = inst[1] in [SALU, JUMP]
+        elif as_line[1] == SWAPPC:
+            next = watchlist.swappc(as_line[0], line)  # swappc() returns 0 when it cannot resolve the target
+            matched = inst[1] in [SALU, JUMP]
+        elif inst[1] == as_line[1]:  # same category: update the memory-op and waitcnt bookkeeping
+            if line in jumps:
+                loopCount[jumps[line]-1] += 1
+            num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM
+
+            if inst[1] == SMEM or inst[1] == LDS:
+                smem_ordering = 1 if inst[1] == SMEM else smem_ordering
+                SMEM_INST.append([reverse_map[line],  num_inflight])
+                NUM_SMEM += 1
+            elif inst[1] == VMEM or (inst[1] == FLAT and 'global_' in as_line[0]):
+                inc_ordering = False
+                if 'buffer_' in as_line[0] or 'flat_' in as_line[0]:
+                    inc_ordering = True
+
+                if bGFX9 or 'load' in as_line[0]:  # on GFX9 every op in this branch goes to the load list
+                    VLMEM_INST.append([reverse_map[line],  num_inflight])
+                    NUM_VLMEM += 1
+                    if inc_ordering:
+                        vlmem_ordering = 1
+                else:
+                    VSMEM_INST.append([reverse_map[line],  num_inflight])
+                    NUM_VSMEM += 1
+                    if inc_ordering:
+                        vsmem_ordering = 1
+            elif inst[1] == FLAT:
+                smem_ordering = 1
+                vlmem_ordering = 1
+                vsmem_ordering = 1
+                FLAT_INST.append([reverse_map[line],  num_inflight])
+                NUM_FLAT += 1
+            elif inst[1] == IMMED and 's_waitcnt ' in as_line[0]:
+                if 'lgkmcnt' in as_line[0]:
+                    wait_N = int(as_line[0].split('lgkmcnt(')[1].split(')')[0])
+                    flight_count.append([as_line[-1], num_inflight, wait_N])
+                    if wait_N == 0:
+                        smem_ordering = 0
+                    if smem_ordering == 0:
+                        offset = len(SMEM_INST)-wait_N
+                        mem_unroll.append( [reverse_map[line], SMEM_INST[:offset]+FLAT_INST] )
+                        SMEM_INST = SMEM_INST[offset:]
+                        NUM_SMEM = len(SMEM_INST)
+                        FLAT_INST = []
+                        NUM_FLAT = 0
+                    else:
+                        NUM_SMEM = min(max(wait_N-NUM_FLAT, 0), NUM_SMEM)
+                        NUM_FLAT = min(max(wait_N-NUM_SMEM, 0), NUM_FLAT)
+                    num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM
+
+                if 'vmcnt' in as_line[0]:
+                    wait_N = int(as_line[0].split('vmcnt(')[1].split(')')[0])
+                    flight_count.append([as_line[-1], num_inflight, wait_N])
+                    if wait_N == 0:
+                        vlmem_ordering = 0
+                    if vlmem_ordering == 0:
+                        offset = len(VLMEM_INST)-wait_N
+                        mem_unroll.append( [reverse_map[line], VLMEM_INST[:offset]+FLAT_INST] )
+                        VLMEM_INST = VLMEM_INST[offset:]
+                        NUM_VLMEM = len(VLMEM_INST)
+                        FLAT_INST = []
+                        NUM_FLAT = 0
+                    else:
+                        NUM_VLMEM = min(max(wait_N-NUM_FLAT, 0), NUM_VLMEM)
+                        NUM_FLAT = min(max(wait_N-NUM_VLMEM, 0), NUM_FLAT)
+                    num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM
+
+                if 'vscnt' in as_line[0] or (bGFX9 and 'vmcnt' in as_line[0]):  # on GFX9 vmcnt is applied to the store list too
+                    try:
+                        wait_N = int(as_line[0].split('vscnt(')[1].split(')')[0])
+                    except:
+                        wait_N = int(as_line[0].split('vmcnt(')[1].split(')')[0])
+                    flight_count.append([as_line[-1], num_inflight, wait_N])
+                    if wait_N == 0:
+                        vsmem_ordering = 0
+                    if vsmem_ordering == 0:
+                        offset = len(VSMEM_INST)-wait_N
+                        mem_unroll.append( [reverse_map[line], VSMEM_INST[:offset]+FLAT_INST] )
+                        VSMEM_INST = VSMEM_INST[offset:]
+                        NUM_VSMEM = len(VSMEM_INST)
+                        FLAT_INST = []
+                        NUM_FLAT = 0
+                    else:
+                        NUM_VSMEM = min(max(wait_N-NUM_FLAT, 0), NUM_VSMEM)
+                        NUM_FLAT = min(max(wait_N-NUM_VSMEM, 0), NUM_FLAT)
+                    num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM
+
+        elif inst[1] == JUMP and as_line[1] == BRANCH:  # JUMP token on a branch row: continue at the branch target
+            next = jump_map[as_line[2]]  # as_line[2] is used as a raw_code index
+            if next is None or next == 0:
+                print('Jump to unknown location!', as_line)
+                break
+        elif inst[1] == NEXT and as_line[1] == BRANCH:  # NEXT token on a branch row: fall through
+            next = line + 1
+        else:
+            matched = False
+            next = line + 1
+            if i+1 < N and line+1 < len(code):
+                if try_match_swapped(insts, code, i, line):  # swap the two tokens in place and retry this row
+                    temp = insts[i]
+                    insts[i] = insts[i+1]
+                    insts[i+1] = temp
+                    next = line
+                elif 's_waitcnt ' in as_line[0] or '_load_' in as_line[0]:
+                    if skipped_immed > 0 and 's_waitcnt ' in as_line[0]:
+                        matched = True
+                        skipped_immed -= 1
+                    else:
+                        print('Parsing terminated at:', as_line)
+                        break
+
+        if matched:
+            result.append(inst + (reverse_map[line],))
+            i += 1
+            num_failed_stitches = 0
+        elif not bGFX9 and inst[1] == IMMED and line != next:  # non-GFX9: accept the unmatched IMMED token, stay on this row
+            skipped_immed += 1
+            result.append(inst + (reverse_map[line],))
+            next = line
+            i += 1
+        else:
+            num_failed_stitches += 1
+        line = next
+
+    N = max(N, 1)  # keeps the percentage below from dividing by zero
+    if len(result) != N:
+        print('Warning - Stitching rate: '+str(len(result) * 100 / N)+'% matched')
+        print('Leftovers:', [WaveInstCategory[insts[i+k][1]] for k in range(20) if i+k < len(insts)])
+        try:
+            print(line, code[line])
+        except:
+            pass
+    else:
+        while line < len(code):
+            if 's_endpgm' in code[line]:  # rows built above are lists, so this tests element equality, not substring
+                mem_unroll.append( [reverse_map[line], SMEM_INST+VLMEM_INST+VSMEM_INST+FLAT_INST] )
+                break
+            line += 1
+
+    current_fork.insts = [CachedInst(inst[1], inst[-1]) for inst in result]  # cache category + source line per token
+    current_fork.data = result, loopCount, mem_unroll, flight_count, maxline
+    return result, loopCount, mem_unroll, flight_count, maxline, len(insts)
diff --git a/plugin/att/trace_view.py b/plugin/att/trace_view.py
index 91277f930c..ae0f5ce474 100755
--- a/plugin/att/trace_view.py
+++ b/plugin/att/trace_view.py
@@ -3,16 +3,12 @@ import sys
 if sys.version_info[0] < 3:
     raise Exception("Must be using Python 3")
 
-
 import os
 import sys
 import time
 import socket
 from pathlib import Path
-from struct import *
 from collections import defaultdict
-import json
-import time
 import http.server
 import socketserver
 import socket
@@ -20,427 +16,13 @@ import asyncio
 import websockets
 from multiprocessing import Process, Manager
 import numpy as np
-from copy import deepcopy
 from http import HTTPStatus
 from io import BytesIO
-
-
-class Readable:
-    def __init__(self, jsonstring):
-        self.jsonstr = json.dumps(jsonstring)
-        self.seek = 0
-
-    def read(self, length=0):
-        if length<=0:
-            return self.jsonstr
-        else:
-            if self.seek >= len(self):
-                self.seek = 0
-                return None
-            response =  self.jsonstr[self.seek:self.seek+length]
-            self.seek += length
-            return bytes(response, 'utf-8')
-
-    def __len__(self):
-        return len(self.jsonstr)
-
-
-MAX_STITCHED_TOKENS = 10000000
-MAX_FAILED_STITCHES = 256
-STACK_SIZE_LIMIT = 64
-
-UNKNOWN = 0
-SMEM = 1
-SALU = 2
-VMEM = 3
-FLAT = 4
-LDS = 5
-VALU = 6
-JUMP = 7
-NEXT = 8
-IMMED = 9
-BRANCH = 10
-GETPC = 11
-SETPC = 12
-SWAPPC = 13
-LANEIO = 14
-DONT_KNOW = 100
-
-WaveInstCategory = {
-    UNKNOWN: "UNKNOWN",
-    SMEM: "SMEM",
-    SALU: "SALU",
-    VMEM: "VMEM",
-    FLAT: "FLAT",
-    LDS: "LDS",
-    VALU: "VALU",
-    JUMP: "JUMP",
-    NEXT: "NEXT",
-    IMMED: "IMMED",
-    JUMP: "JUMP",
-    NEXT: "NEXT",
-    IMMED: "IMMED",
-    BRANCH: "BRANCH",
-    GETPC: "GETPC",
-    SETPC: "SETPC",
-    SWAPPC: "SWAPPC",
-    LANEIO: "LANEIO",
-    DONT_KNOW: "DONT_KNOW",
-}
+from drawing import Readable, GeneratePIC
+from copy import deepcopy
 
 JSON_GLOBAL_DICTIONARY = {}
 
-
-class RegisterWatchList:
-    def __init__(self, labels):
-        self.registers = {'v'+str(k): [[] for m in range(64)] for k in range(64)}
-        for k in range(64):
-            self.registers['s'+str(k)] = []
-        self.labels = labels
-
-    def try_translate(self, tok):
-        if tok[0] in ['s']:
-            return self.registers[self.range(tok)[0]]
-        elif '@' in tok:
-            return self.labels[tok.split('@')[0]]+1
-
-    def range(self, r):
-        reg = r.split(':')
-        if len(reg) == 1:
-            return reg
-        else:
-            r0 = reg[0].split('[')
-            return [r0[0]+str(k) for k in range(int(r0[1]), int(reg[1][:-1])+1)]
-
-    def tokenize(self, line):
-        return [u for u in [t.split(',')[0].strip() for t in line.split(' ')] if len(u) > 0]
-
-    def getpc(self, line, next_line):
-        #print('Get pc:', line)
-        try:
-            dst = line.split(' ')[1].strip()
-            label_dest = next_line.split(', ')[-1].split('@')[0]
-            for reg in self.range(dst):
-                self.registers[reg].append(deepcopy(self.labels[label_dest]))
-        except:
-            pass
-
-    def swappc(self, line, line_num):
-        try:
-            tokens = self.tokenize(line)
-            dst = tokens[1]
-            src = tokens[2]
-
-            popped = self.registers[self.range(src)[0]][-1]
-            self.registers[self.range(src)[0]] = self.registers[self.range(src)[0]][:-1]
-            self.registers[self.range(dst)[0]].append(line_num+1)
-            return popped
-        except:
-            return 0
-
-    def setpc(self, line):
-        try:
-            src = line.split(' ')[1].strip()
-            #print('Going to:', self.registers[self.range(src)[0]], src)
-            popped = self.registers[self.range(src)[0]][-1]
-            self.registers[self.range(src)[0]] = self.registers[self.range(src)[0]][:-1]
-            return popped
-        except:
-            return 0
-
-    def scratch(self, line):
-        try:
-            tokens = self.tokenize(line)
-            if '_load' in tokens[0]:
-                dst = tokens[1]
-                src = tokens[3]+tokens[4]
-            else:
-                src = tokens[2]
-                dst = tokens[3]+tokens[4]
-            self.registers[dst] = self.registers[src]
-        except:
-            pass
-
-    def move(self, line):
-        try:
-            tokens = self.tokenize(line)
-            if tokens[2][0] in ['s', 'd'] and tokens[1][0] in ['s', 'd']:
-                self.registers[self.range(tokens[1])[0]] = deepcopy(self.registers[self.range(tokens[2])[0]])
-        except:
-            pass
-
-    def updatelane(self, line):
-        tokens = self.tokenize(line)
-        try:
-            if 'v_readlane' in tokens[0]:
-                self.registers[tokens[1]].append(self.registers[tokens[2]][int(tokens[3])][-1])
-                self.registers[tokens[2]][int(tokens[3])] = self.registers[tokens[2]][int(tokens[3])][:-1]
-            elif 'v_writelane' in tokens[0]:
-                self.registers[tokens[1]][int(tokens[3])].append(self.registers[tokens[2]][-1])
-                self.registers[tokens[2]] = self.registers[tokens[2]][-STACK_SIZE_LIMIT:]
-        except Exception as e:
-            pass
-
-
-def try_match_swapped(insts, code, i, line):
-    return insts[i+1][1] == code[line][1] and insts[i][1] == code[line+1][1]
-
-
-def Match(inst_value, code_value):
-    if code_value == inst_value:
-        return True
-    if code_value in [GETPC, SWAPPC, SETPC] and inst_value in [SALU, JUMP]:
-        return True
-    if code_value == BRANCH and inst_value in [JUMP, NEXT]: # TODO: Maybe lets not reorder branches?
-        return True
-    return False
-
-
-def get_match_lookahead(insts, code, i, line):
-    if try_match_swapped(insts, code, i, line):
-        return [i+1, i]
-    new_inst_order = []
-
-    allowed_insts = list(range(i, min(i+4, len(insts))))
-    for l in range(line, min(line+10, len(code))):
-        bMatch = False
-        for j in allowed_insts:
-            if Match(insts[j][1], code[l][1]):
-                new_inst_order.append(j)
-                allowed_insts.remove(j)
-                bMatch = True
-                break
-        if bMatch == False:
-            break
-    if len(new_inst_order):
-        new_inst_order += [j for j in list(range(i, max(new_inst_order)+1)) if j not in new_inst_order]
-    return new_inst_order
-
-
-def stitch(insts, raw_code, jumps, gfxv):
-    bGFX9 = gfxv == 'vega'
-    result, i, line, loopCount, N = [], 0, 0, defaultdict(int), len(insts)
-
-    SMEM_INST = []  # scalar memory
-    VLMEM_INST = [] # vector memory load
-    VSMEM_INST = [] # vector memory store
-    FLAT_INST = []
-    NUM_SMEM = 0
-    NUM_VLMEM = 0
-    NUM_VSMEM = 0
-    NUM_FLAT = 0
-    skipped_immed = 0
-
-    mem_unroll = []
-    flight_count = []
-
-    labels = {}
-    jump_map = [0]
-    code = [raw_code[0]]
-    for c in raw_code[1:]:
-        c = list(c)
-        c[0] = c[0].split(';')[0].split('//')[0].strip()
-
-        if c[1] != 100:
-            code.append(c)
-        elif ':' in c[0]:
-            labels[c[0].split(':')[0]] = len(code)
-        jump_map.append(len(code)-1)
-
-    reverse_map = []
-    for k, v in enumerate(jump_map):
-        if v >= len(reverse_map):
-            reverse_map.append(k)
-
-    jumps = {jump_map[j]+1: j for j in jumps}
-
-    smem_ordering = 0
-    vlmem_ordering = 0
-    vsmem_ordering = 0
-    max_line = 0
-
-    watchlist = RegisterWatchList(labels=labels)
-
-    num_failed_stitches = 0
-    loops = 0
-    maxline = 0
-
-    while i < N:
-        #print('L', line)
-        loops += 1
-        if line >= len(code) or loops > MAX_STITCHED_TOKENS or num_failed_stitches > MAX_FAILED_STITCHES:
-            break
-
-        maxline = max(reverse_map[line], maxline)
-        inst = insts[i]
-
-        as_line = code[line]
-        max_line = max(max_line, reverse_map[line])
-
-        matched = True
-        next = line+1
-
-        if '_mov_' in as_line[0]:
-            watchlist.move(as_line[0])
-        elif 'scratch_' in as_line[0]:
-            watchlist.scratch(as_line[0])
-
-        if as_line[1] == GETPC: # TODO: @ can put you ahead of label!
-            watchlist.getpc(as_line[0], code[line+1][0])
-            matched = inst[1] in [SALU, JUMP]
-        elif as_line[1] == LANEIO:
-            watchlist.updatelane(as_line[0])
-            matched = inst[1] == VALU
-        elif as_line[1] == SETPC:
-            next = watchlist.setpc(as_line[0])
-            matched = inst[1] in [SALU, JUMP]
-        elif as_line[1] == SWAPPC:
-            next = watchlist.swappc(as_line[0], line)
-            #print('Next:', next, code[next])
-            matched = inst[1] in [SALU, JUMP]
-        elif inst[1] == as_line[1]:
-            if line in jumps:
-                loopCount[jumps[line]-1] += 1  # label is the previous line
-            num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM
-
-            if inst[1] == SMEM or inst[1] == LDS:
-                smem_ordering = 1 if inst[1] == SMEM else smem_ordering
-                SMEM_INST.append([reverse_map[line],  num_inflight])
-                NUM_SMEM += 1
-            elif inst[1] == VMEM or (inst[1] == FLAT and 'global_' in as_line[0]):
-                inc_ordering = False
-                if 'buffer_' in as_line[0] or 'flat_' in as_line[0]:
-                    inc_ordering = True
-
-                if bGFX9 or 'load' in as_line[0]:
-                    VLMEM_INST.append([reverse_map[line],  num_inflight])
-                    NUM_VLMEM += 1
-                    if inc_ordering:
-                        vlmem_ordering = 1
-                else:
-                    VSMEM_INST.append([reverse_map[line],  num_inflight])
-                    NUM_VSMEM += 1
-                    if inc_ordering:
-                        vsmem_ordering = 1
-            elif inst[1] == FLAT:
-                smem_ordering = 1
-                vlmem_ordering = 1
-                vsmem_ordering = 1
-                FLAT_INST.append([reverse_map[line],  num_inflight])
-                NUM_FLAT += 1
-            elif inst[1] == IMMED and 's_waitcnt ' in as_line[0]:
-                if 'lgkmcnt' in as_line[0]:
-                    wait_N = int(as_line[0].split('lgkmcnt(')[1].split(')')[0])
-                    flight_count.append([as_line[-1], num_inflight, wait_N])
-                    if wait_N == 0:
-                        smem_ordering = 0
-                    if smem_ordering == 0:
-                        offset = len(SMEM_INST)-wait_N
-                        mem_unroll.append( [reverse_map[line], SMEM_INST[:offset]+FLAT_INST] )
-                        SMEM_INST = SMEM_INST[offset:]
-                        NUM_SMEM = len(SMEM_INST)
-                        FLAT_INST = []
-                        NUM_FLAT = 0
-                    else:
-                        NUM_SMEM = min(max(wait_N-NUM_FLAT, 0), NUM_SMEM)
-                        NUM_FLAT = min(max(wait_N-NUM_SMEM, 0), NUM_FLAT)
-                    num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM
-
-                if 'vmcnt' in as_line[0]:
-                    wait_N = int(as_line[0].split('vmcnt(')[1].split(')')[0])
-                    flight_count.append([as_line[-1], num_inflight, wait_N])
-                    if wait_N == 0:
-                        vlmem_ordering = 0
-                    if vlmem_ordering == 0:
-                        offset = len(VLMEM_INST)-wait_N
-                        mem_unroll.append( [reverse_map[line], VLMEM_INST[:offset]+FLAT_INST] )
-                        VLMEM_INST = VLMEM_INST[offset:]
-                        NUM_VLMEM = len(VLMEM_INST)
-                        FLAT_INST = []
-                        NUM_FLAT = 0
-                    else:
-                        NUM_VLMEM = min(max(wait_N-NUM_FLAT, 0), NUM_VLMEM)
-                        NUM_FLAT = min(max(wait_N-NUM_VLMEM, 0), NUM_FLAT)
-                    num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM
-
-                if 'vscnt' in as_line[0] or (bGFX9 and 'vmcnt' in as_line[0]):
-                    try:
-                        wait_N = int(as_line[0].split('vscnt(')[1].split(')')[0])
-                    except:
-                        wait_N = int(as_line[0].split('vmcnt(')[1].split(')')[0])
-                    flight_count.append([as_line[-1], num_inflight, wait_N])
-                    if wait_N == 0:
-                        vsmem_ordering = 0
-                    if vsmem_ordering == 0:
-                        offset = len(VSMEM_INST)-wait_N
-                        mem_unroll.append( [reverse_map[line], VSMEM_INST[:offset]+FLAT_INST] )
-                        VSMEM_INST = VSMEM_INST[offset:]
-                        NUM_VSMEM = len(VSMEM_INST)
-                        FLAT_INST = []
-                        NUM_FLAT = 0
-                    else:
-                        NUM_VSMEM = min(max(wait_N-NUM_FLAT, 0), NUM_VSMEM)
-                        NUM_FLAT = min(max(wait_N-NUM_VSMEM, 0), NUM_FLAT)
-                    num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM
-
-        elif inst[1] == JUMP and as_line[1] == BRANCH:
-            next = jump_map[as_line[2]]
-            if next is None or next == 0:
-                print('Jump to unknown location!', as_line)
-                break
-        elif inst[1] == NEXT and as_line[1] == BRANCH:
-            next = line + 1
-        else:
-            matched = False
-            next = line + 1
-            if i+1 < N and line+1 < len(code):
-                #print('Swap:', try_match_swapped(insts, code, i, line))
-                if try_match_swapped(insts, code, i, line):
-                    temp = insts[i]
-                    insts[i] = insts[i+1]
-                    insts[i+1] = temp
-                    next = line
-                elif 's_waitcnt ' in as_line[0] or '_load_' in as_line[0]:
-                    if skipped_immed > 0 and 's_waitcnt ' in as_line[0]:
-                        matched = True
-                        skipped_immed -= 1
-                    else:
-                        print('Parsing terminated at:', as_line)
-                        break
-
-        #print(matched, WaveInstCategory[inst[1]], WaveInstCategory[as_line[1]], as_line, inst)
-        #print([WaveInstCategory[insts[i+k][1]] for k in range(20) if i+k < len(insts)])
-        if matched:
-            result.append(inst + (reverse_map[line],))
-            i += 1
-            num_failed_stitches = 0
-        elif not bGFX9 and inst[1] == IMMED and line != next:
-            skipped_immed += 1
-            result.append(inst + (reverse_map[line],))
-            next = line
-            i += 1
-        else:
-            num_failed_stitches += 1
-        line = next
-
-    N = max(N, 1)
-    if len(result) != N:
-        print('Warning - Stitching rate: '+str(len(result) * 100 / N)+'% matched')
-        print('Leftovers:', [WaveInstCategory[insts[i+k][1]] for k in range(20) if i+k < len(insts)])
-        try:
-            print(line, code[line])
-        except:
-            pass
-    else:
-        while line < len(code):
-            if 's_endpgm' in code[line]:
-                mem_unroll.append( [reverse_map[line], SMEM_INST+VLMEM_INST+VSMEM_INST+FLAT_INST] )
-                break
-            line += 1
-
-    return result, loopCount, mem_unroll, flight_count, maxline
-
-
 def get_ip():
     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
     s.settimeout(0)
@@ -459,28 +41,10 @@ PORT, WebSocketPort = 8000, 18000
 SP = '\u00A0'
 
 
-def extract_tuple(content, num):
-    vals = content.split(',')
-    assert (len(vals) == num)
-    last_val = vals[-1][:-1] if vals[-1].endswith(')') else vals[-1]
-    vals = [vals[0][1:]] + vals[1:-1] + [last_val]
-    return tuple(int(val) for val in vals)
-
-
-def get_top_n(stitched):
+def get_top_n(code):
     TOP_N = 10
-    by_line_num = defaultdict(lambda: [0, 0, 0])
-    for (_, _, s2i, run_time, line_num) in stitched:
-        entry = by_line_num[line_num]
-        entry[0] += 1
-        entry[1] += s2i
-        entry[2] += run_time
-    top_n = sorted(
-        [(line_num, v[0], v[1], v[2])
-         for (line_num, v) in by_line_num.items()],
-        key=lambda x: x[2] + x[3],
-        reverse=True)
-    return top_n[:TOP_N]
+    top_n = sorted(deepcopy(code), key=lambda x: x[-1], reverse=True)[:TOP_N]
+    return [(line_num, hitc, 0, run_time) for _, _, _, _, line_num, _, hitc, run_time in top_n]
 
 
 def wave_info(df, id):
@@ -498,74 +62,28 @@ def wave_info(df, id):
     return dic
 
 
-def extract_waves(waves):
-    result, slot2seq = [], {}
-    for id in waves['id']:
-        row = {key: waves[key][id] for key in waves.keys()}
-
-        insts, timeline = [], []
-        for x in row['instructions'].split('),'):
-            if len(x) > 0:
-                insts.append(extract_tuple(x, 4))
-        for x in row['timeline'].split('),'):
-            if len(x) > 0:
-                timeline.append(extract_tuple(x, 2))
-
-        # aggregate per wave slot
-        if (row['simd'], row['wave_slot']) in slot2seq:
-            slot = result[slot2seq[(row['simd'], row['wave_slot'])]]
-            last_end_time = slot[2][-1][-1]
-            slot[2] += (row['id'], row['begin_time'], row['end_time']),
-            slot[3] += insts
-            # filler between waves
-            slot[4] += (0, row['begin_time'] - last_end_time),
-            slot[4] += timeline
-        else:
-            slot2seq[row['simd'], row['wave_slot']] = len(result)
-            result.append([row['simd'], row['wave_slot'],
-                           [(row['id'], row['begin_time'], row['end_time'])],
-                           insts,
-                           timeline])
-
-    return result
-
-
-def extract_data(df, se_number, code, jumps, gfxv):
+def extract_data(df, se_number):
     if len(df['id']) == 0 or len(df['instructions']) == 0 or len(df['timeline']) == 0:
         return None
 
-    cu_waves = extract_waves(df)
     wave_filenames = []
     flight_count = []
-    maxgrade = [{df['wave_slot'][wave_id]: -1 for wave_id in df['id']} for k in range(4)]
-    non_stitched = [{df['wave_slot'][wave_id]: -1 for wave_id in df['id']} for k in range(4)]
-
+    wave_slot_count = [{df['wave_slot'][wave_id]: 0 for wave_id in df['id']} for k in range(4)]
+    
     print('Number of waves:', len(df['id']))
     allwaves_maxline = 0
 
     for wave_id in df['id']:
-        if non_stitched[df['simd'][wave_id]][df['wave_slot'][wave_id]] == 0:
-            continue
-        insts, timeline = [], []
-        if len(df['instructions'][wave_id]) == 0 or len(df['timeline'][wave_id]) == 0:
-            continue
+        stitched, loopCount, mem_unroll, count, maxline, num_insts = df['instructions'][wave_id]
+        timeline = df['timeline'][wave_id]
 
-        for x in df['instructions'][wave_id].split('),'):
-            insts.append(extract_tuple(x, 4))
-        for x in df['timeline'][wave_id].split('),'):
-            timeline.append(extract_tuple(x, 2))
-
-        stitched, loopCount, mem_unroll, count, maxline = stitch(insts, code, jumps, gfxv)
-        srate = len(stitched)**2 / max(len(insts), 1)
-        if srate <= maxgrade[df['simd'][wave_id]][df['wave_slot'][wave_id]]:
+        if len(stitched) == 0 or len(timeline) == 0 or len(stitched) != num_insts:
             continue
 
         allwaves_maxline = max(allwaves_maxline, maxline)
-        maxgrade[df['simd'][wave_id]][df['wave_slot'][wave_id]] = srate
-        non_stitched[df['simd'][wave_id]][df['wave_slot'][wave_id]] = len(insts) - len(stitched)
         flight_count.append(count)
-        
-        wave_entry = {  
+
+        wave_entry = {
             "id": int(df['id'][wave_id]),
             "simd": int(df['simd'][wave_id]),
             "slot": int(df['wave_slot'][wave_id]),
@@ -578,33 +96,36 @@ def extract_data(df, se_number, code, jumps, gfxv):
         }
         data_obj = {
             "name": 'SE'.format(se_number),
-            "kernel": code[0][0],
             "duration": sum(dur for (_, dur) in timeline),
             "wave": wave_entry,
             "loop_count": loopCount,
-            "top_n": get_top_n(stitched),
+            "top_n": [],
+            "num_stitched": len(stitched),
+            "num_insts": num_insts,
             "websocket_port": WebSocketPort,
             "generation_time": time.ctime()
         }
 
-        OUT = 'se'+str(se_number)+'_sm'+str(df['simd'][wave_id])+'_wv'+str(df['wave_slot'][wave_id])+'.json'
+        simd_id = df['simd'][wave_id]
+        slot_id = df['wave_slot'][wave_id]
+        slot_count = wave_slot_count[simd_id][slot_id]
+        wave_slot_count[simd_id][slot_id] += 1
+
+        OUT = 'se'+str(se_number)+'_sm'+str(simd_id)+'_sl'+str(slot_id)+'_wv'+str(slot_count)+'.json'
         JSON_GLOBAL_DICTIONARY[OUT] = Readable(data_obj)
-        wave_filenames.append(OUT)
+        wave_filenames.append((OUT, df['begin_time'][wave_id], df['end_time'][wave_id]))
 
     data_obj = {
         "name": 'SE'.format(se_number),
-        "kernel": code[0][0],
-        "simd_waves": [],
-        "cu_waves": cu_waves,
-        "code": code[:allwaves_maxline+16],
         "websocket_port": WebSocketPort,
         "generation_time": time.ctime()
     }
-    se_filename = 'se'+str(se_number)+'_code.json'
+    se_filename = None
     if len(wave_filenames) > 0:
+        se_filename = 'se'+str(se_number)+'_info.json'
         JSON_GLOBAL_DICTIONARY[se_filename] = Readable(data_obj)
 
-    return flight_count, wave_filenames, se_filename
+    return flight_count, wave_filenames, se_filename, allwaves_maxline
 
 
 class NoCacheHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
@@ -618,19 +139,18 @@ class NoCacheHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
         self.send_header("Expires", "0")
 
     def do_GET(self):
-        global PICTURE_CALLBACK
-        if 'timeline.png?' in self.path:
-            selections = [int(s)!=0 for s in self.path.split('timeline.png?')[1]]
-            counters_json, imagebytes, _, _ = PICTURE_CALLBACK(selections[1:], selections[0])
-            JSON_GLOBAL_DICTIONARY['counters.json'] = counters_json
-            JSON_GLOBAL_DICTIONARY[self.path.split('/')[-1]] = imagebytes
+        if '.png?' in self.path and self.path.split('/')[-1] not in JSON_GLOBAL_DICTIONARY.keys():
+            selections = [int(s)!=0 for s in self.path.split('.png?')[-1]]
+            counters_json, imagebytes = GeneratePIC(self.drawinfo, selections[1:], selections[0])
+            JSON_GLOBAL_DICTIONARY['graph_options.json'] = counters_json
+            JSON_GLOBAL_DICTIONARY[self.path.split('/')[-1]] = imagebytes[self.path.split('/')[-1].split('?')[0]]
 
-        if '.json' in self.path or 'timeline.png' in self.path or 'wstates' in self.path:
+        if '.json' in self.path or '.png' in self.path:
             try:
                 response_file = JSON_GLOBAL_DICTIONARY[self.path.split('/')[-1]]
-                #print(response_file)
             except:
                 print('Invalid json request:', self.path)
+                print(JSON_GLOBAL_DICTIONARY.keys())
                 self.send_error(HTTPStatus.NOT_FOUND, "File not found")
                 return
             self.send_response(HTTPStatus.OK)
@@ -658,9 +178,11 @@ class RocTCPServer(socketserver.TCPServer):
         self.socket.bind(self.server_address)
 
 
-def run_server():
+def run_server(drawinfo):
     Handler = NoCacheHTTPRequestHandler
-    os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),'ui'))
+    Handler.drawinfo = drawinfo
+    os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),'ui/'))
+    #os.chdir('ui/')
     try:
         with RocTCPServer((IPAddr, PORT), Handler) as httpd:
             httpd.serve_forever()
@@ -676,7 +198,6 @@ def fix_space(line):
 
 def WebSocketserver(websocket, path):
     data = websocket.recv()
-    print(354, data)
     cpp, ln, _ = data.split(':')
     ln = int(ln)
     HL, EMP = 'highlight', ''
@@ -713,68 +234,87 @@ def assign_ports(ports):
     PORT, WebSocketPort = ps[0], ps[1]
 
 
-def call_picture_callback(return_dict):
-    global PICTURE_CALLBACK
-    response, imagebytes, wstates, counter_events = PICTURE_CALLBACK()
-    return_dict['counters.json'] = response
-    return_dict['timeline.png'] = imagebytes
-    for n, m in enumerate(wstates):
+def call_picture_callback(return_dict, drawinfo):  # fills return_dict with every served file derived from drawinfo
+    response, imagebytes = GeneratePIC(drawinfo)
+    return_dict['graph_options.json'] = response
+    for k, v in imagebytes.items():  # one entry per image returned by GeneratePIC, stored under the key it supplies
+        return_dict[k] = v
+
+    for n, m in enumerate(drawinfo['TIMELINES']):  # one wstates<n>.json per timeline
         return_dict['wstates'+str(n)+'.json'] = Readable({"data": [int(n) for n in list(np.asarray(m))]})
-    for n, e in enumerate(counter_events):
+    for n, e in enumerate(drawinfo['EVENTS']):  # one se<n>_perfcounter.json per event list
         return_dict['se'+str(n)+'_perfcounter.json'] = Readable({"data": [v.toTuple() for v in e]})
 
 
-def view_trace(args, code, jumps, dbnames, att_filenames, bReturnLoc, pic_callback, OCCUPANCY, bDumpOnly, se_time_begin, gfxv):
-    global PICTURE_CALLBACK
-    PICTURE_CALLBACK = pic_callback
-    manager = Manager()
-    return_dict = manager.dict()
-    JSON_GLOBAL_DICTIONARY['occupancy.json'] = Readable({str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))})
+def view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpOnly, se_time_begin, gfxv, drawinfo, MPI_COMM, mpi_root):
+    global JSON_GLOBAL_DICTIONARY
+    pic_thread = None
+    if mpi_root:
+        manager = Manager()
+        return_dict = manager.dict()
+        JSON_GLOBAL_DICTIONARY['occupancy.json'] = Readable({str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))})
+        pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo))
+        pic_thread.start()
 
-    pic_thread = Process(target=call_picture_callback, args=(return_dict,))
-    pic_thread.start()
-
-    assert(len(dbnames) > 0)
     att_filenames = [Path(f).name for f in att_filenames]
     se_numbers = [int(a.split('_se')[1].split('.att')[0]) for a in att_filenames]
     flight_count = []
     simd_wave_filenames = {}
     se_filenames = []
 
+    allse_maxline = 0
     for se_number, dbname in zip(se_numbers, dbnames):
         if len(dbname['id']) == 0:
             continue
 
-        count, wv_filenames, se_filename = extract_data(dbname, se_number, code, jumps, gfxv)
+        count, wv_filenames, se_filename, maxline = extract_data(dbname, se_number)
+        if se_filename is None:
+            continue
+        allse_maxline = max(allse_maxline, maxline)
         se_filenames.append(se_filename)
 
         if count is not None:
             flight_count.append(count)
             simd_wave_filenames[se_number] = wv_filenames
 
+    if mpi_root:
+        JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code[:allse_maxline+16], "top_n": get_top_n(code[:allse_maxline+16])})
+
     if bReturnLoc:
         return flight_count
 
     for key in simd_wave_filenames.keys():
         wv_array = [[
-            int(s.split('_sm')[1].split('_wv')[0]),
-            int(s.split('_wv')[1].split('.')[0]),
+            int(s[0].split('_sm')[1].split('_sl')[0]),
+            int(s[0].split('_sl')[1].split('_wv')[0]),
+            int(s[0].split('_wv')[1].split('.')[0]),
             s
         ] for s in simd_wave_filenames[key]]
 
         wv_dict = {}
         for wv in wv_array:
             try:
-                wv_dict[wv[0]][wv[1]] = wv[2]
+                wv_dict[wv[0]][wv[1]][wv[2]] = wv[3]
             except:
                 try:
-                    wv_dict[wv[0]] = {wv[1]: wv[2]}
+                    wv_dict[wv[0]][wv[1]] = {wv[2]: wv[3]}
                 except:
-                    exit(-1)
+                    try:
+                        wv_dict[wv[0]] = {wv[1]: {wv[2]: wv[3]}}
+                    except:
+                        pass
 
         simd_wave_filenames[key] = wv_dict
 
-    JSON_GLOBAL_DICTIONARY['filenames.json'] = Readable({"wave_filenames": simd_wave_filenames,
+    if MPI_COMM is not None:
+        se_filenames = MPI_COMM.gather(se_filenames, root=0)
+        simd_wave_filenames = MPI_COMM.gather(simd_wave_filenames, root=0)
+        if mpi_root:
+            se_filenames = [e for elem in se_filenames for e in elem]
+            simd_wave_filenames = {k:v for smf in simd_wave_filenames for k,v in smf.items()}
+
+    if mpi_root:
+        JSON_GLOBAL_DICTIONARY['filenames.json'] = Readable({"wave_filenames": simd_wave_filenames,
                                                         "se_filenames": se_filenames,
                                                         "global_begin_time": int(se_time_begin),
                                                         "gfxv": gfxv})
@@ -785,11 +325,18 @@ def view_trace(args, code, jumps, dbnames, att_filenames, bReturnLoc, pic_callba
             JSON_GLOBAL_DICTIONARY[k] = v
 
     if bDumpOnly == False:
+        if MPI_COMM is not None:
+            JSON_GLOBAL_DICTIONARY = MPI_COMM.gather(JSON_GLOBAL_DICTIONARY, root=0)
+            if not mpi_root:
+                quit()
+            JSON_GLOBAL_DICTIONARY = {k:v for smf in JSON_GLOBAL_DICTIONARY for k,v in smf.items()}
+
+        JSON_GLOBAL_DICTIONARY['live.json'] = Readable({'live': 1})
         if args.ports:
             assign_ports(args.ports)
         print('serving at ports: {0},{1}'.format(PORT, WebSocketPort))
         try:
-            PROCS = [Process(target=run_server), Process(target=run_websocket)]
+            PROCS = [Process(target=run_server, args=[drawinfo]), Process(target=run_websocket)]
             for p in PROCS:
                 p.start()
             for p in PROCS:
@@ -797,8 +344,10 @@ def view_trace(args, code, jumps, dbnames, att_filenames, bReturnLoc, pic_callba
         except KeyboardInterrupt:
             print("Exitting.")
     else:
-        os.makedirs('ui', exist_ok=True)
-        os.system('cp ' + os.path.join(os.path.abspath(os.path.dirname(__file__)),'ui') + '/* ui/' )
+        os.makedirs('ui/', exist_ok=True)
+        if mpi_root:
+            JSON_GLOBAL_DICTIONARY['live.json'] = Readable({'live': 0})
+            os.system('cp ' + os.path.join(os.path.abspath(os.path.dirname(__file__)),'ui') + '/* ui/' )
         for k, v in JSON_GLOBAL_DICTIONARY.items():
             with open(os.path.join('ui',k), 'w' if '.json' in k else 'wb') as f:
                 f.write(v.read())
diff --git a/plugin/att/ui/httpserver.py b/plugin/att/ui/httpserver.py
index b8a821b4cc..8e75b7be9f 100644
--- a/plugin/att/ui/httpserver.py
+++ b/plugin/att/ui/httpserver.py
@@ -20,8 +20,8 @@ class NoCacheHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
         self.send_header("Expires", "0")
 
     def do_GET(self):
-        if 'timeline.png?' in self.path:
-            self.path = 'timeline.png'
+        if '.png?' in self.path:
+            self.path = self.path.split('.png?')[0]+'.png'
 
         http.server.SimpleHTTPRequestHandler.do_GET(self)
 
diff --git a/plugin/att/ui/index.html b/plugin/att/ui/index.html
index 7dc12e15f8..c6418511ef 100644
--- a/plugin/att/ui/index.html
+++ b/plugin/att/ui/index.html
@@ -3,16 +3,19 @@
 	<link href="data:image/x-icon;base64,AAABAAEAEBAQAAAAAAAoAQAAFgAAACgAAAAQAAAAIAAAAAEABAAAAAAAgAAAAAAAAAAAAAAAEAAAAAAAAADc6sMA////AG2nAAD4+vMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAERERERERERESIiIhERERMRIiIiIRERMhEiIiIiEREiESIiIiIhEiIRIiIREREiIhEiIhERESIiERIiERERIiIRESIREREiIhEREhERESIiERERERERIiIREREiIiIiIhEREiIiIiIiERAiIiIiIiIRAiIiIiIiIhEREREREREREAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" rel="icon" type="image/x-icon" />
 	<link rel="stylesheet" href="styles.css">
 	<head>
-		<title>MI Trace Viewer</title>
+		<title>ATT Analysis View</title>
 	</head>
 
 	<body>
-	<div id="Images"> 
+	<div id="Images">
 		<div id="padding" style="height:4px"></div>
 
-		<div><img src="logo.svg"/> </div>
-
 		<div style="order: 1px solid lightblue; overflow:auto; width: calc(min(100% - 350px, 1400px));">
+			<div class="tab">
+				<button class="tablinks" onclick="showImage('timeline.png')">Wave States</button>
+				<button class="tablinks" onclick="showImage('occupancy.png')">Occupancy</button>
+				<button class="tablinks" onclick="showImage('counters.png')" id="counterspng_button">Counters</button>
+			</div>
 			<img id="GraphImage" src=timeline.png width=100%>
 		</div>
 
@@ -26,13 +29,14 @@
 	</div>
 
 	<div id="padding" style="height: 10px"></div>
-	<div id="Buttons" style="overflow:auto; max-width: calc(100% - 370px); min-height: 120px; max-height: calc(480px - 20vw); z-index: 9999;">
+	<div id="Buttons" display="inline-block" style="overflow:scroll; max-width: calc(100% - 370px); height: 150px; z-index: 9999;">
 		<div id="GH_select"></div>
 		<div id="SE_select"></div>
 		<div id="SM_select"></div>
-		<div id="WV_select"></div>
+		<div id="WSL_select"></div>
+		<div id="WID_select"></div>
 	</div>
-	<div id="map" style="position: absolute; top:460px; width: 100%;">
+	<div id="map" style="position: absolute; top:570px; width: 100%;">
 		<div id="flexbox">
 			<div id="logo">
 				<div id="what"></div>
@@ -43,9 +47,9 @@
 	</div>
 	<div id="minimap"></div>
 	<div id="ma_code">
-		<ul id="code" style="position: absolute; top:610px; left:390px"></ul>
+		<ul id="code" style="position: absolute; top:720px; left:390px"></ul>
 	</div>
-	<canvas id="arrows" width="400px" height="500px" style="position: absolute; top:630px; left:1px;"></canvas>
+	<canvas id="arrows" width="400px" height="500px" style="position: absolute; top:740px; left:1px;"></canvas>
 
 	<script src="https://cdn.jsdelivr.net/npm/d3@7.0.0/dist/d3.min.js"></script>
 	<script>
@@ -159,21 +163,63 @@
 
 		setInterval(DrawCanvas, 200)
 
-		function WaveButtonHtml(index) {
-			return '<button class="btn" id="wv_button' + index + '" onclick="FetchNamesAndGather(' + index + ')">Wave'+index+'</button>\n'
+		var dropdowns_open = [[], [], [], []]
+		// Hide every dropdown registered in dropdowns_open except the group at position `index`.
+		function closeAllDropDownsExcept(index) {
+			for (var i in dropdowns_open) {
+				// for...in yields string keys ("0", "1", ...); convert both sides before the strict comparison
+				if (Number(i) === Number(index)) continue;
+				for (var drop in dropdowns_open[i]) {
+					dropdowns_open[i][drop].classList.remove('show')  // no-op when the class is not set
+				}
+			}
+		}
-		function SEButonHtml(index) {
-			return '<button class="btn" id="se_button' + index + '" onclick="OpenSIMDView(' + index + ')">Shader'+index+'</button>\n'
+		function dropDownSE() {
+			closeAllDropDownsExcept(0);
+			document.getElementById("SE_dropdown").classList.toggle("show");
 		}
-		function SIMDButonHtml(index) {
-			return '<button class="btn" id="sm_button' + index + '" onclick="OpenWaveView(' + index + ')">SIMD'+index+'</button>\n'
+		function dropDownSM() {
+			closeAllDropDownsExcept(1);
+			document.getElementById("SM_dropdown").classList.toggle("show");
 		}
-		function GraphButtonHtml(index, name) {
-			return '<input type="checkbox" id="gh_button' + index + '" onclick="UpdGraph(this, '+index+')" checked=true>'+name
+		function dropDownWSL() {
+			closeAllDropDownsExcept(2);
+			document.getElementById("WSL_dropdown").classList.toggle("show");
+		}
+		function dropDownWID() {
+			closeAllDropDownsExcept(3);
+			document.getElementById("WID_dropdown").classList.toggle("show");
 		}
 
+		// Build the HTML for one selector button: the element id is id+index and a click calls onc_func(index).
+		function ButtonHtml(id, onc_func, name, index) {
+			// Indices up to 16 are labelled with the name prefix; larger ones show the bare index
+			var label = (index <= 16) ? name + index : index
+			var open_tag = '<button class="btn" id="' + id + index + '" onclick="' + onc_func + '(' + index + ')">'
+			var close_tag = '</button>'
+			return open_tag + label + close_tag
+		}
+		function WaveButtonHtml(index) {
+			return ButtonHtml("wid_button", "FetchNamesAndGather", "ID", index)
+		}
+		function SEButonHtml(index) {
+			return ButtonHtml("se_button", "OpenSIMDView", "Shader", index)
+		}
+		function SIMDButonHtml(index) {
+			return ButtonHtml("sm_button", "OpenWSLView", "SIMD", index)
+		}
+		function WSLButonHtml(index) {
+			return ButtonHtml("wsl_button", "OpenWIDView", "Slot", index)
+		}
+		function GraphButtonHtml(index, name) {
+			return '\n<input type="checkbox" id="gh_button' + index + '" onclick="UpdGraph(this, '+index+')" checked=true>'+name
+		}
+
+		var global_imagename = "timeline.png"
 		var graph_selected_counters = {};
-		function UpdImageSrc() {
+		function UpdImageSrc(imgname) {
+			if (imgname != null)
+				global_imagename = imgname
 			var endstr = document.getElementById("btn_norm").checked ? "1" : "0"
 			for(var key in graph_selected_counters) {
 				if(graph_selected_counters[key])
@@ -181,16 +227,15 @@
 				else
 					endstr += "0"
 			}
-			console.log(graph_selected_counters)
-			console.log('Updated to', endstr)
-			document.getElementById("GraphImage").src = "timeline.png?" + endstr
+			console.log(graph_selected_counters, 'Updated to', endstr)
+			document.getElementById("GraphImage").src = global_imagename + "?" + endstr
 		}
 
 		function UpdGraph(checkbox, index) {
 			graph_selected_counters[index] = checkbox.checked
 			document.getElementById('gh_button'+index).style.backgroundColor
 				= graph_selected_counters[name] ? "white" : "#D7D7D7"
-			UpdImageSrc()
+			UpdImageSrc(global_imagename)
 		}
 
 		var HTML_MAC = document.getElementById("ma_code").innerHTML
@@ -198,12 +243,14 @@
 		var HTML_MINI = document.getElementById("minimap").innerHTML
 		var HTML_IMAG = document.getElementById("Images").innerHTML
 		var SE_BTN_HTML = ""
-		var WV_BTN_HTML = ""
 		var SM_BTN_HTML = ""
+		var WSL_BTN_HTML = ""
+		var WID_BTN_HTML = ""
 
 		var current_SE = 0
 		var current_SM = 0
-		var current_WV = 0
+		var current_WSL = 0
+		var current_WID = 0
 		var filename_data = {}
 		var clock_scale = 1
 
@@ -214,83 +261,182 @@
 			filename_data = data.wave_filenames
 
 			wave_cu_index = {};
+			SE_BTN_HTML = 	'<div class="dropdown">\
+							<button onclick="dropDownSE()" class="dropbtn" id="SE_BTN_DROP">Shader:</button>\
+							<div id="SE_dropdown" class="dropdown-content">'
 			for(var i in filename_data) {
 				SE_BTN_HTML = SE_BTN_HTML + SEButonHtml(i)
 			}
+			SE_BTN_HTML += '</div></div>'
 
 			document.getElementById("ma_code").innerHTML = ""
 			document.getElementById("map").innerHTML = ""
 			document.getElementById("minimap").innerHTML = ""
 			document.getElementById('SE_select').innerHTML = SE_BTN_HTML
 			document.getElementById('SM_select').innerHTML = ""
-			document.getElementById('WV_select').innerHTML = ""
+			document.getElementById('WSL_select').innerHTML = ""
+			document.getElementById('WID_select').innerHTML = ""
 
 			for(var se in filename_data)
 			for(var sm in filename_data[se])
-			for(var wv in filename_data[se][sm]) {
+			for(var wsl in filename_data[se][sm])
+			for(var wid in filename_data[se][sm][wsl]) {
 				OpenSIMDView(se)
-				OpenWaveView(sm)
-				FetchNamesAndGather(wv)
+				OpenWSLView(sm)
+				OpenWIDView(wsl)
+				FetchNamesAndGather(wid)
 				return
 			}
-			//FetchNamesAndGather(0)
 		})
 
 		function OpenSIMDView(se_index) {
 			if(document.getElementById('se_button'+current_SE) != null)
 				document.getElementById('se_button'+current_SE).style.backgroundColor = "#D7D7D7"
-			current_SE = se_index
 			document.getElementById('se_button'+se_index).style.backgroundColor = "white"
+			document.getElementById("SE_BTN_DROP").textContent = 'Shader: ' + se_index
+			current_SE = se_index
 
-			SM_BTN_HTML = ""
+			SM_BTN_HTML = 	'<div class="dropdown">\
+							<button onclick="dropDownSM()" class="dropbtn" id="SM_BTN_DROP">SIMD</button>\
+							<div id="SM_dropdown" class="dropdown-content">'
 			for(var i in filename_data[current_SE]) {
 				SM_BTN_HTML = SM_BTN_HTML + SIMDButonHtml(i)
 			}
+			SM_BTN_HTML += '</div></div>'
 
 			document.getElementById('SM_select').innerHTML = SM_BTN_HTML
-			document.getElementById('WV_select').innerHTML = ""
+			document.getElementById('WSL_select').innerHTML = ""
+			document.getElementById('WID_select').innerHTML = ""
 		}
-		function OpenWaveView(sm_index) {
+		function OpenWSLView(sm_index) {
 			if(document.getElementById('sm_button'+current_SM) != null)
 				document.getElementById('sm_button'+current_SM).style.backgroundColor = "#D7D7D7"
-			current_SM = sm_index
 			document.getElementById('sm_button'+sm_index).style.backgroundColor = "white"
+			document.getElementById("SM_BTN_DROP").textContent = 'SIMD: ' + sm_index
+			current_SM = sm_index
 
-			WV_BTN_HTML = ""
+			WSL_BTN_HTML = 	'<div class="dropdown">\
+							<button onclick="dropDownWSL()" class="dropbtn" id="WSL_BTN_DROP">WaveSlot</button>\
+							<div id="WSL_dropdown" class="dropdown-content">'
 			for(var i in filename_data[current_SE][current_SM]) {
-				WV_BTN_HTML = WV_BTN_HTML + WaveButtonHtml(i)
+				WSL_BTN_HTML = WSL_BTN_HTML + WSLButonHtml(i)
 			}
+			WSL_BTN_HTML += '</div></div>'
 
-			document.getElementById('WV_select').innerHTML = WV_BTN_HTML
+			document.getElementById('WSL_select').innerHTML = WSL_BTN_HTML
+			document.getElementById('WID_select').innerHTML = ""
+		}
+		function OpenWIDView(sl_index) {  // select wave slot sl_index and rebuild the WaveID dropdown for it
+			if(document.getElementById('wsl_button'+current_WSL) != null)  // grey out the previously selected slot button, if it exists
+				document.getElementById('wsl_button'+current_WSL).style.backgroundColor = "#D7D7D7"
+			document.getElementById('wsl_button'+sl_index).style.backgroundColor = "white"  // highlight the new selection
+			document.getElementById("WSL_BTN_DROP").textContent = 'WaveSlot: ' + sl_index
+			current_WSL = sl_index
+
+			WID_BTN_HTML = 	'<div class="dropdown">\
+							<button onclick="dropDownWID()" class="dropbtn" id="WID_BTN_DROP">WaveID</button>\
+							<div id="WID_dropdown" class="dropdown-content">'
+			for(var i in filename_data[current_SE][current_SM][current_WSL]) {  // one button per wave recorded for this SE/SIMD/slot
+				WID_BTN_HTML = WID_BTN_HTML + WaveButtonHtml(i)
+			}
+			WID_BTN_HTML += '</div></div>'  // closes the dropdown-content and dropdown divs opened above
+
+			document.getElementById('WID_select').innerHTML = WID_BTN_HTML
+		}
+
+		// Disable the graph option checkboxes unless live.json reports live === 1
+		// (the dump-only path of att.py writes live: 0, the serving path writes live: 1).
+		function ApplyLiveImage() {
+			fetch("live.json", {cache: "no-store"}).then(response => response.json()).then(data => {
+				if (data.live === 1)
+					return;
+				document.getElementById("btn_norm").disabled = true;
+				// Select every graph checkbox by id prefix instead of probing indices until a lookup throws
+				document.querySelectorAll('input[id^="gh_button"]').forEach(box => { box.disabled = true })
+			})
+		}
+
+		// Rebuild the graph option checkboxes for `imgname` from graph_options.json and display that image.
+		function showImage(imgname) {
+			fetch("graph_options.json", {cache: "no-store"}).then(response => response.json()).then(data => {
+				// Bail out when the options file has no entry for this image (or is not an object at all)
+				if (data == null || data[imgname] === undefined) {
+					console.log("Invalid data:", imgname)
+					return
+				}
+				const options = data[imgname]
+				const pieces = ['<input type="checkbox" id="btn_norm" onclick="UpdImageSrc(null)" checked=false>Normalize\t']
+				for (var key in options) {
+					graph_selected_counters[key] = true
+					pieces.push(GraphButtonHtml(key, options[key]))
+				}
+				document.getElementById("GH_select").innerHTML = pieces.join('')
+				// The checked=false attribute above still renders as ticked, so clear it through the property
+				document.getElementById("btn_norm").checked = false
+				ApplyLiveImage()
+				UpdImageSrc(imgname)
+			})
+		}
 
 		function FetchNamesAndGather(wave_index) {
-			if(document.getElementById('wv_button'+current_WV) != null)
-				document.getElementById('wv_button'+current_WV).style.backgroundColor = "#D7D7D7"
-			current_WV = wave_index
-			document.getElementById('wv_button'+wave_index).style.backgroundColor = "white"
+			if(document.getElementById('wid_button'+current_WID) != null)
+				document.getElementById('wid_button'+current_WID).style.backgroundColor = "#D7D7D7"
+			document.getElementById("WID_BTN_DROP").textContent = 'WaveID: ' + wave_index
+			document.getElementById('wid_button'+wave_index).style.backgroundColor = "white"
+			current_WID = wave_index
 
 			document.getElementById("ma_code").innerHTML = HTML_MAC
 			document.getElementById("map").innerHTML = HTML_MAP
 			document.getElementById("minimap").innerHTML = HTML_MINI
 			document.getElementById("Images").innerHTML = HTML_IMAG
 
-			fetch("counters.json", {cache: "no-store"}).then(response => response.json()).then(data => {
-				var html_gh = '<input type="checkbox" id="btn_norm" onclick="UpdImageSrc()" checked=true>Normalize\t'
-				for(var key in data.counters) {
-					console.log(key, data.counters[key])
-					graph_selected_counters[key] = true
-					html_gh += GraphButtonHtml(key, data.counters[key])
-				}
+			showImage('timeline.png')
+			console.log('SE:',current_SE,' sm:', current_SM, 'wsl:', current_WSL, 'wid:',current_WID)
+			console.log('Fetch', filename_data[current_SE][current_SM][current_WSL][current_WID])
 
-				document.getElementById("GH_select").innerHTML = html_gh
-				UpdImageSrc()
+			//GatherData(filename_data[current_SE][current_SM][current_WV][0][0])
+
+			fetch("graph_options.json", {cache: "no-store"}).then(response => response.json()).then(data => {
+				try { if (data["counters.png"] === undefined) throw "disabled"; }
+				catch { document.getElementById("counterspng_button").disabled = true; }
 			})
 
-			console.log('SE:',current_SE,' sm:', current_SM, 'wv:', current_WV)
-			console.log('Fetch', filename_data[current_SE][current_SM][current_WV])
+			GatherCUWavesData(filename_data[current_SE][current_SM][current_WSL][current_WID])
+		}
 
-			GatherData(filename_data[current_SE][current_SM][current_WV])
+		var cuwaves_data = []
+		function GatherCUWavesData(wave_to_gather) {  // wave_to_gather: [filename, begin_time, end_time] entry taken from filename_data
+			shader = filename_data[current_SE]  // NOTE(review): assigned without var/let, so this and the names below become globals
+			file_to_gather = wave_to_gather[0]
+			wave_start = wave_to_gather[1]
+			wave_end = wave_to_gather[2]
+
+			wavelist = []
+			for (var sm in shader)
+			for (var wsl in shader[sm])
+			for (var wid in shader[sm][wsl]) {
+				wv = shader[sm][wsl][wid]
+				if (wv[1] < wave_end && wv[2] > wave_start)  // keep only waves whose [begin, end] interval overlaps the selected wave
+					wavelist.push([wv[0], wv[1], wv[2], sm, wsl, wid])
+			}
+			cuwaves_data = []  // reset the shared accumulator before LoopOverList refills it
+			LoopOverList(file_to_gather, wavelist, 0)
+		}
+		// Fetch wavelist[index..] one file at a time into cuwaves_data (one entry per simd/slot run), then call GatherData(file_to_gather).
+		function LoopOverList(file_to_gather, wavelist, index) {
+			if (index >= wavelist.length) { GatherData(file_to_gather); return }
+			const wave_sel = wavelist[index]  // [filename, begin, end, simd, slot, wave id]; local so an overlapping chain cannot overwrite it
+			fetch(wave_sel[0]).then(response => response.json()).then(data => {
+				const last = cuwaves_data[cuwaves_data.length-1]
+				if (last === undefined || last[0] != wave_sel[3] || last[1] != wave_sel[4]) {
+					cuwaves_data.push([wave_sel[3], wave_sel[4], [wave_sel[5], wave_sel[1], wave_sel[2]], data.wave.instructions, data.wave.timeline])
+				} else {
+					// Array.prototype.concat returns a new array and leaves the receiver untouched, so store the result back
+					last[3] = last[3].concat(data.wave.instructions)
+					last[4] = last[4].concat(data.wave.timeline)
+				}
+				LoopOverList(file_to_gather, wavelist, index+1)
+			})
+		}
 
 		function GatherData(file_to_gather) {
@@ -300,11 +446,9 @@
 			fetch(file_to_gather, {cache: "no-store"})
 				.then(response => response.json())
 				.then(data => {
-				code_data_file = file_to_gather.split('_sm')[0]+'_code.json'
 				console.log("Requestd:", file_to_gather)
-				console.log("Request code:", code_data_file)
 
-				fetch(code_data_file, {cache: "no-store"})
+				fetch('code.json', {cache: "no-store"})
 				.then(response => response.json())
 				.then(code_data => {
 					const SP = '\u00A0'
@@ -465,7 +609,7 @@
 							const CU = cu_waves_div.append('svg')
 								.attr('id', 'CU')
 								.attr('width', WIDTH)
-								.attr('height', code_data.cu_waves.length * CU_HEIGHT + MARGIN)
+								.attr('height', cuwaves_data.length * CU_HEIGHT + MARGIN)
 								.append('g')
 							CU.append('rect')
 								.attr('x', 0)
@@ -521,7 +665,7 @@
 					
 					d3.select('#top_n')
 						.selectAll("li")
-						.data(data.top_n)
+						.data(code_data.top_n)
 						.enter()
 						.append('li')
 						.datum((d) => { return {data:d} })
@@ -542,7 +686,6 @@
 								}, 800)
 								const inView = isInViewport(src_line)
 								if (!inView) {
-									//console.log('scrolling to source line', line_num)
 									src_line.scrollIntoView({behavior: "smooth", block: "start"})
 								}
 							}
@@ -637,32 +780,8 @@
 
 					canvas_waitcnt = data.wave.waitcnt
 
-					/*var all_nodes = d3.select("#code")
-						.selectAll("li")
-						.nodes()
-					
-					console.log(all_nodes.length)
-					console.log(all_nodes[0])
-					for(node in all_nodes) {
-						console.log(node.getBoundingClientRect().top)
-					}
-
-					d3.select("#code")
-						.selectAll("li")
-						.append('svg')
-						.attr('width', 10)
-						.attr('height', 10)
-						.append('rect')
-						.attr('x', 0)
-						.attr('y', 0)
-						.attr('rx', 2)
-						.attr('ry', 2)
-						.attr('width', 10)
-						.attr('height', 10)
-						.attr('fill', 'blue') */
-
 					const START_TIME = insts[0][0]
-					const DURATION = data.duration
+					const DURATION = data.duration * clock_scale
 					const END_TIME = START_TIME + DURATION
 
 					const NUM_BINS = 20
@@ -738,7 +857,7 @@
 					const MARGIN = 30
 					const PADDING = 3
 					const scaleX = d3.scaleLinear()
-						.domain([START_TIME, START_TIME + data.duration + MARGIN])
+						.domain([START_TIME, START_TIME + data.duration * clock_scale + MARGIN])
 						.range([MARGIN, WIDTH-MARGIN])
 					const toX = (x) => { return Math.ceil(scaleX(x)*clock_scale) }
 
@@ -915,13 +1034,13 @@
 					const show_simd = () => {
 						SIMD.append("rect")
 							.attr("width", WIDTH)
-							.attr("height", HEIGHT * code_data.simd_waves.length)
+							.attr("height", HEIGHT * wave_info_data.simd_waves.length)
 							.attr("x", 0)
 							.attr("y", HEIGHT)
 							.attr("fill", "black")
 							.attr("opacity", 0.3)
 						let current_height = HEIGHT
-						code_data.simd_waves.forEach((wave, i) => {
+						wave_info_data.simd_waves.forEach((wave, i) => {
 							// wave: (simd, slot, [(id, start, end)+], instructions, timeline)
 							let [simd, slot, waves] = [wave[0], wave[1], wave[2]]
 							let ins_in_range = wave[3].filter((x) => {
@@ -943,7 +1062,7 @@
 								.text(simd + '-' + slot)
 								.style("cursor", "pointer")
 								.append("svg:title")
-								.text(waves2str(waves))
+								.text(waves2str([waves]))
 
 							SLOT.selectAll("rect")
 								.data(ins_in_range)
@@ -961,7 +1080,7 @@
 									return INST_TYPE[d[1]][0] + ":" + d[0] + SP + "slot:" + slot
 								})
 							SLOT.selectAll("path")
-								.data(states2timeline(clamp_timeline(wave[4], wave[2][0][1])))
+								.data(states2timeline(clamp_timeline(wave[4], wave[2][1])))
 								.enter()
 								.append('path')
 								.style("cursor", "pointer")
@@ -981,7 +1100,7 @@
 					const show_cu = () => {
 						const CU = d3.select('#CU')
 						let current_height = 10
-						code_data.cu_waves.forEach((wave, i) => {
+						cuwaves_data.forEach((wave, i) => {
 							// wave: (simd, slot, [(id, start, end)+], instructions, timeline)
 							let [simd, slot, waves] = [wave[0], wave[1], wave[2]]
 							let ins_in_range = wave[3].filter((x) => {
@@ -1012,7 +1131,7 @@
 								.text(simd + '-' + slot)
 								.style("cursor", "pointer")
 								.append("svg:title")
-								.text(waves2str(waves))
+								.text(waves2str([waves]))
 
 							SLOT.selectAll("rect")
 								.data(ins_in_range)
@@ -1032,7 +1151,7 @@
 										"slot:" + simd + '-' + slot
 								})
 							SLOT.selectAll("path")
-								.data(states2timeline(clamp_timeline(wave[4], wave[2][0][1])))
+								.data(states2timeline(clamp_timeline(wave[4], wave[2][1])))
 								.enter()
 								.append('path')
 								.style("cursor", "pointer")
@@ -1044,7 +1163,6 @@
 								.attr("stroke-width", 4)
 								.append("svg:title")
 								.text((d) => { return STATE_COLOR[d[0]][0] + ":" + d[1]})
-
 							current_height += CU_HEIGHT
 						})
 
@@ -1056,10 +1174,8 @@
 							})
 						}
 					}
-
 				})
-				}
-			)
+			})
 		}
 	</script>
 	</body>
diff --git a/plugin/att/ui/styles.css b/plugin/att/ui/styles.css
index 1949a608d9..396913b77d 100644
--- a/plugin/att/ui/styles.css
+++ b/plugin/att/ui/styles.css
@@ -103,4 +103,35 @@ li:hover .tooltip {
 
 .btn:hover {
 	color: blue;
-}
\ No newline at end of file
+}
+
+.dropbtn { /* button-like look for elements carrying the "dropbtn" class */
+	border: 2px solid black;
+	background-color: #D7D7D7;
+	color: black;
+	padding: 3px 4px;
+	font-size: 15px;
+	cursor: pointer;
+	border-style: ridge; /* replaces the "solid" style from the border shorthand above; its 2px width and black colour still apply */
+	border-radius: 4px;
+}
+
+.dropbtn:hover, .dropbtn:focus { /* pointer hover and keyboard focus share one highlight */
+	color: blue; /* same highlight colour as .btn:hover */
+}
+
+.dropdown {
+	position: relative; /* makes this element the containing block for absolutely positioned descendants */
+}
+
+.dropdown-content { /* drop-down panel; not rendered by default */
+	display: none;
+	position: absolute; /* NOTE(review): presumably nested in a .dropdown (position: relative) so it anchors there - confirm in index.html */
+	background-color: #e0e0f0;
+	min-width: 10px;
+	box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.1); /* soft shadow: black at 10% alpha */
+	z-index: 1;
+}
+
+.dropdown-content a:hover {background-color: #ddd;} /* grey highlight for a hovered link inside the panel */
+.show {display:inline-flex;} /* NOTE(review): presumably toggled onto .dropdown-content by script to reveal it; declared later with equal specificity, so it wins over display: none - confirm usage */