SWDEV-423898: Fixing issues with parallel kernels

Change-Id: I6726f3003af6036ba041c2b4bc5227dd08691090
This commit is contained in:
Giovanni LB
2023-09-25 14:54:00 -03:00
parent 7418c52cc8
commit 675e1b9d38
8 ha cambiato i file con 225 aggiunte e 449 eliminazioni
+1 -5
Vedi File
@@ -194,11 +194,7 @@ while [ 1 ]; do
ATT_ARGV="$ATT_ARGV $3 \"$4\""
shift
shift
elif [[ "$3" = "--mpi" ]]; then
ATT_PYTHON3_ARG="mpirun -np $4 python3 "
shift
shift
elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" = "--genasm" || "$3" == "--att_kernel" || "$3" == "--depth" ]]; then
elif [[ "$3" = "--mode" || "$3" = "--ports" || "$3" == "--att_kernel" ]]; then
ATT_ARGV="$ATT_ARGV $3 $4"
shift
shift
+73 -194
Vedi File
@@ -16,14 +16,7 @@ import glob
import numpy as np
from stitch import stitch
import gc
try:
from mpi4py import MPI
MPI_IMPORTED = True
except:
MPI_IMPORTED = False
from collections import defaultdict
class PerfEvent(ctypes.Structure):
_fields_ = [
@@ -130,6 +123,8 @@ class ReturnInfo(ctypes.Structure):
("occupancy", POINTER(ctypes.c_uint64)),
("num_occupancy", ctypes.c_uint64),
("flags", ctypes.c_uint64),
("kernel_id_addr", POINTER(ctypes.c_uint64)),
("num_kernel_ids", ctypes.c_uint64),
]
@@ -162,10 +157,15 @@ def parse_binary(filename, kernel=None):
info = SO.wrapped_parse_binary(str(filename).encode("utf-8"), kernel)
code = []
kernel_addr = defaultdict(lambda : "Unknown")
last_known_function = "Unknown"
for k in range(info.code_len):
code_entry = info.code[k]
line = deepcopy(code_entry.line.decode("utf-8"))
if "; Begin " in line:
last_known_function = line.split("; Begin ")[1]
loc = deepcopy(code_entry.loc.decode("utf-8"))
to_line = int(code_entry.to_line) if (code_entry.to_line >= 0) else None
@@ -175,31 +175,31 @@ def parse_binary(filename, kernel=None):
code.append([line, int(code_entry.value), to_line, loc, int(code_entry.index),
int(code_entry.line_num), int(code_entry.addr), 0, 0])
if code[-1][-3] != 0 and len(code) > 1:
kernel_addr[code[-1][-3]] = last_known_function
jumps = {}
for k in range(info.jumps_len):
jumps[info.jumps[k].key] = info.jumps[k].value
return code, jumps
return code, jumps, kernel_addr
def getWaves_binary(name, shader_engine_data_dict, target_cu, depth):
def getWaves_binary(name, shader_engine_data_dict, target_cu):
filename = os.path.abspath(str(name))
info = SO.AnalyseBinary(filename.encode("utf-8"), target_cu, False)
kernel_addr = [int(info.kernel_id_addr[k]) for k in range(info.num_kernel_ids)]
waves = [info.wavedata[k] for k in range(info.num_waves)]
events = [deepcopy(info.perfevents[k]) for k in range(info.num_events)]
occupancy = [int(info.occupancy[k]) for k in range(int(info.num_occupancy))]
flags = "navi" if (info.flags & 0x1) else "vega"
wave_slot_count = [[0 for k in range(20)] for j in range(4)]
waves_python = []
for wave in waves:
if (
wave_slot_count[wave.simd][wave.wave_id] >= depth
or wave.instructions_size == 0
):
if wave.instructions_size < 2:
continue
wave_slot_count[wave.simd][wave.wave_id] += 1
pwave = PythonWave(wave)
pwave.timeline = [
(wave.timeline_array[2 * k], wave.timeline_array[2 * k + 1])
@@ -210,16 +210,16 @@ def getWaves_binary(name, shader_engine_data_dict, target_cu, depth):
for k in range(wave.instructions_size)
]
waves_python.append(pwave)
shader_engine_data_dict[name] = (waves_python, events, occupancy, flags)
shader_engine_data_dict[name] = (waves_python, events, occupancy, flags, kernel_addr)
def getWaves_stitch(SIMD, code, jumps, flags, latency_map, hitcount_map, bIsAuto):
for pwave in SIMD:
pwave.instructions = stitch(pwave.instructions, code, jumps, flags, bIsAuto)
for inst in pwave.instructions[0]:
hitcount_map[inst[-1]] += 1
latency_map[inst[-1]] += inst[3]
if pwave.instructions is not None:
for inst in pwave.instructions[0]:
hitcount_map[inst[-1]] += 1
latency_map[inst[-1]] += inst[3]
def persist(trace_file, SIMD):
@@ -232,6 +232,8 @@ def persist(trace_file, SIMD):
smem_ins, smem_stalls, br_ins, br_taken_ins, br_stalls = [], [], [], [], []
for wave in SIMD:
if wave.instructions is None:
continue
simds.append(wave.simd)
waves.append(wave.wave_id)
begin_time.append(wave.begin_time)
@@ -344,50 +346,30 @@ def insert_waitcnt(flight_count, assembly_code):
return assembly_code
def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES):
for n, occ in enumerate(OCCUPANCY):
OCCUPANCY[n] = [
max(min(int((u >> 16) - min_event_time) << 16, 2**42), 0) | (u & 0xFFFFF)
for u in occ
]
for perf in EVENTS:
for p in perf:
p.time -= min_event_time
def gen_timelines(DBFILES):
TIMELINES = [np.zeros(int(1E6), dtype=np.float32) for k in range(5)]
TIME_RESOLUTION = 16
for df in DBFILES:
for T in range(len(df["timeline"])):
timeline = df["timeline"][T]
time_acc = 0
tuples3 = [(0, df["begin_time"][T] - min_event_time)] + [
(int(t[0]), int(t[1])) for t in timeline
]
tuples3 = [(0, df["begin_time"][T])] + [(int(t[0]), int(t[1])) for t in timeline]
for state in tuples3:
if state[1] > 1e8:
t_end = (time_acc + state[1])//TIME_RESOLUTION
if t_end > 1E8:
print("Warning: Time limit reached for ", state[0], state[1])
break
if time_acc + state[1] > TIMELINES[state[0]].size:
elif t_end > TIMELINES[state[0]].size:
TIMELINES[state[0]] = np.hstack(
[TIMELINES[state[0]], np.zeros_like(TIMELINES[state[0]])]
)
TIMELINES[state[0]][time_acc : time_acc + state[1]] += 1
TIMELINES[state[0]][time_acc//TIME_RESOLUTION : t_end] += 1
time_acc += state[1]
return TIMELINES
if __name__ == "__main__":
comm = None
mpi_root = True
if MPI_IMPORTED:
try:
comm = MPI.COMM_WORLD
if comm.Get_size() < 2:
comm = None
else:
mpi_root = comm.Get_rank() == 0
except:
print("Could not load MPI")
comm = None
pathenv = os.getenv("OUTPUT_PATH")
if pathenv is None:
pathenv = "."
@@ -395,9 +377,6 @@ if __name__ == "__main__":
parser.add_argument(
"assembly_code", help="Path to the assembly code. Must be the first parameter."
)
parser.add_argument(
"--depth", help="Maximum number of parsed waves per slot", default=100, type=int
)
parser.add_argument(
"--trace_file", help="Filter for trace files", default=None, type=str
)
@@ -405,12 +384,6 @@ if __name__ == "__main__":
"--att_kernel", help="Kernel file", type=str, default=pathenv + "/*_kernel.txt"
)
parser.add_argument("--ports", help="Server and websocket ports, default: 8000,18000")
parser.add_argument(
"--genasm",
help="Generate post-processed asm file at this path",
type=str,
default="",
)
parser.add_argument(
"--mode",
help="""ATT analysis modes:\n
@@ -455,22 +428,19 @@ if __name__ == "__main__":
print("Could not find att output kernel:", args.att_kernel)
exit(1)
elif len(att_kernel) > 1:
if mpi_root:
print("Found multiple kernel matching given filters:")
for n, k in enumerate(att_kernel):
print("\t", n, "->", k)
print("Found multiple kernel matching given filters:")
for n, k in enumerate(att_kernel):
print("\t", n, "->", k)
bValid = False
while bValid == False:
try:
args.att_kernel = att_kernel[int(input("Please select number: "))]
bValid = True
except KeyboardInterrupt:
exit(0)
except:
print("Invalid option.")
if comm is not None:
args.att_kernel = comm.bcast(args.att_kernel, root=0)
bValid = False
while bValid == False:
try:
args.att_kernel = att_kernel[int(input("Please select number: "))]
bValid = True
except KeyboardInterrupt:
exit(0)
except:
print("Invalid option.")
else:
args.att_kernel = att_kernel[0]
@@ -491,38 +461,31 @@ if __name__ == "__main__":
filenames = glob.glob(args.trace_file)
assert len(filenames) > 0
if comm is not None:
filenames = filenames[comm.Get_rank() :: comm.Get_size()]
code = jumps = None
if mpi_root:
print('Att kernel:', args.att_kernel)
code, jumps = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel)
print('Att kernel:', args.att_kernel)
code, jumps, kern_addr = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel)
DBFILES = []
TIMELINES = [np.zeros(int(1e4), dtype=np.int16) for k in range(5)]
EVENTS = []
OCCUPANCY = []
GFXV = []
analysed_filenames = []
occupancy_filenames = []
dispatch_kernel_names = {}
shader_engine_data_dict = {}
for name in filenames:
getWaves_binary(name, shader_engine_data_dict, args.target_cu, args.depth)
if comm is not None:
code = comm.bcast(code, root=0)
jumps = comm.bcast(jumps, root=0)
getWaves_binary(name, shader_engine_data_dict, args.target_cu)
gc.collect()
latency_map = np.zeros((len(code)), dtype=np.int64)
hitcount_map = np.zeros((len(code)), dtype=np.int32)
for name in filenames:
SIMD, perfevents, occupancy, gfxv = shader_engine_data_dict[name]
if len(occupancy) > 0:
SIMD, perfevents, occupancy, gfxv, addrs = shader_engine_data_dict[name]
for id, addr in enumerate(addrs):
dispatch_kernel_names[id] = kern_addr[addr]
if len(occupancy) > 16:
OCCUPANCY.append( occupancy )
occupancy_filenames.append( name )
occupancy_filenames.append(name)
if np.sum([0]+[len(s.instructions) for s in SIMD]) == 0:
print("No waves from", name)
continue
@@ -534,117 +497,33 @@ if __name__ == "__main__":
GFXV.append(gfxv)
gc.collect()
min_event_time = 2**62
for df in DBFILES:
if len(df["begin_time"]) > 0:
min_event_time = min(min_event_time, np.min(df["begin_time"]))
for perf in EVENTS:
for p in perf:
min_event_time = min(min_event_time, p.time)
for occ in OCCUPANCY:
min_event_time = min(min_event_time, np.min(np.array(occ) >> 16))
gc.collect()
min_event_time = max(0, min_event_time - 32)
if comm is not None:
min_event_time = comm.reduce(min_event_time, op=MPI.MIN)
min_event_time = comm.bcast(min_event_time, root=0)
apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES)
GFXV = comm.gather(GFXV, root=0)
EVENTS = comm.gather(EVENTS, root=0)
OCCUPANCY = comm.gather(OCCUPANCY, root=0)
TIMELINES = comm.gather(TIMELINES, root=0)
gather_latency_map = comm.gather(latency_map, root=0)
gather_hitcount_map = comm.gather(hitcount_map, root=0)
gathered_filenames = comm.gather(occupancy_filenames, root=0)
if mpi_root:
latency_map *= 0
hitcount_map *= 0
for hit, lat in zip(gather_hitcount_map, gather_latency_map):
hitcount_map += hit
latency_map += lat
EVENTS = [e for elem in EVENTS for e in elem]
OCCUPANCY = [e for elem in OCCUPANCY for e in elem]
gathered_filenames = [e for elem in gathered_filenames for e in elem]
gfxv = [e for elem in GFXV for e in elem][0]
TIMELINES_GATHER = TIMELINES
TIMELINES = [
np.zeros((np.max([len(tm[k]) for tm in TIMELINES])), np.int16)
for k in range(5)
]
for gather in TIMELINES_GATHER:
for t, m in zip(TIMELINES, gather):
t[: len(m)] += m
del TIMELINES_GATHER
else: # free up memory
TIMELINES = []
OCCUPANCY = []
EVENTS = []
else:
apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES)
gathered_filenames = occupancy_filenames
if mpi_root:
for k in range(len(code)):
code[k][-2] = int(hitcount_map[k])
code[k][-1] = int(latency_map[k])
for k in range(len(code)):
code[k][-2] = int(hitcount_map[k])
code[k][-1] = int(latency_map[k])
if CSV_MODE:
if mpi_root:
from att_to_csv import dump_csv
dump_csv(code)
from att_to_csv import dump_csv
dump_csv(code)
quit()
gc.collect()
print("Min time:", min_event_time)
drawinfo = {
"TIMELINES": TIMELINES,
"TIMELINES": gen_timelines(DBFILES),
"EVENTS": EVENTS,
"EVENT_NAMES": EVENT_NAMES,
"OCCUPANCY": OCCUPANCY,
"ShaderNames": gathered_filenames,
"ShaderNames": occupancy_filenames,
"DispatchNames": dispatch_kernel_names,
}
if args.genasm and len(args.genasm) > 0:
flight_count = view_trace(
args,
code,
DBFILES,
analysed_filenames,
True,
OCCUPANCY,
args.dumpfiles,
min_event_time,
gfxv,
drawinfo,
comm,
mpi_root,
)
with open(args.assembly_code, "r") as file:
lines = file.readlines()
assembly_code = {l + 1.0: lines[l][:-1] for l in range(len(lines))}
assembly_code = insert_waitcnt(flight_count, assembly_code)
with open(args.genasm, "w") as file:
keys = sorted(assembly_code.keys())
for k in keys:
file.write(assembly_code[k] + "\n")
else:
view_trace(
args,
code,
DBFILES,
analysed_filenames,
False,
OCCUPANCY,
args.dumpfiles,
min_event_time,
gfxv,
drawinfo,
comm,
mpi_root,
)
view_trace(
args,
code,
DBFILES,
analysed_filenames,
args.dumpfiles,
0,
gfxv,
drawinfo
)
-54
Vedi File
@@ -136,55 +136,6 @@ std::optional<code_object_decoder_t::symbol_info_t> code_object_decoder_t::find_
return {};
}
/*
void code_object_decoder_t::load_symbol_map() {
std::unique_ptr<Elf, void (*)(Elf *)> elf (
elf_begin(m_fd, ELF_C_READ, nullptr),
[](Elf *elf){ elf_end(elf); });
if (!elf) {
rocprofiler::warning("Error opening ELF!\n");
return;
}
Elf64_Ehdr *ehdr = elf64_getehdr(elf.get());
if (!ehdr) {
printf("elf64_getehdr failed\n");
return;
}
// Slurp the symbol table.
Elf_Scn *scn = nullptr;
while ((scn = elf_nextscn(elf.get(), scn)) != nullptr) {
GElf_Shdr shdr_mem;
GElf_Shdr *shdr = gelf_getshdr(scn, &shdr_mem);
if (shdr->sh_type != SHT_SYMTAB && shdr->sh_type != SHT_DYNSYM) {
continue;
}
Elf_Data *data = elf_getdata(scn, nullptr);
if (!data) continue;
size_t symbol_count = data->d_size / gelf_fsize(elf.get(), ELF_T_SYM, 1, EV_CURRENT);
for (size_t j = 0; j < symbol_count; ++j) {
GElf_Sym sym_mem;
GElf_Sym *sym = gelf_getsym(data, j, &sym_mem);
if (GELF_ST_TYPE(sym->st_info) != STT_FUNC || sym->st_shndx == SHN_UNDEF) continue;
std::string symbol_name{ elf_strptr(elf.get(), shdr->sh_link, sym->st_name) };
auto symbol_pair = std::make_pair(symbol_name, sym->st_size);
auto [it, success] = m_symbol_map.emplace(sym->st_value, symbol_pair);
// If there already was a symbol defined at this address, but this
// new symbol covers a larger address range, replace the old symbol
// with this new one.
if (!success && sym->st_size > it->second.second) it->second = symbol_pair;
}
}
} */
void code_object_decoder_t::disassemble_kernel(uint64_t addr) {
auto symbol = find_symbol(addr);
@@ -193,9 +144,6 @@ void code_object_decoder_t::disassemble_kernel(uint64_t addr) {
return;
}
// if (symbol->m_name.find("__amd_rocclr_") == 0)
// return;
std::cout << "Dumping ISA for " << symbol->m_name << std::endl;
uint64_t end_addr = addr + symbol->m_size;
@@ -218,8 +166,6 @@ void code_object_decoder_t::disassemble_kernel(uint64_t addr) {
void code_object_decoder_t::disassemble_kernels() {
disassembly = std::make_unique<DisassemblyInstance>(*this);
// if (m_symbol_map.begin() == m_symbol_map.end())
m_symbol_map = disassembly->GetKernelMap();
for (auto& [k, v] : m_symbol_map) disassemble_kernel(k);
+14 -12
Vedi File
@@ -57,8 +57,9 @@
if (amd_comgr_status_s status = call) { \
const char* reason = ""; \
amd_comgr_status_string(status, &reason); \
std::cerr << __LINE__ << " code: " << status << std::endl; \
std::cerr << __LINE__ << " failed: " << reason << std::endl; \
return; \
exit(1); \
}
CodeObjectBinary::CodeObjectBinary(const std::string& uri) : m_uri(uri) {
@@ -156,12 +157,12 @@ DisassemblyInstance::DisassemblyInstance(code_object_decoder_t& decoder)
: buffer(reinterpret_cast<int64_t>(decoder.buffer.data())),
size(decoder.buffer.size()),
instructions(decoder.instructions) {
amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &data);
amd_comgr_set_data(data, size, decoder.buffer.data());
CHECK_COMGR(amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &data));
CHECK_COMGR(amd_comgr_set_data(data, size, decoder.buffer.data()));
char isa_name[128];
size_t isa_size = sizeof(isa_name);
amd_comgr_get_data_isa_name(data, &isa_size, isa_name);
CHECK_COMGR(amd_comgr_get_data_isa_name(data, &isa_size, isa_name));
CHECK_COMGR(amd_comgr_create_disassembly_info(
isa_name, //"amdgcn-amd-amdhsa--gfx1100",
@@ -172,24 +173,24 @@ DisassemblyInstance::DisassemblyInstance(code_object_decoder_t& decoder)
amd_comgr_status_t DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbol,
void* user_data) {
amd_comgr_symbol_type_t type;
amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type);
CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type));
if (type != AMD_COMGR_SYMBOL_TYPE_FUNC && type != AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL)
return AMD_COMGR_STATUS_SUCCESS;
uint64_t addr;
amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &addr);
CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &addr));
uint64_t mem_size;
amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size);
CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size));
uint64_t name_size;
amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size);
CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size));
std::string name;
name.resize(name_size);
amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data());
CHECK_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data()));
static_cast<DisassemblyInstance*>(user_data)->symbol_map[addr] = {name, mem_size};
return AMD_COMGR_STATUS_SUCCESS;
@@ -197,18 +198,19 @@ amd_comgr_status_t DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbo
std::map<uint64_t, std::pair<std::string, uint64_t>>& DisassemblyInstance::GetKernelMap() {
symbol_map = std::map<uint64_t, std::pair<std::string, uint64_t>>{};
amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this);
CHECK_COMGR(amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this));
return symbol_map;
}
DisassemblyInstance::~DisassemblyInstance() {
amd_comgr_release_data(data);
CHECK_COMGR(amd_comgr_release_data(data));
CHECK_COMGR(amd_comgr_destroy_disassembly_info(info));
}
uint64_t DisassemblyInstance::ReadInstruction(uint64_t addr, const char* cpp_line) {
uint64_t size_read;
amd_comgr_disassemble_instruction(info, buffer + addr, (void*)this, &size_read);
CHECK_COMGR(amd_comgr_disassemble_instruction(info, buffer + addr, (void*)this, &size_read));
assert(instructions.size() != 0);
instructions.back().address = addr;
instructions.back().cpp_reference = cpp_line;
return size_read;
+97 -32
Vedi File
@@ -153,6 +153,7 @@ def draw_wave_states(selections, normalize, TIMELINES):
plt.figure(figsize=(15, 4))
maxtime = max([np.max((TIMELINES[k]!=0)*np.arange(0,TIMELINES[k].size)) for k in plot_indices])
maxtime = max(maxtime, 1)
timelines = [deepcopy(TIMELINES[k][:maxtime]) for k in plot_indices]
@@ -169,21 +170,18 @@ def draw_wave_states(selections, normalize, TIMELINES):
else cycles * 0
for time in timelines
]
kernsize = 21
kernel = np.asarray(
[
np.exp(-abs(10 * k / kernsize))
for k in range(-kernsize // 2, kernsize // 2 + 1)
]
)
kernsize = 15
kernel = np.asarray([
np.exp(-abs(10 * k / kernsize)) for k in range(-kernsize // 2, kernsize // 2 + 1)
])
kernel /= np.sum(kernel)
timelines = [
np.convolve(time, kernel)[kernsize // 2 : -kernsize // 2]
for time in timelines
if len(time) > 0
for time in timelines if len(time) > 0
]
maxtime *= 16
cycles *= 16
[
plt.plot(cycles, t, label="State " + s, linewidth=1.1, color=c)
for t, s, c, sel in zip(timelines, STATES, colors, selections)
@@ -204,48 +202,113 @@ def draw_wave_states(selections, normalize, TIMELINES):
return STATES, FileBytesIO(figure_bytes)
def draw_occupancy(selections, normalize, OCCUPANCY, shadernames):
def draw_occupancy_per_dispatch(selections, normalize, OCCUPANCY, dispatchnames):
    """Plot wave occupancy over time with one curve per dispatch id.

    Args:
        selections: Unused by this function; kept so the signature lines up
            with the other ``draw_*`` helpers.
        normalize: Only selects the y-axis label ("Occupancy %" versus
            "Occupancy total"). The plotted values are NOT rescaled.
        OCCUPANCY: One sequence of packed integer occupancy events per trace.
            Each event ``u`` is unpacked as
            ``time = 16 * (u >> 23)``, ``value = (u >> 12) & 0x7F``,
            ``cu = (u >> 19) & 0xF`` and ``dispatch id = u & 0xFFF``.
            Sequences with 16 or fewer events are ignored.
            The argument is left untouched.
        dispatchnames: Mapping of dispatch id -> kernel name. Dispatch ids are
            used directly as row indices, so they are expected to be
            ``0 .. len(dispatchnames) - 1``.

    Returns:
        Tuple ``(dispatchnames, FileBytesIO)`` where the second element wraps
        the rendered figure.
    """
    plt.figure(figsize=(15, 4))

    # Decode into a local list. The caller hands the very same OCCUPANCY
    # object to draw_occupancy(), which shifts the packed integers itself;
    # overwriting OCCUPANCY[k] with tuples here would make any later render
    # in the same process fail with a TypeError on ``u >> 23``.
    decoded = [
        [(16 * int(u >> 23), (u >> 12) & 0x7F, (u >> 19) & 0xF, u & 0xFFF) for u in occ]
        for occ in OCCUPANCY
        if len(occ) > 16
    ]

    # The last event of each trace is taken as its end time.
    # NOTE(review): this assumes events are ordered by time -- confirm.
    maxtime = max([1] + [events[-1][0] for events in decoded])

    NUM_DOTS = 1600
    delta = max(1, maxtime // NUM_DOTS)  # cycles represented by one chart bin
    num_dispatches = len(dispatchnames)
    chart = np.zeros((num_dispatches, maxtime // delta + 2), dtype=np.float32)

    if num_dispatches == 0:
        # Nothing to attribute the events to; indexing row 0 below would raise.
        decoded = []

    for events in decoded:
        small_chart = np.zeros_like(chart)
        norm_fact = np.zeros_like(chart)
        norm_fact += 1E-6  # keeps the division below finite for untouched bins

        # Last seen value per (dispatch, cu); row 0 is seeded from the first
        # 16 events of the trace.
        current_occ = [[0 for _ in range(16)] for _ in range(num_dispatches)]
        current_occ[0] = [event[1] for event in events[:16]]
        current_time = [0 for _ in range(num_dispatches)]
        total_value = [0 for _ in range(num_dispatches)]
        total_value[0] = np.sum(current_occ[0])

        for time, value, cu, kid in events:
            # Fill the bins since this dispatch's previous event with the
            # total that was valid during that interval.
            b = current_time[kid]
            e = max(b + 1, time // delta)
            small_chart[kid][b:e] += total_value[kid]
            norm_fact[kid][b:e] += 1

            total_value[kid] += value - current_occ[kid][cu]
            current_occ[kid][cu] = value
            current_time[kid] = time // delta

        # Account for the final state of every dispatch in its last bin.
        # NOTE(review): the normaliser is incremented by ``value`` rather than
        # by 1 as in the loop above; kept as-is, confirm this is intended.
        for small, norm, time, value in zip(small_chart, norm_fact, current_time, total_value):
            small[time] += value
            norm[time] += value

        chart += small_chart / norm_fact

    for (dispatch_id, name), occ in zip(dispatchnames.items(), chart):
        plt.plot(np.arange(occ.size) * delta, occ, label=str(dispatch_id) + '#' + name, linewidth=1.1)

    plt.legend()
    if normalize:
        plt.ylabel("Occupancy %")
    else:
        plt.ylabel("Occupancy total")
    plt.xlabel("Cycle")
    plt.ylim(-1)
    plt.xlim(-maxtime // 200, maxtime + maxtime // 200 + delta + 1)
    plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1)
    figure_bytes = BytesIO()
    plt.savefig(figure_bytes, dpi=150)
    return dispatchnames, FileBytesIO(figure_bytes)
def draw_occupancy(selections, normalize, OCCUPANCY, shadernames, numdispatchid):
plt.figure(figsize=(15, 4))
names = []
if len(OCCUPANCY) == 1: # If single SE, do occupancy per CU/WGP
OCCUPANCY = [[u for u in OCCUPANCY[0] if u&0xFF==k] for k in range(16)]
shadernames = ['CU'+str(k) for k in range(16) if len(OCCUPANCY[k]) > 0]
OCCUPANCY = [occ for occ in OCCUPANCY if len(occ) > 0]
percu = [[u for u in OCCUPANCY[0] if (u>>19) & 0xF == k] for k in range(16)]
shadernames = shadernames + [['CU'+str(k),''] for k in range(16) if len(percu[k]) > 0]
OCCUPANCY = OCCUPANCY + [occ for occ in percu if len(occ) > 0]
maxtime = 1
delta = 1
for name, occ in zip(shadernames, OCCUPANCY):
occ_values = [0]
occ_times = [0]
occ = [(int(u >> 16), (u >> 8) & 0xFF, u & 0xFF) for u in occ]
current_occ = [0 for k in range(16)]
if len(occ) <= 16:
continue
maxtime = 1
delta = 1
occ = [(16*int(u >> 23), (u >> 12) & 0x7F, (u>>19) & 0xF, u&0xFFF) for u in occ]
current_occ = [[0 for m in range(16)] for k in range(numdispatchid)]
current_occ[0] = [m[1] for m in occ[:16]]
for time, value, cu in occ:
occ_values = [np.sum(current_occ[0])]
occ_times = [0]
for time, value, cu, kid in occ:
occ_times.append(time)
occ_values.append(occ_values[-1] + value - current_occ[cu])
current_occ[cu] = value
occ_values.append(occ_values[-1] + value - current_occ[kid][cu])
current_occ[kid][cu] = value
try:
names.append('SE'+name.split('.att')[0].split('_se')[-1])
names.append('SE'+name.split('_se')[1].split('.att')[0])
except:
names.append(name)
NUM_DOTS = 1500
maxtime = np.max(occ_times)
maxtime = occ_times[-1]+1
delta = max(1, maxtime // NUM_DOTS)
chart = np.zeros((maxtime // delta + 1), dtype=np.float32)
norm_fact = np.zeros_like(chart)
norm_fact += 1E-6
for i, t in enumerate(occ_times[:-1]):
b = t // delta
for i in range(len(occ_times)-1):
b = occ_times[i] // delta
e = max(b + 1, occ_times[i + 1] // delta)
chart[b:e] += occ_values[i]
norm_fact[b:e] += 1
chart /= np.maximum(norm_fact, 1)
chart /= norm_fact
if normalize:
chart /= max(chart.max(), 1e-6)
plt.plot(np.arange(chart.size) * delta, chart, label=name, linewidth=1.1)
plt.plot(np.arange(chart.size) * delta, chart, label=names[-1], linewidth=1.1)
plt.legend()
if normalize:
@@ -267,12 +330,14 @@ def GeneratePIC(drawinfo, selections=[True for k in range(16)], normalize=False)
response = {}
figures = {}
states, figure = draw_occupancy(
selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"]
)
states, figure = draw_occupancy(selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"], len(drawinfo["DispatchNames"]))
response["occupancy.png"] = states
figures["occupancy.png"] = figure
states, figure = draw_occupancy_per_dispatch(selections, normalize, drawinfo["OCCUPANCY"], drawinfo["DispatchNames"])
response["dispatches.png"] = states
figures["dispatches.png"] = figure
states, figure = draw_wave_states(selections, normalize, drawinfo["TIMELINES"])
response["timeline.png"] = states
figures["timeline.png"] = figure
+10 -100
Vedi File
@@ -193,104 +193,11 @@ def try_match_swapped(insts, code, i, line):
return insts[i + 1][1] == code[line][1] and insts[i][1] == code[line + 1][1]
FORK_NAMES = 1
# A successful parsed instruction
class CachedInst:
def __init__(self, inst, as_line):
self.inst_type = inst
self.as_line = as_line
self.forks = None
# A branch of the parsing tree
class Fork:
def __init__(self):
global FORK_NAMES
self.insts = []
self.data = None
self.name = FORK_NAMES
FORK_NAMES += 1
# print('Created new fork: ', self.name)
# Try to match sequence "insts" with the branch "fork", starting at position "i"
def move_down_fork(fork, insts, i): #(fork : Fork, insts : list, i : int):
N = min(len(insts), len(fork.insts))
while i < N:
if insts[i][1] == fork.insts[i].inst_type:
i += 1
elif i<N-1 and insts[i+1][1] == fork.insts[i].inst_type \
and insts[i][1] == fork.insts[i+1].inst_type:
i += 2
else:
return False, i
if len(fork.insts) != len(insts):
return False, i
return True, i
FORK_TREE = Fork()
# Check if there exists a previous wave with the same sequence of instructions executed
def fromDict(insts):
i = 0
N = len(insts)
cur_fork = FORK_TREE
while i < N:
tillEnd, final_pos = move_down_fork(cur_fork, insts, i)
if tillEnd:
# print('Reached end')
return True, cur_fork
i += final_pos
if i >= len(cur_fork.insts):
return False, cur_fork
last_inst = cur_fork.insts[i]
if last_inst.forks is None:
last_inst.forks = []
bMatchFork = False
for fork in last_inst.forks:
if fork.insts[0].inst_type == insts[0][1]:
cur_fork = fork
bMatchFork = True
break
if not bMatchFork:
cur_fork = Fork()
last_inst.forks.append(cur_fork)
return False, cur_fork
print("Warning: Reached end of loop!")
return False, cur_fork
def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
bGFX9 = gfxv == 'vega'
# Try from cached result from a previous wave that have already been parsed
dict_sucess, current_fork = fromDict(insts)
if dict_sucess:
result, loopCount, mem_unroll, flight_count, maxline, pcsequence = current_fork.data
# Check if the sequence of measured PC values are equal for cached and new wave
if len(pcsequence) > 0:
pcs = [r[2] for r in insts if r[1] == PCINFO]
if len(pcs) != len(pcsequence):
dict_sucess = False
for pc1, pc2 in zip(pcs, pcsequence):
if pc1 != pc2:
dict_sucess = False
# If successful, use resulting assembly from cache
if dict_sucess:
result = [r+(asm[-1],) for r, asm in zip(insts, result)]
return result, loopCount, mem_unroll, flight_count, maxline, len(result)
result, i, line, loopCount, N = [], 0, 0, defaultdict(int), len(insts)
SMEM_INST = [] # scalar memory
VLMEM_INST = [] # vector memory load
VSMEM_INST = [] # vector memory store
@@ -310,10 +217,6 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
# Clean the code and remove comments
code = [raw_code[0]]
for c in raw_code[1:]:
if bIsAuto and '; Begin ' == c[0][:len('; Begin ')]:
if '; Begin <Kernel>' in c[0]:
line = len(code)
print('Begin at:', line, c)
c = list(c)
c[0] = c[0].split(";")[0].split("//")[0].strip()
@@ -339,7 +242,16 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
loops = 0
maxline = 0
watchlist = RegisterWatchList(labels=labels) if not bIsAuto else PCTranslator(code, insts)
if bIsAuto and len(insts) and insts[0][1] == PCINFO:
try:
watchlist = PCTranslator(code, insts)
line = watchlist.addrmap[insts[0][2]]
result.append((insts[0][0], PCINFO, 0, 0, 0))
i = 1
except:
return None
else:
watchlist = RegisterWatchList(labels=labels)
pcsequence = []
while i < N:
@@ -534,7 +446,5 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto):
break
line += 1
current_fork.insts = [CachedInst(inst[1], inst[-1]) for inst in result]
current_fork.data = result, loopCount, mem_unroll, flight_count, maxline, pcsequence
result = [r for r in result if r[1] != PCINFO]
return result, loopCount, mem_unroll, flight_count, maxline, len(result) if i == N else N
+29 -52
Vedi File
@@ -296,25 +296,25 @@ def view_trace(
code,
dbnames,
att_filenames,
bReturnLoc,
OCCUPANCY,
bDumpOnly,
se_time_begin,
gfxv,
drawinfo,
MPI_COMM,
mpi_root,
drawinfo
):
global JSON_GLOBAL_DICTIONARY
pic_thread = None
if mpi_root:
manager = Manager()
return_dict = manager.dict()
JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable(
{str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))}
)
pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo))
pic_thread.start()
manager = Manager()
return_dict = manager.dict()
occ_dict = {str(k): drawinfo["OCCUPANCY"][k] for k in range(len(drawinfo["OCCUPANCY"]))}
occ_dict['dispatches'] = {}
for id, name in drawinfo['DispatchNames'].items():
occ_dict['dispatches'][id] = name
occ_dict['names'] = drawinfo['ShaderNames']
JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable(occ_dict)
pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo))
pic_thread.start()
att_filenames = [Path(f).name for f in att_filenames]
se_numbers = [int(a.split("_se")[1].split(".att")[0]) for a in att_filenames]
@@ -337,9 +337,8 @@ def view_trace(
flight_count.append(count)
simd_wave_filenames[se_number] = wv_filenames
if mpi_root:
code_sel = [c[:-3]+c[-2:] for c in code[:allse_maxline+16]]
JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)})
code_sel = [c[:-3]+c[-2:] for c in code[:allse_maxline+16]]
JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)})
for key in simd_wave_filenames.keys():
wv_array = [
@@ -367,42 +366,21 @@ def view_trace(
simd_wave_filenames[key] = wv_dict
if MPI_COMM is not None:
se_filenames = MPI_COMM.gather(se_filenames, root=0)
simd_wave_filenames = MPI_COMM.gather(simd_wave_filenames, root=0)
if mpi_root:
se_filenames = [e for elem in se_filenames for e in elem]
simd_wave_filenames = {
k: v for smf in simd_wave_filenames for k, v in smf.items()
}
if mpi_root:
JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable(
{
"wave_filenames": simd_wave_filenames,
"se_filenames": se_filenames,
"global_begin_time": int(se_time_begin),
"gfxv": gfxv,
}
)
JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable(
{
"wave_filenames": simd_wave_filenames,
"se_filenames": se_filenames,
"global_begin_time": int(se_time_begin),
"gfxv": gfxv,
}
)
if pic_thread is not None:
pic_thread.join()
for k, v in return_dict.items():
JSON_GLOBAL_DICTIONARY[k] = v
if bReturnLoc:
return flight_count
if bDumpOnly == False:
if MPI_COMM is not None:
JSON_GLOBAL_DICTIONARY = MPI_COMM.gather(JSON_GLOBAL_DICTIONARY, root=0)
if not mpi_root:
quit()
JSON_GLOBAL_DICTIONARY = {
k: v for smf in JSON_GLOBAL_DICTIONARY for k, v in smf.items()
}
JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 1})
if args.ports:
assign_ports(args.ports)
@@ -420,13 +398,12 @@ def view_trace(
print("Exitting.")
else:
os.makedirs("ui/", exist_ok=True)
if mpi_root:
JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0})
os.system(
"cp "
+ os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui")
+ "/* ui/"
)
JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0})
os.system(
"cp "
+ os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui")
+ "/* ui/"
)
for k, v in JSON_GLOBAL_DICTIONARY.items():
with open(os.path.join("ui", k), "w" if ".json" in k else "wb") as f:
f.write(v.read())
+1
Vedi File
@@ -14,6 +14,7 @@
<div class="tab">
<button class="tablinks" onclick="showImage('timeline.png')">Wave States</button>
<button class="tablinks" onclick="showImage('occupancy.png')">Occupancy</button>
<button class="tablinks" onclick="showImage('dispatches.png')">Dispatches</button>
<button class="tablinks" onclick="showImage('counters.png')" id="counterspng_button">Counters</button>
</div>
<img id="GraphImage" src=timeline.png width=100%>