SWDEV-389947: Fix GPU memory being allocated for every kernel. Reduce Python memory usage.

Change-Id: I74d31581653e53e529f148b272f5217a1edcf288
Šī revīzija ir iekļauta:
Giovanni LB
2023-03-21 14:32:12 -03:00
revīziju iesūtīja Giovanni Baraldi
vecāks 18110d146e
revīzija ba620ee7c6
2 mainīti faili ar 76 papildinājumiem un 53 dzēšanām
+25 -18
Parādīt failu
@@ -17,6 +17,8 @@ import numpy as np
import matplotlib.pyplot as plt
import json
COUNTERS_MAX_CAPTURES = 1<<12
class PerfEvent(ctypes.Structure):
_fields_ = [
('time', c_uint64),
@@ -46,7 +48,7 @@ class KvPair(ctypes.Structure):
""" Matches pair<int, int> = (key, value) on the python side """
_fields_ = [('key', ctypes.c_int),
('value', ctypes.c_int)]
class ReturnAssemblyInfo(ctypes.Structure):
""" Matches ReturnAssemblyInfo on the python side """
@@ -303,38 +305,43 @@ def draw_wave_metrics(selections, normalize):
plt.figure(figsize=(15,3))
delta_time = max(1,int(0.5+np.min([get_delta_time(events) for events in EVENTS])))
maxtime = np.max([np.max([e.time for e in events]) for events in EVENTS])+1
delta_step = 8
quad_delta_time = max(delta_step,int(0.5+np.min([get_delta_time(events) for events in EVENTS])))
maxtime = np.max([np.max([e.time for e in events]) for events in EVENTS])/quad_delta_time+1
if maxtime*delta_step >= COUNTERS_MAX_CAPTURES:
delta_step = 1
while maxtime >= COUNTERS_MAX_CAPTURES:
quad_delta_time *= 2
maxtime /= 2
maxtime = int(min(maxtime*delta_step, COUNTERS_MAX_CAPTURES))
event_timeline = np.zeros((16, maxtime), dtype=np.int32)
print('Delta:', delta_time)
print('Max_cycles:', maxtime)
print('Delta:', quad_delta_time)
print('Max_cycles:', maxtime*quad_delta_time*4//delta_step)
kernsize = 2*(delta_time//14)+1
trim = max(maxtime//5000,1)
cycles = 4*np.arange(maxtime)[::trim]
kernel = np.asarray([np.exp(-abs(k/kernsize)**2) for k in range(-kernsize*3,kernsize*3+1)])
kernel /= np.sum(kernel)*len(EVENTS)*delta_time
cycles = 4*quad_delta_time//delta_step*np.arange(maxtime)
kernel = len(EVENTS)*quad_delta_time
for events in EVENTS:
for e in range(len(events)-1):
bk = events[e].bank*4
start = events[e].time
end = start+delta_time
start = events[e].time // (quad_delta_time//delta_step)
end = start+delta_step
event_timeline[bk:bk+4, start:end] += np.asarray(events[e].toTuple()[1:5])[:, None]
start = events[-1].time
event_timeline[bk:bk+4, start:start+delta_time] += \
event_timeline[bk:bk+4, start:start+delta_step] += \
np.asarray(events[-1].toTuple()[1:5])[:, None]
event_timeline = [np.convolve(e, kernel)[3*kernsize:-3*kernsize] for e in event_timeline]
event_timeline = [np.convolve(e, [kernel for k in range(3)])[1:-1] for e in event_timeline]
#event_timeline = [e/kernel for e in event_timeline]
if normalize:
event_timeline = [100*e/max(e.max(), 1E-5) for e in event_timeline]
colors = ['blue', 'green', 'gray', 'red', 'orange', 'cyan', 'black', 'darkviolet',
'yellow', 'darkred', 'pink', 'lime', 'gold', 'tan', 'aqua', 'olive']
[plt.plot(cycles, e[::trim], '-', label=n, color=c)
[plt.plot(cycles, e, '-', label=n, color=c)
for e, n, c, sel in zip(event_timeline, EVENT_NAMES, colors, selections) if sel]
plt.legend()
+51 -35
Parādīt failu
@@ -841,13 +841,61 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
}
/* Write the transformed packets to the hardware queue. */
writer(&transformed_packets[0], transformed_packets.size());
} else if (session_id.handle > 0 && pkt_count > 0 && is_att_collection_mode && session) {
} else if (session_id.handle > 0 && pkt_count > 0 &&
is_att_collection_mode && session &&
KernelInterceptCount < MAX_ATT_PROFILES
) {
// att start
// Getting Queue Data and Information
auto& queue_info = *static_cast<Queue*>(data);
std::lock_guard<std::mutex> lk(queue_info.qw_mutex);
Agent::AgentInfo* agentInfo = &(hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle));
bool can_profile_anypacket = false;
std::vector<bool> can_profile_packet;
for (size_t i = 0; i < pkt_count; ++i) {
auto& original_packet = static_cast<const hsa_barrier_and_packet_t*>(packets)[i];
bool b_profile_this_object = false;
// Skip packets other than kernel dispatch packets.
if (bit_extract(original_packet.header, HSA_PACKET_HEADER_TYPE,
HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1) ==
HSA_PACKET_TYPE_KERNEL_DISPATCH) {
auto& kdispatch = static_cast<const hsa_kernel_dispatch_packet_s*>(packets)[i];
uint64_t kernel_object = kdispatch.kernel_object;
// Try to match the mangled kernel name with given matches in input.txt
try {
std::lock_guard<std::mutex> lock(ksymbol_map_lock);
assert(ksymbols);
const std::string& kernel_name = ksymbols->at(kernel_object);
// We want to initiate att profiling only if a match exists
for(const std::string& kernel_matches : kernel_profile_names) {
if (kernel_name.find(kernel_matches) != std::string::npos) {
b_profile_this_object = true;
break;
}
}
if (!b_profile_this_object) printf("Skipping: %s\n", kernel_name.c_str());
} catch (...) {
printf("Warning: Unknown name for object %lu\n", kernel_object);
}
}
if (b_profile_this_object)
can_profile_anypacket = true;
can_profile_packet.push_back(b_profile_this_object);
}
if (!can_profile_anypacket) {
/* Write the original packets to the hardware if no packet will be profiled */
writer(packets, pkt_count);
return;
}
// Preparing att Packets
Packet::packet_t start_packet{};
Packet::packet_t stop_packet{};
@@ -902,44 +950,12 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
att_params, &start_packet, &stop_packet);
}
// Searching across all the packets given during this write
for (size_t i = 0; i < pkt_count; ++i) {
auto& original_packet = static_cast<const hsa_barrier_and_packet_t*>(packets)[i];
// Skip packets other than kernel dispatch packets.
if (bit_extract(original_packet.header, HSA_PACKET_HEADER_TYPE,
HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1) !=
HSA_PACKET_TYPE_KERNEL_DISPATCH) {
transformed_packets.emplace_back(packets_arr[i]);
continue;
}
auto& kdispatch = static_cast<const hsa_kernel_dispatch_packet_s*>(packets)[i];
uint64_t kernel_object = kdispatch.kernel_object;
bool b_profile_this_object = false;
// Try to match the mangled kernel name with given matches in input.txt
try {
std::lock_guard<std::mutex> lock(ksymbol_map_lock);
assert(ksymbols);
const std::string& kernel_name = ksymbols->at(kernel_object);
// We want to initiate att profiling only if a match exists
for(const std::string& kernel_matches : kernel_profile_names) {
if (kernel_name.find(kernel_matches) != std::string::npos) {
b_profile_this_object = true;
break;
}
}
if (!b_profile_this_object) printf("Skipping: %s\n", kernel_name.c_str());
} catch (...) {
printf("Warning: Unknown name for object %lu\n", kernel_object);
}
// If no match was found or intercept count > maximum desired profiles, skip this kernel.
if (!b_profile_this_object || KernelInterceptCount >= MAX_ATT_PROFILES) {
printf("Skipping: %lu\n", kernel_object);
// Skip all packets marked with !can_profile
if (i >= can_profile_packet.size() || can_profile_packet[i] == false) {
transformed_packets.emplace_back(packets_arr[i]);
continue;
}