SWDEV-389947: Fix GPU memory being allocated for every kernel. Reduce Python memory usage.
Change-Id: I74d31581653e53e529f148b272f5217a1edcf288
Šī revīzija ir iekļauta:
revīziju iesūtīja
Giovanni Baraldi
vecāks
18110d146e
revīzija
ba620ee7c6
+25
-18
@@ -17,6 +17,8 @@ import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import json
|
||||
|
||||
COUNTERS_MAX_CAPTURES = 1<<12
|
||||
|
||||
class PerfEvent(ctypes.Structure):
|
||||
_fields_ = [
|
||||
('time', c_uint64),
|
||||
@@ -46,7 +48,7 @@ class KvPair(ctypes.Structure):
|
||||
""" Matches pair<int, int> = (key, value) on the python side """
|
||||
_fields_ = [('key', ctypes.c_int),
|
||||
('value', ctypes.c_int)]
|
||||
|
||||
|
||||
|
||||
class ReturnAssemblyInfo(ctypes.Structure):
|
||||
""" Matches ReturnAssemblyInfo on the python side """
|
||||
@@ -303,38 +305,43 @@ def draw_wave_metrics(selections, normalize):
|
||||
|
||||
plt.figure(figsize=(15,3))
|
||||
|
||||
delta_time = max(1,int(0.5+np.min([get_delta_time(events) for events in EVENTS])))
|
||||
maxtime = np.max([np.max([e.time for e in events]) for events in EVENTS])+1
|
||||
delta_step = 8
|
||||
quad_delta_time = max(delta_step,int(0.5+np.min([get_delta_time(events) for events in EVENTS])))
|
||||
maxtime = np.max([np.max([e.time for e in events]) for events in EVENTS])/quad_delta_time+1
|
||||
|
||||
if maxtime*delta_step >= COUNTERS_MAX_CAPTURES:
|
||||
delta_step = 1
|
||||
while maxtime >= COUNTERS_MAX_CAPTURES:
|
||||
quad_delta_time *= 2
|
||||
maxtime /= 2
|
||||
|
||||
maxtime = int(min(maxtime*delta_step, COUNTERS_MAX_CAPTURES))
|
||||
event_timeline = np.zeros((16, maxtime), dtype=np.int32)
|
||||
print('Delta:', delta_time)
|
||||
print('Max_cycles:', maxtime)
|
||||
print('Delta:', quad_delta_time)
|
||||
print('Max_cycles:', maxtime*quad_delta_time*4//delta_step)
|
||||
|
||||
kernsize = 2*(delta_time//14)+1
|
||||
trim = max(maxtime//5000,1)
|
||||
cycles = 4*np.arange(maxtime)[::trim]
|
||||
|
||||
kernel = np.asarray([np.exp(-abs(k/kernsize)**2) for k in range(-kernsize*3,kernsize*3+1)])
|
||||
kernel /= np.sum(kernel)*len(EVENTS)*delta_time
|
||||
cycles = 4*quad_delta_time//delta_step*np.arange(maxtime)
|
||||
kernel = len(EVENTS)*quad_delta_time
|
||||
|
||||
for events in EVENTS:
|
||||
for e in range(len(events)-1):
|
||||
bk = events[e].bank*4
|
||||
start = events[e].time
|
||||
end = start+delta_time
|
||||
start = events[e].time // (quad_delta_time//delta_step)
|
||||
end = start+delta_step
|
||||
event_timeline[bk:bk+4, start:end] += np.asarray(events[e].toTuple()[1:5])[:, None]
|
||||
start = events[-1].time
|
||||
event_timeline[bk:bk+4, start:start+delta_time] += \
|
||||
event_timeline[bk:bk+4, start:start+delta_step] += \
|
||||
np.asarray(events[-1].toTuple()[1:5])[:, None]
|
||||
|
||||
|
||||
event_timeline = [np.convolve(e, kernel)[3*kernsize:-3*kernsize] for e in event_timeline]
|
||||
event_timeline = [np.convolve(e, [kernel for k in range(3)])[1:-1] for e in event_timeline]
|
||||
#event_timeline = [e/kernel for e in event_timeline]
|
||||
|
||||
if normalize:
|
||||
event_timeline = [100*e/max(e.max(), 1E-5) for e in event_timeline]
|
||||
|
||||
|
||||
colors = ['blue', 'green', 'gray', 'red', 'orange', 'cyan', 'black', 'darkviolet',
|
||||
'yellow', 'darkred', 'pink', 'lime', 'gold', 'tan', 'aqua', 'olive']
|
||||
[plt.plot(cycles, e[::trim], '-', label=n, color=c)
|
||||
[plt.plot(cycles, e, '-', label=n, color=c)
|
||||
for e, n, c, sel in zip(event_timeline, EVENT_NAMES, colors, selections) if sel]
|
||||
|
||||
plt.legend()
|
||||
|
||||
@@ -841,13 +841,61 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
|
||||
}
|
||||
/* Write the transformed packets to the hardware queue. */
|
||||
writer(&transformed_packets[0], transformed_packets.size());
|
||||
} else if (session_id.handle > 0 && pkt_count > 0 && is_att_collection_mode && session) {
|
||||
} else if (session_id.handle > 0 && pkt_count > 0 &&
|
||||
is_att_collection_mode && session &&
|
||||
KernelInterceptCount < MAX_ATT_PROFILES
|
||||
) {
|
||||
// att start
|
||||
// Getting Queue Data and Information
|
||||
auto& queue_info = *static_cast<Queue*>(data);
|
||||
std::lock_guard<std::mutex> lk(queue_info.qw_mutex);
|
||||
Agent::AgentInfo* agentInfo = &(hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle));
|
||||
|
||||
bool can_profile_anypacket = false;
|
||||
std::vector<bool> can_profile_packet;
|
||||
|
||||
for (size_t i = 0; i < pkt_count; ++i) {
|
||||
auto& original_packet = static_cast<const hsa_barrier_and_packet_t*>(packets)[i];
|
||||
bool b_profile_this_object = false;
|
||||
|
||||
// Skip packets other than kernel dispatch packets.
|
||||
if (bit_extract(original_packet.header, HSA_PACKET_HEADER_TYPE,
|
||||
HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1) ==
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH) {
|
||||
|
||||
auto& kdispatch = static_cast<const hsa_kernel_dispatch_packet_s*>(packets)[i];
|
||||
uint64_t kernel_object = kdispatch.kernel_object;
|
||||
|
||||
// Try to match the mangled kernel name with given matches in input.txt
|
||||
try {
|
||||
std::lock_guard<std::mutex> lock(ksymbol_map_lock);
|
||||
assert(ksymbols);
|
||||
const std::string& kernel_name = ksymbols->at(kernel_object);
|
||||
|
||||
// We want to initiate att profiling only if a match exists
|
||||
for(const std::string& kernel_matches : kernel_profile_names) {
|
||||
if (kernel_name.find(kernel_matches) != std::string::npos) {
|
||||
b_profile_this_object = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!b_profile_this_object) printf("Skipping: %s\n", kernel_name.c_str());
|
||||
} catch (...) {
|
||||
printf("Warning: Unknown name for object %lu\n", kernel_object);
|
||||
}
|
||||
}
|
||||
|
||||
if (b_profile_this_object)
|
||||
can_profile_anypacket = true;
|
||||
can_profile_packet.push_back(b_profile_this_object);
|
||||
}
|
||||
|
||||
if (!can_profile_anypacket) {
|
||||
/* Write the original packets to the hardware if no packet will be profiled */
|
||||
writer(packets, pkt_count);
|
||||
return;
|
||||
}
|
||||
|
||||
// Preparing att Packets
|
||||
Packet::packet_t start_packet{};
|
||||
Packet::packet_t stop_packet{};
|
||||
@@ -902,44 +950,12 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
|
||||
att_params, &start_packet, &stop_packet);
|
||||
}
|
||||
|
||||
|
||||
// Searching across all the packets given during this write
|
||||
for (size_t i = 0; i < pkt_count; ++i) {
|
||||
auto& original_packet = static_cast<const hsa_barrier_and_packet_t*>(packets)[i];
|
||||
|
||||
// Skip packets other than kernel dispatch packets.
|
||||
if (bit_extract(original_packet.header, HSA_PACKET_HEADER_TYPE,
|
||||
HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1) !=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH) {
|
||||
transformed_packets.emplace_back(packets_arr[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto& kdispatch = static_cast<const hsa_kernel_dispatch_packet_s*>(packets)[i];
|
||||
uint64_t kernel_object = kdispatch.kernel_object;
|
||||
bool b_profile_this_object = false;
|
||||
|
||||
// Try to match the mangled kernel name with given matches in input.txt
|
||||
try {
|
||||
std::lock_guard<std::mutex> lock(ksymbol_map_lock);
|
||||
assert(ksymbols);
|
||||
const std::string& kernel_name = ksymbols->at(kernel_object);
|
||||
|
||||
// We want to initiate att profiling only if a match exists
|
||||
for(const std::string& kernel_matches : kernel_profile_names) {
|
||||
if (kernel_name.find(kernel_matches) != std::string::npos) {
|
||||
b_profile_this_object = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!b_profile_this_object) printf("Skipping: %s\n", kernel_name.c_str());
|
||||
} catch (...) {
|
||||
printf("Warning: Unknown name for object %lu\n", kernel_object);
|
||||
}
|
||||
|
||||
// If no match was found or intercept count > maximum desired profiles, skip this kernel.
|
||||
if (!b_profile_this_object || KernelInterceptCount >= MAX_ATT_PROFILES) {
|
||||
printf("Skipping: %lu\n", kernel_object);
|
||||
// Skip all packets marked with !can_profile
|
||||
if (i >= can_profile_packet.size() || can_profile_packet[i] == false) {
|
||||
transformed_packets.emplace_back(packets_arr[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user