Files
rocm-systems/runtime/hsa-runtime/core/runtime/svm_profiler.cpp
T
David Yat Sin 0e4c7336ff Use os::createThread to launch SVM profiler thread
The previous approach of launching the SVM profiler task with std::thread
caused segfaults at thread launch on RHEL 8 when the libhsa-runtime library
was loaded via dlopen.

Change-Id: Ic010cd6ae9bc6e6ed0605de02b93f6aae8ed3e97
2022-11-03 10:52:11 -04:00

385 строки
14 KiB
C++

////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2022-2022, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#include "core/inc/svm_profiler.h"
#include <stdint.h>
#include <algorithm>
#include <sys/eventfd.h>
#include <poll.h>
#include "hsakmt/hsakmt.h"
#include "core/util/utils.h"
#include "core/inc/runtime.h"
#include "core/inc/agent.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/util/os.h"
namespace rocr {
namespace AMD {
// Returns the printable name stored at index `event` in the table below.
// Any index at or beyond the final entry resolves to that final entry, "UNKNOWN".
static const char* smi_event_string(uint32_t event) {
  static const char* const kEventNames[] = {
      "NONE",
      "VMFAULT",
      "THERMAL_THROTTLE",
      "GPU_PRE_RESET",
      "GPU_POST_RESET",
      "MIGRATE_START",
      "MIGRATE_END",
      "PAGE_FAULT_START",
      "PAGE_FAULT_END",
      "QUEUE_EVICTION",
      "QUEUE_RESTORE",
      "UNMAP_FROM_GPU",
      "UNKNOWN",
  };
  // Index of the trailing catch-all entry.
  const uint32_t kFallback = sizeof(kEventNames) / sizeof(kEventNames[0]) - 1;
  if (event >= kFallback) {
    return kEventNames[kFallback];
  }
  return kEventNames[event];
}
// Returns the printable name stored at index `trigger` in the migration-trigger table.
// Any index at or beyond the final entry resolves to that final entry, "UNKNOWN".
static const char* smi_migrate_string(uint32_t trigger) {
  static const char* const kMigrateTriggers[] = {
      "PREFETCH",
      "PAGEFAULT_GPU",
      "PAGEFAULT_CPU",
      "TTM_EVICTION",
      "UNKNOWN",
  };
  // Index of the trailing catch-all entry.
  const uint32_t kFallback = sizeof(kMigrateTriggers) / sizeof(kMigrateTriggers[0]) - 1;
  if (trigger >= kFallback) {
    return kMigrateTriggers[kFallback];
  }
  return kMigrateTriggers[trigger];
}
// Returns the printable name stored at index `trigger` in the queue-eviction-trigger table.
// Any index at or beyond the final entry resolves to that final entry, "UNKNOWN".
static const char* smi_eviction_string(uint32_t trigger) {
  static const char* const kEvictionTriggers[] = {
      "SVM",
      "USERPTR",
      "TTM",
      "SUSPEND",
      "CRIU_CHECKPOINT",
      "CRIU_RESTORE",
      "UNKNOWN",
  };
  // Index of the trailing catch-all entry.
  const uint32_t kFallback = sizeof(kEvictionTriggers) / sizeof(kEvictionTriggers[0]) - 1;
  if (trigger >= kFallback) {
    return kEvictionTriggers[kFallback];
  }
  return kEvictionTriggers[trigger];
}
// Returns the printable name stored at index `trigger` in the unmap-trigger table.
// Any index at or beyond the final entry resolves to that final entry, "UNKNOWN".
static const char* smi_unmap_string(uint32_t trigger) {
  static const char* const kUnmapTriggers[] = {
      "MMU_NOTIFY",
      "MMU_NOTIFY_MIGRATE",
      "UNMAP_FROM_CPU",
      "UNKNOWN",
  };
  // Index of the trailing catch-all entry.
  const uint32_t kFallback = sizeof(kUnmapTriggers) / sizeof(kUnmapTriggers[0]) - 1;
  if (trigger >= kFallback) {
    return kUnmapTriggers[kFallback];
  }
  return kUnmapTriggers[trigger];
}
void SvmProfileControl::PollSmiRun(void* _profileControl) {
SvmProfileControl* profileControl = (SvmProfileControl*)_profileControl;
profileControl->PollSmi();
}
void SvmProfileControl::PollSmi() {
if (core::Runtime::runtime_singleton_->flag().svm_profile().empty()) {
return;
}
FILE* logFile = fopen(core::Runtime::runtime_singleton_->flag().svm_profile().c_str(), "a");
if (logFile == NULL) {
return;
}
MAKE_NAMED_SCOPE_GUARD(logGuard, [&]() { fclose(logFile); });
std::vector<pollfd> files;
files.resize(core::Runtime::runtime_singleton_->gpu_agents().size() + 1);
files[0].fd = event;
files[0].events = POLLIN;
files[0].revents = 0;
HSAuint64 events = 0;
events = HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_MIGRATE_START) |
HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_MIGRATE_END) |
HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_PAGE_FAULT_START) |
HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_PAGE_FAULT_END) |
HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_QUEUE_EVICTION) |
HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_QUEUE_RESTORE) |
HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_UNMAP_FROM_GPU);
for (int i = 0; i < core::Runtime::runtime_singleton_->gpu_agents().size(); i++) {
auto err = hsaKmtOpenSMI(core::Runtime::runtime_singleton_->gpu_agents()[i]->node_id(),
&files[i + 1].fd);
assert(err == HSAKMT_STATUS_SUCCESS);
files[i + 1].events = POLLIN;
files[i + 1].revents = 0;
// Enable collecting masked events.
auto wrote = write(files[i + 1].fd, &events, sizeof(events));
assert(wrote == sizeof(events));
}
MAKE_NAMED_SCOPE_GUARD(smiGuard, [&]() {
for (int i = 1; i < files.size(); i++) {
close(files[i].fd);
}
});
std::vector<std::string> smi_records;
smi_records.resize(core::Runtime::runtime_singleton_->gpu_agents().size() + 1);
char buffer[HSA_SMI_EVENT_MSG_SIZE + 1];
auto format_agent = [this](uint32_t gpuid) {
std::string ret;
core::Agent* agent = core::Runtime::runtime_singleton_->agent_by_gpuid(gpuid);
if (agent->device_type() == core::Agent::kAmdCpuDevice)
return std::string("CPU");
else
return format("GPU%u(%p)", ((AMD::GpuAgent*)agent)->enumeration_index(),
agent->public_handle());
};
while (!exit) {
int ready = poll(&files[0], files.size(), -1);
if (ready < 1) {
assert(false && "poll failed!");
return;
}
for (int i = 1; i < files.size(); i++) {
if (files[i].revents & POLLIN) {
memset(buffer, 0, sizeof(buffer));
auto len = read(files[i].fd, buffer, sizeof(buffer) - 1);
if (len > 0) {
buffer[len] = '\0';
// printf("%s\n", buffer);
// fprintf(logFile, "%s\n", buffer);
smi_records[i] += buffer;
while (true) {
size_t pos = smi_records[i].find('\n');
if (pos == std::string::npos) break;
std::string line = smi_records[i].substr(0, pos);
smi_records[i].erase(0, pos + 1);
const char* cursor;
cursor = line.c_str();
// Event records follow the format:
// event_id timestamp -pid event_specific_info trigger
// timestamp, pid, and trigger are in dec. All other are hex.
// event_specific substring is listed for each event type.
// See kfd_ioctl.h for more info.
int event_id;
uint64_t time;
int pid;
int offset = 0;
int args = sscanf(cursor, "%x %lu -%u%n", &event_id, &time, &pid, &offset);
assert(args == 3 && "Parsing error!");
std::string detail;
cursor += offset + 1;
switch (event_id) {
//@addr(size) from->to prefetch_location:preferred_location
case HSA_SMI_EVENT_MIGRATE_START: {
uint64_t addr;
uint32_t size;
uint32_t from, to;
uint32_t trigger = 0;
uint32_t fetch, pref;
args = sscanf(cursor, "@%lx(%x) %x->%x %x:%x %u", &addr, &size, &from, &to, &fetch,
&pref, &trigger);
assert(args == 7 && "Parsing error!");
addr *= 4096;
size *= 4096;
std::string from_agent = format_agent(from);
std::string to_agent = format_agent(to);
std::string range = format("[%p, %p]", addr, addr + size - 1);
std::string cause = smi_migrate_string(trigger);
detail = cause + " " + from_agent + "->" + to_agent + " " + range;
break;
}
//@addr(size) from->to
case HSA_SMI_EVENT_MIGRATE_END: {
uint64_t addr;
uint32_t size;
uint32_t from, to;
uint32_t trigger;
args = sscanf(cursor, "@%lx(%x) %x->%x %u", &addr, &size, &from, &to, &trigger);
assert(args == 5 && "Parsing error!");
addr *= 4096;
size *= 4096;
std::string from_agent = format_agent(from);
std::string to_agent = format_agent(to);
std::string range = format("[%p, %p]", addr, addr + size - 1);
std::string cause = smi_migrate_string(trigger);
detail = cause + " " + from_agent + "->" + to_agent + " " + range;
break;
}
//@addr(gpu_id) W/R
case HSA_SMI_EVENT_PAGE_FAULT_START: {
uint64_t addr;
uint32_t gpuid;
char mode;
args = sscanf(cursor, "@%lx(%x) %c", &addr, &gpuid, &mode);
addr *= 4096;
assert(args == 3 && "Parsing error!");
std::string agent = format_agent(gpuid);
std::string range = std::to_string(addr);
std::string cause = (mode == 'W') ? "Write" : "Read";
detail = cause + " " + agent + " " + range;
break;
}
//@addr(gpu_id) M/U (migration / page table update)
case HSA_SMI_EVENT_PAGE_FAULT_END: {
uint64_t addr;
uint32_t gpuid;
char mode;
args = sscanf(cursor, "@%lx(%x) %c", &addr, &gpuid, &mode);
assert(args == 3 && "Parsing error!");
addr *= 4096;
std::string agent = format_agent(gpuid);
std::string range = std::to_string(addr);
std::string cause = (mode == 'M') ? "Migration" : "Map";
detail = cause + " " + agent + " " + range;
break;
}
// gpu_id
case HSA_SMI_EVENT_QUEUE_EVICTION: {
uint32_t gpuid;
uint32_t trigger;
args = sscanf(cursor, "%x %u", &gpuid, &trigger);
assert(args == 2 && "Parsing error!");
std::string agent = format_agent(gpuid);
std::string cause = smi_eviction_string(trigger);
detail = cause + " " + agent;
break;
}
// gpu_id
case HSA_SMI_EVENT_QUEUE_RESTORE: {
uint32_t gpuid;
uint32_t trigger;
args = sscanf(cursor, "%x %u", &gpuid, &trigger);
assert(args == 2 && "Parsing error!");
std::string agent = format_agent(gpuid);
std::string cause = smi_eviction_string(trigger);
detail = cause + " " + agent;
break;
}
//@addr(size) gpu_id
case HSA_SMI_EVENT_UNMAP_FROM_GPU: {
uint64_t addr;
uint32_t size;
uint32_t gpuid;
uint32_t trigger;
args = sscanf(cursor, "@%lx(%x) %x %u", &addr, &size, &gpuid, &trigger);
assert(args == 4 && "Parsing error!");
addr *= 4096;
size *= 4096;
std::string gpu = format_agent(gpuid);
std::string range = format("[%p, %p]", addr, addr + size - 1);
std::string cause = smi_unmap_string(trigger);
detail = cause + " " + gpu + " " + range;
break;
}
default:;
}
std::string record = std::string("ROCr HMM event: ") + std::to_string(time) + " " +
smi_event_string(event_id) + " " + detail;
// printf("%s\n", record.c_str());
fprintf(logFile, "%s\n", record.c_str());
}
} else {
auto err = errno;
const char* msg = strerror(err);
// printf("ROCr HMM event error: Read returned %ld, %s (%d)\n", len, msg, err);
fprintf(logFile, "ROCr HMM event error: Read returned %ld, %s (%d)\n", len, msg, err);
}
files[i].revents = 0;
}
}
if (files[0].revents & POLLIN) return;
}
}
// Creates the wake-up eventfd and launches the SMI polling thread (PollSmiRun) on this object.
// If the eventfd cannot be created no thread is started and `event` stays -1; if the thread
// cannot be created `poll_smi_thread_` stays NULL.
SvmProfileControl::SvmProfileControl() : event(-1), exit(false) {
  // Give the handle a defined value on every path: the destructor tests it against NULL even
  // when the early return below is taken. Assigned here rather than in the initializer list
  // because the member declaration order is not visible from this file.
  poll_smi_thread_ = NULL;

  event = eventfd(0, EFD_CLOEXEC);
  if (event == -1) return;

  poll_smi_thread_ = os::CreateThread(PollSmiRun, static_cast<void*>(this));
  if (poll_smi_thread_ == NULL) {
    assert(false && "Poll SMI thread creation error.");
  }
}
// Stops the SMI polling thread, if one was started, and releases the wake-up eventfd.
SvmProfileControl::~SvmProfileControl() {
  // The constructor returns before creating the thread when eventfd() fails, so in that case
  // there is nothing to stop or close (and the thread handle must not be inspected).
  if (event == -1) return;

  // Raise the exit flag before waking the thread so the flag is already set when poll() returns.
  exit = true;
  eventfd_write(event, 1);

  if (poll_smi_thread_ != NULL) {
    os::WaitForThread(poll_smi_thread_);
    os::CloseThread(poll_smi_thread_);
    poll_smi_thread_ = NULL;
  }

  close(event);
  event = -1;
}
// printf-style formatting into a std::string, using the member `format_buffer` as scratch
// space that is grown on demand and reused across calls.
//
// format: printf format string.
// args:   values consumed by the format string.
// Returns the formatted text, or an empty string if snprintf reports an error.
template <typename... Args>
std::string SvmProfileControl::format(const char* format, Args... args) {
  // NOTE(review): format_buffer's declaration is not visible here; if it can start out empty,
  // taking the address of element 0 would be invalid, so make sure it holds at least one byte.
  if (format_buffer.empty()) format_buffer.resize(64);

  int len = snprintf(&format_buffer[0], format_buffer.size(), format, args...);
  if (len < 0) {
    // snprintf failed; the buffer content is unspecified.
    return std::string();
  }

  // snprintf returns the length excluding the terminator; retry once if the text was truncated.
  const size_t needed = static_cast<size_t>(len) + 1;
  if (needed > format_buffer.size()) {
    format_buffer.resize(needed);
    snprintf(&format_buffer[0], format_buffer.size(), format, args...);
  }
  return std::string(&format_buffer[0]);
}
} // namespace AMD
} // namespace rocr