3619 γραμμές
131 KiB
C++
3619 γραμμές
131 KiB
C++
/* Copyright (c) 2008 - 2025 Advanced Micro Devices, Inc.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE. */
|
|
|
|
#include "cl.h"
|
|
#include "platform/program.hpp"
|
|
#include "platform/kernel.hpp"
|
|
#include "os/os.hpp"
|
|
#include "utils/debug.hpp"
|
|
#include "utils/flags.hpp"
|
|
#include "utils/options.hpp"
|
|
#include "utils/versions.hpp"
|
|
#include "thread/monitor.hpp"
|
|
#include "CL/cl_ext.h"
|
|
|
|
#include "vdi_common.hpp"
|
|
#include "device/comgrctx.hpp"
|
|
#include "device/devhostcall.hpp"
|
|
#include "device/rocm/rocdevice.hpp"
|
|
#include "device/rocm/rocblit.hpp"
|
|
#include "device/rocm/rocvirtual.hpp"
|
|
#include "device/rocm/rocprogram.hpp"
|
|
#include "device/rocm/rockernel.hpp"
|
|
#include "device/rocm/rocmemory.hpp"
|
|
#include "device/rocm/rocglinterop.hpp"
|
|
#include "device/rocm/rocsignal.hpp"
|
|
#include "platform/sampler.hpp"
|
|
|
|
#if defined(__clang__)
|
|
#if __has_feature(address_sanitizer)
|
|
#include "device/rocm/rocurilocator.hpp"
|
|
#endif
|
|
#endif
|
|
|
|
#include <algorithm>
|
|
#include <cstring>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <iomanip>
|
|
#include <memory>
|
|
#ifdef ROCCLR_SUPPORT_NUMA_POLICY
|
|
#include <numa.h>
|
|
#include <numaif.h>
|
|
#endif // ROCCLR_SUPPORT_NUMA_POLICY
|
|
#include <sstream>
|
|
#include <vector>
|
|
|
|
#define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR)
|
|
#define OPENCL_C_VERSION_STR XSTR(OPENCL_C_MAJOR) "." XSTR(OPENCL_C_MINOR)
|
|
|
|
|
|
static_assert(static_cast<uint32_t>(amd::Device::VmmAccess::kNone) ==
|
|
static_cast<uint32_t>(HSA_ACCESS_PERMISSION_NONE),
|
|
"Vmm Access Flag None mismatch with ROC-runtime!");
|
|
static_assert(static_cast<uint32_t>(amd::Device::VmmAccess::kReadOnly) ==
|
|
static_cast<uint32_t>(HSA_ACCESS_PERMISSION_RO),
|
|
"Vmm Access Flag Read mismatch with ROCr-runtime!");
|
|
static_assert(static_cast<uint32_t>(amd::Device::VmmAccess::kReadWrite) ==
|
|
static_cast<uint32_t>(HSA_ACCESS_PERMISSION_RW),
|
|
"Vmm Access Flag Read Write mismatch with ROC-runtime!");
|
|
|
|
|
|
namespace amd::device {
// Extra kernel sources declared here and defined in another translation unit.
// Device::createBlitProgram() picks one of them for HIP depending on whether
// GWS initialization is supported by the device settings.
extern const char* HipExtraSourceCode;
extern const char* HipExtraSourceCodeNoGWS;
}  // namespace amd::device
|
|
|
|
namespace amd::roc {
|
|
bool roc::Device::isHsaInitialized_ = false;
|
|
std::vector<hsa_agent_t> roc::Device::gpu_agents_;
|
|
std::vector<AgentInfo> roc::Device::cpu_agents_;
|
|
|
|
address Device::mg_sync_ = nullptr;
|
|
|
|
// Sets up an offline device for the given ISA.
//
// Fails (returns false, after logging) when the ISA is not supported by the ROC runtime, when
// the settings object cannot be created, when the code object manager cannot be validated, or
// when the base amd::Device setup fails. On success the identification strings in info_ are
// populated and true is returned.
bool NullDevice::create(const amd::Isa& isa) {
  if (!isa.runtimeRocSupported()) {
    LogPrintfError("Offline HSA device %s is not supported", isa.targetId());
    return false;
  }

  // Offline devices are reported as AMD (vendor id 0x1002) GPUs that are not online.
  online_ = false;
  info_.type_ = CL_DEVICE_TYPE_GPU;
  info_.vendorId_ = 0x1002;

  // settings_ is assigned before create() is attempted, exactly as the failure path expects.
  roc::Settings* offlineSettings = new roc::Settings();
  settings_ = offlineSettings;
  const bool settingsReady =
      (offlineSettings != nullptr) &&
      offlineSettings->create(false, isa, isa.xnack() == amd::Isa::Feature::Enabled);
  if (!settingsReady) {
    LogPrintfError("Error creating settings for offline HSA device %s", isa.targetId());
    return false;
  }

  if (!ValidateComgr()) {
    LogPrintfError("Code object manager initialization failed for offline HSA device %s",
                   isa.targetId());
    return false;
  }

  if (!amd::Device::create(isa)) {
    LogPrintfError("Unable to setup offline HSA device %s", isa.targetId());
    return false;
  }

  // The copies below leave the last byte of each destination buffer untouched.
  ::strncpy(info_.name_, isa.targetId(), sizeof(info_.name_) - 1);
  info_.extensions_ = getExtensionString();
  info_.maxWorkGroupSize_ = offlineSettings->maxWorkGroupSize_;
  ::strncpy(info_.vendor_, "Advanced Micro Devices, Inc.", sizeof(info_.vendor_) - 1);
  info_.oclcVersion_ = "OpenCL C " OPENCL_C_VERSION_STR " ";
  info_.spirVersions_ = "";

  std::stringstream driverVersion;
  driverVersion << AMD_BUILD_STRING " (HSA,LC) [Offline]";
  const std::string driverVersionText = driverVersion.str();
  ::strncpy(info_.driverVersion_, driverVersionText.c_str(), sizeof(info_.driverVersion_) - 1);

  info_.version_ = "OpenCL " OPENCL_VERSION_STR " ";
  return true;
}
|
|
|
|
// Builds a device object around an HSA agent handle.
// Only default values are stored here; all runtime queries happen later in Device::create().
Device::Device(hsa_agent_t bkendDevice)
    : mapCacheOps_(nullptr),
      mapCache_(nullptr),
      bkendDevice_(bkendDevice),
      pciDeviceId_(0),
      gpuvm_segment_max_alloc_(0),
      alloc_granularity_(0),
      xferQueue_(nullptr),
      freeMem_(0),
      vgpusAccess_(true), /* Virtual GPU List Ops Lock */
      hsa_exclusive_gpu_access_(false),
      queuePool_(QueuePriority::Total),
      coopHostcallBuffer_(nullptr),
      queueWithCUMaskPool_(QueuePriority::Total),
      numOfVgpus_(0),
      preferred_numa_node_(0),
      maxSdmaReadMask_(0),
      maxSdmaWriteMask_(0),
      cpu_agent_info_(nullptr) {
  // A zero handle marks a pool/signal that has not been discovered or created yet.
  group_segment_.handle = 0;
  gpuvm_segment_.handle = 0;
  gpu_fine_grained_segment_.handle = 0;
  gpu_ext_fine_grained_segment_.handle = 0;
  prefetch_signal_.handle = 0;

  isXgmi_ = false;
  cache_state_ = Device::CacheState::kCacheStateInvalid;
}
|
|
|
|
void Device::setupCpuAgent() {
|
|
int32_t numaDistance = std::numeric_limits<int32_t>::max();
|
|
uint32_t index = 0; // 0 as default
|
|
auto size = cpu_agents_.size();
|
|
for (uint32_t i = 0; i < size; i++) {
|
|
std::vector<amd::Device::LinkAttrType> link_attrs;
|
|
link_attrs.push_back(std::make_pair(LinkAttribute::kLinkDistance, 0));
|
|
if (findLinkInfo(cpu_agents_[i].fine_grain_pool, &link_attrs)) {
|
|
if (link_attrs[0].second < numaDistance) {
|
|
numaDistance = link_attrs[0].second;
|
|
index = i;
|
|
}
|
|
}
|
|
}
|
|
std::vector<amd::Device::LinkAttrType> link_attrs;
|
|
link_attrs.push_back(std::make_pair(LinkAttribute::kLinkLinkType, 0));
|
|
if (findLinkInfo(cpu_agents_[0].fine_grain_pool, &link_attrs)) {
|
|
isXgmi_ = (link_attrs[0].second == HSA_AMD_LINK_INFO_TYPE_XGMI);
|
|
}
|
|
|
|
preferred_numa_node_ = index;
|
|
cpu_agent_info_ = &cpu_agents_[index];
|
|
|
|
ClPrint(amd::LOG_INFO, amd::LOG_INIT,
|
|
"Numa selects cpu agent[%zu]=0x%zx(fine=0x%zx,"
|
|
"coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d",
|
|
index, cpu_agent_info_->agent.handle, cpu_agent_info_->fine_grain_pool.handle,
|
|
cpu_agent_info_->coarse_grain_pool.handle, bkendDevice_.handle, isXgmi_);
|
|
}
|
|
|
|
void Device::checkAtomicSupport() {
|
|
std::vector<amd::Device::LinkAttrType> link_attrs;
|
|
link_attrs.push_back(std::make_pair(LinkAttribute::kLinkAtomicSupport, 0));
|
|
if (findLinkInfo(cpu_agent_info_->fine_grain_pool, &link_attrs)) {
|
|
if (link_attrs[0].second == 1) {
|
|
info_.pcie_atomics_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Releases everything the device owns, in this order: cooperative hostcall buffer, cached map
// targets and their lock, P2P staging buffer, multi-GPU sync buffer, global context, pooled
// hardware queues (with their hostcall buffers), transfer queue, blit program, device context,
// P2P agent list and finally the prefetch signal.
Device::~Device() {
  if (coopHostcallBuffer_) {
    amd::disableHostcalls(coopHostcallBuffer_);
    context().svmFree(coopHostcallBuffer_);
    coopHostcallBuffer_ = nullptr;
  }

  // Release cached map targets
  if (mapCache_ != nullptr) {
    for (uint slot = 0; slot < mapCache_->size(); ++slot) {
      if ((*mapCache_)[slot] != nullptr) {
        (*mapCache_)[slot]->release();
      }
    }
  }
  delete mapCache_;
  delete mapCacheOps_;

  if (p2p_stage_ != nullptr) {
    p2p_stage_->release();
    p2p_stage_ = nullptr;
  }
  if (mg_sync_ != nullptr) {
    GlbCtx().svmFree(mg_sync_);
    mg_sync_ = nullptr;
  }
  if (glb_ctx_ != nullptr) {
    glb_ctx_->release();
    glb_ctx_ = nullptr;
  }

  // Drain each priority pool from the front; every entry is removed from the pool before the
  // underlying HSA queue is destroyed.
  for (auto& pool : queuePool_) {
    while (!pool.empty()) {
      auto entry = pool.begin();
      hsa_queue_t* queue = entry->first;
      auto& queueInfo = entry->second;
      if (queueInfo.hostcallBuffer_) {
        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_QUEUE,
                "Deleting hostcall buffer %p for hardware queue %p", queueInfo.hostcallBuffer_,
                entry->first->base_address);
        amd::disableHostcalls(queueInfo.hostcallBuffer_);
        context().svmFree(queueInfo.hostcallBuffer_);
      }
      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_QUEUE, "Deleting hardware queue %p with refCount 0",
              queue->base_address);
      pool.erase(entry);
      Hsa::queue_destroy(queue);
    }
  }
  queuePool_.clear();

  // Destroy transfer queue
  delete xferQueue_;

  delete blitProgram_;

  if (context_ != nullptr) {
    context_->release();
  }

  delete[] p2p_agents_list_;

  if (prefetch_signal_.handle != 0) {
    Hsa::signal_destroy(prefetch_signal_);
  }
}
|
|
|
|
// Intentionally empty: no teardown work is performed at this level.
void NullDevice::tearDown() {
}
|
|
|
|
bool NullDevice::init() {
|
|
// Create offline devices for all ISAs not already associated with an online
|
|
// device. This allows code objects to be compiled for all supported ISAs.
|
|
std::vector<Device*> devices = getDevices(CL_DEVICE_TYPE_GPU, false);
|
|
for (const amd::Isa* isa = amd::Isa::begin(); isa != amd::Isa::end(); isa++) {
|
|
if (!isa->runtimeRocSupported()) {
|
|
continue;
|
|
}
|
|
bool isOnline = false;
|
|
// Check if the particular device is online
|
|
for (size_t i = 0; i < devices.size(); i++) {
|
|
if (&(devices[i]->isa()) == isa) {
|
|
isOnline = true;
|
|
break;
|
|
}
|
|
}
|
|
if (isOnline) {
|
|
continue;
|
|
}
|
|
std::unique_ptr<NullDevice> nullDevice(new NullDevice());
|
|
if (!nullDevice) {
|
|
LogPrintfError("Error allocating new instance of offline HSA device %s", isa->targetId());
|
|
return false;
|
|
}
|
|
if (!nullDevice->create(*isa)) {
|
|
LogPrintfError("Skipping creating new instance of offline HSA sevice %s", isa->targetId());
|
|
continue;
|
|
}
|
|
nullDevice.release()->registerDevice();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Intentionally empty: the destructor body performs no explicit cleanup.
NullDevice::~NullDevice() {
}
|
|
|
|
// Agent enumeration callback: sorts each HSA agent into cpu_agents_ or gpu_agents_.
//
// CPU agents additionally have their memory pools scanned via iterateCpuMemoryPoolCallback and
// are recorded only when that scan succeeds. Agents of any other type are ignored. The status
// of the last HSA call is returned. `data` is not used.
hsa_status_t Device::iterateAgentCallback(hsa_agent_t agent, void* data) {
  hsa_device_type_t agentType = HSA_DEVICE_TYPE_CPU;
  hsa_status_t result = Hsa::agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &agentType);
  if (result != HSA_STATUS_SUCCESS) {
    LogPrintfError("HSA_AGENT_INFO_DEVICE failed with %x", result);
    return result;
  }

  switch (agentType) {
    case HSA_DEVICE_TYPE_CPU: {
      AgentInfo info = {agent, {0}, {0}, {0}};
      result = Hsa::agent_iterate_memory_pools(agent, Device::iterateCpuMemoryPoolCallback,
                                               reinterpret_cast<void*>(&info));
      if (result == HSA_STATUS_SUCCESS) {
        cpu_agents_.push_back(info);
      }
      break;
    }
    case HSA_DEVICE_TYPE_GPU:
      gpu_agents_.push_back(agent);
      break;
    default:
      break;
  }

  return result;
}
|
|
|
|
hsa_ven_amd_loader_1_00_pfn_t Device::amd_loader_ext_table = {nullptr};
|
|
|
|
hsa_status_t Device::loaderQueryHostAddress(const void* device, const void** host) {
|
|
return amd_loader_ext_table.hsa_ven_amd_loader_query_host_address
|
|
? amd_loader_ext_table.hsa_ven_amd_loader_query_host_address(device, host)
|
|
: HSA_STATUS_ERROR;
|
|
}
|
|
|
|
// ================================================================================================
|
|
bool Device::init() {
|
|
if (!Hsa::LoadLib()) {
|
|
LogPrintfWarning("Failed to load rocr library!");
|
|
return false;
|
|
}
|
|
|
|
hsa_status_t status = Hsa::init();
|
|
|
|
// If there are no GPUs available, hsa_init will fail with HSA_STATUS_ERROR_OUT_OF_RESOURCES
|
|
// but for NoGpu tests to pass, true needs to be returned
|
|
constexpr bool kNoOfflineDevices = false;
|
|
std::vector<amd::Device*> devices = getDevices(CL_DEVICE_TYPE_GPU, kNoOfflineDevices);
|
|
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES && devices.size() == 0) {
|
|
return true;
|
|
}
|
|
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("hsa_init failed with %x", status);
|
|
return false;
|
|
}
|
|
|
|
Hsa::system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(amd_loader_ext_table),
|
|
&amd_loader_ext_table);
|
|
|
|
status = Hsa::iterate_agents(iterateAgentCallback, nullptr);
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("hsa_iterate_agents failed with %x", status);
|
|
return false;
|
|
}
|
|
|
|
std::string ordinals =
|
|
amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
|
|
: GPU_DEVICE_ORDINAL;
|
|
if (ordinals[0] != '\0') {
|
|
size_t pos = 0;
|
|
std::vector<hsa_agent_t> valid_agents;
|
|
std::set<size_t> valid_indexes;
|
|
do {
|
|
size_t end;
|
|
bool deviceIdValid = true;
|
|
end = ordinals.find_first_of(',', pos);
|
|
if (end == std::string::npos) {
|
|
end = ordinals.size();
|
|
}
|
|
std::string str_id = ordinals.substr(pos, end - pos);
|
|
// If Uuid is specified, then convert it to index
|
|
// Uuid is an Ascii string with a maximum of 21 chars including NULL
|
|
// The string value is in the format GPU-<body>, <body> encodes UUID as a 16 chars hex
|
|
if (str_id.find("GPU-") != std::string::npos) {
|
|
for (int i = 0; i < gpu_agents_.size(); i++) {
|
|
auto agent = gpu_agents_[i];
|
|
char unique_id[32] = {0};
|
|
if (HSA_STATUS_SUCCESS ==
|
|
Hsa::agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_UUID),
|
|
unique_id)) {
|
|
if (std::string(unique_id).find(str_id) != std::string::npos) {
|
|
str_id = std::to_string(i);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
int index = atoi(str_id.c_str());
|
|
if (index < 0 || static_cast<size_t>(index) >= gpu_agents_.size() ||
|
|
str_id != std::to_string(index)) {
|
|
deviceIdValid = false;
|
|
}
|
|
|
|
if (!deviceIdValid) {
|
|
// Exit the loop as anything to the right of invalid deviceId
|
|
// has to be discarded
|
|
break;
|
|
} else {
|
|
if (valid_indexes.find(index) == valid_indexes.end()) {
|
|
valid_agents.push_back(gpu_agents_[index]);
|
|
valid_indexes.insert(index);
|
|
}
|
|
}
|
|
pos = end + 1;
|
|
} while (pos < ordinals.size());
|
|
gpu_agents_ = valid_agents;
|
|
}
|
|
|
|
LogPrintfInfo("Initalizing runtime stack, Enumerated GPU agents = %lu", gpu_agents_.size());
|
|
|
|
for (auto agent : gpu_agents_) {
|
|
std::unique_ptr<Device> roc_device(new Device(agent));
|
|
if (!roc_device) {
|
|
LogError("Error creating new instance of Device on then heap.");
|
|
continue;
|
|
}
|
|
|
|
if (!roc_device->create()) {
|
|
LogError("Error creating new instance of Device.");
|
|
continue;
|
|
}
|
|
|
|
// Setup System Memory to be Non-Coherent per user
|
|
// request via environment variable. By default the
|
|
// System Memory is setup to be Coherent
|
|
if (roc_device->settings().enableNCMode_) {
|
|
hsa_status_t err = Hsa::coherency_set_type(agent, HSA_AMD_COHERENCY_TYPE_NONCOHERENT);
|
|
if (err != HSA_STATUS_SUCCESS) {
|
|
LogError("Unable to set NC memory policy!");
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Check to see if a global CU mask is requested
|
|
if (amd::IS_HIP && ROC_GLOBAL_CU_MASK[0] != '\0') {
|
|
roc_device->getGlobalCUMask(ROC_GLOBAL_CU_MASK);
|
|
}
|
|
|
|
roc_device.release()->registerDevice();
|
|
}
|
|
|
|
// Query active devices only
|
|
devices = getDevices(CL_DEVICE_TYPE_GPU, kNoOfflineDevices);
|
|
if (devices.size() > 0) {
|
|
bool p2p_available = false;
|
|
// Loop through all available devices
|
|
for (auto device1 : devices) {
|
|
// Find all agents that can have access to the current device
|
|
for (auto agent : static_cast<Device*>(device1)->p2pAgents()) {
|
|
// Find cl_device_id associated with the current agent
|
|
for (auto device2 : devices) {
|
|
if (agent.handle == static_cast<Device*>(device2)->getBackendDevice().handle) {
|
|
// Device2 can have access to device1
|
|
device2->p2pDevices_.push_back(as_cl(device1));
|
|
device1->p2p_access_devices_.push_back(device2);
|
|
p2p_available = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create a dummy context for internal memory allocations on all reported devices
|
|
glb_ctx_ = new amd::Context(devices, amd::Context::Info());
|
|
if (glb_ctx_ == nullptr) {
|
|
LogError("glb_ctx failed");
|
|
return false;
|
|
}
|
|
|
|
// Allocate a staging buffer for P2P emulation path
|
|
if ((devices.size() >= 1) && !p2p_available) {
|
|
amd::Buffer* buf =
|
|
new (*glb_ctx_) amd::Buffer(*glb_ctx_, CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
|
|
if ((buf != nullptr) && buf->create()) {
|
|
p2p_stage_ = buf;
|
|
} else {
|
|
delete buf;
|
|
LogError("p2p stg buffer alloc failed");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Allocate mgpu sync buffer for cooperative launches
|
|
if (amd::IS_HIP) {
|
|
mg_sync_ = reinterpret_cast<address>(
|
|
glb_ctx_->svmAlloc(kMGInfoSizePerDevice * devices.size(), kMGInfoSizePerDevice,
|
|
(CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS)));
|
|
if (mg_sync_ == nullptr) {
|
|
LogError("mgpu sync buffer alloc failed");
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (amd::IS_HIP) {
|
|
RegisterBackendErrorCb();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
extern const char* SchedulerSourceCode;
|
|
|
|
void Device::tearDown() {
|
|
NullDevice::tearDown();
|
|
Hsa::shut_down();
|
|
}
|
|
|
|
// ================================================================================================
|
|
bool Device::create() {
|
|
char agent_name[64] = {0};
|
|
if (HSA_STATUS_SUCCESS != Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_NAME, agent_name)) {
|
|
LogError("Unable to get HSA device name");
|
|
return false;
|
|
}
|
|
|
|
if (HSA_STATUS_SUCCESS != Hsa::agent_get_info(bkendDevice_,
|
|
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID,
|
|
&pciDeviceId_)) {
|
|
LogPrintfError("Unable to get PCI ID of HSA device %s", agent_name);
|
|
return false;
|
|
}
|
|
|
|
struct agent_isas_t {
|
|
uint count;
|
|
hsa_isa_t first_isa;
|
|
} agent_isas = {0, {0}};
|
|
if (HSA_STATUS_SUCCESS != Hsa::agent_iterate_isas(
|
|
bkendDevice_,
|
|
[](hsa_isa_t isa, void* data) {
|
|
agent_isas_t* agent_isas = static_cast<agent_isas_t*>(data);
|
|
if (agent_isas->count++ == 0) {
|
|
agent_isas->first_isa = isa;
|
|
}
|
|
return HSA_STATUS_SUCCESS;
|
|
},
|
|
&agent_isas)) {
|
|
LogPrintfError("Unable to iterate supported ISAs for HSA device %s (PCI ID %x)", agent_name,
|
|
pciDeviceId_);
|
|
return false;
|
|
}
|
|
|
|
uint32_t isa_name_length = 0;
|
|
if (HSA_STATUS_SUCCESS != Hsa::isa_get_info_alt(agent_isas.first_isa,
|
|
(hsa_isa_info_t)HSA_ISA_INFO_NAME_LENGTH,
|
|
&isa_name_length)) {
|
|
LogPrintfError("Unable to get ISA name length for HSA device %s (PCI ID %x)", agent_name,
|
|
pciDeviceId_);
|
|
return false;
|
|
}
|
|
|
|
std::vector<char> isa_name(isa_name_length + 1, '\0');
|
|
if (HSA_STATUS_SUCCESS != Hsa::isa_get_info_alt(agent_isas.first_isa,
|
|
(hsa_isa_info_t)HSA_ISA_INFO_NAME,
|
|
isa_name.data())) {
|
|
LogPrintfError("Unable to get ISA name for HSA device %s (PCI ID %x)", agent_name,
|
|
pciDeviceId_);
|
|
return false;
|
|
}
|
|
|
|
const amd::Isa* isa = amd::Isa::findIsa(isa_name.data());
|
|
if (!isa || !isa->runtimeRocSupported()) {
|
|
LogPrintfError("Unsupported HSA device %s (PCI ID %x) for ISA %s", agent_name, pciDeviceId_,
|
|
isa_name.data());
|
|
return false;
|
|
}
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_PROFILE, &agent_profile_)) {
|
|
LogPrintfError("Unable to get profile for HSA device %s (PCI ID %x)", agent_name, pciDeviceId_);
|
|
return false;
|
|
}
|
|
|
|
uint32_t coop_groups = 0;
|
|
// Check cooperative groups for HIP only
|
|
if (amd::IS_HIP &&
|
|
(HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES),
|
|
&coop_groups))) {
|
|
LogPrintfError(
|
|
"Unable to determine if cooperative queues are supported for HSA device %s (PCI ID %x)",
|
|
agent_name, pciDeviceId_);
|
|
return false;
|
|
}
|
|
|
|
setupCpuAgent();
|
|
|
|
// Get Agent HDP Flush Register Memory
|
|
hsa_amd_hdp_flush_t hdpInfo;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_HDP_FLUSH),
|
|
&hdpInfo)) {
|
|
LogPrintfError("Unable to determine HDP flush info for HSA device %s", agent_name);
|
|
return false;
|
|
}
|
|
|
|
info_.hdpMemFlushCntl = hdpInfo.HDP_MEM_FLUSH_CNTL;
|
|
info_.hdpRegFlushCntl = hdpInfo.HDP_REG_FLUSH_CNTL;
|
|
bool hasValidHDPFlush = (info_.hdpMemFlushCntl != nullptr) && (info_.hdpRegFlushCntl != nullptr);
|
|
|
|
// Create HSA settings
|
|
assert(!settings_);
|
|
roc::Settings* hsaSettings = new roc::Settings();
|
|
settings_ = hsaSettings;
|
|
if (!hsaSettings || !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa,
|
|
isa->xnack() == amd::Isa::Feature::Enabled, coop_groups,
|
|
isXgmi_, hasValidHDPFlush)) {
|
|
LogPrintfError("Unable to create settings for HSA device %s (PCI ID %x)", agent_name,
|
|
pciDeviceId_);
|
|
return false;
|
|
}
|
|
|
|
if (!ValidateComgr()) {
|
|
LogPrintfError("Code object manager initialization failed for HSA device %s (PCI ID %x)",
|
|
agent_name, pciDeviceId_);
|
|
return false;
|
|
}
|
|
|
|
if (!amd::Device::create(*isa)) {
|
|
LogPrintfError("Unable to setup device for HSA device %s (PCI ID %x)", agent_name,
|
|
pciDeviceId_);
|
|
return false;
|
|
}
|
|
|
|
uint32_t hsa_bdf_id = 0;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_BDFID),
|
|
&hsa_bdf_id)) {
|
|
LogPrintfError("Unable to determine BFD ID for HSA device %s (PCI ID %x)", agent_name,
|
|
pciDeviceId_);
|
|
return false;
|
|
}
|
|
|
|
info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
|
|
info_.deviceTopology_.pcie.bus = (hsa_bdf_id & (0xFF << 8)) >> 8;
|
|
info_.deviceTopology_.pcie.device = (hsa_bdf_id & (0x1F << 3)) >> 3;
|
|
info_.deviceTopology_.pcie.function = (hsa_bdf_id & 0x07);
|
|
uint32_t pci_domain_id = 0;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_DOMAIN),
|
|
&pci_domain_id)) {
|
|
LogPrintfError("Unable to determine domain ID for HSA device %s (PCI ID %x)", agent_name,
|
|
pciDeviceId_);
|
|
return false;
|
|
}
|
|
info_.pciDomainID = pci_domain_id;
|
|
|
|
if (populateOCLDeviceConstants() == false) {
|
|
LogPrintfError("populateOCLDeviceConstants failed for HSA device %s (PCI ID %x)", agent_name,
|
|
pciDeviceId_);
|
|
return false;
|
|
}
|
|
hsaSettings->limit_blit_wg_ = info().maxComputeUnits_;
|
|
if (!flagIsDefault(DEBUG_CLR_LIMIT_BLIT_WG)) {
|
|
hsaSettings->limit_blit_wg_ = std::max(DEBUG_CLR_LIMIT_BLIT_WG, 0x1U);
|
|
}
|
|
amd::Context::Info info = {0};
|
|
std::vector<amd::Device*> devices;
|
|
devices.push_back(this);
|
|
|
|
// Create a dummy context
|
|
context_ = new amd::Context(devices, info);
|
|
if (context_ == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
// Map Cache Lock
|
|
mapCacheOps_ = new amd::Monitor(true);
|
|
if (nullptr == mapCacheOps_) {
|
|
return false;
|
|
}
|
|
|
|
mapCache_ = new std::vector<amd::Memory*>();
|
|
if (mapCache_ == nullptr) {
|
|
return false;
|
|
}
|
|
// Use just 1 entry by default for the map cache
|
|
mapCache_->push_back(nullptr);
|
|
|
|
// Create signal for HMM prefetch operation on device
|
|
if (HSA_STATUS_SUCCESS != Hsa::signal_create(kInitSignalValueOne, 0, nullptr, &prefetch_signal_)) {
|
|
return false;
|
|
}
|
|
|
|
if (AMD_LOG_LEVEL >= LOG_EXTRA_DEBUG) {
|
|
uint8_t logMask[8] = {0};
|
|
hsa_flag_set64(logMask, HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS);
|
|
Hsa::enable_logging(logMask, outFile);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Allocates a roc::Program bound to this device and the given owner.
// `options` is not used here. Logs an error if the allocation yields a null pointer, which is
// then returned as-is.
device::Program* NullDevice::createProgram(amd::Program& owner, amd::option::Options* options) {
  device::Program* const created = new roc::Program(*this, owner);
  if (created == nullptr) {
    LogError("Memory allocation has failed!");
  }
  return created;
}
|
|
|
|
bool Device::createBlitProgram() {
|
|
bool result = true;
|
|
std::string extraKernel;
|
|
|
|
if (amd::IS_HIP) {
|
|
if (settings().gwsInitSupported_) {
|
|
extraKernel = device::HipExtraSourceCode;
|
|
} else {
|
|
extraKernel = device::HipExtraSourceCodeNoGWS;
|
|
}
|
|
} else {
|
|
extraKernel = SchedulerSourceCode;
|
|
}
|
|
|
|
blitProgram_ = new BlitProgram(context_);
|
|
// Create blit programs
|
|
if (blitProgram_ == nullptr || !blitProgram_->create(this, extraKernel, "")) {
|
|
delete blitProgram_;
|
|
blitProgram_ = nullptr;
|
|
LogError("Couldn't create blit kernels!");
|
|
return false;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
// Allocates a roc::Program bound to this device and the given owner.
// `options` is not used here. Logs an error if the allocation yields a null pointer, which is
// then returned as-is.
device::Program* Device::createProgram(amd::Program& owner, amd::option::Options* options) {
  device::Program* const created = new roc::Program(*this, owner);
  if (created == nullptr) {
    LogError("Memory allocation has failed!");
  }
  return created;
}
|
|
|
|
// Memory pool enumeration callback for a GPU agent; `data` must point to the owning Device.
//
// Group-segment pools are stored in group_segment_. Global-segment pools are classified only
// when settings().enableLocalMemory_ is set:
//   - extended-scope fine-grained -> gpu_ext_fine_grained_segment_
//   - fine-grained                -> gpu_fine_grained_segment_
//   - coarse-grained              -> gpuvm_segment_, plus large-BAR detection and the
//                                    allocation granularity query
// and the first such pool seen becomes gpuvm_segment_ if none has been recorded yet.
//
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a null `data`, the failing status when the
// segment or global-flags query fails, and HSA_STATUS_SUCCESS otherwise. A failing granularity
// query is only logged.
hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, void* data) {
  if (data == nullptr) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  hsa_region_segment_t segment_type = (hsa_region_segment_t)0;
  hsa_status_t stat =
      Hsa::memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type);
  if (stat != HSA_STATUS_SUCCESS) {
    return stat;
  }

  // TODO: system and device local segment
  Device* dev = reinterpret_cast<Device*>(data);
  switch (segment_type) {
    case HSA_REGION_SEGMENT_GLOBAL: {
      if (dev->settings().enableLocalMemory_) {
        uint32_t global_flag = 0;
        // Reuses the outer status variable; the original redeclared it here and shadowed it.
        stat = Hsa::memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
        if (stat != HSA_STATUS_SUCCESS) {
          return stat;
        }

        // If the flag set is ext scoped fine grain, break the loop
        if ((global_flag & HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) != 0) {
          dev->gpu_ext_fine_grained_segment_ = pool;
          break;
        }

        if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) {
          dev->gpu_fine_grained_segment_ = pool;
        } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) {
          dev->gpuvm_segment_ = pool;

          // If cpu agent cannot access this pool, the device does not support large bar.
          // tmp is value-initialized, so a failed query leaves it at its zero value.
          hsa_amd_memory_pool_access_t tmp{};
          Hsa::agent_memory_pool_get_info(dev->cpu_agent_info_->agent, pool,
                                          HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &tmp);

          if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
            dev->info_.largeBar_ = false;
          } else {
            dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR;
          }

          // Query the recommended granularity for this pool.
          stat = Hsa::memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
                                           &(dev->info_.virtualMemAllocGranularity_));
          if (stat != HSA_STATUS_SUCCESS) {
            LogPrintfError(
                "Cannot query HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE info "
                "failed with hsa_status: %d \n",
                stat);
          }
        }

        if (dev->gpuvm_segment_.handle == 0) {
          dev->gpuvm_segment_ = pool;
        }
      }
      break;
    }
    case HSA_REGION_SEGMENT_GROUP:
      dev->group_segment_ = pool;
      break;
    default:
      break;
  }

  return HSA_STATUS_SUCCESS;
}
|
|
|
|
// Memory pool enumeration callback for a CPU agent; `data` must point to the AgentInfo being
// filled in. Only global-segment pools are classified:
//   - extended-scope fine-grained -> ext_fine_grain_pool
//   - fine-grained                -> fine_grain_pool (see the replacement rule below)
//   - anything else               -> coarse_grain_pool
//   - kern-arg flag set           -> additionally kern_arg_pool
// Inconsistent flag combinations are rejected through guarantee().
//
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a null `data`, otherwise the status of the
// last HSA query performed.
hsa_status_t Device::iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, void* data) {
  if (data == nullptr) {
    LogError("CpuMemoryPoolCallback invalid args");
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  hsa_region_segment_t segment_type = (hsa_region_segment_t)0;
  hsa_status_t stat =
      Hsa::memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type);
  if (stat != HSA_STATUS_SUCCESS) {
    LogPrintfError("HSA_AMD_MEMORY_POOL_INFO_SEGMENT query failed with %x", stat);
    return stat;
  }

  // Pools outside the global segment are of no interest here.
  if (segment_type != HSA_REGION_SEGMENT_GLOBAL) {
    return stat;
  }

  AgentInfo* agentInfo = reinterpret_cast<AgentInfo*>(data);

  uint32_t global_flag = 0;
  stat = Hsa::memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
  if (stat != HSA_STATUS_SUCCESS) {
    LogPrintfError("HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS query failed with %x", stat);
    return stat;
  }

  // An extended-scope fine-grained pool is recorded on its own; nothing else is checked.
  if ((global_flag & HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) != 0) {
    agentInfo->ext_fine_grain_pool = pool;
    return stat;
  }

  if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) {
    // The first fine-grained pool is always taken. A later one replaces it only when that
    // later pool does not carry the kern-arg flag.
    const bool firstFinePool = (agentInfo->fine_grain_pool.handle == 0);
    if (firstFinePool || ((global_flag & HSA_REGION_GLOBAL_FLAG_KERNARG) == 0)) {
      agentInfo->fine_grain_pool = pool;
    }
    guarantee(((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) == 0),
              "Memory Segment cannot be both coarse and fine grained");
  } else {
    // If the flag is not set to fine grained, then it is coarse_grained by default.
    agentInfo->coarse_grain_pool = pool;
    guarantee(((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0),
              "Memory Segments that are not fine grained has to be coarse grained");
    guarantee(((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) == 0),
              "Memory Segment cannot be both coarse and fine grained");
    guarantee(((global_flag & HSA_REGION_GLOBAL_FLAG_KERNARG) == 0),
              "Coarse grained memory segment cannot have kern_args tag");
  }

  if ((global_flag & HSA_REGION_GLOBAL_FLAG_KERNARG) != 0) {
    agentInfo->kern_arg_pool = pool;
    guarantee(((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) == 0),
              "Coarse grained memory segment cannot have kern_args tag");
  }

  return stat;
}
|
|
|
|
// Creates the device-side sampler object for `owner`.
// On success the new object is stored in *sampler and true is returned; on failure *sampler is
// nullptr, the partially built object is destroyed and false is returned.
bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const {
  *sampler = nullptr;

  std::unique_ptr<Sampler> candidate(new Sampler(*this));
  if (!candidate || !candidate->create(owner)) {
    return false;  // candidate (if any) is deleted on scope exit
  }

  *sampler = candidate.release();
  return true;
}
|
|
|
|
// Translates the OpenCL sampler state into an HSA sampler descriptor: filter mode, coordinate
// mode and the addressing mode of each of the three dimensions.
// If a dimension reports an addressing mode that is not handled below, the function returns
// immediately and leaves that entry and the remaining ones unwritten.
void Sampler::fillSampleDescriptor(hsa_ext_sampler_descriptor_v2_t& samplerDescriptor,
                                   const amd::Sampler& sampler) const {
  if (sampler.filterMode() == CL_FILTER_NEAREST) {
    samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST;
  } else {
    samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
  }

  if (sampler.normalizedCoords()) {
    samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED;
  } else {
    samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
  }

  for (int dim = 0; dim < 3; ++dim) {
    auto& hsaMode = samplerDescriptor.address_modes[dim];
    switch (sampler.addressingMode(dim)) {
      case CL_ADDRESS_CLAMP_TO_EDGE:
        hsaMode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
        break;
      case CL_ADDRESS_REPEAT:
        hsaMode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
        break;
      case CL_ADDRESS_CLAMP:
        hsaMode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
        break;
      case CL_ADDRESS_MIRRORED_REPEAT:
        hsaMode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
        break;
      case CL_ADDRESS_NONE:
        hsaMode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
        break;
      default:
        return;
    }
  }
}
|
|
|
|
// Creates the HSA sampler that backs 'owner' on this device.
// Returns true and publishes the sampler handle through hwSrd_ / hwState_ on
// success; logs the HSA status and returns false otherwise.
bool Sampler::create(const amd::Sampler& owner) {
  // Value-initialize the descriptor: fillSampleDescriptor() stops early when it
  // meets an unrecognized addressing mode, and without this the remaining
  // address_modes entries would be passed to HSA with indeterminate values.
  hsa_ext_sampler_descriptor_v2_t samplerDescriptor = {};
  fillSampleDescriptor(samplerDescriptor, owner);

  hsa_status_t status =
      Hsa::sampler_create(dev_.getBackendDevice(), &samplerDescriptor, &hsa_sampler);

  if (HSA_STATUS_SUCCESS != status) {
    DevLogPrintfError("Sampler creation failed with status: %d \n", status);
    return false;
  }

  // The same HSA handle is exposed both as the raw SRD value and as an address
  hwSrd_ = hsa_sampler.handle;
  hwState_ = reinterpret_cast<address>(hsa_sampler.handle);

  return true;
}
|
|
|
|
// Destroys the HSA sampler handle held in hsa_sampler on the owning device.
Sampler::~Sampler() {
  Hsa::sampler_destroy(dev_.getBackendDevice(), hsa_sampler);
}
|
|
|
|
// Returns the device memory object that 'mem' holds for this device, viewed as a
// roc::Memory.
Memory* Device::getGpuMemory(amd::Memory* mem) const {
  auto* deviceMemory = mem->getDeviceMemory(*this);
  return static_cast<roc::Memory*>(deviceMemory);
}
|
|
|
|
const bool Device::isFineGrainSupported() const {
|
|
bool result = (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
|
|
if (result) {
|
|
if (gpu_fine_grained_segment_.handle != 0) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
// ================================================================================================
|
|
bool Device::populateOCLDeviceConstants() {
|
|
info_.available_ = true;
|
|
|
|
::strncpy(info_.name_, isa().targetId(), sizeof(info_.name_) - 1);
|
|
char device_name[64] = {0};
|
|
if (HSA_STATUS_SUCCESS == Hsa::agent_get_info(bkendDevice_,
|
|
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME,
|
|
device_name)) {
|
|
::strncpy(info_.boardName_, device_name, sizeof(info_.boardName_) - 1);
|
|
}
|
|
|
|
char unique_id[32] = {0};
|
|
if (HSA_STATUS_SUCCESS ==
|
|
Hsa::agent_get_info(bkendDevice_, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_UUID),
|
|
unique_id)) {
|
|
// ROCr gives the UUID info in the format GPU-XXXX with length 20 bytes
|
|
// Strip the first 4 bytes and store only the 16 bytes representing UUID
|
|
for (size_t i = 0; i < 16; i++) {
|
|
info_.uuid_[i] = unique_id[i + 4];
|
|
}
|
|
}
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
(amd::IS_HIP)
|
|
? (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT
|
|
: (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
|
|
&info_.maxComputeUnits_)) {
|
|
return false;
|
|
}
|
|
assert(info_.maxComputeUnits_ > 0);
|
|
|
|
info_.maxComputeUnits_ =
|
|
settings().enableWgpMode_ ? info_.maxComputeUnits_ / 2 : info_.maxComputeUnits_;
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
|
|
&info_.maxPhysicalComputeUnits_)) {
|
|
return false;
|
|
}
|
|
assert(info_.maxPhysicalComputeUnits_ > 0);
|
|
|
|
info_.maxPhysicalComputeUnits_ = settings().enableWgpMode_ ? info_.maxPhysicalComputeUnits_ / 2
|
|
: info_.maxPhysicalComputeUnits_;
|
|
|
|
if (HSA_STATUS_SUCCESS != Hsa::agent_get_info(bkendDevice_,
|
|
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
|
|
&info_.globalMemCacheLineSize_)) {
|
|
return false;
|
|
}
|
|
info_.globalMemCacheLineSize_ =
|
|
(info_.globalMemCacheLineSize_ != 0) ? info_.globalMemCacheLineSize_ : 64;
|
|
|
|
uint32_t cachesize[4] = {0};
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_CACHE_SIZE, cachesize)) {
|
|
return false;
|
|
}
|
|
assert(cachesize[0] > 0);
|
|
info_.globalMemCacheSize_ = cachesize[0];
|
|
|
|
info_.globalMemCacheType_ = CL_READ_WRITE_CACHE;
|
|
|
|
info_.type_ = CL_DEVICE_TYPE_GPU;
|
|
|
|
info_.extensions_ = getExtensionString();
|
|
info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ =
|
|
(settings().doublePrecision_) ? 1 : 0;
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
|
|
&info_.maxEngineClockFrequency_)) {
|
|
return false;
|
|
}
|
|
|
|
if (!(isa().versionMajor() == 9 && isa().versionMinor() == 0 && isa().versionStepping() == 2)) {
|
|
if (info_.maxEngineClockFrequency_ <= 0) {
|
|
LogError("maxEngineClockFrequency_ is NOT positive!");
|
|
}
|
|
}
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY,
|
|
&info_.maxMemoryClockFrequency_)) {
|
|
return false;
|
|
}
|
|
|
|
uint64_t wallClockFrequency = 0; // in Hz
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
|
|
&wallClockFrequency)) {
|
|
LogWarning("HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY cannot be queried. Ignored!");
|
|
}
|
|
info_.wallClockFrequency_ = static_cast<uint32_t>(wallClockFrequency / 1000); // in KHz
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID),
|
|
&info_.driverNodeId_)) {
|
|
return false;
|
|
}
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SDMA_ENG),
|
|
&info_.numSDMAengines_)) {
|
|
return false;
|
|
}
|
|
|
|
uint64_t scratchLimitMax = 0;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_MAX,
|
|
&scratchLimitMax)) {
|
|
LogWarning("HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_MAX cannot be queried!");
|
|
return false;
|
|
}
|
|
info_.scratchLimitMin = 0;
|
|
info_.scratchLimitMax = scratchLimitMax;
|
|
|
|
checkAtomicSupport();
|
|
|
|
assert(cpu_agent_info_->fine_grain_pool.handle != 0);
|
|
if (HSA_STATUS_SUCCESS != Hsa::agent_iterate_memory_pools(
|
|
bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) {
|
|
return false;
|
|
}
|
|
|
|
assert(group_segment_.handle != 0);
|
|
|
|
for (auto agent : gpu_agents_) {
|
|
if (agent.handle != bkendDevice_.handle) {
|
|
hsa_status_t err;
|
|
// Can another GPU (agent) have access to the current GPU memory pool (gpuvm_segment_)?
|
|
hsa_amd_memory_pool_access_t access;
|
|
err = Hsa::agent_memory_pool_get_info(agent, gpuvm_segment_,
|
|
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
|
|
if (err != HSA_STATUS_SUCCESS) {
|
|
continue;
|
|
}
|
|
|
|
// Find accessible p2p agents - i.e != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED
|
|
if (HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT == access ||
|
|
HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT == access) {
|
|
// Agent can have access to the current gpuvm_segment_
|
|
p2p_agents_.push_back(agent);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Keep track of all P2P Agents in a Array including current device handle for IPC
|
|
p2p_agents_list_ = new hsa_agent_t[1 + p2p_agents_.size()];
|
|
p2p_agents_list_[0] = getBackendDevice();
|
|
for (size_t agent_idx = 0; agent_idx < p2p_agents_.size(); ++agent_idx) {
|
|
p2p_agents_list_[1 + agent_idx] = p2p_agents_[agent_idx];
|
|
}
|
|
|
|
size_t group_segment_size = 0;
|
|
if (HSA_STATUS_SUCCESS != Hsa::memory_pool_get_info(group_segment_,
|
|
HSA_AMD_MEMORY_POOL_INFO_SIZE,
|
|
&group_segment_size)) {
|
|
return false;
|
|
}
|
|
assert(group_segment_size > 0);
|
|
|
|
// Find SDMA read mask
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::memory_copy_engine_status(getCpuAgent(), getBackendDevice(), &maxSdmaReadMask_)) {
|
|
return false;
|
|
}
|
|
assert(maxSdmaReadMask_ > 0 && "No SDMA engines available for Read");
|
|
|
|
// Find SDMA write mask
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::memory_copy_engine_status(getBackendDevice(), getCpuAgent(), &maxSdmaWriteMask_)) {
|
|
return false;
|
|
}
|
|
assert(maxSdmaWriteMask_ > 0 && "No SDMA engines available for Write");
|
|
|
|
info_.localMemSizePerCU_ = group_segment_size;
|
|
info_.localMemSize_ = group_segment_size;
|
|
|
|
info_.maxWorkItemDimensions_ = 3;
|
|
|
|
uint8_t memory_properties[8];
|
|
// Get the memory property from ROCr.
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES,
|
|
memory_properties)) {
|
|
LogError("HSA_AGENT_INFO_AMD_MEMORY_PROPERTIES query failed");
|
|
}
|
|
|
|
// Check if the device is APU
|
|
if (hsa_flag_isset64(memory_properties, HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU)) {
|
|
info_.hostUnifiedMemory_ = 1;
|
|
}
|
|
|
|
if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) {
|
|
size_t global_segment_size = 0;
|
|
if (HSA_STATUS_SUCCESS != Hsa::memory_pool_get_info(gpuvm_segment_,
|
|
HSA_AMD_MEMORY_POOL_INFO_SIZE,
|
|
&global_segment_size)) {
|
|
return false;
|
|
}
|
|
|
|
assert(global_segment_size > 0);
|
|
info_.globalMemSize_ = (static_cast<uint64_t>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
|
|
static_cast<uint64_t>(global_segment_size)) /
|
|
100u;
|
|
|
|
// For APU with vram size <= 512MiB, use a smaller single alloc percentage
|
|
if (info_.globalMemSize_ <= 536870912) {
|
|
if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) {
|
|
GPU_SINGLE_ALLOC_PERCENT = 75;
|
|
}
|
|
}
|
|
|
|
gpuvm_segment_max_alloc_ =
|
|
uint64_t(info_.globalMemSize_ * std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
|
|
assert(gpuvm_segment_max_alloc_ > 0);
|
|
|
|
info_.maxMemAllocSize_ = static_cast<uint64_t>(gpuvm_segment_max_alloc_);
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
|
|
&alloc_granularity_)) {
|
|
return false;
|
|
}
|
|
|
|
assert(alloc_granularity_ > 0);
|
|
} else {
|
|
// We suppose half of physical memory can be used by GPU in APU system
|
|
info_.globalMemSize_ = amd::Os::hostTotalPhysicalMemory() / 2;
|
|
info_.globalMemSize_ = std::max(info_.globalMemSize_, uint64_t(1 * Gi));
|
|
info_.globalMemSize_ = (static_cast<uint64_t>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
|
|
static_cast<uint64_t>(info_.globalMemSize_)) /
|
|
100u;
|
|
|
|
info_.maxMemAllocSize_ =
|
|
uint64_t(info_.globalMemSize_ * std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::memory_pool_get_info(cpu_agent_info_->fine_grain_pool,
|
|
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
|
|
&alloc_granularity_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
freeMem_ = info_.globalMemSize_;
|
|
|
|
// Make sure the max allocation size is not larger than the available memory size.
|
|
info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_);
|
|
info_.maxMemAllocSize_ = amd::alignDown(info_.maxMemAllocSize_, sizeof(uint64_t));
|
|
|
|
// Maximum system memory allocation size allowed
|
|
info_.maxPhysicalMemAllocSize_ = amd::Os::getPhysicalMemSize();
|
|
|
|
// make sure we don't run anything over 8 params for now
|
|
info_.maxParameterSize_ = 1024;
|
|
|
|
uint32_t max_work_group_size = 0;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &max_work_group_size)) {
|
|
return false;
|
|
}
|
|
assert(max_work_group_size > 0);
|
|
max_work_group_size =
|
|
std::min(max_work_group_size, static_cast<uint32_t>(settings().maxWorkGroupSize_));
|
|
info_.maxWorkGroupSize_ = max_work_group_size;
|
|
|
|
uint16_t max_workgroup_size[3] = {0, 0, 0};
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, &max_workgroup_size)) {
|
|
return false;
|
|
}
|
|
assert(max_workgroup_size[0] != 0 && max_workgroup_size[1] != 0 && max_workgroup_size[2] != 0);
|
|
|
|
uint16_t max_work_item_size = static_cast<uint16_t>(max_work_group_size);
|
|
info_.maxWorkItemSizes_[0] = std::min(max_workgroup_size[0], max_work_item_size);
|
|
info_.maxWorkItemSizes_[1] = std::min(max_workgroup_size[1], max_work_item_size);
|
|
info_.maxWorkItemSizes_[2] = std::min(max_workgroup_size[2], max_work_item_size);
|
|
info_.preferredWorkGroupSize_ = settings().preferredWorkGroupSize_;
|
|
|
|
info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4;
|
|
info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2;
|
|
info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1;
|
|
info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1;
|
|
info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1;
|
|
|
|
if (agent_profile_ == HSA_PROFILE_FULL) { // full-profile = participating in coherent memory,
|
|
// base-profile = NUMA based non-coherent memory
|
|
info_.hostUnifiedMemory_ = 1;
|
|
info_.iommuv2_ = true;
|
|
}
|
|
info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? sizeof(int64_t[16]) * 2
|
|
: MEMOBJ_BASE_ADDR_ALIGN);
|
|
info_.minDataTypeAlignSize_ = sizeof(int64_t[16]);
|
|
|
|
info_.maxConstantArgs_ = 8;
|
|
info_.preferredConstantBufferSize_ = 16 * Ki;
|
|
info_.maxConstantBufferSize_ = info_.maxMemAllocSize_;
|
|
info_.localMemType_ = CL_LOCAL;
|
|
info_.errorCorrectionSupport_ = false;
|
|
info_.profilingTimerResolution_ = 1;
|
|
info_.littleEndian_ = true;
|
|
info_.compilerAvailable_ = true;
|
|
info_.executionCapabilities_ = CL_EXEC_KERNEL;
|
|
info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE;
|
|
info_.platform_ = AMD_PLATFORM;
|
|
info_.profile_ = "FULL_PROFILE";
|
|
::strncpy(info_.vendor_, "Advanced Micro Devices, Inc.", sizeof(info_.vendor_) - 1);
|
|
|
|
info_.addressBits_ = LP64_SWITCH(32, 64);
|
|
info_.maxSamplers_ = 16;
|
|
info_.bufferFromImageSupport_ = false;
|
|
info_.oclcVersion_ = "OpenCL C " OPENCL_C_VERSION_STR " ";
|
|
info_.spirVersions_ = "";
|
|
|
|
uint16_t major, minor;
|
|
if (Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_VERSION_MAJOR, &major) !=
|
|
HSA_STATUS_SUCCESS ||
|
|
Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_VERSION_MINOR, &minor) !=
|
|
HSA_STATUS_SUCCESS) {
|
|
return false;
|
|
}
|
|
std::stringstream ss;
|
|
ss << AMD_BUILD_STRING " (HSA" << major << "." << minor << ",LC)";
|
|
|
|
::strncpy(info_.driverVersion_, ss.str().c_str(), sizeof(info_.driverVersion_) - 1);
|
|
|
|
if (isa().versionMajor() >= 9) {
|
|
info_.version_ =
|
|
"OpenCL " /*OPENCL_VERSION_STR*/
|
|
"2.0"
|
|
" ";
|
|
} else {
|
|
info_.version_ =
|
|
"OpenCL " /*OPENCL_VERSION_STR*/
|
|
"1.2"
|
|
" ";
|
|
}
|
|
|
|
info_.builtInKernels_ = "";
|
|
info_.linkerAvailable_ = true;
|
|
info_.preferredInteropUserSync_ = true;
|
|
info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_;
|
|
info_.vendorId_ = 0x1002; // AMD's PCIe vendor id
|
|
|
|
info_.maxGlobalVariableSize_ = static_cast<size_t>(info_.maxMemAllocSize_);
|
|
info_.globalVariablePreferredTotalSize_ = static_cast<size_t>(info_.globalMemSize_);
|
|
|
|
// Populate the single config setting.
|
|
info_.singleFPConfig_ =
|
|
CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA;
|
|
|
|
if (settings().doublePrecision_) {
|
|
info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM;
|
|
info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
|
|
}
|
|
|
|
if (settings().singleFpDenorm_) {
|
|
info_.singleFPConfig_ |= CL_FP_DENORM;
|
|
}
|
|
|
|
if (settings().checkExtension(ClKhrFp16)) {
|
|
info_.halfFPConfig_ = info_.singleFPConfig_;
|
|
}
|
|
|
|
info_.preferredPlatformAtomicAlignment_ = 0;
|
|
info_.preferredGlobalAtomicAlignment_ = 0;
|
|
info_.preferredLocalAtomicAlignment_ = 0;
|
|
|
|
uint8_t hsa_extensions[128];
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_EXTENSIONS, hsa_extensions)) {
|
|
return false;
|
|
}
|
|
|
|
assert(HSA_EXTENSION_IMAGES < 8);
|
|
const bool image_is_supported = ((hsa_extensions[0] & (1 << HSA_EXTENSION_IMAGES)) != 0);
|
|
if (image_is_supported) {
|
|
// Images
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS),
|
|
&info_.maxSamplers_)) {
|
|
return false;
|
|
}
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES),
|
|
&info_.maxReadImageArgs_)) {
|
|
return false;
|
|
}
|
|
|
|
// TODO: no attribute for write image.
|
|
info_.maxWriteImageArgs_ = 8;
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES),
|
|
&info_.maxReadWriteImageArgs_)) {
|
|
return false;
|
|
}
|
|
|
|
uint32_t image_max_dim[3];
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS),
|
|
&image_max_dim)) {
|
|
return false;
|
|
}
|
|
|
|
info_.image2DMaxWidth_ = image_max_dim[0];
|
|
info_.image2DMaxHeight_ = image_max_dim[1];
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS),
|
|
&image_max_dim)) {
|
|
return false;
|
|
}
|
|
|
|
info_.image3DMaxWidth_ = image_max_dim[0];
|
|
info_.image3DMaxHeight_ = image_max_dim[1];
|
|
info_.image3DMaxDepth_ = image_max_dim[2];
|
|
|
|
uint32_t max_array_size = 0;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS),
|
|
&max_array_size)) {
|
|
return false;
|
|
}
|
|
|
|
info_.imageMaxArraySize_ = max_array_size;
|
|
|
|
uint32_t max_image1da_width = 0;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS),
|
|
&max_image1da_width)) {
|
|
return false;
|
|
}
|
|
|
|
info_.image1DAMaxWidth_ = max_image1da_width;
|
|
|
|
uint32_t max_image2da_width[2] = {0, 0};
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS),
|
|
&max_image2da_width)) {
|
|
return false;
|
|
}
|
|
|
|
info_.image2DAMaxWidth_[0] = max_image2da_width[0];
|
|
info_.image2DAMaxWidth_[1] = max_image2da_width[1];
|
|
|
|
uint32_t max_image1d_width = 0;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS),
|
|
&max_image1d_width)) {
|
|
return false;
|
|
}
|
|
info_.image1DMaxWidth_ = max_image1d_width;
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS),
|
|
&image_max_dim)) {
|
|
return false;
|
|
}
|
|
info_.imageMaxBufferSize_ = (amd::IS_HIP) ? image_max_dim[0] : (1 << 27);
|
|
|
|
info_.imagePitchAlignment_ = 256;
|
|
|
|
info_.imageBaseAddressAlignment_ = 256;
|
|
|
|
info_.bufferFromImageSupport_ = false;
|
|
|
|
info_.imageSupport_ = (info_.maxReadWriteImageArgs_ > 0) ? true : false;
|
|
}
|
|
|
|
// Enable SVM Capabilities of Hsa device. Ensure
|
|
// user has not setup memory to be non-coherent
|
|
info_.svmCapabilities_ = 0;
|
|
if (!settings().enableNCMode_) {
|
|
info_.svmCapabilities_ = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER;
|
|
info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_BUFFER;
|
|
// Report fine-grain system only on full profile
|
|
if (agent_profile_ == HSA_PROFILE_FULL) {
|
|
info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM;
|
|
}
|
|
if (amd::IS_HIP) {
|
|
if (info_.iommuv2_ || isa().versionMajor() >= 8) {
|
|
info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (settings().checkExtension(ClAmdDeviceAttributeQuery)) {
|
|
info_.simdWidth_ = isa().simdWidth();
|
|
info_.simdInstructionWidth_ = isa().simdInstructionWidth();
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_WAVEFRONT_SIZE, &info_.wavefrontWidth_)) {
|
|
return false;
|
|
}
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_WIDTH),
|
|
&info_.vramBusBitWidth_)) {
|
|
return false;
|
|
}
|
|
|
|
info_.globalMemChannels_ = info_.vramBusBitWidth_ / 32;
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
|
|
&info_.simdPerCU_)) {
|
|
return false;
|
|
}
|
|
|
|
uint32_t max_waves_per_cu = 0;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
|
|
&max_waves_per_cu)) {
|
|
return false;
|
|
}
|
|
|
|
if (settings().enableWgpMode_) {
|
|
info_.simdPerCU_ *= 2;
|
|
max_waves_per_cu *= 2;
|
|
}
|
|
|
|
info_.maxThreadsPerCU_ = info_.wavefrontWidth_ * max_waves_per_cu;
|
|
uint32_t cache_sizes[4];
|
|
/* FIXIT [skudchad] - Seems like hardcoded in HSA backend so 0*/
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, static_cast<hsa_agent_info_t>(HSA_AGENT_INFO_CACHE_SIZE),
|
|
cache_sizes)) {
|
|
return false;
|
|
}
|
|
|
|
uint32_t asic_revision = 0;
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_ASIC_REVISION),
|
|
&asic_revision)) {
|
|
return false;
|
|
}
|
|
info_.asicRevision_ = asic_revision;
|
|
|
|
info_.l2CacheSize_ = cache_sizes[1];
|
|
info_.timeStampFrequency_ = 1000000;
|
|
info_.globalMemChannelBanks_ = 4;
|
|
info_.globalMemChannelBankWidth_ = isa().memChannelBankWidth();
|
|
info_.localMemSizePerCU_ = isa().localMemSizePerCU();
|
|
info_.localMemBanks_ = isa().localMemBanks();
|
|
info_.numAsyncQueues_ = kMaxAsyncQueues;
|
|
info_.numRTQueues_ = info_.numAsyncQueues_;
|
|
info_.numRTCUs_ = info_.maxComputeUnits_;
|
|
|
|
// TODO: set to true once thread trace support is available
|
|
info_.threadTraceEnable_ = false;
|
|
info_.pcieDeviceId_ = pciDeviceId_;
|
|
info_.cooperativeGroups_ = settings().enableCoopGroups_;
|
|
info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_;
|
|
// Enable StreamWrite and StreamWait for all devices
|
|
info_.aqlBarrierValue_ = true;
|
|
}
|
|
|
|
info_.maxPipePacketSize_ = info_.maxMemAllocSize_;
|
|
info_.maxPipeActiveReservations_ = 16;
|
|
info_.maxPipeArgs_ = 16;
|
|
|
|
info_.queueOnDeviceProperties_ =
|
|
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
|
|
info_.queueOnDevicePreferredSize_ = 256 * Ki;
|
|
info_.queueOnDeviceMaxSize_ = 8 * Mi;
|
|
info_.maxOnDeviceQueues_ = 1;
|
|
info_.maxOnDeviceEvents_ = settings().numDeviceEvents_;
|
|
|
|
std::string addressableNumVGPRs, totalNumVGPRs, vGPRAllocGranule;
|
|
std::string isaName = isa().isaName();
|
|
info_.availableVGPRs_ =
|
|
amd::device::getValueFromIsaMeta(isaName, "AddressableNumVGPRs", addressableNumVGPRs)
|
|
? atoi(addressableNumVGPRs.c_str())
|
|
: 0;
|
|
info_.vgprsPerSimd_ = amd::device::getValueFromIsaMeta(isaName, "TotalNumVGPRs", totalNumVGPRs)
|
|
? atoi(totalNumVGPRs.c_str())
|
|
: 0;
|
|
info_.vgprAllocGranularity_ =
|
|
amd::device::getValueFromIsaMeta(isaName, "VGPRAllocGranule", vGPRAllocGranule)
|
|
? atoi(vGPRAllocGranule.c_str())
|
|
: 0;
|
|
|
|
info_.availableRegistersPerCU_ = info_.vgprsPerSimd_ * info_.simdPerCU_ * info_.wavefrontWidth_;
|
|
ClPrint(amd::LOG_INFO, amd::LOG_INIT,
|
|
"addressableNumVGPRs=%u, totalNumVGPRs=%u, vGPRAllocGranule=%u,"
|
|
" availableRegistersPerCU_=%u",
|
|
info_.availableVGPRs_, info_.vgprsPerSimd_, info_.vgprAllocGranularity_,
|
|
info_.availableRegistersPerCU_);
|
|
|
|
std::string sgprValue;
|
|
info_.availableSGPRs_ =
|
|
(amd::device::getValueFromIsaMeta(isaName, "AddressableNumSGPRs", sgprValue))
|
|
? (atoi(sgprValue.c_str()))
|
|
: 0;
|
|
std::string imageSupport;
|
|
if (amd::device::getValueFromIsaMeta(isaName, "ImageSupport", imageSupport)) {
|
|
info_.imageSupport_ = atoi(imageSupport.c_str());
|
|
ClPrint(amd::LOG_INFO, amd::LOG_INIT, "imageSupport=%u", info_.imageSupport_);
|
|
} else {
|
|
LogInfo("Can not get image support info from ISA meta");
|
|
}
|
|
|
|
// Generic support for HMM interfaces
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::system_get_info(HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED, &info_.hmmSupported_)) {
|
|
LogError("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED query failed. HMM will be disabled");
|
|
}
|
|
|
|
// This capability should be available with xnack enabled
|
|
if (HSA_STATUS_SUCCESS != Hsa::system_get_info(HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT,
|
|
&info_.hmmCpuMemoryAccessible_)) {
|
|
LogError("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT query failed.");
|
|
}
|
|
|
|
// HMM specific capability for CPU direct access to device memory
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS),
|
|
&info_.hmmDirectHostAccess_)) {
|
|
LogError("HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS query failed.");
|
|
}
|
|
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_XCC),
|
|
&info_.numberOfXccs_)) {
|
|
LogError("HSA_AMD_AGENT_INFO_NUM_XCC query failed.");
|
|
}
|
|
|
|
ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Gfx Major/Minor/Stepping: %d/%d/%d", isa().versionMajor(),
|
|
isa().versionMinor(), isa().versionStepping());
|
|
ClPrint(amd::LOG_INFO, amd::LOG_INIT, "HMM support: %d, XNACK: %d, Direct host access: %d",
|
|
info_.hmmSupported_, info_.hmmCpuMemoryAccessible_, info_.hmmDirectHostAccess_);
|
|
ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Max SDMA Read Mask: 0x%x, Max SDMA Write Mask: 0x%x",
|
|
maxSdmaReadMask_, maxSdmaWriteMask_);
|
|
|
|
info_.globalCUMask_ = {};
|
|
|
|
// Virtual memory Management Support, if set to true then the HW and SW Stack supports VMM.
|
|
info_.virtualMemoryManagement_ = false;
|
|
if (HIP_VMEM_MANAGE_SUPPORT) {
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::system_get_info(
|
|
static_cast<hsa_system_info_t>(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED),
|
|
&info_.virtualMemoryManagement_)) {
|
|
LogError("HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED query failed ");
|
|
}
|
|
}
|
|
HIP_MEM_POOL_USE_VM &= info_.virtualMemoryManagement_;
|
|
|
|
if (isa().versionMajor() < 8) {
|
|
info_.sgprsPerSimd_ = 512;
|
|
} else if (isa().versionMajor() < 10) {
|
|
info_.sgprsPerSimd_ = 800;
|
|
} else {
|
|
info_.sgprsPerSimd_ =
|
|
std::numeric_limits<uint32_t>::max(); // gfx10+ does not share SGPRs between waves
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Creates a VirtualGPU for 'queue' while holding the vgpusAccess() lock.
// A null 'queue' denotes an internal device queue. Returns the new virtual
// device, or nullptr if its create() step fails.
device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
  amd::ScopedLock lock(vgpusAccess());

  const bool hasQueue = (queue != nullptr);
  bool profiling;
  bool cooperative;
  if (hasQueue) {
    profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
    cooperative = false;
  } else {
    // If amd command queue is null, then it's an internal device queue.
    // In HIP mode the device queue will be allocated for the cooperative launches only
    cooperative = amd::IS_HIP && settings().enableCoopGroups_;
    profiling = amd::IS_HIP;
  }

  // Initialization of heap and other resources occur during the command
  // queue creation time.
  const std::vector<uint32_t> defaultCuMask = {};
  VirtualGPU* vgpu =
      new VirtualGPU(*this, profiling, cooperative, hasQueue ? queue->cuMask() : defaultCuMask,
                     hasQueue ? queue->priority() : amd::CommandQueue::Priority::Normal);

  if (vgpu->create()) {
    return vgpu;
  }

  delete vgpu;
  return nullptr;
}
|
|
|
|
bool Device::globalFreeMemory(size_t* freeMemory) const {
|
|
const uint TotalFreeMemory = 0;
|
|
const uint LargestFreeBlock = 1;
|
|
uint64_t globalAvailMemory;
|
|
// Queries memory available in bytes across all global pools owned by the agent
|
|
if (HSA_STATUS_SUCCESS !=
|
|
Hsa::agent_get_info(bkendDevice_,
|
|
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL),
|
|
&globalAvailMemory)) {
|
|
LogError("HSA_AMD_AGENT_INFO_MEMORY_AVAIL query failed.");
|
|
return false;
|
|
}
|
|
|
|
globalAvailMemory = globalAvailMemory / Ki;
|
|
if (globalAvailMemory > HIP_HIDDEN_FREE_MEM * Ki) {
|
|
globalAvailMemory -= HIP_HIDDEN_FREE_MEM * Ki;
|
|
} else {
|
|
globalAvailMemory = 0;
|
|
}
|
|
|
|
freeMemory[TotalFreeMemory] = globalAvailMemory;
|
|
// since there is no memory heap on ROCm, the biggest free block is
|
|
// equal to total free local memory
|
|
freeMemory[LargestFreeBlock] = freeMemory[TotalFreeMemory];
|
|
|
|
return true;
|
|
}
|
|
|
|
// Forwards a file read into 'devicePtr' to Hsa::ais_file_read.
// 'handle' is wrapped in an hsa_amd_ais_file_handle_t (HANDLE on Windows, file
// descriptor elsewhere); size, file_offset, size_copied and status are passed
// through unchanged. Returns true on HSA success, false (after logging) otherwise.
bool Device::amdFileRead(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
                         uint64_t* size_copied, int32_t* status) {
  hsa_amd_ais_file_handle_t fileHandle{};
#if defined(_WIN32)
  fileHandle.handle = handle;
#else
  fileHandle.fd = handle;
#endif

  const hsa_status_t result =
      Hsa::ais_file_read(fileHandle, devicePtr, size, file_offset, size_copied, status);
  if (result == HSA_STATUS_SUCCESS) {
    return true;
  }

  LogPrintfError("hsa_amd_ais_file_read operation failed with err 0x%xh", result);
  return false;
}
|
|
|
|
// Forwards a file write from 'devicePtr' to Hsa::ais_file_write.
// 'handle' is wrapped in an hsa_amd_ais_file_handle_t (HANDLE on Windows, file
// descriptor elsewhere); size, file_offset, size_copied and status are passed
// through unchanged. Returns true on HSA success, false (after logging) otherwise.
bool Device::amdFileWrite(amd::Os::FileDesc handle, void* devicePtr, uint64_t size, int64_t file_offset,
                          uint64_t* size_copied, int32_t* status) {
  hsa_amd_ais_file_handle_t fileHandle{};
#if defined(_WIN32)
  fileHandle.handle = handle;
#else
  fileHandle.fd = handle;
#endif

  const hsa_status_t result =
      Hsa::ais_file_write(fileHandle, devicePtr, size, file_offset, size_copied, status);
  if (result == HSA_STATUS_SUCCESS) {
    return true;
  }

  LogPrintfError("hsa_amd_ais_file_write operation failed with err 0x%xh", result);
  return false;
}
|
|
|
|
// Checks whether the external GL/EGL device described by gfxDevice/gfxContext is
// this device. Always false on Windows and when the GL device flag is not set.
// Otherwise the Mesa interop layer is initialized and queried, and the result is
// true only if PCI bus/device/function, vendor id and device id all match.
// 'validateOnly' is not used by this implementation.
bool Device::bindExternalDevice(uint flags, void* const gfxDevice[], void* gfxContext,
                                bool validateOnly) {
#if defined(_WIN32)
  return false;
#else
  if ((flags & amd::Context::GLDeviceKhr) == 0) {
    return false;
  }

  MesaInterop::MESA_INTEROP_KIND interopKind = MesaInterop::MESA_INTEROP_NONE;
  MesaInterop::DisplayHandle displayHandle;
  MesaInterop::ContextHandle contextHandle;

  // EGL takes precedence when its flag is present; otherwise GLX is assumed
  const bool useEgl = (flags & amd::Context::EGLDeviceKhr) != 0;
  if (useEgl) {
    interopKind = MesaInterop::MESA_INTEROP_EGL;
    displayHandle.eglDisplay =
        reinterpret_cast<EGLDisplay>(gfxDevice[amd::Context::GLDeviceKhrIdx]);
    contextHandle.eglContext = reinterpret_cast<EGLContext>(gfxContext);
  } else {
    interopKind = MesaInterop::MESA_INTEROP_GLX;
    displayHandle.glxDisplay = reinterpret_cast<Display*>(gfxDevice[amd::Context::GLDeviceKhrIdx]);
    contextHandle.glxContext = reinterpret_cast<GLXContext>(gfxContext);
  }

  mesa_glinterop_device_info mesaInfo;
  mesaInfo.version = MESA_GLINTEROP_DEVICE_INFO_VERSION;
  if (!MesaInterop::Init(interopKind) ||
      !MesaInterop::GetInfo(mesaInfo, interopKind, displayHandle, contextHandle)) {
    return false;
  }

  // The GL device must be the very same PCI function as this device
  return info_.deviceTopology_.pcie.bus == mesaInfo.pci_bus &&
         info_.deviceTopology_.pcie.device == mesaInfo.pci_device &&
         info_.deviceTopology_.pcie.function == mesaInfo.pci_function &&
         info_.vendorId_ == mesaInfo.vendor_id && pciDeviceId_ == mesaInfo.device_id;

#endif
}
|
|
|
|
// Counterpart of bindExternalDevice(). Always false on Windows; elsewhere the
// result is simply whether the GL device flag is set in 'flags'. The remaining
// parameters are not used by this implementation.
bool Device::unbindExternalDevice(uint flags, void* const gfxDevice[], void* gfxContext,
                                  bool validateOnly) {
#if defined(_WIN32)
  return false;
#else
  return (flags & amd::Context::GLDeviceKhr) != 0;
#endif
}
|
|
|
|
// Takes a cached map target able to hold 'size' bytes out of the map cache.
// An entry of exactly 'size' is preferred; otherwise the smallest entry larger
// than 'size' is used. The chosen slot is cleared and the entry returned.
// When nothing fits, the largest of the (too small) entries is released and its
// slot cleared, and nullptr is returned.
amd::Memory* Device::findMapTarget(size_t size) const {
  // Must be serialised for access
  amd::ScopedLock lk(*mapCacheOps_);

  auto& cache = *mapCache_;
  size_t bestFitSize = 0;
  size_t biggestSize = 0;
  // An index equal to cache.size() means "nothing selected"
  uint bestFitIdx = cache.size();
  uint biggestIdx = cache.size();

  for (uint idx = 0; idx < cache.size(); idx++) {
    auto* entry = cache[idx];
    if (entry == nullptr) {
      continue;
    }
    const auto entrySize = entry->getSize();

    if (size == entrySize) {
      // Requested size matches the entry size
      bestFitIdx = idx;
      break;
    }

    if (size < entrySize) {
      // Entry is big enough: remember the tightest fit seen so far
      if ((bestFitSize == 0) || (bestFitSize > entrySize)) {
        bestFitSize = entrySize;
        bestFitIdx = idx;
      }
    } else if (biggestSize < entrySize) {
      // Entry is too small: track the biggest one as the release candidate
      biggestSize = entrySize;
      biggestIdx = idx;
    }
  }

  amd::Memory* found = nullptr;
  if (bestFitIdx < cache.size()) {
    // Hand the entry over to the caller and free its slot
    found = cache[bestFitIdx];
    cache[bestFitIdx] = nullptr;
  } else if (biggestIdx < cache.size()) {
    // No usable entry: drop the biggest undersized one
    cache[biggestIdx]->release();
    cache[biggestIdx] = nullptr;
  }

  return found;
}
|
|
|
|
// Stores `memory` in the map-target cache for later reuse.
// Returns false when the object reports that it cannot be cached; otherwise
// the object is placed in the first empty slot, or appended if none is free.
bool Device::addMapTarget(amd::Memory* memory) const {
  // Cache access must be serialised
  amd::ScopedLock guard(*mapCacheOps_);

  // Objects that cannot be cached (e.g. svm memory) are rejected
  if (!memory->canBeCached()) {
    return false;
  }

  auto& cache = *mapCache_;

  // Reuse the first vacant slot, if any
  for (auto& slot : cache) {
    if (slot == nullptr) {
      slot = memory;
      return true;
    }
  }

  // No vacancy: grow the cache
  cache.push_back(memory);
  return true;
}
|
|
|
|
// Returns the device memory object that `mem` holds for this device,
// downcast to the ROC backend type.
Memory* Device::getRocMemory(amd::Memory* mem) const {
  auto* devMem = mem->getDeviceMemory(*this);
  return static_cast<roc::Memory*>(devMem);
}
|
|
|
|
// ================================================================================================
|
|
// Creates the backend (roc) memory object for `owner`.
//
// A roc::Buffer or roc::Image is created depending on the owner's kind, pipe
// objects get their header initialised, and images created with
// CL_MEM_COPY_HOST_PTR in a single-device context get their host data uploaded
// through a temporary image view. Host memory is pre-pinned when it is not
// directly accessible by the device.
//
// Returns the new device memory object, or nullptr on any failure. On every
// failure path the partially constructed device memory is destroyed, so the
// caller never has anything to clean up.
device::Memory* Device::createMemory(amd::Memory& owner) const {
  roc::Memory* memory = nullptr;
  if (owner.asBuffer()) {
    memory = new roc::Buffer(*this, owner);
  } else if (owner.asImage()) {
    memory = new roc::Image(*this, owner);
  } else {
    LogError("Unknown memory type");
  }

  if (memory == nullptr) {
    return nullptr;
  }

  bool result = memory->create();

  if (!result) {
    LogError("Failed creating memory");
    delete memory;
    return nullptr;
  }

  if (isP2pEnabled()) {
    memory->setAllowedPeerAccess(true);
  }

  // Initialize if the memory is a pipe object
  if (owner.getType() == CL_MEM_OBJECT_PIPE) {
    // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
    // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
    size_t pipeInit[3] = {0, 0, owner.asPipe()->getMaxNumPackets()};
    xferMgr().writeBuffer(pipeInit, *memory, amd::Coord3D(0), amd::Coord3D(sizeof(pipeInit)));
  }

  // Transfer data only if OCL context has one device.
  // Cache coherency layer will update data for multiple devices
  if (!memory->isHostMemDirectAccess() && owner.asImage() && (owner.parent() == nullptr) &&
      (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && (owner.getContext().devices().size() == 1)) {
    // To avoid a recursive call to Device::createMemory, we perform
    // data transfer to the view of the image
    amd::Image* imageView = owner.asImage()->createView(
        owner.getContext(), owner.asImage()->getImageFormat(), xferQueue());

    if (imageView == nullptr) {
      LogError("[OCL] Fail to allocate view of image object");
      // The device memory was already created above; do not leak it
      delete memory;
      return nullptr;
    }

    Image* devImageView = new roc::Image(static_cast<const Device&>(*this), *imageView);
    if (devImageView == nullptr) {
      LogError("[OCL] Fail to allocate device mem object for the view");
      imageView->release();
      delete memory;
      return nullptr;
    }

    if (!devImageView->createView(static_cast<roc::Image&>(*memory))) {
      LogError("[OCL] Fail to create device mem object for the view");
      // Tear down in reverse order of construction: view objects first, then the parent
      delete devImageView;
      imageView->release();
      delete memory;
      return nullptr;
    }

    imageView->replaceDeviceMemory(this, devImageView);

    // Copy data with the original pitch values, since runtime doesn't perform
    // extra sysmem allocation for one device
    const auto image = owner.asImage();
    result = xferMgr().writeImage(owner.getHostMem(), *devImageView, amd::Coord3D(0, 0, 0),
                                  imageView->getRegion(), image->getRowPitch(),
                                  image->getSlicePitch(), true);

    // Release host memory, since runtime copied data
    owner.setHostMem(nullptr);

    imageView->release();
  }

  // Prepin sysmem buffer for possible data synchronization between CPU and GPU
  if (!memory->isHostMemDirectAccess() &&
      // Pin memory for the parent object only
      (owner.parent() == nullptr) && (owner.getHostMem() != nullptr) &&
      (owner.getSvmPtr() == nullptr)) {
    memory->pinSystemMemory(owner.getHostMem(), owner.getSize());
  }

  // `result` can only be false here when the image upload above failed
  if (!result) {
    delete memory;
    DevLogError("Cannot Write Image \n");
    return nullptr;
  }

  return memory;
}
|
|
|
|
// ================================================================================================
|
|
// Creates a stand-alone roc::Buffer of `size` bytes using a local allocation.
// `alignment` is not used by this implementation.
// Returns the buffer, or nullptr if the allocation failed.
device::Memory* Device::createMemory(size_t size, size_t alignment) const {
  auto buffer = new roc::Buffer(*this, size);
  static constexpr bool LocalAlloc = true;
  if ((buffer == nullptr) || !buffer->create(LocalAlloc)) {
    LogError("Couldn't allocate memory on device!");
    // Destroy the half-built object instead of leaking it (delete on nullptr is a no-op)
    delete buffer;
    return nullptr;
  }
  return buffer;
}
|
|
|
|
// ================================================================================================
|
|
// Selects the host memory pool matching `mem_seg`.
// When `agentInfo` is nullptr the pools of cpu_agent_info_ are used.
//
// Selection cascades through the switch:
//   kKernArg          -> kern_arg_pool if settings().fgs_kernel_arg_, else as kNoAtomics
//   kNoAtomics        -> coarse_grain_pool if it exists, else as kAtomics
//   kAtomics          -> fine_grain_pool
//   kUncachedAtomics,
//   kIoMemory         -> ext_fine_grain_pool if it exists, else falls into the
//                        default label, which raises guarantee(false)
hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
                                                const AgentInfo* agentInfo) const {
  if (agentInfo == nullptr) {
    agentInfo = cpu_agent_info_;
  }

  hsa_amd_memory_pool_t pool{0};
  switch (mem_seg) {
    case kKernArg:
      if (settings().fgs_kernel_arg_) {
        pool = agentInfo->kern_arg_pool;
        break;
      }
      // Not using the kernarg pool: continue with the kNoAtomics selection.
    case kNoAtomics:
      // If runtime disables barrier, then all host allocations must have L2 disabled
      if (agentInfo->coarse_grain_pool.handle != 0) {
        pool = agentInfo->coarse_grain_pool;
        break;
      }
      // No coarse grain pool: continue with the kAtomics selection.
    case kAtomics:
      pool = agentInfo->fine_grain_pool;
      break;
    case kUncachedAtomics:
    case kIoMemory:
      if (agentInfo->ext_fine_grain_pool.handle != 0) {
        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM,
                "Using extended fine grained access system memory pool");
        pool = agentInfo->ext_fine_grain_pool;
        break;
      }
      // No extended fine grain pool: control reaches the default label below.
    default:
      guarantee(false, "Invalid Memory Segment");
      break;
  }

  assert(pool.handle != 0);
  return pool;
}
|
|
|
|
// ================================================================================================
|
|
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
|
|
const void* agentInfo) const {
|
|
void* ptr = nullptr;
|
|
uint32_t memFlags = 0;
|
|
if (mem_seg == kKernArg) {
|
|
memFlags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
|
|
}
|
|
|
|
hsa_amd_memory_pool_t pool =
|
|
getHostMemoryPool(mem_seg, static_cast<const amd::roc::AgentInfo*>(agentInfo));
|
|
hsa_status_t stat = Hsa::memory_pool_allocate(pool, size, memFlags, &ptr);
|
|
|
|
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
|
"Allocate hsa host memory %p, size 0x%zx,"
|
|
" numa_node = %d, mem_seg = %d",
|
|
ptr, size, preferred_numa_node_, static_cast<int>(mem_seg));
|
|
if (stat != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Fail allocation host memory with err %d", stat);
|
|
return nullptr;
|
|
}
|
|
|
|
stat = Hsa::agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr);
|
|
if (stat != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Fail hsa_amd_agents_allow_access with err %d", stat);
|
|
hostFree(ptr, size);
|
|
return nullptr;
|
|
}
|
|
|
|
return ptr;
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Allocates host memory while honouring the calling thread's NUMA policy.
//
// Without ROCCLR_SUPPORT_NUMA_POLICY this is a plain hostAlloc() on the
// default CPU agent. With it, the current memory policy is queried:
//   - MPOL_BIND / MPOL_PREFERRED: allocate from the first CPU agent whose
//     index is set in the policy node mask (nullptr if none is set),
//   - any other mode: allocate from the default CPU agent.
//
// Returns the allocation, or nullptr on failure.
void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
  void* ptr = nullptr;
#ifndef ROCCLR_SUPPORT_NUMA_POLICY
  ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
#else
  int mode = MPOL_DEFAULT;
  int maxNodes = numa_num_possible_nodes();
  bitmask* nodeMask = numa_bitmask_alloc(maxNodes);
  auto cpuCount = cpu_agents_.size();

  long res = get_mempolicy(&mode, nodeMask->maskp, nodeMask->size, NULL, 0);
  if (res) {
    LogPrintfError("get_mempolicy failed with error %ld", res);
    // The mask must be released on the error path as well
    numa_free_cpumask(nodeMask);
    return ptr;
  }
  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_RESOURCE,
          "get_mempolicy() succeed with mode %d, nodeMask 0x%lx, cpuCount %zu", mode,
          *nodeMask->maskp, cpuCount);

  switch (mode) {
    // For details, see "man get_mempolicy".
    case MPOL_BIND:
    case MPOL_PREFERRED:
      // We only care about the first CPU node
      for (unsigned int i = 0; i < cpuCount; i++) {
        // Bit test through libnuma: a raw (1u << i) is undefined once i reaches 32
        if (numa_bitmask_isbitset(nodeMask, i)) {
          ptr = hostAlloc(size, alignment, mem_seg, &cpu_agents_[i]);
          break;
        }
      }
      break;
    default:
      // All other modes fall back to default mode
      ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
  }
  numa_free_cpumask(nodeMask);
#endif  // ROCCLR_SUPPORT_NUMA_POLICY
  return ptr;
}
|
|
|
|
void* Device::hostLock(void* hostMem, size_t size, const MemorySegment memSegment) const {
|
|
hsa_amd_memory_pool_t pool = getHostMemoryPool(memSegment);
|
|
void* deviceMemory = nullptr;
|
|
uint32_t memFlags = 0;
|
|
if (memSegment == kIoMemory) {
|
|
memFlags |= HSA_AMD_MEMORY_POOL_UNCACHED_FLAG;
|
|
}
|
|
|
|
hsa_status_t status = Hsa::memory_lock_to_pool(
|
|
hostMem, size, const_cast<hsa_agent_t*>(&bkendDevice_), 1, pool, memFlags, &deviceMemory);
|
|
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
|
"Locking to pool %p, size 0x%zx, hostMem = %p,"
|
|
" deviceMemory = %p, memSegment = %d",
|
|
pool, size, hostMem, deviceMemory, static_cast<int>(memSegment));
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
|
|
deviceMemory = nullptr;
|
|
}
|
|
return deviceMemory;
|
|
}
|
|
|
|
// Frees host memory; simply forwards to memFree().
void Device::hostFree(void* ptr, size_t size) const {
  memFree(ptr, size);
}
|
|
|
|
bool Device::deviceAllowAccess(void* ptr) const {
|
|
std::lock_guard<std::mutex> lock(lock_allow_access_);
|
|
if (!p2pAgents().empty()) {
|
|
hsa_status_t stat =
|
|
Hsa::agents_allow_access(p2pAgents().size(), p2pAgents().data(), nullptr, ptr);
|
|
if (stat != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Allow p2p access failed - hsa_amd_agents_allow_access with err %d", stat);
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Grants this device's backend agent access to the given device memory.
// Returns false for a null `memory` or when the HSA call fails; returns true
// when the grant succeeded or when there are no P2P agents at all.
bool Device::allowPeerAccess(device::Memory* memory) const {
  if (memory == nullptr) {
    return false;
  }

  // Without peers there is nothing to enable
  if (p2pAgents().empty()) {
    return true;
  }

  void* address = reinterpret_cast<void*>(memory->virtualAddress());
  hsa_agent_t self = getBackendDevice();
  const hsa_status_t status = Hsa::agents_allow_access(1, &self, nullptr, address);
  if (status == HSA_STATUS_SUCCESS) {
    return true;
  }

  LogPrintfError("Allow p2p access failed - hsa_amd_agents_allow_access with err: %d", status);
  return false;
}
|
|
|
|
uint64_t Device::deviceVmemAlloc(size_t size, uint64_t flags) const {
|
|
hsa_amd_vmem_alloc_handle_t hsa_vmem_handle{};
|
|
|
|
// We only allow pinned memory at this time.
|
|
hsa_status_t hsa_status =
|
|
Hsa::vmem_handle_create(gpuvm_segment_, size, MEMORY_TYPE_PINNED, flags, &hsa_vmem_handle);
|
|
if (hsa_status != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Failed hsa_amd_vmem_handle_create! Failed with hsa status: %d \n", hsa_status);
|
|
}
|
|
|
|
return hsa_vmem_handle.handle;
|
|
}
|
|
|
|
void Device::deviceVmemRelease(uint64_t mem_handle) const {
|
|
hsa_amd_vmem_alloc_handle_t hsa_vmem_handle{};
|
|
hsa_vmem_handle.handle = mem_handle;
|
|
|
|
hsa_status_t hsa_status = Hsa::vmem_handle_release(hsa_vmem_handle);
|
|
if (hsa_status != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Failed hsa_amd_vmem_handle_release! Failed with hsa status: %d \n", hsa_status);
|
|
}
|
|
}
|
|
|
|
void* Device::reserveMemory(size_t size, size_t alignment) const {
|
|
void* ptr = nullptr;
|
|
// Reserves non registered VA memory using HSA APIs.
|
|
hsa_status_t status = Hsa::vmem_address_reserve_align(&ptr, size, 0, alignment,
|
|
HSA_AMD_VMEM_ADDRESS_NO_REGISTER);
|
|
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Reserve hsa device memory %p, size 0x%zx", ptr, size);
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogError("Fail to reserve memory");
|
|
return nullptr;
|
|
}
|
|
return ptr;
|
|
}
|
|
|
|
void Device::releaseMemory(void* ptr, size_t size) const {
|
|
hsa_status_t hsa_status = Hsa::vmem_address_free(ptr, size);
|
|
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Free hsa reserved memory %p", ptr);
|
|
if (hsa_status != HSA_STATUS_SUCCESS) {
|
|
LogError("hsa_amd_vmem_address_free failed \n");
|
|
}
|
|
}
|
|
|
|
void* Device::deviceLocalAlloc(size_t size, const AllocationFlags& flags) const {
|
|
const hsa_amd_memory_pool_t& pool =
|
|
(flags.pseudo_fine_grain_ && gpu_ext_fine_grained_segment_.handle)
|
|
? gpu_ext_fine_grained_segment_
|
|
: (flags.atomics_ && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
|
|
: gpuvm_segment_;
|
|
|
|
if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
|
|
DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", pool.handle,
|
|
gpuvm_segment_max_alloc_);
|
|
return nullptr;
|
|
}
|
|
|
|
uint32_t hsa_mem_flags = 0;
|
|
if (flags.contiguous_) {
|
|
hsa_mem_flags = HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG;
|
|
}
|
|
if (flags.executable_) {
|
|
hsa_mem_flags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
|
|
}
|
|
|
|
void* ptr = nullptr;
|
|
hsa_status_t stat = Hsa::memory_pool_allocate(pool, size, hsa_mem_flags, &ptr);
|
|
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
|
"Allocate hsa device memory %p, size 0x%zx, hsa_mem_flags 0x%xh", ptr, size,
|
|
hsa_mem_flags);
|
|
if (stat != HSA_STATUS_SUCCESS) {
|
|
LogError("Fail allocation local memory");
|
|
return nullptr;
|
|
}
|
|
|
|
if (isP2pEnabled() && deviceAllowAccess(ptr) == false) {
|
|
LogError("Allow p2p access for memory allocation");
|
|
memFree(ptr, size);
|
|
return nullptr;
|
|
}
|
|
return ptr;
|
|
}
|
|
|
|
void Device::memFree(void* ptr, size_t size) const {
|
|
hsa_status_t stat = Hsa::memory_pool_free(ptr);
|
|
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Free hsa memory %p", ptr);
|
|
if (stat != HSA_STATUS_SUCCESS) {
|
|
LogError("Fail freeing local memory");
|
|
}
|
|
}
|
|
|
|
// Adjusts the tracked amount of free memory.
// `free == true` adds `size` to freeMem_, otherwise `size` is subtracted.
// If the subtraction would go below zero the counter is clamped to 0, an
// error is logged and the function returns without the regular info log.
void Device::updateFreeMemory(size_t size, bool free) {
  if (!free) {
    if (size > freeMem_) {
      // Guard against underflow of freeMem_. The tracked value can be
      // inaccurate because some allocations happen directly via ROCr.
      ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS,
              "Free memory set to zero on device 0x%lx, requested size = 0x%zx, freeMem_ = 0x%zx",
              this, size, freeMem_.load());
      freeMem_ = 0;
      return;
    }
    freeMem_ -= size;
  } else {
    freeMem_ += size;
  }

  ClPrint(amd::LOG_INFO, amd::LOG_MEM, "Device=0x%lx, freeMem_ = 0x%zx", this, freeMem_.load());
}
|
|
|
|
// ================================================================================================
|
|
// Allocates (or looks up) an SVM region and returns its SVM pointer.
//
// When `svmPtr` is non-null:
//   - if a memory object is already registered for it, that object's SVM
//     pointer is returned and nothing new is created;
//   - otherwise the pointer is only accepted together with
//     CL_MEM_USE_HOST_PTR, in which case it is used as the backing pointer.
// When `svmPtr` is null a hidden amd::Buffer is created with the
// kSvmMemoryPtr marker as its host pointer.
//
// The new buffer is registered in amd::MemObjMap under its SVM pointer.
// `alignment` is not used by this implementation.
// Returns nullptr on failure; the hidden buffer is released on every failure
// path that happens after it was constructed.
void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags,
                       void* svmPtr) const {
  amd::Memory* mem = nullptr;
  void* svmPtrUsed = reinterpret_cast<void*>(amd::Memory::MemoryType::kSvmMemoryPtr);

  if (nullptr != svmPtr) {
    // Find the existing amd::mem object
    mem = amd::MemObjMap::FindMemObj(svmPtr);
    if (mem != nullptr) {
      return mem->getSvmPtr();
    }
    if (flags & CL_MEM_USE_HOST_PTR) {
      svmPtrUsed = svmPtr;
    } else {
      // %p is the matching conversion for a pointer argument
      DevLogPrintfError("Cannot find svm_ptr: %p \n", svmPtr);
      return nullptr;
    }
  }

  // create a hidden buffer, which will be allocated on the device later
  mem = new (context) amd::Buffer(context, flags, size, svmPtrUsed);
  if (mem == nullptr) {
    LogError("failed to create a svm mem object!");
    return nullptr;
  }

  if (!mem->create(nullptr)) {
    LogError("failed to create a svm hidden buffer!");
    mem->release();
    return nullptr;
  }

  // Make sure the device side of the hidden buffer exists
  Memory* gpuMem = getRocMemory(mem);
  if (gpuMem == nullptr) {
    LogError("failed to create GPU memory from svm hidden buffer!");
    // Drop the hidden buffer, otherwise it would be leaked
    mem->release();
    return nullptr;
  }

  // add the information to context so that we can use it later.
  if (mem->getSvmPtr() != nullptr) {
    amd::MemObjMap::AddMemObj(mem->getSvmPtr(), mem);
  }
  return mem->getSvmPtr();
}
|
|
|
|
void* Device::virtualAlloc(void* req_addr, size_t size, size_t alignment) {
|
|
void* vptr = nullptr;
|
|
// Reserves the address using HSA APIs, with requested address.
|
|
// There is no guarantee that we will get the requested address.
|
|
hsa_status_t hsa_status =
|
|
Hsa::vmem_address_reserve(&vptr, size, reinterpret_cast<uint64_t>(req_addr), 0);
|
|
if (hsa_status != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Failed hsa_amd_vmem_address_reserve. Failed with status: %d \n", hsa_status);
|
|
return nullptr;
|
|
}
|
|
|
|
constexpr bool kParent = true;
|
|
amd::Memory* mem = CreateVirtualBuffer(context(), vptr, size, -1, -1, kParent);
|
|
if (mem == nullptr) {
|
|
LogPrintfError("Cannot create Virtual Buffer for vptr: %p of size: %u", vptr, size);
|
|
}
|
|
|
|
return mem->getSvmPtr();
|
|
}
|
|
|
|
bool Device::virtualFree(void* addr) {
|
|
amd::Memory* memObj = amd::MemObjMap::FindVirtualMemObj(addr);
|
|
if (memObj == nullptr) {
|
|
LogPrintfError("Cannot find the Virtual MemObj entry for this addr 0x%x", addr);
|
|
}
|
|
|
|
if (!memObj->getContext().devices()[0]->DestroyVirtualBuffer(memObj)) {
|
|
return false;
|
|
}
|
|
|
|
hsa_status_t hsa_status = Hsa::vmem_address_free(memObj->getSvmPtr(), memObj->getSize());
|
|
if (hsa_status != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Failed hsa_amd_vmem_address_free. Failed with status:%d \n", hsa_status);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Device::SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags,
|
|
VmmLocationType access_location) {
|
|
hsa_status_t hsa_status = HSA_STATUS_SUCCESS;
|
|
hsa_amd_memory_access_desc_t desc;
|
|
desc.permissions = static_cast<hsa_access_permission_t>(access_flags);
|
|
desc.agent_handle =
|
|
access_location == VmmLocationType::kDevice ? getBackendDevice() : getCpuAgent();
|
|
|
|
if ((hsa_status = Hsa::vmem_set_access(va_addr, va_size, &desc, 1)) != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Failed hsa_amd_vmem_set_access. Failed with status:%d \n", hsa_status);
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Device::GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) const {
|
|
hsa_status_t hsa_status = HSA_STATUS_SUCCESS;
|
|
hsa_access_permission_t perms;
|
|
|
|
size_t discard_offset = 0;
|
|
amd::Memory* va_mem_obj = amd::MemObjMap::FindMemObj(va_addr, &discard_offset);
|
|
if (va_mem_obj == nullptr) {
|
|
LogPrintfError("Failed to get Memory Object for va_addr: 0x%x", va_addr);
|
|
return false;
|
|
}
|
|
|
|
if ((hsa_status = Hsa::vmem_get_access(va_mem_obj->getSvmPtr(), &perms, getBackendDevice())) !=
|
|
HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Failed hsa_amd_vmem_get_access. Failed with status:%d \n", hsa_status);
|
|
return false;
|
|
}
|
|
|
|
*access_flags_ptr = static_cast<VmmAccess>(perms);
|
|
|
|
return true;
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Exports the HSA vmem handle stored in `amd_mem_obj` as a shareable handle.
// On success the resulting descriptor is written as an int through
// `shareableHandle`. Returns false when the object carries no HSA handle or
// the export call fails.
bool Device::ExportShareableVMMHandle(amd::Memory& amd_mem_obj, int flags, void* shareableHandle) {
  hsa_amd_vmem_alloc_handle_t vmemHandle{};
  vmemHandle.handle = amd_mem_obj.getUserData().hsa_handle;

  if (vmemHandle.handle == 0) {
    LogError("HSA Handle is not valid");
    return false;
  }

  int exportedFd = 0;
  const hsa_status_t status = Hsa::vmem_export_shareable_handle(&exportedFd, vmemHandle, flags);
  if (status != HSA_STATUS_SUCCESS) {
    LogPrintfError("Failed hsa_vmem_export_shareable_handle with status: %d \n", status);
    return false;
  }

  // The caller's storage is treated as an int
  int* out = reinterpret_cast<int*>(shareableHandle);
  *out = exportedFd;
  return true;
}
|
|
|
|
// ================================================================================================
|
|
bool Device::ImportShareableHSAHandle(void* osHandle, uint64_t* hsa_handle_ptr) const {
|
|
hsa_status_t hsa_status = HSA_STATUS_SUCCESS;
|
|
hsa_amd_vmem_alloc_handle_t hsa_vmem_handle{};
|
|
|
|
if (hsa_handle_ptr == nullptr) {
|
|
LogError("HSA Handle ptr is null");
|
|
return false;
|
|
}
|
|
|
|
int dmabuf_fd = static_cast<int>(reinterpret_cast<uintptr_t>(osHandle));
|
|
if ((hsa_status = Hsa::vmem_import_shareable_handle(dmabuf_fd, &hsa_vmem_handle)) !=
|
|
HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("Failed hsa_amd_vmem_import_shareable_handle with status: %d \n", hsa_status);
|
|
return false;
|
|
}
|
|
|
|
*hsa_handle_ptr = hsa_vmem_handle.handle;
|
|
return true;
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Builds a memory object from an imported OS handle.
// The buffer is created with the ROCCLR_MEM_PHYMEM and ROCCLR_MEM_INTERPROCESS
// flags, size 0, and `osHandle` as its host pointer argument.
// Returns the new object, or nullptr on failure.
amd::Memory* Device::ImportShareableVMMHandle(void* osHandle) {
  amd::Memory* imported = new (context())
      amd::Buffer(context(), ROCCLR_MEM_PHYMEM | ROCCLR_MEM_INTERPROCESS, 0, osHandle);
  if (nullptr == imported) {
    LogError("Cannot create memory object");
    return nullptr;
  }

  if (imported->create(nullptr, false)) {
    return imported;
  }

  LogError("Failed to create mem_obj from imported fd");
  imported->release();
  return nullptr;
}
|
|
|
|
// ================================================================================================
|
|
bool Device::SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryAdvice advice,
|
|
bool first_alloc, bool use_cpu, int numa_id) const {
|
|
if ((settings().hmmFlags_ & Settings::Hmm::EnableSvmTracking) && !first_alloc) {
|
|
amd::Memory* svm_mem = amd::MemObjMap::FindMemObj(dev_ptr);
|
|
if ((nullptr == svm_mem) || ((svm_mem->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) == 0) ||
|
|
// Validate the range of provided memory
|
|
((svm_mem->getSize() - (reinterpret_cast<const_address>(dev_ptr) -
|
|
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
|
|
LogPrintfError("SetSvmAttributes received unknown memory for update: %p!", dev_ptr);
|
|
return false;
|
|
}
|
|
}
|
|
if (info().hmmSupported_) {
|
|
std::vector<hsa_amd_svm_attribute_pair_t> attr;
|
|
|
|
switch (advice) {
|
|
case amd::MemoryAdvice::SetReadMostly:
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_READ_MOSTLY, true});
|
|
break;
|
|
case amd::MemoryAdvice::UnsetReadMostly:
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_READ_MOSTLY, false});
|
|
break;
|
|
case amd::MemoryAdvice::SetPreferredLocation:
|
|
if (use_cpu) {
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, getCpuAgent(numa_id).handle});
|
|
} else {
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, getBackendDevice().handle});
|
|
}
|
|
break;
|
|
case amd::MemoryAdvice::UnsetPreferredLocation:
|
|
// @note: 0 may cause a failure on old runtimes
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, 0});
|
|
break;
|
|
case amd::MemoryAdvice::SetAccessedBy: {
|
|
const uint64_t attrib = (first_alloc) ? HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
|
|
: HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE;
|
|
if (use_cpu) {
|
|
attr.push_back({attrib, getCpuAgent().handle});
|
|
} else {
|
|
if (first_alloc) {
|
|
// Provide access to all possible devices.
|
|
//! @note: HMM should support automatic page table update with xnack enabled,
|
|
//! but currently it doesn't and runtime explicitly enables access from all devices
|
|
for (const auto dev : devices()) {
|
|
// Skip null devices
|
|
if (static_cast<Device*>(dev)->getBackendDevice().handle != 0) {
|
|
attr.push_back({attrib, static_cast<Device*>(dev)->getBackendDevice().handle});
|
|
}
|
|
}
|
|
} else {
|
|
attr.push_back({attrib, getBackendDevice().handle});
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case amd::MemoryAdvice::UnsetAccessedBy:
|
|
// When unsetting we should use HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE for the agent
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE, getBackendDevice().handle});
|
|
break;
|
|
case amd::MemoryAdvice::SetCoarseGrain:
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG, HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED});
|
|
break;
|
|
case amd::MemoryAdvice::UnsetCoarseGrain:
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG, HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED});
|
|
break;
|
|
default:
|
|
return false;
|
|
break;
|
|
}
|
|
|
|
hsa_status_t status =
|
|
Hsa::svm_attributes_set(const_cast<void*>(dev_ptr), count, attr.data(), attr.size());
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogPrintfError("hsa_amd_svm_attributes_set() failed. Advice: %d, status: %d", advice, status);
|
|
return false;
|
|
}
|
|
} else {
|
|
LogWarning("hsa_amd_svm_attributes_set() is ignored, because no HMM support");
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// ================================================================================================
|
|
bool Device::SetSvmAttributes(const void* dev_ptr, size_t count, amd::MemoryAdvice advice,
|
|
bool use_cpu, int numa_id) const {
|
|
constexpr bool kFirstAlloc = false;
|
|
return SetSvmAttributesInt(dev_ptr, count, advice, kFirstAlloc, use_cpu);
|
|
}
|
|
|
|
// ================================================================================================
|
|
bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
|
|
size_t num_attributes, const void* dev_ptr, size_t count) const {
|
|
if (settings().hmmFlags_ & Settings::Hmm::EnableSvmTracking) {
|
|
amd::Memory* svm_mem = amd::MemObjMap::FindMemObj(dev_ptr);
|
|
if ((nullptr == svm_mem) || ((svm_mem->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) == 0) ||
|
|
// Validate the range of provided memory
|
|
((svm_mem->getSize() - (reinterpret_cast<const_address>(dev_ptr) -
|
|
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
|
|
LogPrintfError("GetSvmAttributes received unknown memory %p for state!", dev_ptr);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
hsa_amd_pointer_info_t ptr_info = {};
|
|
for (size_t i = 0; i < num_attributes; ++i) {
|
|
if (attributes[i] == amd::MemRangeAttribute::CoherencyMode) {
|
|
ptr_info.size = sizeof(hsa_amd_pointer_info_t);
|
|
// Query ptr type to see if it's a HMM allocation
|
|
hsa_status_t status =
|
|
Hsa::pointer_info(const_cast<void*>(dev_ptr), &ptr_info, nullptr, nullptr, nullptr);
|
|
// The call should never fail in ROCR, but just check for an error and continue
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogError("hsa_amd_pointer_info() failed");
|
|
}
|
|
|
|
// Check if it's a legacy non-HMM allocation and update query
|
|
*reinterpret_cast<uint32_t*>(data[i]) = HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE;
|
|
if (ptr_info.type == HSA_EXT_POINTER_TYPE_HSA ||
|
|
ptr_info.type == HSA_EXT_POINTER_TYPE_LOCKED ||
|
|
ptr_info.type == HSA_EXT_POINTER_TYPE_IPC) {
|
|
if (ptr_info.global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
|
|
*reinterpret_cast<uint32_t*>(data[i]) = HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED;
|
|
} else if (ptr_info.global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
|
|
*reinterpret_cast<uint32_t*>(data[i]) = HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (info().hmmSupported_) {
|
|
uint32_t accessed_by = 0;
|
|
std::vector<hsa_amd_svm_attribute_pair_t> attr;
|
|
|
|
for (size_t i = 0; i < num_attributes; ++i) {
|
|
switch (attributes[i]) {
|
|
case amd::MemRangeAttribute::ReadMostly:
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_READ_MOSTLY, 0});
|
|
break;
|
|
case amd::MemRangeAttribute::PreferredLocation:
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, 0});
|
|
break;
|
|
case amd::MemRangeAttribute::AccessedBy:
|
|
accessed_by = attr.size();
|
|
// Add all GPU devices into the query
|
|
for (const auto agent : gpu_agents_) {
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle});
|
|
}
|
|
// Add CPU devices
|
|
for (const auto agent_info : cpu_agents_) {
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent_info.agent.handle});
|
|
}
|
|
accessed_by = attr.size() - accessed_by;
|
|
break;
|
|
case amd::MemRangeAttribute::LastPrefetchLocation:
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION, 0});
|
|
break;
|
|
case amd::MemRangeAttribute::CoherencyMode:
|
|
if (*reinterpret_cast<uint32_t*>(data[i]) == HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE) {
|
|
attr.push_back({HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG, 0});
|
|
}
|
|
break;
|
|
default:
|
|
return false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
hsa_status_t status =
|
|
Hsa::svm_attributes_get(const_cast<void*>(dev_ptr), count, attr.data(), attr.size());
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogError("hsa_amd_svm_attributes_get() failed");
|
|
return false;
|
|
}
|
|
|
|
uint32_t idx = 0;
|
|
uint32_t rocr_attr = 0;
|
|
for (size_t i = 0; i < num_attributes; ++i) {
|
|
const auto& it = attr[rocr_attr];
|
|
switch (attributes[i]) {
|
|
case amd::MemRangeAttribute::ReadMostly:
|
|
if (data_sizes[idx] != sizeof(uint32_t)) {
|
|
return false;
|
|
}
|
|
// Cast ROCr value into the hip format
|
|
*reinterpret_cast<uint32_t*>(data[idx]) =
|
|
(static_cast<uint32_t>(it.value) > 0) ? true : false;
|
|
++rocr_attr;
|
|
break;
|
|
// The logic should be identical for the both queries
|
|
case amd::MemRangeAttribute::PreferredLocation:
|
|
case amd::MemRangeAttribute::LastPrefetchLocation:
|
|
if (data_sizes[idx] != sizeof(uint32_t)) {
|
|
return false;
|
|
}
|
|
*reinterpret_cast<int32_t*>(data[idx]) = static_cast<int32_t>(amd::InvalidDeviceId);
|
|
// Find device agent returned by ROCr
|
|
for (auto& device : devices()) {
|
|
if (static_cast<Device*>(device)->getBackendDevice().handle == it.value) {
|
|
*reinterpret_cast<uint32_t*>(data[idx]) = static_cast<uint32_t>(device->index());
|
|
}
|
|
}
|
|
// Find CPU agent returned by ROCr
|
|
for (auto& agent_info : cpu_agents_) {
|
|
if (agent_info.agent.handle == it.value) {
|
|
*reinterpret_cast<int32_t*>(data[idx]) = static_cast<int32_t>(amd::CpuDeviceId);
|
|
}
|
|
}
|
|
++rocr_attr;
|
|
break;
|
|
case amd::MemRangeAttribute::AccessedBy: {
|
|
uint32_t entry = 0;
|
|
uint32_t device_count = data_sizes[idx] / 4;
|
|
// Make sure it's multiple of 4
|
|
if (data_sizes[idx] % 4 != 0) {
|
|
return false;
|
|
}
|
|
for (uint32_t att = 0; att < accessed_by; ++att) {
|
|
const auto& it = attr[rocr_attr + att];
|
|
if (entry >= device_count) {
|
|
// The size of the array is less than the amount of available devices
|
|
break;
|
|
}
|
|
switch (it.attribute) {
|
|
case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE:
|
|
case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS:
|
|
break;
|
|
case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE:
|
|
reinterpret_cast<int32_t*>(data[idx])[entry] =
|
|
static_cast<int32_t>(amd::InvalidDeviceId);
|
|
// Find device agent returned by ROCr
|
|
for (auto& device : devices()) {
|
|
if (static_cast<Device*>(device)->getBackendDevice().handle == it.value) {
|
|
reinterpret_cast<uint32_t*>(data[idx])[entry] =
|
|
static_cast<uint32_t>(device->index());
|
|
}
|
|
}
|
|
// Find CPU agent returned by ROCr
|
|
for (auto& agent_info : cpu_agents_) {
|
|
if (agent_info.agent.handle == it.value) {
|
|
reinterpret_cast<int32_t*>(data[idx])[entry] =
|
|
static_cast<int32_t>(amd::CpuDeviceId);
|
|
}
|
|
}
|
|
++entry;
|
|
break;
|
|
default:
|
|
LogWarning("Unexpected result from HSA_AMD_SVM_ATTRIB_ACCESS_QUERY");
|
|
break;
|
|
}
|
|
}
|
|
rocr_attr += accessed_by;
|
|
for (uint32_t i = entry; i < device_count; ++i) {
|
|
reinterpret_cast<int32_t*>(data[idx])[i] = static_cast<int32_t>(amd::InvalidDeviceId);
|
|
}
|
|
break;
|
|
}
|
|
case amd::MemRangeAttribute::CoherencyMode:
|
|
if (data_sizes[idx] != sizeof(uint32_t)) {
|
|
return false;
|
|
}
|
|
// if ptr is HMM alloc then overwrite the values
|
|
if (*reinterpret_cast<uint32_t*>(data[idx]) == HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE) {
|
|
// Cast ROCr value into the hip format
|
|
*reinterpret_cast<uint32_t*>(data[idx]) = static_cast<uint32_t>(it.value);
|
|
}
|
|
++rocr_attr;
|
|
break;
|
|
default:
|
|
return false;
|
|
break;
|
|
}
|
|
// Find the next location in the query
|
|
++idx;
|
|
}
|
|
} else if (ptr_info.type == HSA_EXT_POINTER_TYPE_RESERVED_ADDR) {
|
|
LogError("GetSvmAttributes() failed, because no HMM support");
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
size_t Device::ScratchLimitCurrent() const {
|
|
uint64_t scratchLimitCurrent = 0;
|
|
hsa_status_t ret =
|
|
Hsa::agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_CURRENT,
|
|
&scratchLimitCurrent);
|
|
if (HSA_STATUS_SUCCESS != ret) {
|
|
LogPrintfError("HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_CURRENT cannot be queried! Err: 0x%xh", ret);
|
|
return 0;
|
|
}
|
|
return static_cast<size_t>(scratchLimitCurrent);
|
|
};
|
|
|
|
bool Device::UpdateScratchLimitCurrent(size_t limit) const {
|
|
hsa_status_t ret = Hsa::agent_set_async_scratch_limit(bkendDevice_, limit);
|
|
if (HSA_STATUS_SUCCESS != ret) {
|
|
LogPrintfError("hsa_amd_agent_set_async_scratch_limit(%zu) failed with err 0x%xh", limit, ret);
|
|
return false;
|
|
}
|
|
return true;
|
|
};
|
|
|
|
// ================================================================================================
|
|
bool Device::SvmAllocInit(void* memory, size_t size) const {
|
|
amd::MemoryAdvice advice = amd::MemoryAdvice::SetAccessedBy;
|
|
constexpr bool kFirstAlloc = true;
|
|
if (!SetSvmAttributesInt(memory, size, advice, kFirstAlloc)) {
|
|
return false;
|
|
}
|
|
|
|
if ((settings().hmmFlags_ & Settings::Hmm::EnableMallocPrefetch) == 0) {
|
|
return true;
|
|
}
|
|
|
|
if (info().hmmSupported_) {
|
|
// Initialize signal for the barrier
|
|
Hsa::signal_store_relaxed(prefetch_signal_, kInitSignalValueOne);
|
|
|
|
// Initiate a prefetch command which should force memory update in HMM
|
|
hsa_status_t status =
|
|
Hsa::svm_prefetch_async(memory, size, getBackendDevice(), 0, nullptr, prefetch_signal_);
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogError("hsa_amd_svm_prefetch_async() failed");
|
|
return false;
|
|
}
|
|
|
|
// Wait for the prefetch
|
|
if (!WaitForSignal(prefetch_signal_)) {
|
|
LogError("Barrier packet submission failed");
|
|
return false;
|
|
}
|
|
} else {
|
|
LogWarning("Early prefetch failed, because no HMM support");
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// ================================================================================================
|
|
void Device::svmFree(void* ptr) const {
|
|
amd::Memory* svmMem = amd::MemObjMap::FindMemObj(ptr);
|
|
if (nullptr != svmMem) {
|
|
amd::MemObjMap::RemoveMemObj(svmMem->getSvmPtr());
|
|
svmMem->release();
|
|
}
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Returns the virtual GPU used for internal memory transfers, creating it on first use.
// Sync blit is (re)enabled on the queue on every call. Returns nullptr if creation fails.
VirtualGPU* Device::xferQueue() const {
  if (xferQueue_ == nullptr) {
    // Lazy creation mutates the device, hence the const_cast in this const accessor
    auto* self = const_cast<Device*>(this);
    self->xferQueue_ = reinterpret_cast<VirtualGPU*>(self->createVirtualDevice());
    if (xferQueue_ == nullptr) {
      LogError("Couldn't create the device transfer manager!");
      return nullptr;
    }
    // Make sure the new virtual device has a HW queue attached
    if (nullptr == xferQueue_->gpu_queue()) {
      xferQueue_->set_gpu_queue(self->AcquireActiveNormalQueue());
    }
  }
  xferQueue_->enableSyncBlit();
  return xferQueue_;
}
|
|
|
|
// ================================================================================================
|
|
bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
|
|
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
|
|
bool result = true;
|
|
return result;
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Reports whether the HW signal behind an event has completed.
// - No HW event attached: returns false.
// - wait == true: waits on the signal and returns the result of WaitForSignal().
// - wait == false: returns true only if the signal value has already reached 0.
bool Device::IsHwEventReady(const amd::Event& event, bool wait, amd::SyncPolicy policy) const {
  // Prefer the HW event of the notify event when one is attached
  void* hw_event = nullptr;
  if (event.NotifyEvent() != nullptr) {
    hw_event = event.NotifyEvent()->HwEvent();
  } else {
    hw_event = event.HwEvent();
  }

  if (hw_event == nullptr) {
    ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_SIG, "No HW event");
    return false;
  }

  auto* profiling_signal = reinterpret_cast<ProfilingSignal*>(hw_event);
  if (wait) {
    // With the blocking policy the CPU gives up the host thread for other work;
    // otherwise it may busy-wait for the event, if active wait is enabled at all.
    const bool blocking = (policy == amd::SyncPolicy::Blocking);
    const bool active_wait = !blocking && ActiveWait();
    const bool yield = (policy == amd::SyncPolicy::Yield);
    return WaitForSignal(profiling_signal->signal_, active_wait, yield);
  }

  // Non-blocking query: just sample the current signal value
  auto signal = profiling_signal->signal_;
  ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Check HW event = 0x%lx", signal.handle);

  return (Hsa::signal_load_relaxed(signal) == 0);
}
|
|
|
|
// ================================================================================================
|
|
// Fetches the start/end time recorded on the HW signal of an event.
// If the event has no HW event attached, both outputs are set to 0.
void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {
  // Prefer the HW event of the notify event when one is attached
  void* hw_event = nullptr;
  if (event.NotifyEvent() != nullptr) {
    hw_event = event.NotifyEvent()->HwEvent();
  } else {
    hw_event = event.HwEvent();
  }

  if (hw_event != nullptr) {
    auto* profiling_signal = reinterpret_cast<ProfilingSignal*>(hw_event);
    fetchSignalTime(profiling_signal->signal_, getBackendDevice(), start, end);
    return;
  }

  ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event to read time");
  *end = 0;
  *start = 0;
}
|
|
|
|
// ================================================================================================
|
|
// Picks an existing HW queue from the pool of the given priority index and bumps its refCount.
// - Pool below GPU_MAX_HW_QUEUES: only a queue with refCount 0 is reused.
// - Pool at the limit: the queue with the lowest refCount is shared.
// Returns nullptr when no queue qualifies.
hsa_queue_t* Device::getQueueFromPool(const uint qIndex) {
  if (queuePool_[qIndex].size() < GPU_MAX_HW_QUEUES) {
    // Still room for new queues: hand out an idle one only
    for (auto& entry : queuePool_[qIndex]) {
      if (entry.second.refCount != 0) {
        continue;
      }
      entry.second.refCount++;
      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Selected queue refCount: %p (%d)",
              entry.first->base_address, entry.second.refCount);
      return entry.first;
    }
    return nullptr;
  }

  if (!(qIndex < QueuePriority::Total) || !(queuePool_[qIndex].size() > 0)) {
    return nullptr;
  }

  // Search through all available queues for the lowest counter.
  // Note: the map is sorted in the allocation order for possible round-robin selection
  using PoolEntry = decltype(queuePool_)::value_type::const_reference;
  auto& pool = queuePool_[qIndex];
  auto least_used = std::min_element(
      pool.begin(), pool.end(),
      [](PoolEntry lhs, PoolEntry rhs) { return lhs.second.refCount < rhs.second.refCount; });
  least_used->second.refCount++;
  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Selected queue refCount: %p (%d)",
          least_used->first->base_address, least_used->second.refCount);
  return least_used->first;
}
|
|
|
|
// ================================================================================================
|
|
// Acquires a managed, non-cooperative, normal priority HW queue without a CU mask,
// using ROC_AQL_QUEUE_SIZE as the size hint.
hsa_queue_t* Device::AcquireActiveNormalQueue() {
  constexpr bool kCoopQueue = false;
  constexpr bool kManaged = true;
  const uint32_t size_hint = ROC_AQL_QUEUE_SIZE;
  return acquireQueue(size_hint, kCoopQueue, std::vector<uint32_t>{},
                      amd::CommandQueue::Priority::Normal, kManaged);
}
|
|
|
|
// ================================================================================================
|
|
// Returns a HW (AQL) queue for the requested priority, either by reusing one from the
// per-priority pool or by creating a new one.
//
// queue_size_hint - requested queue size in packets; clamped to the agent's maximum and halved
//                   on every failed creation attempt (gives up below 64).
// coop_queue      - create a cooperative queue; such queues are never pooled or reused.
// cuMask          - optional custom CU mask; queues with a custom mask are never reused and are
//                   tracked in queueWithCUMaskPool_ instead of queuePool_.
// priority        - requested priority; Medium and unknown values are treated as Normal.
// managed         - when false, normal priority queues without a CU mask are counted in
//                   num_normal_queues_.
//
// Returns nullptr if the queue can't be created or configured.
hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
                                  const std::vector<uint32_t>& cuMask,
                                  amd::CommandQueue::Priority priority, bool managed) {
  amd::ScopedLock l(active_queue_access_);

  // NOTE(review): the conditions are OR-ed, so the assert only fires when all three pools
  // exceed the limit at the same time - confirm whether AND was intended.
  assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES ||
         queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES ||
         queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);

  // The pool sizes are size_t values, so they must be printed with %zu
  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
          "Number of allocated hardware queues with low priority: %zu,"
          " with normal priority: %zu, with high priority: %zu, maximum per priority is: %d",
          queuePool_[QueuePriority::Low].size(), queuePool_[QueuePriority::Normal].size(),
          queuePool_[QueuePriority::High].size(), GPU_MAX_HW_QUEUES);

  // Translate the runtime priority into the ROCr priority and the pool index
  hsa_amd_queue_priority_t queue_priority;
  uint qIndex;
  switch (priority) {
    case amd::CommandQueue::Priority::Low:
      queue_priority = HSA_AMD_QUEUE_PRIORITY_LOW;
      qIndex = QueuePriority::Low;
      break;
    case amd::CommandQueue::Priority::High:
      queue_priority = HSA_AMD_QUEUE_PRIORITY_HIGH;
      qIndex = QueuePriority::High;
      break;
    case amd::CommandQueue::Priority::Normal:
    case amd::CommandQueue::Priority::Medium:
    default:
      queue_priority = HSA_AMD_QUEUE_PRIORITY_NORMAL;
      qIndex = QueuePriority::Normal;
      break;
  }

  // If we have reached the max number of queues, reuse an existing queue with the matching queue
  // priority, choosing the one with the least number of users. Note: Don't attempt to reuse the
  // cooperative queue, since it's single per device
  if (!coop_queue && (cuMask.size() == 0) &&
      ((queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES) || queuePool_[qIndex].size() > 0)) {
    hsa_queue_t* queue = getQueueFromPool(qIndex);
    if (queue != nullptr) {
      if (!managed && (qIndex == QueuePriority::Normal)) {
        num_normal_queues_++;
      }
      return queue;
    }
  }

  // Else create a new queue. This also includes the initial state where there
  // is no queue.
  uint32_t queue_max_packets = 0;
  if (HSA_STATUS_SUCCESS !=
      Hsa::agent_get_info(bkendDevice_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_max_packets)) {
    DevLogError("Cannot get hsa agent info \n");
    return nullptr;
  }
  auto queue_size = (queue_max_packets < queue_size_hint) ? queue_max_packets : queue_size_hint;

  hsa_queue_t* queue;
  auto queue_type = HSA_QUEUE_TYPE_MULTI;

  // Enable cooperative queue for the device queue
  if (coop_queue) {
    queue_type = HSA_QUEUE_TYPE_COOPERATIVE;
  }

  // Retry with a smaller queue until the creation succeeds or the size drops below 64 packets
  while (Hsa::queue_create(bkendDevice_, queue_size, queue_type, callbackQueue, this,
                           std::numeric_limits<uint>::max(), std::numeric_limits<uint>::max(),
                           &queue) != HSA_STATUS_SUCCESS) {
    queue_size >>= 1;
    if (queue_size < 64) {
      // if a queue with the same requested priority is available from the pool, return it here
      if (!coop_queue && (cuMask.size() == 0) && (queuePool_[qIndex].size() > 0)) {
        return getQueueFromPool(qIndex);
      }
      DevLogError("Device::acquireQueue: hsa_queue_create failed!");
      return nullptr;
    }
  }

  // default priority is normal so no need to set it again
  if (queue_priority != HSA_AMD_QUEUE_PRIORITY_NORMAL) {
    hsa_status_t st = Hsa::queue_set_priority(queue, queue_priority);
    if (st != HSA_STATUS_SUCCESS) {
      DevLogError("Device::acquireQueue: hsa_amd_queue_set_priority failed!");
      Hsa::queue_destroy(queue);
      return nullptr;
    }
  }

  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
          "Created SWq=%p to map on HWq=%p with "
          "size %d with priority %d, cooperative: %i",
          queue, queue->base_address, queue_size, queue_priority, coop_queue);

  Hsa::profiling_set_profiler_enabled(queue, 1);
  if (cuMask.size() != 0 || info_.globalCUMask_.size() != 0) {
    std::stringstream ss;
    ss << std::hex;
    std::vector<uint32_t> mask = {};

    // handle scenarios where cuMask (custom-defined), globalCUMask_ or both are valid and
    // fill the final mask which will be applied to the current queue
    if (cuMask.size() != 0 && info_.globalCUMask_.size() == 0) {
      mask = cuMask;
    } else if (cuMask.size() != 0 && info_.globalCUMask_.size() != 0) {
      const size_t common_size = std::min(cuMask.size(), info_.globalCUMask_.size());
      for (size_t i = 0; i < common_size; i++) {
        mask.push_back(cuMask[i] & info_.globalCUMask_[i]);
      }
      // check to make sure after ANDing cuMask (custom-defined) with global
      // CU mask, we have non-zero mask, otherwise just apply global CU mask
      bool zeroCUMask = true;
      for (auto m : mask) {
        if (m != 0) {
          zeroCUMask = false;
          break;
        }
      }
      if (zeroCUMask) {
        mask = info_.globalCUMask_;
      }
    } else {
      mask = info_.globalCUMask_;
    }

    // Print the mask starting from the most significant dword
    for (auto dword = mask.rbegin(); dword != mask.rend(); ++dword) {
      ss << std::setfill('0') << std::setw(8) << *dword;
    }
    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Setting CU mask 0x%s for hardware queue %p",
            ss.str().c_str(), queue->base_address);

    std::vector<uint32_t> final_mask = {};
    // hsa_amd_queue_cu_set_mask expects each bit in cuMask to represent each CU
    // For wgp mode: Each wgp consists of 2 CUs and CUs must be adjacent pairwise enabled
    // Convert each bit in the cuMask from wgp to cu by duplicating it
    if (settings().enableWgpMode_) {
      final_mask.resize(mask.size() * 2, 0);

      for (size_t i = 0; i < mask.size(); i++) {
        for (uint32_t j = 0; j < 16; j++) {
          // Convert least significant 16 bits
          if (((mask[i] >> j) & 0x1) == 0x1) {
            final_mask[2 * i] |= (0x3u << (2 * j));
          }

          // Convert most significant 16 bits
          if (((mask[i] >> (16 + j)) & 0x1) == 0x1) {
            final_mask[2 * i + 1] |= (0x3u << (2 * j));
          }
        }
      }
    } else {
      final_mask = mask;
    }

    hsa_status_t status =
        Hsa::queue_cu_set_mask(queue, final_mask.size() * 32, final_mask.data());
    if (status != HSA_STATUS_SUCCESS) {
      DevLogError("Device::acquireQueue: hsa_amd_queue_cu_set_mask failed!");
      Hsa::queue_destroy(queue);
      return nullptr;
    }
    if (cuMask.size() != 0) {
      // add queues with custom CU mask into their special pool to keep track
      // of mapping of these queues to their associated queueInfo (i.e., hostcall buffers)
      auto result = queueWithCUMaskPool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
      assert(result.second && "QueueInfo already exists");
      auto& qInfo = result.first->second;
      qInfo.refCount = 1;

      return queue;
    }
  }

  if (coop_queue) {
    // Skip queue recycling for cooperative queues, since it should be just one
    // per device.
    return queue;
  }
  auto result = queuePool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
  assert(result.second && "QueueInfo already exists");
  auto& qInfo = result.first->second;
  qInfo.refCount = 1;
  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "acquireQueue refCount: %p (%d)",
          result.first->first->base_address, result.first->second.refCount);
  // Only unmanaged normal priority queues are counted. This must be a comparison: an assignment
  // here would count the queues of every priority.
  if (!managed && (cuMask.size() == 0) && (qIndex == QueuePriority::Normal)) {
    num_normal_queues_++;
  }
  return queue;
}
|
|
|
|
// ================================================================================================
|
|
// Releases the given queue only while more normal queues are counted than GPU_MAX_HW_QUEUES.
// Returns true if the queue was released, false if it was left untouched.
bool Device::ReleaseActiveNormalQueue(hsa_queue_t* queue) {
  const bool over_limit = num_normal_queues_.load() > GPU_MAX_HW_QUEUES;
  if (!over_limit) {
    return false;
  }
  constexpr bool kCoopQueue = false;
  constexpr bool kManaged = true;
  releaseQueue(queue, std::vector<uint32_t>{}, kCoopQueue, kManaged);
  return true;
}
|
|
|
|
// ================================================================================================
|
|
// Drops one reference from the given HW queue.
// - Queues without a CU mask stay in queuePool_ for later reuse.
// - Queues with a CU mask are destroyed (together with their hostcall buffer) once the
//   refCount reaches 0, since they are never reused.
// - Cooperative queues are always destroyed.
void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask, bool coop_queue,
                          bool managed) {
  amd::ScopedLock l(active_queue_access_);

  const bool has_cu_mask = !cuMask.empty();
  auto& pools = has_cu_mask ? queueWithCUMaskPool_ : queuePool_;
  for (auto& pool : pools) {
    auto found = pool.find(queue);
    if (found == pool.end()) {
      continue;
    }

    // Keep the counter of unmanaged normal priority queues in sync
    if (!managed && !has_cu_mask && (&pool == &queuePool_[QueuePriority::Normal])) {
      num_normal_queues_--;
    }

    auto& entry = found->second;
    assert(entry.refCount > 0);
    entry.refCount--;
    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "releaseQueue refCount:%p (%d)",
            found->first->base_address, found->second.refCount);

    // hsa queues with cumask set are not being reused. Hence, if the app uses multiple
    // such queues it can cause memory leak and those must be destroyed here once the
    // refcount reaches 0.
    if (!has_cu_mask || (entry.refCount != 0)) {
      continue;
    }

    if (entry.hostcallBuffer_) {
      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
              "Deleting hostcall buffer %p for hardware queue %p", entry.hostcallBuffer_,
              found->first->base_address);
      amd::disableHostcalls(entry.hostcallBuffer_);
      context().svmFree(entry.hostcallBuffer_);
    }
    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Deleting hardware queue %p with refCount 0",
            queue->base_address);
    pool.erase(found);
    Hsa::queue_destroy(queue);
  }

  if (coop_queue) {
    // Cooperative queues aren't pooled, so they are destroyed right away
    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Deleting CG enabled hardware queue %p ",
            queue->base_address);
    Hsa::queue_destroy(queue);
  }
}
|
|
|
|
// Returns the hostcall buffer associated with a HW queue, creating and registering it with the
// hostcall listener on first use.
//
// queue      - HW queue the buffer belongs to; must be present in queuePool_ (no CU mask) or
//              queueWithCUMaskPool_ (custom CU mask) unless coop_queue is set.
// coop_queue - use the single per-device buffer of the cooperative queue instead.
// cuMask     - custom CU mask of the queue; only used to select the pool to search.
//
// Returns nullptr if the queue is unknown, the allocation fails or the buffer can't be
// registered. A buffer is cached only after the registration succeeded, so a failed attempt
// doesn't leak the allocation and can't be handed out by a later call.
void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue, bool coop_queue,
                                        const std::vector<uint32_t>& cuMask) {
  QueueInfo* queue_info = nullptr;
  if (!coop_queue) {
    for (auto& pool : cuMask.size() == 0 ? queuePool_ : queueWithCUMaskPool_) {
      auto qIter = pool.find(queue);
      if (qIter != pool.end()) {
        queue_info = &qIter->second;
        break;
      }
    }
    assert((queue_info != nullptr) && "Couldn't find queue");
    if (queue_info == nullptr) {
      // Release builds have no assert: fail the request instead of using an invalid entry
      ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Couldn't find hardware queue %p in the queue pool",
              static_cast<void*>(queue));
      return nullptr;
    }

    if (queue_info->hostcallBuffer_) {
      return queue_info->hostcallBuffer_;
    }
  } else {
    if (coopHostcallBuffer_) {
      return coopHostcallBuffer_;
    }
  }

  // The number of packets required in each buffer is at least equal to the
  // maximum number of waves supported by the device.
  auto wavesPerCu = info().maxThreadsPerCU_ / info().wavefrontWidth_;
  auto numPackets = info().maxComputeUnits_ * wavesPerCu;

  auto size = amd::getHostcallBufferSize(numPackets);
  auto align = amd::getHostcallBufferAlignment();

  void* buffer = context().svmAlloc(size, align, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS);
  if (!buffer) {
    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE,
            "Failed to create hostcall buffer for hardware queue %p", queue->base_address);
    return nullptr;
  }
  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Created hostcall buffer %p for hardware queue %p", buffer,
          queue->base_address);

  if (!amd::enableHostcalls(*this, buffer, numPackets)) {
    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to register hostcall buffer %p with listener",
            buffer);
    // The buffer was never cached, so give the memory back right away
    context().svmFree(buffer);
    return nullptr;
  }

  // Remember the registered buffer for the subsequent requests
  if (!coop_queue) {
    queue_info->hostcallBuffer_ = buffer;
  } else {
    coopHostcallBuffer_ = buffer;
  }
  return buffer;
}
|
|
|
|
// Fills the requested link attributes for the link between this device and the
// gpuvm_segment_ memory pool of another ROCm device.
bool Device::findLinkInfo(const amd::Device& other_device, std::vector<LinkAttrType>* link_attrs) {
  const auto& other_roc_device = static_cast<const roc::Device&>(other_device);
  return findLinkInfo(other_roc_device.gpuvm_segment_, link_attrs);
}
|
|
|
|
bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool,
|
|
std::vector<LinkAttrType>* link_attrs) {
|
|
if ((!pool.handle) || (link_attrs == nullptr)) {
|
|
return false;
|
|
}
|
|
|
|
// Retrieve the hops between 2 devices.
|
|
int32_t hops = 0;
|
|
hsa_status_t hsa_status = Hsa::agent_memory_pool_get_info(
|
|
bkendDevice_, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &hops);
|
|
|
|
if (hsa_status != HSA_STATUS_SUCCESS) {
|
|
DevLogPrintfError("Cannot get hops info, hsa failed with status: %d", hsa_status);
|
|
return false;
|
|
}
|
|
|
|
if (hops < 0) {
|
|
return false;
|
|
}
|
|
|
|
// The pool is on its agent
|
|
if (hops == 0) {
|
|
for (auto& link_attr : (*link_attrs)) {
|
|
switch (link_attr.first) {
|
|
case kLinkLinkType: {
|
|
// No link, so type is meaningless,
|
|
// caller should ignore it
|
|
link_attr.second = -1;
|
|
break;
|
|
}
|
|
case kLinkHopCount: {
|
|
// no hop
|
|
link_attr.second = 0;
|
|
break;
|
|
}
|
|
case kLinkDistance: {
|
|
// distance is zero, if no hops
|
|
link_attr.second = 0;
|
|
break;
|
|
}
|
|
case kLinkAtomicSupport: {
|
|
// atomic support if its on the same agent
|
|
link_attr.second = 1;
|
|
break;
|
|
}
|
|
default: {
|
|
DevLogPrintfError("Invalid LinkAttribute: %d ", link_attr.first);
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Retrieve link info on the pool.
|
|
std::vector<hsa_amd_memory_pool_link_info_t> link_info(hops);
|
|
hsa_status = Hsa::agent_memory_pool_get_info(
|
|
bkendDevice_, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info.data());
|
|
|
|
if (hsa_status != HSA_STATUS_SUCCESS) {
|
|
DevLogPrintfError("Cannot retrieve link info, hsa failed with status: %d", hsa_status);
|
|
return false;
|
|
}
|
|
|
|
for (auto& link_attr : (*link_attrs)) {
|
|
switch (link_attr.first) {
|
|
case kLinkLinkType: {
|
|
link_attr.second = static_cast<int32_t>(link_info[0].link_type);
|
|
break;
|
|
}
|
|
case kLinkHopCount: {
|
|
uint32_t distance = 0;
|
|
// Because of Rocrs limitation hops is set to 1 always between two different devices
|
|
// If Rocr Changes the behaviour revisit this logic
|
|
for (size_t hop_idx = 0; hop_idx < static_cast<size_t>(hops); ++hop_idx) {
|
|
distance += link_info[hop_idx].numa_distance;
|
|
}
|
|
uint32_t oneHopDistance = (link_info[0].link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) ? 13 : 20;
|
|
link_attr.second = static_cast<int32_t>(distance / oneHopDistance);
|
|
break;
|
|
}
|
|
case kLinkDistance: {
|
|
uint32_t distance = 0;
|
|
// Sum of distances between hops
|
|
for (size_t hop_idx = 0; hop_idx < static_cast<size_t>(hops); ++hop_idx) {
|
|
distance += link_info[hop_idx].numa_distance;
|
|
}
|
|
link_attr.second = static_cast<int32_t>(distance);
|
|
break;
|
|
}
|
|
case kLinkAtomicSupport: {
|
|
// if either of the atomic is supported
|
|
link_attr.second = static_cast<int32_t>(link_info[0].atomic_support_64bit ||
|
|
link_info[0].atomic_support_32bit);
|
|
break;
|
|
}
|
|
default: {
|
|
DevLogPrintfError("Invalid LinkAttribute: %d ", link_attr.first);
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// ================================================================================================
|
|
void Device::getGlobalCUMask(std::string cuMaskStr) {
|
|
if (cuMaskStr.length() != 0) {
|
|
std::string pre = cuMaskStr.substr(0, 2);
|
|
if (pre.compare("0x") == 0 || pre.compare("0X") == 0) {
|
|
cuMaskStr = cuMaskStr.substr(2, cuMaskStr.length());
|
|
}
|
|
|
|
int end = cuMaskStr.length();
|
|
|
|
// the number of current physical CUs compressed in 4-bits
|
|
size_t compPhysicalCUs = static_cast<size_t>(
|
|
(settings().enableWgpMode_ ? info_.maxComputeUnits_ * 2 : info_.maxComputeUnits_) / 4);
|
|
|
|
// the number of final available compute units after applying the requested CU mask
|
|
uint32_t availCUs = 0;
|
|
|
|
// read numCharToRead characters (8 or less) from the cuMask string each time, convert
|
|
// it into hex, and store it into the globalCUMask_. If the length of the cuMask string
|
|
// is more than the compressed physical available CUs, ignore the rest
|
|
for (unsigned i = 0; i < std::min(cuMaskStr.length(), compPhysicalCUs); i += 8) {
|
|
int numCharToRead = (i + 8 <= compPhysicalCUs) ? 8 : compPhysicalCUs - 8;
|
|
std::string temp =
|
|
cuMaskStr.substr(std::max(0, end - numCharToRead), std::min(numCharToRead, end));
|
|
end -= numCharToRead;
|
|
unsigned long ul = 0;
|
|
try {
|
|
ul = std::stoul(temp, 0, 16);
|
|
} catch (const std::invalid_argument&) {
|
|
info_.globalCUMask_ = {};
|
|
availCUs = 0;
|
|
break;
|
|
}
|
|
info_.globalCUMask_.push_back(static_cast<uint32_t>(ul));
|
|
// count number of set bits in ul to find the number of active CUs
|
|
// in each iteration
|
|
while (ul) {
|
|
ul &= (ul - 1);
|
|
availCUs++;
|
|
}
|
|
}
|
|
// update the maxComputeUnits_ based on the requested CU mask
|
|
if (availCUs != 0 && availCUs < compPhysicalCUs * 4) {
|
|
info_.maxComputeUnits_ = settings().enableWgpMode_ ? availCUs / 2 : availCUs;
|
|
} else {
|
|
info_.globalCUMask_ = {};
|
|
}
|
|
} else {
|
|
info_.globalCUMask_ = {};
|
|
}
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Allocates a new roc::Signal and hands it out through the generic device::Signal interface.
device::Signal* Device::createSignal() const {
  device::Signal* signal = new roc::Signal();
  return signal;
}
|
|
|
|
// ================================================================================================
|
|
// System event handler registered with ROCr. Maps the HSA event type to a CL error code and
// logs it. Returns HSA_STATUS_ERROR when a core file should be dumped or
// HIP_SKIP_ABORT_ON_GPU_ERROR is off; otherwise stores the error in gpu_error_ and returns
// HSA_STATUS_SUCCESS.
hsa_status_t Device::BackendErrorCallBackHandler(const hsa_amd_event_t* event, void* data) {
  cl_int reported_error = CL_SUCCESS;
  switch (event->event_type) {
    case HSA_AMD_GPU_MEMORY_FAULT_EVENT: {
      reported_error = CL_INVALID_MEM_OBJECT;
      LogError("Memory Fault Error");
      break;
    }
    case HSA_AMD_GPU_HW_EXCEPTION_EVENT: {
      reported_error = CL_INVALID_OPERATION;
      LogError("HW Exception Error");
      break;
    }
    case HSA_AMD_GPU_MEMORY_ERROR_EVENT: {
      reported_error = CL_DEVICE_NOT_AVAILABLE;
      LogError("GPU Memory Error");
      break;
    }
    default: {
      reported_error = CL_DEVICE_NOT_AVAILABLE;
      LogError("Unknown Event Type ");
      break;
    }
  }

  // Execute the default handler if a GPU core file should be generated ...
  const bool use_default_handler = amd::Os::DumpCoreFile() || !HIP_SKIP_ABORT_ON_GPU_ERROR;
  if (use_default_handler) {
    return HSA_STATUS_ERROR;
  }

  // ... otherwise just remember the error for the runtime
  gpu_error_ = reported_error;
  return HSA_STATUS_SUCCESS;
}
|
|
|
|
// ================================================================================================
|
|
void Device::RegisterBackendErrorCb() {
|
|
// Register ROCclr Error Callback
|
|
hsa_status_t hsa_error = HSA_STATUS_SUCCESS;
|
|
hsa_error = Hsa::register_system_event_handler(BackendErrorCallBackHandler, nullptr);
|
|
if (hsa_error != HSA_STATUS_SUCCESS) {
|
|
LogError("Cannot Register Call back event handler");
|
|
}
|
|
}
|
|
// ================================================================================================
|
|
// Returns the device's arena memory object for a pointer accepted by IsValidAllocation(),
// creating the arena object on first use. On success `offset` receives the distance of `ptr`
// from the arena's device virtual address. Returns nullptr (leaving `offset` untouched) if
// the pointer isn't a valid allocation or the arena object can't be created.
amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) {
  // Only create arena_mem_object if CPU memory is accessible from HMM
  // or if runtime received an interop from another ROCr's client
  // Disable arena for XNACK
  hsa_amd_pointer_info_t ptr_info = {};
  ptr_info.size = sizeof(hsa_amd_pointer_info_t);
  if (!IsValidAllocation(ptr, size, &ptr_info)) {
    return nullptr;
  }

  // Lazily create the arena object
  if (arena_mem_obj_ == nullptr) {
    arena_mem_obj_ = new (context()) amd::ArenaMemory(context());
    if (arena_mem_obj_ == nullptr) {
      return nullptr;
    }
    if (!arena_mem_obj_->create(nullptr)) {
      LogError("Arena Memory Creation failed!");
      arena_mem_obj_->release();
      arena_mem_obj_ = nullptr;
      return nullptr;
    }
  }

  // The offset is relative to the arena's virtual address on the first context device
  auto& first_device = *arena_mem_obj_->getContext().devices()[0];
  const void* arena_base =
      reinterpret_cast<void*>(arena_mem_obj_->getDeviceMemory(first_device)->virtualAddress());
  offset = reinterpret_cast<size_t>(ptr) - reinterpret_cast<size_t>(arena_base);

  return arena_mem_obj_;
}
|
|
|
|
// ================================================================================================
|
|
void Device::ReleaseGlobalSignal(void* signal) const {
|
|
if (signal != nullptr) {
|
|
reinterpret_cast<ProfilingSignal*>(signal)->release();
|
|
}
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Creates a HW signal for a user event, initializes it to kInitSignalValueOne and attaches it
// to the event as its HW event. Returns false if the signal can't be created.
bool Device::CreateUserEvent(amd::UserEvent* event) const {
  std::unique_ptr<ProfilingSignal> hw_signal(new ProfilingSignal());
  if (hw_signal == nullptr) {
    return false;
  }
  if (Hsa::signal_create(0, 0, nullptr, &hw_signal->signal_) != HSA_STATUS_SUCCESS) {
    return false;
  }
  Hsa::signal_silent_store_relaxed(hw_signal->signal_, kInitSignalValueOne);
  // From here on the event holds the signal
  event->SetHwEvent(hw_signal.release());
  return true;
}
|
|
|
|
// ================================================================================================
|
|
// Marks a user event as complete by storing 0 into its HW signal.
void Device::SetUserEvent(amd::UserEvent* event) const {
  auto* hw_signal = reinterpret_cast<ProfilingSignal*>(event->HwEvent());
  assert((hw_signal != nullptr) && "Can't have user event without hw event!");
  Hsa::signal_silent_store_relaxed(hw_signal->signal_, 0);
}
|
|
|
|
// ================================================================================================
|
|
bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
|
|
// Query ptr type to see if it's a HMM allocation
|
|
hsa_status_t status =
|
|
Hsa::pointer_info(const_cast<void*>(dev_ptr), ptr_info, nullptr, nullptr, nullptr);
|
|
// The call should never fail in ROCR, but just check for an error and continue
|
|
if (status != HSA_STATUS_SUCCESS) {
|
|
LogError("hsa_amd_pointer_info() failed");
|
|
}
|
|
|
|
if (ptr_info->type == HSA_EXT_POINTER_TYPE_RESERVED_ADDR) {
|
|
return false;
|
|
}
|
|
|
|
// Return false for pinned memory. A true return may result in a race because
|
|
// ROCclr may attempt to do a pin/copy/unpin underneath in a multithreaded environment
|
|
if (ptr_info->type == HSA_EXT_POINTER_TYPE_LOCKED) {
|
|
return false;
|
|
}
|
|
|
|
if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) {
|
|
if ((size != 0) && ((reinterpret_cast<const_address>(dev_ptr) -
|
|
reinterpret_cast<const_address>(ptr_info->agentBaseAddress)) > size)) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// ================================================================================================
|
|
void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
|
|
auto HeapAllocOnly = [this, &gpu]() -> bool {
|
|
// Allocate initial heap for device memory allocator
|
|
static constexpr size_t HeapBufferSize = 128 * Ki;
|
|
heap_buffer_ = createMemory(HeapBufferSize);
|
|
if (initial_heap_size_ != 0) {
|
|
initial_heap_size_ = amd::alignUp(initial_heap_size_, 2 * Mi);
|
|
initial_heap_buffer_ = createMemory(initial_heap_size_);
|
|
}
|
|
if (heap_buffer_ == nullptr) {
|
|
LogError("Heap buffer allocation failed!");
|
|
return false;
|
|
}
|
|
return true;
|
|
};
|
|
std::call_once(heap_allocated_, HeapAllocOnly);
|
|
}
|
|
|
|
// ================================================================================================
|
|
void Device::HiddenHeapInit(const VirtualGPU& gpu) {
|
|
auto HeapZeroOut = [this, &gpu]() -> bool {
|
|
static constexpr size_t HeapBufferSize = 128 * Ki;
|
|
bool result = static_cast<const KernelBlitManager&>(gpu.blitMgr())
|
|
.initHeap(heap_buffer_, initial_heap_buffer_, HeapBufferSize,
|
|
initial_heap_size_ / (2 * Mi));
|
|
|
|
return result;
|
|
};
|
|
std::call_once(heap_initialized_, HeapZeroOut);
|
|
}
|
|
|
|
// ================================================================================================
|
|
// Copies the device's maximum SDMA read and write masks into the output arguments.
void Device::getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const {
  readMask[0] = maxSdmaReadMask_;
  writeMask[0] = maxSdmaWriteMask_;
}
|
|
|
|
// ================================================================================================
|
|
// Registers a kernel in kernel_map_ under its kernel code handle, guarded by the vgpus lock.
void Device::AddKernel(Kernel& gpuKernel) const {
  amd::ScopedLock lock(vgpusAccess());
  const auto code_handle = gpuKernel.KernelCodeHandle();
  kernel_map_.insert({code_handle, gpuKernel});
}
|
|
|
|
// ================================================================================================
|
|
// Removes the kernel_map_ entry of a kernel, looked up by its kernel code handle.
// Kernels with a zero code handle are ignored.
void Device::RemoveKernel(Kernel& gpuKernel) const {
  if (gpuKernel.KernelCodeHandle() == 0) {
    return;
  }
  amd::ScopedLock lock(vgpusAccess());
  auto entry = kernel_map_.find(gpuKernel.KernelCodeHandle());
  if (entry == kernel_map_.end()) {
    return;
  }
  // Remove the old mapping
  kernel_map_.erase(entry);
}
|
|
|
|
// ================================================================================================
|
|
// Destroys the HSA signal, if one was created. A signal that is still busy (value > 0) is
// reported and waited for first, unless HIP_SKIP_ABORT_ON_GPU_ERROR is set and the GPU is in
// an error state.
ProfilingSignal::~ProfilingSignal() {
  if (signal_.handle == 0) {
    return;
  }

  const bool still_busy = Hsa::signal_load_relaxed(signal_) > 0;
  if (still_busy && !(HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError())) {
    LogError("Runtime shouldn't destroy a signal that is still busy!");
    // The value returned by the wait is intentionally not used
    (void)Hsa::signal_wait_scacquire(signal_, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
                                     kUnlimitedWait, HSA_WAIT_STATE_BLOCKED);
  }
  Hsa::signal_destroy(signal_);
}
|
|
|
|
// ================================================================================================
|
|
// Translates an HSA status code into the OpenCL error code used by the runtime.
//
// @param hsa_status  Status value reported by the HSA runtime.
// @return CL_SUCCESS for HSA_STATUS_SUCCESS, the matching CL error for the statuses listed
//         below, and CL_DEVICE_NOT_AVAILABLE for HSA_STATUS_ERROR and any unlisted status.
//
// The success status is handled explicitly: without it the default label would turn
// HSA_STATUS_SUCCESS into CL_DEVICE_NOT_AVAILABLE.
cl_int ConvertHSAErrorIntoCLError(hsa_status_t hsa_status) {
  switch (hsa_status) {
    case HSA_STATUS_SUCCESS:
      return CL_SUCCESS;

    case HSA_STATUS_ERROR_EXCEPTION:
    case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT:
      return CL_INVALID_OPERATION;

    case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS:
    case HSA_STATUS_ERROR_INVALID_ARGUMENT:
      return CL_INVALID_ARG_VALUE;

    case HSA_STATUS_ERROR_INVALID_ALLOCATION:
      return CL_MEM_OBJECT_ALLOCATION_FAILURE;

    case HSA_STATUS_ERROR_INVALID_CODE_OBJECT:
      return CL_INVALID_PROGRAM;

    case HSA_STATUS_ERROR_INVALID_ISA:
      return CL_INVALID_KERNEL;

    // The following status values need a cast to hsa_status_t to be usable as case labels.
    case static_cast<hsa_status_t>(HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION):
      return CL_BUILD_PROGRAM_FAILURE;

    case static_cast<hsa_status_t>(HSA_STATUS_ERROR_MEMORY_FAULT):
    case static_cast<hsa_status_t>(HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION):
      return CL_INVALID_MEM_OBJECT;

    case HSA_STATUS_ERROR:
    default:
      return CL_DEVICE_NOT_AVAILABLE;
  }
}
|
|
|
|
// ================================================================================================
|
|
// Queue error callback.
//
// @param status  HSA status reported for the queue; HSA_STATUS_SUCCESS and
//                HSA_STATUS_INFO_BREAK are ignored.
// @param queue   The HSA queue the status refers to.
// @param data    Opaque pointer that is interpreted as the owning roc::Device.
//
// For any other status the callback analyzes the AQL queue of every virtual GPU bound to
// `queue`, prints a diagnostic (including the available device memory for out-of-resources
// errors) and then aborts the process, unless HIP_SKIP_ABORT_ON_GPU_ERROR is set and no core
// dump is requested. In that case the converted CL error is stored in amd::Device::gpu_error_.
void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
  if (status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK) {
    return;
  }

  Device* dev = reinterpret_cast<Device*>(data);
  for (auto it : dev->vgpus()) {
    roc::VirtualGPU* vgpu = reinterpret_cast<roc::VirtualGPU*>(it);
    if (vgpu->gpu_queue() == queue) {
      vgpu->AnalyzeAqlQueue();
    }
  }

  // Abort on device exceptions.
  const char* errorMsg = nullptr;
  Hsa::status_string(status, &errorMsg);
  if (errorMsg == nullptr) {
    // The lookup left the pointer untouched; never hand a null pointer to a "%s" conversion.
    errorMsg = "Unknown HSA status";
  }

  if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
    // Report how much device memory is still available to help diagnose the failure.
    size_t global_available_mem = 0;
    if (HSA_STATUS_SUCCESS !=
        Hsa::agent_get_info(dev->getBackendDevice(),
                            static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL),
                            &global_available_mem)) {
      LogError("HSA_AMD_AGENT_INFO_MEMORY_AVAIL query failed.");
    }
    ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS,
            "Callback: Queue %p Aborting with error : %s Code: 0x%x Available Free mem : %zu MB",
            queue->base_address, errorMsg, status, global_available_mem / Mi);
  } else {
    ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS,
            "Callback: Queue %p aborting with error : %s code: 0x%x", queue->base_address,
            errorMsg, status);
  }

  if (amd::Os::DumpCoreFile() || !HIP_SKIP_ABORT_ON_GPU_ERROR) {
    abort();
  }
  // Abort was skipped: remember the failure so it can be reported through the CL error code.
  amd::Device::gpu_error_ = ConvertHSAErrorIntoCLError(status);
}
|
|
|
|
// ================================================================================================
|
|
#if defined(__clang__)
#if __has_feature(address_sanitizer)
// Only compiled for clang builds with AddressSanitizer enabled.
// Returns a newly allocated roc::UriLocator as a raw device::UriLocator pointer.
device::UriLocator* Device::createUriLocator() const {
  device::UriLocator* locator = new roc::UriLocator();
  return locator;
}
#endif  // __has_feature(address_sanitizer)
#endif  // defined(__clang__)
|
|
} // namespace amd::roc
|