Block ROCm 4.1+ from running against KFD 4.0 and earlier.
Sramecc is misreported by KFD 4.0 and earlier. To prevent possible corruption due to d16 instructions, deny use of gfx906 with older KFDs and correct the misreported value for gfx908. The denial of gfx906 may be overridden by setting HSA_IGNORE_SRAMECC_MISREPORT=1. Change-Id: I7d5c3a716fad01c348f8b88cd508cedbf914c989
Этот коммит содержится в:
@@ -341,6 +341,10 @@ class Runtime {
|
||||
|
||||
uint64_t sys_clock_freq() const { return sys_clock_freq_; }
|
||||
|
||||
void KfdVersion(const HsaVersionInfo& version) { kfd_version = version; }
|
||||
|
||||
/// @brief Retrieves the KFD interface version previously stored in kfd_version.
/// @return A copy of the stored version information.
HsaVersionInfo KfdVersion() const {
  return this->kfd_version;
}
|
||||
|
||||
protected:
|
||||
static void AsyncEventsLoop(void*);
|
||||
|
||||
@@ -532,6 +536,9 @@ class Runtime {
|
||||
// Pools KFD Events for InterruptSignal
|
||||
InterruptSignal::EventPool EventPool;
|
||||
|
||||
// Kfd version
|
||||
HsaVersionInfo kfd_version;
|
||||
|
||||
// Frees runtime memory when the runtime library is unloaded if safe to do so.
|
||||
// Failure to release the runtime indicates an incorrect application but is
|
||||
// common (example: calls library routines at process exit).
|
||||
|
||||
@@ -87,6 +87,33 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
|
||||
}
|
||||
try {
|
||||
gpu = new GpuAgent(node_id, node_prop);
|
||||
|
||||
const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion();
|
||||
|
||||
// Check for sramecc incompatibility due to sramecc not being reported correctly in kfd before
|
||||
// 1.4.
|
||||
if (gpu->isa()->IsSrameccSupported() && (kfd_version.KernelInterfaceMajorVersion <= 1 &&
|
||||
kfd_version.KernelInterfaceMinorVersion < 4)) {
|
||||
// gfx906 has both sramecc modes in use. Suppress the device.
|
||||
if ((gpu->isa()->GetProcessorName() == "gfx906") &&
|
||||
core::Runtime::runtime_singleton_->flag().check_sramecc_validity()) {
|
||||
char name[64];
|
||||
gpu->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, name);
|
||||
name[63] = '\0';
|
||||
fprintf(stderr,
|
||||
"HSA Error: Incompatible kernel and userspace, %s disabled. Upgrade amdgpu.\n",
|
||||
name);
|
||||
delete gpu;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// gfx908 always has sramecc set to on in vbios. Set mode bit to on and recreate the device.
|
||||
if (gpu->isa()->GetProcessorName() == "gfx908") {
|
||||
node_prop.Capability.ui32.SRAM_EDCSupport = 1;
|
||||
delete gpu;
|
||||
gpu = new GpuAgent(node_id, node_prop);
|
||||
}
|
||||
}
|
||||
} catch (const hsa_exception& e) {
|
||||
if(e.error_code() == HSA_STATUS_ERROR_INVALID_ISA) {
|
||||
ifdebug {
|
||||
@@ -201,22 +228,24 @@ static void SurfaceGpuList(std::vector<int32_t>& gpu_list) {
|
||||
/// @brief Calls Kfd thunk to get the snapshot of the topology of the system,
|
||||
/// which includes associations between, node, devices, memory and caches.
|
||||
void BuildTopology() {
|
||||
HsaVersionInfo info;
|
||||
if (hsaKmtGetVersion(&info) != HSAKMT_STATUS_SUCCESS) {
|
||||
HsaVersionInfo kfd_version;
|
||||
if (hsaKmtGetVersion(&kfd_version) != HSAKMT_STATUS_SUCCESS) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (info.KernelInterfaceMajorVersion == kKfdVersionMajor &&
|
||||
info.KernelInterfaceMinorVersion < kKfdVersionMinor) {
|
||||
if (kfd_version.KernelInterfaceMajorVersion == kKfdVersionMajor &&
|
||||
kfd_version.KernelInterfaceMinorVersion < kKfdVersionMinor) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Disable KFD event support when using open source KFD
|
||||
if (info.KernelInterfaceMajorVersion == 1 &&
|
||||
info.KernelInterfaceMinorVersion == 0) {
|
||||
if (kfd_version.KernelInterfaceMajorVersion == 1 &&
|
||||
kfd_version.KernelInterfaceMinorVersion == 0) {
|
||||
core::g_use_interrupt_wait = false;
|
||||
}
|
||||
|
||||
core::Runtime::runtime_singleton_->KfdVersion(kfd_version);
|
||||
|
||||
HsaSystemProperties props;
|
||||
hsaKmtReleaseSystemProperties();
|
||||
|
||||
|
||||
@@ -1266,7 +1266,8 @@ Runtime::Runtime()
|
||||
sys_clock_freq_(0),
|
||||
vm_fault_event_(nullptr),
|
||||
vm_fault_signal_(nullptr),
|
||||
ref_count_(0) {}
|
||||
ref_count_(0),
|
||||
kfd_version{0} {}
|
||||
|
||||
hsa_status_t Runtime::Load() {
|
||||
flag_.Refresh();
|
||||
|
||||
@@ -127,6 +127,9 @@ class Flag {
|
||||
|
||||
var = os::GetEnvVar("HSA_FORCE_SDMA_SIZE");
|
||||
force_sdma_size_ = var.empty() ? 1024 * 1024 : atoi(var.c_str());
|
||||
|
||||
var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT");
|
||||
check_sramecc_validity_ = (var == "1") ? false : true;
|
||||
}
|
||||
|
||||
bool check_flat_scratch() const { return check_flat_scratch_; }
|
||||
@@ -173,6 +176,8 @@ class Flag {
|
||||
|
||||
size_t force_sdma_size() const { return force_sdma_size_; }
|
||||
|
||||
bool check_sramecc_validity() const { return check_sramecc_validity_; }
|
||||
|
||||
private:
|
||||
bool check_flat_scratch_;
|
||||
bool enable_vm_fault_message_;
|
||||
@@ -189,6 +194,7 @@ class Flag {
|
||||
bool no_scratch_thread_limit_;
|
||||
bool disable_image_;
|
||||
bool loader_enable_mmap_uri_;
|
||||
bool check_sramecc_validity_;
|
||||
|
||||
SDMA_OVERRIDE enable_sdma_;
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user