From 45fbe5b1920eee1afdd9f7f43bef470aed5763f6 Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Wed, 10 Mar 2021 13:59:31 -0600 Subject: [PATCH] Block ROCm 4.1+ running against 4.0 and prior kfd. Sramecc is misreported in kfd 4.0 and prior. To prevent possible corruption due to d16 instructions, deny use of gfx906 with older kfds and correct misreport for gfx908. Denial of gfx906 may be overridden by setting HSA_IGNORE_SRAMECC_MISREPORT=1. Change-Id: I7d5c3a716fad01c348f8b88cd508cedbf914c989 --- runtime/hsa-runtime/core/inc/runtime.h | 7 ++++ .../hsa-runtime/core/runtime/amd_topology.cpp | 41 ++++++++++++++++--- runtime/hsa-runtime/core/runtime/runtime.cpp | 3 +- runtime/hsa-runtime/core/util/flag.h | 6 +++ 4 files changed, 50 insertions(+), 7 deletions(-) diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index dd615d2422..634224c5ff 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -341,6 +341,10 @@ class Runtime { uint64_t sys_clock_freq() const { return sys_clock_freq_; } + void KfdVersion(const HsaVersionInfo& version) { kfd_version = version; } + + HsaVersionInfo KfdVersion() const { return kfd_version; } + protected: static void AsyncEventsLoop(void*); @@ -532,6 +536,9 @@ class Runtime { // Pools KFD Events for InterruptSignal InterruptSignal::EventPool EventPool; + // Kfd version + HsaVersionInfo kfd_version; + // Frees runtime memory when the runtime library is unloaded if safe to do so. // Failure to release the runtime indicates an incorrect application but is // common (example: calls library routines at process exit). diff --git a/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/runtime/hsa-runtime/core/runtime/amd_topology.cpp index b1cf8e061c..1858384892 100644 --- a/runtime/hsa-runtime/core/runtime/amd_topology.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_topology.cpp @@ -87,6 +87,33 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { } try { gpu = new GpuAgent(node_id, node_prop); + + const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion(); + + // Check for sramecc incompatibility due to sramecc not being reported correctly in kfd before + // 1.4. + if (gpu->isa()->IsSrameccSupported() && (kfd_version.KernelInterfaceMajorVersion <= 1 && + kfd_version.KernelInterfaceMinorVersion < 4)) { + // gfx906 has both sramecc modes in use. Suppress the device. + if ((gpu->isa()->GetProcessorName() == "gfx906") && + core::Runtime::runtime_singleton_->flag().check_sramecc_validity()) { + char name[64]; + gpu->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, name); + name[63] = '\0'; + fprintf(stderr, + "HSA Error: Incompatible kernel and userspace, %s disabled. Upgrade amdgpu.\n", + name); + delete gpu; + return nullptr; + } + + // gfx908 always has sramecc set to on in vbios. Set mode bit to on and recreate the device. + if (gpu->isa()->GetProcessorName() == "gfx908") { + node_prop.Capability.ui32.SRAM_EDCSupport = 1; + delete gpu; + gpu = new GpuAgent(node_id, node_prop); + } + } } catch (const hsa_exception& e) { if(e.error_code() == HSA_STATUS_ERROR_INVALID_ISA) { ifdebug { @@ -201,22 +228,24 @@ static void SurfaceGpuList(std::vector& gpu_list) { /// @brief Calls Kfd thunk to get the snapshot of the topology of the system, /// which includes associations between, node, devices, memory and caches. void BuildTopology() { - HsaVersionInfo info; - if (hsaKmtGetVersion(&info) != HSAKMT_STATUS_SUCCESS) { + HsaVersionInfo kfd_version; + if (hsaKmtGetVersion(&kfd_version) != HSAKMT_STATUS_SUCCESS) { return; } - if (info.KernelInterfaceMajorVersion == kKfdVersionMajor && - info.KernelInterfaceMinorVersion < kKfdVersionMinor) { + if (kfd_version.KernelInterfaceMajorVersion == kKfdVersionMajor && + kfd_version.KernelInterfaceMinorVersion < kKfdVersionMinor) { return; } // Disable KFD event support when using open source KFD - if (info.KernelInterfaceMajorVersion == 1 && - info.KernelInterfaceMinorVersion == 0) { + if (kfd_version.KernelInterfaceMajorVersion == 1 && + kfd_version.KernelInterfaceMinorVersion == 0) { core::g_use_interrupt_wait = false; } + core::Runtime::runtime_singleton_->KfdVersion(kfd_version); + HsaSystemProperties props; hsaKmtReleaseSystemProperties(); diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index 2e481b7da9..c3cce57a1a 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -1266,7 +1266,8 @@ Runtime::Runtime() sys_clock_freq_(0), vm_fault_event_(nullptr), vm_fault_signal_(nullptr), - ref_count_(0) {} + ref_count_(0), + kfd_version{0} {} hsa_status_t Runtime::Load() { flag_.Refresh(); diff --git a/runtime/hsa-runtime/core/util/flag.h b/runtime/hsa-runtime/core/util/flag.h index f789527306..5013dd1a8f 100644 --- a/runtime/hsa-runtime/core/util/flag.h +++ b/runtime/hsa-runtime/core/util/flag.h @@ -127,6 +127,9 @@ class Flag { var = os::GetEnvVar("HSA_FORCE_SDMA_SIZE"); force_sdma_size_ = var.empty() ? 1024 * 1024 : atoi(var.c_str()); + + var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT"); + check_sramecc_validity_ = (var == "1") ? false : true; } bool check_flat_scratch() const { return check_flat_scratch_; } @@ -173,6 +176,8 @@ class Flag { size_t force_sdma_size() const { return force_sdma_size_; } + bool check_sramecc_validity() const { return check_sramecc_validity_; } + private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -189,6 +194,7 @@ class Flag { bool no_scratch_thread_limit_; bool disable_image_; bool loader_enable_mmap_uri_; + bool check_sramecc_validity_; SDMA_OVERRIDE enable_sdma_;