diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h
index dd615d2422..634224c5ff 100644
--- a/runtime/hsa-runtime/core/inc/runtime.h
+++ b/runtime/hsa-runtime/core/inc/runtime.h
@@ -341,6 +341,12 @@ class Runtime {
 
   uint64_t sys_clock_freq() const { return sys_clock_freq_; }
 
+  // Stores the KFD interface version so it can be queried later.
+  void KfdVersion(const HsaVersionInfo& version) { kfd_version = version; }
+
+  // Returns a copy of the stored KFD interface version.
+  HsaVersionInfo KfdVersion() const { return kfd_version; }
+
  protected:
   static void AsyncEventsLoop(void*);
 
@@ -532,6 +538,9 @@ class Runtime {
   // Pools KFD Events for InterruptSignal
   InterruptSignal::EventPool EventPool;
 
+  // KFD interface version; zero-initialized until BuildTopology records the real one.
+  HsaVersionInfo kfd_version;
+
   // Frees runtime memory when the runtime library is unloaded if safe to do so.
   // Failure to release the runtime indicates an incorrect application but is
   // common (example: calls library routines at process exit).
diff --git a/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/runtime/hsa-runtime/core/runtime/amd_topology.cpp
index b1cf8e061c..1858384892 100644
--- a/runtime/hsa-runtime/core/runtime/amd_topology.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_topology.cpp
@@ -87,6 +87,36 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
   }
   try {
     gpu = new GpuAgent(node_id, node_prop);
+
+    const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion();
+
+    // Check for sramecc incompatibility due to sramecc not being reported correctly in kfd before
+    // 1.4. Compare (major, minor) lexicographically so that every 0.x interface also counts as
+    // older than 1.4, whatever its minor number is.
+    const bool kfd_before_1_4 = (kfd_version.KernelInterfaceMajorVersion < 1) ||
+        (kfd_version.KernelInterfaceMajorVersion == 1 &&
+         kfd_version.KernelInterfaceMinorVersion < 4);
+    if (gpu->isa()->IsSrameccSupported() && kfd_before_1_4) {
+      // gfx906 has both sramecc modes in use. Suppress the device.
+      if ((gpu->isa()->GetProcessorName() == "gfx906") &&
+          core::Runtime::runtime_singleton_->flag().check_sramecc_validity()) {
+        char name[64] = {0};  // Zero-filled: GetInfo's status is not checked below.
+        gpu->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, name);
+        name[63] = '\0';
+        fprintf(stderr,
+                "HSA Error: Incompatible kernel and userspace, %s disabled. Upgrade amdgpu.\n",
+                name);
+        delete gpu;
+        return nullptr;
+      }
+
+      // gfx908 always has sramecc set to on in vbios. Set mode bit to on and recreate the device.
+      if (gpu->isa()->GetProcessorName() == "gfx908") {
+        node_prop.Capability.ui32.SRAM_EDCSupport = 1;
+        delete gpu;
+        gpu = new GpuAgent(node_id, node_prop);
+      }
+    }
   } catch (const hsa_exception& e) {
     if(e.error_code() == HSA_STATUS_ERROR_INVALID_ISA) {
       ifdebug {
@@ -201,22 +231,24 @@ static void SurfaceGpuList(std::vector& gpu_list) {
 /// @brief Calls Kfd thunk to get the snapshot of the topology of the system,
 /// which includes associations between, node, devices, memory and caches.
 void BuildTopology() {
-  HsaVersionInfo info;
-  if (hsaKmtGetVersion(&info) != HSAKMT_STATUS_SUCCESS) {
+  HsaVersionInfo kfd_version;
+  if (hsaKmtGetVersion(&kfd_version) != HSAKMT_STATUS_SUCCESS) {
     return;
   }
 
-  if (info.KernelInterfaceMajorVersion == kKfdVersionMajor &&
-      info.KernelInterfaceMinorVersion < kKfdVersionMinor) {
+  if (kfd_version.KernelInterfaceMajorVersion == kKfdVersionMajor &&
+      kfd_version.KernelInterfaceMinorVersion < kKfdVersionMinor) {
     return;
   }
 
   // Disable KFD event support when using open source KFD
-  if (info.KernelInterfaceMajorVersion == 1 &&
-      info.KernelInterfaceMinorVersion == 0) {
+  if (kfd_version.KernelInterfaceMajorVersion == 1 &&
+      kfd_version.KernelInterfaceMinorVersion == 0) {
     core::g_use_interrupt_wait = false;
   }
 
+  core::Runtime::runtime_singleton_->KfdVersion(kfd_version);
+
   HsaSystemProperties props;
   hsaKmtReleaseSystemProperties();
 
diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp
index 2e481b7da9..c3cce57a1a 100644
--- a/runtime/hsa-runtime/core/runtime/runtime.cpp
+++ b/runtime/hsa-runtime/core/runtime/runtime.cpp
@@ -1266,7 +1266,8 @@ Runtime::Runtime()
       sys_clock_freq_(0),
       vm_fault_event_(nullptr),
       vm_fault_signal_(nullptr),
-      ref_count_(0) {}
+      ref_count_(0),
+      kfd_version{0} {}
 
 hsa_status_t Runtime::Load() {
   flag_.Refresh();
diff --git a/runtime/hsa-runtime/core/util/flag.h b/runtime/hsa-runtime/core/util/flag.h
index f789527306..5013dd1a8f 100644
--- a/runtime/hsa-runtime/core/util/flag.h
+++ b/runtime/hsa-runtime/core/util/flag.h
@@ -127,6 +127,10 @@ class Flag {
 
     var = os::GetEnvVar("HSA_FORCE_SDMA_SIZE");
     force_sdma_size_ = var.empty() ? 1024 * 1024 : atoi(var.c_str());
+
+    // HSA_IGNORE_SRAMECC_MISREPORT=1 turns the sramecc validity check off; any other value keeps it.
+    var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT");
+    check_sramecc_validity_ = (var != "1");
   }
 
   bool check_flat_scratch() const { return check_flat_scratch_; }
@@ -173,6 +177,9 @@ class Flag {
 
   size_t force_sdma_size() const { return force_sdma_size_; }
 
+  // False only when HSA_IGNORE_SRAMECC_MISREPORT is set to "1".
+  bool check_sramecc_validity() const { return check_sramecc_validity_; }
+
  private:
   bool check_flat_scratch_;
   bool enable_vm_fault_message_;
@@ -189,6 +196,7 @@ class Flag {
   bool no_scratch_thread_limit_;
   bool disable_image_;
   bool loader_enable_mmap_uri_;
+  bool check_sramecc_validity_;
 
   SDMA_OVERRIDE enable_sdma_;
 