Block ROCm 4.1+ running against 4.0 and prior kfd.

Sramecc is misreported in kfd 4.0 and prior.  To prevent possible
corruption due to d16 instructions, deny the use of gfx906 with older
kfds and correct the misreport for gfx908.  The denial of gfx906 may be
overridden by setting HSA_IGNORE_SRAMECC_MISREPORT=1.

Change-Id: I7d5c3a716fad01c348f8b88cd508cedbf914c989
Этот коммит содержится в:
Sean Keely
2021-03-10 13:59:31 -06:00
родитель 72fa4a17fa
Коммит 45fbe5b192
4 изменённых файлов: 50 добавлений и 7 удалений
+7
Просмотреть файл
@@ -341,6 +341,10 @@ class Runtime {
/// @brief Reports the value held in sys_clock_freq_.
/// @return The stored sys_clock_freq_ value.
uint64_t sys_clock_freq() const {
  return sys_clock_freq_;
}
/// @brief Stores the supplied KFD version information in kfd_version.
/// @param version Version information that is copied into the runtime.
void KfdVersion(const HsaVersionInfo& version) {
  kfd_version = version;
}
/// @brief Returns the stored KFD version information.
/// @return A copy of the kfd_version member.
HsaVersionInfo KfdVersion() const {
  return kfd_version;
}
protected:
static void AsyncEventsLoop(void*);
@@ -532,6 +536,9 @@ class Runtime {
// Pools KFD Events for InterruptSignal
InterruptSignal::EventPool EventPool;
// Kfd version
HsaVersionInfo kfd_version;
// Frees runtime memory when the runtime library is unloaded if safe to do so.
// Failure to release the runtime indicates an incorrect application but is
// common (example: calls library routines at process exit).
+35 -6
Просмотреть файл
@@ -87,6 +87,33 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
}
try {
gpu = new GpuAgent(node_id, node_prop);
const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion();
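// Check for sramecc incompatibility: kfd interface versions before 1.4 do not report sramecc
// correctly.
// NOTE(review): the condition below is false for major version 0 with minor version >= 4, even
// though such a version precedes 1.4 - confirm whether that combination can occur.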
if (gpu->isa()->IsSrameccSupported() && (kfd_version.KernelInterfaceMajorVersion <= 1 &&
kfd_version.KernelInterfaceMinorVersion < 4)) {
// gfx906 has both sramecc modes in use. Suppress the device.
if ((gpu->isa()->GetProcessorName() == "gfx906") &&
core::Runtime::runtime_singleton_->flag().check_sramecc_validity()) {
char name[64];
gpu->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, name);
name[63] = '\0';
fprintf(stderr,
"HSA Error: Incompatible kernel and userspace, %s disabled. Upgrade amdgpu.\n",
name);
delete gpu;
return nullptr;
}
// gfx908 always has sramecc set to on in vbios. Set mode bit to on and recreate the device.
if (gpu->isa()->GetProcessorName() == "gfx908") {
node_prop.Capability.ui32.SRAM_EDCSupport = 1;
delete gpu;
gpu = new GpuAgent(node_id, node_prop);
}
}
} catch (const hsa_exception& e) {
if(e.error_code() == HSA_STATUS_ERROR_INVALID_ISA) {
ifdebug {
@@ -201,22 +228,24 @@ static void SurfaceGpuList(std::vector<int32_t>& gpu_list) {
/// @brief Calls Kfd thunk to get the snapshot of the topology of the system,
/// which includes associations between, node, devices, memory and caches.
void BuildTopology() {
HsaVersionInfo info;
if (hsaKmtGetVersion(&info) != HSAKMT_STATUS_SUCCESS) {
HsaVersionInfo kfd_version;
if (hsaKmtGetVersion(&kfd_version) != HSAKMT_STATUS_SUCCESS) {
return;
}
if (info.KernelInterfaceMajorVersion == kKfdVersionMajor &&
info.KernelInterfaceMinorVersion < kKfdVersionMinor) {
if (kfd_version.KernelInterfaceMajorVersion == kKfdVersionMajor &&
kfd_version.KernelInterfaceMinorVersion < kKfdVersionMinor) {
return;
}
// Disable KFD event support when using open source KFD
if (info.KernelInterfaceMajorVersion == 1 &&
info.KernelInterfaceMinorVersion == 0) {
if (kfd_version.KernelInterfaceMajorVersion == 1 &&
kfd_version.KernelInterfaceMinorVersion == 0) {
core::g_use_interrupt_wait = false;
}
core::Runtime::runtime_singleton_->KfdVersion(kfd_version);
HsaSystemProperties props;
hsaKmtReleaseSystemProperties();
+2 -1
Просмотреть файл
@@ -1266,7 +1266,8 @@ Runtime::Runtime()
sys_clock_freq_(0),
vm_fault_event_(nullptr),
vm_fault_signal_(nullptr),
ref_count_(0) {}
ref_count_(0),
kfd_version{0} {}
hsa_status_t Runtime::Load() {
flag_.Refresh();
+6
Просмотреть файл
@@ -127,6 +127,9 @@ class Flag {
var = os::GetEnvVar("HSA_FORCE_SDMA_SIZE");
force_sdma_size_ = var.empty() ? 1024 * 1024 : atoi(var.c_str());
var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT");
check_sramecc_validity_ = (var == "1") ? false : true;
}
/// @brief Returns the stored check_flat_scratch_ setting.
bool check_flat_scratch() const {
  return check_flat_scratch_;
}
@@ -173,6 +176,8 @@ class Flag {
/// @brief Returns the size setting read from HSA_FORCE_SDMA_SIZE.
/// @return The parsed environment value, or 1024 * 1024 when the variable is empty.
size_t force_sdma_size() const {
  return force_sdma_size_;
}
/// @brief Reports whether the sramecc misreport check is enabled.
/// @return false when HSA_IGNORE_SRAMECC_MISREPORT is set to "1", true otherwise.
bool check_sramecc_validity() const {
  return check_sramecc_validity_;
}
private:
bool check_flat_scratch_;
bool enable_vm_fault_message_;
@@ -189,6 +194,7 @@ class Flag {
bool no_scratch_thread_limit_;
bool disable_image_;
bool loader_enable_mmap_uri_;
bool check_sramecc_validity_;
SDMA_OVERRIDE enable_sdma_;