From dd42ca6dbe3f504faa5447a66e8f4196814faf28 Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Thu, 1 Apr 2021 20:51:51 -0500 Subject: [PATCH] Squash merge of cfreehil/amd-temp-gfx90a onto amd-staging. Includes some workarounds and HMM. Conflicts: opensrc/hsa-runtime/core/runtime/amd_topology.cpp opensrc/hsa-runtime/core/util/flag.h Change-Id: I22976f07964a43dbb228a6231777dbd599112b8d [ROCm/ROCR-Runtime commit: 7333c77e2287367a7cd979b13636222a39030fbf] --- .../suites/negative/queue_validation.cc | 93 +-- .../rocrtst/suites/test_common/CMakeLists.txt | 2 +- .../rocrtst/suites/test_common/main.cc | 11 +- .../core/common/hsa_table_interface.cpp | 21 + .../hsa-runtime/core/inc/amd_gpu_agent.h | 3 +- .../hsa-runtime/core/inc/amd_gpu_shaders.h | 26 + .../hsa-runtime/core/inc/amd_memory_region.h | 2 +- .../hsa-runtime/core/inc/hsa_ext_amd_impl.h | 15 + .../hsa-runtime/core/inc/memory_region.h | 7 +- .../runtime/hsa-runtime/core/inc/runtime.h | 38 ++ .../core/runtime/amd_cpu_agent.cpp | 43 +- .../core/runtime/amd_gpu_agent.cpp | 38 +- .../core/runtime/amd_memory_region.cpp | 20 +- .../hsa-runtime/core/runtime/amd_topology.cpp | 65 ++- .../core/runtime/hsa_api_trace.cpp | 3 + .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 32 ++ .../runtime/hsa-runtime/core/runtime/isa.cpp | 9 + .../hsa-runtime/core/runtime/runtime.cpp | 536 +++++++++++++++++- .../runtime/hsa-runtime/core/util/flag.h | 40 ++ .../runtime/hsa-runtime/hsacore.so.def | 3 + .../runtime/hsa-runtime/image/blit_kernel.cpp | 3 + .../hsa-runtime/image/blit_src/CMakeLists.txt | 2 +- .../runtime/hsa-runtime/image/device_info.cpp | 18 +- .../runtime/hsa-runtime/inc/amd_hsa_elf.h | 5 +- .../runtime/hsa-runtime/inc/hsa.h | 16 +- .../runtime/hsa-runtime/inc/hsa_api_trace.h | 3 + .../runtime/hsa-runtime/inc/hsa_ext_amd.h | 156 +++++ .../libamdhsacode/amd_hsa_code.cpp | 1 + .../runtime/hsa-runtime/loader/loaders.cpp | 43 +- .../runtime/hsa-runtime/loader/loaders.hpp | 2 +- 30 files changed, 1102 insertions(+), 154 deletions(-) 
diff --git a/projects/rocr-runtime/rocrtst/suites/negative/queue_validation.cc b/projects/rocr-runtime/rocrtst/suites/negative/queue_validation.cc index 7c5a93e053..19fb5914a4 100755 --- a/projects/rocr-runtime/rocrtst/suites/negative/queue_validation.cc +++ b/projects/rocr-runtime/rocrtst/suites/negative/queue_validation.cc @@ -186,19 +186,19 @@ void QueueValidation::QueueValidationForInvalidDimension(hsa_agent_t cpuAgent, hsa_queue_t *queue[kMaxQueue]; // command queue uint32_t ii; + test_validation_data user_data[kMaxQueue]; for (ii = 0; ii < kMaxQueue; ++ii) { - test_validation_data user_data; // set callback flag to false if callback called then it will change to true - user_data.cb_triggered = false; + user_data[ii].cb_triggered = false; // set the queue pointer - user_data.queue_pointer = &queue[ii]; + user_data[ii].queue_pointer = &queue[ii]; // set the expected status in queue error calback handling - user_data.expected_status = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; + user_data[ii].expected_status = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; // create queue err = hsa_queue_create(gpuAgent, - queue_max, HSA_QUEUE_TYPE_SINGLE, - CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]); + queue_max, HSA_QUEUE_TYPE_SINGLE, + CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]); ASSERT_EQ(err, HSA_STATUS_SUCCESS); @@ -240,12 +240,12 @@ void QueueValidation::QueueValidationForInvalidDimension(hsa_agent_t cpuAgent, // completion signal should not be changed. 
ASSERT_EQ(completion, 1); - // queue error handling callback should be triggered - ASSERT_EQ(user_data.cb_triggered, true); - hsa_signal_store_relaxed(aql().completion_signal, 1); } + sleep(1); for (ii = 0; ii < kMaxQueue; ++ii) { + // queue error handling callback should be triggered + ASSERT_EQ(user_data[ii].cb_triggered, true); if (queue[ii]) { hsa_queue_destroy(queue[ii]); } } } @@ -269,20 +269,21 @@ void QueueValidation::QueueValidationInvalidGroupMemory(hsa_agent_t cpuAgent, queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent; hsa_queue_t *queue[kMaxQueue]; // command queue + test_validation_data user_data[kMaxQueue]; + uint32_t ii; for (ii = 0; ii < kMaxQueue; ++ii) { - test_validation_data user_data; // set callback flag to false if callback called then it will change to true - user_data.cb_triggered = false; + user_data[ii].cb_triggered = false; // set the queue pointer - user_data.queue_pointer = &queue[ii]; + user_data[ii].queue_pointer = &queue[ii]; // set the expected status in queue error calback handling - user_data.expected_status = HSA_STATUS_ERROR_INVALID_ALLOCATION; + user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_ALLOCATION; // create queue err = hsa_queue_create(gpuAgent, - queue_max, HSA_QUEUE_TYPE_SINGLE, - CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]); + queue_max, HSA_QUEUE_TYPE_SINGLE, + CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]); ASSERT_EQ(err, HSA_STATUS_SUCCESS); @@ -325,12 +326,12 @@ void QueueValidation::QueueValidationInvalidGroupMemory(hsa_agent_t cpuAgent, // completion signal should not be changed. 
ASSERT_EQ(completion, 1); - // queue error handling callback should be triggered - ASSERT_EQ(user_data.cb_triggered, true); - hsa_signal_store_relaxed(aql().completion_signal, 1); } + sleep(1); for (ii = 0; ii < kMaxQueue; ++ii) { + // queue error handling callback should be triggered + ASSERT_EQ(user_data[ii].cb_triggered, true); if (queue[ii]) { hsa_queue_destroy(queue[ii]); } } } @@ -353,20 +354,20 @@ void QueueValidation::QueueValidationForInvalidKernelObject(hsa_agent_t cpuAgent queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent; hsa_queue_t *queue[kMaxQueue]; // command queue + test_validation_data user_data[kMaxQueue]; uint32_t ii; for (ii = 0; ii < kMaxQueue; ++ii) { - test_validation_data user_data; // set callback flag to false if callback called then it will change to true - user_data.cb_triggered = false; + user_data[ii].cb_triggered = false; // set the queue pointer - user_data.queue_pointer = &queue[ii]; + user_data[ii].queue_pointer = &queue[ii]; // set the expected status in queue error calback handling - user_data.expected_status = HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_CODE_OBJECT; // create queue err = hsa_queue_create(gpuAgent, kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE, - CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]); + CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]); ASSERT_EQ(err, HSA_STATUS_SUCCESS); @@ -408,12 +409,12 @@ void QueueValidation::QueueValidationForInvalidKernelObject(hsa_agent_t cpuAgent // completion signal should not be changed. 
ASSERT_EQ(completion, 1); - // queue error handling callback should be triggered - ASSERT_EQ(user_data.cb_triggered, true); - hsa_signal_store_relaxed(aql().completion_signal, 1); } + sleep(1); for (ii = 0; ii < kMaxQueue; ++ii) { + // queue error handling callback should be triggered + ASSERT_EQ(user_data[ii].cb_triggered, true); if (queue[ii]) { hsa_queue_destroy(queue[ii]); } } } @@ -437,19 +438,19 @@ void QueueValidation::QueueValidationForInvalidPacket(hsa_agent_t cpuAgent, hsa_queue_t *queue[kMaxQueue]; // command queue uint32_t ii; + test_validation_data user_data[kMaxQueue]; for (ii = 0; ii < kMaxQueue; ++ii) { - test_validation_data user_data; // set callback flag to false if callback called then it will change to true - user_data.cb_triggered = false; + user_data[ii].cb_triggered = false; // set the queue pointer - user_data.queue_pointer = &queue[ii]; + user_data[ii].queue_pointer = &queue[ii]; // set the expected status in queue error calback handling - user_data.expected_status = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT; + user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT; // create queue err = hsa_queue_create(gpuAgent, - queue_max, HSA_QUEUE_TYPE_SINGLE, - CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]); + queue_max, HSA_QUEUE_TYPE_SINGLE, + CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]); ASSERT_EQ(err, HSA_STATUS_SUCCESS); @@ -486,12 +487,12 @@ void QueueValidation::QueueValidationForInvalidPacket(hsa_agent_t cpuAgent, // completion signal should not be changed. 
ASSERT_EQ(completion, 1); - // queue error handling callback should be triggered - ASSERT_EQ(user_data.cb_triggered, true); - hsa_signal_store_relaxed(aql().completion_signal, 1); } + sleep(1); for (ii = 0; ii < kMaxQueue; ++ii) { + // queue error handling callback should be triggered + ASSERT_EQ(user_data[ii].cb_triggered, true); if (queue[ii]) { hsa_queue_destroy(queue[ii]); } } } @@ -514,22 +515,22 @@ void QueueValidation::QueueValidationForInvalidWorkGroupSize(hsa_agent_t cpuAgen queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent; hsa_queue_t *queue[kMaxQueue]; // command queue + test_validation_data user_data[kMaxQueue][3]; uint32_t ii; for (ii = 0; ii < kMaxQueue; ++ii) { uint32_t jj; for (jj = 1; jj <= 3; ++jj) { - test_validation_data user_data; // set callback flag to false if callback called then it will change to true - user_data.cb_triggered = false; + user_data[ii][jj - 1].cb_triggered = false; // set the queue pointer - user_data.queue_pointer = &queue[ii]; + user_data[ii][jj - 1].queue_pointer = &queue[ii]; // set the expected status in queue error calback handling - user_data.expected_status = HSA_STATUS_ERROR_INVALID_ARGUMENT; + user_data[ii][jj - 1].expected_status = HSA_STATUS_ERROR_INVALID_ARGUMENT; // create queue err = hsa_queue_create(gpuAgent, - kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE, - CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]); + kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE, + CallbackQueueErrorHandling, &user_data[ii][jj - 1], 0, 0, &queue[ii]); ASSERT_EQ(err, HSA_STATUS_SUCCESS); @@ -574,13 +575,17 @@ void QueueValidation::QueueValidationForInvalidWorkGroupSize(hsa_agent_t cpuAgen // completion signal should not be changed. 
ASSERT_EQ(completion, 1); - // queue error handling callback should be triggered - ASSERT_EQ(user_data.cb_triggered, true); - hsa_signal_store_relaxed(aql().completion_signal, 1); if (queue[ii]) { hsa_queue_destroy(queue[ii]); } } } + sleep(1); + for (uint32_t ii = 0; ii < kMaxQueue; ++ii) { + for (uint32_t jj = 0; jj < 3; ++jj) { + // queue error handling callback should be triggered + ASSERT_EQ(user_data[ii][jj].cb_triggered, true); + } + } } diff --git a/projects/rocr-runtime/rocrtst/suites/test_common/CMakeLists.txt b/projects/rocr-runtime/rocrtst/suites/test_common/CMakeLists.txt index 2f70ae5b69..e468bebba5 100755 --- a/projects/rocr-runtime/rocrtst/suites/test_common/CMakeLists.txt +++ b/projects/rocr-runtime/rocrtst/suites/test_common/CMakeLists.txt @@ -54,7 +54,7 @@ set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCR-R set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/RadeonOpenCompute/ROCR-Runtime" ) -set(DEFAULT_TARGETS "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033") +set(DEFAULT_TARGETS "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033") # # Currently support for Windows platform is not present diff --git a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc index 6a813bd95e..9ba677a5d7 100755 --- a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc +++ b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc @@ -128,11 +128,6 @@ TEST(rocrtst, Test_Example) { RunGenericTest(&tst); } -TEST(rocrtstFunc, IPC) { - IPCTest ipc; - RunGenericTest(&ipc); -} - TEST(rocrtstFunc, MemoryAccessTests) { MemoryAccessTest mt; RunCustomTestProlog(&mt); @@ -175,7 +170,6 @@ TEST(rocrtstFunc, Concurrent_Shutdown) { RunCustomTestEpilog(&cs); } - TEST(rocrtstFunc, 
Reference_Count) { ReferenceCountTest rc(true, false); RunCustomTestProlog(&rc); @@ -212,6 +206,11 @@ TEST(rocrtstFunc, Signal_Create_Concurrently) { } #ifndef ROCRTST_EMULATOR_BUILD +TEST(rocrtstFunc, IPC) { + IPCTest ipc; + RunGenericTest(&ipc); +} + TEST(rocrtstFunc, DISABLED_Signal_Kernel_Set) { SignalKernelTest sk(SET); RunCustomTestProlog(&sk); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp index e48f24c54f..0ff7b5f8b9 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp @@ -1151,6 +1151,27 @@ hsa_status_t HSA_API hsa_amd_signal_value_pointer(hsa_signal_t signal, return amdExtTable->hsa_amd_signal_value_pointer_fn(signal, value_ptr); } +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + return amdExtTable->hsa_amd_svm_attributes_set_fn(ptr, size, attribute_list, attribute_count); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + return amdExtTable->hsa_amd_svm_attributes_get_fn(ptr, size, attribute_list, attribute_count); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + return amdExtTable->hsa_amd_svm_prefetch_async_fn(ptr, size, agent, num_dep_signals, dep_signals, completion_signal); +} + // Tools only table interfaces. 
namespace rocr { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index c00005ff0f..df1f4f2b95 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -173,7 +173,8 @@ class GpuAgent : public GpuAgentInt { // @param [in] node Node id. Each CPU in different socket will get distinct // id. // @param [in] node_props Node property. - GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props); + // @param [in] xnack_mode XNACK mode of device. + GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode); // @brief GPU agent destructor. ~GpuAgent(); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_shaders.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_shaders.h index 68cf52d74e..0cf527baf0 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_shaders.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_shaders.h @@ -439,6 +439,32 @@ static const unsigned int kCodeTrapHandler9[] = { 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, }; +static const unsigned int kCodeTrapHandler90a[] = { + 0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73, + 0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900, + 0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, + 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff, + 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, + 0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f, + 0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f, + 0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f, + 0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, + 0x87776d77, 
0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, + 0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, + 0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4, + 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e, + 0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, + 0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff, + 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, + 0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f, + 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, + 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, + 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, + 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, + 0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, +}; + static const unsigned int kCodeCopyAligned8[] = { 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020, 0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050, diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h index e119a939eb..cce01e6587 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -95,7 +95,7 @@ class MemoryRegion : public core::MemoryRegion { /// @brief Unpin memory. 
static void MakeKfdMemoryUnresident(const void* ptr); - MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner, + MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, const HsaMemoryProperties& mem_props); ~MemoryRegion(); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h index 510e369601..9954b8fc7a 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h @@ -246,6 +246,21 @@ hsa_status_t hsa_amd_deregister_deallocation_callback( hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, volatile hsa_signal_value_t** value_ptr); +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + } // namespace amd } // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/memory_region.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/memory_region.h index 583b13a124..4f362fad2d 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/memory_region.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/memory_region.h @@ -57,8 +57,8 @@ class Agent; class MemoryRegion : public Checked<0x9C961F19EE175BB3> { public: - MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner) - : fine_grain_(fine_grain), full_profile_(full_profile), owner_(owner) { + 
MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner) + : fine_grain_(fine_grain), kernarg_(kernarg), full_profile_(full_profile), owner_(owner) { assert(owner_ != NULL); } @@ -112,12 +112,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { __forceinline bool fine_grain() const { return fine_grain_; } + __forceinline bool kernarg() const { return kernarg_; } + __forceinline bool full_profile() const { return full_profile_; } __forceinline core::Agent* owner() const { return owner_; } private: const bool fine_grain_; + const bool kernarg_; const bool full_profile_; core::Agent* owner_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h index 634224c5ff..d232e6f955 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h @@ -292,6 +292,15 @@ class Runtime { hsa_status_t IPCDetach(void* ptr); + hsa_status_t SetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + + hsa_status_t GetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + + hsa_status_t SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, hsa_signal_t completion_signal); + const std::vector& cpu_agents() { return cpu_agents_; } const std::vector& gpu_agents() { return gpu_agents_; } @@ -395,6 +404,28 @@ class Runtime { std::vector arg_; }; + struct PrefetchRange; + typedef std::map prefetch_map_t; + + struct PrefetchOp { + void* base; + size_t size; + uint32_t node_id; + int remaining_deps; + hsa_signal_t completion; + std::vector dep_signals; + prefetch_map_t::iterator prefetch_map_entry; + }; + + struct PrefetchRange { + PrefetchRange() {} + PrefetchRange(size_t Bytes, PrefetchOp* Op) : bytes(Bytes), op(Op) {} + size_t bytes; + 
PrefetchOp* op; + prefetch_map_t::iterator prev; + prefetch_map_t::iterator next; + }; + // Will be created before any user could call hsa_init but also could be // destroyed before incorrectly written programs call hsa_shutdown. static KernelMutex bootstrap_lock_; @@ -444,6 +475,9 @@ class Runtime { /// @retval Index in ::link_matrix_. uint32_t GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to); + /// @brief Get most recently issued SVM prefetch agent for the range in question. + Agent* GetSVMPrefetchAgent(void* ptr, size_t size); + // Mutex object to protect multithreaded access to ::allocation_map_, // KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo // registered & mapped arrays. @@ -485,6 +519,10 @@ class Runtime { // Contains the region, address, and size of previously allocated memory. std::map allocation_map_; + // Pending prefetch containers. + KernelMutex prefetch_lock_; + prefetch_map_t prefetch_map_; + // Allocator using ::system_region_ std::function system_allocator_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp index 576f66369c..1ecf9c3f9b 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp @@ -69,40 +69,31 @@ void CpuAgent::InitRegionList() { std::vector mem_props(properties_.NumMemoryBanks); if (HSAKMT_STATUS_SUCCESS == - hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, - &mem_props[0])) { + hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, &mem_props[0])) { std::vector::iterator system_prop = std::find_if(mem_props.begin(), mem_props.end(), [](HsaMemoryProperties prop) -> bool { return (prop.SizeInBytes > 0 && prop.HeapType == HSA_HEAPTYPE_SYSTEM); }); - if (system_prop != mem_props.end()) { - MemoryRegion* system_region_fine = new MemoryRegion(true, 
is_apu_node, this, *system_prop); + HsaMemoryProperties system_props; + std::memset(&system_props, 0, sizeof(HsaMemoryProperties)); + system_props.HeapType = HSA_HEAPTYPE_SYSTEM; + system_props.SizeInBytes = 0; + system_props.VirtualBaseAddress = 0; - regions_.push_back(system_region_fine); + if (system_prop != mem_props.end()) system_props = *system_prop; - if (!is_apu_node) { - MemoryRegion* system_region_coarse = - new MemoryRegion(false, is_apu_node, this, *system_prop); + MemoryRegion* system_region_fine = + new MemoryRegion(true, false, is_apu_node, this, system_props); + regions_.push_back(system_region_fine); + MemoryRegion* system_region_kernarg = + new MemoryRegion(true, true, is_apu_node, this, system_props); + regions_.push_back(system_region_kernarg); - regions_.push_back(system_region_coarse); - } - } else { - HsaMemoryProperties system_props; - std::memset(&system_props, 0, sizeof(HsaMemoryProperties)); - - system_props.HeapType = HSA_HEAPTYPE_SYSTEM; - system_props.SizeInBytes = 0; - system_props.VirtualBaseAddress = 0; - - MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, system_props); - regions_.push_back(system_region_fine); - - if (!is_apu_node) { - MemoryRegion* system_region_coarse = - new MemoryRegion(false, is_apu_node, this, system_props); - regions_.push_back(system_region_coarse); - } + if (!is_apu_node) { + MemoryRegion* system_region_coarse = + new MemoryRegion(false, false, is_apu_node, this, system_props); + regions_.push_back(system_region_coarse); } } } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 06265bda24..5a83467cc3 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -77,7 +77,7 @@ extern HsaApiTable hsa_internal_api_table_; } // namespace core namespace AMD { 
-GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) +GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode) : GpuAgentInt(node), properties_(node_props), current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT), @@ -112,16 +112,21 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) rocr::core::IsaFeature sramecc = rocr::core::IsaFeature::Unsupported; if (isa_base->IsSrameccSupported()) { - sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 - ? core::IsaFeature::Enabled - : core::IsaFeature::Disabled; + sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled + : core::IsaFeature::Disabled; + // sramecc control for emulator. + if (core::Runtime::runtime_singleton_->flag().sramecc() != Flag::FLAG_DEFAULT) { + sramecc = core::Runtime::runtime_singleton_->flag().sramecc() == Flag::FLAG_ENABLE + ? core::IsaFeature::Enabled + : core::IsaFeature::Disabled; + } } rocr::core::IsaFeature xnack = rocr::core::IsaFeature::Unsupported; if (isa_base->IsXnackSupported()) { // TODO: This needs to be obtained form KFD once HMM implemented. - xnack = profile_ == HSA_PROFILE_FULL ? core::IsaFeature::Enabled - : core::IsaFeature::Disabled; + xnack = xnack_mode ? core::IsaFeature::Enabled + : core::IsaFeature::Disabled; } // Set instruction set architecture via node property, only on GPU device. 
@@ -202,6 +207,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar ASICShader compute_7; ASICShader compute_8; ASICShader compute_9; + ASICShader compute_90a; ASICShader compute_1010; ASICShader compute_10; }; @@ -212,6 +218,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {NULL, 0, 0, 0}, {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, + {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, }}, @@ -220,6 +227,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, }}, @@ -228,6 +236,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, }}, @@ -236,6 +245,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeFill7, sizeof(kCodeFill7), 19, 8}, {kCodeFill8, sizeof(kCodeFill8), 19, 8}, {kCodeFill8, sizeof(kCodeFill8), 19, 8}, + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, {kCodeFill10, sizeof(kCodeFill10), 19, 8}, {kCodeFill10, sizeof(kCodeFill10), 19, 8}, }}}; @@ -254,6 +264,9 @@ void GpuAgent::AssembleShader(const char* 
func_name, AssembleTarget assemble_tar asic_shader = &compiled_shader_it->second.compute_8; break; case 9: + if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) + asic_shader = &compiled_shader_it->second.compute_90a; + else asic_shader = &compiled_shader_it->second.compute_9; break; case 10: @@ -302,6 +315,14 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2); AMD_HSA_BITS_SET(header->compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1); + + if ((isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) && + (isa_->GetStepping() == 10)) { + // Program COMPUTE_PGM_RSRC3.ACCUM_OFFSET for 0 ACC VGPRs on gfx90a. + // FIXME: Assemble code objects from source at build time + int gran_accvgprs = ((gran_vgprs + 1) * 8) / 4 - 1; + header->max_scratch_backing_memory_byte_size = uint64_t(gran_accvgprs) << 32; + } } // Copy shader code into the GPU-visible buffer. @@ -338,8 +359,7 @@ void GpuAgent::InitRegionList() { memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax; case HSA_HEAPTYPE_GPU_LDS: case HSA_HEAPTYPE_GPU_SCRATCH: { - MemoryRegion* region = - new MemoryRegion(false, false, this, mem_props[mem_idx]); + MemoryRegion* region = new MemoryRegion(false, false, false, this, mem_props[mem_idx]); regions_.push_back(region); @@ -348,7 +368,7 @@ void GpuAgent::InitRegionList() { // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. 
if ((properties_.HiveID != 0) || (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) { - regions_.push_back(new MemoryRegion(true, false, this, mem_props[mem_idx])); + regions_.push_back(new MemoryRegion(true, false, false, this, mem_props[mem_idx])); } } break; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index fca4398664..4419286fd9 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -100,9 +100,9 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) { hsaKmtUnmapMemoryToGPU(const_cast(ptr)); } -MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner, +MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, const HsaMemoryProperties& mem_props) - : core::MemoryRegion(fine_grain, full_profile, owner), + : core::MemoryRegion(fine_grain, kernarg, full_profile, owner), mem_props_(mem_props), max_single_alloc_size_(0), virtual_size_(0), @@ -128,6 +128,8 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne mem_flag_.ui32.HostAccess = 1; mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED; + if (kernarg) mem_flag_.ui32.Uncached = 1; + virtual_size_ = (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize; } @@ -290,16 +292,14 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_REGION_INFO_GLOBAL_FLAGS: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: - *((uint32_t*)value) = fine_grain() - ? (HSA_REGION_GLOBAL_FLAG_KERNARG | - HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) - : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; - break; - case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: - *((uint32_t*)value) = fine_grain() ? 
HSA_REGION_GLOBAL_FLAG_FINE_GRAINED - : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: { + uint32_t ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED + : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + if (kernarg()) ret |= HSA_REGION_GLOBAL_FLAG_KERNARG; + *((uint32_t*)value) = ret; break; + } default: *((uint32_t*)value) = 0; break; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp index 1858384892..2855e33e4e 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp @@ -68,6 +68,38 @@ namespace AMD { static const uint kKfdVersionMajor = 0; static const uint kKfdVersionMinor = 99; +// Query for user preference and use that to determine Xnack mode of ROCm system. +// Return true if Xnack mode is ON or false if OFF. Xnack mode of a system is +// orthogonal to devices that do not support Xnack mode. It is legal for a +// system with Xnack ON to have devices that do not support Xnack functionality. +bool BindXnackMode() { + // Get users' preference for Xnack mode of ROCm platform + HSAint32 mode; + mode = core::Runtime::runtime_singleton_->flag().xnack(); + bool config_xnack = + (core::Runtime::runtime_singleton_->flag().xnack() != Flag::XNACK_REQUEST::XNACK_UNCHANGED); + + // Indicate to driver users' preference for Xnack mode + // Call to driver can fail and is a supported feature + HSAKMT_STATUS status = HSAKMT_STATUS_ERROR; + if (config_xnack) { + status = hsaKmtSetXNACKMode(mode); + if (status == HSAKMT_STATUS_SUCCESS) { + return mode; + } + } + + // Get Xnack mode of devices bound by driver. 
This could happen + // when a call to SET Xnack mode fails or user has no particular + // preference + status = hsaKmtGetXNACKMode((HSAint32*)&mode); + if(status != HSAKMT_STATUS_SUCCESS) { + debug_print("KFD does not support xnack mode query.\nROCr must assume xnack is disabled.\n"); + return false; + } + return mode; +} + CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { if (node_prop.NumCPUCores == 0) { return nullptr; @@ -79,14 +111,14 @@ CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { return cpu; } -GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { +GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnack_mode) { GpuAgent* gpu = nullptr; if (node_prop.NumFComputeCores == 0) { // Ignore non GPUs. return nullptr; } try { - gpu = new GpuAgent(node_id, node_prop); + gpu = new GpuAgent(node_id, node_prop, xnack_mode); const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion(); @@ -174,20 +206,26 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) { link_info.atomic_support_32bit = true; link_info.atomic_support_64bit = true; link_info.coherent_support = true; + if (core::Runtime::runtime_singleton_->flag().patch_xgmi_link_weight()) { + io_link.Weight = 15; + } break; default: debug_print("Unrecognized IOLINK type.\n"); break; } - if (io_link.Flags.ui32.Override == 1) { - if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) { - // Ignore this link since peer to peer is not allowed. - continue; + // KFD is reporting wrong override status for XGMI. Disallow override for bringup. + if (!core::Runtime::runtime_singleton_->flag().patch_link_override()) { + if (io_link.Flags.ui32.Override == 1) { + if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) { + // Ignore this link since peer to peer is not allowed. 
+ continue; + } + link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0); + link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0); + link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0); } - link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0); - link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0); - link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0); } link_info.max_bandwidth = io_link.MaximumBandwidth; @@ -204,7 +242,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) { /** * Process the list of Gpus that are surfaced to user */ -static void SurfaceGpuList(std::vector& gpu_list) { +static void SurfaceGpuList(std::vector& gpu_list, bool xnack_mode) { // Process user visible Gpu devices int32_t invalidIdx = -1; int32_t list_sz = gpu_list.size(); @@ -221,7 +259,7 @@ static void SurfaceGpuList(std::vector& gpu_list) { // Instantiate a Gpu device. The IO links // of this node have already been registered assert((node_prop.NumFComputeCores != 0) && "Improper node used for GPU device discovery."); - DiscoverGpu(gpu_list[idx], node_prop); + DiscoverGpu(gpu_list[idx], node_prop, xnack_mode); } } @@ -305,8 +343,11 @@ void BuildTopology() { RegisterLinkInfo(node_id, node_prop.NumIOLinks); } + // Determine the Xnack mode to be bound for system + bool xnack_mode = BindXnackMode(); + // Instantiate ROCr objects to encapsulate Gpu devices - SurfaceGpuList(gpu_usr_list); + SurfaceGpuList(gpu_usr_list, xnack_mode); } bool Load() { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index 9e50971b3d..605ec15aec 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -391,6 +391,9 @@ void HsaApiTable::UpdateAmdExts() { 
amd_ext_api.hsa_amd_register_deallocation_callback_fn = AMD::hsa_amd_register_deallocation_callback; amd_ext_api.hsa_amd_deregister_deallocation_callback_fn = AMD::hsa_amd_deregister_deallocation_callback; amd_ext_api.hsa_amd_signal_value_pointer_fn = AMD::hsa_amd_signal_value_pointer; + amd_ext_api.hsa_amd_svm_attributes_set_fn = AMD::hsa_amd_svm_attributes_set; + amd_ext_api.hsa_amd_svm_attributes_get_fn = AMD::hsa_amd_svm_attributes_get; + amd_ext_api.hsa_amd_svm_prefetch_async_fn = AMD::hsa_amd_svm_prefetch_async; } void LoadInitialHsaApiTable() { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index f4776eb990..66d7ff7278 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -1009,5 +1009,37 @@ hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifie CATCH; } +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + TRY; + IS_OPEN(); + return core::Runtime::runtime_singleton_->SetSvmAttrib(ptr, size, attribute_list, + attribute_count); + CATCH; +} + +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + TRY; + IS_OPEN(); + return core::Runtime::runtime_singleton_->GetSvmAttrib(ptr, size, attribute_list, + attribute_count); + CATCH; +} + +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + TRY; + IS_OPEN(); + // Validate inputs. 
+ // if (core::g_use_interrupt_wait && (!core::InterruptSignal::IsType(signal))) + return core::Runtime::runtime_singleton_->SvmPrefetch(ptr, size, agent, num_dep_signals, + dep_signals, completion_signal); + CATCH; +} + } // namespace amd } // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/isa.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/isa.cpp index 8b889bb5d9..8116e66b77 100755 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/isa.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/isa.cpp @@ -281,6 +281,15 @@ constexpr size_t hsa_name_size = 63; ISAREG_ENTRY_GEN("gfx908:sramecc-:xnack+", 9, 0, 8, disabled, enabled) ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack-", 9, 0, 8, enabled, disabled) ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack+", 9, 0, 8, enabled, enabled) + ISAREG_ENTRY_GEN("gfx90a", 9, 0, 10, any, any) + ISAREG_ENTRY_GEN("gfx90a:xnack-", 9, 0, 10, any, disabled) + ISAREG_ENTRY_GEN("gfx90a:xnack+", 9, 0, 10, any, enabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc-", 9, 0, 10, disabled, any) + ISAREG_ENTRY_GEN("gfx90a:sramecc+", 9, 0, 10, enabled, any) + ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack-", 9, 0, 10, disabled, disabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack+", 9, 0, 10, disabled, enabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack-", 9, 0, 10, enabled, disabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack+", 9, 0, 10, enabled, enabled) ISAREG_ENTRY_GEN("gfx1010", 10, 1, 0, unsupported, any) ISAREG_ENTRY_GEN("gfx1010:xnack-", 10, 1, 0, unsupported, disabled) ISAREG_ENTRY_GEN("gfx1010:xnack+", 10, 1, 0, unsupported, enabled) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index c3cce57a1a..aba70cb5e3 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -60,6 +60,7 @@ #include 
"core/inc/hsa_ext_amd_impl.h" #include "core/inc/hsa_api_trace_int.h" #include "core/util/os.h" +#include "core/inc/exceptions.h" #include "inc/hsa_ven_amd_aqlprofile.h" #define HSA_VERSION_MAJOR 1 @@ -173,16 +174,29 @@ void Runtime::RegisterAgent(Agent* agent) { if (cpu_agents_.size() == 1) { // Might need memory pooling to cover allocation that // requires less than 4096 bytes. - system_allocator_ = [this](size_t size, size_t align, MemoryRegion::AllocateFlags alloc_flags) -> void* { - assert(align <= 4096); - void* ptr = nullptr; - core::Runtime::runtime_singleton_->AllocateMemory(system_regions_fine_[0], size, alloc_flags, &ptr); - return ptr; - }; - system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; + // Default system pool must support kernarg + for (auto pool : system_regions_fine_) { + if (pool->kernarg()) { + system_allocator_ = [pool](size_t size, size_t alignment, + MemoryRegion::AllocateFlags alloc_flags) -> void* { + assert(alignment <= 4096); + void* ptr = NULL; + return (HSA_STATUS_SUCCESS == + core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, + &ptr)) + ? ptr + : NULL; + }; - BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_); + system_deallocator_ = [](void* ptr) { + core::Runtime::runtime_singleton_->FreeMemory(ptr); + }; + + BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_); + break; + } + } } } else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) { gpu_agents_.push_back(agent); @@ -630,6 +644,18 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) { *(const char**)value = STRING(ROCR_BUILD_ID); break; } + case HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: { + // todo: Get HMM kernel support info. 
+ *(bool*)value = true; + break; + } + case HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: { + bool ret = true; + for(auto agent : gpu_agents_) + ret &= (agent->isa()->GetXnack() == IsaFeature::Enabled); + *(bool*)value = ret; + break; + } default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -1572,5 +1598,499 @@ void Runtime::InternalQueueCreateNotify(const hsa_queue_t* queue, hsa_agent_t ag internal_queue_create_notifier_(queue, agent, internal_queue_create_notifier_user_data_); } +hsa_status_t Runtime::SetSvmAttrib(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + uint32_t set_attribs = 0; + std::vector agent_seen(agents_by_node_.size(), false); + + std::vector attribs; + attribs.reserve(attribute_count); + uint32_t set_flags = 0; + uint32_t clear_flags = 0; + + auto Convert = [&](uint64_t value) -> Agent* { + hsa_agent_t handle = {value}; + Agent* agent = Agent::Convert(handle); + if ((agent == nullptr) || !agent->IsValid()) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT, + "Invalid agent handle in Runtime::SetSvmAttrib."); + return agent; + }; + + auto ConvertAllowNull = [&](uint64_t value) -> Agent* { + hsa_agent_t handle = {value}; + Agent* agent = Agent::Convert(handle); + if ((agent != nullptr) && (!agent->IsValid())) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT, + "Invalid agent handle in Runtime::SetSvmAttrib."); + return agent; + }; + + auto ConfirmNew = [&](Agent* agent) { + if (agent_seen[agent->node_id()]) + throw AMD::hsa_exception( + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, + "Multiple attributes given for the same agent in Runtime::SetSvmAttrib."); + agent_seen[agent->node_id()] = true; + }; + + auto Check = [&](uint64_t attrib) { + if (set_attribs & (1 << attrib)) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, + "Attribute given multiple times in Runtime::SetSvmAttrib."); + set_attribs |= (1 << attrib); + }; + + auto kmtPair = [](uint32_t attrib, 
uint32_t value) { + HSA_SVM_ATTRIBUTE pair = {attrib, value}; + return pair; + }; + + for (uint32_t i = 0; i < attribute_count; i++) { + auto attrib = attribute_list[i].attribute; + auto value = attribute_list[i].value; + + switch (attrib) { + case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: { + Check(attrib); + switch (value) { + case HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED: + set_flags |= HSA_SVM_FLAG_COHERENT; + break; + case HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED: + clear_flags |= HSA_SVM_FLAG_COHERENT; + break; + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Invalid HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG value."); + } + break; + } + case HSA_AMD_SVM_ATTRIB_READ_ONLY: { + Check(attrib); + if (value) + set_flags |= HSA_SVM_FLAG_GPU_RO; + else + clear_flags |= HSA_SVM_FLAG_GPU_RO; + break; + } + case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: { + Check(attrib); + if (value) + set_flags |= HSA_SVM_FLAG_HIVE_LOCAL; + else + clear_flags |= HSA_SVM_FLAG_HIVE_LOCAL; + break; + } + case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: { + Check(attrib); + // Max migration size is 1GB. 
+ if (value > 18) value = 18; + attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, value)); + break; + } + case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: { + Check(attrib); + Agent* agent = ConvertAllowNull(value); + if (agent == nullptr) + attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, INVALID_NODEID)); + else + attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, agent->node_id())); + break; + } + case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE: { + Agent* agent = Convert(value); + ConfirmNew(agent); + if (agent->device_type() == Agent::kAmdCpuDevice) { + set_flags |= HSA_SVM_FLAG_HOST_ACCESS; + } else { + attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id())); + } + break; + } + case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE: { + Agent* agent = Convert(value); + ConfirmNew(agent); + if (agent->device_type() == Agent::kAmdCpuDevice) { + set_flags |= HSA_SVM_FLAG_HOST_ACCESS; + } else { + attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS_IN_PLACE, agent->node_id())); + } + break; + } + case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS: { + Agent* agent = Convert(value); + ConfirmNew(agent); + if (agent->device_type() == Agent::kAmdCpuDevice) { + clear_flags |= HSA_SVM_FLAG_HOST_ACCESS; + } else { + attribs.push_back(kmtPair(HSA_SVM_ATTR_NO_ACCESS, agent->node_id())); + } + break; + } + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Illegal or invalid attribute in Runtime::SetSvmAttrib"); + } + } + + // Merge CPU access properties - grant access if any CPU needs access. + // Probably wrong. 
+ if (set_flags & HSA_SVM_FLAG_HOST_ACCESS) clear_flags &= ~HSA_SVM_FLAG_HOST_ACCESS; + + // Add flag updates + if (clear_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_CLR_FLAGS, clear_flags)); + if (set_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, set_flags)); + + uint8_t* base = AlignDown((uint8_t*)ptr, 4096); + uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096); + size_t len = end - base; + HSAKMT_STATUS error = hsaKmtSVMSetAttr(base, len, attribs.size(), &attribs[0]); + if (error != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMSetAttr failed."); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::GetSvmAttrib(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + std::vector attribs; + attribs.reserve(attribute_count); + + std::vector kmtIndices(attribute_count); + + bool getFlags = false; + + auto Convert = [&](uint64_t value) -> Agent* { + hsa_agent_t handle = {value}; + Agent* agent = Agent::Convert(handle); + if ((agent == nullptr) || !agent->IsValid()) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT, + "Invalid agent handle in Runtime::GetSvmAttrib."); + return agent; + }; + + auto kmtPair = [](uint32_t attrib, uint32_t value) { + HSA_SVM_ATTRIBUTE pair = {attrib, value}; + return pair; + }; + + for (uint32_t i = 0; i < attribute_count; i++) { + auto& attrib = attribute_list[i].attribute; + auto& value = attribute_list[i].value; + + switch (attrib) { + case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: + case HSA_AMD_SVM_ATTRIB_READ_ONLY: + case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: { + getFlags = true; + kmtIndices[i] = -1; + break; + } + case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: { + kmtIndices[i] = attribs.size(); + attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, 0)); + break; + } + case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: { + kmtIndices[i] = attribs.size(); + attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, 0)); + break; + } + case 
HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: { + value = Agent::Convert(GetSVMPrefetchAgent(ptr, size)).handle; + kmtIndices[i] = -1; + break; + } + case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: { + Agent* agent = Convert(value); + if (agent->device_type() == Agent::kAmdCpuDevice) { + getFlags = true; + kmtIndices[i] = -1; + } else { + kmtIndices[i] = attribs.size(); + attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id())); + } + break; + } + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Illegal or invalid attribute in Runtime::SetSvmAttrib"); + } + } + + if (getFlags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, 0)); + + uint8_t* base = AlignDown((uint8_t*)ptr, 4096); + uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096); + size_t len = end - base; + if (attribs.size() != 0) { + HSAKMT_STATUS error = hsaKmtSVMGetAttr(base, len, attribs.size(), &attribs[0]); + if (error != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMGetAttr failed."); + } + + for (uint32_t i = 0; i < attribute_count; i++) { + auto& attrib = attribute_list[i].attribute; + auto& value = attribute_list[i].value; + + switch (attrib) { + case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: { + if (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_COHERENT) + value = HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED; + else + value = HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED; + break; + } + case HSA_AMD_SVM_ATTRIB_READ_ONLY: { + value = (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_GPU_RO); + break; + } + case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: { + value = (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_HIVE_LOCAL); + break; + } + case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: { + value = attribs[kmtIndices[i]].value; + break; + } + case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: { + uint64_t node = attribs[kmtIndices[i]].value; + Agent* agent = nullptr; + if (node != INVALID_NODEID) agent = agents_by_node_[node][0]; + value = Agent::Convert(agent).handle; + break; + } + case 
HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: { + break; + } + case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: { + if (kmtIndices[i] == -1) { + if (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_HOST_ACCESS) + attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE; + } else { + switch (attribs[kmtIndices[i]].type) { + case HSA_SVM_ATTR_ACCESS: + attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE; + break; + case HSA_SVM_ATTR_ACCESS_IN_PLACE: + attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE; + break; + case HSA_SVM_ATTR_NO_ACCESS: + attrib = HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS; + break; + default: + assert(false && "Bad agent accessibility from KFD."); + } + } + break; + } + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Illegal or invalid attribute in Runtime::GetSvmAttrib"); + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + uintptr_t base = reinterpret_cast(AlignDown(ptr, 4096)); + uintptr_t end = AlignUp(reinterpret_cast(ptr) + size, 4096); + size_t len = end - base; + + PrefetchOp* op = new PrefetchOp(); + MAKE_NAMED_SCOPE_GUARD(OpGuard, [&]() { delete op; }); + + Agent* dest = Agent::Convert(agent); + if (dest->device_type() == Agent::kAmdCpuDevice) + op->node_id = 0; + else + op->node_id = dest->node_id(); + + op->base = reinterpret_cast(base); + op->size = len; + op->completion = completion_signal; + if (num_dep_signals > 1) { + op->remaining_deps = num_dep_signals - 1; + for (int i = 0; i < num_dep_signals - 1; i++) op->dep_signals.push_back(dep_signals[i]); + } else { + op->remaining_deps = 0; + } + + { + ScopedAcquire lock(&prefetch_lock_); + // Remove all fully overlapped and trim partially overlapped ranges. 
+ // Get iteration bounds + auto start = prefetch_map_.upper_bound(base); + if (start != prefetch_map_.begin()) start--; + auto stop = prefetch_map_.lower_bound(end); + + auto isEndNode = [&](decltype(start) node) { return node->second.next == prefetch_map_.end(); }; + auto isFirstNode = [&](decltype(start) node) { + return node->second.prev == prefetch_map_.end(); + }; + + // Trim and remove old ranges. + while (start != stop) { + uintptr_t startBase = start->first; + uintptr_t startEnd = startBase + start->second.bytes; + + auto ibase = Max(startBase, base); + auto iend = Min(startEnd, end); + // Check for overlap + if (ibase < iend) { + // Second range check + if (iend < startEnd) { + auto ret = prefetch_map_.insert( + std::make_pair(iend, PrefetchRange(startEnd - iend, start->second.op))); + assert(ret.second && "Prefetch map insert failed during range split."); + + auto it = ret.first; + it->second.prev = start; + it->second.next = start->second.next; + start->second.next = it; + if (!isEndNode(it)) it->second.next->second.prev = it; + } + + // Is the first interval of the old range valid + if (startBase < ibase) { + start->second.bytes = ibase - startBase; + } else { + if (isFirstNode(start)) { + start->second.op->prefetch_map_entry = start->second.next; + if (!isEndNode(start)) start->second.next->second.prev = prefetch_map_.end(); + } else { + start->second.prev->second.next = start->second.next; + if (!isEndNode(start)) start->second.next->second.prev = start->second.prev; + } + prefetch_map_.erase(start); + } + } + start++; + } + + // Insert new range. + auto ret = prefetch_map_.insert(std::make_pair(base, PrefetchRange(len, op))); + assert(ret.second && "Prefetch map insert failed."); + + auto it = ret.first; + op->prefetch_map_entry = it; + it->second.next = it->second.prev = prefetch_map_.end(); + } + + // Remove the prefetch's ranges from the map. 
+ static auto removePrefetchRanges = [](PrefetchOp* op) { + ScopedAcquire lock(&Runtime::runtime_singleton_->prefetch_lock_); + auto it = op->prefetch_map_entry; + while (it != Runtime::runtime_singleton_->prefetch_map_.end()) { + auto next = it->second.next; + Runtime::runtime_singleton_->prefetch_map_.erase(it); + it = next; + } + }; + + // Prefetch Signal handler for synchronization. + static hsa_amd_signal_handler signal_handler = [](hsa_signal_value_t value, void* arg) { + PrefetchOp* op = reinterpret_cast(arg); + + if (op->remaining_deps > 0) { + op->remaining_deps--; + Runtime::runtime_singleton_->SetAsyncSignalHandler( + op->dep_signals[op->remaining_deps], HSA_SIGNAL_CONDITION_EQ, 0, signal_handler, arg); + return false; + } + + HSA_SVM_ATTRIBUTE attrib; + attrib.type = HSA_SVM_ATTR_PREFETCH_LOC; + attrib.value = op->node_id; + HSAKMT_STATUS error = hsaKmtSVMSetAttr(op->base, op->size, 1, &attrib); + assert(error == HSAKMT_STATUS_SUCCESS && "KFD Prefetch failed."); + + removePrefetchRanges(op); + + if (op->completion.handle != 0) Signal::Convert(op->completion)->SubRelaxed(1); + delete op; + + return false; + }; + + auto no_dependencies = [](void* arg) { signal_handler(0, arg); }; + + MAKE_NAMED_SCOPE_GUARD(RangeGuard, [&]() { removePrefetchRanges(op); }); + + hsa_status_t err; + if (num_dep_signals == 0) + err = AMD::hsa_amd_async_function(no_dependencies, op); + else + err = SetAsyncSignalHandler(dep_signals[num_dep_signals - 1], HSA_SIGNAL_CONDITION_EQ, 0, + signal_handler, op); + if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "Signal handler unable to be set."); + + RangeGuard.Dismiss(); + OpGuard.Dismiss(); + return HSA_STATUS_SUCCESS; +} + +Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) { + uintptr_t base = reinterpret_cast(AlignDown(ptr, 4096)); + uintptr_t end = AlignUp(reinterpret_cast(ptr) + size, 4096); + size_t len = end - base; + + std::vector> holes; + + ScopedAcquire 
lock(&Runtime::runtime_singleton_->prefetch_lock_); + auto start = prefetch_map_.upper_bound(base); + if (start != prefetch_map_.begin()) start--; + auto stop = prefetch_map_.lower_bound(end); + + // KFD returns -1 for no or mixed destinations. + uint32_t prefetch_node = -2; + if (start != stop) { + prefetch_node = start->second.op->node_id; + } + + while (start != stop) { + uintptr_t startBase = start->first; + uintptr_t startEnd = startBase + start->second.bytes; + + auto ibase = Max(base, startBase); + auto iend = Min(end, startEnd); + // Check for intersection with the query + if (ibase < iend) { + // If prefetch locations are different then we report null agent. + if (prefetch_node != start->second.op->node_id) return nullptr; + + // Push leading gap to an array for checking KFD. + if (base < ibase) holes.push_back(std::make_pair(base, ibase - base)); + + // Trim query range. + base = iend; + } + start++; + } + if (base < end) holes.push_back(std::make_pair(base, end - base)); + + HSA_SVM_ATTRIBUTE attrib; + attrib.type = HSA_SVM_ATTR_PREFETCH_LOC; + for (auto& range : holes) { + HSAKMT_STATUS error = + hsaKmtSVMGetAttr(reinterpret_cast(range.first), range.second, 1, &attrib); + assert(error == HSAKMT_STATUS_SUCCESS && "KFD prefetch query failed."); + + if (attrib.value == -1) return nullptr; + if (prefetch_node == -2) prefetch_node = attrib.value; + if (prefetch_node != attrib.value) return nullptr; + } + + assert(prefetch_node != -2 && "prefetch_node was not updated."); + assert(prefetch_node != -1 && "Should have already returned."); + return agents_by_node_[prefetch_node][0]; +} + } // namespace core } // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h index 5013dd1a8f..34c9d21e11 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h @@ -56,6 +56,13 @@ class Flag { public: 
enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT }; + // The values are meaningful and chosen to satisfy the thunk API. + enum XNACK_REQUEST { XNACK_DISABLE = 0, XNACK_ENABLE = 1, XNACK_UNCHANGED = 2 }; + static_assert(XNACK_DISABLE == 0, "XNACK_REQUEST enum values improperly changed."); + static_assert(XNACK_ENABLE == 1, "XNACK_REQUEST enum values improperly changed."); + + enum FLAG_TRI_STATE { FLAG_DISABLE = 0, FLAG_ENABLE = 1, FLAG_DEFAULT = 2 }; + explicit Flag() { Refresh(); } virtual ~Flag() {} @@ -104,6 +111,12 @@ class Flag { var = os::GetEnvVar("HSA_DISABLE_FRAGMENT_ALLOCATOR"); disable_fragment_alloc_ = (var == "1") ? true : false; + var = os::GetEnvVar("HSA_UNPATCH_XGMI_LINK_WEIGHT"); + patch_xgmi_link_weight_ = (var == "1") ? false : true; + + var = os::GetEnvVar("HSA_UNPATCH_LINK_OVERRIDE"); + patch_link_override_ = (var == "1") ? false : true; + var = os::GetEnvVar("HSA_ENABLE_SDMA_HDP_FLUSH"); enable_sdma_hdp_flush_ = (var == "0") ? false : true; @@ -130,6 +143,16 @@ class Flag { var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT"); check_sramecc_validity_ = (var == "1") ? false : true; + + // Legal values are zero "0" or one "1". Any other value will + // be interpreted as not defining the env variable + var = os::GetEnvVar("HSA_XNACK"); + xnack_ = (var == "0") ? XNACK_DISABLE : ((var == "1") ? XNACK_ENABLE : XNACK_UNCHANGED); + + // Legal values are zero "0" or one "1". Any other value will + // be interpreted as not defining the env variable. + var = os::GetEnvVar("HSA_FORCE_SRAMECC"); + sramecc_ = (var == "0") ? FLAG_DISABLE : ((var == "1") ? 
FLAG_ENABLE : FLAG_DEFAULT); } bool check_flat_scratch() const { return check_flat_scratch_; } @@ -150,6 +173,11 @@ class Flag { bool disable_fragment_alloc() const { return disable_fragment_alloc_; } + // Temporary way to control ROCr interpretation of inter-device link weight + bool patch_xgmi_link_weight() const { return patch_xgmi_link_weight_; } + + bool patch_link_override() const { return patch_link_override_; } + bool rev_copy_dir() const { return rev_copy_dir_; } bool fine_grain_pcie() const { return fine_grain_pcie_; } @@ -178,6 +206,10 @@ class Flag { bool check_sramecc_validity() const { return check_sramecc_validity_; } + XNACK_REQUEST xnack() const { return xnack_; } + + FLAG_TRI_STATE sramecc() const { return sramecc_; } + private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -195,6 +227,8 @@ class Flag { bool disable_image_; bool loader_enable_mmap_uri_; bool check_sramecc_validity_; + bool patch_xgmi_link_weight_; + bool patch_link_override_; SDMA_OVERRIDE enable_sdma_; @@ -209,6 +243,12 @@ class Flag { size_t force_sdma_size_; + // Indicates user preference for Xnack state. + XNACK_REQUEST xnack_; + + // Indicates user preference for SramECC state. 
+ FLAG_TRI_STATE sramecc_; + DISALLOW_COPY_AND_ASSIGN(Flag); }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def index 4666121cfb..eb853410e5 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def +++ b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def @@ -222,6 +222,9 @@ global: hsa_amd_deregister_deallocation_callback; hsa_amd_signal_value_pointer; _amdgpu_r_debug; + hsa_amd_svm_attributes_set; + hsa_amd_svm_attributes_get; + hsa_amd_svm_prefetch_async; local: *; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/image/blit_kernel.cpp b/projects/rocr-runtime/runtime/hsa-runtime/image/blit_kernel.cpp index 33a3e1c91e..14d385fe72 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/image/blit_kernel.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/image/blit_kernel.cpp @@ -82,6 +82,7 @@ extern uint8_t ocl_blit_object_gfx902[]; extern uint8_t ocl_blit_object_gfx904[]; extern uint8_t ocl_blit_object_gfx906[]; extern uint8_t ocl_blit_object_gfx908[]; +extern uint8_t ocl_blit_object_gfx90a[]; extern uint8_t ocl_blit_object_gfx1010[]; extern uint8_t ocl_blit_object_gfx1011[]; extern uint8_t ocl_blit_object_gfx1012[]; @@ -990,6 +991,8 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name, *blit_code_object = ocl_blit_object_gfx906; } else if (sname == "gfx908") { *blit_code_object = ocl_blit_object_gfx908; + } else if (sname == "gfx90a") { + *blit_code_object = ocl_blit_object_gfx90a; } else if (sname == "gfx1010") { *blit_code_object = ocl_blit_object_gfx1010; } else if (sname == "gfx1011") { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/image/blit_src/CMakeLists.txt b/projects/rocr-runtime/runtime/hsa-runtime/image/blit_src/CMakeLists.txt index 5e4fd7f0d4..87be604126 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/image/blit_src/CMakeLists.txt +++ b/projects/rocr-runtime/runtime/hsa-runtime/image/blit_src/CMakeLists.txt @@ 
-69,7 +69,7 @@ endif() # Determine the target devices if not specified if (NOT DEFINED TARGET_DEVICES) - set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033") + set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033") endif() set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE ) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/image/device_info.cpp b/projects/rocr-runtime/runtime/hsa-runtime/image/device_info.cpp index fec0a5849a..ce214aa2bd 100755 --- a/projects/rocr-runtime/runtime/hsa-runtime/image/device_info.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/image/device_info.cpp @@ -50,17 +50,11 @@ namespace rocr { namespace image { -uint32_t MajorVerFromDevID(uint32_t dev_id) { - return dev_id/100; -} +uint32_t MajorVerFromDevID(uint32_t dev_id) { return dev_id >> 8; } -uint32_t MinorVerFromDevID(uint32_t dev_id) { - return (dev_id % 100)/10; -} +uint32_t MinorVerFromDevID(uint32_t dev_id) { return (dev_id >> 4) & 0xF; } -uint32_t StepFromDevID(uint32_t dev_id) { - return (dev_id%100)%10; -} +uint32_t StepFromDevID(uint32_t dev_id) { return dev_id & 0xF; } hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) { char asic_name[64]; @@ -78,7 +72,10 @@ hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) { assert(a_str.compare(0, 3, "gfx", 3) == 0); a_str.erase(0,3); - *chip_id = std::stoi(a_str); + + // Load chip_id accounting for stepping and minor in hex and major in dec. 
+ *chip_id = std::stoi(a_str.substr(a_str.length() - 2), nullptr, 16); + *chip_id += (std::stoi(a_str.substr(0, a_str.length() - 2)) << 8); return HSA_STATUS_SUCCESS; } @@ -163,6 +160,7 @@ uint32_t DevIDToAddrLibFamily(uint32_t dev_id) { case 4: // Vega12 case 6: // Vega20 case 8: // Arcturus + case 10: // Aldebaran return FAMILY_AI; case 2: diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/amd_hsa_elf.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/amd_hsa_elf.h index adcdec490c..cc4ba97f02 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/amd_hsa_elf.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/amd_hsa_elf.h @@ -115,10 +115,13 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b, EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3D = 0x03d, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3E = 0x03e, + EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa.h index d8fdd472b9..fe1facca44 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa.h @@ -480,7 +480,21 @@ typedef enum { /** * String containing the ROCr build identifier. */ - HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200 + HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, + /** + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, + // TODO: Should this be per Agent? 
+ /** + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_attributes_set. + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202 } hsa_system_info_t; /** diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h index bf3e9197ed..35dd21bfa5 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -183,6 +183,9 @@ struct AmdExtTable { decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn; decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn; decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn; + decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn; + decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn; + decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn; }; // Table to export HSA Core Runtime Apis diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h index 9df7c49aee..a0bc5d1d99 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -2116,6 +2116,162 @@ hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr, hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr, hsa_amd_deallocation_callback_t callback); +typedef enum hsa_amd_svm_model_s { + /** + * Updates to memory with this attribute conform to HSA memory consistency model. 
+ */ + HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0, + /** + * Writes to memory with this attribute can be performed by a single agent at a time. + */ + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1 +} hsa_amd_svm_model_t; + +typedef enum hsa_amd_svm_attribute_s { + // Memory model attribute. + // Type of this attribute is hsa_amd_svm_model_t. + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0, + // Marks the range read only. This allows multiple physical copies to be + // placed local to each accessing device. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_ONLY = 1, + // Automatic migrations should attempt to keep the memory within the xgmi hive + // containing accessible agents. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2, + // Page granularity to migrate at once. Page granularity is specified as + // log2(page_count). + // Type of this attribute is uint64_t. + HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3, + // Physical location to prefer when automatic migration occurs. + // Set to the null agent handle (handle == 0) to indicate there + // is no preferred location. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4, + // This attribute can not be used in ::hsa_amd_svm_attributes_set (see + // ::hsa_amd_svm_prefetch_async). + // Physical location of most recent prefetch command. + // If the prefetch location has not been set or is not uniform across the + // address range then returned hsa_agent_t::handle will be 0. + // Querying this attribute will return the destination agent of the most + // recent ::hsa_amd_svm_prefetch_async targeting the address range. If + // multiple async prefetches have been issued targeting the region and the + // most recently issued prefetch has completed then the query will return + // the location of the most recently completed prefetch. + // Type of this attribute is hsa_agent_t. 
+ HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range. Access may incur a page fault + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM + // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range without page faults. Access + // will not incur a page fault and will not cause access based memory + // migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if + // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Denies an agent access to the memory range. Access will cause a terminal + // segfault. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202, + // This attribute can not be used in ::hsa_amd_svm_attributes_set. + // Returns the access attribute associated with the agent. + // The agent to query must be set in the attribute value field. + // The attribute enum will be replaced with the agent's current access + // attribute for the address range. + // TODO: Clarify KFD return value for non-uniform access attribute. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203, +} hsa_amd_svm_attribute_t; + +// List type for hsa_amd_svm_attributes_set/get. +typedef struct hsa_amd_svm_attribute_pair_s { + // hsa_amd_svm_attribute_t value. + uint64_t attribute; + // Attribute value. 
Bit values should be interpreted according to the type + // given in the associated attribute description. + uint64_t value; +} hsa_amd_svm_attribute_pair_t; + +/** + * @brief Sets SVM memory attributes. + * + * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling + * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE + * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM + * memory access by that Agent. + * + * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION + * may not be used with this API. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Gets SVM memory attributes. + * + * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE, + * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and + * HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS may not be used with this API. + * + * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an + * hsa_agent_t and returns the current access type through its attribute field. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in,out] attribute_list List of attributes to query for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Asynchronously migrates memory to an agent. 
+ * + * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero. + * @p completion_signal will decrement when the migration is complete. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] agent Agent to migrate to. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the migration + * operation starts. The migration will start after every signal has been observed with + * the value 0. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the migration + * operation. When the migration operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the migration + * operation by setting the value of the completion signal to a negative + * number. If no completion signal is required this handle may be null. 
+ */ +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + #ifdef __cplusplus } // end extern "C" block #endif diff --git a/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp b/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp index cb7964422e..6bd595da05 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp @@ -568,6 +568,7 @@ namespace code { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: name = "gfx906"; xnack_supported = true; sramecc_supported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: name = "gfx908"; xnack_supported = true; sramecc_supported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: name = "gfx909"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: name = "gfx90a"; xnack_supported = true; sramecc_supported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: name = "gfx90c"; xnack_supported = true; sramecc_supported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: name = "gfx1010"; xnack_supported = true; sramecc_supported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: name = "gfx1011"; xnack_supported = true; sramecc_supported = false; break; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/loader/loaders.cpp b/projects/rocr-runtime/runtime/hsa-runtime/loader/loaders.cpp index a36ce2c950..e9d402a857 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/loader/loaders.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/loader/loaders.cpp @@ -80,26 +80,27 @@ namespace loader { : out(std::cout) { invalid.handle = 0; - gfx700.handle = 700; - gfx701.handle = 701; - gfx702.handle = 702; - gfx801.handle = 801; - gfx802.handle = 802; - gfx803.handle = 803; - gfx805.handle = 805; - 
gfx810.handle = 810; - gfx900.handle = 900; - gfx902.handle = 902; - gfx904.handle = 904; - gfx906.handle = 906; - gfx908.handle = 908; - gfx1010.handle = 1010; - gfx1011.handle = 1011; - gfx1012.handle = 1012; - gfx1030.handle = 1030; - gfx1031.handle = 1031; - gfx1032.handle = 1032; - gfx1033.handle = 1033; + gfx700.handle = 0x700; + gfx701.handle = 0x701; + gfx702.handle = 0x702; + gfx801.handle = 0x801; + gfx802.handle = 0x802; + gfx803.handle = 0x803; + gfx805.handle = 0x805; + gfx810.handle = 0x810; + gfx900.handle = 0x900; + gfx902.handle = 0x902; + gfx904.handle = 0x904; + gfx906.handle = 0x906; + gfx908.handle = 0x908; + gfx90a.handle = 0x90a; + gfx1010.handle = 0x1010; + gfx1011.handle = 0x1011; + gfx1012.handle = 0x1012; + gfx1030.handle = 0x1030; + gfx1031.handle = 0x1031; + gfx1032.handle = 0x1032; + gfx1033.handle = 0x1033; } hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name) @@ -137,6 +138,8 @@ namespace loader { return gfx906; } else if (sname == "AMD:AMDGPU:9:0:8") { return gfx908; + } else if (sname == "AMD:AMDGPU:9:0:A") { + return gfx90a; } else if (sname == "AMD:AMDGPU:10:1:0") { return gfx1010; } else if (sname == "AMD:AMDGPU:10:1:1") { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/loader/loaders.hpp b/projects/rocr-runtime/runtime/hsa-runtime/loader/loaders.hpp index ef6ef2a59e..4466d5e221 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/loader/loaders.hpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/loader/loaders.hpp @@ -57,7 +57,7 @@ namespace loader { hsa_isa_t invalid; hsa_isa_t gfx700, gfx701, gfx702, gfx703, gfx704, gfx705; hsa_isa_t gfx801, gfx802, gfx803, gfx805, gfx810; - hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908; + hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908, gfx90a; hsa_isa_t gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, gfx1032, gfx1033; std::ostream& out; typedef std::set PointerSet;