Squash merge of cfreehil/amd-temp-gfx90a onto amd-staging.
Includes some workarounds and HMM.
Conflicts:
opensrc/hsa-runtime/core/runtime/amd_topology.cpp
opensrc/hsa-runtime/core/util/flag.h
Change-Id: I22976f07964a43dbb228a6231777dbd599112b8d
[ROCm/ROCR-Runtime commit: 7333c77e22]
Этот коммит содержится в:
@@ -186,19 +186,19 @@ void QueueValidation::QueueValidationForInvalidDimension(hsa_agent_t cpuAgent,
|
||||
|
||||
hsa_queue_t *queue[kMaxQueue]; // command queue
|
||||
uint32_t ii;
|
||||
test_validation_data user_data[kMaxQueue];
|
||||
for (ii = 0; ii < kMaxQueue; ++ii) {
|
||||
test_validation_data user_data;
|
||||
// set callback flag to false if callback called then it will change to true
|
||||
user_data.cb_triggered = false;
|
||||
user_data[ii].cb_triggered = false;
|
||||
// set the queue pointer
|
||||
user_data.queue_pointer = &queue[ii];
|
||||
user_data[ii].queue_pointer = &queue[ii];
|
||||
// set the expected status in queue error callback handling
|
||||
user_data.expected_status = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
|
||||
user_data[ii].expected_status = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
|
||||
|
||||
// create queue
|
||||
err = hsa_queue_create(gpuAgent,
|
||||
queue_max, HSA_QUEUE_TYPE_SINGLE,
|
||||
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
|
||||
queue_max, HSA_QUEUE_TYPE_SINGLE,
|
||||
CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
|
||||
@@ -240,12 +240,12 @@ void QueueValidation::QueueValidationForInvalidDimension(hsa_agent_t cpuAgent,
|
||||
// completion signal should not be changed.
|
||||
ASSERT_EQ(completion, 1);
|
||||
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data.cb_triggered, true);
|
||||
|
||||
hsa_signal_store_relaxed(aql().completion_signal, 1);
|
||||
}
|
||||
sleep(1);
|
||||
for (ii = 0; ii < kMaxQueue; ++ii) {
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data[ii].cb_triggered, true);
|
||||
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
|
||||
}
|
||||
}
|
||||
@@ -269,20 +269,21 @@ void QueueValidation::QueueValidationInvalidGroupMemory(hsa_agent_t cpuAgent,
|
||||
queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent;
|
||||
|
||||
hsa_queue_t *queue[kMaxQueue]; // command queue
|
||||
test_validation_data user_data[kMaxQueue];
|
||||
|
||||
uint32_t ii;
|
||||
for (ii = 0; ii < kMaxQueue; ++ii) {
|
||||
test_validation_data user_data;
|
||||
// set callback flag to false if callback called then it will change to true
|
||||
user_data.cb_triggered = false;
|
||||
user_data[ii].cb_triggered = false;
|
||||
// set the queue pointer
|
||||
user_data.queue_pointer = &queue[ii];
|
||||
user_data[ii].queue_pointer = &queue[ii];
|
||||
// set the expected status in queue error callback handling
|
||||
user_data.expected_status = HSA_STATUS_ERROR_INVALID_ALLOCATION;
|
||||
user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_ALLOCATION;
|
||||
|
||||
// create queue
|
||||
err = hsa_queue_create(gpuAgent,
|
||||
queue_max, HSA_QUEUE_TYPE_SINGLE,
|
||||
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
|
||||
queue_max, HSA_QUEUE_TYPE_SINGLE,
|
||||
CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
|
||||
@@ -325,12 +326,12 @@ void QueueValidation::QueueValidationInvalidGroupMemory(hsa_agent_t cpuAgent,
|
||||
// completion signal should not be changed.
|
||||
ASSERT_EQ(completion, 1);
|
||||
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data.cb_triggered, true);
|
||||
|
||||
hsa_signal_store_relaxed(aql().completion_signal, 1);
|
||||
}
|
||||
sleep(1);
|
||||
for (ii = 0; ii < kMaxQueue; ++ii) {
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data[ii].cb_triggered, true);
|
||||
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
|
||||
}
|
||||
}
|
||||
@@ -353,20 +354,20 @@ void QueueValidation::QueueValidationForInvalidKernelObject(hsa_agent_t cpuAgent
|
||||
queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent;
|
||||
|
||||
hsa_queue_t *queue[kMaxQueue]; // command queue
|
||||
test_validation_data user_data[kMaxQueue];
|
||||
uint32_t ii;
|
||||
for (ii = 0; ii < kMaxQueue; ++ii) {
|
||||
test_validation_data user_data;
|
||||
// set callback flag to false if callback called then it will change to true
|
||||
user_data.cb_triggered = false;
|
||||
user_data[ii].cb_triggered = false;
|
||||
// set the queue pointer
|
||||
user_data.queue_pointer = &queue[ii];
|
||||
user_data[ii].queue_pointer = &queue[ii];
|
||||
// set the expected status in queue error callback handling
|
||||
user_data.expected_status = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
|
||||
user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
|
||||
|
||||
// create queue
|
||||
err = hsa_queue_create(gpuAgent,
|
||||
kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE,
|
||||
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
|
||||
CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
|
||||
@@ -408,12 +409,12 @@ void QueueValidation::QueueValidationForInvalidKernelObject(hsa_agent_t cpuAgent
|
||||
// completion signal should not be changed.
|
||||
ASSERT_EQ(completion, 1);
|
||||
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data.cb_triggered, true);
|
||||
|
||||
hsa_signal_store_relaxed(aql().completion_signal, 1);
|
||||
}
|
||||
sleep(1);
|
||||
for (ii = 0; ii < kMaxQueue; ++ii) {
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data[ii].cb_triggered, true);
|
||||
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
|
||||
}
|
||||
}
|
||||
@@ -437,19 +438,19 @@ void QueueValidation::QueueValidationForInvalidPacket(hsa_agent_t cpuAgent,
|
||||
|
||||
hsa_queue_t *queue[kMaxQueue]; // command queue
|
||||
uint32_t ii;
|
||||
test_validation_data user_data[kMaxQueue];
|
||||
for (ii = 0; ii < kMaxQueue; ++ii) {
|
||||
test_validation_data user_data;
|
||||
// set callback flag to false if callback called then it will change to true
|
||||
user_data.cb_triggered = false;
|
||||
user_data[ii].cb_triggered = false;
|
||||
// set the queue pointer
|
||||
user_data.queue_pointer = &queue[ii];
|
||||
user_data[ii].queue_pointer = &queue[ii];
|
||||
// set the expected status in queue error callback handling
|
||||
user_data.expected_status = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
|
||||
user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
|
||||
|
||||
// create queue
|
||||
err = hsa_queue_create(gpuAgent,
|
||||
queue_max, HSA_QUEUE_TYPE_SINGLE,
|
||||
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
|
||||
queue_max, HSA_QUEUE_TYPE_SINGLE,
|
||||
CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
|
||||
@@ -486,12 +487,12 @@ void QueueValidation::QueueValidationForInvalidPacket(hsa_agent_t cpuAgent,
|
||||
// completion signal should not be changed.
|
||||
ASSERT_EQ(completion, 1);
|
||||
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data.cb_triggered, true);
|
||||
|
||||
hsa_signal_store_relaxed(aql().completion_signal, 1);
|
||||
}
|
||||
sleep(1);
|
||||
for (ii = 0; ii < kMaxQueue; ++ii) {
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data[ii].cb_triggered, true);
|
||||
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
|
||||
}
|
||||
}
|
||||
@@ -514,22 +515,22 @@ void QueueValidation::QueueValidationForInvalidWorkGroupSize(hsa_agent_t cpuAgen
|
||||
queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent;
|
||||
|
||||
hsa_queue_t *queue[kMaxQueue]; // command queue
|
||||
test_validation_data user_data[kMaxQueue][3];
|
||||
uint32_t ii;
|
||||
for (ii = 0; ii < kMaxQueue; ++ii) {
|
||||
uint32_t jj;
|
||||
for (jj = 1; jj <= 3; ++jj) {
|
||||
test_validation_data user_data;
|
||||
// set callback flag to false if callback called then it will change to true
|
||||
user_data.cb_triggered = false;
|
||||
user_data[ii][jj - 1].cb_triggered = false;
|
||||
// set the queue pointer
|
||||
user_data.queue_pointer = &queue[ii];
|
||||
user_data[ii][jj - 1].queue_pointer = &queue[ii];
|
||||
// set the expected status in queue error callback handling
|
||||
user_data.expected_status = HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
user_data[ii][jj - 1].expected_status = HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
|
||||
// create queue
|
||||
err = hsa_queue_create(gpuAgent,
|
||||
kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE,
|
||||
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
|
||||
kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE,
|
||||
CallbackQueueErrorHandling, &user_data[ii][jj - 1], 0, 0, &queue[ii]);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
|
||||
@@ -574,13 +575,17 @@ void QueueValidation::QueueValidationForInvalidWorkGroupSize(hsa_agent_t cpuAgen
|
||||
// completion signal should not be changed.
|
||||
ASSERT_EQ(completion, 1);
|
||||
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data.cb_triggered, true);
|
||||
|
||||
hsa_signal_store_relaxed(aql().completion_signal, 1);
|
||||
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
|
||||
}
|
||||
}
|
||||
sleep(1);
|
||||
for (uint32_t ii = 0; ii < kMaxQueue; ++ii) {
|
||||
for (uint32_t jj = 0; jj < 3; ++jj) {
|
||||
// queue error handling callback should be triggered
|
||||
ASSERT_EQ(user_data[ii][jj].cb_triggered, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -54,7 +54,7 @@ set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCR-R
|
||||
set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/RadeonOpenCompute/ROCR-Runtime" )
|
||||
|
||||
|
||||
set(DEFAULT_TARGETS "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
|
||||
set(DEFAULT_TARGETS "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
|
||||
|
||||
#
|
||||
# Currently support for Windows platform is not present
|
||||
|
||||
@@ -128,11 +128,6 @@ TEST(rocrtst, Test_Example) {
|
||||
RunGenericTest(&tst);
|
||||
}
|
||||
|
||||
// IPC functional test: constructs an IPCTest and hands it to RunGenericTest.
TEST(rocrtstFunc, IPC) {
|
||||
IPCTest ipc;
|
||||
RunGenericTest(&ipc);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, MemoryAccessTests) {
|
||||
MemoryAccessTest mt;
|
||||
RunCustomTestProlog(&mt);
|
||||
@@ -175,7 +170,6 @@ TEST(rocrtstFunc, Concurrent_Shutdown) {
|
||||
RunCustomTestEpilog(&cs);
|
||||
}
|
||||
|
||||
|
||||
TEST(rocrtstFunc, Reference_Count) {
|
||||
ReferenceCountTest rc(true, false);
|
||||
RunCustomTestProlog(&rc);
|
||||
@@ -212,6 +206,11 @@ TEST(rocrtstFunc, Signal_Create_Concurrently) {
|
||||
}
|
||||
|
||||
#ifndef ROCRTST_EMULATOR_BUILD
|
||||
// IPC functional test: constructs an IPCTest and hands it to RunGenericTest.
// Sits directly after the `#ifndef ROCRTST_EMULATOR_BUILD` guard, so it is
// presumably excluded from emulator builds -- confirm where the guard closes.
TEST(rocrtstFunc, IPC) {
|
||||
IPCTest ipc;
|
||||
RunGenericTest(&ipc);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, DISABLED_Signal_Kernel_Set) {
|
||||
SignalKernelTest sk(SET);
|
||||
RunCustomTestProlog(&sk);
|
||||
|
||||
@@ -1151,6 +1151,27 @@ hsa_status_t HSA_API hsa_amd_signal_value_pointer(hsa_signal_t signal,
|
||||
return amdExtTable->hsa_amd_signal_value_pointer_fn(signal, value_ptr);
|
||||
}
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size,
|
||||
hsa_amd_svm_attribute_pair_t* attribute_list,
|
||||
size_t attribute_count) {
|
||||
return amdExtTable->hsa_amd_svm_attributes_set_fn(ptr, size, attribute_list, attribute_count);
|
||||
}
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size,
|
||||
hsa_amd_svm_attribute_pair_t* attribute_list,
|
||||
size_t attribute_count) {
|
||||
return amdExtTable->hsa_amd_svm_attributes_get_fn(ptr, size, attribute_list, attribute_count);
|
||||
}
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
|
||||
uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
|
||||
hsa_signal_t completion_signal) {
|
||||
return amdExtTable->hsa_amd_svm_prefetch_async_fn(ptr, size, agent, num_dep_signals, dep_signals, completion_signal);
|
||||
}
|
||||
|
||||
// Tools only table interfaces.
|
||||
namespace rocr {
|
||||
|
||||
|
||||
@@ -173,7 +173,8 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @param [in] node Node id. Each CPU in different socket will get distinct
|
||||
// id.
|
||||
// @param [in] node_props Node property.
|
||||
GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props);
|
||||
// @param [in] xnack_mode XNACK mode of device.
|
||||
GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode);
|
||||
|
||||
// @brief GPU agent destructor.
|
||||
~GpuAgent();
|
||||
|
||||
@@ -439,6 +439,32 @@ static const unsigned int kCodeTrapHandler9[] = {
|
||||
0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c,
|
||||
};
|
||||
|
||||
// Pre-assembled trap handler code words for gfx90a. GpuAgent::AssembleShader
// picks this table (via the compute_90a slot) when the ISA reports major
// version 9, minor version 0 and stepping 10; other major-9 parts keep using
// kCodeTrapHandler9. The words are opaque binary data -- do not edit by hand.
static const unsigned int kCodeTrapHandler90a[] = {
|
||||
0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73,
|
||||
0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900,
|
||||
0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a,
|
||||
0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff,
|
||||
0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001,
|
||||
0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f,
|
||||
0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f,
|
||||
0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f,
|
||||
0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80,
|
||||
0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044,
|
||||
0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100,
|
||||
0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4,
|
||||
0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e,
|
||||
0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c,
|
||||
0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff,
|
||||
0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f,
|
||||
0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f,
|
||||
0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37,
|
||||
0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010,
|
||||
0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f,
|
||||
0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001,
|
||||
0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e,
|
||||
0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeCopyAligned8[] = {
|
||||
0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020,
|
||||
0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050,
|
||||
|
||||
@@ -95,7 +95,7 @@ class MemoryRegion : public core::MemoryRegion {
|
||||
/// @brief Unpin memory.
|
||||
static void MakeKfdMemoryUnresident(const void* ptr);
|
||||
|
||||
MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
|
||||
MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner,
|
||||
const HsaMemoryProperties& mem_props);
|
||||
|
||||
~MemoryRegion();
|
||||
|
||||
@@ -246,6 +246,21 @@ hsa_status_t hsa_amd_deregister_deallocation_callback(
|
||||
hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal,
|
||||
volatile hsa_signal_value_t** value_ptr);
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size,
|
||||
hsa_amd_svm_attribute_pair_t* attribute_list,
|
||||
size_t attribute_count);
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size,
|
||||
hsa_amd_svm_attribute_pair_t* attribute_list,
|
||||
size_t attribute_count);
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
|
||||
uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
|
||||
hsa_signal_t completion_signal);
|
||||
|
||||
} // namespace amd
|
||||
} // namespace rocr
|
||||
|
||||
|
||||
@@ -57,8 +57,8 @@ class Agent;
|
||||
|
||||
class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
|
||||
public:
|
||||
MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner)
|
||||
: fine_grain_(fine_grain), full_profile_(full_profile), owner_(owner) {
|
||||
MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner)
|
||||
: fine_grain_(fine_grain), kernarg_(kernarg), full_profile_(full_profile), owner_(owner) {
|
||||
assert(owner_ != NULL);
|
||||
}
|
||||
|
||||
@@ -112,12 +112,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
|
||||
|
||||
__forceinline bool fine_grain() const { return fine_grain_; }
|
||||
|
||||
__forceinline bool kernarg() const { return kernarg_; }
|
||||
|
||||
__forceinline bool full_profile() const { return full_profile_; }
|
||||
|
||||
__forceinline core::Agent* owner() const { return owner_; }
|
||||
|
||||
private:
|
||||
const bool fine_grain_;
|
||||
const bool kernarg_;
|
||||
const bool full_profile_;
|
||||
|
||||
core::Agent* owner_;
|
||||
|
||||
@@ -292,6 +292,15 @@ class Runtime {
|
||||
|
||||
hsa_status_t IPCDetach(void* ptr);
|
||||
|
||||
hsa_status_t SetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
|
||||
size_t attribute_count);
|
||||
|
||||
hsa_status_t GetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
|
||||
size_t attribute_count);
|
||||
|
||||
hsa_status_t SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, uint32_t num_dep_signals,
|
||||
const hsa_signal_t* dep_signals, hsa_signal_t completion_signal);
|
||||
|
||||
const std::vector<Agent*>& cpu_agents() { return cpu_agents_; }
|
||||
|
||||
const std::vector<Agent*>& gpu_agents() { return gpu_agents_; }
|
||||
@@ -395,6 +404,28 @@ class Runtime {
|
||||
std::vector<void*> arg_;
|
||||
};
|
||||
|
||||
struct PrefetchRange;
|
||||
typedef std::map<uintptr_t, PrefetchRange> prefetch_map_t;
|
||||
|
||||
// Bookkeeping record for one SVM prefetch request (see Runtime::SvmPrefetch).
// NOTE(review): the per-field notes below are inferred from the field names
// and the SvmPrefetch signature only -- confirm against the implementation.
struct PrefetchOp {
|
||||
// presumably the start address of the range being prefetched
void* base;
|
||||
// presumably the length of the range in bytes
size_t size;
|
||||
// presumably the node the range is prefetched to
uint32_t node_id;
|
||||
// presumably the number of dependency signals not yet satisfied
int remaining_deps;
|
||||
// presumably the signal updated when the prefetch finishes
hsa_signal_t completion;
|
||||
// presumably the signals that must complete before the prefetch starts
std::vector<hsa_signal_t> dep_signals;
|
||||
// iterator into a prefetch_map_t; presumably this op's first range entry
prefetch_map_t::iterator prefetch_map_entry;
|
||||
};
|
||||
|
||||
struct PrefetchRange {
|
||||
PrefetchRange() {}
|
||||
PrefetchRange(size_t Bytes, PrefetchOp* Op) : bytes(Bytes), op(Op) {}
|
||||
size_t bytes;
|
||||
PrefetchOp* op;
|
||||
prefetch_map_t::iterator prev;
|
||||
prefetch_map_t::iterator next;
|
||||
};
|
||||
|
||||
// Will be created before any user could call hsa_init but also could be
|
||||
// destroyed before incorrectly written programs call hsa_shutdown.
|
||||
static KernelMutex bootstrap_lock_;
|
||||
@@ -444,6 +475,9 @@ class Runtime {
|
||||
/// @retval Index in ::link_matrix_.
|
||||
uint32_t GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to);
|
||||
|
||||
/// @brief Get most recently issued SVM prefetch agent for the range in question.
|
||||
Agent* GetSVMPrefetchAgent(void* ptr, size_t size);
|
||||
|
||||
// Mutex object to protect multithreaded access to ::allocation_map_,
|
||||
// KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo
|
||||
// registered & mapped arrays.
|
||||
@@ -485,6 +519,10 @@ class Runtime {
|
||||
// Contains the region, address, and size of previously allocated memory.
|
||||
std::map<const void*, AllocationRegion> allocation_map_;
|
||||
|
||||
// Pending prefetch containers.
|
||||
KernelMutex prefetch_lock_;
|
||||
prefetch_map_t prefetch_map_;
|
||||
|
||||
// Allocator using ::system_region_
|
||||
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)> system_allocator_;
|
||||
|
||||
|
||||
@@ -69,40 +69,31 @@ void CpuAgent::InitRegionList() {
|
||||
|
||||
std::vector<HsaMemoryProperties> mem_props(properties_.NumMemoryBanks);
|
||||
if (HSAKMT_STATUS_SUCCESS ==
|
||||
hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks,
|
||||
&mem_props[0])) {
|
||||
hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, &mem_props[0])) {
|
||||
std::vector<HsaMemoryProperties>::iterator system_prop =
|
||||
std::find_if(mem_props.begin(), mem_props.end(), [](HsaMemoryProperties prop) -> bool {
|
||||
return (prop.SizeInBytes > 0 && prop.HeapType == HSA_HEAPTYPE_SYSTEM);
|
||||
});
|
||||
|
||||
if (system_prop != mem_props.end()) {
|
||||
MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, *system_prop);
|
||||
HsaMemoryProperties system_props;
|
||||
std::memset(&system_props, 0, sizeof(HsaMemoryProperties));
|
||||
system_props.HeapType = HSA_HEAPTYPE_SYSTEM;
|
||||
system_props.SizeInBytes = 0;
|
||||
system_props.VirtualBaseAddress = 0;
|
||||
|
||||
regions_.push_back(system_region_fine);
|
||||
if (system_prop != mem_props.end()) system_props = *system_prop;
|
||||
|
||||
if (!is_apu_node) {
|
||||
MemoryRegion* system_region_coarse =
|
||||
new MemoryRegion(false, is_apu_node, this, *system_prop);
|
||||
MemoryRegion* system_region_fine =
|
||||
new MemoryRegion(true, false, is_apu_node, this, system_props);
|
||||
regions_.push_back(system_region_fine);
|
||||
MemoryRegion* system_region_kernarg =
|
||||
new MemoryRegion(true, true, is_apu_node, this, system_props);
|
||||
regions_.push_back(system_region_kernarg);
|
||||
|
||||
regions_.push_back(system_region_coarse);
|
||||
}
|
||||
} else {
|
||||
HsaMemoryProperties system_props;
|
||||
std::memset(&system_props, 0, sizeof(HsaMemoryProperties));
|
||||
|
||||
system_props.HeapType = HSA_HEAPTYPE_SYSTEM;
|
||||
system_props.SizeInBytes = 0;
|
||||
system_props.VirtualBaseAddress = 0;
|
||||
|
||||
MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, system_props);
|
||||
regions_.push_back(system_region_fine);
|
||||
|
||||
if (!is_apu_node) {
|
||||
MemoryRegion* system_region_coarse =
|
||||
new MemoryRegion(false, is_apu_node, this, system_props);
|
||||
regions_.push_back(system_region_coarse);
|
||||
}
|
||||
if (!is_apu_node) {
|
||||
MemoryRegion* system_region_coarse =
|
||||
new MemoryRegion(false, false, is_apu_node, this, *system_prop);
|
||||
regions_.push_back(system_region_coarse);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,7 +77,7 @@ extern HsaApiTable hsa_internal_api_table_;
|
||||
} // namespace core
|
||||
|
||||
namespace AMD {
|
||||
GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
|
||||
GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode)
|
||||
: GpuAgentInt(node),
|
||||
properties_(node_props),
|
||||
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
|
||||
@@ -112,16 +112,21 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
|
||||
|
||||
rocr::core::IsaFeature sramecc = rocr::core::IsaFeature::Unsupported;
|
||||
if (isa_base->IsSrameccSupported()) {
|
||||
sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1
|
||||
? core::IsaFeature::Enabled
|
||||
: core::IsaFeature::Disabled;
|
||||
sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled
|
||||
: core::IsaFeature::Disabled;
|
||||
// sramecc control for emulator.
|
||||
if (core::Runtime::runtime_singleton_->flag().sramecc() != Flag::FLAG_DEFAULT) {
|
||||
sramecc = core::Runtime::runtime_singleton_->flag().sramecc() == Flag::FLAG_ENABLE
|
||||
? core::IsaFeature::Enabled
|
||||
: core::IsaFeature::Disabled;
|
||||
}
|
||||
}
|
||||
|
||||
rocr::core::IsaFeature xnack = rocr::core::IsaFeature::Unsupported;
|
||||
if (isa_base->IsXnackSupported()) {
|
||||
// TODO: This needs to be obtained form KFD once HMM implemented.
|
||||
xnack = profile_ == HSA_PROFILE_FULL ? core::IsaFeature::Enabled
|
||||
: core::IsaFeature::Disabled;
|
||||
xnack = xnack_mode ? core::IsaFeature::Enabled
|
||||
: core::IsaFeature::Disabled;
|
||||
}
|
||||
|
||||
// Set instruction set architecture via node property, only on GPU device.
|
||||
@@ -202,6 +207,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
|
||||
ASICShader compute_7;
|
||||
ASICShader compute_8;
|
||||
ASICShader compute_9;
|
||||
ASICShader compute_90a;
|
||||
ASICShader compute_1010;
|
||||
ASICShader compute_10;
|
||||
};
|
||||
@@ -212,6 +218,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
|
||||
{NULL, 0, 0, 0},
|
||||
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},
|
||||
{kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4},
|
||||
{kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4},
|
||||
{kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4},
|
||||
{kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4},
|
||||
}},
|
||||
@@ -220,6 +227,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
|
||||
{kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12},
|
||||
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
|
||||
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
|
||||
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
|
||||
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},
|
||||
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},
|
||||
}},
|
||||
@@ -228,6 +236,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
|
||||
{kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},
|
||||
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
|
||||
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
|
||||
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
|
||||
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},
|
||||
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},
|
||||
}},
|
||||
@@ -236,6 +245,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
|
||||
{kCodeFill7, sizeof(kCodeFill7), 19, 8},
|
||||
{kCodeFill8, sizeof(kCodeFill8), 19, 8},
|
||||
{kCodeFill8, sizeof(kCodeFill8), 19, 8},
|
||||
{kCodeFill8, sizeof(kCodeFill8), 19, 8},
|
||||
{kCodeFill10, sizeof(kCodeFill10), 19, 8},
|
||||
{kCodeFill10, sizeof(kCodeFill10), 19, 8},
|
||||
}}};
|
||||
@@ -254,6 +264,9 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
|
||||
asic_shader = &compiled_shader_it->second.compute_8;
|
||||
break;
|
||||
case 9:
|
||||
if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10))
|
||||
asic_shader = &compiled_shader_it->second.compute_90a;
|
||||
else
|
||||
asic_shader = &compiled_shader_it->second.compute_9;
|
||||
break;
|
||||
case 10:
|
||||
@@ -302,6 +315,14 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2);
|
||||
AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1);
|
||||
|
||||
if ((isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) &&
|
||||
(isa_->GetStepping() == 10)) {
|
||||
// Program COMPUTE_PGM_RSRC3.ACCUM_OFFSET for 0 ACC VGPRs on gfx90a.
|
||||
// FIXME: Assemble code objects from source at build time
|
||||
int gran_accvgprs = ((gran_vgprs + 1) * 8) / 4 - 1;
|
||||
header->max_scratch_backing_memory_byte_size = uint64_t(gran_accvgprs) << 32;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy shader code into the GPU-visible buffer.
|
||||
@@ -338,8 +359,7 @@ void GpuAgent::InitRegionList() {
|
||||
memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
|
||||
case HSA_HEAPTYPE_GPU_LDS:
|
||||
case HSA_HEAPTYPE_GPU_SCRATCH: {
|
||||
MemoryRegion* region =
|
||||
new MemoryRegion(false, false, this, mem_props[mem_idx]);
|
||||
MemoryRegion* region = new MemoryRegion(false, false, false, this, mem_props[mem_idx]);
|
||||
|
||||
regions_.push_back(region);
|
||||
|
||||
@@ -348,7 +368,7 @@ void GpuAgent::InitRegionList() {
|
||||
// Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
|
||||
if ((properties_.HiveID != 0) ||
|
||||
(core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) {
|
||||
regions_.push_back(new MemoryRegion(true, false, this, mem_props[mem_idx]));
|
||||
regions_.push_back(new MemoryRegion(true, false, false, this, mem_props[mem_idx]));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
+10
-10
@@ -100,9 +100,9 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
|
||||
hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr));
|
||||
}
|
||||
|
||||
MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
|
||||
MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner,
|
||||
const HsaMemoryProperties& mem_props)
|
||||
: core::MemoryRegion(fine_grain, full_profile, owner),
|
||||
: core::MemoryRegion(fine_grain, kernarg, full_profile, owner),
|
||||
mem_props_(mem_props),
|
||||
max_single_alloc_size_(0),
|
||||
virtual_size_(0),
|
||||
@@ -128,6 +128,8 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne
|
||||
mem_flag_.ui32.HostAccess = 1;
|
||||
mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED;
|
||||
|
||||
if (kernarg) mem_flag_.ui32.Uncached = 1;
|
||||
|
||||
virtual_size_ =
|
||||
(full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize;
|
||||
}
|
||||
@@ -290,16 +292,14 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute,
|
||||
case HSA_REGION_INFO_GLOBAL_FLAGS:
|
||||
switch (mem_props_.HeapType) {
|
||||
case HSA_HEAPTYPE_SYSTEM:
|
||||
*((uint32_t*)value) = fine_grain()
|
||||
? (HSA_REGION_GLOBAL_FLAG_KERNARG |
|
||||
HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
|
||||
: HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED;
|
||||
break;
|
||||
case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
|
||||
case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
|
||||
*((uint32_t*)value) = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
|
||||
: HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED;
|
||||
case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: {
|
||||
uint32_t ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
|
||||
: HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED;
|
||||
if (kernarg()) ret |= HSA_REGION_GLOBAL_FLAG_KERNARG;
|
||||
*((uint32_t*)value) = ret;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
*((uint32_t*)value) = 0;
|
||||
break;
|
||||
|
||||
@@ -68,6 +68,38 @@ namespace AMD {
|
||||
static const uint kKfdVersionMajor = 0;
|
||||
static const uint kKfdVersionMinor = 99;
|
||||
|
||||
// Query for user preference and use that to determine Xnack mode of ROCm system.
|
||||
// Return true if Xnack mode is ON or false if OFF. Xnack mode of a system is
|
||||
// orthogonal to devices that do not support Xnack mode. It is legal for a
|
||||
// system with Xnack ON to have devices that do not support Xnack functionality.
|
||||
bool BindXnackMode() {
|
||||
// Get users' preference for Xnack mode of ROCm platform
|
||||
HSAint32 mode;
|
||||
mode = core::Runtime::runtime_singleton_->flag().xnack();
|
||||
bool config_xnack =
|
||||
(core::Runtime::runtime_singleton_->flag().xnack() != Flag::XNACK_REQUEST::XNACK_UNCHANGED);
|
||||
|
||||
// Indicate to driver users' preference for Xnack mode
|
||||
// Call to driver can fail and is a supported feature
|
||||
HSAKMT_STATUS status = HSAKMT_STATUS_ERROR;
|
||||
if (config_xnack) {
|
||||
status = hsaKmtSetXNACKMode(mode);
|
||||
if (status == HSAKMT_STATUS_SUCCESS) {
|
||||
return mode;
|
||||
}
|
||||
}
|
||||
|
||||
// Get Xnack mode of devices bound by driver. This could happen
|
||||
// when a call to SET Xnack mode fails or user has no particular
|
||||
// preference
|
||||
status = hsaKmtGetXNACKMode((HSAint32*)&mode);
|
||||
if(status != HSAKMT_STATUS_SUCCESS) {
|
||||
debug_print("KFD does not support xnack mode query.\nROCr must assume xnack is disabled.\n");
|
||||
return false;
|
||||
}
|
||||
return mode;
|
||||
}
|
||||
|
||||
CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
|
||||
if (node_prop.NumCPUCores == 0) {
|
||||
return nullptr;
|
||||
@@ -79,14 +111,14 @@ CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
|
||||
return cpu;
|
||||
}
|
||||
|
||||
GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
|
||||
GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnack_mode) {
|
||||
GpuAgent* gpu = nullptr;
|
||||
if (node_prop.NumFComputeCores == 0) {
|
||||
// Ignore non GPUs.
|
||||
return nullptr;
|
||||
}
|
||||
try {
|
||||
gpu = new GpuAgent(node_id, node_prop);
|
||||
gpu = new GpuAgent(node_id, node_prop, xnack_mode);
|
||||
|
||||
const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion();
|
||||
|
||||
@@ -174,20 +206,26 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
|
||||
link_info.atomic_support_32bit = true;
|
||||
link_info.atomic_support_64bit = true;
|
||||
link_info.coherent_support = true;
|
||||
if (core::Runtime::runtime_singleton_->flag().patch_xgmi_link_weight()) {
|
||||
io_link.Weight = 15;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
debug_print("Unrecognized IOLINK type.\n");
|
||||
break;
|
||||
}
|
||||
|
||||
if (io_link.Flags.ui32.Override == 1) {
|
||||
if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) {
|
||||
// Ignore this link since peer to peer is not allowed.
|
||||
continue;
|
||||
// KFD is reporting wrong override status for XGMI. Disallow override for bringup.
|
||||
if (!core::Runtime::runtime_singleton_->flag().patch_link_override()) {
|
||||
if (io_link.Flags.ui32.Override == 1) {
|
||||
if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) {
|
||||
// Ignore this link since peer to peer is not allowed.
|
||||
continue;
|
||||
}
|
||||
link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0);
|
||||
link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0);
|
||||
link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0);
|
||||
}
|
||||
link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0);
|
||||
link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0);
|
||||
link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0);
|
||||
}
|
||||
|
||||
link_info.max_bandwidth = io_link.MaximumBandwidth;
|
||||
@@ -204,7 +242,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
|
||||
/**
|
||||
* Process the list of Gpus that are surfaced to user
|
||||
*/
|
||||
static void SurfaceGpuList(std::vector<int32_t>& gpu_list) {
|
||||
static void SurfaceGpuList(std::vector<int32_t>& gpu_list, bool xnack_mode) {
|
||||
// Process user visible Gpu devices
|
||||
int32_t invalidIdx = -1;
|
||||
int32_t list_sz = gpu_list.size();
|
||||
@@ -221,7 +259,7 @@ static void SurfaceGpuList(std::vector<int32_t>& gpu_list) {
|
||||
// Instantiate a Gpu device. The IO links
|
||||
// of this node have already been registered
|
||||
assert((node_prop.NumFComputeCores != 0) && "Improper node used for GPU device discovery.");
|
||||
DiscoverGpu(gpu_list[idx], node_prop);
|
||||
DiscoverGpu(gpu_list[idx], node_prop, xnack_mode);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -305,8 +343,11 @@ void BuildTopology() {
|
||||
RegisterLinkInfo(node_id, node_prop.NumIOLinks);
|
||||
}
|
||||
|
||||
// Determine the Xnack mode to be bound for system
|
||||
bool xnack_mode = BindXnackMode();
|
||||
|
||||
// Instantiate ROCr objects to encapsulate Gpu devices
|
||||
SurfaceGpuList(gpu_usr_list);
|
||||
SurfaceGpuList(gpu_usr_list, xnack_mode);
|
||||
}
|
||||
|
||||
bool Load() {
|
||||
|
||||
@@ -391,6 +391,9 @@ void HsaApiTable::UpdateAmdExts() {
|
||||
amd_ext_api.hsa_amd_register_deallocation_callback_fn = AMD::hsa_amd_register_deallocation_callback;
|
||||
amd_ext_api.hsa_amd_deregister_deallocation_callback_fn = AMD::hsa_amd_deregister_deallocation_callback;
|
||||
amd_ext_api.hsa_amd_signal_value_pointer_fn = AMD::hsa_amd_signal_value_pointer;
|
||||
amd_ext_api.hsa_amd_svm_attributes_set_fn = AMD::hsa_amd_svm_attributes_set;
|
||||
amd_ext_api.hsa_amd_svm_attributes_get_fn = AMD::hsa_amd_svm_attributes_get;
|
||||
amd_ext_api.hsa_amd_svm_prefetch_async_fn = AMD::hsa_amd_svm_prefetch_async;
|
||||
}
|
||||
|
||||
void LoadInitialHsaApiTable() {
|
||||
|
||||
@@ -1009,5 +1009,37 @@ hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifie
|
||||
CATCH;
|
||||
}
|
||||
|
||||
// API entry point for setting SVM attributes on [ptr, ptr + size).
// Forwards the arguments unchanged to Runtime::SetSvmAttrib and returns its
// status. TRY/IS_OPEN/CATCH are the file's usual entry-point guards
// (presumably exception translation and a runtime-initialised check; see
// their definitions).
hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
                                        hsa_amd_svm_attribute_pair_t* attribute_list,
                                        size_t attribute_count) {
  TRY;
  IS_OPEN();
  const hsa_status_t status = core::Runtime::runtime_singleton_->SetSvmAttrib(
      ptr, size, attribute_list, attribute_count);
  return status;
  CATCH;
}
|
||||
|
||||
// API entry point for querying SVM attributes of [ptr, ptr + size).
// Forwards the arguments unchanged to Runtime::GetSvmAttrib, which writes the
// results back into attribute_list, and returns its status. TRY/IS_OPEN/CATCH
// are the file's usual entry-point guards (presumably exception translation
// and a runtime-initialised check; see their definitions).
hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
                                        hsa_amd_svm_attribute_pair_t* attribute_list,
                                        size_t attribute_count) {
  TRY;
  IS_OPEN();
  const hsa_status_t status = core::Runtime::runtime_singleton_->GetSvmAttrib(
      ptr, size, attribute_list, attribute_count);
  return status;
  CATCH;
}
|
||||
|
||||
// API entry point for an asynchronous SVM prefetch of [ptr, ptr + size) to
// `agent`. Forwards the arguments unchanged to Runtime::SvmPrefetch and
// returns its status. TRY/IS_OPEN/CATCH are the file's usual entry-point
// guards (presumably exception translation and a runtime-initialised check;
// see their definitions).
hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
                                        hsa_signal_t completion_signal) {
  TRY;
  IS_OPEN();
  // Input validation is not performed here yet; the intended signal-type
  // check is kept below for reference.
  // if (core::g_use_interrupt_wait && (!core::InterruptSignal::IsType(signal)))
  const hsa_status_t status = core::Runtime::runtime_singleton_->SvmPrefetch(
      ptr, size, agent, num_dep_signals, dep_signals, completion_signal);
  return status;
  CATCH;
}
|
||||
|
||||
} // namespace amd
|
||||
} // namespace rocr
|
||||
|
||||
@@ -281,6 +281,15 @@ constexpr size_t hsa_name_size = 63;
|
||||
ISAREG_ENTRY_GEN("gfx908:sramecc-:xnack+", 9, 0, 8, disabled, enabled)
|
||||
ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack-", 9, 0, 8, enabled, disabled)
|
||||
ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack+", 9, 0, 8, enabled, enabled)
|
||||
ISAREG_ENTRY_GEN("gfx90a", 9, 0, 10, any, any)
|
||||
ISAREG_ENTRY_GEN("gfx90a:xnack-", 9, 0, 10, any, disabled)
|
||||
ISAREG_ENTRY_GEN("gfx90a:xnack+", 9, 0, 10, any, enabled)
|
||||
ISAREG_ENTRY_GEN("gfx90a:sramecc-", 9, 0, 10, disabled, any)
|
||||
ISAREG_ENTRY_GEN("gfx90a:sramecc+", 9, 0, 10, enabled, any)
|
||||
ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack-", 9, 0, 10, disabled, disabled)
|
||||
ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack+", 9, 0, 10, disabled, enabled)
|
||||
ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack-", 9, 0, 10, enabled, disabled)
|
||||
ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack+", 9, 0, 10, enabled, enabled)
|
||||
ISAREG_ENTRY_GEN("gfx1010", 10, 1, 0, unsupported, any)
|
||||
ISAREG_ENTRY_GEN("gfx1010:xnack-", 10, 1, 0, unsupported, disabled)
|
||||
ISAREG_ENTRY_GEN("gfx1010:xnack+", 10, 1, 0, unsupported, enabled)
|
||||
|
||||
@@ -60,6 +60,7 @@
|
||||
#include "core/inc/hsa_ext_amd_impl.h"
|
||||
#include "core/inc/hsa_api_trace_int.h"
|
||||
#include "core/util/os.h"
|
||||
#include "core/inc/exceptions.h"
|
||||
#include "inc/hsa_ven_amd_aqlprofile.h"
|
||||
|
||||
#define HSA_VERSION_MAJOR 1
|
||||
@@ -173,16 +174,29 @@ void Runtime::RegisterAgent(Agent* agent) {
|
||||
if (cpu_agents_.size() == 1) {
|
||||
// Might need memory pooling to cover allocation that
|
||||
// requires less than 4096 bytes.
|
||||
system_allocator_ = [this](size_t size, size_t align, MemoryRegion::AllocateFlags alloc_flags) -> void* {
|
||||
assert(align <= 4096);
|
||||
void* ptr = nullptr;
|
||||
core::Runtime::runtime_singleton_->AllocateMemory(system_regions_fine_[0], size, alloc_flags, &ptr);
|
||||
return ptr;
|
||||
};
|
||||
|
||||
system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
|
||||
// Default system pool must support kernarg
|
||||
for (auto pool : system_regions_fine_) {
|
||||
if (pool->kernarg()) {
|
||||
system_allocator_ = [pool](size_t size, size_t alignment,
|
||||
MemoryRegion::AllocateFlags alloc_flags) -> void* {
|
||||
assert(alignment <= 4096);
|
||||
void* ptr = NULL;
|
||||
return (HSA_STATUS_SUCCESS ==
|
||||
core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
|
||||
&ptr))
|
||||
? ptr
|
||||
: NULL;
|
||||
};
|
||||
|
||||
BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
|
||||
system_deallocator_ = [](void* ptr) {
|
||||
core::Runtime::runtime_singleton_->FreeMemory(ptr);
|
||||
};
|
||||
|
||||
BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) {
|
||||
gpu_agents_.push_back(agent);
|
||||
@@ -630,6 +644,18 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) {
|
||||
*(const char**)value = STRING(ROCR_BUILD_ID);
|
||||
break;
|
||||
}
|
||||
case HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: {
|
||||
// todo: Get HMM kernel support info.
|
||||
*(bool*)value = true;
|
||||
break;
|
||||
}
|
||||
case HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: {
|
||||
bool ret = true;
|
||||
for(auto agent : gpu_agents_)
|
||||
ret &= (agent->isa()->GetXnack() == IsaFeature::Enabled);
|
||||
*(bool*)value = ret;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
@@ -1572,5 +1598,499 @@ void Runtime::InternalQueueCreateNotify(const hsa_queue_t* queue, hsa_agent_t ag
|
||||
internal_queue_create_notifier_(queue, agent, internal_queue_create_notifier_user_data_);
|
||||
}
|
||||
|
||||
// Applies a list of HSA SVM attributes to the page-aligned range that covers
// [ptr, ptr + size).
//
// Each entry of attribute_list is translated into a thunk HSA_SVM_ATTRIBUTE
// pair or folded into the set/clear flag masks; the result is handed to
// hsaKmtSVMSetAttr in a single call.
//
// @param ptr             Start of the address range.
// @param size            Length of the range in bytes.
// @param attribute_list  Attributes to apply.
// @param attribute_count Number of entries in attribute_list.
// @return HSA_STATUS_SUCCESS when the driver accepted the attributes.
// @throws AMD::hsa_exception
//   - HSA_STATUS_ERROR_INVALID_AGENT for an invalid agent handle,
//   - HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS when a non-agent attribute is
//     repeated or several access attributes name the same agent,
//   - HSA_STATUS_ERROR_INVALID_ARGUMENT for an unknown attribute or value,
//   - HSA_STATUS_ERROR when hsaKmtSVMSetAttr fails.
hsa_status_t Runtime::SetSvmAttrib(void* ptr, size_t size,
                                   hsa_amd_svm_attribute_pair_t* attribute_list,
                                   size_t attribute_count) {
  // Bitmask of non-agent attributes already seen (bit index == attribute id).
  uint32_t set_attribs = 0;
  // Per-node marker used to reject several access attributes for one agent.
  std::vector<bool> agent_seen(agents_by_node_.size(), false);

  std::vector<HSA_SVM_ATTRIBUTE> attribs;
  attribs.reserve(attribute_count);
  uint32_t set_flags = 0;
  uint32_t clear_flags = 0;

  // Handle -> Agent*, rejecting null and invalid handles.
  auto Convert = [&](uint64_t value) -> Agent* {
    hsa_agent_t handle = {value};
    Agent* agent = Agent::Convert(handle);
    if ((agent == nullptr) || !agent->IsValid())
      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
                               "Invalid agent handle in Runtime::SetSvmAttrib.");
    return agent;
  };

  // Handle -> Agent*, where a null handle is a legal "no agent" value.
  auto ConvertAllowNull = [&](uint64_t value) -> Agent* {
    hsa_agent_t handle = {value};
    Agent* agent = Agent::Convert(handle);
    if ((agent != nullptr) && (!agent->IsValid()))
      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
                               "Invalid agent handle in Runtime::SetSvmAttrib.");
    return agent;
  };

  // Records that an access attribute was given for this agent's node.
  auto ConfirmNew = [&](Agent* agent) {
    const size_t node = agent->node_id();
    // Node ids are not guaranteed here to be smaller than the number of
    // registered nodes; grow the marker instead of indexing out of bounds.
    if (node >= agent_seen.size()) agent_seen.resize(node + 1, false);
    if (agent_seen[node])
      throw AMD::hsa_exception(
          HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS,
          "Multiple attributes given for the same agent in Runtime::SetSvmAttrib.");
    agent_seen[node] = true;
  };

  // Rejects a non-agent attribute that appears more than once.
  auto Check = [&](uint64_t attrib) {
    // Unsigned shift: avoids signed-shift pitfalls of `1 << attrib`.
    const uint32_t bit = uint32_t(1) << attrib;
    if (set_attribs & bit)
      throw AMD::hsa_exception(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS,
                               "Attribute given multiple times in Runtime::SetSvmAttrib.");
    set_attribs |= bit;
  };

  auto kmtPair = [](uint32_t attrib, uint32_t value) {
    HSA_SVM_ATTRIBUTE pair = {attrib, value};
    return pair;
  };

  for (size_t i = 0; i < attribute_count; i++) {
    auto attrib = attribute_list[i].attribute;
    auto value = attribute_list[i].value;

    switch (attrib) {
      case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: {
        Check(attrib);
        switch (value) {
          case HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED:
            set_flags |= HSA_SVM_FLAG_COHERENT;
            break;
          case HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED:
            clear_flags |= HSA_SVM_FLAG_COHERENT;
            break;
          default:
            throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                                     "Invalid HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG value.");
        }
        break;
      }
      case HSA_AMD_SVM_ATTRIB_READ_ONLY: {
        Check(attrib);
        if (value)
          set_flags |= HSA_SVM_FLAG_GPU_RO;
        else
          clear_flags |= HSA_SVM_FLAG_GPU_RO;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: {
        Check(attrib);
        if (value)
          set_flags |= HSA_SVM_FLAG_HIVE_LOCAL;
        else
          clear_flags |= HSA_SVM_FLAG_HIVE_LOCAL;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: {
        Check(attrib);
        // Max migration size is 1GB.
        if (value > 18) value = 18;
        attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, value));
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: {
        Check(attrib);
        Agent* agent = ConvertAllowNull(value);
        if (agent == nullptr)
          attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, INVALID_NODEID));
        else
          attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, agent->node_id()));
        break;
      }
      case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE: {
        Agent* agent = Convert(value);
        ConfirmNew(agent);
        // CPU access is expressed through the host-access flag; GPU access
        // through a per-node attribute.
        if (agent->device_type() == Agent::kAmdCpuDevice) {
          set_flags |= HSA_SVM_FLAG_HOST_ACCESS;
        } else {
          attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id()));
        }
        break;
      }
      case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE: {
        Agent* agent = Convert(value);
        ConfirmNew(agent);
        if (agent->device_type() == Agent::kAmdCpuDevice) {
          set_flags |= HSA_SVM_FLAG_HOST_ACCESS;
        } else {
          attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS_IN_PLACE, agent->node_id()));
        }
        break;
      }
      case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS: {
        Agent* agent = Convert(value);
        ConfirmNew(agent);
        if (agent->device_type() == Agent::kAmdCpuDevice) {
          clear_flags |= HSA_SVM_FLAG_HOST_ACCESS;
        } else {
          attribs.push_back(kmtPair(HSA_SVM_ATTR_NO_ACCESS, agent->node_id()));
        }
        break;
      }
      default:
        throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                                 "Illegal or invalid attribute in Runtime::SetSvmAttrib");
    }
  }

  // Merge CPU access properties - grant access if any CPU needs access.
  // Probably wrong.
  if (set_flags & HSA_SVM_FLAG_HOST_ACCESS) clear_flags &= ~HSA_SVM_FLAG_HOST_ACCESS;

  // Add flag updates
  if (clear_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_CLR_FLAGS, clear_flags));
  if (set_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, set_flags));

  // The driver is given the 4 KiB aligned range that encloses the request.
  uint8_t* base = AlignDown((uint8_t*)ptr, 4096);
  uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096);
  size_t len = end - base;
  // data() rather than &attribs[0]: the list may be empty (e.g. no attributes
  // were supplied), in which case indexing element 0 is undefined behaviour.
  HSAKMT_STATUS error = hsaKmtSVMSetAttr(base, len, attribs.size(), attribs.data());
  if (error != HSAKMT_STATUS_SUCCESS)
    throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMSetAttr failed.");

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Queries SVM attributes for the page-aligned range that covers
// [ptr, ptr + size) and writes the answers back into attribute_list.
//
// The work is done in two passes. The first pass turns every requested
// attribute into a thunk query (or marks it as answered from the flags word /
// the prefetch map); the second pass, after a single hsaKmtSVMGetAttr call,
// translates the results back.
//
// @param ptr             Start of the address range.
// @param size            Length of the range in bytes.
// @param attribute_list  In/out. `value` receives the result for ordinary
//                        attributes. For HSA_AMD_SVM_ATTRIB_ACCESS_QUERY,
//                        `value` is the agent being asked about and
//                        `attribute` is overwritten with the resulting
//                        access attribute.
// @param attribute_count Number of entries in attribute_list.
// @return HSA_STATUS_SUCCESS.
// @throws AMD::hsa_exception
//   - HSA_STATUS_ERROR_INVALID_AGENT for an invalid agent handle,
//   - HSA_STATUS_ERROR_INVALID_ARGUMENT for an unsupported attribute,
//   - HSA_STATUS_ERROR when hsaKmtSVMGetAttr fails.
hsa_status_t Runtime::GetSvmAttrib(void* ptr, size_t size,
                                   hsa_amd_svm_attribute_pair_t* attribute_list,
                                   size_t attribute_count) {
  std::vector<HSA_SVM_ATTRIBUTE> attribs;
  attribs.reserve(attribute_count);

  // For each request: index of its entry in `attribs`, or -1 when the answer
  // does not come from a dedicated thunk attribute.
  std::vector<int> kmtIndices(attribute_count);

  // Set when any request must be answered from the SVM flags word.
  bool getFlags = false;

  auto Convert = [&](uint64_t value) -> Agent* {
    hsa_agent_t handle = {value};
    Agent* agent = Agent::Convert(handle);
    if ((agent == nullptr) || !agent->IsValid())
      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
                               "Invalid agent handle in Runtime::GetSvmAttrib.");
    return agent;
  };

  auto kmtPair = [](uint32_t attrib, uint32_t value) {
    HSA_SVM_ATTRIBUTE pair = {attrib, value};
    return pair;
  };

  // Pass 1: build the thunk query.
  for (size_t i = 0; i < attribute_count; i++) {
    auto& attrib = attribute_list[i].attribute;
    auto& value = attribute_list[i].value;

    switch (attrib) {
      case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG:
      case HSA_AMD_SVM_ATTRIB_READ_ONLY:
      case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: {
        getFlags = true;
        kmtIndices[i] = -1;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: {
        kmtIndices[i] = static_cast<int>(attribs.size());
        attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, 0));
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: {
        kmtIndices[i] = static_cast<int>(attribs.size());
        attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, 0));
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: {
        // Answered immediately; nothing to ask the thunk for in pass 2.
        value = Agent::Convert(GetSVMPrefetchAgent(ptr, size)).handle;
        kmtIndices[i] = -1;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: {
        Agent* agent = Convert(value);
        // CPU access is read from the host-access flag; GPU access from a
        // per-node attribute.
        if (agent->device_type() == Agent::kAmdCpuDevice) {
          getFlags = true;
          kmtIndices[i] = -1;
        } else {
          kmtIndices[i] = static_cast<int>(attribs.size());
          attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id()));
        }
        break;
      }
      default:
        throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                                 "Illegal or invalid attribute in Runtime::GetSvmAttrib");
    }
  }

  // The flags query, when needed, is always the last element of `attribs`.
  if (getFlags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, 0));

  // The driver is given the 4 KiB aligned range that encloses the request.
  uint8_t* base = AlignDown((uint8_t*)ptr, 4096);
  uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096);
  size_t len = end - base;
  if (attribs.size() != 0) {
    HSAKMT_STATUS error = hsaKmtSVMGetAttr(base, len, attribs.size(), &attribs[0]);
    if (error != HSAKMT_STATUS_SUCCESS)
      throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMGetAttr failed.");
  }

  // Flags word returned by the driver. Only read by cases that set getFlags
  // in pass 1, so the fallback value is never observed.
  const auto flags = getFlags ? attribs.back().value : 0;

  // Pass 2: translate the thunk results back into HSA terms.
  for (size_t i = 0; i < attribute_count; i++) {
    auto& attrib = attribute_list[i].attribute;
    auto& value = attribute_list[i].value;

    switch (attrib) {
      case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: {
        if (flags & HSA_SVM_FLAG_COHERENT)
          value = HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED;
        else
          value = HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_READ_ONLY: {
        value = (flags & HSA_SVM_FLAG_GPU_RO);
        break;
      }
      case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: {
        value = (flags & HSA_SVM_FLAG_HIVE_LOCAL);
        break;
      }
      case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: {
        value = attribs[kmtIndices[i]].value;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: {
        uint64_t node = attribs[kmtIndices[i]].value;
        Agent* agent = nullptr;
        if (node != INVALID_NODEID) agent = agents_by_node_[node][0];
        value = Agent::Convert(agent).handle;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: {
        // Already answered in pass 1.
        break;
      }
      case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: {
        if (kmtIndices[i] == -1) {
          // CPU agent. Always rewrite the attribute so the caller never gets
          // the query token back as if it were an answer.
          if (flags & HSA_SVM_FLAG_HOST_ACCESS)
            attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE;
          else
            attrib = HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS;
        } else {
          // GPU agent: the driver reports the access kind through the
          // attribute type it hands back.
          switch (attribs[kmtIndices[i]].type) {
            case HSA_SVM_ATTR_ACCESS:
              attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE;
              break;
            case HSA_SVM_ATTR_ACCESS_IN_PLACE:
              attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE;
              break;
            case HSA_SVM_ATTR_NO_ACCESS:
              attrib = HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS;
              break;
            default:
              assert(false && "Bad agent accessibility from KFD.");
          }
        }
        break;
      }
      default:
        throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                                 "Illegal or invalid attribute in Runtime::GetSvmAttrib");
    }
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Schedules an asynchronous prefetch of the page-aligned range that covers
// [ptr, ptr + size) to the node of `agent`.
//
// The request is recorded in prefetch_map_ (replacing or trimming any older
// pending ranges it overlaps) and is issued to the driver once every
// dependency signal has reached 0. When the prefetch has been issued the
// range is removed from the map and, if completion_signal is non-null, that
// signal is decremented by one.
//
// @param ptr               Start of the address range.
// @param size              Length of the range in bytes.
// @param agent             Destination agent. A CPU agent maps to node 0.
// @param num_dep_signals   Number of entries in dep_signals.
// @param dep_signals       Signals that must reach 0 before the prefetch.
// @param completion_signal Signal decremented after the prefetch is issued;
//                          a zero handle means "none".
// @return HSA_STATUS_SUCCESS once the request has been queued.
// @throws AMD::hsa_exception
//   - HSA_STATUS_ERROR_INVALID_AGENT for an invalid agent handle,
//   - the failing status when the async handler cannot be installed.
hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
                                  uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
                                  hsa_signal_t completion_signal) {
  // Validate the destination before anything is allocated or recorded; the
  // handle is dereferenced below.
  Agent* dest = Agent::Convert(agent);
  if ((dest == nullptr) || !dest->IsValid())
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
                             "Invalid agent handle in Runtime::SvmPrefetch.");

  uintptr_t base = reinterpret_cast<uintptr_t>(AlignDown(ptr, 4096));
  uintptr_t end = AlignUp(reinterpret_cast<uintptr_t>(ptr) + size, 4096);
  size_t len = end - base;

  PrefetchOp* op = new PrefetchOp();
  // Frees the op on any early exit; dismissed once a handler owns it.
  MAKE_NAMED_SCOPE_GUARD(OpGuard, [&]() { delete op; });

  if (dest->device_type() == Agent::kAmdCpuDevice)
    op->node_id = 0;
  else
    op->node_id = dest->node_id();

  op->base = reinterpret_cast<void*>(base);
  op->size = len;
  op->completion = completion_signal;
  // The last dependency is waited on first (see below); the remaining ones
  // are stored and consumed back to front by the signal handler.
  if (num_dep_signals > 1) {
    op->remaining_deps = num_dep_signals - 1;
    for (uint32_t i = 0; i + 1 < num_dep_signals; i++) op->dep_signals.push_back(dep_signals[i]);
  } else {
    op->remaining_deps = 0;
  }

  {
    ScopedAcquire<KernelMutex> lock(&prefetch_lock_);
    // Remove all fully overlapped and trim partially overlapped ranges.
    // Get iteration bounds
    auto start = prefetch_map_.upper_bound(base);
    if (start != prefetch_map_.begin()) start--;
    auto stop = prefetch_map_.lower_bound(end);

    // Ranges belonging to one op form a doubly linked list through
    // prev/next, with prefetch_map_.end() as the terminator on both sides.
    auto isEndNode = [&](decltype(start) node) { return node->second.next == prefetch_map_.end(); };
    auto isFirstNode = [&](decltype(start) node) {
      return node->second.prev == prefetch_map_.end();
    };

    // Trim and remove old ranges.
    while (start != stop) {
      uintptr_t startBase = start->first;
      uintptr_t startEnd = startBase + start->second.bytes;

      auto ibase = Max(startBase, base);
      auto iend = Min(startEnd, end);
      // Check for overlap
      if (ibase < iend) {
        // Second range check: the old range extends past the new one, so
        // its tail survives as a separate entry linked after `start`.
        if (iend < startEnd) {
          auto ret = prefetch_map_.insert(
              std::make_pair(iend, PrefetchRange(startEnd - iend, start->second.op)));
          assert(ret.second && "Prefetch map insert failed during range split.");

          auto it = ret.first;
          it->second.prev = start;
          it->second.next = start->second.next;
          start->second.next = it;
          if (!isEndNode(it)) it->second.next->second.prev = it;
        }

        // Is the first interval of the old range valid
        if (startBase < ibase) {
          start->second.bytes = ibase - startBase;
        } else {
          // Nothing of the old range remains in front: unlink it from its
          // op's list and drop it.
          if (isFirstNode(start)) {
            start->second.op->prefetch_map_entry = start->second.next;
            if (!isEndNode(start)) start->second.next->second.prev = prefetch_map_.end();
          } else {
            start->second.prev->second.next = start->second.next;
            if (!isEndNode(start)) start->second.next->second.prev = start->second.prev;
          }
          // erase() invalidates `start`; continue from the iterator it
          // returns instead of incrementing the dead one.
          start = prefetch_map_.erase(start);
          continue;
        }
      }
      start++;
    }

    // Insert new range.
    auto ret = prefetch_map_.insert(std::make_pair(base, PrefetchRange(len, op)));
    assert(ret.second && "Prefetch map insert failed.");

    auto it = ret.first;
    op->prefetch_map_entry = it;
    it->second.next = it->second.prev = prefetch_map_.end();
  }

  // Remove the prefetch's ranges from the map.
  static auto removePrefetchRanges = [](PrefetchOp* op) {
    ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
    auto it = op->prefetch_map_entry;
    while (it != Runtime::runtime_singleton_->prefetch_map_.end()) {
      auto next = it->second.next;
      Runtime::runtime_singleton_->prefetch_map_.erase(it);
      it = next;
    }
  };

  // Prefetch Signal handler for synchronization.
  // Re-arms itself on the next outstanding dependency; once none are left it
  // issues the prefetch, cleans up and signals completion. Always returns
  // false.
  static hsa_amd_signal_handler signal_handler = [](hsa_signal_value_t value, void* arg) {
    PrefetchOp* op = reinterpret_cast<PrefetchOp*>(arg);

    if (op->remaining_deps > 0) {
      op->remaining_deps--;
      Runtime::runtime_singleton_->SetAsyncSignalHandler(
          op->dep_signals[op->remaining_deps], HSA_SIGNAL_CONDITION_EQ, 0, signal_handler, arg);
      return false;
    }

    HSA_SVM_ATTRIBUTE attrib;
    attrib.type = HSA_SVM_ATTR_PREFETCH_LOC;
    attrib.value = op->node_id;
    HSAKMT_STATUS error = hsaKmtSVMSetAttr(op->base, op->size, 1, &attrib);
    assert(error == HSAKMT_STATUS_SUCCESS && "KFD Prefetch failed.");

    removePrefetchRanges(op);

    if (op->completion.handle != 0) Signal::Convert(op->completion)->SubRelaxed(1);
    delete op;

    return false;
  };

  auto no_dependencies = [](void* arg) { signal_handler(0, arg); };

  // Undo the map insertion if the handler cannot be installed.
  MAKE_NAMED_SCOPE_GUARD(RangeGuard, [&]() { removePrefetchRanges(op); });

  hsa_status_t err;
  if (num_dep_signals == 0)
    err = AMD::hsa_amd_async_function(no_dependencies, op);
  else
    err = SetAsyncSignalHandler(dep_signals[num_dep_signals - 1], HSA_SIGNAL_CONDITION_EQ, 0,
                                signal_handler, op);
  if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "Signal handler unable to be set.");

  // From here on the handler owns both the op and its map entries.
  RangeGuard.Dismiss();
  OpGuard.Dismiss();
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Reports the agent that the page-aligned range covering [ptr, ptr + size)
// is being, or was last, prefetched to.
//
// Pending prefetches recorded in prefetch_map_ are consulted first; the
// parts of the query range they do not cover ("holes") are then looked up
// through hsaKmtSVMGetAttr.
//
// @param ptr  Start of the address range.
// @param size Length of the range in bytes.
// @return The first agent registered for the prefetch node when the whole
//         range agrees on a single node; nullptr when the destinations
//         differ or the driver reports none/mixed.
Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
  // KFD returns -1 for no or mixed destinations.
  const uint32_t kMixedNode = static_cast<uint32_t>(-1);
  // Marks that no destination has been observed yet.
  const uint32_t kUnsetNode = static_cast<uint32_t>(-2);

  uintptr_t base = reinterpret_cast<uintptr_t>(AlignDown(ptr, 4096));
  uintptr_t end = AlignUp(reinterpret_cast<uintptr_t>(ptr) + size, 4096);

  // (start address, length) of sub-ranges without a pending prefetch.
  std::vector<std::pair<uintptr_t, uintptr_t>> holes;

  ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
  auto start = prefetch_map_.upper_bound(base);
  if (start != prefetch_map_.begin()) start--;
  auto stop = prefetch_map_.lower_bound(end);

  uint32_t prefetch_node = kUnsetNode;

  while (start != stop) {
    uintptr_t startBase = start->first;
    uintptr_t startEnd = startBase + start->second.bytes;

    auto ibase = Max(base, startBase);
    auto iend = Min(end, startEnd);
    // Check for intersection with the query
    if (ibase < iend) {
      const uint32_t node = start->second.op->node_id;
      // Only ranges that really overlap the query may define the answer.
      // The first entry in the scan window can lie entirely below `base`.
      if (prefetch_node == kUnsetNode) prefetch_node = node;

      // If prefetch locations are different then we report null agent.
      if (prefetch_node != node) return nullptr;

      // Push leading gap to an array for checking KFD.
      if (base < ibase) holes.push_back(std::make_pair(base, ibase - base));

      // Trim query range.
      base = iend;
    }
    start++;
  }
  if (base < end) holes.push_back(std::make_pair(base, end - base));

  HSA_SVM_ATTRIBUTE attrib;
  attrib.type = HSA_SVM_ATTR_PREFETCH_LOC;
  for (auto& range : holes) {
    // Pre-load the "none/mixed" answer so that a failed query in a build
    // without asserts yields nullptr instead of reading an unset value.
    attrib.value = kMixedNode;
    HSAKMT_STATUS error =
        hsaKmtSVMGetAttr(reinterpret_cast<void*>(range.first), range.second, 1, &attrib);
    assert(error == HSAKMT_STATUS_SUCCESS && "KFD prefetch query failed.");

    if (attrib.value == kMixedNode) return nullptr;
    if (prefetch_node == kUnsetNode) prefetch_node = attrib.value;
    if (prefetch_node != attrib.value) return nullptr;
  }

  assert(prefetch_node != kUnsetNode && "prefetch_node was not updated.");
  assert(prefetch_node != kMixedNode && "Should have already returned.");
  return agents_by_node_[prefetch_node][0];
}
|
||||
|
||||
} // namespace core
|
||||
} // namespace rocr
|
||||
|
||||
@@ -56,6 +56,13 @@ class Flag {
|
||||
public:
|
||||
enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT };
|
||||
|
||||
// The values are meaningful and chosen to satisfy the thunk API.
|
||||
enum XNACK_REQUEST { XNACK_DISABLE = 0, XNACK_ENABLE = 1, XNACK_UNCHANGED = 2 };
|
||||
static_assert(XNACK_DISABLE == 0, "XNACK_REQUEST enum values improperly changed.");
|
||||
static_assert(XNACK_ENABLE == 1, "XNACK_REQUEST enum values improperly changed.");
|
||||
|
||||
enum FLAG_TRI_STATE { FLAG_DISABLE = 0, FLAG_ENABLE = 1, FLAG_DEFAULT = 2 };
|
||||
|
||||
explicit Flag() { Refresh(); }
|
||||
|
||||
virtual ~Flag() {}
|
||||
@@ -104,6 +111,12 @@ class Flag {
|
||||
var = os::GetEnvVar("HSA_DISABLE_FRAGMENT_ALLOCATOR");
|
||||
disable_fragment_alloc_ = (var == "1") ? true : false;
|
||||
|
||||
var = os::GetEnvVar("HSA_UNPATCH_XGMI_LINK_WEIGHT");
|
||||
patch_xgmi_link_weight_ = (var == "1") ? false : true;
|
||||
|
||||
var = os::GetEnvVar("HSA_UNPATCH_LINK_OVERRIDE");
|
||||
patch_link_override_ = (var == "1") ? false : true;
|
||||
|
||||
var = os::GetEnvVar("HSA_ENABLE_SDMA_HDP_FLUSH");
|
||||
enable_sdma_hdp_flush_ = (var == "0") ? false : true;
|
||||
|
||||
@@ -130,6 +143,16 @@ class Flag {
|
||||
|
||||
var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT");
|
||||
check_sramecc_validity_ = (var == "1") ? false : true;
|
||||
|
||||
// Legal values are zero "0" or one "1". Any other value will
|
||||
// be interpreted as not defining the env variable
|
||||
var = os::GetEnvVar("HSA_XNACK");
|
||||
xnack_ = (var == "0") ? XNACK_DISABLE : ((var == "1") ? XNACK_ENABLE : XNACK_UNCHANGED);
|
||||
|
||||
// Legal values are zero "0" or one "1". Any other value will
|
||||
// be interpreted as not defining the env variable.
|
||||
var = os::GetEnvVar("HSA_FORCE_SRAMECC");
|
||||
sramecc_ = (var == "0") ? FLAG_DISABLE : ((var == "1") ? FLAG_ENABLE : FLAG_DEFAULT);
|
||||
}
|
||||
|
||||
bool check_flat_scratch() const { return check_flat_scratch_; }
|
||||
@@ -150,6 +173,11 @@ class Flag {
|
||||
|
||||
bool disable_fragment_alloc() const { return disable_fragment_alloc_; }
|
||||
|
||||
// Temporary way to control ROCr interpretation of inter-device link weight
|
||||
bool patch_xgmi_link_weight() const { return patch_xgmi_link_weight_; }
|
||||
|
||||
bool patch_link_override() const { return patch_link_override_; }
|
||||
|
||||
bool rev_copy_dir() const { return rev_copy_dir_; }
|
||||
|
||||
bool fine_grain_pcie() const { return fine_grain_pcie_; }
|
||||
@@ -178,6 +206,10 @@ class Flag {
|
||||
|
||||
bool check_sramecc_validity() const { return check_sramecc_validity_; }
|
||||
|
||||
XNACK_REQUEST xnack() const { return xnack_; }
|
||||
|
||||
FLAG_TRI_STATE sramecc() const { return sramecc_; }
|
||||
|
||||
private:
|
||||
bool check_flat_scratch_;
|
||||
bool enable_vm_fault_message_;
|
||||
@@ -195,6 +227,8 @@ class Flag {
|
||||
bool disable_image_;
|
||||
bool loader_enable_mmap_uri_;
|
||||
bool check_sramecc_validity_;
|
||||
bool patch_xgmi_link_weight_;
|
||||
bool patch_link_override_;
|
||||
|
||||
SDMA_OVERRIDE enable_sdma_;
|
||||
|
||||
@@ -209,6 +243,12 @@ class Flag {
|
||||
|
||||
size_t force_sdma_size_;
|
||||
|
||||
// Indicates user preference for Xnack state.
|
||||
XNACK_REQUEST xnack_;
|
||||
|
||||
// Indicates user preference for SramECC state.
|
||||
FLAG_TRI_STATE sramecc_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(Flag);
|
||||
};
|
||||
|
||||
|
||||
@@ -222,6 +222,9 @@ global:
|
||||
hsa_amd_deregister_deallocation_callback;
|
||||
hsa_amd_signal_value_pointer;
|
||||
_amdgpu_r_debug;
|
||||
hsa_amd_svm_attributes_set;
|
||||
hsa_amd_svm_attributes_get;
|
||||
hsa_amd_svm_prefetch_async;
|
||||
|
||||
local:
|
||||
*;
|
||||
|
||||
@@ -82,6 +82,7 @@ extern uint8_t ocl_blit_object_gfx902[];
|
||||
extern uint8_t ocl_blit_object_gfx904[];
|
||||
extern uint8_t ocl_blit_object_gfx906[];
|
||||
extern uint8_t ocl_blit_object_gfx908[];
|
||||
extern uint8_t ocl_blit_object_gfx90a[];
|
||||
extern uint8_t ocl_blit_object_gfx1010[];
|
||||
extern uint8_t ocl_blit_object_gfx1011[];
|
||||
extern uint8_t ocl_blit_object_gfx1012[];
|
||||
@@ -990,6 +991,8 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name,
|
||||
*blit_code_object = ocl_blit_object_gfx906;
|
||||
} else if (sname == "gfx908") {
|
||||
*blit_code_object = ocl_blit_object_gfx908;
|
||||
} else if (sname == "gfx90a") {
|
||||
*blit_code_object = ocl_blit_object_gfx90a;
|
||||
} else if (sname == "gfx1010") {
|
||||
*blit_code_object = ocl_blit_object_gfx1010;
|
||||
} else if (sname == "gfx1011") {
|
||||
|
||||
@@ -69,7 +69,7 @@ endif()
|
||||
|
||||
# Determine the target devices if not specified
|
||||
if (NOT DEFINED TARGET_DEVICES)
|
||||
set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
|
||||
set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
|
||||
endif()
|
||||
set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE )
|
||||
|
||||
|
||||
@@ -50,17 +50,11 @@
|
||||
namespace rocr {
|
||||
namespace image {
|
||||
|
||||
uint32_t MajorVerFromDevID(uint32_t dev_id) {
|
||||
return dev_id/100;
|
||||
}
|
||||
// Major version: everything stored above the low byte (bits 8 and up) of the
// packed device ID.
uint32_t MajorVerFromDevID(uint32_t dev_id) {
  const uint32_t major = dev_id >> 8;
  return major;
}
|
||||
|
||||
uint32_t MinorVerFromDevID(uint32_t dev_id) {
|
||||
return (dev_id % 100)/10;
|
||||
}
|
||||
// Minor version: the high nibble of the low byte (bits 7..4) of the packed
// device ID.
uint32_t MinorVerFromDevID(uint32_t dev_id) {
  const uint32_t minor_bits = dev_id & 0xF0u;
  return minor_bits >> 4;
}
|
||||
|
||||
uint32_t StepFromDevID(uint32_t dev_id) {
|
||||
return (dev_id%100)%10;
|
||||
}
|
||||
// Stepping: the lowest nibble (bits 3..0) of the packed device ID.
uint32_t StepFromDevID(uint32_t dev_id) {
  const uint32_t stepping = dev_id % 16u;
  return stepping;
}
|
||||
|
||||
hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) {
|
||||
char asic_name[64];
|
||||
@@ -78,7 +72,10 @@ hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) {
|
||||
assert(a_str.compare(0, 3, "gfx", 3) == 0);
|
||||
|
||||
a_str.erase(0,3);
|
||||
*chip_id = std::stoi(a_str);
|
||||
|
||||
// Load chip_id accounting for stepping and minor in hex and major in dec.
|
||||
*chip_id = std::stoi(a_str.substr(a_str.length() - 2), nullptr, 16);
|
||||
*chip_id += (std::stoi(a_str.substr(0, a_str.length() - 2)) << 8);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -163,6 +160,7 @@ uint32_t DevIDToAddrLibFamily(uint32_t dev_id) {
|
||||
case 4: // Vega12
|
||||
case 6: // Vega20
|
||||
case 8: // Arcturus
|
||||
case 10: // Aldebaran
|
||||
return FAMILY_AI;
|
||||
|
||||
case 2:
|
||||
|
||||
@@ -115,10 +115,13 @@ enum : unsigned {
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3D = 0x03d,
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3E = 0x03e,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
|
||||
|
||||
// First/last AMDGCN-based processors.
|
||||
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
|
||||
EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805,
|
||||
EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A,
|
||||
|
||||
// Indicates if the "xnack" target feature is enabled for all code contained
|
||||
// in the object.
|
||||
|
||||
@@ -480,7 +480,21 @@ typedef enum {
|
||||
/**
|
||||
* String containing the ROCr build identifier.
|
||||
*/
|
||||
HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200
|
||||
HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200,
|
||||
/**
|
||||
* Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of
|
||||
* this attribute is bool.
|
||||
*/
|
||||
HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201,
|
||||
// TODO: Should this be per Agent?
|
||||
/**
|
||||
* Returns true if all Agents have access to system allocated memory (such as
|
||||
* that allocated by mmap, malloc, or new) by default.
|
||||
* If false then system allocated memory may only be made SVM accessible to
|
||||
* an Agent by declaration of accessibility with hsa_amd_svm_attributes_set.
|
||||
* The type of this attribute is bool.
|
||||
*/
|
||||
HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202
|
||||
} hsa_system_info_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -183,6 +183,9 @@ struct AmdExtTable {
|
||||
decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn;
|
||||
decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn;
|
||||
decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn;
|
||||
decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn;
|
||||
decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn;
|
||||
decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn;
|
||||
};
|
||||
|
||||
// Table to export HSA Core Runtime Apis
|
||||
|
||||
@@ -2116,6 +2116,162 @@ hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr,
|
||||
hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
|
||||
hsa_amd_deallocation_callback_t callback);
|
||||
|
||||
typedef enum hsa_amd_svm_model_s {
|
||||
/**
|
||||
* Updates to memory with this attribute conform to HSA memory consistency model.
|
||||
*/
|
||||
HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0,
|
||||
/**
|
||||
* Writes to memory with this attribute can be performed by a single agent at a time.
|
||||
*/
|
||||
HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1
|
||||
} hsa_amd_svm_model_t;
|
||||
|
||||
typedef enum hsa_amd_svm_attribute_s {
|
||||
// Memory model attribute.
|
||||
// Type of this attribute is hsa_amd_svm_model_t.
|
||||
HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0,
|
||||
// Marks the range read only. This allows multiple physical copies to be
|
||||
// placed local to each accessing device.
|
||||
// Type of this attribute is bool.
|
||||
HSA_AMD_SVM_ATTRIB_READ_ONLY = 1,
|
||||
// Automatic migrations should attempt to keep the memory within the xgmi hive
|
||||
// containing accessible agents.
|
||||
// Type of this attribute is bool.
|
||||
HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2,
|
||||
// Page granularity to migrate at once. Page granularity is specified as
|
||||
// log2(page_count).
|
||||
// Type of this attribute is uint64_t.
|
||||
HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3,
|
||||
// Physical location to prefer when automatic migration occurs.
|
||||
// Set to the null agent handle (handle == 0) to indicate there
|
||||
// is no preferred location.
|
||||
// Type of this attribute is hsa_agent_t.
|
||||
HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4,
|
||||
// This attribute can not be used in ::hsa_amd_svm_attributes_set (see
|
||||
// ::hsa_amd_svm_prefetch_async).
|
||||
// Physical location of most recent prefetch command.
|
||||
// If the prefetch location has not been set or is not uniform across the
|
||||
// address range then returned hsa_agent_t::handle will be 0.
|
||||
// Querying this attribute will return the destination agent of the most
|
||||
// recent ::hsa_amd_svm_prefetch_async targeting the address range. If
|
||||
// multiple async prefetches have been issued targeting the region and the
|
||||
// most recently issued prefetch has completed then the query will return
|
||||
// the location of the most recently completed prefetch.
|
||||
// Type of this attribute is hsa_agent_t.
|
||||
HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5,
|
||||
// This attribute can not be used in ::hsa_amd_svm_attributes_get.
|
||||
// Enables an agent for access to the range. Access may incur a page fault
|
||||
// and associated memory migration. Either this or
|
||||
// HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM
|
||||
// access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
|
||||
// Type of this attribute is hsa_agent_t.
|
||||
HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200,
|
||||
// This attribute can not be used in ::hsa_amd_svm_attributes_get.
|
||||
// Enables an agent for access to the range without page faults. Access
|
||||
// will not incur a page fault and will not cause access based migration.
|
||||
// Either this or
|
||||
// HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if
|
||||
// HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
|
||||
// Type of this attribute is hsa_agent_t.
|
||||
HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201,
|
||||
// This attribute can not be used in ::hsa_amd_svm_attributes_get.
|
||||
// Denies an agent access to the memory range. Access will cause a terminal
|
||||
// segfault.
|
||||
// Type of this attribute is hsa_agent_t.
|
||||
HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202,
|
||||
// This attribute can not be used in ::hsa_amd_svm_attributes_set.
|
||||
// Returns the access attribute associated with the agent.
|
||||
// The agent to query must be set in the attribute value field.
|
||||
// The attribute enum will be replaced with the agent's current access
|
||||
// attribute for the address range.
|
||||
// TODO: Clarify KFD return value for non-uniform access attribute.
|
||||
// Type of this attribute is hsa_agent_t.
|
||||
HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203,
|
||||
} hsa_amd_svm_attribute_t;
|
||||
|
||||
// List type for hsa_amd_svm_attributes_set/get.
|
||||
typedef struct hsa_amd_svm_attribute_pair_s {
|
||||
// hsa_amd_svm_attribute_t value.
|
||||
uint64_t attribute;
|
||||
// Attribute value. Bit values should be interpreted according to the type
|
||||
// given in the associated attribute description.
|
||||
uint64_t value;
|
||||
} hsa_amd_svm_attribute_pair_t;
|
||||
|
||||
/**
|
||||
* @brief Sets SVM memory attributes.
|
||||
*
|
||||
* If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling
|
||||
* access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
|
||||
* or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM
|
||||
* memory access by that Agent.
|
||||
*
|
||||
* Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION
|
||||
* may not be used with this API.
|
||||
*
|
||||
* @param[in] ptr Will be aligned down to nearest page boundary.
|
||||
*
|
||||
* @param[in] size Will be aligned up to nearest page boundary.
|
||||
*
|
||||
* @param[in] attribute_list List of attributes to set for the address range.
|
||||
*
|
||||
* @param[in] attribute_count Length of @p attribute_list.
|
||||
*/
|
||||
hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
|
||||
hsa_amd_svm_attribute_pair_t* attribute_list,
|
||||
size_t attribute_count);
|
||||
|
||||
/**
|
||||
* @brief Gets SVM memory attributes.
|
||||
*
|
||||
* Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE,
|
||||
* HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and
|
||||
* HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS may not be used with this API.
|
||||
*
|
||||
* Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an
|
||||
* hsa_agent_t and returns the current access type through its attribute field.
|
||||
*
|
||||
* @param[in] ptr Will be aligned down to nearest page boundary.
|
||||
*
|
||||
* @param[in] size Will be aligned up to nearest page boundary.
|
||||
*
|
||||
* @param[in,out] attribute_list List of attributes to query for the address range.
|
||||
*
|
||||
* @param[in] attribute_count Length of @p attribute_list.
|
||||
*/
|
||||
hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
|
||||
hsa_amd_svm_attribute_pair_t* attribute_list,
|
||||
size_t attribute_count);
|
||||
|
||||
/**
|
||||
* @brief Asynchronously migrates memory to an agent.
|
||||
*
|
||||
* Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero.
|
||||
* @p completion_signal will decrement when the migration is complete.
|
||||
*
|
||||
* @param[in] ptr Will be aligned down to nearest page boundary.
|
||||
*
|
||||
* @param[in] size Will be aligned up to nearest page boundary.
|
||||
*
|
||||
* @param[in] agent Agent to migrate to.
|
||||
*
|
||||
* @param[in] num_dep_signals Number of dependent signals. Can be 0.
|
||||
*
|
||||
* @param[in] dep_signals List of signals that must be waited on before the migration
|
||||
* operation starts. The migration will start after every signal has been observed with
|
||||
* the value 0. If @p num_dep_signals is 0, this argument is ignored.
|
||||
*
|
||||
* @param[in] completion_signal Signal used to indicate completion of the migration
|
||||
* operation. When the migration operation is finished, the value of the signal is
|
||||
* decremented. The runtime indicates that an error has occurred during the migration
|
||||
* operation by setting the value of the completion signal to a negative
|
||||
* number. If no completion signal is required this handle may be null.
|
||||
*/
|
||||
hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
|
||||
uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
|
||||
hsa_signal_t completion_signal);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C" block
|
||||
#endif
|
||||
|
||||
@@ -568,6 +568,7 @@ namespace code {
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: name = "gfx906"; xnack_supported = true; sramecc_supported = true; break;
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: name = "gfx908"; xnack_supported = true; sramecc_supported = true; break;
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: name = "gfx909"; xnack_supported = true; sramecc_supported = false; break;
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: name = "gfx90a"; xnack_supported = true; sramecc_supported = true; break;
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: name = "gfx90c"; xnack_supported = true; sramecc_supported = false; break;
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: name = "gfx1010"; xnack_supported = true; sramecc_supported = false; break;
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: name = "gfx1011"; xnack_supported = true; sramecc_supported = false; break;
|
||||
|
||||
@@ -80,26 +80,27 @@ namespace loader {
|
||||
: out(std::cout)
|
||||
{
|
||||
invalid.handle = 0;
|
||||
gfx700.handle = 700;
|
||||
gfx701.handle = 701;
|
||||
gfx702.handle = 702;
|
||||
gfx801.handle = 801;
|
||||
gfx802.handle = 802;
|
||||
gfx803.handle = 803;
|
||||
gfx805.handle = 805;
|
||||
gfx810.handle = 810;
|
||||
gfx900.handle = 900;
|
||||
gfx902.handle = 902;
|
||||
gfx904.handle = 904;
|
||||
gfx906.handle = 906;
|
||||
gfx908.handle = 908;
|
||||
gfx1010.handle = 1010;
|
||||
gfx1011.handle = 1011;
|
||||
gfx1012.handle = 1012;
|
||||
gfx1030.handle = 1030;
|
||||
gfx1031.handle = 1031;
|
||||
gfx1032.handle = 1032;
|
||||
gfx1033.handle = 1033;
|
||||
gfx700.handle = 0x700;
|
||||
gfx701.handle = 0x701;
|
||||
gfx702.handle = 0x702;
|
||||
gfx801.handle = 0x801;
|
||||
gfx802.handle = 0x802;
|
||||
gfx803.handle = 0x803;
|
||||
gfx805.handle = 0x805;
|
||||
gfx810.handle = 0x810;
|
||||
gfx900.handle = 0x900;
|
||||
gfx902.handle = 0x902;
|
||||
gfx904.handle = 0x904;
|
||||
gfx906.handle = 0x906;
|
||||
gfx908.handle = 0x908;
|
||||
gfx90a.handle = 0x90a;
|
||||
gfx1010.handle = 0x1010;
|
||||
gfx1011.handle = 0x1011;
|
||||
gfx1012.handle = 0x1012;
|
||||
gfx1030.handle = 0x1030;
|
||||
gfx1031.handle = 0x1031;
|
||||
gfx1032.handle = 0x1032;
|
||||
gfx1033.handle = 0x1033;
|
||||
}
|
||||
|
||||
hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name)
|
||||
@@ -137,6 +138,8 @@ namespace loader {
|
||||
return gfx906;
|
||||
} else if (sname == "AMD:AMDGPU:9:0:8") {
|
||||
return gfx908;
|
||||
} else if (sname == "AMD:AMDGPU:9:0:A") {
|
||||
return gfx90a;
|
||||
} else if (sname == "AMD:AMDGPU:10:1:0") {
|
||||
return gfx1010;
|
||||
} else if (sname == "AMD:AMDGPU:10:1:1") {
|
||||
|
||||
@@ -57,7 +57,7 @@ namespace loader {
|
||||
hsa_isa_t invalid;
|
||||
hsa_isa_t gfx700, gfx701, gfx702, gfx703, gfx704, gfx705;
|
||||
hsa_isa_t gfx801, gfx802, gfx803, gfx805, gfx810;
|
||||
hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908;
|
||||
hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908, gfx90a;
|
||||
hsa_isa_t gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, gfx1032, gfx1033;
|
||||
std::ostream& out;
|
||||
typedef std::set<void*> PointerSet;
|
||||
|
||||
Ссылка в новой задаче
Block a user