Squash merge of cfreehil/amd-temp-gfx90a onto amd-staging.

Includes some workarounds and HMM.
Conflicts:
	opensrc/hsa-runtime/core/runtime/amd_topology.cpp
	opensrc/hsa-runtime/core/util/flag.h

Change-Id: I22976f07964a43dbb228a6231777dbd599112b8d


[ROCm/ROCR-Runtime commit: 7333c77e22]
Этот коммит содержится в:
Sean Keely
2021-04-01 20:51:51 -05:00
родитель ea1f545fcc
Коммит dd42ca6dbe
30 изменённых файлов: 1102 добавлений и 154 удалений
+49 -44
Просмотреть файл
@@ -186,19 +186,19 @@ void QueueValidation::QueueValidationForInvalidDimension(hsa_agent_t cpuAgent,
hsa_queue_t *queue[kMaxQueue]; // command queue
uint32_t ii;
test_validation_data user_data[kMaxQueue];
for (ii = 0; ii < kMaxQueue; ++ii) {
test_validation_data user_data;
// set callback flag to false if callback called then it will change to true
user_data.cb_triggered = false;
user_data[ii].cb_triggered = false;
// set the queue pointer
user_data.queue_pointer = &queue[ii];
user_data[ii].queue_pointer = &queue[ii];
// set the expected status in queue error callback handling
user_data.expected_status = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
user_data[ii].expected_status = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
// create queue
err = hsa_queue_create(gpuAgent,
queue_max, HSA_QUEUE_TYPE_SINGLE,
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
queue_max, HSA_QUEUE_TYPE_SINGLE,
CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
@@ -240,12 +240,12 @@ void QueueValidation::QueueValidationForInvalidDimension(hsa_agent_t cpuAgent,
// completion signal should not be changed.
ASSERT_EQ(completion, 1);
// queue error handling callback should be triggered
ASSERT_EQ(user_data.cb_triggered, true);
hsa_signal_store_relaxed(aql().completion_signal, 1);
}
sleep(1);
for (ii = 0; ii < kMaxQueue; ++ii) {
// queue error handling callback should be triggered
ASSERT_EQ(user_data[ii].cb_triggered, true);
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
}
}
@@ -269,20 +269,21 @@ void QueueValidation::QueueValidationInvalidGroupMemory(hsa_agent_t cpuAgent,
queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent;
hsa_queue_t *queue[kMaxQueue]; // command queue
test_validation_data user_data[kMaxQueue];
uint32_t ii;
for (ii = 0; ii < kMaxQueue; ++ii) {
test_validation_data user_data;
// set callback flag to false if callback called then it will change to true
user_data.cb_triggered = false;
user_data[ii].cb_triggered = false;
// set the queue pointer
user_data.queue_pointer = &queue[ii];
user_data[ii].queue_pointer = &queue[ii];
// set the expected status in queue error callback handling
user_data.expected_status = HSA_STATUS_ERROR_INVALID_ALLOCATION;
user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_ALLOCATION;
// create queue
err = hsa_queue_create(gpuAgent,
queue_max, HSA_QUEUE_TYPE_SINGLE,
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
queue_max, HSA_QUEUE_TYPE_SINGLE,
CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
@@ -325,12 +326,12 @@ void QueueValidation::QueueValidationInvalidGroupMemory(hsa_agent_t cpuAgent,
// completion signal should not be changed.
ASSERT_EQ(completion, 1);
// queue error handling callback should be triggered
ASSERT_EQ(user_data.cb_triggered, true);
hsa_signal_store_relaxed(aql().completion_signal, 1);
}
sleep(1);
for (ii = 0; ii < kMaxQueue; ++ii) {
// queue error handling callback should be triggered
ASSERT_EQ(user_data[ii].cb_triggered, true);
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
}
}
@@ -353,20 +354,20 @@ void QueueValidation::QueueValidationForInvalidKernelObject(hsa_agent_t cpuAgent
queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent;
hsa_queue_t *queue[kMaxQueue]; // command queue
test_validation_data user_data[kMaxQueue];
uint32_t ii;
for (ii = 0; ii < kMaxQueue; ++ii) {
test_validation_data user_data;
// set callback flag to false if callback called then it will change to true
user_data.cb_triggered = false;
user_data[ii].cb_triggered = false;
// set the queue pointer
user_data.queue_pointer = &queue[ii];
user_data[ii].queue_pointer = &queue[ii];
// set the expected status in queue error callback handling
user_data.expected_status = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
// create queue
err = hsa_queue_create(gpuAgent,
kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE,
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
@@ -408,12 +409,12 @@ void QueueValidation::QueueValidationForInvalidKernelObject(hsa_agent_t cpuAgent
// completion signal should not be changed.
ASSERT_EQ(completion, 1);
// queue error handling callback should be triggered
ASSERT_EQ(user_data.cb_triggered, true);
hsa_signal_store_relaxed(aql().completion_signal, 1);
}
sleep(1);
for (ii = 0; ii < kMaxQueue; ++ii) {
// queue error handling callback should be triggered
ASSERT_EQ(user_data[ii].cb_triggered, true);
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
}
}
@@ -437,19 +438,19 @@ void QueueValidation::QueueValidationForInvalidPacket(hsa_agent_t cpuAgent,
hsa_queue_t *queue[kMaxQueue]; // command queue
uint32_t ii;
test_validation_data user_data[kMaxQueue];
for (ii = 0; ii < kMaxQueue; ++ii) {
test_validation_data user_data;
// set callback flag to false if callback called then it will change to true
user_data.cb_triggered = false;
user_data[ii].cb_triggered = false;
// set the queue pointer
user_data.queue_pointer = &queue[ii];
user_data[ii].queue_pointer = &queue[ii];
// set the expected status in queue error callback handling
user_data.expected_status = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
// create queue
err = hsa_queue_create(gpuAgent,
queue_max, HSA_QUEUE_TYPE_SINGLE,
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
queue_max, HSA_QUEUE_TYPE_SINGLE,
CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
@@ -486,12 +487,12 @@ void QueueValidation::QueueValidationForInvalidPacket(hsa_agent_t cpuAgent,
// completion signal should not be changed.
ASSERT_EQ(completion, 1);
// queue error handling callback should be triggered
ASSERT_EQ(user_data.cb_triggered, true);
hsa_signal_store_relaxed(aql().completion_signal, 1);
}
sleep(1);
for (ii = 0; ii < kMaxQueue; ++ii) {
// queue error handling callback should be triggered
ASSERT_EQ(user_data[ii].cb_triggered, true);
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
}
}
@@ -514,22 +515,22 @@ void QueueValidation::QueueValidationForInvalidWorkGroupSize(hsa_agent_t cpuAgen
queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent;
hsa_queue_t *queue[kMaxQueue]; // command queue
test_validation_data user_data[kMaxQueue][3];
uint32_t ii;
for (ii = 0; ii < kMaxQueue; ++ii) {
uint32_t jj;
for (jj = 1; jj <= 3; ++jj) {
test_validation_data user_data;
// set callback flag to false if callback called then it will change to true
user_data.cb_triggered = false;
user_data[ii][jj - 1].cb_triggered = false;
// set the queue pointer
user_data.queue_pointer = &queue[ii];
user_data[ii][jj - 1].queue_pointer = &queue[ii];
// set the expected status in queue error callback handling
user_data.expected_status = HSA_STATUS_ERROR_INVALID_ARGUMENT;
user_data[ii][jj - 1].expected_status = HSA_STATUS_ERROR_INVALID_ARGUMENT;
// create queue
err = hsa_queue_create(gpuAgent,
kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE,
CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE,
CallbackQueueErrorHandling, &user_data[ii][jj - 1], 0, 0, &queue[ii]);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
@@ -574,13 +575,17 @@ void QueueValidation::QueueValidationForInvalidWorkGroupSize(hsa_agent_t cpuAgen
// completion signal should not be changed.
ASSERT_EQ(completion, 1);
// queue error handling callback should be triggered
ASSERT_EQ(user_data.cb_triggered, true);
hsa_signal_store_relaxed(aql().completion_signal, 1);
if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
}
}
sleep(1);
for (uint32_t ii = 0; ii < kMaxQueue; ++ii) {
for (uint32_t jj = 0; jj < 3; ++jj) {
// queue error handling callback should be triggered
ASSERT_EQ(user_data[ii][jj].cb_triggered, true);
}
}
}
+1 -1
Просмотреть файл
@@ -54,7 +54,7 @@ set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCR-R
set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/RadeonOpenCompute/ROCR-Runtime" )
set(DEFAULT_TARGETS "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
set(DEFAULT_TARGETS "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
#
# Currently support for Windows platform is not present
+5 -6
Просмотреть файл
@@ -128,11 +128,6 @@ TEST(rocrtst, Test_Example) {
RunGenericTest(&tst);
}
TEST(rocrtstFunc, IPC) {
IPCTest ipc;
RunGenericTest(&ipc);
}
TEST(rocrtstFunc, MemoryAccessTests) {
MemoryAccessTest mt;
RunCustomTestProlog(&mt);
@@ -175,7 +170,6 @@ TEST(rocrtstFunc, Concurrent_Shutdown) {
RunCustomTestEpilog(&cs);
}
TEST(rocrtstFunc, Reference_Count) {
ReferenceCountTest rc(true, false);
RunCustomTestProlog(&rc);
@@ -212,6 +206,11 @@ TEST(rocrtstFunc, Signal_Create_Concurrently) {
}
#ifndef ROCRTST_EMULATOR_BUILD
TEST(rocrtstFunc, IPC) {
IPCTest ipc;
RunGenericTest(&ipc);
}
TEST(rocrtstFunc, DISABLED_Signal_Kernel_Set) {
SignalKernelTest sk(SET);
RunCustomTestProlog(&sk);
+21
Просмотреть файл
@@ -1151,6 +1151,27 @@ hsa_status_t HSA_API hsa_amd_signal_value_pointer(hsa_signal_t signal,
return amdExtTable->hsa_amd_signal_value_pointer_fn(signal, value_ptr);
}
// Mirrors Amd Extension Apis
// Exported entry point for hsa_amd_svm_attributes_set. It performs no work of
// its own: all four arguments are passed unchanged to the
// hsa_amd_svm_attributes_set_fn slot of amdExtTable and that call's status is
// returned to the caller.
hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size,
hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count) {
return amdExtTable->hsa_amd_svm_attributes_set_fn(ptr, size, attribute_list, attribute_count);
}
// Mirrors Amd Extension Apis
// Exported entry point for hsa_amd_svm_attributes_get. It performs no work of
// its own: all four arguments are passed unchanged to the
// hsa_amd_svm_attributes_get_fn slot of amdExtTable and that call's status is
// returned to the caller.
hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size,
hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count) {
return amdExtTable->hsa_amd_svm_attributes_get_fn(ptr, size, attribute_list, attribute_count);
}
// Mirrors Amd Extension Apis
// Exported entry point for hsa_amd_svm_prefetch_async. It performs no work of
// its own: the range (ptr, size), the target agent, the dependency signal list
// and the completion signal are passed unchanged to the
// hsa_amd_svm_prefetch_async_fn slot of amdExtTable and that call's status is
// returned to the caller.
hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal) {
return amdExtTable->hsa_amd_svm_prefetch_async_fn(ptr, size, agent, num_dep_signals, dep_signals, completion_signal);
}
// Tools only table interfaces.
namespace rocr {
+2 -1
Просмотреть файл
@@ -173,7 +173,8 @@ class GpuAgent : public GpuAgentInt {
// @param [in] node Node id. Each CPU in different socket will get distinct
// id.
// @param [in] node_props Node property.
GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props);
// @param [in] xnack_mode XNACK mode of device.
GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode);
// @brief GPU agent destructor.
~GpuAgent();
+26
Просмотреть файл
@@ -439,6 +439,32 @@ static const unsigned int kCodeTrapHandler9[] = {
0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c,
};
// Table of raw 32-bit words for the trap handler variant named for gfx90a.
// NOTE(review): presumably pre-assembled shader machine code, like the sibling
// kCodeTrapHandler* tables — confirm provenance and regenerate from source
// rather than editing individual words by hand.
static const unsigned int kCodeTrapHandler90a[] = {
0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73,
0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900,
0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a,
0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff,
0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001,
0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f,
0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f,
0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f,
0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80,
0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044,
0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100,
0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4,
0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e,
0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c,
0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff,
0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f,
0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f,
0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37,
0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010,
0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f,
0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001,
0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e,
0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c,
};
static const unsigned int kCodeCopyAligned8[] = {
0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020,
0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050,
+1 -1
Просмотреть файл
@@ -95,7 +95,7 @@ class MemoryRegion : public core::MemoryRegion {
/// @brief Unpin memory.
static void MakeKfdMemoryUnresident(const void* ptr);
MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner,
const HsaMemoryProperties& mem_props);
~MemoryRegion();
+15
Просмотреть файл
@@ -246,6 +246,21 @@ hsa_status_t hsa_amd_deregister_deallocation_callback(
hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal,
volatile hsa_signal_value_t** value_ptr);
// Mirrors Amd Extension Apis
// Declaration of the runtime-side hsa_amd_svm_attributes_set.
// Takes an address range (ptr, size) and an array of attribute_count
// attribute pairs; returns an hsa_status_t.
// NOTE(review): attribute_list is non-const — presumably only read by "set";
// confirm against the implementation.
hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size,
hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count);
// Mirrors Amd Extension Apis
// Declaration of the runtime-side hsa_amd_svm_attributes_get.
// Takes an address range (ptr, size) and an array of attribute_count
// attribute pairs; returns an hsa_status_t.
// NOTE(review): presumably the queried values are written back into
// attribute_list — confirm against the implementation.
hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size,
hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count);
// Mirrors Amd Extension Apis
// Declaration of the runtime-side hsa_amd_svm_prefetch_async.
// Takes an address range (ptr, size), a target agent, an array of
// num_dep_signals dependency signals and a completion signal; returns an
// hsa_status_t.
// NOTE(review): the "async" name suggests completion is reported through
// completion_signal after the call returns — confirm against the implementation.
hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal);
} // namespace amd
} // namespace rocr
+5 -2
Просмотреть файл
@@ -57,8 +57,8 @@ class Agent;
class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
public:
MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner)
: fine_grain_(fine_grain), full_profile_(full_profile), owner_(owner) {
MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner)
: fine_grain_(fine_grain), kernarg_(kernarg), full_profile_(full_profile), owner_(owner) {
assert(owner_ != NULL);
}
@@ -112,12 +112,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
__forceinline bool fine_grain() const { return fine_grain_; }
__forceinline bool kernarg() const { return kernarg_; }
__forceinline bool full_profile() const { return full_profile_; }
__forceinline core::Agent* owner() const { return owner_; }
private:
const bool fine_grain_;
const bool kernarg_;
const bool full_profile_;
core::Agent* owner_;
+38
Просмотреть файл
@@ -292,6 +292,15 @@ class Runtime {
hsa_status_t IPCDetach(void* ptr);
hsa_status_t SetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count);
hsa_status_t GetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count);
hsa_status_t SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, uint32_t num_dep_signals,
const hsa_signal_t* dep_signals, hsa_signal_t completion_signal);
const std::vector<Agent*>& cpu_agents() { return cpu_agents_; }
const std::vector<Agent*>& gpu_agents() { return gpu_agents_; }
@@ -395,6 +404,28 @@ class Runtime {
std::vector<void*> arg_;
};
struct PrefetchRange;
typedef std::map<uintptr_t, PrefetchRange> prefetch_map_t;
// Record describing one prefetch operation. Kept as a plain aggregate: no
// member has a default value, so callers must fill in every field.
// NOTE(review): field meanings below are inferred from names and types only —
// confirm against the code that populates this struct.
struct PrefetchOp {
// Start address of the range (presumably the range being prefetched).
void* base;
// Size of the range, presumably in bytes.
size_t size;
// Node id, presumably of the prefetch destination.
uint32_t node_id;
// Presumably the number of dependency signals not yet satisfied.
int remaining_deps;
// Signal associated with completion of the operation.
hsa_signal_t completion;
// Signals this operation depends on.
std::vector<hsa_signal_t> dep_signals;
// Iterator to an entry of a prefetch_map_t associated with this operation.
prefetch_map_t::iterator prefetch_map_entry;
};
// Value type stored in prefetch_map_t (keyed by uintptr_t): the extent of one
// range together with the PrefetchOp it belongs to, plus iterators to two
// neighbouring map entries.
// NOTE(review): prev/next presumably chain the ranges that belong to the same
// PrefetchOp — confirm against the code that maintains prefetch_map_.
struct PrefetchRange {
  // A default-constructed range is empty and owned by no operation. The scalar
  // members are set explicitly so that an entry created implicitly (for example
  // by std::map::operator[]) never carries indeterminate values.
  PrefetchRange() : bytes(0), op(nullptr) {}

  // @param Bytes Extent of the range.
  // @param Op    Operation this range belongs to.
  PrefetchRange(size_t Bytes, PrefetchOp* Op) : bytes(Bytes), op(Op) {}

  // Extent of the range, presumably in bytes.
  size_t bytes;
  // Operation this range belongs to; nullptr when default-constructed.
  PrefetchOp* op;
  // Iterators to neighbouring entries; value-initialized until assigned.
  prefetch_map_t::iterator prev;
  prefetch_map_t::iterator next;
};
// Will be created before any user could call hsa_init but also could be
// destroyed before incorrectly written programs call hsa_shutdown.
static KernelMutex bootstrap_lock_;
@@ -444,6 +475,9 @@ class Runtime {
/// @retval Index in ::link_matrix_.
uint32_t GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to);
/// @brief Get most recently issued SVM prefetch agent for the range in question.
Agent* GetSVMPrefetchAgent(void* ptr, size_t size);
// Mutex object to protect multithreaded access to ::allocation_map_,
// KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo
// registered & mapped arrays.
@@ -485,6 +519,10 @@ class Runtime {
// Contains the region, address, and size of previously allocated memory.
std::map<const void*, AllocationRegion> allocation_map_;
// Pending prefetch containers.
KernelMutex prefetch_lock_;
prefetch_map_t prefetch_map_;
// Allocator using ::system_region_
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)> system_allocator_;
+17 -26
Просмотреть файл
@@ -69,40 +69,31 @@ void CpuAgent::InitRegionList() {
std::vector<HsaMemoryProperties> mem_props(properties_.NumMemoryBanks);
if (HSAKMT_STATUS_SUCCESS ==
hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks,
&mem_props[0])) {
hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, &mem_props[0])) {
std::vector<HsaMemoryProperties>::iterator system_prop =
std::find_if(mem_props.begin(), mem_props.end(), [](HsaMemoryProperties prop) -> bool {
return (prop.SizeInBytes > 0 && prop.HeapType == HSA_HEAPTYPE_SYSTEM);
});
if (system_prop != mem_props.end()) {
MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, *system_prop);
HsaMemoryProperties system_props;
std::memset(&system_props, 0, sizeof(HsaMemoryProperties));
system_props.HeapType = HSA_HEAPTYPE_SYSTEM;
system_props.SizeInBytes = 0;
system_props.VirtualBaseAddress = 0;
regions_.push_back(system_region_fine);
if (system_prop != mem_props.end()) system_props = *system_prop;
if (!is_apu_node) {
MemoryRegion* system_region_coarse =
new MemoryRegion(false, is_apu_node, this, *system_prop);
MemoryRegion* system_region_fine =
new MemoryRegion(true, false, is_apu_node, this, system_props);
regions_.push_back(system_region_fine);
MemoryRegion* system_region_kernarg =
new MemoryRegion(true, true, is_apu_node, this, system_props);
regions_.push_back(system_region_kernarg);
regions_.push_back(system_region_coarse);
}
} else {
HsaMemoryProperties system_props;
std::memset(&system_props, 0, sizeof(HsaMemoryProperties));
system_props.HeapType = HSA_HEAPTYPE_SYSTEM;
system_props.SizeInBytes = 0;
system_props.VirtualBaseAddress = 0;
MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, system_props);
regions_.push_back(system_region_fine);
if (!is_apu_node) {
MemoryRegion* system_region_coarse =
new MemoryRegion(false, is_apu_node, this, system_props);
regions_.push_back(system_region_coarse);
}
if (!is_apu_node) {
MemoryRegion* system_region_coarse =
new MemoryRegion(false, false, is_apu_node, this, *system_prop);
regions_.push_back(system_region_coarse);
}
}
}
+29 -9
Просмотреть файл
@@ -77,7 +77,7 @@ extern HsaApiTable hsa_internal_api_table_;
} // namespace core
namespace AMD {
GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode)
: GpuAgentInt(node),
properties_(node_props),
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
@@ -112,16 +112,21 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
rocr::core::IsaFeature sramecc = rocr::core::IsaFeature::Unsupported;
if (isa_base->IsSrameccSupported()) {
sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1
? core::IsaFeature::Enabled
: core::IsaFeature::Disabled;
sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled
: core::IsaFeature::Disabled;
// sramecc control for emulator.
if (core::Runtime::runtime_singleton_->flag().sramecc() != Flag::FLAG_DEFAULT) {
sramecc = core::Runtime::runtime_singleton_->flag().sramecc() == Flag::FLAG_ENABLE
? core::IsaFeature::Enabled
: core::IsaFeature::Disabled;
}
}
rocr::core::IsaFeature xnack = rocr::core::IsaFeature::Unsupported;
if (isa_base->IsXnackSupported()) {
// TODO: This needs to be obtained from KFD once HMM implemented.
xnack = profile_ == HSA_PROFILE_FULL ? core::IsaFeature::Enabled
: core::IsaFeature::Disabled;
xnack = xnack_mode ? core::IsaFeature::Enabled
: core::IsaFeature::Disabled;
}
// Set instruction set architecture via node property, only on GPU device.
@@ -202,6 +207,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
ASICShader compute_7;
ASICShader compute_8;
ASICShader compute_9;
ASICShader compute_90a;
ASICShader compute_1010;
ASICShader compute_10;
};
@@ -212,6 +218,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
{NULL, 0, 0, 0},
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},
{kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4},
{kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4},
{kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4},
{kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4},
}},
@@ -220,6 +227,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
{kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12},
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},
}},
@@ -228,6 +236,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
{kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},
}},
@@ -236,6 +245,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
{kCodeFill7, sizeof(kCodeFill7), 19, 8},
{kCodeFill8, sizeof(kCodeFill8), 19, 8},
{kCodeFill8, sizeof(kCodeFill8), 19, 8},
{kCodeFill8, sizeof(kCodeFill8), 19, 8},
{kCodeFill10, sizeof(kCodeFill10), 19, 8},
{kCodeFill10, sizeof(kCodeFill10), 19, 8},
}}};
@@ -254,6 +264,9 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
asic_shader = &compiled_shader_it->second.compute_8;
break;
case 9:
if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10))
asic_shader = &compiled_shader_it->second.compute_90a;
else
asic_shader = &compiled_shader_it->second.compute_9;
break;
case 10:
@@ -302,6 +315,14 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2);
AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1);
if ((isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) &&
(isa_->GetStepping() == 10)) {
// Program COMPUTE_PGM_RSRC3.ACCUM_OFFSET for 0 ACC VGPRs on gfx90a.
// FIXME: Assemble code objects from source at build time
int gran_accvgprs = ((gran_vgprs + 1) * 8) / 4 - 1;
header->max_scratch_backing_memory_byte_size = uint64_t(gran_accvgprs) << 32;
}
}
// Copy shader code into the GPU-visible buffer.
@@ -338,8 +359,7 @@ void GpuAgent::InitRegionList() {
memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
case HSA_HEAPTYPE_GPU_LDS:
case HSA_HEAPTYPE_GPU_SCRATCH: {
MemoryRegion* region =
new MemoryRegion(false, false, this, mem_props[mem_idx]);
MemoryRegion* region = new MemoryRegion(false, false, false, this, mem_props[mem_idx]);
regions_.push_back(region);
@@ -348,7 +368,7 @@ void GpuAgent::InitRegionList() {
// Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
if ((properties_.HiveID != 0) ||
(core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) {
regions_.push_back(new MemoryRegion(true, false, this, mem_props[mem_idx]));
regions_.push_back(new MemoryRegion(true, false, false, this, mem_props[mem_idx]));
}
}
break;
+10 -10
Просмотреть файл
@@ -100,9 +100,9 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr));
}
MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner,
const HsaMemoryProperties& mem_props)
: core::MemoryRegion(fine_grain, full_profile, owner),
: core::MemoryRegion(fine_grain, kernarg, full_profile, owner),
mem_props_(mem_props),
max_single_alloc_size_(0),
virtual_size_(0),
@@ -128,6 +128,8 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne
mem_flag_.ui32.HostAccess = 1;
mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED;
if (kernarg) mem_flag_.ui32.Uncached = 1;
virtual_size_ =
(full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize;
}
@@ -290,16 +292,14 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute,
case HSA_REGION_INFO_GLOBAL_FLAGS:
switch (mem_props_.HeapType) {
case HSA_HEAPTYPE_SYSTEM:
*((uint32_t*)value) = fine_grain()
? (HSA_REGION_GLOBAL_FLAG_KERNARG |
HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
: HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED;
break;
case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
*((uint32_t*)value) = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
: HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED;
case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: {
uint32_t ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
: HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED;
if (kernarg()) ret |= HSA_REGION_GLOBAL_FLAG_KERNARG;
*((uint32_t*)value) = ret;
break;
}
default:
*((uint32_t*)value) = 0;
break;
+53 -12
Просмотреть файл
@@ -68,6 +68,38 @@ namespace AMD {
static const uint kKfdVersionMajor = 0;
static const uint kKfdVersionMinor = 99;
// Query for user preference and use that to determine Xnack mode of ROCm system.
// Return true if Xnack mode is ON or false if OFF. Xnack mode of a system is
// orthogonal to devices that do not support Xnack mode. It is legal for a
// system with Xnack ON to have devices that do not support Xnack functionality.
bool BindXnackMode() {
// Get users' preference for Xnack mode of ROCm platform
HSAint32 mode;
mode = core::Runtime::runtime_singleton_->flag().xnack();
bool config_xnack =
(core::Runtime::runtime_singleton_->flag().xnack() != Flag::XNACK_REQUEST::XNACK_UNCHANGED);
// Indicate to driver users' preference for Xnack mode
// Call to driver can fail and is a supported feature
HSAKMT_STATUS status = HSAKMT_STATUS_ERROR;
if (config_xnack) {
status = hsaKmtSetXNACKMode(mode);
if (status == HSAKMT_STATUS_SUCCESS) {
return mode;
}
}
// Get Xnack mode of devices bound by driver. This could happen
// when a call to SET Xnack mode fails or user has no particular
// preference
status = hsaKmtGetXNACKMode((HSAint32*)&mode);
if(status != HSAKMT_STATUS_SUCCESS) {
debug_print("KFD does not support xnack mode query.\nROCr must assume xnack is disabled.\n");
return false;
}
return mode;
}
CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
if (node_prop.NumCPUCores == 0) {
return nullptr;
@@ -79,14 +111,14 @@ CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
return cpu;
}
GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnack_mode) {
GpuAgent* gpu = nullptr;
if (node_prop.NumFComputeCores == 0) {
// Ignore non GPUs.
return nullptr;
}
try {
gpu = new GpuAgent(node_id, node_prop);
gpu = new GpuAgent(node_id, node_prop, xnack_mode);
const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion();
@@ -174,20 +206,26 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
link_info.atomic_support_32bit = true;
link_info.atomic_support_64bit = true;
link_info.coherent_support = true;
if (core::Runtime::runtime_singleton_->flag().patch_xgmi_link_weight()) {
io_link.Weight = 15;
}
break;
default:
debug_print("Unrecognized IOLINK type.\n");
break;
}
if (io_link.Flags.ui32.Override == 1) {
if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) {
// Ignore this link since peer to peer is not allowed.
continue;
// KFD is reporting wrong override status for XGMI. Disallow override for bringup.
if (!core::Runtime::runtime_singleton_->flag().patch_link_override()) {
if (io_link.Flags.ui32.Override == 1) {
if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) {
// Ignore this link since peer to peer is not allowed.
continue;
}
link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0);
link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0);
link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0);
}
link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0);
link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0);
link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0);
}
link_info.max_bandwidth = io_link.MaximumBandwidth;
@@ -204,7 +242,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
/**
* Process the list of Gpus that are surfaced to user
*/
static void SurfaceGpuList(std::vector<int32_t>& gpu_list) {
static void SurfaceGpuList(std::vector<int32_t>& gpu_list, bool xnack_mode) {
// Process user visible Gpu devices
int32_t invalidIdx = -1;
int32_t list_sz = gpu_list.size();
@@ -221,7 +259,7 @@ static void SurfaceGpuList(std::vector<int32_t>& gpu_list) {
// Instantiate a Gpu device. The IO links
// of this node have already been registered
assert((node_prop.NumFComputeCores != 0) && "Improper node used for GPU device discovery.");
DiscoverGpu(gpu_list[idx], node_prop);
DiscoverGpu(gpu_list[idx], node_prop, xnack_mode);
}
}
@@ -305,8 +343,11 @@ void BuildTopology() {
RegisterLinkInfo(node_id, node_prop.NumIOLinks);
}
// Determine the Xnack mode to be bound for system
bool xnack_mode = BindXnackMode();
// Instantiate ROCr objects to encapsulate Gpu devices
SurfaceGpuList(gpu_usr_list);
SurfaceGpuList(gpu_usr_list, xnack_mode);
}
bool Load() {
+3
Просмотреть файл
@@ -391,6 +391,9 @@ void HsaApiTable::UpdateAmdExts() {
amd_ext_api.hsa_amd_register_deallocation_callback_fn = AMD::hsa_amd_register_deallocation_callback;
amd_ext_api.hsa_amd_deregister_deallocation_callback_fn = AMD::hsa_amd_deregister_deallocation_callback;
amd_ext_api.hsa_amd_signal_value_pointer_fn = AMD::hsa_amd_signal_value_pointer;
amd_ext_api.hsa_amd_svm_attributes_set_fn = AMD::hsa_amd_svm_attributes_set;
amd_ext_api.hsa_amd_svm_attributes_get_fn = AMD::hsa_amd_svm_attributes_get;
amd_ext_api.hsa_amd_svm_prefetch_async_fn = AMD::hsa_amd_svm_prefetch_async;
}
void LoadInitialHsaApiTable() {
+32
Просмотреть файл
@@ -1009,5 +1009,37 @@ hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifie
CATCH;
}
// Public entry point for setting SVM attributes on an address range.
// Forwards all arguments unchanged to core::Runtime::SetSvmAttrib on the
// runtime singleton and returns its status.
// NOTE(review): TRY/CATCH presumably translate exceptions thrown by the
// runtime (e.g. AMD::hsa_exception) into an hsa_status_t, and IS_OPEN()
// presumably rejects calls made before the runtime is initialized - confirm
// against the macro definitions, which are not visible here.
hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count) {
TRY;
IS_OPEN();
// No argument validation is performed in this wrapper.
return core::Runtime::runtime_singleton_->SetSvmAttrib(ptr, size, attribute_list,
attribute_count);
CATCH;
}
// Public entry point for querying SVM attributes of an address range.
// Forwards all arguments unchanged to core::Runtime::GetSvmAttrib on the
// runtime singleton and returns its status. Results are written back into
// attribute_list by the runtime.
// NOTE(review): TRY/CATCH presumably translate exceptions thrown by the
// runtime into an hsa_status_t, and IS_OPEN() presumably rejects calls made
// before the runtime is initialized - confirm against the macro definitions.
hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count) {
TRY;
IS_OPEN();
// No argument validation is performed in this wrapper.
return core::Runtime::runtime_singleton_->GetSvmAttrib(ptr, size, attribute_list,
attribute_count);
CATCH;
}
// Public entry point for scheduling an asynchronous SVM prefetch (migration)
// of an address range to an agent. Forwards all arguments unchanged to
// core::Runtime::SvmPrefetch on the runtime singleton and returns its status.
// NOTE(review): TRY/CATCH presumably translate exceptions thrown by the
// runtime into an hsa_status_t, and IS_OPEN() presumably rejects calls made
// before the runtime is initialized - confirm against the macro definitions.
hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal) {
TRY;
IS_OPEN();
// Validate inputs.
// TODO: input validation is not implemented yet; the check below is a
// disabled placeholder, so agent, dep_signals and completion_signal are
// passed through to the runtime unchecked by this wrapper.
// if (core::g_use_interrupt_wait && (!core::InterruptSignal::IsType(signal)))
return core::Runtime::runtime_singleton_->SvmPrefetch(ptr, size, agent, num_dep_signals,
dep_signals, completion_signal);
CATCH;
}
} // namespace amd
} // namespace rocr
+9
Просмотреть файл
@@ -281,6 +281,15 @@ constexpr size_t hsa_name_size = 63;
ISAREG_ENTRY_GEN("gfx908:sramecc-:xnack+", 9, 0, 8, disabled, enabled)
ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack-", 9, 0, 8, enabled, disabled)
ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack+", 9, 0, 8, enabled, enabled)
ISAREG_ENTRY_GEN("gfx90a", 9, 0, 10, any, any)
ISAREG_ENTRY_GEN("gfx90a:xnack-", 9, 0, 10, any, disabled)
ISAREG_ENTRY_GEN("gfx90a:xnack+", 9, 0, 10, any, enabled)
ISAREG_ENTRY_GEN("gfx90a:sramecc-", 9, 0, 10, disabled, any)
ISAREG_ENTRY_GEN("gfx90a:sramecc+", 9, 0, 10, enabled, any)
ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack-", 9, 0, 10, disabled, disabled)
ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack+", 9, 0, 10, disabled, enabled)
ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack-", 9, 0, 10, enabled, disabled)
ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack+", 9, 0, 10, enabled, enabled)
ISAREG_ENTRY_GEN("gfx1010", 10, 1, 0, unsupported, any)
ISAREG_ENTRY_GEN("gfx1010:xnack-", 10, 1, 0, unsupported, disabled)
ISAREG_ENTRY_GEN("gfx1010:xnack+", 10, 1, 0, unsupported, enabled)
+528 -8
Просмотреть файл
@@ -60,6 +60,7 @@
#include "core/inc/hsa_ext_amd_impl.h"
#include "core/inc/hsa_api_trace_int.h"
#include "core/util/os.h"
#include "core/inc/exceptions.h"
#include "inc/hsa_ven_amd_aqlprofile.h"
#define HSA_VERSION_MAJOR 1
@@ -173,16 +174,29 @@ void Runtime::RegisterAgent(Agent* agent) {
if (cpu_agents_.size() == 1) {
// Might need memory pooling to cover allocation that
// requires less than 4096 bytes.
system_allocator_ = [this](size_t size, size_t align, MemoryRegion::AllocateFlags alloc_flags) -> void* {
assert(align <= 4096);
void* ptr = nullptr;
core::Runtime::runtime_singleton_->AllocateMemory(system_regions_fine_[0], size, alloc_flags, &ptr);
return ptr;
};
system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
// Default system pool must support kernarg
for (auto pool : system_regions_fine_) {
if (pool->kernarg()) {
system_allocator_ = [pool](size_t size, size_t alignment,
MemoryRegion::AllocateFlags alloc_flags) -> void* {
assert(alignment <= 4096);
void* ptr = NULL;
return (HSA_STATUS_SUCCESS ==
core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
&ptr))
? ptr
: NULL;
};
BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
system_deallocator_ = [](void* ptr) {
core::Runtime::runtime_singleton_->FreeMemory(ptr);
};
BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
break;
}
}
}
} else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) {
gpu_agents_.push_back(agent);
@@ -630,6 +644,18 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) {
*(const char**)value = STRING(ROCR_BUILD_ID);
break;
}
case HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: {
// todo: Get HMM kernel support info.
*(bool*)value = true;
break;
}
case HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: {
bool ret = true;
for(auto agent : gpu_agents_)
ret &= (agent->isa()->GetXnack() == IsaFeature::Enabled);
*(bool*)value = ret;
break;
}
default:
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
@@ -1572,5 +1598,499 @@ void Runtime::InternalQueueCreateNotify(const hsa_queue_t* queue, hsa_agent_t ag
internal_queue_create_notifier_(queue, agent, internal_queue_create_notifier_user_data_);
}
// Applies a list of SVM attributes to the page range covering [ptr, ptr + size).
//
// Every hsa_amd_svm_attribute_pair_t is translated either into a thunk HSA_SVM_ATTRIBUTE entry
// or into a bit of the SET/CLR flag words; the combined list is then passed to hsaKmtSVMSetAttr
// in a single call. ptr is aligned down, and ptr + size aligned up, to 4096 bytes.
//
// Parameters:
//   ptr, size        - address range to update.
//   attribute_list   - attribute_count attribute/value pairs to apply.
//   attribute_count  - number of entries in attribute_list.
//
// Returns HSA_STATUS_SUCCESS when the thunk call succeeds.
//
// Throws AMD::hsa_exception carrying:
//   HSA_STATUS_ERROR_INVALID_AGENT          - a handle does not convert to a valid Agent.
//   HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS - a range-wide attribute is listed twice, or more
//                                             than one access attribute names the same node.
//   HSA_STATUS_ERROR_INVALID_ARGUMENT       - unknown attribute or bad GLOBAL_FLAG value.
//   HSA_STATUS_ERROR                        - hsaKmtSVMSetAttr reported a failure.
hsa_status_t Runtime::SetSvmAttrib(void* ptr, size_t size,
                                   hsa_amd_svm_attribute_pair_t* attribute_list,
                                   size_t attribute_count) {
  // Bitmask of range-wide attributes already seen (one bit per attribute enum value).
  uint32_t set_attribs = 0;
  // Per-node marker used to reject several access attributes for the same agent.
  std::vector<bool> agent_seen(agents_by_node_.size(), false);

  std::vector<HSA_SVM_ATTRIBUTE> attribs;
  attribs.reserve(attribute_count);
  uint32_t set_flags = 0;
  uint32_t clear_flags = 0;

  // Converts a handle to an Agent, rejecting null and invalid agents.
  auto Convert = [&](uint64_t value) -> Agent* {
    hsa_agent_t handle = {value};
    Agent* agent = Agent::Convert(handle);
    if ((agent == nullptr) || !agent->IsValid())
      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
                               "Invalid agent handle in Runtime::SetSvmAttrib.");
    return agent;
  };

  // Same as Convert, but a null handle is passed through as nullptr.
  auto ConvertAllowNull = [&](uint64_t value) -> Agent* {
    hsa_agent_t handle = {value};
    Agent* agent = Agent::Convert(handle);
    if ((agent != nullptr) && (!agent->IsValid()))
      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
                               "Invalid agent handle in Runtime::SetSvmAttrib.");
    return agent;
  };

  // Records that an access attribute was given for the agent's node; throws on a repeat.
  auto ConfirmNew = [&](Agent* agent) {
    const size_t node = agent->node_id();
    // agent_seen is sized from agents_by_node_; grow it instead of indexing out of range should
    // a node id ever be larger than that count.
    if (node >= agent_seen.size()) agent_seen.resize(node + 1, false);
    if (agent_seen[node])
      throw AMD::hsa_exception(
          HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS,
          "Multiple attributes given for the same agent in Runtime::SetSvmAttrib.");
    agent_seen[node] = true;
  };

  // Records that a range-wide attribute was given; throws on a repeat.
  // Only called for attribute values 0..4, so the shift stays within 32 bits.
  auto Check = [&](uint64_t attrib) {
    const uint32_t bit = 1u << attrib;
    if (set_attribs & bit)
      throw AMD::hsa_exception(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS,
                               "Attribute given multiple times in Runtime::SetSvmAttrib.");
    set_attribs |= bit;
  };

  auto kmtPair = [](uint32_t attrib, uint32_t value) {
    HSA_SVM_ATTRIBUTE pair = {attrib, value};
    return pair;
  };

  // size_t index: attribute_count is a size_t and must not be compared against a 32-bit counter.
  for (size_t i = 0; i < attribute_count; i++) {
    auto attrib = attribute_list[i].attribute;
    auto value = attribute_list[i].value;

    switch (attrib) {
      case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: {
        Check(attrib);
        switch (value) {
          case HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED:
            set_flags |= HSA_SVM_FLAG_COHERENT;
            break;
          case HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED:
            clear_flags |= HSA_SVM_FLAG_COHERENT;
            break;
          default:
            throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                                     "Invalid HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG value.");
        }
        break;
      }
      case HSA_AMD_SVM_ATTRIB_READ_ONLY: {
        Check(attrib);
        if (value)
          set_flags |= HSA_SVM_FLAG_GPU_RO;
        else
          clear_flags |= HSA_SVM_FLAG_GPU_RO;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: {
        Check(attrib);
        if (value)
          set_flags |= HSA_SVM_FLAG_HIVE_LOCAL;
        else
          clear_flags |= HSA_SVM_FLAG_HIVE_LOCAL;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: {
        Check(attrib);
        // Max migration size is 1GB.
        if (value > 18) value = 18;
        attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, value));
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: {
        Check(attrib);
        // A null agent handle means "no preferred location".
        Agent* agent = ConvertAllowNull(value);
        if (agent == nullptr)
          attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, INVALID_NODEID));
        else
          attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, agent->node_id()));
        break;
      }
      case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE: {
        Agent* agent = Convert(value);
        ConfirmNew(agent);
        // CPU access is expressed through the HOST_ACCESS flag rather than a per-node entry.
        if (agent->device_type() == Agent::kAmdCpuDevice) {
          set_flags |= HSA_SVM_FLAG_HOST_ACCESS;
        } else {
          attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id()));
        }
        break;
      }
      case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE: {
        Agent* agent = Convert(value);
        ConfirmNew(agent);
        if (agent->device_type() == Agent::kAmdCpuDevice) {
          set_flags |= HSA_SVM_FLAG_HOST_ACCESS;
        } else {
          attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS_IN_PLACE, agent->node_id()));
        }
        break;
      }
      case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS: {
        Agent* agent = Convert(value);
        ConfirmNew(agent);
        if (agent->device_type() == Agent::kAmdCpuDevice) {
          clear_flags |= HSA_SVM_FLAG_HOST_ACCESS;
        } else {
          attribs.push_back(kmtPair(HSA_SVM_ATTR_NO_ACCESS, agent->node_id()));
        }
        break;
      }
      default:
        throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                                 "Illegal or invalid attribute in Runtime::SetSvmAttrib");
    }
  }

  // Merge CPU access properties - grant access if any CPU needs access.
  // Probably wrong.
  if (set_flags & HSA_SVM_FLAG_HOST_ACCESS) clear_flags &= ~HSA_SVM_FLAG_HOST_ACCESS;

  // Add flag updates
  if (clear_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_CLR_FLAGS, clear_flags));
  if (set_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, set_flags));

  // Expand the request to whole 4 KiB pages.
  uint8_t* base = AlignDown((uint8_t*)ptr, 4096);
  uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096);
  size_t len = end - base;

  // data() is well defined even when no attribute was produced; &attribs[0] is not.
  HSAKMT_STATUS error = hsaKmtSVMSetAttr(base, len, attribs.size(), attribs.data());
  if (error != HSAKMT_STATUS_SUCCESS)
    throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMSetAttr failed.");

  return HSA_STATUS_SUCCESS;
}
// Queries SVM attributes for the page range covering [ptr, ptr + size).
//
// The request list is translated into thunk HSA_SVM_ATTRIBUTE queries, issued with a single
// hsaKmtSVMGetAttr call, and the answers are written back into attribute_list in place:
//   - for most attributes the entry's value field receives the result;
//   - for HSA_AMD_SVM_ATTRIB_ACCESS_QUERY the value field holds the agent to query on input and
//     the attribute field is replaced by the agent's access attribute (ACCESSIBLE,
//     ACCESSIBLE_IN_PLACE or NO_ACCESS);
//   - HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION is answered via GetSVMPrefetchAgent.
// ptr is aligned down, and ptr + size aligned up, to 4096 bytes.
//
// Returns HSA_STATUS_SUCCESS on success.
//
// Throws AMD::hsa_exception carrying:
//   HSA_STATUS_ERROR_INVALID_AGENT    - ACCESS_QUERY names a null or invalid agent.
//   HSA_STATUS_ERROR_INVALID_ARGUMENT - an attribute that cannot be queried was supplied.
//   HSA_STATUS_ERROR                  - hsaKmtSVMGetAttr reported a failure.
hsa_status_t Runtime::GetSvmAttrib(void* ptr, size_t size,
                                   hsa_amd_svm_attribute_pair_t* attribute_list,
                                   size_t attribute_count) {
  std::vector<HSA_SVM_ATTRIBUTE> attribs;
  attribs.reserve(attribute_count);

  // For each request: index of its entry in attribs, or -1 when the answer does not come from a
  // dedicated thunk entry (flag based or prefetch location).
  std::vector<int> kmtIndices(attribute_count);

  bool getFlags = false;

  // Converts a handle to an Agent, rejecting null and invalid agents.
  auto Convert = [&](uint64_t value) -> Agent* {
    hsa_agent_t handle = {value};
    Agent* agent = Agent::Convert(handle);
    if ((agent == nullptr) || !agent->IsValid())
      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
                               "Invalid agent handle in Runtime::GetSvmAttrib.");
    return agent;
  };

  auto kmtPair = [](uint32_t attrib, uint32_t value) {
    HSA_SVM_ATTRIBUTE pair = {attrib, value};
    return pair;
  };

  // Pass 1: build the thunk query list.
  for (size_t i = 0; i < attribute_count; i++) {
    auto& attrib = attribute_list[i].attribute;
    auto& value = attribute_list[i].value;

    switch (attrib) {
      case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG:
      case HSA_AMD_SVM_ATTRIB_READ_ONLY:
      case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: {
        getFlags = true;
        kmtIndices[i] = -1;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: {
        kmtIndices[i] = attribs.size();
        attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, 0));
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: {
        kmtIndices[i] = attribs.size();
        attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, 0));
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: {
        // Answered immediately; nothing is queued for the thunk call.
        value = Agent::Convert(GetSVMPrefetchAgent(ptr, size)).handle;
        kmtIndices[i] = -1;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: {
        Agent* agent = Convert(value);
        // CPU access is reported through the HOST_ACCESS flag, GPU access through a per-node
        // entry whose type the thunk rewrites.
        if (agent->device_type() == Agent::kAmdCpuDevice) {
          getFlags = true;
          kmtIndices[i] = -1;
        } else {
          kmtIndices[i] = attribs.size();
          attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id()));
        }
        break;
      }
      default:
        throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                                 "Illegal or invalid attribute in Runtime::GetSvmAttrib");
    }
  }

  // The flags query, when needed, is always the last entry of the list.
  if (getFlags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, 0));

  // Expand the request to whole 4 KiB pages.
  uint8_t* base = AlignDown((uint8_t*)ptr, 4096);
  uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096);
  size_t len = end - base;
  if (attribs.size() != 0) {
    HSAKMT_STATUS error = hsaKmtSVMGetAttr(base, len, attribs.size(), attribs.data());
    if (error != HSAKMT_STATUS_SUCCESS)
      throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMGetAttr failed.");
  }

  // Flag word returned by the thunk (zero when no flag based attribute was requested).
  uint32_t flags = 0;
  if (getFlags) flags = attribs.back().value;

  // Pass 2: translate the thunk answers back into HSA values.
  for (size_t i = 0; i < attribute_count; i++) {
    auto& attrib = attribute_list[i].attribute;
    auto& value = attribute_list[i].value;

    switch (attrib) {
      case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: {
        if (flags & HSA_SVM_FLAG_COHERENT)
          value = HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED;
        else
          value = HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_READ_ONLY: {
        // Attribute type is bool: report 0/1 rather than the raw flag bit.
        value = (flags & HSA_SVM_FLAG_GPU_RO) ? 1 : 0;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: {
        // Attribute type is bool: report 0/1 rather than the raw flag bit.
        value = (flags & HSA_SVM_FLAG_HIVE_LOCAL) ? 1 : 0;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: {
        value = attribs[kmtIndices[i]].value;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: {
        uint64_t node = attribs[kmtIndices[i]].value;
        Agent* agent = nullptr;
        // NOTE(review): assumes every node id reported by the thunk has at least one agent
        // registered in agents_by_node_ - confirm.
        if (node != INVALID_NODEID) agent = agents_by_node_[node][0];
        value = Agent::Convert(agent).handle;
        break;
      }
      case HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: {
        // Already filled in during pass 1.
        break;
      }
      case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: {
        if (kmtIndices[i] == -1) {
          // CPU agent: the query attribute must always be replaced by an access attribute, so
          // report NO_ACCESS explicitly when the host access flag is clear.
          if (flags & HSA_SVM_FLAG_HOST_ACCESS)
            attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE;
          else
            attrib = HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS;
        } else {
          switch (attribs[kmtIndices[i]].type) {
            case HSA_SVM_ATTR_ACCESS:
              attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE;
              break;
            case HSA_SVM_ATTR_ACCESS_IN_PLACE:
              attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE;
              break;
            case HSA_SVM_ATTR_NO_ACCESS:
              attrib = HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS;
              break;
            default:
              assert(false && "Bad agent accessibility from KFD.");
          }
        }
        break;
      }
      default:
        throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                                 "Illegal or invalid attribute in Runtime::GetSvmAttrib");
    }
  }

  return HSA_STATUS_SUCCESS;
}
// Schedules an asynchronous prefetch (migration) of the page range covering [ptr, ptr + size)
// to the node of `agent`.
//
// The prefetch is issued to the thunk (HSA_SVM_ATTR_PREFETCH_LOC) from an async handler once
// every signal in dep_signals has been observed at zero; the handler then decrements
// completion_signal (when its handle is non-zero). While the operation is pending its range is
// recorded in prefetch_map_, with older pending ranges that overlap it trimmed or removed, so
// that GetSVMPrefetchAgent can report the most recently requested destination.
//
// Parameters:
//   ptr, size          - range to prefetch; expanded to whole 4 KiB pages.
//   agent              - destination agent; CPU agents map to node 0.
//   num_dep_signals    - number of entries in dep_signals.
//   dep_signals        - signals that must reach zero before the prefetch is issued.
//   completion_signal  - decremented by one when the prefetch has been issued to the thunk.
//
// Returns HSA_STATUS_SUCCESS once the operation has been queued.
//
// Throws AMD::hsa_exception carrying HSA_STATUS_ERROR_INVALID_AGENT for a null or invalid agent,
// HSA_STATUS_ERROR_INVALID_ARGUMENT when dependencies are announced but dep_signals is null, or
// the failing status when the async handler cannot be registered.
hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
                                  uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
                                  hsa_signal_t completion_signal) {
  uintptr_t base = reinterpret_cast<uintptr_t>(AlignDown(ptr, 4096));
  uintptr_t end = AlignUp(reinterpret_cast<uintptr_t>(ptr) + size, 4096);
  size_t len = end - base;

  // Validate before allocating or touching shared state: the handle is dereferenced below.
  Agent* dest = Agent::Convert(agent);
  if ((dest == nullptr) || !dest->IsValid())
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
                             "Invalid agent handle in Runtime::SvmPrefetch.");
  if ((num_dep_signals != 0) && (dep_signals == nullptr))
    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
                             "Null dependency signal list in Runtime::SvmPrefetch.");

  PrefetchOp* op = new PrefetchOp();
  MAKE_NAMED_SCOPE_GUARD(OpGuard, [&]() { delete op; });

  if (dest->device_type() == Agent::kAmdCpuDevice)
    op->node_id = 0;
  else
    op->node_id = dest->node_id();

  op->base = reinterpret_cast<void*>(base);
  op->size = len;
  op->completion = completion_signal;
  // The last dependency is waited on directly when the handler is registered below; all earlier
  // ones are stored and chained by the handler itself.
  if (num_dep_signals > 1) {
    op->remaining_deps = num_dep_signals - 1;
    for (uint32_t i = 0; i < num_dep_signals - 1; i++) op->dep_signals.push_back(dep_signals[i]);
  } else {
    op->remaining_deps = 0;
  }

  {
    ScopedAcquire<KernelMutex> lock(&prefetch_lock_);
    // Remove all fully overlapped and trim partially overlapped ranges.
    // Get iteration bounds
    auto start = prefetch_map_.upper_bound(base);
    if (start != prefetch_map_.begin()) start--;
    auto stop = prefetch_map_.lower_bound(end);

    auto isEndNode = [&](decltype(start) node) { return node->second.next == prefetch_map_.end(); };
    auto isFirstNode = [&](decltype(start) node) {
      return node->second.prev == prefetch_map_.end();
    };

    // Trim and remove old ranges.
    while (start != stop) {
      // Step the loop iterator first: `cur` may be erased below and an erased map iterator must
      // not be incremented. A tail piece inserted by the split starts at `end`, so it never
      // overlaps the new range and does not need to be visited.
      auto cur = start++;

      uintptr_t curBase = cur->first;
      uintptr_t curEnd = curBase + cur->second.bytes;

      auto ibase = Max(curBase, base);
      auto iend = Min(curEnd, end);
      // Check for overlap
      if (ibase < iend) {
        // Second range check: the old range extends past the new one, keep its tail as a
        // separate node linked right after `cur` in the owning op's range list.
        if (iend < curEnd) {
          auto ret = prefetch_map_.insert(
              std::make_pair(iend, PrefetchRange(curEnd - iend, cur->second.op)));
          assert(ret.second && "Prefetch map insert failed during range split.");

          auto it = ret.first;
          it->second.prev = cur;
          it->second.next = cur->second.next;
          cur->second.next = it;
          if (!isEndNode(it)) it->second.next->second.prev = it;
        }

        // Is the first interval of the old range valid
        if (curBase < ibase) {
          cur->second.bytes = ibase - curBase;
        } else {
          // Nothing of the old node survives: unlink it from its op's list and drop it.
          if (isFirstNode(cur)) {
            cur->second.op->prefetch_map_entry = cur->second.next;
            if (!isEndNode(cur)) cur->second.next->second.prev = prefetch_map_.end();
          } else {
            cur->second.prev->second.next = cur->second.next;
            if (!isEndNode(cur)) cur->second.next->second.prev = cur->second.prev;
          }
          prefetch_map_.erase(cur);
        }
      }
    }

    // Insert new range.
    auto ret = prefetch_map_.insert(std::make_pair(base, PrefetchRange(len, op)));
    assert(ret.second && "Prefetch map insert failed.");

    auto it = ret.first;
    op->prefetch_map_entry = it;
    it->second.next = it->second.prev = prefetch_map_.end();
  }

  // Remove the prefetch's ranges from the map.
  static auto removePrefetchRanges = [](PrefetchOp* op) {
    ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
    auto it = op->prefetch_map_entry;
    while (it != Runtime::runtime_singleton_->prefetch_map_.end()) {
      auto next = it->second.next;
      Runtime::runtime_singleton_->prefetch_map_.erase(it);
      it = next;
    }
  };

  // Prefetch Signal handler for synchronization.
  static hsa_amd_signal_handler signal_handler = [](hsa_signal_value_t value, void* arg) {
    PrefetchOp* op = reinterpret_cast<PrefetchOp*>(arg);

    // Chain onto the next outstanding dependency, if any.
    if (op->remaining_deps > 0) {
      op->remaining_deps--;
      Runtime::runtime_singleton_->SetAsyncSignalHandler(
          op->dep_signals[op->remaining_deps], HSA_SIGNAL_CONDITION_EQ, 0, signal_handler, arg);
      return false;
    }

    // All dependencies satisfied: issue the prefetch and retire the operation.
    HSA_SVM_ATTRIBUTE attrib;
    attrib.type = HSA_SVM_ATTR_PREFETCH_LOC;
    attrib.value = op->node_id;
    HSAKMT_STATUS error = hsaKmtSVMSetAttr(op->base, op->size, 1, &attrib);
    assert(error == HSAKMT_STATUS_SUCCESS && "KFD Prefetch failed.");

    removePrefetchRanges(op);
    if (op->completion.handle != 0) Signal::Convert(op->completion)->SubRelaxed(1);
    delete op;
    return false;
  };

  auto no_dependencies = [](void* arg) { signal_handler(0, arg); };

  // Declared after OpGuard, so on failure the ranges are unlinked before op is deleted.
  MAKE_NAMED_SCOPE_GUARD(RangeGuard, [&]() { removePrefetchRanges(op); });

  hsa_status_t err;
  if (num_dep_signals == 0)
    err = AMD::hsa_amd_async_function(no_dependencies, op);
  else
    err = SetAsyncSignalHandler(dep_signals[num_dep_signals - 1], HSA_SIGNAL_CONDITION_EQ, 0,
                                signal_handler, op);
  if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "Signal handler unable to be set.");

  // Ownership of op has passed to the handler.
  RangeGuard.Dismiss();
  OpGuard.Dismiss();
  return HSA_STATUS_SUCCESS;
}
// Returns the agent that the page range covering [ptr, ptr + size) was most recently prefetched
// to, or nullptr when there is no single such destination.
//
// Pending prefetches recorded in prefetch_map_ take precedence; parts of the range not covered
// by a pending prefetch ("holes") are looked up through hsaKmtSVMGetAttr. nullptr is returned
// when the parts disagree, when the thunk reports no or mixed destinations (value -1), or when
// the range is empty.
//
// Throws AMD::hsa_exception(HSA_STATUS_ERROR) when the thunk query fails.
Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
  // KFD returns -1 for no or mixed destinations; -2 marks "no destination observed yet".
  const uint32_t kNodeMixed = static_cast<uint32_t>(-1);
  const uint32_t kNodeUnset = static_cast<uint32_t>(-2);

  uintptr_t base = reinterpret_cast<uintptr_t>(AlignDown(ptr, 4096));
  uintptr_t end = AlignUp(reinterpret_cast<uintptr_t>(ptr) + size, 4096);

  // (start address, byte length) of the sub-ranges with no pending prefetch.
  std::vector<std::pair<uintptr_t, uintptr_t>> holes;

  ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
  auto start = prefetch_map_.upper_bound(base);
  if (start != prefetch_map_.begin()) start--;
  auto stop = prefetch_map_.lower_bound(end);

  uint32_t prefetch_node = kNodeUnset;

  while (start != stop) {
    uintptr_t startBase = start->first;
    uintptr_t startEnd = startBase + start->second.bytes;

    auto ibase = Max(base, startBase);
    auto iend = Min(end, startEnd);
    // Check for intersection with the query
    if (ibase < iend) {
      // Only ranges that really intersect the query may define the destination: the first node
      // visited can be a predecessor that ends before the query starts.
      const uint32_t node = start->second.op->node_id;
      if (prefetch_node == kNodeUnset) prefetch_node = node;
      // If prefetch locations are different then we report null agent.
      if (prefetch_node != node) return nullptr;
      // Push leading gap to an array for checking KFD.
      if (base < ibase) holes.push_back(std::make_pair(base, ibase - base));
      // Trim query range.
      base = iend;
    }
    start++;
  }
  // Whatever is left after the last pending range is a hole as well.
  if (base < end) holes.push_back(std::make_pair(base, end - base));

  HSA_SVM_ATTRIBUTE attrib;
  attrib.type = HSA_SVM_ATTR_PREFETCH_LOC;
  attrib.value = 0;
  for (auto& range : holes) {
    HSAKMT_STATUS error =
        hsaKmtSVMGetAttr(reinterpret_cast<void*>(range.first), range.second, 1, &attrib);
    assert(error == HSAKMT_STATUS_SUCCESS && "KFD prefetch query failed.");
    // Do not interpret attrib.value after a failed query in builds without asserts.
    if (error != HSAKMT_STATUS_SUCCESS)
      throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMGetAttr failed.");

    if (attrib.value == kNodeMixed) return nullptr;
    if (prefetch_node == kNodeUnset) prefetch_node = attrib.value;
    if (prefetch_node != attrib.value) return nullptr;
  }

  // Empty query range: neither a pending prefetch nor a hole was examined.
  if (prefetch_node == kNodeUnset) return nullptr;

  assert(prefetch_node != kNodeMixed && "Should have already returned.");
  // NOTE(review): assumes the node has at least one agent registered in agents_by_node_.
  return agents_by_node_[prefetch_node][0];
}
} // namespace core
} // namespace rocr
+40
Просмотреть файл
@@ -56,6 +56,13 @@ class Flag {
public:
enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT };
// The values are meaningful and chosen to satisfy the thunk API.
enum XNACK_REQUEST { XNACK_DISABLE = 0, XNACK_ENABLE = 1, XNACK_UNCHANGED = 2 };
static_assert(XNACK_DISABLE == 0, "XNACK_REQUEST enum values improperly changed.");
static_assert(XNACK_ENABLE == 1, "XNACK_REQUEST enum values improperly changed.");
enum FLAG_TRI_STATE { FLAG_DISABLE = 0, FLAG_ENABLE = 1, FLAG_DEFAULT = 2 };
explicit Flag() { Refresh(); }
virtual ~Flag() {}
@@ -104,6 +111,12 @@ class Flag {
var = os::GetEnvVar("HSA_DISABLE_FRAGMENT_ALLOCATOR");
disable_fragment_alloc_ = (var == "1") ? true : false;
var = os::GetEnvVar("HSA_UNPATCH_XGMI_LINK_WEIGHT");
patch_xgmi_link_weight_ = (var == "1") ? false : true;
var = os::GetEnvVar("HSA_UNPATCH_LINK_OVERRIDE");
patch_link_override_ = (var == "1") ? false : true;
var = os::GetEnvVar("HSA_ENABLE_SDMA_HDP_FLUSH");
enable_sdma_hdp_flush_ = (var == "0") ? false : true;
@@ -130,6 +143,16 @@ class Flag {
var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT");
check_sramecc_validity_ = (var == "1") ? false : true;
// Legal values are zero "0" or one "1". Any other value will
// be interpreted as not defining the env variable
var = os::GetEnvVar("HSA_XNACK");
xnack_ = (var == "0") ? XNACK_DISABLE : ((var == "1") ? XNACK_ENABLE : XNACK_UNCHANGED);
// Legal values are zero "0" or one "1". Any other value will
// be interpreted as not defining the env variable.
var = os::GetEnvVar("HSA_FORCE_SRAMECC");
sramecc_ = (var == "0") ? FLAG_DISABLE : ((var == "1") ? FLAG_ENABLE : FLAG_DEFAULT);
}
bool check_flat_scratch() const { return check_flat_scratch_; }
@@ -150,6 +173,11 @@ class Flag {
bool disable_fragment_alloc() const { return disable_fragment_alloc_; }
// Temporary way to control ROCr interpretation of inter-device link weight
bool patch_xgmi_link_weight() const { return patch_xgmi_link_weight_; }
bool patch_link_override() const { return patch_link_override_; }
bool rev_copy_dir() const { return rev_copy_dir_; }
bool fine_grain_pcie() const { return fine_grain_pcie_; }
@@ -178,6 +206,10 @@ class Flag {
bool check_sramecc_validity() const { return check_sramecc_validity_; }
XNACK_REQUEST xnack() const { return xnack_; }
FLAG_TRI_STATE sramecc() const { return sramecc_; }
private:
bool check_flat_scratch_;
bool enable_vm_fault_message_;
@@ -195,6 +227,8 @@ class Flag {
bool disable_image_;
bool loader_enable_mmap_uri_;
bool check_sramecc_validity_;
bool patch_xgmi_link_weight_;
bool patch_link_override_;
SDMA_OVERRIDE enable_sdma_;
@@ -209,6 +243,12 @@ class Flag {
size_t force_sdma_size_;
// Indicates user preference for Xnack state.
XNACK_REQUEST xnack_;
// Indicates user preference for SramECC state.
FLAG_TRI_STATE sramecc_;
DISALLOW_COPY_AND_ASSIGN(Flag);
};
+3
Просмотреть файл
@@ -222,6 +222,9 @@ global:
hsa_amd_deregister_deallocation_callback;
hsa_amd_signal_value_pointer;
_amdgpu_r_debug;
hsa_amd_svm_attributes_set;
hsa_amd_svm_attributes_get;
hsa_amd_svm_prefetch_async;
local:
*;
+3
Просмотреть файл
@@ -82,6 +82,7 @@ extern uint8_t ocl_blit_object_gfx902[];
extern uint8_t ocl_blit_object_gfx904[];
extern uint8_t ocl_blit_object_gfx906[];
extern uint8_t ocl_blit_object_gfx908[];
extern uint8_t ocl_blit_object_gfx90a[];
extern uint8_t ocl_blit_object_gfx1010[];
extern uint8_t ocl_blit_object_gfx1011[];
extern uint8_t ocl_blit_object_gfx1012[];
@@ -990,6 +991,8 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name,
*blit_code_object = ocl_blit_object_gfx906;
} else if (sname == "gfx908") {
*blit_code_object = ocl_blit_object_gfx908;
} else if (sname == "gfx90a") {
*blit_code_object = ocl_blit_object_gfx90a;
} else if (sname == "gfx1010") {
*blit_code_object = ocl_blit_object_gfx1010;
} else if (sname == "gfx1011") {
+1 -1
Просмотреть файл
@@ -69,7 +69,7 @@ endif()
# Determine the target devices if not specified
if (NOT DEFINED TARGET_DEVICES)
set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
endif()
set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE )
+8 -10
Просмотреть файл
@@ -50,17 +50,11 @@
namespace rocr {
namespace image {
uint32_t MajorVerFromDevID(uint32_t dev_id) {
return dev_id/100;
}
uint32_t MajorVerFromDevID(uint32_t dev_id) { return dev_id >> 8; }
uint32_t MinorVerFromDevID(uint32_t dev_id) {
return (dev_id % 100)/10;
}
uint32_t MinorVerFromDevID(uint32_t dev_id) { return (dev_id >> 4) & 0xF; }
uint32_t StepFromDevID(uint32_t dev_id) {
return (dev_id%100)%10;
}
uint32_t StepFromDevID(uint32_t dev_id) { return dev_id & 0xF; }
hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) {
char asic_name[64];
@@ -78,7 +72,10 @@ hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) {
assert(a_str.compare(0, 3, "gfx", 3) == 0);
a_str.erase(0,3);
*chip_id = std::stoi(a_str);
// Load chip_id accounting for stepping and minor in hex and major in dec.
*chip_id = std::stoi(a_str.substr(a_str.length() - 2), nullptr, 16);
*chip_id += (std::stoi(a_str.substr(0, a_str.length() - 2)) << 8);
return HSA_STATUS_SUCCESS;
}
@@ -163,6 +160,7 @@ uint32_t DevIDToAddrLibFamily(uint32_t dev_id) {
case 4: // Vega12
case 6: // Vega20
case 8: // Arcturus
case 10: // Aldebaran
return FAMILY_AI;
case 2:
+4 -1
Просмотреть файл
@@ -115,10 +115,13 @@ enum : unsigned {
EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3D = 0x03d,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3E = 0x03e,
EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
// First/last AMDGCN-based processors.
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805,
EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A,
// Indicates if the "xnack" target feature is enabled for all code contained
// in the object.
+15 -1
Просмотреть файл
@@ -480,7 +480,21 @@ typedef enum {
/**
* String containing the ROCr build identifier.
*/
HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200
HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200,
/**
* Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of
* this attribute is bool.
*/
HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201,
// TODO: Should this be per Agent?
/**
* Returns true if all Agents have access to system allocated memory (such as
* that allocated by mmap, malloc, or new) by default.
* If false then system allocated memory may only be made SVM accessible to
* an Agent by declaration of accessibility with hsa_amd_svm_attributes_set.
* The type of this attribute is bool.
*/
HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202
} hsa_system_info_t;
/**
+3
Просмотреть файл
@@ -183,6 +183,9 @@ struct AmdExtTable {
decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn;
decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn;
decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn;
decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn;
decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn;
decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn;
};
// Table to export HSA Core Runtime Apis
+156
Просмотреть файл
@@ -2116,6 +2116,162 @@ hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr,
hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
hsa_amd_deallocation_callback_t callback);
/**
 * @brief Memory model of an SVM range. Used as the value of the
 * HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG attribute.
 */
typedef enum hsa_amd_svm_model_s {
/**
* Updates to memory with this attribute conform to HSA memory consistency model.
*/
HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0,
/**
* Writes to memory with this attribute can be performed by a single agent at a time.
*/
HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1
} hsa_amd_svm_model_t;
/**
 * @brief Attribute identifiers accepted by ::hsa_amd_svm_attributes_set and
 * ::hsa_amd_svm_attributes_get. Each identifier is paired with a 64-bit value
 * in hsa_amd_svm_attribute_pair_t; the comment on each enumerator gives the
 * type the value should be interpreted as.
 */
typedef enum hsa_amd_svm_attribute_s {
// Memory model attribute.
// Type of this attribute is hsa_amd_svm_model_t.
HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0,
// Marks the range read only. This allows multiple physical copies to be
// placed local to each accessing device.
// Type of this attribute is bool.
HSA_AMD_SVM_ATTRIB_READ_ONLY = 1,
// Automatic migrations should attempt to keep the memory within the xgmi hive
// containing accessible agents.
// Type of this attribute is bool.
HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2,
// Page granularity to migrate at once. Page granularity is specified as
// log2(page_count).
// Type of this attribute is uint64_t.
HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3,
// Physical location to prefer when automatic migration occurs.
// Set to the null agent handle (handle == 0) to indicate there
// is no preferred location.
// Type of this attribute is hsa_agent_t.
HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4,
// This attribute can not be used in ::hsa_amd_svm_attributes_set (see
// ::hsa_amd_svm_prefetch_async).
// Physical location of most recent prefetch command.
// If the prefetch location has not been set or is not uniform across the
// address range then returned hsa_agent_t::handle will be 0.
// Querying this attribute will return the destination agent of the most
// recent ::hsa_amd_svm_prefetch_async targeting the address range. If
// multiple async prefetches have been issued targeting the region and the
// most recently issued prefetch has completed then the query will return
// the location of the most recently completed prefetch.
// Type of this attribute is hsa_agent_t.
HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5,
// This attribute can not be used in ::hsa_amd_svm_attributes_get.
// Enables an agent for access to the range. Access may incur a page fault
// and associated memory migration. Either this or
// HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM
// access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
// Type of this attribute is hsa_agent_t.
HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200,
// This attribute can not be used in ::hsa_amd_svm_attributes_get.
// Enables an agent for access to the range without page faults. Access
// will not incur a page fault and will not cause access based migration.
// Either this or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to
// SVM access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
// Type of this attribute is hsa_agent_t.
HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201,
// This attribute can not be used in ::hsa_amd_svm_attributes_get.
// Denies an agent access to the memory range. Access will cause a terminal
// segfault.
// Type of this attribute is hsa_agent_t.
HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202,
// This attribute can not be used in ::hsa_amd_svm_attributes_set.
// Returns the access attribute associated with the agent.
// The agent to query must be set in the attribute value field.
// The attribute enum will be replaced with the agent's current access
// attribute for the address range.
// TODO: Clarify KFD return value for non-uniform access attribute.
// Type of this attribute is hsa_agent_t.
HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203,
} hsa_amd_svm_attribute_t;
// List type for hsa_amd_svm_attributes_set/get.
// One attribute/value entry of the list passed to
// ::hsa_amd_svm_attributes_set and ::hsa_amd_svm_attributes_get.
typedef struct hsa_amd_svm_attribute_pair_s {
// hsa_amd_svm_attribute_t value.
uint64_t attribute;
// Attribute value. Bit values should be interpreted according to the type
// given in the associated attribute description.
uint64_t value;
} hsa_amd_svm_attribute_pair_t;
/**
* @brief Sets SVM memory attributes.
*
* Applies the attribute/value pairs in @p attribute_list to the address range
* described by @p ptr and @p size.
*
* If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling
* access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
* or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM
* memory access by that Agent.
*
* Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION
* may not be used with this API.
*
* @param[in] ptr Start of the address range. Will be aligned down to nearest
* page boundary.
*
* @param[in] size Size of the address range (presumably in bytes - TODO
* confirm). Will be aligned up to nearest page boundary.
*
* @param[in] attribute_list List of attributes to set for the address range.
*
* @param[in] attribute_count Length of @p attribute_list.
*
* @return Status of the operation. The specific status codes are not
* documented here - TODO confirm against the implementation.
*/
hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count);
/**
* @brief Gets SVM memory attributes.
*
* Queries the attributes named in @p attribute_list for the address range
* described by @p ptr and @p size; results are written back into the list.
*
* Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE,
* HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and
* HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API.
*
* NOTE(review): the comments on hsa_amd_svm_attribute_t also mark
* HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS as unusable with this API, and describe
* HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION as a value returned by a query. The
* list above may be out of date - confirm against the implementation.
*
* Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an
* hsa_agent_t (in the pair's value field) and returns the current access type
* through its attribute field.
*
* @param[in] ptr Start of the address range. Will be aligned down to nearest
* page boundary.
*
* @param[in] size Size of the address range (presumably in bytes - TODO
* confirm). Will be aligned up to nearest page boundary.
*
* @param[in,out] attribute_list List of attributes to query for the address
* range. Entries are updated with the query results.
*
* @param[in] attribute_count Length of @p attribute_list.
*
* @return Status of the operation. The specific status codes are not
* documented here - TODO confirm against the implementation.
*/
hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count);
/**
* @brief Asynchronously migrates memory to an agent.
*
* Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero.
* @p completion_signal will decrement when the migration is complete.
*
* @param[in] ptr Start of the address range to migrate. Will be aligned down
* to nearest page boundary.
*
* @param[in] size Size of the address range (presumably in bytes - TODO
* confirm). Will be aligned up to nearest page boundary.
*
* @param[in] agent Agent to migrate to.
*
* @param[in] num_dep_signals Number of dependent signals. Can be 0.
*
* @param[in] dep_signals List of signals that must be waited on before the migration
* operation starts. The migration will start after every signal has been observed with
* the value 0. If @p num_dep_signals is 0, this argument is ignored.
*
* @param[in] completion_signal Signal used to indicate completion of the migration
* operation. When the migration operation is finished, the value of the signal is
* decremented. The runtime indicates that an error has occurred during the migration
* operation by setting the value of the completion signal to a negative
* number. If no completion signal is required this handle may be null.
*
* @return Status of scheduling the operation. The specific status codes are
* not documented here - TODO confirm against the implementation.
*/
hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal);
#ifdef __cplusplus
} // end extern "C" block
#endif
+1
Просмотреть файл
@@ -568,6 +568,7 @@ namespace code {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: name = "gfx906"; xnack_supported = true; sramecc_supported = true; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: name = "gfx908"; xnack_supported = true; sramecc_supported = true; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: name = "gfx909"; xnack_supported = true; sramecc_supported = false; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: name = "gfx90a"; xnack_supported = true; sramecc_supported = true; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: name = "gfx90c"; xnack_supported = true; sramecc_supported = false; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: name = "gfx1010"; xnack_supported = true; sramecc_supported = false; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: name = "gfx1011"; xnack_supported = true; sramecc_supported = false; break;
+23 -20
Просмотреть файл
@@ -80,26 +80,27 @@ namespace loader {
: out(std::cout)
{
invalid.handle = 0;
gfx700.handle = 700;
gfx701.handle = 701;
gfx702.handle = 702;
gfx801.handle = 801;
gfx802.handle = 802;
gfx803.handle = 803;
gfx805.handle = 805;
gfx810.handle = 810;
gfx900.handle = 900;
gfx902.handle = 902;
gfx904.handle = 904;
gfx906.handle = 906;
gfx908.handle = 908;
gfx1010.handle = 1010;
gfx1011.handle = 1011;
gfx1012.handle = 1012;
gfx1030.handle = 1030;
gfx1031.handle = 1031;
gfx1032.handle = 1032;
gfx1033.handle = 1033;
gfx700.handle = 0x700;
gfx701.handle = 0x701;
gfx702.handle = 0x702;
gfx801.handle = 0x801;
gfx802.handle = 0x802;
gfx803.handle = 0x803;
gfx805.handle = 0x805;
gfx810.handle = 0x810;
gfx900.handle = 0x900;
gfx902.handle = 0x902;
gfx904.handle = 0x904;
gfx906.handle = 0x906;
gfx908.handle = 0x908;
gfx90a.handle = 0x90a;
gfx1010.handle = 0x1010;
gfx1011.handle = 0x1011;
gfx1012.handle = 0x1012;
gfx1030.handle = 0x1030;
gfx1031.handle = 0x1031;
gfx1032.handle = 0x1032;
gfx1033.handle = 0x1033;
}
hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name)
@@ -137,6 +138,8 @@ namespace loader {
return gfx906;
} else if (sname == "AMD:AMDGPU:9:0:8") {
return gfx908;
} else if (sname == "AMD:AMDGPU:9:0:A") {
return gfx90a;
} else if (sname == "AMD:AMDGPU:10:1:0") {
return gfx1010;
} else if (sname == "AMD:AMDGPU:10:1:1") {
+1 -1
Просмотреть файл
@@ -57,7 +57,7 @@ namespace loader {
hsa_isa_t invalid;
hsa_isa_t gfx700, gfx701, gfx702, gfx703, gfx704, gfx705;
hsa_isa_t gfx801, gfx802, gfx803, gfx805, gfx810;
hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908;
hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908, gfx90a;
hsa_isa_t gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, gfx1032, gfx1033;
std::ostream& out;
typedef std::set<void*> PointerSet;