rocr: Memory copy based on recommended SDMA engines

The recommended SDMA engines for DMA copies between GPUs are now exposed
through the link information, so ROCr can select those engines for better
GPU-to-GPU copy performance.

Also lock in host-to-device copies to SDMA0 and device-to-host copies to
SDMA1 for better stability and performance.

Change-Id: Ideff2e13daf537104efecb8b837bd49ee5096cb5
This commit is contained in:
Jonathan Kim
2024-08-13 14:54:13 -04:00
parent 2f588a2406
commit eb30a5bbc7
9 files changed, with 73 additions and 22 deletions
@@ -66,7 +66,7 @@ class BlitSdmaBase : public core::Blit {
static const size_t kMaxSingleFillSize;
virtual bool isSDMA() const override { return true; }
virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi,
size_t linear_copy_size_override) = 0;
size_t linear_copy_size_override, int rec_engine) = 0;
virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
const hsa_dim3_t* dst_offset,
const hsa_pitched_ptr_t* src,
@@ -92,7 +92,7 @@ class BlitSdma : public BlitSdmaBase {
///
/// @return hsa_status_t
virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi,
size_t linear_copy_size_override) override;
size_t linear_copy_size_override, int rec_eng) override;
/// @brief Marks the queue object as invalid and uncouples its link with
/// the underlying compute device's control block. Use of queue object
@@ -161,6 +161,8 @@ class GpuAgentInt : public core::Agent {
virtual void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) = 0;
virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) = 0;
// @brief Query if agent represent Kaveri GPU.
//
// @retval true if agent is Kaveri GPU.
@@ -336,6 +338,8 @@ class GpuAgent : public GpuAgentInt {
void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) override;
void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) override;
// Getter & setters.
// @brief Returns Hive ID
@@ -457,7 +461,7 @@ class GpuAgent : public GpuAgentInt {
// @brief Create SDMA blit object.
//
// @retval NULL if SDMA blit creation and initialization failed.
core::Blit* CreateBlitSdma(bool use_xgmi);
core::Blit* CreateBlitSdma(bool use_xgmi, int rec_eng);
// @brief Create Kernel blit object using provided compute queue.
//
@@ -761,6 +765,10 @@ class GpuAgent : public GpuAgentInt {
bool DmaEngineIsFree(uint32_t engine_id);
std::map<uint64_t,unsigned int> gang_peers_info_;
std::map<uint64_t, uint32_t> rec_sdma_eng_id_peers_info_;
bool uses_rec_sdma_eng_id_mask_;
};
} // namespace amd
+3 -2
Vedi File
@@ -115,9 +115,10 @@ class Runtime {
public:
/// @brief Structure to describe connectivity between agents.
struct LinkInfo {
LinkInfo() : num_hop(0), info{0} {}
LinkInfo() : num_hop(0), rec_sdma_eng_id_mask(0), info{0} {}
uint32_t num_hop;
uint32_t rec_sdma_eng_id_mask;
hsa_amd_memory_pool_link_info_t info;
};
@@ -167,7 +168,7 @@ class Runtime {
/// @param [in] link_info The link information between source and destination
/// nodes.
void RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
uint32_t num_hop,
uint32_t num_hop, uint32_t rec_sdma_eng_id_mask,
hsa_amd_memory_pool_link_info_t& link_info);
/// @brief Query link information between two nodes.
@@ -283,10 +283,10 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
HSAKMT_STATUS kmt_status;
if (core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging) {
queue_rsrc.ErrorReason = &exception_signal_->signal_.value;
kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_,
kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_,
ring_buf_alloc_bytes_, queue_event_, &queue_rsrc);
} else {
kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_,
kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_,
ring_buf_alloc_bytes_, NULL, &queue_rsrc);
}
if (kmt_status != HSAKMT_STATUS_SUCCESS)
@@ -133,7 +133,7 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::~BlitSdma()
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Initialize(
const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override) {
const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override, int rec_eng) {
if (queue_start_addr_ != NULL) {
// Already initialized.
return HSA_STATUS_SUCCESS;
@@ -191,10 +191,12 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
// device. ROCr creates queues that are of two kinds: PCIe optimized
// and xGMI optimized. Which queue to create is indicated via input
// boolean flag
const HSA_QUEUE_TYPE kQueueType_ = use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA;
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100,
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
kQueueSize, NULL, &queue_resource_)) {
const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID :
(use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA);
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueueExt(agent_->node_id(), kQueueType_, 100,
HSA_QUEUE_PRIORITY_MAXIMUM, rec_eng,
queue_start_addr_, kQueueSize, NULL,
&queue_resource_)) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
@@ -720,7 +720,7 @@ core::Queue* GpuAgent::CreateInterceptibleQueue(void (*callback)(hsa_status_t st
return queue;
}
core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) {
AMD::BlitSdmaBase* sdma;
size_t copy_size_override = 0;
const size_t copy_size_overrides[2] = {0x3fffff, 0x3fffffff};
@@ -754,7 +754,9 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
core::Runtime::runtime_singleton_->flag().enable_sdma_copy_size_override();
if (copy_size_override_setting == Flag::SDMA_DISABLE) copy_size_override = 0;
if (sdma->Initialize(*this, use_xgmi, copy_size_override) != HSA_STATUS_SUCCESS) {
rec_eng = uses_rec_sdma_eng_id_mask_ || !use_xgmi ? rec_eng : -1;
if (sdma->Initialize(*this, use_xgmi, copy_size_override, rec_eng) != HSA_STATUS_SUCCESS) {
sdma->Destroy(*this);
delete sdma;
sdma = nullptr;
@@ -801,7 +803,7 @@ void GpuAgent::InitDma() {
queues_[QueuePCSampling].reset([queue_lambda, this]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); });
// Decide which engine to use for blits.
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev) {
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev, uint32_t rec_eng) {
Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();
// User SDMA queues are unstable on gfx8 and unsupported on gfx1013.
@@ -817,7 +819,7 @@ void GpuAgent::InitDma() {
*blits_[BlitHostToDev];
}
auto ret = CreateBlitSdma(use_xgmi);
auto ret = CreateBlitSdma(use_xgmi, rec_eng);
if (ret != nullptr) return ret;
}
@@ -857,14 +859,15 @@ void GpuAgent::InitDma() {
return ret;
});
blits_[BlitHostToDev].reset(
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true); });
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true, 0); });
blits_[BlitDevToHost].reset(
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false); });
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false, 1); });
// XGMI engines.
for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) {
const int eng = idx - 1;
blits_[idx].reset(
[blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility], false); });
[blit_lambda, this, eng]() { return blit_lambda(true, queues_[QueueUtility], false, eng); });
}
// GWS queues.
@@ -941,6 +944,25 @@ void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_fa
gang_peers_info_[peer.public_handle().handle] = max_bandwidth_factor;
}
// @brief Record the recommended SDMA engine ID mask for a direct peer of this GPU.
//
// @param peer Agent whose public handle is used as the lookup key.
// @param rec_sdma_eng_id_mask Recommended SDMA engine ID mask reported for the
// link to that peer.
//
// The mask is stored as-is only when every condition below holds; otherwise 0
// is stored for the peer:
//   - the KFD kernel interface version is 1.17 or newer,
//   - the ISA major version is 9 and the ISA minor version is at least 4,
//   - IsPowerOfTwo(rec_sdma_eng_id_mask) is true (a mask naming a single
//     engine is assumed to support targeting only that engine, without ganging),
//   - the enable_sdma_recommended_eng() flag is not SDMA_DISABLE.
//
// uses_rec_sdma_eng_id_mask_ is reassigned on every call, so the most recently
// registered peer decides its final value. This relies on the assumption that
// support is uniform for every device in the system.
// NOTE(review): IsPowerOfTwo is defined outside this view - confirm how it
// treats a zero mask, since that decides the flag for peers without a
// recommended engine.
void GpuAgent::RegisterRecSdmaEngIdMaskPeer(core::Agent& peer, uint32_t rec_sdma_eng_id_mask) {
  auto kfd_version = core::Runtime::runtime_singleton_->KfdVersion().version;
  bool rec_eng_enabled =
      core::Runtime::runtime_singleton_->flag().enable_sdma_recommended_eng() != Flag::SDMA_DISABLE;

  // Kernel interface 1.17 is the minimum version accepted here.
  const bool kfd_new_enough =
      kfd_version.KernelInterfaceMajorVersion > 1 ||
      (kfd_version.KernelInterfaceMajorVersion == 1 &&
       kfd_version.KernelInterfaceMinorVersion >= 17);

  // The ISA is only queried once the KFD version check has passed, and the
  // mask is only inspected once the ISA check has passed.
  bool honor_mask = false;
  if (kfd_new_enough && isa_->GetMajorVersion() == 9 && isa_->GetMinorVersion() >= 4) {
    honor_mask = IsPowerOfTwo(rec_sdma_eng_id_mask) && rec_eng_enabled;
  }
  uses_rec_sdma_eng_id_mask_ = honor_mask;

  // Peers whose mask is not honored are still registered, with an empty mask.
  uint32_t stored_mask = 0;
  if (honor_mask) stored_mask = rec_sdma_eng_id_mask;
  rec_sdma_eng_id_peers_info_[peer.public_handle().handle] = stored_mask;
}
// Destroy gang signal
static bool GangCopyCompleteHandler(hsa_signal_value_t, void *arg ) {
core::Signal *gang_signal = reinterpret_cast<core::Signal*>(arg);
@@ -955,6 +977,13 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
size_t size,
std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
// Recommended SDMA engine copies only have gang factor 1
uint32_t rec_sdma_eng = ffs(rec_sdma_eng_id_peers_info_[dst_agent.public_handle().handle]);
if (rec_sdma_eng)
return DmaCopyOnEngine(dst, dst_agent, src, src_agent, size,
dep_signals, out_signal, rec_sdma_eng, false);
if (profiling_enabled()) {
// Track the agent so we could translate the resulting timestamp to system
// domain correctly.
@@ -238,7 +238,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
link_info.numa_distance = io_link.Weight;
core::Runtime::runtime_singleton_->RegisterLinkInfo(
io_link.NodeFrom, io_link.NodeTo, io_link.Weight, link_info);
io_link.NodeFrom, io_link.NodeTo, io_link.Weight, io_link.RecSdmaEngIdMask, link_info);
}
}
@@ -383,7 +383,7 @@ void BuildTopology() {
uint32_t src_id = src_gpu->node_id();
for (auto& dst_gpu : core::Runtime::runtime_singleton_->gpu_agents()) {
uint32_t dst_id = dst_gpu->node_id();
uint32_t gang_factor = 1;
uint32_t gang_factor = 1, rec_sdma_eng_id_mask = 0;
if (src_id != dst_id) {
auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id);
@@ -398,12 +398,15 @@ void BuildTopology() {
else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth)
gang_factor = linfo.info.max_bandwidth/linfo.info.min_bandwidth;
else gang_factor = 1;
rec_sdma_eng_id_mask = linfo.rec_sdma_eng_id_mask;
}
}
// Register all GPUs regardless of connection type to take advantage of easy
// key-value lookup later on.
((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, gang_factor);
((AMD::GpuAgent*)src_gpu)->RegisterRecSdmaEngIdMaskPeer(*dst_gpu, rec_sdma_eng_id_mask);
}
}
}
@@ -268,10 +268,11 @@ void Runtime::SetLinkCount(size_t num_nodes) {
}
void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
uint32_t num_hop,
uint32_t num_hop, uint32_t rec_sdma_eng_id_mask,
hsa_amd_memory_pool_link_info_t& link_info) {
const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to);
link_matrix_[idx].num_hop = num_hop;
link_matrix_[idx].rec_sdma_eng_id_mask = rec_sdma_eng_id_mask;
link_matrix_[idx].info = link_info;
// Limit the number of hop to 1 since the runtime does not have enough
+7
Vedi File
@@ -101,6 +101,10 @@ class Flag {
enable_sdma_copy_size_override_ = (var == "0") ? SDMA_DISABLE :
((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
var = os::GetEnvVar("HSA_ENABLE_SDMA_RECOMMENDED_ENG");
enable_sdma_recommended_eng_ = (var == "0") ? SDMA_DISABLE :
((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES");
filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES");
@@ -288,6 +292,8 @@ class Flag {
SDMA_OVERRIDE enable_sdma_copy_size_override() const { return enable_sdma_copy_size_override_; }
SDMA_OVERRIDE enable_sdma_recommended_eng() const { return enable_sdma_recommended_eng_; }
std::string visible_gpus() const { return visible_gpus_; }
bool filter_visible_gpus() const { return filter_visible_gpus_; }
@@ -384,6 +390,7 @@ class Flag {
SDMA_OVERRIDE enable_peer_sdma_;
SDMA_OVERRIDE enable_sdma_gang_;
SDMA_OVERRIDE enable_sdma_copy_size_override_;
SDMA_OVERRIDE enable_sdma_recommended_eng_;
bool filter_visible_gpus_;
std::string visible_gpus_;