rocr: Memory copy based on recommended SDMA engines
Recommended SDMA engines for DMA copies are now exposed for better GPU-to-GPU performance, and ROCr can now select those engines. Also pin host-to-device copies to SDMA0 and device-to-host copies to SDMA1 for better stability and performance. Change-Id: Ideff2e13daf537104efecb8b837bd49ee5096cb5
This commit is contained in:
@@ -66,7 +66,7 @@ class BlitSdmaBase : public core::Blit {
|
||||
static const size_t kMaxSingleFillSize;
|
||||
virtual bool isSDMA() const override { return true; }
|
||||
virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi,
|
||||
size_t linear_copy_size_override) = 0;
|
||||
size_t linear_copy_size_override, int rec_engine) = 0;
|
||||
virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
|
||||
const hsa_dim3_t* dst_offset,
|
||||
const hsa_pitched_ptr_t* src,
|
||||
@@ -92,7 +92,7 @@ class BlitSdma : public BlitSdmaBase {
|
||||
///
|
||||
/// @return hsa_status_t
|
||||
virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi,
|
||||
size_t linear_copy_size_override) override;
|
||||
size_t linear_copy_size_override, int rec_eng) override;
|
||||
|
||||
/// @brief Marks the queue object as invalid and uncouples its link with
|
||||
/// the underlying compute device's control block. Use of queue object
|
||||
|
||||
@@ -161,6 +161,8 @@ class GpuAgentInt : public core::Agent {
|
||||
|
||||
virtual void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) = 0;
|
||||
|
||||
virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) = 0;
|
||||
|
||||
// @brief Query if agent represent Kaveri GPU.
|
||||
//
|
||||
// @retval true if agent is Kaveri GPU.
|
||||
@@ -336,6 +338,8 @@ class GpuAgent : public GpuAgentInt {
|
||||
|
||||
void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) override;
|
||||
|
||||
void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) override;
|
||||
|
||||
// Getter & setters.
|
||||
|
||||
// @brief Returns Hive ID
|
||||
@@ -457,7 +461,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @brief Create SDMA blit object.
|
||||
//
|
||||
// @retval NULL if SDMA blit creation and initialization failed.
|
||||
core::Blit* CreateBlitSdma(bool use_xgmi);
|
||||
core::Blit* CreateBlitSdma(bool use_xgmi, int rec_eng);
|
||||
|
||||
// @brief Create Kernel blit object using provided compute queue.
|
||||
//
|
||||
@@ -761,6 +765,10 @@ class GpuAgent : public GpuAgentInt {
|
||||
bool DmaEngineIsFree(uint32_t engine_id);
|
||||
|
||||
std::map<uint64_t,unsigned int> gang_peers_info_;
|
||||
|
||||
std::map<uint64_t, uint32_t> rec_sdma_eng_id_peers_info_;
|
||||
|
||||
bool uses_rec_sdma_eng_id_mask_;
|
||||
};
|
||||
|
||||
} // namespace amd
|
||||
|
||||
@@ -115,9 +115,10 @@ class Runtime {
|
||||
public:
|
||||
/// @brief Structure to describe connectivity between agents.
|
||||
struct LinkInfo {
|
||||
LinkInfo() : num_hop(0), info{0} {}
|
||||
LinkInfo() : num_hop(0), rec_sdma_eng_id_mask(0), info{0} {}
|
||||
|
||||
uint32_t num_hop;
|
||||
uint32_t rec_sdma_eng_id_mask;
|
||||
hsa_amd_memory_pool_link_info_t info;
|
||||
};
|
||||
|
||||
@@ -167,7 +168,7 @@ class Runtime {
|
||||
/// @param [in] link_info The link information between source and destination
|
||||
/// nodes.
|
||||
void RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
|
||||
uint32_t num_hop,
|
||||
uint32_t num_hop, uint32_t rec_sdma_eng_id_mask,
|
||||
hsa_amd_memory_pool_link_info_t& link_info);
|
||||
|
||||
/// @brief Query link information between two nodes.
|
||||
|
||||
@@ -283,10 +283,10 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
|
||||
HSAKMT_STATUS kmt_status;
|
||||
if (core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging) {
|
||||
queue_rsrc.ErrorReason = &exception_signal_->signal_.value;
|
||||
kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_,
|
||||
kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_,
|
||||
ring_buf_alloc_bytes_, queue_event_, &queue_rsrc);
|
||||
} else {
|
||||
kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_,
|
||||
kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_,
|
||||
ring_buf_alloc_bytes_, NULL, &queue_rsrc);
|
||||
}
|
||||
if (kmt_status != HSAKMT_STATUS_SUCCESS)
|
||||
|
||||
@@ -133,7 +133,7 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::~BlitSdma()
|
||||
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Initialize(
|
||||
const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override) {
|
||||
const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override, int rec_eng) {
|
||||
if (queue_start_addr_ != NULL) {
|
||||
// Already initialized.
|
||||
return HSA_STATUS_SUCCESS;
|
||||
@@ -191,10 +191,12 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
|
||||
// device. ROCr creates queues that are of two kinds: PCIe optimized
|
||||
// and xGMI optimized. Which queue to create is indicated via input
|
||||
// boolean flag
|
||||
const HSA_QUEUE_TYPE kQueueType_ = use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA;
|
||||
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100,
|
||||
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
|
||||
kQueueSize, NULL, &queue_resource_)) {
|
||||
const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID :
|
||||
(use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA);
|
||||
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueueExt(agent_->node_id(), kQueueType_, 100,
|
||||
HSA_QUEUE_PRIORITY_MAXIMUM, rec_eng,
|
||||
queue_start_addr_, kQueueSize, NULL,
|
||||
&queue_resource_)) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
|
||||
@@ -720,7 +720,7 @@ core::Queue* GpuAgent::CreateInterceptibleQueue(void (*callback)(hsa_status_t st
|
||||
return queue;
|
||||
}
|
||||
|
||||
core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
|
||||
core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) {
|
||||
AMD::BlitSdmaBase* sdma;
|
||||
size_t copy_size_override = 0;
|
||||
const size_t copy_size_overrides[2] = {0x3fffff, 0x3fffffff};
|
||||
@@ -754,7 +754,9 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
|
||||
core::Runtime::runtime_singleton_->flag().enable_sdma_copy_size_override();
|
||||
if (copy_size_override_setting == Flag::SDMA_DISABLE) copy_size_override = 0;
|
||||
|
||||
if (sdma->Initialize(*this, use_xgmi, copy_size_override) != HSA_STATUS_SUCCESS) {
|
||||
rec_eng = uses_rec_sdma_eng_id_mask_ || !use_xgmi ? rec_eng : -1;
|
||||
|
||||
if (sdma->Initialize(*this, use_xgmi, copy_size_override, rec_eng) != HSA_STATUS_SUCCESS) {
|
||||
sdma->Destroy(*this);
|
||||
delete sdma;
|
||||
sdma = nullptr;
|
||||
@@ -801,7 +803,7 @@ void GpuAgent::InitDma() {
|
||||
queues_[QueuePCSampling].reset([queue_lambda, this]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); });
|
||||
|
||||
// Decide which engine to use for blits.
|
||||
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev) {
|
||||
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev, uint32_t rec_eng) {
|
||||
Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();
|
||||
|
||||
// User SDMA queues are unstable on gfx8 and unsupported on gfx1013.
|
||||
@@ -817,7 +819,7 @@ void GpuAgent::InitDma() {
|
||||
*blits_[BlitHostToDev];
|
||||
}
|
||||
|
||||
auto ret = CreateBlitSdma(use_xgmi);
|
||||
auto ret = CreateBlitSdma(use_xgmi, rec_eng);
|
||||
if (ret != nullptr) return ret;
|
||||
}
|
||||
|
||||
@@ -857,14 +859,15 @@ void GpuAgent::InitDma() {
|
||||
return ret;
|
||||
});
|
||||
blits_[BlitHostToDev].reset(
|
||||
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true); });
|
||||
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true, 0); });
|
||||
blits_[BlitDevToHost].reset(
|
||||
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false); });
|
||||
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false, 1); });
|
||||
|
||||
// XGMI engines.
|
||||
for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) {
|
||||
const int eng = idx - 1;
|
||||
blits_[idx].reset(
|
||||
[blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility], false); });
|
||||
[blit_lambda, this, eng]() { return blit_lambda(true, queues_[QueueUtility], false, eng); });
|
||||
}
|
||||
|
||||
// GWS queues.
|
||||
@@ -941,6 +944,25 @@ void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_fa
|
||||
gang_peers_info_[peer.public_handle().handle] = max_bandwidth_factor;
|
||||
}
|
||||
|
||||
// Assign direct peer recommended SDMA engine IDs to GPU
|
||||
void GpuAgent::RegisterRecSdmaEngIdMaskPeer(core::Agent& peer, uint32_t rec_sdma_eng_id_mask) {
|
||||
auto kfd_version = core::Runtime::runtime_singleton_->KfdVersion().version;
|
||||
bool rec_eng_enabled = core::Runtime::runtime_singleton_->flag().enable_sdma_recommended_eng() !=
|
||||
Flag::SDMA_DISABLE;
|
||||
|
||||
// Assume all recommended masks with single recommended engine (IsPowerOfTwo)
|
||||
// will only support targeting that engine and will not gang.
|
||||
// Also assume support is uniform for every device in the system.
|
||||
uses_rec_sdma_eng_id_mask_ = (kfd_version.KernelInterfaceMajorVersion > 1 ||
|
||||
(kfd_version.KernelInterfaceMajorVersion == 1 &&
|
||||
kfd_version.KernelInterfaceMinorVersion >= 17)) &&
|
||||
isa_->GetMajorVersion() == 9 && isa_->GetMinorVersion() >= 4 &&
|
||||
IsPowerOfTwo(rec_sdma_eng_id_mask) && rec_eng_enabled;
|
||||
|
||||
rec_sdma_eng_id_peers_info_[peer.public_handle().handle] = uses_rec_sdma_eng_id_mask_ ?
|
||||
rec_sdma_eng_id_mask : 0;
|
||||
}
|
||||
|
||||
// Destroy gang signal
|
||||
static bool GangCopyCompleteHandler(hsa_signal_value_t, void *arg ) {
|
||||
core::Signal *gang_signal = reinterpret_cast<core::Signal*>(arg);
|
||||
@@ -955,6 +977,13 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
|
||||
size_t size,
|
||||
std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) {
|
||||
// Recommended SDMA engine copies only have gang factor 1
|
||||
uint32_t rec_sdma_eng = ffs(rec_sdma_eng_id_peers_info_[dst_agent.public_handle().handle]);
|
||||
|
||||
if (rec_sdma_eng)
|
||||
return DmaCopyOnEngine(dst, dst_agent, src, src_agent, size,
|
||||
dep_signals, out_signal, rec_sdma_eng, false);
|
||||
|
||||
if (profiling_enabled()) {
|
||||
// Track the agent so we could translate the resulting timestamp to system
|
||||
// domain correctly.
|
||||
|
||||
@@ -238,7 +238,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
|
||||
link_info.numa_distance = io_link.Weight;
|
||||
|
||||
core::Runtime::runtime_singleton_->RegisterLinkInfo(
|
||||
io_link.NodeFrom, io_link.NodeTo, io_link.Weight, link_info);
|
||||
io_link.NodeFrom, io_link.NodeTo, io_link.Weight, io_link.RecSdmaEngIdMask, link_info);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -383,7 +383,7 @@ void BuildTopology() {
|
||||
uint32_t src_id = src_gpu->node_id();
|
||||
for (auto& dst_gpu : core::Runtime::runtime_singleton_->gpu_agents()) {
|
||||
uint32_t dst_id = dst_gpu->node_id();
|
||||
uint32_t gang_factor = 1;
|
||||
uint32_t gang_factor = 1, rec_sdma_eng_id_mask = 0;
|
||||
|
||||
if (src_id != dst_id) {
|
||||
auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id);
|
||||
@@ -398,12 +398,15 @@ void BuildTopology() {
|
||||
else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth)
|
||||
gang_factor = linfo.info.max_bandwidth/linfo.info.min_bandwidth;
|
||||
else gang_factor = 1;
|
||||
|
||||
rec_sdma_eng_id_mask = linfo.rec_sdma_eng_id_mask;
|
||||
}
|
||||
}
|
||||
|
||||
// Register all GPUs regardless of connection type to take advantage of easy
|
||||
// key-value lookup later on.
|
||||
((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, gang_factor);
|
||||
((AMD::GpuAgent*)src_gpu)->RegisterRecSdmaEngIdMaskPeer(*dst_gpu, rec_sdma_eng_id_mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,10 +268,11 @@ void Runtime::SetLinkCount(size_t num_nodes) {
|
||||
}
|
||||
|
||||
void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
|
||||
uint32_t num_hop,
|
||||
uint32_t num_hop, uint32_t rec_sdma_eng_id_mask,
|
||||
hsa_amd_memory_pool_link_info_t& link_info) {
|
||||
const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to);
|
||||
link_matrix_[idx].num_hop = num_hop;
|
||||
link_matrix_[idx].rec_sdma_eng_id_mask = rec_sdma_eng_id_mask;
|
||||
link_matrix_[idx].info = link_info;
|
||||
|
||||
// Limit the number of hop to 1 since the runtime does not have enough
|
||||
|
||||
@@ -101,6 +101,10 @@ class Flag {
|
||||
enable_sdma_copy_size_override_ = (var == "0") ? SDMA_DISABLE :
|
||||
((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
|
||||
|
||||
var = os::GetEnvVar("HSA_ENABLE_SDMA_RECOMMENDED_ENG");
|
||||
enable_sdma_recommended_eng_ = (var == "0") ? SDMA_DISABLE :
|
||||
((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
|
||||
|
||||
visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES");
|
||||
filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES");
|
||||
|
||||
@@ -288,6 +292,8 @@ class Flag {
|
||||
|
||||
SDMA_OVERRIDE enable_sdma_copy_size_override() const { return enable_sdma_copy_size_override_; }
|
||||
|
||||
// @brief Returns the override parsed from HSA_ENABLE_SDMA_RECOMMENDED_ENG:
// SDMA_DISABLE for "0", SDMA_ENABLE for "1", SDMA_DEFAULT otherwise.
SDMA_OVERRIDE enable_sdma_recommended_eng() const {
  return enable_sdma_recommended_eng_;
}
|
||||
|
||||
std::string visible_gpus() const { return visible_gpus_; }
|
||||
|
||||
bool filter_visible_gpus() const { return filter_visible_gpus_; }
|
||||
@@ -384,6 +390,7 @@ class Flag {
|
||||
SDMA_OVERRIDE enable_peer_sdma_;
|
||||
SDMA_OVERRIDE enable_sdma_gang_;
|
||||
SDMA_OVERRIDE enable_sdma_copy_size_override_;
|
||||
SDMA_OVERRIDE enable_sdma_recommended_eng_;
|
||||
|
||||
bool filter_visible_gpus_;
|
||||
std::string visible_gpus_;
|
||||
|
||||
Reference in a new issue
Block a user