diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h
index ceecfcd15c..185a279ab0 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h
@@ -95,10 +95,11 @@ class BlitKernel : public core::Blit {
   /// @param size Size of the data to be copied.
   /// @param dep_signals Arrays of dependent signal.
   /// @param out_signal Output signal.
+  /// @param gang_signals Array of gang signals.
   virtual hsa_status_t SubmitLinearCopyCommand(
       void* dst, const void* src, size_t size,
       std::vector<core::Signal*>& dep_signals,
-      core::Signal& out_signal) override;
+      core::Signal& out_signal, std::vector<core::Signal*>& gang_signals) override;
 
   /// @brief Submit an AQL packet to perform memory fill. The call is blocking
   /// until the command execution is finished.
@@ -113,6 +114,11 @@ class BlitKernel : public core::Blit {
 
   virtual uint64_t PendingBytes() override;
 
+  void GangLeader(bool gang_leader) override {}
+  bool GangLeader() const override { return false; }
+  void GangStatus(bool is_ganged) override {}
+  bool GangStatus() const override { return false; }
+
  private:
   union KernelArgs {
     struct __ALIGNED__(16) {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
index 7e9c5e9ae3..e1e1cfbe8e 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
@@ -121,10 +121,11 @@ class BlitSdma : public BlitSdmaBase {
   /// @param size Size of the data to be copied.
   /// @param dep_signals Arrays of dependent signal.
   /// @param out_signal Output signal.
+  /// @param gang_signals Array of gang signals.
   virtual hsa_status_t SubmitLinearCopyCommand(
       void* dst, const void* src, size_t size,
       std::vector<core::Signal*>& dep_signals,
-      core::Signal& out_signal) override;
+      core::Signal& out_signal, std::vector<core::Signal*>& gang_signals) override;
 
   virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
                                              const hsa_dim3_t* dst_offset,
@@ -144,6 +145,10 @@ class BlitSdma : public BlitSdmaBase {
   virtual hsa_status_t EnableProfiling(bool enable) override;
 
   virtual uint64_t PendingBytes() override;
+  void GangLeader(bool gang_leader) override { gang_leader_ = gang_leader; }
+  bool GangLeader() const override { return gang_leader_; }
+  void GangStatus(bool is_ganged) override { is_ganged_ = is_ganged; }
+  bool GangStatus() const override { return is_ganged_; }
 
  private:
   /// @brief Acquires the address into queue buffer where a new command
@@ -216,7 +221,7 @@ class BlitSdma : public BlitSdmaBase {
 
   hsa_status_t SubmitCommand(const void* cmds, size_t cmd_size, uint64_t size,
                              const std::vector<core::Signal*>& dep_signals,
-                             core::Signal& out_signal);
+                             core::Signal& out_signal, std::vector<core::Signal*>& gang_signals);
 
   hsa_status_t SubmitBlockingCommand(const void* cmds, size_t cmd_size, uint64_t size);
 
@@ -300,6 +305,12 @@ class BlitSdma : public BlitSdmaBase {
 
   /// True if sDMA supports HDP flush
   bool hdp_flush_support_;
+
+  /// True if SDMA blit is gang leader
+  bool gang_leader_ = false;
+
+  /// True if SDMA blit is ganged
+  bool is_ganged_ = false;
 };
 
 // Ring indices are 32-bit.
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index d2a502749a..c8b8017387 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -145,6 +145,8 @@ class GpuAgentInt : public core::Agent {
   // @retval Coherency type.
   virtual hsa_amd_coherency_type_t current_coherency_type() const = 0;
 
+  virtual void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) = 0;
+
   // @brief Query if agent represent Kaveri GPU.
   //
   // @retval true if agent is Kaveri GPU.
@@ -290,6 +292,8 @@ class GpuAgent : public GpuAgentInt {
   }
 
   core::Agent* GetNearestCpuAgent(void) const;
+
+  void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) override;
 
   // Getter & setters.
 
@@ -509,7 +513,6 @@ class GpuAgent : public GpuAgentInt {
   // @brief HDP flush registers
   hsa_amd_hdp_flush_t HDP_flush_ = {nullptr, nullptr};
 
- private:
   // @brief Query the driver to get the region list owned by this agent.
   void InitRegionList();
 
@@ -546,7 +549,7 @@ class GpuAgent : public GpuAgentInt {
   void ReleaseScratch(void* base, size_t size, bool large);
 
   // Bind index of peer device that is connected via xGMI links
-  lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
+  lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent, int gang_id);
 
   // Bind the Blit object that will drive the copy operation
   // across PCIe links (H2D or D2H) or is within same device D2D
@@ -554,7 +557,7 @@ class GpuAgent : public GpuAgentInt {
 
   // Bind the Blit object that will drive the copy operation
   lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent,
-                                      const size_t size);
+                                      const size_t size, int gang_id);
 
   // Bind the Blit object that will drive the copy operation by engine ID
   lazy_ptr<core::Blit>& GetBlitObject(uint32_t engine_id);
@@ -599,6 +602,8 @@ class GpuAgent : public GpuAgentInt {
 
   // Check if SDMA engine by ID is free
   bool DmaEngineIsFree(uint32_t engine_id);
+
+  std::vector<std::pair<core::Agent&, unsigned int>> gang_peers_info_;
 };
 
 }  // namespace amd
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h
index 9e927755bf..94e9fe892d 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h
@@ -85,9 +85,11 @@ class Blit {
   /// @param size Size of the data to be copied.
   /// @param dep_signals Arrays of dependent signal.
   /// @param out_signal Output signal.
+  /// @param gang_signals Array of gang signals.
   virtual hsa_status_t SubmitLinearCopyCommand(
       void* dst, const void* src, size_t size,
-      std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) = 0;
+      std::vector<core::Signal*>& dep_signals, core::Signal& out_signal,
+      std::vector<core::Signal*>& gang_signals) = 0;
 
   /// @brief Submit a linear fill command to the the underlying compute device's
   /// control block. The call is blocking until the command execution is
@@ -114,6 +116,18 @@ class Blit {
   /// @Brief Reports the approximate number of remaining bytes to copy or fill. Any return of zero
   /// must be exact.
   virtual uint64_t PendingBytes() = 0;
+
+  /// @brief Marks whether this blit acts as the leader of a gang copy.
+  virtual void GangLeader(bool gang_leader) = 0;
+
+  /// @brief Reports whether this blit is a gang leader. Defaults to false.
+  virtual bool GangLeader() const { return false; }
+
+  /// @brief Marks whether this blit currently takes part in a gang copy.
+  virtual void GangStatus(bool is_ganged) = 0;
+
+  /// @brief Reports whether this blit is ganged. Defaults to false.
+  virtual bool GangStatus() const { return false; }
 };
 }  // namespace core
 }  // namespace rocr
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp
index 661e3be4db..9ede1ea301 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp
@@ -982,9 +982,10 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
   HSA::hsa_signal_store_relaxed(completion_signal_, 1);
 
   std::vector<core::Signal*> dep_signals(0);
+  std::vector<core::Signal*> gang_signals(0);
 
   hsa_status_t stat = SubmitLinearCopyCommand(
-      dst, src, size, dep_signals, *core::Signal::Convert(completion_signal_));
+      dst, src, size, dep_signals, *core::Signal::Convert(completion_signal_), gang_signals);
 
   if (stat != HSA_STATUS_SUCCESS) {
     return stat;
@@ -1002,7 +1003,8 @@ hsa_status_t
BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
 
 hsa_status_t BlitKernel::SubmitLinearCopyCommand(
     void* dst, const void* src, size_t size,
-    std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) {
+    std::vector<core::Signal*>& dep_signals, core::Signal& out_signal,
+    std::vector<core::Signal*>& gang_signals) {
   // Reserve write index for barrier(s) + dispatch packet.
   const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5);
   const uint32_t total_num_packet = num_barrier_packet + 1;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
index 2a4688df67..23ae0b3069 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
@@ -247,9 +247,12 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
   MAKE_SCOPE_GUARD([&]() { completionSignal->StoreRelaxed(0); });
   lock.Release();
 
+  std::vector<core::Signal*> gang_signals(0);
+
   // Submit command and wait for completion
   hsa_status_t ret =
-      SubmitCommand(cmd, cmd_size, size, std::vector<core::Signal*>(), *completionSignal);
+      SubmitCommand(cmd, cmd_size, size, std::vector<core::Signal*>(), *completionSignal,
+                    gang_signals);
   completionSignal->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 1, -1, HSA_WAIT_STATE_BLOCKED);
   return ret;
 }
@@ -257,7 +260,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
 hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::SubmitCommand(
     const void* cmd, size_t cmd_size, uint64_t size, const std::vector<core::Signal*>& dep_signals,
-    core::Signal& out_signal) {
+    core::Signal& out_signal, std::vector<core::Signal*>& gang_signals) {
+
   // The signal is 64 bit value, and poll checks for 32 bit value. So we
   // need to use two poll operations per dependent signal.
   const uint32_t num_poll_command =
@@ -273,7 +277,15 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
   uint64_t* end_ts_addr = nullptr;
   uint32_t total_timestamp_command_size = 0;
 
-  if (profiling_enabled) {
+  // Gang leader polls gang item completions and does final decrement or
+  // completion of gang signal to prevent race between poll and signal
+  // destruction.
+  uint32_t total_gang_complete_command_size = poll_command_size_ +
+      (platform_atomic_support_ ? atomic_command_size_ : fence_command_size_);
+  uint32_t total_gang_command_size = gang_leader_ ?
+      static_cast<uint32_t>(gang_signals.size()) * total_gang_complete_command_size : 0;
+
+  if (profiling_enabled && (gang_leader_ || gang_signals.empty())) {
     out_signal.GetSdmaTsAddresses(start_ts_addr, end_ts_addr);
     total_timestamp_command_size = 2 * timestamp_command_size_;
   }
@@ -309,7 +321,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
   if (useGCR) flush_cmd_size += gcr_command_size_ * 2;
 
   const uint32_t total_command_size = total_poll_command_size + cmd_size + sync_command_size +
-      total_timestamp_command_size + interrupt_command_size + flush_cmd_size;
+      total_timestamp_command_size + interrupt_command_size + flush_cmd_size + total_gang_command_size;
 
   RingIndexTy curr_index;
   char* command_addr;
@@ -341,7 +353,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
     wrapped_index += poll_command_size_;
   }
 
-  if (profiling_enabled) {
+  if (profiling_enabled && (gang_leader_ || gang_signals.empty())) {
     BuildGetGlobalTimestampCommand(command_addr, reinterpret_cast<void*>(start_ts_addr));
     command_addr += timestamp_command_size_;
     bytes_written_[wrapped_index] = prior_bytes;
@@ -380,7 +392,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
     wrapped_index += gcr_command_size_;
   }
 
-  if (profiling_enabled) {
+  if (profiling_enabled && (gang_leader_ || gang_signals.empty())) {
     assert(IsMultipleOf(end_ts_addr, 32));
     BuildGetGlobalTimestampCommand(command_addr,
                                    reinterpret_cast<void*>(end_ts_addr));
@@ -389,6 +401,31 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
     wrapped_index += timestamp_command_size_;
   }
 
+  // Wait for non-leaders gang items to complete
+  if (gang_leader_) {
+    for (size_t i = 0; i < gang_signals.size(); i++) {
+      uint32_t* gang_signal_addr =
+        reinterpret_cast<uint32_t*>(gang_signals[i]->ValueLocation());
+      BuildPollCommand(command_addr, gang_signal_addr, 1);
+      command_addr += poll_command_size_;
+      bytes_written_[wrapped_index] = prior_bytes;
+      wrapped_index += poll_command_size_;
+
+      // After non-leader gang-items have completed, decrement the gang signal value.
+      if (platform_atomic_support_) {
+        BuildAtomicDecrementCommand(command_addr, gang_signal_addr);
+        command_addr += atomic_command_size_;
+        bytes_written_[wrapped_index] = post_bytes;
+        wrapped_index += atomic_command_size_;
+      } else {
+        BuildFenceCommand(command_addr, gang_signal_addr, 0);
+        command_addr += fence_command_size_;
+        bytes_written_[wrapped_index] = post_bytes;
+        wrapped_index += fence_command_size_;
+      }
+    }
+  }
+
   // After transfer is completed, decrement the signal value.
   if (platform_atomic_support_) {
     BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
@@ -448,7 +485,8 @@ template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bo
 hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
                       useGCR>::SubmitLinearCopyCommand(void* dst, const void* src, size_t size,
                                                        std::vector<core::Signal*>& dep_signals,
-                                                       core::Signal& out_signal) {
+                                                       core::Signal& out_signal,
+                                                       std::vector<core::Signal*>& gang_signals) {
   // Break the copy into multiple copy operations when the copy size exceeds
   // the SDMA linear copy limit.
   const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
@@ -458,7 +496,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
   BuildCopyCommand(reinterpret_cast<char*>(&buff[0]), num_copy_command, dst, src, size);
 
   return SubmitCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_COPY_LINEAR), size, dep_signals,
-                       out_signal);
+                       out_signal, gang_signals);
 }
 
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
@@ -524,8 +562,10 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::SubmitCopyRe
 
   uint64_t size = range->x * range->y * range->z;
 
+  std::vector<core::Signal*> gang_signals(0);
+
   return SubmitCommand(&pkts[0], pkts.size() * sizeof(SDMA_PKT_COPY_LINEAR_RECT), size, dep_signals,
-                       out_signal);
+                       out_signal, gang_signals);
 }
 
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 5fc8f07856..1a86c6085b 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -58,6 +58,7 @@
 #include "core/inc/amd_blit_sdma.h"
 #include "core/inc/amd_gpu_pm4.h"
 #include "core/inc/amd_memory_region.h"
+#include "core/inc/default_signal.h"
 #include "core/inc/interrupt_signal.h"
 #include "core/inc/isa.h"
 #include "core/inc/runtime.h"
@@ -850,25 +851,137 @@ void GpuAgent::SetCopyStatusCheckRefCount(bool set) {
   else pending_copy_stat_check_ref_++;
 }
 
+// Assign direct peer gang factor to GPU
+void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_factor) {
+  unsigned int max_gang_factor = std::min(max_bandwidth_factor, properties_.NumSdmaXgmiEngines);
+  gang_peers_info_.push_back(std::pair<core::Agent&, unsigned int>(peer, max_gang_factor));
+}
+
+// Destroy gang signal. Returns false so the runtime stops monitoring the
+// signal once it has been destroyed.
+static bool GangCopyCompleteHandler(hsa_signal_value_t, void* arg) {
+  core::Signal* gang_signal = reinterpret_cast<core::Signal*>(arg);
+  if (gang_signal->IsValid()) {
+    gang_signal->DestroySignal();
+  }
+  return false;
+}
+
 hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
                                const void* src, core::Agent& src_agent,
                                size_t size,
                                std::vector<core::Signal*>& dep_signals,
                                core::Signal& out_signal) {
-  SetCopyRequestRefCount(true);
-  // Bind the Blit object that will drive this copy operation
-  lazy_ptr<core::Blit>& blit = GetBlitObject(dst_agent, src_agent, size);
-
   if (profiling_enabled()) {
     // Track the agent so we could translate the resulting timestamp to system
     // domain correctly.
     out_signal.async_copy_agent(core::Agent::Convert(this->public_handle()));
   }
 
-  hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal);
-  SetCopyRequestRefCount(false);
-
-  return stat;
+  // Calculate the number of gang items. Ganging is skipped when SDMA or SDMA
+  // ganging is disabled (blit copies already saturate xGMI) and for small
+  // copies (avoids the latency boundary).
+  const Flag& flag = core::Runtime::runtime_singleton_->flag();
+  const bool gang_allowed = flag.enable_sdma() != Flag::SDMA_DISABLE &&
+      flag.enable_sdma_gang() != Flag::SDMA_DISABLE && size >= HSA_PAGE_SIZE_4KB;
+  int tmp_gang_factor = 1;
+  if (gang_allowed) {
+    for (const auto& peer_info : gang_peers_info_) {
+      if (dst_agent.public_handle().handle == peer_info.first.public_handle().handle) {
+        // Keep at least one item so the copy below is always submitted
+        tmp_gang_factor = std::max(1, static_cast<int>(peer_info.second));
+      }
+    }
+  }
+
+  // Select candidate engines; bit i of gang_mask marks candidate i as usable
+  int gang_factor = 0;
+  uint32_t gang_mask = 0;
+  for (int i = 0; i < tmp_gang_factor; i++) {
+    uint32_t engine_offset = 0;
+    for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
+      if (xgmi_peer_list_[idx]->public_handle().handle == dst_agent.public_handle().handle) {
+        engine_offset = ((idx + i) % properties_.NumSdmaXgmiEngines) + DefaultBlitCount;
+        break;
+      }
+    }
+
+    // Avoid oversubscribing unavailable blit engines that are not already ganged
+    if (!!engine_offset && tmp_gang_factor > 1 && !DmaEngineIsFree(engine_offset) &&
+        !blits_[engine_offset]->GangStatus()) {
+      continue;
+    }
+
+    gang_mask |= 1u << i;
+    gang_factor++;
+  }
+
+  if (!gang_factor) gang_factor = 1;
+
+  // Manage internal gang signals, one per non-leader gang item
+  std::vector<core::Signal*> gang_signals;
+  for (int i = 0; i < gang_factor - 1; i++) {
+    // Initial value is 2 where 1 is for gang-leader to ack and
+    // 1 for non-leader gang item to decrement
+    core::Signal* gang_signal = new core::DefaultSignal(2);
+
+    // Fall back to non-gang copy. The list is emptied as well so that no
+    // destroyed signal is handed to a blit as its completion signal below.
+    if (!gang_signal->IsValid()) {
+      for (core::Signal* created : gang_signals) created->DestroySignal();
+      gang_signals.clear();
+      gang_factor = 1;
+      break;
+    }
+
+    core::Runtime::runtime_singleton_->SetAsyncSignalHandler(
+        core::Signal::Convert(gang_signal), HSA_SIGNAL_CONDITION_EQ, 0,
+        GangCopyCompleteHandler, reinterpret_cast<void*>(gang_signal));
+    gang_signals.push_back(gang_signal);
+  }
+
+  // Bind the Blit object(s) that will drive this copy operation. Byte
+  // pointers are used because arithmetic on void* is not standard C++.
+  char* dst_bytes = static_cast<char*>(dst);
+  const char* src_bytes = static_cast<const char*>(src);
+  const size_t max_chunk = (size + gang_factor - 1) / gang_factor;
+  size_t offset = 0, remainder_size = size;
+  size_t gang_sig_count = 0;
+  int submitted = 0;
+
+  // gang_mask may have holes, so walk all candidates instead of only the
+  // first gang_factor ones; otherwise part of the copy is never submitted.
+  for (int i = 0; i < tmp_gang_factor && submitted < gang_factor; i++) {
+    if (gang_factor > 1 && !(gang_mask & (1u << i))) continue;
+
+    // Set leader and gang status to blit
+    SetCopyRequestRefCount(true);
+    lazy_ptr<core::Blit>& blit = GetBlitObject(dst_agent, src_agent, size, i);
+    blit->GangLeader(gang_factor > 1 && submitted == 0);
+    blit->GangStatus(gang_factor > 1);
+
+    // Non-leader items complete on their own gang signal; the leader (or a
+    // non-ganged copy) completes on out_signal.
+    const size_t chunk = std::min(remainder_size, max_chunk);
+    core::Signal* completion = &out_signal;
+    if (!blit->GangLeader() && gang_sig_count < gang_signals.size()) {
+      completion = gang_signals[gang_sig_count++];
+    }
+    const hsa_status_t stat = blit->SubmitLinearCopyCommand(
+        dst_bytes + offset, src_bytes + offset, chunk, dep_signals, *completion, gang_signals);
+    SetCopyRequestRefCount(false);
+
+    // NOTE(review): a failure after the leader was submitted leaves gang
+    // signals pending; confirm how callers are expected to recover from a
+    // partially submitted gang copy.
+    if (stat != HSA_STATUS_SUCCESS) return stat;
+
+    offset += chunk;
+    remainder_size -= chunk;
+    submitted++;
+  }
+
+  return HSA_STATUS_SUCCESS;
 }
 
 hsa_status_t GpuAgent::DmaCopyOnEngine(void* dst, core::Agent& dst_agent,
@@ -934,7 +1047,10 @@ hsa_status_t GpuAgent::DmaCopyOnEngine(void* dst, core::Agent& dst_agent,
     out_signal.async_copy_agent(core::Agent::Convert(this->public_handle()));
   }
 
-  hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal);
+  std::vector<core::Signal*> gang_signals(0);
+
+  hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal,
+                                                    gang_signals);
   SetCopyRequestRefCount(false);
 
   return stat;
@@ -1888,7 +2004,7 @@ 
lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(uint32_t engine_offset) {
   return blits_[engine_offset];
 }
 
-lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
+lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent, int gang_id) {
   // Determine if destination is a member xgmi peers list
   uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines;
   assert((xgmi_engine_cnt > 0) && ("Illegal condition, should not happen"));
@@ -1899,7 +2015,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
     uint64_t dst_handle = dst_agent.public_handle().handle;
     uint64_t peer_handle = xgmi_peer_list_[idx]->public_handle().handle;
     if (peer_handle == dst_handle) {
-      return GetBlitObject((idx % xgmi_engine_cnt) + DefaultBlitCount);
+      return blits_[((idx + gang_id) % xgmi_engine_cnt) + DefaultBlitCount];
     }
   }
 
@@ -1918,7 +2034,9 @@ lazy_ptr<core::Blit>& GpuAgent::GetPcieBlit(const core::Agent& dst_agent,
 }
 
 lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
-                                              const core::Agent& src_agent, const size_t size) {
+                                              const core::Agent& src_agent,
+                                              const size_t size,
+                                              int gang_id) {
   // At this point it is guaranteed that one of
   // the two devices is a GPU, potentially both
   assert(((src_agent.device_type() == core::Agent::kAmdGpuDevice) ||
@@ -1978,7 +2096,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
     return GetPcieBlit(dst_agent, src_agent);
   }
 
-  return GetXgmiBlit(dst_agent);
+  return GetXgmiBlit(dst_agent, gang_id);
 }
 
 void GpuAgent::Trim() {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp
index 264a5f30e0..77601b24e5 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp
@@ -365,6 +365,31 @@ void BuildTopology() {
     maxCu = Max(maxCu, cus);
   }
   const_cast<Flag&>(core::Runtime::runtime_singleton_->flag()).parse_masks(maxGpu, maxCu);
+
+  // Register destination agents that can SDMA gang copy for source agents
+  for (auto& src_gpu : core::Runtime::runtime_singleton_->gpu_agents()) {
+    const uint32_t src_id = src_gpu->node_id();
+    for (auto& dst_gpu : core::Runtime::runtime_singleton_->gpu_agents()) {
+      const uint32_t dst_id = dst_gpu->node_id();
+
+      if (src_id == dst_id)
+        continue;
+
+      auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id);
+
+      // Min Bandwidth < Max Bandwidth if source and destination GPUs are a
+      // single hop away and there exists more than a single xGMI link between
+      // them.  Otherwise, destination GPU is not a gang candidate. A zero
+      // minimum bandwidth is skipped too, as it cannot be used as a divisor.
+      if (linfo.info.link_type != HSA_AMD_LINK_INFO_TYPE_XGMI ||
+          linfo.info.min_bandwidth == 0 ||
+          linfo.info.min_bandwidth == linfo.info.max_bandwidth)
+        continue;
+
+      static_cast<AMD::GpuAgent*>(src_gpu)->RegisterGangPeer(
+          *dst_gpu, linfo.info.max_bandwidth / linfo.info.min_bandwidth);
+    }
+  }
 }
 
 bool Load() {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h
index 507b299014..a75e1ccb85 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h
@@ -90,6 +90,10 @@ class Flag {
     var = os::GetEnvVar("HSA_ENABLE_PEER_SDMA");
     enable_peer_sdma_ = (var == "0") ? SDMA_DISABLE : ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
 
+    var = os::GetEnvVar("HSA_ENABLE_SDMA_GANG");
+    enable_sdma_gang_ = (var == "0") ? SDMA_DISABLE :
+                        ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
+
     visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES");
     filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES");
 
@@ -230,6 +234,8 @@ class Flag {
 
   SDMA_OVERRIDE enable_peer_sdma() const { return enable_peer_sdma_; }
 
+  SDMA_OVERRIDE enable_sdma_gang() const { return enable_sdma_gang_; }
+
   std::string visible_gpus() const { return visible_gpus_; }
 
   bool filter_visible_gpus() const { return filter_visible_gpus_; }
@@ -308,6 +314,7 @@ class Flag {
 
   SDMA_OVERRIDE enable_sdma_;
   SDMA_OVERRIDE enable_peer_sdma_;
+  SDMA_OVERRIDE enable_sdma_gang_;
   bool filter_visible_gpus_;
   std::string visible_gpus_;