Optimize and fix SDMA gang copies

Optimizations include:
- Greedy gang by placing gang leaders on the first D2D SDMA blit context
to avoid deadlocking with other gang leaders and items.  Note that
this is fine since we can't avoid an oversubscription problem when
there is only 1 xGMI link anyway, so treat all xGMI links as a single
pipe for ganging.
- Non-leader gang items don't have to poll on dependency signals so this
opens up more non-blocking SDMA channels.
- Unlock the gang lock when gangs are not needed.
- Change gang factor lookup from a vector of pairs to a map and register all
GPUs in the gang factor lookup regardless of link type so that we can take
advantage of the O(log N) direct key/value lookup time.

Fixes include:
- HSA_PAGE_SIZE_4KB was an incorrect macro to use for gang size limit.
As a result, small copies ended up ganging and hitting latency limit.
Use hardcoded 4096 bytes instead.
- Cap auxiliary gang factor to the number of non-XGMI SDMA engines.

Change-Id: Ic23fde131502906a807134a04599aa6d012e8cbb
This commit is contained in:
Jonathan Kim
2024-01-24 11:35:00 -05:00
förälder caedadcc6f
incheckning 62f3f250ce
4 ändrade filer med 43 tillägg och 80 borttagningar
+3 -3
Visa fil
@@ -600,7 +600,7 @@ class GpuAgent : public GpuAgentInt {
void ReleaseScratch(void* base, size_t size, bool large);
// Bind index of peer device that is connected via xGMI links
lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent, int gang_id);
lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
// Bind the Blit object that will drive the copy operation
// across PCIe links (H2D or D2H) or is within same device D2D
@@ -608,7 +608,7 @@ class GpuAgent : public GpuAgentInt {
// Bind the Blit object that will drive the copy operation
lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent,
const size_t size, int gang_id);
const size_t size);
// Bind the Blit object that will drive the copy operation by engine ID
lazy_ptr<core::Blit>& GetBlitObject(uint32_t engine_id);
@@ -662,7 +662,7 @@ class GpuAgent : public GpuAgentInt {
// Check if SDMA engine by ID is free
bool DmaEngineIsFree(uint32_t engine_id);
std::vector<std::pair<core::Agent&,unsigned int>> gang_peers_info_;
std::map<uint64_t,unsigned int> gang_peers_info_;
};
} // namespace amd
@@ -882,7 +882,7 @@ void GpuAgent::SetCopyStatusCheckRefCount(bool set) {
// Assign direct peer gang factor to GPU
void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_factor) {
gang_peers_info_.push_back(std::pair<core::Agent&,unsigned int>(peer, max_bandwidth_factor));
gang_peers_info_[peer.public_handle().handle] = max_bandwidth_factor;
}
// Destroy gang signal
@@ -905,59 +905,22 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
out_signal.async_copy_agent(core::Agent::Convert(this->public_handle()));
}
ScopedAcquire<KernelMutex> lock(&sdma_gang_lock_);
// Calculate the number of gang items
unsigned int tmp_gang_factor = 1;
for (auto peer_info : gang_peers_info_) {
Flag::SDMA_OVERRIDE sdma_gang_override =
core::Runtime::runtime_singleton_->flag().enable_sdma_gang();
Flag::SDMA_OVERRIDE sdma_override =
core::Runtime::runtime_singleton_->flag().enable_sdma();
// Blit copies already saturate xGMI
if (sdma_override == Flag::SDMA_DISABLE || sdma_gang_override == Flag::SDMA_DISABLE) {
break;
}
unsigned int gang_factor = 1;
if (core::Runtime::runtime_singleton_->flag().enable_sdma_gang() != Flag::SDMA_DISABLE &&
size >= 4096 && dst_agent.device_type() == core::Agent::kAmdGpuDevice)
gang_factor = gang_peers_info_[dst_agent.public_handle().handle];
// Avoid the latency boundary on small copies
if (size < HSA_PAGE_SIZE_4KB) {
break;
}
if (dst_agent.public_handle().handle == peer_info.first.public_handle().handle) {
tmp_gang_factor = peer_info.second;
break;
}
}
int gang_factor = 0;
// Use non-D2D (auxiliary) SDMA engines in the event of xGMI D2D support
// when xGMI SDMA context is not available.
bool has_aux_gang = tmp_gang_factor >= properties_.NumSdmaEngines && !!!properties_.NumSdmaXgmiEngines;
tmp_gang_factor = has_aux_gang ? tmp_gang_factor : std::min(tmp_gang_factor, properties_.NumSdmaXgmiEngines);
for (int i = 0; i < tmp_gang_factor; i++) {
if (has_aux_gang && !DmaEngineIsFree(i + 1)) {
break;
} else {
uint32_t engine_offset = 0;
for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
if (xgmi_peer_list_[idx]->public_handle().handle == dst_agent.public_handle().handle) {
engine_offset = ((idx + i) % properties_.NumSdmaXgmiEngines) + DefaultBlitCount;
break;
}
}
// Avoid oversubscribing unavailable blit engines that are not already ganged
if (!!engine_offset && tmp_gang_factor > 1 && !DmaEngineIsFree(engine_offset)) {
break;
}
}
gang_factor++;
}
if (!gang_factor) gang_factor = 1;
bool has_aux_gang = gang_factor >= properties_.NumSdmaEngines &&
!!!properties_.NumSdmaXgmiEngines;
gang_factor = has_aux_gang ?
std::min(gang_factor, properties_.NumSdmaEngines) :
std::min(gang_factor, properties_.NumSdmaXgmiEngines);
ScopedAcquire<KernelMutex> lock(&sdma_gang_lock_);
if (gang_factor == 1) sdma_gang_lock_.Release();
// Manage internal gang signals
std::vector<core::Signal*> gang_signals;
if (gang_factor > 1) {
@@ -985,22 +948,23 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
// Bind the Blit object that will drive this copy operation
size_t offset = 0, remainder_size = size;
bool gang_leader_set = false;
int gang_sig_count = 0;
for (int i = 0; i < gang_factor; i++) {
// Set leader and gang status to blit
SetCopyRequestRefCount(true);
MAKE_SCOPE_GUARD([&]() { SetCopyRequestRefCount(false); });
lazy_ptr<core::Blit>& blit = has_aux_gang ? blits_[i + 1] :
GetBlitObject(dst_agent, src_agent, size, i);
blit->GangLeader(gang_factor > 1 && !gang_leader_set);
lazy_ptr<core::Blit>& blit = gang_factor > 1 ?
(has_aux_gang ? blits_[i + 1] : blits_[i + DefaultBlitCount]) :
GetBlitObject(dst_agent, src_agent, size);
blit->GangLeader(gang_factor > 1 && !i);
hsa_status_t stat;
size_t chunk = std::min(remainder_size, (size + gang_factor - 1)/gang_factor);
if (!blit->GangLeader() && !gang_signals.empty()) {
std::vector<core::Signal*> dep_signals_null(0); // only leader has to wait on dependencies
stat = blit->SubmitLinearCopyCommand(reinterpret_cast<uint8_t*>(dst) + offset,
reinterpret_cast<const uint8_t*>(src) + offset,
chunk, dep_signals,
chunk, dep_signals_null,
*gang_signals[gang_sig_count], gang_signals);
gang_sig_count++;
} else {
@@ -1015,7 +979,6 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
offset += chunk;
remainder_size -= chunk;
gang_leader_set = true;
}
return HSA_STATUS_SUCCESS;
@@ -2173,7 +2136,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(uint32_t engine_offset) {
return blits_[engine_offset];
}
lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent, int gang_id) {
lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
// Determine if destination is a member xgmi peers list
uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines;
assert((xgmi_engine_cnt > 0) && ("Illegal condition, should not happen"));
@@ -2184,7 +2147,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent, int ga
uint64_t dst_handle = dst_agent.public_handle().handle;
uint64_t peer_handle = xgmi_peer_list_[idx]->public_handle().handle;
if (peer_handle == dst_handle) {
return blits_[((idx + gang_id) % xgmi_engine_cnt) + DefaultBlitCount];
return blits_[(idx % xgmi_engine_cnt) + DefaultBlitCount];
}
}
@@ -2204,8 +2167,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetPcieBlit(const core::Agent& dst_agent,
lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
const core::Agent& src_agent,
const size_t size,
int gang_id) {
const size_t size) {
// At this point it is guaranteed that one of
// the two devices is a GPU, potentially both
assert(((src_agent.device_type() == core::Agent::kAmdGpuDevice) ||
@@ -2265,7 +2227,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
return GetPcieBlit(dst_agent, src_agent);
}
return GetXgmiBlit(dst_agent, gang_id);
return GetXgmiBlit(dst_agent);
}
void GpuAgent::Trim() {
@@ -372,26 +372,26 @@ void BuildTopology() {
uint32_t src_id = src_gpu->node_id();
for (auto& dst_gpu : core::Runtime::runtime_singleton_->gpu_agents()) {
uint32_t dst_id = dst_gpu->node_id();
uint32_t gang_factor = 1;
if (src_id == dst_gpu->node_id())
continue;
auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id);
// xGMI link type cannot determine bandwidth keep it fixed for ganging
bool has_fixed_gang = linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI &&
linfo.info.numa_distance != 15;
// Min Bandwidth < Max Bandwidth if source and destination GPUs are a
// single hop away and there exists more than a single xGMI link between
// them. Otherwise, destination GPU is not a gang candidate.
if (linfo.info.link_type != HSA_AMD_LINK_INFO_TYPE_XGMI ||
(linfo.info.min_bandwidth == linfo.info.max_bandwidth && !has_fixed_gang)) {
continue;
if (src_id != dst_id) {
auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id);
// Ganging can only be done over xGMI and is either fixed or variable
// based on topology information:
// Weight of 13 - Intra-socket GPU link in multi-partition mode
// Weight of 15 - Direct GPU link in single partition mode
// Weight of 41 - Inter-socket GPU link in multi-partition mode
if (linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) {
if (linfo.info.numa_distance == 13 || linfo.info.numa_distance == 41)
gang_factor = 2;
else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth)
gang_factor = linfo.info.max_bandwidth/linfo.info.min_bandwidth;
else gang_factor = 1;
}
}
uint32_t gang_factor = has_fixed_gang ? 2 : (linfo.info.min_bandwidth ?
linfo.info.max_bandwidth/linfo.info.min_bandwidth : 0);
// Register all GPUs regardless of connection type to take advantage of easy
// key-value lookup later on.
((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, gang_factor);
}
}
+1
Visa fil
@@ -94,6 +94,7 @@ class Flag {
var = os::GetEnvVar("HSA_ENABLE_SDMA_GANG");
enable_sdma_gang_ = (var == "0") ? SDMA_DISABLE :
((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
if (enable_sdma_ == SDMA_DISABLE) enable_sdma_gang_ = SDMA_DISABLE;
var = os::GetEnvVar("HSA_ENABLE_SDMA_COPY_SIZE_OVERRIDE");
enable_sdma_copy_size_override_ = (var == "0") ? SDMA_DISABLE :