From 1815fc808d35cfde8e58f37d738f41c7c35ad00a Mon Sep 17 00:00:00 2001 From: Satyanvesh Dittakavi Date: Thu, 6 Jun 2024 08:34:09 +0000 Subject: [PATCH] SWDEV-464927 - Update the Get by PCI BusId logic and Hop count - Update the intra socket weight for partitions within single socket as it is changed to 13 by the driver. - Use the PCIe function to distinguish the partitions of the same device such as TPX mode in gfx942. Change-Id: I8e64023d44e37c2dbb105cbb343441a48021ba7b --- hipamd/src/hip_device_runtime.cpp | 10 +++++++--- rocclr/device/rocm/rocdevice.cpp | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/hipamd/src/hip_device_runtime.cpp b/hipamd/src/hip_device_runtime.cpp index a735d2039a..c453820f66 100644 --- a/hipamd/src/hip_device_runtime.cpp +++ b/hipamd/src/hip_device_runtime.cpp @@ -462,10 +462,12 @@ hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusIdstr) { int pciBusID = -1; int pciDeviceID = -1; int pciDomainID = -1; + int pciFunction = -1; bool found = false; - if (sscanf(pciBusIdstr, "%04x:%02x:%02x", reinterpret_cast<unsigned int*>(&pciDomainID), + if (sscanf(pciBusIdstr, "%04x:%02x:%02x.%01x", reinterpret_cast<unsigned int*>(&pciDomainID), reinterpret_cast<unsigned int*>(&pciBusID), - reinterpret_cast<unsigned int*>(&pciDeviceID)) == 0x3) { + reinterpret_cast<unsigned int*>(&pciDeviceID), + reinterpret_cast<unsigned int*>(&pciFunction)) == 0x4) { int count = 0; HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count)); for (cl_int i = 0; i < count; i++) { @@ -473,9 +475,11 @@ hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusIdstr) { hipDeviceProp_tR0600 prop; HIP_RETURN_ONFAIL(ihipDeviceGet(&dev, i)); HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, dev)); + auto* deviceHandle = g_devices[dev]->devices()[0]; if ((pciBusID == prop.pciBusID) && (pciDomainID == prop.pciDomainID) && - (pciDeviceID == prop.pciDeviceID)) { + (pciDeviceID == prop.pciDeviceID) && + (pciFunction == deviceHandle->info().deviceTopology_.pcie.function)) { *device = i; found = true; break; diff --git 
a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 05e7ff169d..57438e6f1f 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -3356,7 +3356,7 @@ bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool, distance += link_info[hop_idx].numa_distance; } uint32_t oneHopDistance - = (link_info[0].link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) ? 15 : 20; + = (link_info[0].link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) ? 13 : 20; link_attr.second = static_cast<int32_t>(distance/oneHopDistance); break; }