From 13cde8429d761c0042dba29f68f809b59bbbdebc Mon Sep 17 00:00:00 2001
From: Elena Sakhnovitch
Date: Tue, 26 Oct 2021 18:39:23 -0400
Subject: [PATCH] [ROCm-SMI] add --showNodesBw

Display min and max bandwidth between gpu nodes

Signed-off-by: Elena Sakhnovitch
Change-Id: I7289fb83f80e2f899996b7d7560ece670cc5f31f
---
 python_smi_tools/rocm_smi.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py
index 8ecdef5af4..c56338ec24 100755
--- a/python_smi_tools/rocm_smi.py
+++ b/python_smi_tools/rocm_smi.py
@@ -2485,6 +2485,50 @@ def showHwTopology(deviceList):
     showNumaTopology(deviceList)
 
 
+def showNodesBw(deviceList):
+    """ Display max and min bandwidth between nodes.
+    Currently supports XGMI only.
+    Calls rsmi_minmax_bandwidth_get for every ordered pair of distinct devices and
+    displays the results as a matrix. Cells without a result (a device paired with
+    itself, or a failed query) are shown as "N/A".
+
+    @param deviceList: List of DRM devices (can be a single-item list)
+    """
+    minBW = c_uint32()
+    maxBW = c_uint32()
+    # The matrix is indexed by device ID, not by position in deviceList, so it is sized
+    # by the largest ID to stay in bounds when deviceList is a subset (e.g. [1])
+    matrixSize = max(deviceList) + 1 if deviceList else 0
+    gpu_links_type = [['N/A' for x in range(matrixSize)] for y in range(matrixSize)]
+    printLogSpacer(' Bandwidth ')
+    for srcdevice in deviceList:
+        for destdevice in deviceList:
+            if srcdevice == destdevice:
+                continue
+            ret = rocmsmi.rsmi_minmax_bandwidth_get(srcdevice, destdevice, byref(minBW), byref(maxBW))
+            if rsmi_ret_ok(ret, " {} to {}".format(srcdevice, destdevice), None):
+                gpu_links_type[srcdevice][destdevice] = "{}-{}".format(minBW.value, maxBW.value)
+    if PRINT_JSON:
+        # Hand over the whole matrix and an unformatted title template so that every
+        # device pair is reported instead of only the last pair queried
+        formatMatrixToJSON(deviceList, gpu_links_type, " min-max bandwidth between DRM devices {} and {}")
+        return
+    # Pad the corner cell to the 6-character width of the row labels so columns line up
+    printTableRow('%-6s', ' ')
+    for row in deviceList:
+        tmp = 'GPU%d' % row
+        printTableRow('%-12s', tmp)
+    printEmptyLine()
+    for gpu1 in deviceList:
+        tmp = 'GPU%d' % gpu1
+        printTableRow('%-6s', tmp)
+        for gpu2 in deviceList:
+            printTableRow('%-12s', gpu_links_type[gpu1][gpu2])
+        printEmptyLine()
+    printLog(None, "Format: min-max; Units: mps", None)
+    printLog(None, '"0-0" min-max bandwidth indicates devices are not connected directly', None)
+
+
 def checkAmdGpus(deviceList):
     """ Check if there are any AMD GPUs being queried,
     return False if there are none
@@ -2828,6 +2872,7 @@ if __name__ == '__main__':
     groupDisplay.add_argument('--showtoponuma', help='Shows the numa nodes ', action='store_true')
     groupDisplay.add_argument('--showenergycounter', help='Energy accumulator that stores amount of energy consumed',
                               action='store_true')
+    groupDisplay.add_argument('--shownodesbw', help='Shows min and max bandwidth between nodes', action='store_true')
 
     groupActionReset.add_argument('-r', '--resetclocks', help='Reset clocks and OverDrive to default',
                                   action='store_true')
@@ -3065,6 +3110,8 @@ if __name__ == '__main__':
         showProductName(deviceList)
     if args.showxgmierr:
         showXgmiErr(deviceList)
+    if args.shownodesbw:
+        showNodesBw(deviceList)
     if args.showtopo:
         showHwTopology(deviceList)
     if args.showtopoaccess: