From 488bbb668ac424683c60af586adfda30b7a5a4b6 Mon Sep 17 00:00:00 2001 From: Mike Li Date: Thu, 18 Jun 2020 08:43:29 -0700 Subject: [PATCH] Add support to retrieve XGMI hive id Change-Id: I1eee05dd85ecb856889d1cfe0565454d2f538856 Signed-off-by: Mike Li --- include/rocm_smi/rocm_smi.h | 21 ++++++++++++++++++ include/rocm_smi/rocm_smi_kfd.h | 2 ++ src/rocm_smi.cc | 16 ++++++++++++++ src/rocm_smi_kfd.cc | 8 ++++++- .../functional/xgmi_read_write.cc | 22 +++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 96fcc1c383..5373e8f830 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -2785,6 +2785,27 @@ rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status); rsmi_status_t rsmi_dev_xgmi_error_reset(uint32_t dv_ind); +/** + * @brief Retrieve the XGMI hive id for a device + * + * @details Given a device index @p dv_ind, and a pointer to an + * uint64_t @p hive_id, this function will write the current XGMI + * hive id for the device @p dv_ind to the memory pointed to by @p hive_id. 
+ * + * @param[in] dv_ind a device index + * + * @param[inout] hive_id A pointer to an uint64_t to which the XGMI hive id + * should be written + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t +rsmi_dev_xgmi_hive_id_get(uint32_t dv_ind, uint64_t *hive_id); + /** @} */ // end of SysInfo /*****************************************************************************/ diff --git a/include/rocm_smi/rocm_smi_kfd.h b/include/rocm_smi/rocm_smi_kfd.h index d355938ec8..4ad44a001f 100755 --- a/include/rocm_smi/rocm_smi_kfd.h +++ b/include/rocm_smi/rocm_smi_kfd.h @@ -69,6 +69,7 @@ class KFDNode { uint32_t node_index(void) const {return node_indx_;} uint32_t numa_node_number(void) const {return numa_node_number_;} uint64_t numa_node_weight(void) const {return numa_node_weight_;} + uint64_t xgmi_hive_id(void) const {return xgmi_hive_id_;} IO_LINK_TYPE numa_node_type(void) const {return numa_node_type_;} int get_io_link_type(uint32_t node_to, IO_LINK_TYPE *type); int get_io_link_weight(uint32_t node_to, uint64_t *weight); @@ -84,6 +85,7 @@ class KFDNode { uint32_t numa_node_number_; uint64_t numa_node_weight_; IO_LINK_TYPE numa_node_type_; + uint64_t xgmi_hive_id_; std::map io_link_type_; std::map io_link_weight_; std::map> io_link_map_; diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index c87cc772a5..f632ddacd0 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -3064,6 +3064,22 @@ rsmi_dev_xgmi_error_reset(uint32_t dv_ind) { CATCH } +rsmi_status_t +rsmi_dev_xgmi_hive_id_get(uint32_t dv_ind, uint64_t *hive_id) { + TRY + /* A null output pointer is rejected before any device lookup. */ + if (hive_id == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + /* NOTE(review): macro presumably declares kfd_node -- confirm. */ + GET_DEV_AND_KFDNODE_FROM_INDX + /* Hand the node's xgmi_hive_id() value back to the caller. */ + *hive_id = kfd_node->xgmi_hive_id(); + + return RSMI_STATUS_SUCCESS; + CATCH +} + rsmi_status_t 
rsmi_topo_get_numa_node_number(uint32_t dv_ind, uint32_t *numa_node) { TRY diff --git a/src/rocm_smi_kfd.cc b/src/rocm_smi_kfd.cc index b3c563a83b..cf77bfcedb 100755 --- a/src/rocm_smi_kfd.cc +++ b/src/rocm_smi_kfd.cc @@ -95,7 +95,7 @@ static const char *kKFDPasidFName = "pasid"; static const char *kKFDNodePropLOCATION_IDStr = "location_id"; static const char *kKFDNodePropDOMAINStr = "domain"; // static const char *kKFDNodePropDRM_RENDER_MINORStr = "drm_render_minor"; -// static const char *kKFDNodePropHIVE_IDStr = "hive_id"; +static const char *kKFDNodePropHIVE_IDStr = "hive_id"; // static const char *kKFDNodePropNUM_SDMA_ENGINESStr = "num_sdma_engines"; // static const char *kKFDNodePropNUM_SDMA_XGMI_ENGINESStr = // "num_sdma_xgmi_engines"; @@ -560,6 +560,12 @@ KFDNode::Initialize(void) { ret = ReadKFDGpuName(node_indx_, &name_); + ret = get_property_value(kKFDNodePropHIVE_IDStr, &xgmi_hive_id_); + if (ret != 0) { + throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, + "Failed to initialize rocm_smi library (get xgmi hive id)."); + } + std::map> io_link_map_tmp; ret = DiscoverIOLinksPerNode(node_indx_, &io_link_map_tmp); if (ret != 0) { diff --git a/tests/rocm_smi_test/functional/xgmi_read_write.cc b/tests/rocm_smi_test/functional/xgmi_read_write.cc index 5d53a1b891..c85de42cf4 100755 --- a/tests/rocm_smi_test/functional/xgmi_read_write.cc +++ b/tests/rocm_smi_test/functional/xgmi_read_write.cc @@ -87,6 +87,7 @@ void TestXGMIReadWrite::Close() { void TestXGMIReadWrite::Run(void) { rsmi_status_t err; rsmi_xgmi_status_t err_stat; + uint64_t hive_id; TestBase::Run(); if (setup_failed_) { @@ -99,6 +100,27 @@ void TestXGMIReadWrite::Run(void) { for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { PrintDeviceHeader(dv_ind); + err = rsmi_dev_xgmi_hive_id_get(dv_ind, &hive_id); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << + "\t**rsmi_dev_xgmi_hive_id_get() is not supported" + " on this machine" << std::endl; + // Verify api support 
checking functionality is working + err = rsmi_dev_xgmi_hive_id_get(dv_ind, nullptr); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + + continue; + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**XGMI Hive ID : " << std::hex << hive_id << + std::dec << std::endl; /* undo sticky std::hex */ + } + // Verify api support checking functionality is working + err = rsmi_dev_xgmi_hive_id_get(dv_ind, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_xgmi_error_status(dv_ind, &err_stat); if (err == RSMI_STATUS_NOT_SUPPORTED) {