diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 87df0a929b..a85f0dc950 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -3361,6 +3361,32 @@ rsmi_status_t rsmi_topo_get_link_weight(uint32_t dv_ind_src, uint32_t dv_ind_dst, uint64_t *weight); +/** + * @brief Retrieve minimal and maximal io link bandwidth between 2 GPUs + * + * @details Given a source device index @p dv_ind_src and + * a destination device index @p dv_ind_dst, a pointer to an + * uint64_t @p min_bandwidth, and a pointer to uint64_t @p max_bandwidth, + * this function will write theoretical minimal and maximal bandwidth limits. + * API works if src and dst are connected via xgmi and have 1 hop distance. + * + * @param[in] dv_ind_src the source device index + * + * @param[in] dv_ind_dst the destination device index + * + * @param[inout] min_bandwidth A pointer to an uint64_t to which the + * minimal bandwidth for the connection should be written. + * + * @param[inout] max_bandwidth A pointer to an uint64_t to which the + * maximal bandwidth for the connection should be written. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + */ +rsmi_status_t +rsmi_minmax_bandwidth_get(uint32_t dv_ind_src, uint32_t dv_ind_dst, + uint64_t *min_bandwidth, uint64_t *max_bandwidth); + /** * @brief Retrieve the hops and the connection type between 2 GPUs * diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_io_link.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_io_link.h index 6ba58d09bd..5903ab99df 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_io_link.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_io_link.h @@ -98,6 +98,9 @@ class IOLink { uint32_t node_to(void) const {return node_to_;} uint64_t weight(void) const {return weight_;} LINK_DIRECTORY_TYPE get_directory_type(void) const {return link_dir_type_;} + uint64_t min_bandwidth(void) const {return min_bandwidth_;} + uint64_t max_bandwidth(void) const {return max_bandwidth_;} + private: uint32_t node_indx_; @@ -106,6 +109,8 @@ class IOLink { uint32_t node_from_; uint32_t node_to_; uint64_t weight_; + uint64_t min_bandwidth_; + uint64_t max_bandwidth_; std::map properties_; LINK_DIRECTORY_TYPE link_dir_type_; }; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h index 2142e4bd16..a0c8f5fe2d 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h @@ -74,6 +74,8 @@ class KFDNode { IO_LINK_TYPE numa_node_type(void) const {return numa_node_type_;} int get_io_link_type(uint32_t node_to, IO_LINK_TYPE *type); int get_io_link_weight(uint32_t node_to, uint64_t *weight); + int get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth, + uint64_t *min_bandwidth); std::shared_ptr amdgpu_device(void) const {return amdgpu_device_;} uint32_t amdgpu_dev_index(void) const {return amdgpu_dev_index_;} void set_amdgpu_dev_index(uint32_t val) 
{amdgpu_dev_index_ = val;} @@ -90,6 +92,8 @@ class KFDNode { uint32_t cu_count_; std::map io_link_type_; std::map io_link_weight_; + std::map io_link_max_bandwidth_; + std::map io_link_min_bandwidth_; std::map> io_link_map_; std::map properties_; std::shared_ptr amdgpu_device_; diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index e806992028..0cd1526944 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -3453,6 +3453,49 @@ rsmi_topo_get_link_weight(uint32_t dv_ind_src, uint32_t dv_ind_dst, CATCH } + rsmi_status_t + rsmi_minmax_bandwidth_get(uint32_t dv_ind_src, uint32_t dv_ind_dst, + uint64_t *min_bandwidth, uint64_t *max_bandwidth){ + TRY + + uint32_t dv_ind = dv_ind_src; + GET_DEV_AND_KFDNODE_FROM_INDX + DEVICE_MUTEX + + if (min_bandwidth == nullptr || max_bandwidth == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + if (dv_ind_src == dv_ind_dst) { + return RSMI_STATUS_INVALID_ARGS; + } + + rsmi_status_t status; + uint32_t node_ind_dst; + int ret = smi.get_node_index(dv_ind_dst, &node_ind_dst); + + if (ret != 0) { + return RSMI_STATUS_INVALID_ARGS; + } + + + amd::smi::IO_LINK_TYPE type; + ret = kfd_node->get_io_link_type(node_ind_dst, &type); + if ( ret == 0 && type == amd::smi::IOLINK_TYPE_XGMI) { + ret = kfd_node->get_io_link_bandwidth(node_ind_dst,max_bandwidth, + min_bandwidth); + if (ret == 0) + status = RSMI_STATUS_SUCCESS; + else + status = RSMI_STATUS_INIT_ERROR; + } else { // from src GPU to its CPU node, or type not XGMI + status = RSMI_STATUS_NOT_SUPPORTED; + } + + return status; + CATCH +} + rsmi_status_t rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst, uint64_t *hops, RSMI_IO_LINK_TYPE *type) { diff --git a/projects/rocm-smi-lib/src/rocm_smi_io_link.cc b/projects/rocm-smi-lib/src/rocm_smi_io_link.cc index fc28e380be..888f13fffa 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_io_link.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_io_link.cc @@ 
-73,8 +73,8 @@ static const char *kIOLinkPropNODE_TOStr = "node_to"; static const char *kIOLinkPropWEIGHTStr = "weight"; // static const char *kIOLinkPropMIN_LATENCYStr = "min_latency"; // static const char *kIOLinkPropMAX_LATENCYStr = "max_latency"; -// static const char *kIOLinkPropMIN_BANDWIDTHStr = "min_bandwidth"; -// static const char *kIOLinkPropMAX_BANDWIDTHStr = "max_bandwidth"; +static const char *kIOLinkPropMIN_BANDWIDTHStr = "min_bandwidth"; +static const char *kIOLinkPropMAX_BANDWIDTHStr = "max_bandwidth"; // static const char *kIOLinkPropRECOMMENDED_TRANSFER_SIZEStr = // "recommended_transfer_size"; // static const char *kIOLinkPropFLAGSStr = "flags"; @@ -380,6 +380,12 @@ IOLink::Initialize(void) { if (ret) {return ret;} ret = get_property_value(kIOLinkPropWEIGHTStr, &weight_); + if (ret) {return ret;} + + ret = get_property_value(kIOLinkPropMIN_BANDWIDTHStr, &min_bandwidth_); + if (ret) {return ret;} + + ret = get_property_value(kIOLinkPropMAX_BANDWIDTHStr, &max_bandwidth_); return ret; } diff --git a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc index 72367088fe..bf3227f81b 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc @@ -679,6 +679,9 @@ KFDNode::Initialize(void) { } else { io_link_type_[node_to] = link->type(); io_link_weight_[node_to] = link->weight(); + io_link_max_bandwidth_[node_to] = link->max_bandwidth(); + io_link_min_bandwidth_[node_to] = link->min_bandwidth(); + } } @@ -746,5 +749,24 @@ KFDNode::get_io_link_weight(uint32_t node_to, uint64_t *weight) { return 0; } +int +KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth, + uint64_t *min_bandwidth){ + assert (max_bandwidth != nullptr && min_bandwidth != nullptr); + if (max_bandwidth == nullptr || min_bandwidth == nullptr ){ + return EINVAL; + } + + if (io_link_max_bandwidth_.find(node_to) == io_link_max_bandwidth_.end() || + io_link_min_bandwidth_.find(node_to) == 
io_link_min_bandwidth_.end()){ + return EINVAL; + } + + *max_bandwidth = io_link_max_bandwidth_[node_to]; + *min_bandwidth = io_link_min_bandwidth_[node_to]; + + return 0; +} + } // namespace smi } // namespace amd