diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/include/rocm_smi/rocm_smi.h index f6b20536ae..7319d13df7 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi.h @@ -744,6 +744,24 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid); rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent, uint64_t *received, uint64_t *max_pkt_sz); +/** + * @brief Get PCIe replay counter + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t @p + * counter, this function will write the sum of the number of NAKs received + * by the GPU and the number of NAKs generated by the GPU to memory pointed to by @p + * counter. + * + * @param[in] dv_ind a device index + * + * @param[inout] counter a pointer to uint64_t to which the sum of the NAKs + * received and generated by the GPU is written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon a successful call. + */ +rsmi_status_t rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, + uint64_t *counter); + /** @} */ // end of PCIeQuer /*****************************************************************************/ /** @defgroup PCIeCont PCIe Control diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h index 7197814209..c7fcad7dc1 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h @@ -56,7 +56,7 @@ #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" extern "C" { -#include "shared_mutex.h" +#include "shared_mutex.h" // NOLINT }; namespace amd { @@ -90,6 +90,7 @@ enum DevInfoTypes { kDevMemUsedGTT, kDevMemUsedVisVRAM, kDevMemUsedVRAM, + kDevPCIEReplayCount, }; class Device { @@ -116,6 +117,7 @@ class Device { void set_bdfid(uint64_t val) {bdfid_ = val;} uint64_t get_bdfid(void) const {return bdfid_;} pthread_mutex_t *mutex(void) {return mutex_.ptr;} + private: 
std::shared_ptr monitor_; std::shared_ptr power_monitor_; diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h index 8949eaac7a..4612fadce4 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h @@ -66,23 +66,23 @@ int ReadSysfsStr(std::string path, std::string *retStr); int WriteSysfsStr(std::string path, std::string val); struct pthread_wrap { - public: - pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {} + public: + explicit pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {} void Acquire() { pthread_mutex_lock(&mutex_); } void Release() { pthread_mutex_unlock(&mutex_); } - private: + private: pthread_mutex_t& mutex_; }; struct ScopedPthread { - ScopedPthread(pthread_wrap& mutex) : pthrd_ref_(mutex) { + explicit ScopedPthread(pthread_wrap& mutex) : pthrd_ref_(mutex) { pthrd_ref_.Acquire(); - }; + } ~ScopedPthread() { pthrd_ref_.Release(); } - private: + private: ScopedPthread(const ScopedPthread&); pthread_wrap& pthrd_ref_; diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 3a8cec8e31..3200197cfa 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -120,8 +120,8 @@ static rsmi_status_t errno_to_rsmi_status(uint32_t err) { switch (err) { case 0: return RSMI_STATUS_SUCCESS; case EACCES: return RSMI_STATUS_PERMISSION; - case EPERM: return RSMI_STATUS_NOT_SUPPORTED; - case ENOENT: + case EPERM: + case ENOENT: return RSMI_STATUS_NOT_SUPPORTED; case EISDIR: return RSMI_STATUS_FILE_ERROR; default: return RSMI_STATUS_UNKNOWN_ERROR; } @@ -2044,3 +2044,16 @@ rsmi_version_str_get(rsmi_sw_component_t component, char *ver_str, CATCH } + +rsmi_status_t +rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) { + TRY + + DEVICE_MUTEX + rsmi_status_t ret; + + ret = get_dev_value_int(amd::smi::kDevPCIEReplayCount, dv_ind, counter); + return ret; + + CATCH +} diff --git 
a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index f50fc319ee..fbbe6fdc97 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -96,6 +96,7 @@ static const char *kDevMemTotVRAMFName = "mem_info_vram_total"; static const char *kDevMemUsedGTTFName = "mem_info_gtt_used"; static const char *kDevMemUsedVisVRAMFName = "mem_info_vis_vram_used"; static const char *kDevMemUsedVRAMFName = "mem_info_vram_used"; +static const char *kDevPCIEReplayCountFName = "pcie_replay_count"; // Strings that are found within sysfs files static const char *kDevPerfLevelAutoStr = "auto"; @@ -136,6 +137,7 @@ static const std::map kDevAttribNameMap = { {kDevMemUsedGTT, kDevMemUsedGTTFName}, {kDevMemUsedVisVRAM, kDevMemUsedVisVRAMFName}, {kDevMemUsedVRAM, kDevMemUsedVRAMFName}, + {kDevPCIEReplayCount, kDevPCIEReplayCountFName}, }; static const std::map kDevPerfLvlMap = { @@ -202,7 +204,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { DBG_FILE_ERROR(sysfs_path, str); if (!isRegularFile(sysfs_path)) { - return EISDIR; + return ENOENT; } fs->open(sysfs_path); @@ -367,6 +369,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevMemUsedGTT: case kDevMemUsedVisVRAM: case kDevMemUsedVRAM: + case kDevPCIEReplayCount: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); *val = std::stoul(tempStr, 0); diff --git a/projects/amdsmi/src/shared_mutex/shared_mutex.c b/projects/amdsmi/src/shared_mutex/shared_mutex.c index 33c4d38729..af25e1ba90 100755 --- a/projects/amdsmi/src/shared_mutex/shared_mutex.c +++ b/projects/amdsmi/src/shared_mutex/shared_mutex.c @@ -8,6 +8,7 @@ #include // perror #include // malloc, free #include // strcpy +#include // clock_gettime shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { shared_mutex_t mutex = {NULL, 0, NULL, 0}; @@ -51,7 +52,17 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { return mutex; } - if 
(mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL) { + pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr; + + // Make sure the mutex wasn't left in a locked state. If we can't + // acquire it in 3 sec., re-do everything. + struct timespec expireTime; + clock_gettime(CLOCK_REALTIME, &expireTime); + expireTime.tv_sec += 3; + + int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime); + + if (ret || (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL)) { // Something is out of sync. Unlink shm and start over. if (shm_unlink(name)) { mutex.shm_fd = 0; @@ -60,9 +71,12 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { free(mutex.name); return shared_mutex_init(name, mode); + } else { + if (pthread_mutex_unlock(mutex_ptr)) { + perror("pthread_mutex_unlock"); + } } - pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr; if (mutex.created) { pthread_mutexattr_t attr; diff --git a/projects/amdsmi/src/shared_mutex/shared_mutex.h b/projects/amdsmi/src/shared_mutex/shared_mutex.h index 18e70bd6de..77aff5509d 100755 --- a/projects/amdsmi/src/shared_mutex/shared_mutex.h +++ b/projects/amdsmi/src/shared_mutex/shared_mutex.h @@ -1,3 +1,6 @@ +// NOLINT(legal/copyright) +// See LICENSE file + #ifndef SRC_SHARED_MUTEX_SHARED_MUTEX_H_ #define SRC_SHARED_MUTEX_SHARED_MUTEX_H_ diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc index 8f6a9ac26a..03c5b70df5 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc @@ -96,7 +96,7 @@ void TestErrCntRead::Run(void) { err = rsmi_dev_ecc_enabled_get(i, &enabled_mask); if (err == RSMI_STATUS_NOT_SUPPORTED) { std::cout << - "\t**Error Count Enabled Mask for is not supported on this machine" + "\t**Error Count Enabled Mask get is not supported on this machine" << std::endl; } else { CHK_ERR_ASRT(err) diff --git 
a/projects/amdsmi/tests/rocm_smi_test/functional/pci_read_write.cc b/projects/amdsmi/tests/rocm_smi_test/functional/pci_read_write.cc index 6e63e495a4..07fb96890a 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/pci_read_write.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/pci_read_write.cc @@ -89,13 +89,26 @@ void TestPciReadWrite::Run(void) { rsmi_status_t ret; rsmi_pcie_bandwidth_t bw; uint32_t freq_bitmask; - uint64_t sent, received, max_pkt_sz; + uint64_t sent, received, max_pkt_sz, u64int; TestBase::Run(); for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { PrintDeviceHeader(dv_ind); + ret = rsmi_dev_pci_replay_counter_get(dv_ind, &u64int); + + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << + "\t**rsmi_dev_pci_replay_counter_get() is not supported" + " on this machine" << std::endl; + } else { + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\tPCIe Replay Counter: " << u64int << std::endl; + } + } + ret = rsmi_dev_pci_throughput_get(dv_ind, &sent, &received, &max_pkt_sz); if (ret == RSMI_STATUS_NOT_SUPPORTED) { std::cout << "TEST FAILURE: Current PCIe throughput is not detected. 
" @@ -106,7 +119,7 @@ void TestPciReadWrite::Run(void) { CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { - std::cout << "PCIe Throughput (1 sec.): " << std::endl; + std::cout << "\tPCIe Throughput (1 sec.): " << std::endl; std::cout << "\t\tSent: " << sent << " bytes" << std::endl; std::cout << "\t\tReceived: " << received << " bytes" << std::endl; std::cout << "\t\tMax Packet Size: " << max_pkt_sz << " bytes" << @@ -125,7 +138,8 @@ void TestPciReadWrite::Run(void) { CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { - std::cout << "Initial PCIe is " << bw.transfer_rate.current << std::endl; + std::cout << "\tInitial PCIe is " << bw.transfer_rate.current << + std::endl; } // First set the bitmask to all supported bandwidths @@ -141,7 +155,7 @@ void TestPciReadWrite::Run(void) { freq_bm_str.size()-1)); IF_VERB(STANDARD) { - std::cout << "Setting bandwidth mask to " << "0b" << freq_bm_str << + std::cout << "\tSetting bandwidth mask to " << "0b" << freq_bm_str << " ..." << std::endl; } ret = rsmi_dev_pci_bandwidth_set(dv_ind, freq_bitmask); @@ -151,9 +165,9 @@ void TestPciReadWrite::Run(void) { CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { - std::cout << "Bandwidth is now index " << bw.transfer_rate.current << + std::cout << "\tBandwidth is now index " << bw.transfer_rate.current << std::endl; - std::cout << "Resetting mask to all bandwidths." << std::endl; + std::cout << "\tResetting mask to all bandwidths." 
<< std::endl; } ret = rsmi_dev_pci_bandwidth_set(dv_ind, 0xFFFFFFFF); CHK_ERR_ASRT(ret) diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/temp_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/temp_read.cc index 9f04a0deeb..e68185e689 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/temp_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/temp_read.cc @@ -99,7 +99,7 @@ void TestTempRead::Run(void) { err = rsmi_dev_temp_metric_get(i, 0, met, &val_i64); if (err != RSMI_STATUS_SUCCESS) { - if (err == RSMI_STATUS_FILE_ERROR) { + if (err == RSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { std::cout << "\t**" << label << ": " << "Not supported on this machine" << std::endl; diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/volt_freq_curv_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/volt_freq_curv_read.cc index 4f99880c2d..60b8080a1a 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/volt_freq_curv_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/volt_freq_curv_read.cc @@ -155,7 +155,7 @@ void TestVoltCurvRead::Run(void) { err = rsmi_dev_od_volt_info_get(i, &odv); if (err == RSMI_STATUS_FILE_ERROR || - err == RSMI_STATUS_NOT_YET_IMPLEMENTED) { + err == RSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { std::cout << "\t**rsmi_dev_od_volt_info_get: Not supported on this machine"