Added rsmi_dev_pci_replay_counter_get()
Also added code to destroy and recreate the shared-memory mutex if it cannot
be locked within 3 seconds while the mutex is being initialized.
[ROCm/amdsmi commit: 34c977bd06]
This commit is contained in:
@@ -744,6 +744,24 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid);
|
||||
rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
|
||||
uint64_t *received, uint64_t *max_pkt_sz);
|
||||
|
||||
/**
|
||||
* @brief Get PCIe replay counter
|
||||
*
|
||||
* @details Given a device index @p dv_ind and a pointer to a uint64_t @p
|
||||
* counter, this function will write the sum of the number of NAKs received
|
||||
* by the GPU and the NAKs generated by the GPU to memory pointed to by @p
|
||||
* counter.
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[inout] counter a pointer to uint64_t to which the sum of the NAKs
|
||||
* received and generated by the GPU is written
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_pci_replay_counter_get(uint32_t dv_ind,
|
||||
uint64_t *counter);
|
||||
|
||||
/** @} */ // end of PCIeQuer
|
||||
/*****************************************************************************/
|
||||
/** @defgroup PCIeCont PCIe Control
|
||||
|
||||
@@ -56,7 +56,7 @@
|
||||
#include "rocm_smi/rocm_smi_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
extern "C" {
|
||||
#include "shared_mutex.h"
|
||||
#include "shared_mutex.h" // NOLINT
|
||||
};
|
||||
|
||||
namespace amd {
|
||||
@@ -90,6 +90,7 @@ enum DevInfoTypes {
|
||||
kDevMemUsedGTT,
|
||||
kDevMemUsedVisVRAM,
|
||||
kDevMemUsedVRAM,
|
||||
kDevPCIEReplayCount,
|
||||
};
|
||||
|
||||
class Device {
|
||||
@@ -116,6 +117,7 @@ class Device {
|
||||
void set_bdfid(uint64_t val) {bdfid_ = val;}
|
||||
uint64_t get_bdfid(void) const {return bdfid_;}
|
||||
pthread_mutex_t *mutex(void) {return mutex_.ptr;}
|
||||
|
||||
private:
|
||||
std::shared_ptr<Monitor> monitor_;
|
||||
std::shared_ptr<PowerMon> power_monitor_;
|
||||
|
||||
@@ -66,23 +66,23 @@ int ReadSysfsStr(std::string path, std::string *retStr);
|
||||
int WriteSysfsStr(std::string path, std::string val);
|
||||
|
||||
struct pthread_wrap {
|
||||
public:
|
||||
pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {}
|
||||
public:
|
||||
explicit pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {}
|
||||
|
||||
void Acquire() { pthread_mutex_lock(&mutex_); }
|
||||
void Release() { pthread_mutex_unlock(&mutex_); }
|
||||
private:
|
||||
private:
|
||||
pthread_mutex_t& mutex_;
|
||||
};
|
||||
struct ScopedPthread {
|
||||
ScopedPthread(pthread_wrap& mutex) : pthrd_ref_(mutex) {
|
||||
explicit ScopedPthread(pthread_wrap& mutex) : pthrd_ref_(mutex) {
|
||||
pthrd_ref_.Acquire();
|
||||
};
|
||||
}
|
||||
|
||||
~ScopedPthread() {
|
||||
pthrd_ref_.Release();
|
||||
}
|
||||
private:
|
||||
private:
|
||||
ScopedPthread(const ScopedPthread&);
|
||||
|
||||
pthread_wrap& pthrd_ref_;
|
||||
|
||||
@@ -120,8 +120,8 @@ static rsmi_status_t errno_to_rsmi_status(uint32_t err) {
|
||||
switch (err) {
|
||||
case 0: return RSMI_STATUS_SUCCESS;
|
||||
case EACCES: return RSMI_STATUS_PERMISSION;
|
||||
case EPERM: return RSMI_STATUS_NOT_SUPPORTED;
|
||||
case ENOENT:
|
||||
case EPERM:
|
||||
case ENOENT: return RSMI_STATUS_NOT_SUPPORTED;
|
||||
case EISDIR: return RSMI_STATUS_FILE_ERROR;
|
||||
default: return RSMI_STATUS_UNKNOWN_ERROR;
|
||||
}
|
||||
@@ -2044,3 +2044,16 @@ rsmi_version_str_get(rsmi_sw_component_t component, char *ver_str,
|
||||
|
||||
CATCH
|
||||
}
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) {
|
||||
TRY
|
||||
|
||||
DEVICE_MUTEX
|
||||
rsmi_status_t ret;
|
||||
|
||||
ret = get_dev_value_int(amd::smi::kDevPCIEReplayCount, dv_ind, counter);
|
||||
return ret;
|
||||
|
||||
CATCH
|
||||
}
|
||||
|
||||
@@ -96,6 +96,7 @@ static const char *kDevMemTotVRAMFName = "mem_info_vram_total";
|
||||
static const char *kDevMemUsedGTTFName = "mem_info_gtt_used";
|
||||
static const char *kDevMemUsedVisVRAMFName = "mem_info_vis_vram_used";
|
||||
static const char *kDevMemUsedVRAMFName = "mem_info_vram_used";
|
||||
static const char *kDevPCIEReplayCountFName = "pcie_replay_count";
|
||||
|
||||
// Strings that are found within sysfs files
|
||||
static const char *kDevPerfLevelAutoStr = "auto";
|
||||
@@ -136,6 +137,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
|
||||
{kDevMemUsedGTT, kDevMemUsedGTTFName},
|
||||
{kDevMemUsedVisVRAM, kDevMemUsedVisVRAMFName},
|
||||
{kDevMemUsedVRAM, kDevMemUsedVRAMFName},
|
||||
{kDevPCIEReplayCount, kDevPCIEReplayCountFName},
|
||||
};
|
||||
|
||||
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
|
||||
@@ -202,7 +204,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
|
||||
|
||||
DBG_FILE_ERROR(sysfs_path, str);
|
||||
if (!isRegularFile(sysfs_path)) {
|
||||
return EISDIR;
|
||||
return ENOENT;
|
||||
}
|
||||
|
||||
fs->open(sysfs_path);
|
||||
@@ -367,6 +369,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
case kDevMemUsedGTT:
|
||||
case kDevMemUsedVisVRAM:
|
||||
case kDevMemUsedVRAM:
|
||||
case kDevPCIEReplayCount:
|
||||
ret = readDevInfoStr(type, &tempStr);
|
||||
RET_IF_NONZERO(ret);
|
||||
*val = std::stoul(tempStr, 0);
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <stdio.h> // perror
|
||||
#include <stdlib.h> // malloc, free
|
||||
#include <string.h> // strcpy
|
||||
#include <time.h> // clock_gettime
|
||||
|
||||
shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
shared_mutex_t mutex = {NULL, 0, NULL, 0};
|
||||
@@ -51,7 +52,17 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
return mutex;
|
||||
}
|
||||
|
||||
if (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL) {
|
||||
pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr;
|
||||
|
||||
// Make sure the mutex wasn't left in a locked state. If we can't
|
||||
// acquire it in 3 sec., re-do everything.
|
||||
struct timespec expireTime;
|
||||
clock_gettime(CLOCK_REALTIME, &expireTime);
|
||||
expireTime.tv_sec += 3;
|
||||
|
||||
int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);
|
||||
|
||||
if (ret || (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL)) {
|
||||
// Something is out of sync. Unlink shm and start over.
|
||||
if (shm_unlink(name)) {
|
||||
mutex.shm_fd = 0;
|
||||
@@ -60,9 +71,12 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
free(mutex.name);
|
||||
|
||||
return shared_mutex_init(name, mode);
|
||||
} else {
|
||||
if (pthread_mutex_unlock(mutex_ptr)) {
|
||||
perror("pthread_mutex_unlock");
|
||||
}
|
||||
}
|
||||
|
||||
pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr;
|
||||
|
||||
if (mutex.created) {
|
||||
pthread_mutexattr_t attr;
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
// NOLINT(legal/copyright)
|
||||
// See LICENSE file
|
||||
|
||||
#ifndef SRC_SHARED_MUTEX_SHARED_MUTEX_H_
|
||||
#define SRC_SHARED_MUTEX_SHARED_MUTEX_H_
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ void TestErrCntRead::Run(void) {
|
||||
err = rsmi_dev_ecc_enabled_get(i, &enabled_mask);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout <<
|
||||
"\t**Error Count Enabled Mask for is not supported on this machine"
|
||||
"\t**Error Count Enabled Mask get is not supported on this machine"
|
||||
<< std::endl;
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
|
||||
@@ -89,13 +89,26 @@ void TestPciReadWrite::Run(void) {
|
||||
rsmi_status_t ret;
|
||||
rsmi_pcie_bandwidth_t bw;
|
||||
uint32_t freq_bitmask;
|
||||
uint64_t sent, received, max_pkt_sz;
|
||||
uint64_t sent, received, max_pkt_sz, u64int;
|
||||
|
||||
TestBase::Run();
|
||||
|
||||
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
|
||||
PrintDeviceHeader(dv_ind);
|
||||
|
||||
ret = rsmi_dev_pci_replay_counter_get(dv_ind, &u64int);
|
||||
|
||||
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout <<
|
||||
"\t**rsmi_dev_pci_replay_counter_get() is not supported"
|
||||
" on this machine" << std::endl;
|
||||
} else {
|
||||
CHK_ERR_ASRT(ret)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\tPCIe Replay Counter: " << u64int << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
ret = rsmi_dev_pci_throughput_get(dv_ind, &sent, &received, &max_pkt_sz);
|
||||
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "TEST FAILURE: Current PCIe throughput is not detected. "
|
||||
@@ -106,7 +119,7 @@ void TestPciReadWrite::Run(void) {
|
||||
CHK_ERR_ASRT(ret)
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "PCIe Throughput (1 sec.): " << std::endl;
|
||||
std::cout << "\tPCIe Throughput (1 sec.): " << std::endl;
|
||||
std::cout << "\t\tSent: " << sent << " bytes" << std::endl;
|
||||
std::cout << "\t\tReceived: " << received << " bytes" << std::endl;
|
||||
std::cout << "\t\tMax Packet Size: " << max_pkt_sz << " bytes" <<
|
||||
@@ -125,7 +138,8 @@ void TestPciReadWrite::Run(void) {
|
||||
CHK_ERR_ASRT(ret)
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "Initial PCIe is " << bw.transfer_rate.current << std::endl;
|
||||
std::cout << "\tInitial PCIe is " << bw.transfer_rate.current <<
|
||||
std::endl;
|
||||
}
|
||||
|
||||
// First set the bitmask to all supported bandwidths
|
||||
@@ -141,7 +155,7 @@ void TestPciReadWrite::Run(void) {
|
||||
freq_bm_str.size()-1));
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "Setting bandwidth mask to " << "0b" << freq_bm_str <<
|
||||
std::cout << "\tSetting bandwidth mask to " << "0b" << freq_bm_str <<
|
||||
" ..." << std::endl;
|
||||
}
|
||||
ret = rsmi_dev_pci_bandwidth_set(dv_ind, freq_bitmask);
|
||||
@@ -151,9 +165,9 @@ void TestPciReadWrite::Run(void) {
|
||||
CHK_ERR_ASRT(ret)
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "Bandwidth is now index " << bw.transfer_rate.current <<
|
||||
std::cout << "\tBandwidth is now index " << bw.transfer_rate.current <<
|
||||
std::endl;
|
||||
std::cout << "Resetting mask to all bandwidths." << std::endl;
|
||||
std::cout << "\tResetting mask to all bandwidths." << std::endl;
|
||||
}
|
||||
ret = rsmi_dev_pci_bandwidth_set(dv_ind, 0xFFFFFFFF);
|
||||
CHK_ERR_ASRT(ret)
|
||||
|
||||
@@ -99,7 +99,7 @@ void TestTempRead::Run(void) {
|
||||
err = rsmi_dev_temp_metric_get(i, 0, met, &val_i64);
|
||||
|
||||
if (err != RSMI_STATUS_SUCCESS) {
|
||||
if (err == RSMI_STATUS_FILE_ERROR) {
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**" << label << ": " <<
|
||||
"Not supported on this machine" << std::endl;
|
||||
|
||||
@@ -155,7 +155,7 @@ void TestVoltCurvRead::Run(void) {
|
||||
|
||||
err = rsmi_dev_od_volt_info_get(i, &odv);
|
||||
if (err == RSMI_STATUS_FILE_ERROR ||
|
||||
err == RSMI_STATUS_NOT_YET_IMPLEMENTED) {
|
||||
err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout <<
|
||||
"\t**rsmi_dev_od_volt_info_get: Not supported on this machine"
|
||||
|
||||
Viittaa uudessa ongelmassa
Block a user