Implementation for adding pcie_total (#40)
* Implementation for adding pcie_total
Signed-off-by: adapryor <Adam.pryor@amd.com>
Change-Id: I4b0cfd7095e9d984e939283ee7169d01f55a1847
Signed-off-by: adapryor <Adam.pryor@amd.com>
* Updates
Signed-off-by: adapryor <Adam.pryor@amd.com>
Change-Id: I021f29083de651cab9fbe7db98acbe20f65948d4
* Updates
Signed-off-by: adapryor <Adam.pryor@amd.com>
Change-Id: I42f3207b745fa787dabe30a85c8e063159d1337d
---------
Signed-off-by: adapryor <Adam.pryor@amd.com>
[ROCm/rdc commit: 60b7359161]
This commit is contained in:
@@ -422,6 +422,9 @@ typedef struct {
|
||||
uint64_t ecc_uncorrect; //!< Uncorrectable errors
|
||||
rdc_stats_summary_t pcie_tx; //!< Bytes sent over PCIe stats
|
||||
rdc_stats_summary_t pcie_rx; //!< Bytes received over PCIe stats
|
||||
rdc_stats_summary_t pcie_total; //!< Total PCIe bandwidth stats
|
||||
//!< NOTE: pcie_tx/pcie_rx are not available on MI300; the max integer
|
||||
//!< value is returned for them there, so use pcie_total instead
|
||||
rdc_stats_summary_t power_usage; //!< GPU Power usage stats
|
||||
rdc_stats_summary_t gpu_clock; //!< GPU Clock speed stats
|
||||
rdc_stats_summary_t memory_clock; //!< Mem. Clock speed stats
|
||||
|
||||
@@ -447,6 +447,7 @@ message GpuUsageInfo {
|
||||
JobStatsSummary pcie_rx = 13;
|
||||
JobStatsSummary memory_clock = 14;
|
||||
JobStatsSummary gpu_temperature = 15;
|
||||
JobStatsSummary pcie_total = 16;
|
||||
}
|
||||
message GetJobStatsResponse {
|
||||
uint32 status = 1;
|
||||
|
||||
@@ -210,6 +210,7 @@ class rdc_gpu_usage_info_t(Structure):
|
||||
,("ecc_uncorrect", c_uint64)
|
||||
,("pcie_tx", rdc_stats_summary_t)
|
||||
,("pcie_rx", rdc_stats_summary_t)
|
||||
,("pcie_total", rdc_stats_summary_t)
|
||||
,("power_usage", rdc_stats_summary_t)
|
||||
,("gpu_clock", rdc_stats_summary_t)
|
||||
,("memory_clock", rdc_stats_summary_t)
|
||||
|
||||
@@ -294,6 +294,7 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64],
|
||||
summary_info.power_usage = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.pcie_tx = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.pcie_rx = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.pcie_total = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.gpu_temperature = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.memory_clock = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.gpu_clock = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
@@ -363,6 +364,8 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64],
|
||||
set_summary(ite->second, gpu_info.pcie_tx, summary_info.pcie_tx, 1024 * 1024);
|
||||
} else if (ite->first == RDC_FI_PCIE_RX) {
|
||||
set_summary(ite->second, gpu_info.pcie_rx, summary_info.pcie_rx, 1024 * 1024);
|
||||
} else if (ite->first == RDC_FI_PCIE_BANDWIDTH) {
|
||||
set_summary(ite->second, gpu_info.pcie_total, summary_info.pcie_total, 1024 * 1024);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,7 +33,7 @@ RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
|
||||
// Add the default job stats fields
|
||||
rdc_field_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_POWER_USAGE, RDC_FI_GPU_CLOCK,
|
||||
RDC_FI_GPU_UTIL, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX,
|
||||
RDC_FI_MEM_CLOCK, RDC_FI_GPU_TEMP};
|
||||
RDC_FI_PCIE_BANDWIDTH, RDC_FI_MEM_CLOCK, RDC_FI_GPU_TEMP};
|
||||
char job_field_group[] = "JobStatsFields";
|
||||
rdc_field_grp_t fgid = JOB_FIELD_ID;
|
||||
|
||||
|
||||
@@ -129,6 +129,12 @@ bool RdcStandaloneHandler::copy_gpu_usage_info(const ::rdc::GpuUsageInfo& src,
|
||||
target->pcie_rx.average = rxstats.average();
|
||||
target->pcie_rx.standard_deviation = rxstats.standard_deviation();
|
||||
|
||||
const ::rdc::JobStatsSummary& pcietotalstats = src.pcie_total();
|
||||
target->pcie_total.max_value = pcietotalstats.max_value();
|
||||
target->pcie_total.min_value = pcietotalstats.min_value();
|
||||
target->pcie_total.average = pcietotalstats.average();
|
||||
target->pcie_total.standard_deviation = pcietotalstats.standard_deviation();
|
||||
|
||||
const ::rdc::JobStatsSummary& mcstats = src.memory_clock();
|
||||
target->memory_clock.max_value = mcstats.max_value();
|
||||
target->memory_clock.min_value = mcstats.min_value();
|
||||
|
||||
@@ -203,6 +203,12 @@ void RdciStatsSubSystem::show_job_stats_json(const rdc_gpu_usage_info_t& gpu_inf
|
||||
std::cout << "\"pcie_tx_avg\": " << gpu_info.pcie_tx.average << ",";
|
||||
std::cout << "\"pcie_tx_stanard_deviation\": " << gpu_info.pcie_tx.standard_deviation << ",";
|
||||
|
||||
std::cout << "\"pcie_total_max\": " << gpu_info.pcie_total.max_value << ",";
|
||||
std::cout << "\"pcie_total_min\": " << gpu_info.pcie_total.min_value << ",";
|
||||
std::cout << "\"pcie_total_avg\": " << gpu_info.pcie_total.average << ",";
|
||||
std::cout << "\"pcie_total_stanard_deviation\": " << gpu_info.pcie_total.standard_deviation
|
||||
<< ",";
|
||||
|
||||
std::cout << "\"ecc_correct\": " << gpu_info.ecc_correct << ",";
|
||||
std::cout << "\"ecc_uncorrect\": " << gpu_info.ecc_uncorrect;
|
||||
}
|
||||
@@ -254,13 +260,45 @@ void RdciStatsSubSystem::show_job_stats(const rdc_gpu_usage_info_t& gpu_info) co
|
||||
<< " Avg: " << gpu_info.gpu_temperature.average << " SD: " << std::fixed
|
||||
<< std::setprecision(2) << gpu_info.gpu_temperature.standard_deviation << "\n";
|
||||
std::cout << "| PCIe Rx Bandwidth (megabytes) | "
|
||||
<< "Max: " << gpu_info.pcie_rx.max_value << " Min: " << gpu_info.pcie_rx.min_value
|
||||
<< " Avg: " << gpu_info.pcie_rx.average << " SD: " << std::fixed << std::setprecision(2)
|
||||
<< gpu_info.pcie_rx.standard_deviation << "\n";
|
||||
<< "Max: "
|
||||
<< (gpu_info.pcie_rx.max_value == std::numeric_limits<int>::max()
|
||||
? "N/A"
|
||||
: std::to_string(gpu_info.pcie_rx.max_value))
|
||||
<< " Min: "
|
||||
<< (gpu_info.pcie_rx.min_value == std::numeric_limits<int>::max()
|
||||
? "N/A"
|
||||
: std::to_string(gpu_info.pcie_rx.min_value))
|
||||
<< " Avg: "
|
||||
<< (gpu_info.pcie_rx.average == std::numeric_limits<int>::max()
|
||||
? "N/A"
|
||||
: std::to_string(gpu_info.pcie_rx.average))
|
||||
<< " SD: " << std::fixed << std::setprecision(2)
|
||||
<< (gpu_info.pcie_rx.standard_deviation == std::numeric_limits<float>::max()
|
||||
? "N/A"
|
||||
: std::to_string(gpu_info.pcie_rx.standard_deviation))
|
||||
<< "\n";
|
||||
std::cout << "| PCIe Tx Bandwidth (megabytes) | "
|
||||
<< "Max: " << gpu_info.pcie_tx.max_value << " Min: " << gpu_info.pcie_tx.min_value
|
||||
<< " Avg: " << gpu_info.pcie_tx.average << " SD: " << std::fixed << std::setprecision(2)
|
||||
<< gpu_info.pcie_tx.standard_deviation << "\n";
|
||||
<< "Max: "
|
||||
<< (gpu_info.pcie_tx.max_value == std::numeric_limits<int>::max()
|
||||
? "N/A"
|
||||
: std::to_string(gpu_info.pcie_tx.max_value))
|
||||
<< " Min: "
|
||||
<< (gpu_info.pcie_tx.min_value == std::numeric_limits<int>::max()
|
||||
? "N/A"
|
||||
: std::to_string(gpu_info.pcie_tx.min_value))
|
||||
<< " Avg: "
|
||||
<< (gpu_info.pcie_tx.average == std::numeric_limits<int>::max()
|
||||
? "N/A"
|
||||
: std::to_string(gpu_info.pcie_tx.average))
|
||||
<< " SD: " << std::fixed << std::setprecision(2)
|
||||
<< (gpu_info.pcie_tx.standard_deviation == std::numeric_limits<float>::max()
|
||||
? "N/A"
|
||||
: std::to_string(gpu_info.pcie_tx.standard_deviation))
|
||||
<< "\n";
|
||||
std::cout << "| PCIe Total Bandwidth (megabytes) | "
|
||||
<< "Max: " << gpu_info.pcie_total.max_value << " Min: " << gpu_info.pcie_total.min_value
|
||||
<< " Avg: " << gpu_info.pcie_total.average << " SD: " << std::fixed
|
||||
<< std::setprecision(2) << gpu_info.pcie_total.standard_deviation << "\n";
|
||||
std::cout << "| Correctable ECC Errors | " << gpu_info.ecc_correct << "\n";
|
||||
std::cout << "| Uncorrectable ECC Errors | " << gpu_info.ecc_uncorrect << "\n";
|
||||
std::cout << "+----------------------------------"
|
||||
|
||||
@@ -523,6 +523,12 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
|
||||
stats->set_average(src.pcie_rx.average);
|
||||
stats->set_standard_deviation(src.pcie_rx.standard_deviation);
|
||||
|
||||
stats = target->mutable_pcie_total();
|
||||
stats->set_max_value(src.pcie_total.max_value);
|
||||
stats->set_min_value(src.pcie_total.min_value);
|
||||
stats->set_average(src.pcie_total.average);
|
||||
stats->set_standard_deviation(src.pcie_total.standard_deviation);
|
||||
|
||||
stats = target->mutable_memory_clock();
|
||||
stats->set_max_value(src.memory_clock.max_value);
|
||||
stats->set_min_value(src.memory_clock.min_value);
|
||||
|
||||
Reference in New Issue
Block a user