Implementation for adding pcie_total (#40)

* Implementation for adding pcie_total

Signed-off-by: adapryor <Adam.pryor@amd.com>
Change-Id: I4b0cfd7095e9d984e939283ee7169d01f55a1847
Signed-off-by: adapryor <Adam.pryor@amd.com>

* Updates

Signed-off-by: adapryor <Adam.pryor@amd.com>
Change-Id: I021f29083de651cab9fbe7db98acbe20f65948d4

* Updates

Signed-off-by: adapryor <Adam.pryor@amd.com>
Change-Id: I42f3207b745fa787dabe30a85c8e063159d1337d

---------

Signed-off-by: adapryor <Adam.pryor@amd.com>

[ROCm/rdc commit: 60b7359161]
This commit is contained in:
Pryor, Adam
2024-12-26 18:36:41 -06:00
committed by GitHub
parent 0e5cf815d8
commit 20f3ba845c
8 changed files with 65 additions and 7 deletions
+3
View File
@@ -422,6 +422,9 @@ typedef struct {
uint64_t ecc_uncorrect; //!< Uncorrectable errors
rdc_stats_summary_t pcie_tx; //!< Bytes sent over PCIe stats
rdc_stats_summary_t pcie_rx; //!< Bytes received over PCIe stats
rdc_stats_summary_t pcie_total; //!< Total PCIe bandwidth stats
//!< pcie_tx/pcie_rx are not available on MI300 (the maximum integer
//!< value is returned for them), so use pcie_total there instead
rdc_stats_summary_t power_usage; //!< GPU Power usage stats
rdc_stats_summary_t gpu_clock; //!< GPU Clock speed stats
rdc_stats_summary_t memory_clock; //!< Mem. Clock speed stats
+1
View File
@@ -447,6 +447,7 @@ message GpuUsageInfo {
JobStatsSummary pcie_rx = 13;
JobStatsSummary memory_clock = 14;
JobStatsSummary gpu_temperature = 15;
JobStatsSummary pcie_total = 16;
}
message GetJobStatsResponse {
uint32 status = 1;
@@ -210,6 +210,7 @@ class rdc_gpu_usage_info_t(Structure):
,("ecc_uncorrect", c_uint64)
,("pcie_tx", rdc_stats_summary_t)
,("pcie_rx", rdc_stats_summary_t)
,("pcie_total", rdc_stats_summary_t)
,("power_usage", rdc_stats_summary_t)
,("gpu_clock", rdc_stats_summary_t)
,("memory_clock", rdc_stats_summary_t)
@@ -294,6 +294,7 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64],
summary_info.power_usage = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
summary_info.pcie_tx = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
summary_info.pcie_rx = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
summary_info.pcie_total = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
summary_info.gpu_temperature = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
summary_info.memory_clock = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
summary_info.gpu_clock = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
@@ -363,6 +364,8 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64],
set_summary(ite->second, gpu_info.pcie_tx, summary_info.pcie_tx, 1024 * 1024);
} else if (ite->first == RDC_FI_PCIE_RX) {
set_summary(ite->second, gpu_info.pcie_rx, summary_info.pcie_rx, 1024 * 1024);
} else if (ite->first == RDC_FI_PCIE_BANDWIDTH) {
set_summary(ite->second, gpu_info.pcie_total, summary_info.pcie_total, 1024 * 1024);
}
}
}
@@ -33,7 +33,7 @@ RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
// Add the default job stats fields
rdc_field_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_POWER_USAGE, RDC_FI_GPU_CLOCK,
RDC_FI_GPU_UTIL, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX,
RDC_FI_MEM_CLOCK, RDC_FI_GPU_TEMP};
RDC_FI_PCIE_BANDWIDTH, RDC_FI_MEM_CLOCK, RDC_FI_GPU_TEMP};
char job_field_group[] = "JobStatsFields";
rdc_field_grp_t fgid = JOB_FIELD_ID;
@@ -129,6 +129,12 @@ bool RdcStandaloneHandler::copy_gpu_usage_info(const ::rdc::GpuUsageInfo& src,
target->pcie_rx.average = rxstats.average();
target->pcie_rx.standard_deviation = rxstats.standard_deviation();
const ::rdc::JobStatsSummary& pcietotalstats = src.pcie_total();
target->pcie_total.max_value = pcietotalstats.max_value();
target->pcie_total.min_value = pcietotalstats.min_value();
target->pcie_total.average = pcietotalstats.average();
target->pcie_total.standard_deviation = pcietotalstats.standard_deviation();
const ::rdc::JobStatsSummary& mcstats = src.memory_clock();
target->memory_clock.max_value = mcstats.max_value();
target->memory_clock.min_value = mcstats.min_value();
+44 -6
View File
@@ -203,6 +203,12 @@ void RdciStatsSubSystem::show_job_stats_json(const rdc_gpu_usage_info_t& gpu_inf
std::cout << "\"pcie_tx_avg\": " << gpu_info.pcie_tx.average << ",";
std::cout << "\"pcie_tx_stanard_deviation\": " << gpu_info.pcie_tx.standard_deviation << ",";
std::cout << "\"pcie_total_max\": " << gpu_info.pcie_total.max_value << ",";
std::cout << "\"pcie_total_min\": " << gpu_info.pcie_total.min_value << ",";
std::cout << "\"pcie_total_avg\": " << gpu_info.pcie_total.average << ",";
std::cout << "\"pcie_total_stanard_deviation\": " << gpu_info.pcie_total.standard_deviation
<< ",";
std::cout << "\"ecc_correct\": " << gpu_info.ecc_correct << ",";
std::cout << "\"ecc_uncorrect\": " << gpu_info.ecc_uncorrect;
}
@@ -254,13 +260,45 @@ void RdciStatsSubSystem::show_job_stats(const rdc_gpu_usage_info_t& gpu_info) co
<< " Avg: " << gpu_info.gpu_temperature.average << " SD: " << std::fixed
<< std::setprecision(2) << gpu_info.gpu_temperature.standard_deviation << "\n";
std::cout << "| PCIe Rx Bandwidth (megabytes) | "
<< "Max: " << gpu_info.pcie_rx.max_value << " Min: " << gpu_info.pcie_rx.min_value
<< " Avg: " << gpu_info.pcie_rx.average << " SD: " << std::fixed << std::setprecision(2)
<< gpu_info.pcie_rx.standard_deviation << "\n";
<< "Max: "
<< (gpu_info.pcie_rx.max_value == std::numeric_limits<int>::max()
? "N/A"
: std::to_string(gpu_info.pcie_rx.max_value))
<< " Min: "
<< (gpu_info.pcie_rx.min_value == std::numeric_limits<int>::max()
? "N/A"
: std::to_string(gpu_info.pcie_rx.min_value))
<< " Avg: "
<< (gpu_info.pcie_rx.average == std::numeric_limits<int>::max()
? "N/A"
: std::to_string(gpu_info.pcie_rx.average))
<< " SD: " << std::fixed << std::setprecision(2)
<< (gpu_info.pcie_rx.standard_deviation == std::numeric_limits<float>::max()
? "N/A"
: std::to_string(gpu_info.pcie_rx.standard_deviation))
<< "\n";
std::cout << "| PCIe Tx Bandwidth (megabytes) | "
<< "Max: " << gpu_info.pcie_tx.max_value << " Min: " << gpu_info.pcie_tx.min_value
<< " Avg: " << gpu_info.pcie_tx.average << " SD: " << std::fixed << std::setprecision(2)
<< gpu_info.pcie_tx.standard_deviation << "\n";
<< "Max: "
<< (gpu_info.pcie_tx.max_value == std::numeric_limits<int>::max()
? "N/A"
: std::to_string(gpu_info.pcie_tx.max_value))
<< " Min: "
<< (gpu_info.pcie_tx.min_value == std::numeric_limits<int>::max()
? "N/A"
: std::to_string(gpu_info.pcie_tx.min_value))
<< " Avg: "
<< (gpu_info.pcie_tx.average == std::numeric_limits<int>::max()
? "N/A"
: std::to_string(gpu_info.pcie_tx.average))
<< " SD: " << std::fixed << std::setprecision(2)
<< (gpu_info.pcie_tx.standard_deviation == std::numeric_limits<float>::max()
? "N/A"
: std::to_string(gpu_info.pcie_tx.standard_deviation))
<< "\n";
std::cout << "| PCIe Total Bandwidth (megabytes) | "
<< "Max: " << gpu_info.pcie_total.max_value << " Min: " << gpu_info.pcie_total.min_value
<< " Avg: " << gpu_info.pcie_total.average << " SD: " << std::fixed
<< std::setprecision(2) << gpu_info.pcie_total.standard_deviation << "\n";
std::cout << "| Correctable ECC Errors | " << gpu_info.ecc_correct << "\n";
std::cout << "| Uncorrectable ECC Errors | " << gpu_info.ecc_uncorrect << "\n";
std::cout << "+----------------------------------"
@@ -523,6 +523,12 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
stats->set_average(src.pcie_rx.average);
stats->set_standard_deviation(src.pcie_rx.standard_deviation);
stats = target->mutable_pcie_total();
stats->set_max_value(src.pcie_total.max_value);
stats->set_min_value(src.pcie_total.min_value);
stats->set_average(src.pcie_total.average);
stats->set_standard_deviation(src.pcie_total.standard_deviation);
stats = target->mutable_memory_clock();
stats->set_max_value(src.memory_clock.max_value);
stats->set_min_value(src.memory_clock.min_value);