diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index 3fdab1d1da..03c31d822a 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -422,6 +422,9 @@ typedef struct { uint64_t ecc_uncorrect; //!< Uncorrectable errors rdc_stats_summary_t pcie_tx; //!< Bytes sent over PCIe stats rdc_stats_summary_t pcie_rx; //!< Bytes received over PCIe stats + rdc_stats_summary_t pcie_total; //!< Total PCIe bandwidth stats + //!< pcie_tx/pcie_rx are not available on mi300, max integer + //!< returned, so use pcie_total rdc_stats_summary_t power_usage; //!< GPU Power usage stats rdc_stats_summary_t gpu_clock; //!< GPU Clock speed stats rdc_stats_summary_t memory_clock; //!< Mem. Clock speed stats diff --git a/projects/rdc/protos/rdc.proto b/projects/rdc/protos/rdc.proto index eb6f2732ed..f0fc2a8a22 100755 --- a/projects/rdc/protos/rdc.proto +++ b/projects/rdc/protos/rdc.proto @@ -447,6 +447,7 @@ message GpuUsageInfo { JobStatsSummary pcie_rx = 13; JobStatsSummary memory_clock = 14; JobStatsSummary gpu_temperature = 15; + JobStatsSummary pcie_total = 16; } message GetJobStatsResponse { uint32 status = 1; diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py index cbb362ef09..9a21126d3f 100644 --- a/projects/rdc/python_binding/rdc_bootstrap.py +++ b/projects/rdc/python_binding/rdc_bootstrap.py @@ -210,6 +210,7 @@ class rdc_gpu_usage_info_t(Structure): ,("ecc_uncorrect", c_uint64) ,("pcie_tx", rdc_stats_summary_t) ,("pcie_rx", rdc_stats_summary_t) + ,("pcie_total", rdc_stats_summary_t) ,("power_usage", rdc_stats_summary_t) ,("gpu_clock", rdc_stats_summary_t) ,("memory_clock", rdc_stats_summary_t) diff --git a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index 959645bc8c..fdb6197a2a 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -294,6 +294,7 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64], summary_info.power_usage = {0, std::numeric_limits::max(), 0, 0}; summary_info.pcie_tx = {0, std::numeric_limits::max(), 0, 0}; summary_info.pcie_rx = {0, std::numeric_limits::max(), 0, 0}; + summary_info.pcie_total = {0, std::numeric_limits::max(), 0, 0}; summary_info.gpu_temperature = {0, std::numeric_limits::max(), 0, 0}; summary_info.memory_clock = {0, std::numeric_limits::max(), 0, 0}; summary_info.gpu_clock = {0, std::numeric_limits::max(), 0, 0}; @@ -363,6 +364,8 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64], set_summary(ite->second, gpu_info.pcie_tx, summary_info.pcie_tx, 1024 * 1024); } else if (ite->first == RDC_FI_PCIE_RX) { set_summary(ite->second, gpu_info.pcie_rx, summary_info.pcie_rx, 1024 * 1024); + } else if (ite->first == RDC_FI_PCIE_BANDWIDTH) { + set_summary(ite->second, gpu_info.pcie_total, summary_info.pcie_total, 1024 * 1024); } } } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc index a086cedb6f..040580c2e7 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc @@ -33,7 +33,7 @@ RdcGroupSettingsImpl::RdcGroupSettingsImpl() { // Add the default job stats fields rdc_field_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_POWER_USAGE, RDC_FI_GPU_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, - RDC_FI_MEM_CLOCK, RDC_FI_GPU_TEMP}; + RDC_FI_PCIE_BANDWIDTH, RDC_FI_MEM_CLOCK, RDC_FI_GPU_TEMP}; char job_field_group[] = "JobStatsFields"; rdc_field_grp_t fgid = JOB_FIELD_ID; diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 1dde77dd7f..a18e0e2b8e 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -129,6 +129,12 @@ bool RdcStandaloneHandler::copy_gpu_usage_info(const ::rdc::GpuUsageInfo& src, target->pcie_rx.average = rxstats.average(); target->pcie_rx.standard_deviation = rxstats.standard_deviation(); + const ::rdc::JobStatsSummary& pcietotalstats = src.pcie_total(); + target->pcie_total.max_value = pcietotalstats.max_value(); + target->pcie_total.min_value = pcietotalstats.min_value(); + target->pcie_total.average = pcietotalstats.average(); + target->pcie_total.standard_deviation = pcietotalstats.standard_deviation(); + const ::rdc::JobStatsSummary& mcstats = src.memory_clock(); target->memory_clock.max_value = mcstats.max_value(); target->memory_clock.min_value = mcstats.min_value(); diff --git a/projects/rdc/rdci/src/RdciStatsSubSystem.cc b/projects/rdc/rdci/src/RdciStatsSubSystem.cc index 95ac1783c5..1384021706 100644 --- a/projects/rdc/rdci/src/RdciStatsSubSystem.cc +++ b/projects/rdc/rdci/src/RdciStatsSubSystem.cc @@ -203,6 +203,12 @@ void RdciStatsSubSystem::show_job_stats_json(const rdc_gpu_usage_info_t& gpu_inf std::cout << "\"pcie_tx_avg\": " << gpu_info.pcie_tx.average << ","; std::cout << "\"pcie_tx_stanard_deviation\": " << gpu_info.pcie_tx.standard_deviation << ","; + std::cout << "\"pcie_total_max\": " << gpu_info.pcie_total.max_value << ","; + std::cout << "\"pcie_total_min\": " << gpu_info.pcie_total.min_value << ","; + std::cout << "\"pcie_total_avg\": " << gpu_info.pcie_total.average << ","; + std::cout << "\"pcie_total_stanard_deviation\": " << gpu_info.pcie_total.standard_deviation + << ","; + std::cout << "\"ecc_correct\": " << gpu_info.ecc_correct << ","; std::cout << "\"ecc_uncorrect\": " << gpu_info.ecc_uncorrect; } @@ -254,13 +260,45 @@ void RdciStatsSubSystem::show_job_stats(const rdc_gpu_usage_info_t& gpu_info) co << " Avg: " << gpu_info.gpu_temperature.average << " SD: " << std::fixed << std::setprecision(2) << gpu_info.gpu_temperature.standard_deviation << "\n"; std::cout << "| PCIe Rx Bandwidth (megabytes) | " - << "Max: " << gpu_info.pcie_rx.max_value << " Min: " << gpu_info.pcie_rx.min_value - << " Avg: " << gpu_info.pcie_rx.average << " SD: " << std::fixed << std::setprecision(2) - << gpu_info.pcie_rx.standard_deviation << "\n"; + << "Max: " + << (gpu_info.pcie_rx.max_value == std::numeric_limits::max() + ? "N/A" + : std::to_string(gpu_info.pcie_rx.max_value)) + << " Min: " + << (gpu_info.pcie_rx.min_value == std::numeric_limits::max() + ? "N/A" + : std::to_string(gpu_info.pcie_rx.min_value)) + << " Avg: " + << (gpu_info.pcie_rx.average == std::numeric_limits::max() + ? "N/A" + : std::to_string(gpu_info.pcie_rx.average)) + << " SD: " << std::fixed << std::setprecision(2) + << (gpu_info.pcie_rx.standard_deviation == std::numeric_limits::max() + ? "N/A" + : std::to_string(gpu_info.pcie_rx.standard_deviation)) + << "\n"; std::cout << "| PCIe Tx Bandwidth (megabytes) | " - << "Max: " << gpu_info.pcie_tx.max_value << " Min: " << gpu_info.pcie_tx.min_value - << " Avg: " << gpu_info.pcie_tx.average << " SD: " << std::fixed << std::setprecision(2) - << gpu_info.pcie_tx.standard_deviation << "\n"; + << "Max: " + << (gpu_info.pcie_tx.max_value == std::numeric_limits::max() + ? "N/A" + : std::to_string(gpu_info.pcie_tx.max_value)) + << " Min: " + << (gpu_info.pcie_tx.min_value == std::numeric_limits::max() + ? "N/A" + : std::to_string(gpu_info.pcie_tx.min_value)) + << " Avg: " + << (gpu_info.pcie_tx.average == std::numeric_limits::max() + ? "N/A" + : std::to_string(gpu_info.pcie_tx.average)) + << " SD: " << std::fixed << std::setprecision(2) + << (gpu_info.pcie_tx.standard_deviation == std::numeric_limits::max() + ? "N/A" + : std::to_string(gpu_info.pcie_tx.standard_deviation)) + << "\n"; + std::cout << "| PCIe Total Bandwidth (megabytes) | " + << "Max: " << gpu_info.pcie_total.max_value << " Min: " << gpu_info.pcie_total.min_value + << " Avg: " << gpu_info.pcie_total.average << " SD: " << std::fixed + << std::setprecision(2) << gpu_info.pcie_total.standard_deviation << "\n"; std::cout << "| Correctable ECC Errors | " << gpu_info.ecc_correct << "\n"; std::cout << "| Uncorrectable ECC Errors | " << gpu_info.ecc_uncorrect << "\n"; std::cout << "+----------------------------------" diff --git a/projects/rdc/server/src/rdc_api_service.cc b/projects/rdc/server/src/rdc_api_service.cc index edd6659748..9fe50df4d4 100644 --- a/projects/rdc/server/src/rdc_api_service.cc +++ b/projects/rdc/server/src/rdc_api_service.cc @@ -523,6 +523,12 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, stats->set_average(src.pcie_rx.average); stats->set_standard_deviation(src.pcie_rx.standard_deviation); + stats = target->mutable_pcie_total(); + stats->set_max_value(src.pcie_total.max_value); + stats->set_min_value(src.pcie_total.min_value); + stats->set_average(src.pcie_total.average); + stats->set_standard_deviation(src.pcie_total.standard_deviation); + stats = target->mutable_memory_clock(); stats->set_max_value(src.memory_clock.max_value); stats->set_min_value(src.memory_clock.min_value);