diff --git a/projects/rdc/common/rdc_field.data b/projects/rdc/common/rdc_field.data index ecf4ce824c..95fbfaafee 100644 --- a/projects/rdc/common/rdc_field.data +++ b/projects/rdc/common/rdc_field.data @@ -218,7 +218,15 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_VMFAULT, "VM page fault", FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp", "THERMAL_THROT", false) FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false) FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false) -FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured", "RING_HANG", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_MIGRATE_START, "GPU migrate has started", "MIGRATE_START", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_MIGRATE_END, "GPU migrate has ended", "MIGRATE_END", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_PAGE_FAULT_START, "GPU page fault started", "PAGE_FAULT_START", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_PAGE_FAULT_END, "GPU page fault ended", "PAGE_FAULT_END", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_QUEUE_EVICTION, "GPU queue eviction occurred", "QUEUE_EVICTION", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_QUEUE_RESTORE, "GPU queue restore occurred", "QUEUE_RESTORE", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_UNMAP_FROM_GPU, "GPU unmap occurred", "UNMAP_FROM_GPU", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_PROCESS_START, "GPU process started", "PROCESS_START", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_PROCESS_END, "GPU process ended", "PROCESS_END", false) // RDC health related fields FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR, "XGMI one or more errors detected", "XGMI_ERROR", true) diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index d11d5a5874..288ab00220 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -399,9 +399,17 @@ typedef enum { //!< due to temperature rise RDC_EVNT_NOTIF_PRE_RESET, //!< GPU reset is about to occur RDC_EVNT_NOTIF_POST_RESET, //!< GPU reset just occurred - 
RDC_EVNT_NOTIF_RING_HANG, //!< GPU ring hang just occurred + RDC_EVNT_NOTIF_MIGRATE_START, //!< GPU migrate has started + RDC_EVNT_NOTIF_MIGRATE_END, //!< GPU migrate has ended + RDC_EVNT_NOTIF_PAGE_FAULT_START, //!< GPU page fault started + RDC_EVNT_NOTIF_PAGE_FAULT_END, //!< GPU page fault ended + RDC_EVNT_NOTIF_QUEUE_EVICTION, //!< GPU queue eviction occurred + RDC_EVNT_NOTIF_QUEUE_RESTORE, //!< GPU queue restore occurred + RDC_EVNT_NOTIF_UNMAP_FROM_GPU, //!< GPU unmap occurred + RDC_EVNT_NOTIF_PROCESS_START, //!< GPU process started + RDC_EVNT_NOTIF_PROCESS_END, //!< GPU process ended - RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_RING_HANG, + RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_PROCESS_END, /** * @brief RDC health related fields @@ -496,6 +504,18 @@ typedef struct { rdc_stats_summary_t memory_utilization; //!< Memory Utilization statistics } rdc_gpu_usage_info_t; //!< GPU usage statistics +#define MAX_PROCESS_NAME 256 +/** + * @brief The structure to track process start/stop times during a job running + */ +typedef struct { + uint32_t pid; //!< Process ID + char process_name[MAX_PROCESS_NAME]; //!< Process (task) name, NUL-terminated + uint64_t start_time; //!< Process start time in microseconds since 1970 + uint64_t stop_time; //!< Process stop time in microseconds since 1970 +} rdc_process_status_info_t; +#define RDC_MAX_NUM_PROCESSES_STATUS 64 + /** * @brief The structure to hold the job stats */ @@ -504,6 +524,9 @@ typedef struct { rdc_gpu_usage_info_t summary; //!< Job usage summary statistics //!< (overall) rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary statistics by GPU + uint32_t num_processes; //!< Number of processes tracked + rdc_process_status_info_t + processes[RDC_MAX_NUM_PROCESSES_STATUS]; //!< Array to track process start/stop times } rdc_job_info_t; /** diff --git a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h index 5852a9f904..d821a6b891 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h @@ -78,6 +78,10 @@ struct RdcJobStatsCacheEntry { uint64_t start_time; uint64_t end_time; std::map gpu_stats; + + uint32_t num_processes = 0; + std::array processes{}; + std::map pid_to_index; 
}; // @@ -110,19 +114,14 @@ class RdcCacheManagerImpl : public RdcCacheManager { rdc_status_t rdc_job_remove(const char job_id[64]) override; rdc_status_t rdc_job_remove_all() override; - rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, - uint32_t gpu_index, + rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, uint32_t gpu_index, const rdc_field_value& value) override; - rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id, - uint32_t gpu_index, - rdc_field_t field_id, - uint64_t start_timestamp, - uint64_t end_timestamp, - rdc_field_value* start_value, + rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id, uint32_t gpu_index, + rdc_field_t field_id, uint64_t start_timestamp, + uint64_t end_timestamp, rdc_field_value* start_value, rdc_field_value* end_value) override; rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override; - rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id, - uint32_t gpu_index, + rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id, uint32_t gpu_index, const rdc_field_value& value) override; private: diff --git a/projects/rdc/include/rdc_lib/impl/SmiUtils.h b/projects/rdc/include/rdc_lib/impl/SmiUtils.h index 7c8c06da6e..dd81e7ab8d 100644 --- a/projects/rdc/include/rdc_lib/impl/SmiUtils.h +++ b/projects/rdc/include/rdc_lib/impl/SmiUtils.h @@ -34,6 +34,8 @@ namespace rdc { rdc_status_t Smi2RdcError(amdsmi_status_t rsmi); amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id, amdsmi_processor_handle* processor_handle); +amdsmi_status_t get_gpu_id_from_processor_handle(amdsmi_processor_handle processor_handle, + uint32_t* gpu_index); amdsmi_status_t get_processor_count(uint32_t& all_processor_count); amdsmi_status_t get_socket_handles(std::vector& sockets); amdsmi_status_t get_processor_handles(amdsmi_socket_handle socket, diff --git a/projects/rdc/protos/rdc.proto b/projects/rdc/protos/rdc.proto index 94f19bcb3b..e9e7b6ffb3 100755 --- a/projects/rdc/protos/rdc.proto +++ 
b/projects/rdc/protos/rdc.proto @@ -456,11 +456,21 @@ message GpuUsageInfo { JobStatsSummary gpu_temperature = 15; JobStatsSummary pcie_total = 16; } + +message RdcProcessStatsInfo { + uint32 pid = 1; + string process_name = 2; + uint64 start_time = 3; + uint64 stop_time = 4; +} + message GetJobStatsResponse { uint32 status = 1; uint32 num_gpus = 2; GpuUsageInfo summary = 3; repeated GpuUsageInfo gpus = 4; + uint32 num_processes = 5; + repeated RdcProcessStatsInfo processes = 6; } message StopJobStatsRequest { diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py index 7ba3c3600e..dbef7b921f 100644 --- a/projects/rdc/python_binding/rdc_bootstrap.py +++ b/projects/rdc/python_binding/rdc_bootstrap.py @@ -232,7 +232,15 @@ class rdc_field_t(c_int): RDC_EVNT_NOTIF_THERMAL_THROTTLE = 2001 RDC_EVNT_NOTIF_PRE_RESET = 2002 RDC_EVNT_NOTIF_POST_RESET = 2003 - RDC_EVNT_NOTIF_RING_HANG = 2004 + RDC_EVNT_NOTIF_MIGRATE_START = 2004 + RDC_EVNT_NOTIF_MIGRATE_END = 2005 + RDC_EVNT_NOTIF_PAGE_FAULT_START = 2006 + RDC_EVNT_NOTIF_PAGE_FAULT_END = 2007 + RDC_EVNT_NOTIF_QUEUE_EVICTION = 2008 + RDC_EVNT_NOTIF_QUEUE_RESTORE = 2009 + RDC_EVNT_NOTIF_UNMAP_FROM_GPU = 2010 + RDC_EVNT_NOTIF_PROCESS_START = 2011 + RDC_EVNT_NOTIF_PROCESS_END = 2012 RDC_HEALTH_XGMI_ERROR = 3000 RDC_HEALTH_PCIE_REPLAY_COUNT = 3001 RDC_HEALTH_RETIRED_PAGE_NUM = 3002 @@ -254,7 +262,15 @@ class rdc_field_t(c_int): RDC_EVNT_NOTIF_THERMAL_THROTTLE: rdc_metric_type_t.COUNTER, RDC_EVNT_NOTIF_PRE_RESET: rdc_metric_type_t.COUNTER, RDC_EVNT_NOTIF_POST_RESET: rdc_metric_type_t.COUNTER, - RDC_EVNT_NOTIF_RING_HANG: rdc_metric_type_t.COUNTER, + RDC_EVNT_NOTIF_MIGRATE_START: rdc_metric_type_t.COUNTER, + RDC_EVNT_NOTIF_MIGRATE_END: rdc_metric_type_t.COUNTER, + RDC_EVNT_NOTIF_PAGE_FAULT_START: rdc_metric_type_t.COUNTER, + RDC_EVNT_NOTIF_PAGE_FAULT_END: rdc_metric_type_t.COUNTER, + RDC_EVNT_NOTIF_QUEUE_EVICTION: rdc_metric_type_t.COUNTER, + RDC_EVNT_NOTIF_QUEUE_RESTORE: 
rdc_metric_type_t.COUNTER, + RDC_EVNT_NOTIF_UNMAP_FROM_GPU: rdc_metric_type_t.COUNTER, + RDC_EVNT_NOTIF_PROCESS_START: rdc_metric_type_t.COUNTER, + RDC_EVNT_NOTIF_PROCESS_END: rdc_metric_type_t.COUNTER, } @classmethod @@ -317,11 +333,21 @@ class rdc_gpu_usage_info_t(Structure): ,("memory_utilization", rdc_stats_summary_t) ] +class rdc_process_status_info_t(Structure): + _fields_ = [ + ("pid", c_uint32) + ,("process_name", c_char*256) + ,("start_time", c_uint64) + ,("stop_time", c_uint64) + ] + class rdc_job_info_t(Structure): _fields_ = [ ("num_gpus", c_uint32) ,("summary", rdc_gpu_usage_info_t) ,("gpus", rdc_gpu_usage_info_t*16) + ,("num_processes", c_uint32) + ,("processes", rdc_process_status_info_t*64) ] class rdc_anonymous_0(ctypes.Union): diff --git a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index c617618bcd..f91004d618 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -196,6 +196,44 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index, return RDC_ST_NOT_FOUND; } + auto& je = job_iter->second; + + // handle process-start/stop + if (value.field_id == RDC_EVNT_NOTIF_PROCESS_START || + value.field_id == RDC_EVNT_NOTIF_PROCESS_END) { + rdc_process_status_info_t info{}; + // The PID is the key of the process table; if it cannot be parsed, reject the event + // instead of recording a bogus PID 0 entry. The task name is optional and may stay empty. + if (sscanf(value.value.str, "PID: %u task: %255s", &info.pid, info.process_name) < 1) { + return RDC_ST_BAD_PARAMETER; + } + + uint64_t ts_us = value.ts * 1000; + if (value.field_id == RDC_EVNT_NOTIF_PROCESS_START) { + info.start_time = ts_us; + } else { + info.stop_time = ts_us; + } + + auto pit = je.pid_to_index.find(info.pid); + if (pit == je.pid_to_index.end()) { + uint32_t slot = je.num_processes; + if (slot < RDC_MAX_NUM_PROCESSES_STATUS) { + je.processes[slot] = info; + je.pid_to_index[info.pid] = slot; + je.num_processes = slot + 1; + } + } else { + auto& entry = je.processes[pit->second]; + if (value.field_id == RDC_EVNT_NOTIF_PROCESS_START) { + 
entry.start_time = info.start_time; + } else { + entry.stop_time = info.stop_time; + } + } + return RDC_ST_OK; + } + auto gpu_iter = job_iter->second.gpu_stats.find(gpu_index); if (gpu_iter == job_iter->second.gpu_stats.end()) { return RDC_ST_NOT_FOUND; @@ -380,6 +418,18 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64], set_average_summary(summary_info.gpu_temperature, p_job_info->num_gpus); set_average_summary(summary_info.memory_clock, p_job_info->num_gpus); + // Set process start/stop info + // Use find() rather than operator[], which would insert an empty entry for an unknown job id. + auto je_iter = cache_jobs_.find(jobId); + if (je_iter == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } + const auto& je = je_iter->second; + p_job_info->num_processes = je.num_processes; + for (uint32_t i = 0; i < je.num_processes; ++i) { + p_job_info->processes[i] = je.processes[i]; + } + return RDC_ST_OK; } @@ -392,9 +442,12 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(const char job_id[64], const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo, const rdc_gpu_gauges_t& gpu_gauges) { - RdcJobStatsCacheEntry cacheEntry; + RdcJobStatsCacheEntry cacheEntry{}; cacheEntry.start_time = std::time(nullptr); cacheEntry.end_time = 0; + cacheEntry.num_processes = 0; + cacheEntry.pid_to_index.clear(); + for (auto& p : cacheEntry.processes) p = {}; for (unsigned int i = 0; i < ginfo.count; i++) { // GPUs GpuSummaryStats gstats; gstats.energy_consumed = 0; diff --git a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc index df355e18f4..d5ae35c099 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc @@ -45,7 +45,15 @@ static std::unordered_map rdc_2_smi {RDC_EVNT_NOTIF_THERMAL_THROTTLE, AMDSMI_EVT_NOTIF_THERMAL_THROTTLE}, {RDC_EVNT_NOTIF_PRE_RESET, AMDSMI_EVT_NOTIF_GPU_PRE_RESET}, {RDC_EVNT_NOTIF_POST_RESET, AMDSMI_EVT_NOTIF_GPU_POST_RESET}, - {RDC_EVNT_NOTIF_RING_HANG, AMDSMI_EVT_NOTIF_RING_HANG}, + {RDC_EVNT_NOTIF_MIGRATE_START, AMDSMI_EVT_NOTIF_MIGRATE_START}, + {RDC_EVNT_NOTIF_MIGRATE_END, 
AMDSMI_EVT_NOTIF_MIGRATE_END}, + {RDC_EVNT_NOTIF_PAGE_FAULT_START, AMDSMI_EVT_NOTIF_PAGE_FAULT_START}, + {RDC_EVNT_NOTIF_PAGE_FAULT_END, AMDSMI_EVT_NOTIF_PAGE_FAULT_END}, + {RDC_EVNT_NOTIF_QUEUE_EVICTION, AMDSMI_EVT_NOTIF_QUEUE_EVICTION}, + {RDC_EVNT_NOTIF_QUEUE_RESTORE, AMDSMI_EVT_NOTIF_QUEUE_RESTORE}, + {RDC_EVNT_NOTIF_UNMAP_FROM_GPU, AMDSMI_EVT_NOTIF_UNMAP_FROM_GPU}, + {RDC_EVNT_NOTIF_PROCESS_START, AMDSMI_EVT_NOTIF_PROCESS_START}, + {RDC_EVNT_NOTIF_PROCESS_END, AMDSMI_EVT_NOTIF_PROCESS_END}, }; static std::unordered_map smi_event_notif_2_rdc_map = { {AMDSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT}, @@ -53,7 +61,15 @@ static std::unordered_map smi_event {AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE}, {AMDSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET}, {AMDSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET}, - {AMDSMI_EVT_NOTIF_RING_HANG, RDC_EVNT_NOTIF_RING_HANG}, + {AMDSMI_EVT_NOTIF_MIGRATE_START, RDC_EVNT_NOTIF_MIGRATE_START}, + {AMDSMI_EVT_NOTIF_MIGRATE_END, RDC_EVNT_NOTIF_MIGRATE_END}, + {AMDSMI_EVT_NOTIF_PAGE_FAULT_START, RDC_EVNT_NOTIF_PAGE_FAULT_START}, + {AMDSMI_EVT_NOTIF_PAGE_FAULT_END, RDC_EVNT_NOTIF_PAGE_FAULT_END}, + {AMDSMI_EVT_NOTIF_QUEUE_EVICTION, RDC_EVNT_NOTIF_QUEUE_EVICTION}, + {AMDSMI_EVT_NOTIF_QUEUE_RESTORE, RDC_EVNT_NOTIF_QUEUE_RESTORE}, + {AMDSMI_EVT_NOTIF_UNMAP_FROM_GPU, RDC_EVNT_NOTIF_UNMAP_FROM_GPU}, + {AMDSMI_EVT_NOTIF_PROCESS_START, RDC_EVNT_NOTIF_PROCESS_START}, + {AMDSMI_EVT_NOTIF_PROCESS_END, RDC_EVNT_NOTIF_PROCESS_END}, }; // This const determines space allocated on stack for notification events. 
@@ -160,9 +176,12 @@ rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32 for (uint32_t i = 0; i < f_cnt; ++i) { assert(smi_event_notif_2_rdc_map.find(smi_events[i].event) != smi_event_notif_2_rdc_map.end()); - uint64_t bdfid; - amdsmi_get_gpu_bdf_id(smi_events[i].processor_handle, &bdfid); - events[i].gpu_id = bdfid; + uint32_t gpu_id; + ret = get_gpu_id_from_processor_handle(smi_events[i].processor_handle, &gpu_id); + if (ret != AMDSMI_STATUS_SUCCESS) { + return Smi2RdcError(ret); + } + events[i].gpu_id = gpu_id; events[i].field.field_id = smi_event_notif_2_rdc_map[smi_events[i].event]; events[i].field.status = RDC_ST_OK; events[i].field.ts = now; diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc index fca624e6bc..8bdf18a736 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -73,6 +73,19 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, co return RDC_ST_NOT_FOUND; } + // Add process start/stop for all jobs + rdc_group_info_t ginfo; + result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); + if (result != RDC_ST_OK) { + return result; + } + + for (uint32_t ix = 0; ix < ginfo.count; ++ix) { + uint32_t gpu = ginfo.entity_ids[ix]; + fields_in_watch.emplace_back(gpu, RDC_EVNT_NOTIF_PROCESS_START); + fields_in_watch.emplace_back(gpu, RDC_EVNT_NOTIF_PROCESS_END); + } + JobWatchTableEntry jentry{group_id, fields_in_watch}; do { //< lock guard for thread safe std::lock_guard guard(watch_mutex_); @@ -80,11 +93,6 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, co } while (0); rdc_field_group_info_t finfo; - rdc_group_info_t ginfo; - result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); - if (result != RDC_ST_OK) { - return result; - } result = group_settings_->rdc_group_field_get_info(JOB_FIELD_ID, &finfo); if (result != 
RDC_ST_OK) { diff --git a/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc index 27a59f3344..cd555f1160 100644 --- a/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc +++ b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc @@ -136,6 +136,37 @@ amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id, return AMDSMI_STATUS_SUCCESS; } +amdsmi_status_t get_gpu_id_from_processor_handle(amdsmi_processor_handle processor_handle, + uint32_t* gpu_index) { + if (!gpu_index) { + return AMDSMI_STATUS_INVAL; + } + + std::vector sockets; + auto ret = get_socket_handles(sockets); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + + uint32_t idx = 0; + for (auto const& sock : sockets) { + std::vector procs; + ret = get_processor_handles(sock, procs); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + for (auto const& h : procs) { + if (h == processor_handle) { + *gpu_index = idx; + return AMDSMI_STATUS_SUCCESS; + } + ++idx; + } + } + + return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS; +} + amdsmi_status_t get_processor_count(uint32_t& all_processor_count) { uint32_t total_processor_count = 0; uint32_t socket_count; diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index db75d634e6..6b92a2ea0f 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -170,6 +170,27 @@ rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(const char job_id[64], copy_gpu_usage_info(reply.gpus(i), &(p_job_info->gpus[i])); } + // Never trust the count on the wire: clamp it to the entries actually present in the + // reply and to the capacity of the fixed-size processes[] array in rdc_job_info_t. + uint32_t proc_count = reply.num_processes(); + if (proc_count > static_cast<uint32_t>(reply.processes_size())) { + proc_count = static_cast<uint32_t>(reply.processes_size()); + } + if (proc_count > RDC_MAX_NUM_PROCESSES_STATUS) { + proc_count = RDC_MAX_NUM_PROCESSES_STATUS; + } + p_job_info->num_processes = proc_count; + for (uint32_t i = 0; i < proc_count; i++) { + const auto& proc_msg = reply.processes(static_cast<int>(i)); + p_job_info->processes[i].pid = proc_msg.pid(); + + strncpy_with_null(p_job_info->processes[i].process_name, proc_msg.process_name().c_str(), + MAX_PROCESS_NAME - 1); + + p_job_info->processes[i].start_time = proc_msg.start_time(); + 
p_job_info->processes[i].stop_time = proc_msg.stop_time(); + } + return RDC_ST_OK; } diff --git a/projects/rdc/rdci/src/RdciStatsSubSystem.cc b/projects/rdc/rdci/src/RdciStatsSubSystem.cc index 5ef6b6142f..812c3e0ca8 100644 --- a/projects/rdc/rdci/src/RdciStatsSubSystem.cc +++ b/projects/rdc/rdci/src/RdciStatsSubSystem.cc @@ -343,7 +343,7 @@ void RdciStatsSubSystem::process() { } if (stats_ops_ == STATS_DISPLAY) { - rdc_job_info_t job_info; + rdc_job_info_t job_info{}; result = rdc_job_get_stats(rdc_handle_, const_cast(job_id_.c_str()), &job_info); if (result != RDC_ST_OK) { throw RdcException(result, rdc_status_string(result)); @@ -370,6 +370,30 @@ void RdciStatsSubSystem::process() { std::cout << "}"; } } + if (!is_json_output()) { + std::cout << "| Processes Tracked | " << job_info.num_processes << "\n"; + for (uint32_t p = 0; p < job_info.num_processes; ++p) { + auto& pr = job_info.processes[p]; + std::ostringstream oss; + oss << "PID: " << pr.pid << ", Start: " << pr.start_time / 1000 + << ", End: " << pr.stop_time / 1000; + std::cout << "| " << std::setw(33) << std::left << pr.process_name << "| " << oss.str() + << "\n"; + } + std::cout << "+----------------------------------+------------------------------------\n"; + } else { + std::cout << ", \"processes_tracked\": " << job_info.num_processes; + std::cout << ", \"processes\": ["; + for (uint32_t p = 0; p < job_info.num_processes; ++p) { + auto& pr = job_info.processes[p]; + std::cout << (p ? 
"," : "") << "{" + << "\"pid\":" << pr.pid << "," + << "\"process_name\":\"" << pr.process_name << "\"," + << "\"start_time\":" << pr.start_time / 1000 << "," + << "\"stop_time\":" << pr.stop_time / 1000 << "}"; + } + std::cout << "]"; + } return; } diff --git a/projects/rdc/server/src/rdc_api_service.cc b/projects/rdc/server/src/rdc_api_service.cc index 466c3e13eb..06b25a72ed 100644 --- a/projects/rdc/server/src/rdc_api_service.cc +++ b/projects/rdc/server/src/rdc_api_service.cc @@ -470,6 +470,15 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() { copy_gpu_usage_info(job_info.gpus[i], ginfo); } + reply->set_num_processes(job_info.num_processes); + for (uint32_t i = 0; i < job_info.num_processes; i++) { + ::rdc::RdcProcessStatsInfo* pinfo = reply->add_processes(); + pinfo->set_pid(job_info.processes[i].pid); + pinfo->set_process_name(job_info.processes[i].process_name); + pinfo->set_start_time(job_info.processes[i].start_time); + pinfo->set_stop_time(job_info.processes[i].stop_time); + } + return ::grpc::Status::OK; }