[SWDEV-243250] RDC Process Start/Stop integration (#189)
Change-Id: I3d2be33b5d23cd259b3d06fb572f81d19e6c3798
Signed-off-by: adapryor <Adam.pryor@amd.com>
[ROCm/rdc commit: 0e9c3b2c4f]
Šī revīzija ir iekļauta:
revīziju iesūtīja
GitHub
vecāks
30397e77f3
revīzija
ec661d5d17
@@ -218,7 +218,15 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_VMFAULT, "VM page fault",
|
||||
FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp", "THERMAL_THROT", false)
|
||||
FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false)
|
||||
FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false)
|
||||
FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured", "RING_HANG", false)
|
||||
// Descriptor entries for the additional event-notification fields: field id,
// human-readable description, short label. The label of the queue-eviction
// entry was misspelled ("QUEUE_EVICITION"); it now matches the field name.
FLD_DESC_ENT(RDC_EVNT_NOTIF_MIGRATE_START, "GPU migrate has started", "MIGRATE_START", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_MIGRATE_END, "GPU migrate has ended", "MIGRATE_END", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_PAGE_FAULT_START, "GPU page fault started", "PAGE_FAULT_START", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_PAGE_FAULT_END, "GPU page fault ended", "PAGE_FAULT_END", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_QUEUE_EVICTION, "GPU queue eviction occurred", "QUEUE_EVICTION", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_QUEUE_RESTORE, "GPU queue restore occurred", "QUEUE_RESTORE", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_UNMAP_FROM_GPU, "GPU unmap occurred", "UNMAP_FROM_GPU", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_PROCESS_START, "GPU process started", "PROCESS_START", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_PROCESS_END, "GPU process ended", "PROCESS_END", false)
|
||||
|
||||
// RDC health related fields
|
||||
FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR, "XGMI one or more errors detected", "XGMI_ERROR", true)
|
||||
|
||||
@@ -399,9 +399,17 @@ typedef enum {
|
||||
//!< due to temperature rise
|
||||
RDC_EVNT_NOTIF_PRE_RESET, //!< GPU reset is about to occur
|
||||
RDC_EVNT_NOTIF_POST_RESET, //!< GPU reset just occurred
|
||||
RDC_EVNT_NOTIF_RING_HANG, //!< GPU ring hang just occurred
|
||||
RDC_EVNT_NOTIF_MIGRATE_START,
|
||||
RDC_EVNT_NOTIF_MIGRATE_END,
|
||||
RDC_EVNT_NOTIF_PAGE_FAULT_START,
|
||||
RDC_EVNT_NOTIF_PAGE_FAULT_END,
|
||||
RDC_EVNT_NOTIF_QUEUE_EVICTION,
|
||||
RDC_EVNT_NOTIF_QUEUE_RESTORE,
|
||||
RDC_EVNT_NOTIF_UNMAP_FROM_GPU,
|
||||
RDC_EVNT_NOTIF_PROCESS_START,
|
||||
RDC_EVNT_NOTIF_PROCESS_END,
|
||||
|
||||
RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_RING_HANG,
|
||||
RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_PROCESS_END,
|
||||
|
||||
/**
|
||||
* @brief RDC health related fields
|
||||
@@ -496,6 +504,18 @@ typedef struct {
|
||||
rdc_stats_summary_t memory_utilization; //!< Memory Utilization statistics
|
||||
} rdc_gpu_usage_info_t; //!< GPU usage statistics
|
||||
|
||||
#define MAX_PROCESS_NAME 256
|
||||
/**
|
||||
* @brief Start and stop times of a process observed while a job is running
|
||||
*/
|
||||
typedef struct {
|
||||
uint32_t pid; //!< Process ID
|
||||
char process_name[MAX_PROCESS_NAME];  //!< Process (task) name, stored in a MAX_PROCESS_NAME-byte buffer
|
||||
uint64_t start_time; //!< Process start time in microseconds since 1970
|
||||
uint64_t stop_time; //!< Process stop time in microseconds since 1970
|
||||
} rdc_process_status_info_t;
|
||||
#define RDC_MAX_NUM_PROCESSES_STATUS 64
|
||||
|
||||
/**
|
||||
* @brief The structure to hold the job stats
|
||||
*/
|
||||
@@ -504,6 +524,9 @@ typedef struct {
|
||||
rdc_gpu_usage_info_t summary; //!< Job usage summary statistics
|
||||
//!< (overall)
|
||||
rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary statistics by GPU
|
||||
uint32_t num_processes; //!< Number of processes tracked
|
||||
rdc_process_status_info_t
|
||||
processes[RDC_MAX_NUM_PROCESSES_STATUS]; //!< Array to track process start/stop times
|
||||
} rdc_job_info_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -78,6 +78,10 @@ struct RdcJobStatsCacheEntry {
|
||||
uint64_t start_time;
|
||||
uint64_t end_time;
|
||||
std::map<uint32_t, GpuSummaryStats> gpu_stats;
|
||||
|
||||
uint32_t num_processes = 0;
|
||||
std::array<rdc_process_status_info_t, RDC_MAX_NUM_PROCESSES_STATUS> processes{};
|
||||
std::map<uint32_t, uint32_t> pid_to_index;
|
||||
};
|
||||
|
||||
// <job_id, job_stats>
|
||||
@@ -110,19 +114,14 @@ class RdcCacheManagerImpl : public RdcCacheManager {
|
||||
rdc_status_t rdc_job_remove(const char job_id[64]) override;
|
||||
rdc_status_t rdc_job_remove_all() override;
|
||||
|
||||
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, uint32_t gpu_index,
|
||||
const rdc_field_value& value) override;
|
||||
rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_field_t field_id,
|
||||
uint64_t start_timestamp,
|
||||
uint64_t end_timestamp,
|
||||
rdc_field_value* start_value,
|
||||
rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id, uint32_t gpu_index,
|
||||
rdc_field_t field_id, uint64_t start_timestamp,
|
||||
uint64_t end_timestamp, rdc_field_value* start_value,
|
||||
rdc_field_value* end_value) override;
|
||||
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
|
||||
rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id, uint32_t gpu_index,
|
||||
const rdc_field_value& value) override;
|
||||
|
||||
private:
|
||||
|
||||
@@ -34,6 +34,8 @@ namespace rdc {
|
||||
rdc_status_t Smi2RdcError(amdsmi_status_t rsmi);
|
||||
amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
|
||||
amdsmi_processor_handle* processor_handle);
|
||||
amdsmi_status_t get_gpu_id_from_processor_handle(amdsmi_processor_handle processor_handle,
|
||||
uint32_t* gpu_index);
|
||||
amdsmi_status_t get_processor_count(uint32_t& all_processor_count);
|
||||
amdsmi_status_t get_socket_handles(std::vector<amdsmi_socket_handle>& sockets);
|
||||
amdsmi_status_t get_processor_handles(amdsmi_socket_handle socket,
|
||||
|
||||
@@ -456,11 +456,21 @@ message GpuUsageInfo {
|
||||
JobStatsSummary gpu_temperature = 15;
|
||||
JobStatsSummary pcie_total = 16;
|
||||
}
|
||||
|
||||
message RdcProcessStatsInfo {
|
||||
uint32 pid = 1;
|
||||
string process_name = 2;
|
||||
uint64 start_time = 3;
|
||||
uint64 stop_time = 4;
|
||||
}
|
||||
|
||||
message GetJobStatsResponse {
|
||||
uint32 status = 1;
|
||||
uint32 num_gpus = 2;
|
||||
GpuUsageInfo summary = 3;
|
||||
repeated GpuUsageInfo gpus = 4;
|
||||
uint32 num_processes = 5;
|
||||
repeated RdcProcessStatsInfo processes = 6;
|
||||
}
|
||||
|
||||
message StopJobStatsRequest {
|
||||
|
||||
@@ -232,7 +232,15 @@ class rdc_field_t(c_int):
|
||||
# Event-notification field ids. These should mirror the C enum, in which each
# enumerator is one greater than the previous one, so every id must be unique.
# MIGRATE_START previously reused 2004 (RING_HANG), shifting all later ids down
# by one relative to the C side.
RDC_EVNT_NOTIF_THERMAL_THROTTLE = 2001
RDC_EVNT_NOTIF_PRE_RESET = 2002
RDC_EVNT_NOTIF_POST_RESET = 2003
RDC_EVNT_NOTIF_RING_HANG = 2004
RDC_EVNT_NOTIF_MIGRATE_START = 2005
RDC_EVNT_NOTIF_MIGRATE_END = 2006
RDC_EVNT_NOTIF_PAGE_FAULT_START = 2007
RDC_EVNT_NOTIF_PAGE_FAULT_END = 2008
RDC_EVNT_NOTIF_QUEUE_EVICTION = 2009
RDC_EVNT_NOTIF_QUEUE_RESTORE = 2010
RDC_EVNT_NOTIF_UNMAP_FROM_GPU = 2011
RDC_EVNT_NOTIF_PROCESS_START = 2012
RDC_EVNT_NOTIF_PROCESS_END = 2013
|
||||
RDC_HEALTH_XGMI_ERROR = 3000
|
||||
RDC_HEALTH_PCIE_REPLAY_COUNT = 3001
|
||||
RDC_HEALTH_RETIRED_PAGE_NUM = 3002
|
||||
@@ -254,7 +262,15 @@ class rdc_field_t(c_int):
|
||||
RDC_EVNT_NOTIF_THERMAL_THROTTLE: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_PRE_RESET: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_POST_RESET: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_RING_HANG: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_MIGRATE_START: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_MIGRATE_END: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_PAGE_FAULT_START: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_PAGE_FAULT_END: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_QUEUE_EVICTION: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_QUEUE_RESTORE: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_UNMAP_FROM_GPU: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_PROCESS_START: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_PROCESS_END: rdc_metric_type_t.COUNTER,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -317,11 +333,21 @@ class rdc_gpu_usage_info_t(Structure):
|
||||
,("memory_utilization", rdc_stats_summary_t)
|
||||
]
|
||||
|
||||
class rdc_process_status_info_t(Structure):
|
||||
_fields_ = [
|
||||
("pid", c_uint32)
|
||||
,("process_name", c_char*256)
|
||||
,("start_time", c_uint64)
|
||||
,("stop_time", c_uint64)
|
||||
]
|
||||
|
||||
class rdc_job_info_t(Structure):
|
||||
_fields_ = [
|
||||
("num_gpus", c_uint32)
|
||||
,("summary", rdc_gpu_usage_info_t)
|
||||
,("gpus", rdc_gpu_usage_info_t*16)
|
||||
,("num_processes", c_uint32)
|
||||
,("processes", rdc_process_status_info_t*64)
|
||||
]
|
||||
|
||||
class rdc_anonymous_0(ctypes.Union):
|
||||
|
||||
@@ -196,6 +196,40 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index,
|
||||
return RDC_ST_NOT_FOUND;
|
||||
}
|
||||
|
||||
auto& je = cache_jobs_.at(job_id);
|
||||
|
||||
// handle process‐start/stop
|
||||
if (value.field_id == RDC_EVNT_NOTIF_PROCESS_START ||
|
||||
value.field_id == RDC_EVNT_NOTIF_PROCESS_END) {
|
||||
rdc_process_status_info_t info{};
|
||||
sscanf(value.value.str, "PID: %u task: %255s", &info.pid, info.process_name);
|
||||
|
||||
uint64_t ts_us = value.ts * 1000;
|
||||
if (value.field_id == RDC_EVNT_NOTIF_PROCESS_START) {
|
||||
info.start_time = ts_us;
|
||||
} else {
|
||||
info.stop_time = ts_us;
|
||||
}
|
||||
|
||||
auto pit = je.pid_to_index.find(info.pid);
|
||||
if (pit == je.pid_to_index.end()) {
|
||||
uint32_t slot = je.num_processes;
|
||||
if (slot < RDC_MAX_NUM_PROCESSES_STATUS) {
|
||||
je.processes[slot] = info;
|
||||
je.pid_to_index[info.pid] = slot;
|
||||
je.num_processes = slot + 1;
|
||||
}
|
||||
} else {
|
||||
auto& entry = je.processes[pit->second];
|
||||
if (value.field_id == RDC_EVNT_NOTIF_PROCESS_START) {
|
||||
entry.start_time = info.start_time;
|
||||
} else {
|
||||
entry.stop_time = info.stop_time;
|
||||
}
|
||||
}
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
auto gpu_iter = job_iter->second.gpu_stats.find(gpu_index);
|
||||
if (gpu_iter == job_iter->second.gpu_stats.end()) {
|
||||
return RDC_ST_NOT_FOUND;
|
||||
@@ -380,6 +414,13 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64],
|
||||
set_average_summary(summary_info.gpu_temperature, p_job_info->num_gpus);
|
||||
set_average_summary(summary_info.memory_clock, p_job_info->num_gpus);
|
||||
|
||||
// Set process start/stop info
|
||||
auto& je = cache_jobs_[jobId];
|
||||
p_job_info->num_processes = je.num_processes;
|
||||
for (uint32_t i = 0; i < je.num_processes; ++i) {
|
||||
p_job_info->processes[i] = je.processes[i];
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
@@ -392,9 +433,12 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(const char job_id[64],
|
||||
const rdc_group_info_t& ginfo,
|
||||
const rdc_field_group_info_t& finfo,
|
||||
const rdc_gpu_gauges_t& gpu_gauges) {
|
||||
RdcJobStatsCacheEntry cacheEntry;
|
||||
RdcJobStatsCacheEntry cacheEntry{};
|
||||
cacheEntry.start_time = std::time(nullptr);
|
||||
cacheEntry.end_time = 0;
|
||||
cacheEntry.num_processes = 0;
|
||||
cacheEntry.pid_to_index.clear();
|
||||
for (auto& p : cacheEntry.processes) p = {};
|
||||
for (unsigned int i = 0; i < ginfo.count; i++) { // GPUs
|
||||
GpuSummaryStats gstats;
|
||||
gstats.energy_consumed = 0;
|
||||
|
||||
@@ -45,7 +45,15 @@ static std::unordered_map<rdc_field_t, amdsmi_evt_notification_type_t> rdc_2_smi
|
||||
{RDC_EVNT_NOTIF_THERMAL_THROTTLE, AMDSMI_EVT_NOTIF_THERMAL_THROTTLE},
|
||||
{RDC_EVNT_NOTIF_PRE_RESET, AMDSMI_EVT_NOTIF_GPU_PRE_RESET},
|
||||
{RDC_EVNT_NOTIF_POST_RESET, AMDSMI_EVT_NOTIF_GPU_POST_RESET},
|
||||
{RDC_EVNT_NOTIF_RING_HANG, AMDSMI_EVT_NOTIF_RING_HANG},
|
||||
{RDC_EVNT_NOTIF_MIGRATE_START, AMDSMI_EVT_NOTIF_MIGRATE_START},
|
||||
{RDC_EVNT_NOTIF_MIGRATE_END, AMDSMI_EVT_NOTIF_MIGRATE_END},
|
||||
{RDC_EVNT_NOTIF_PAGE_FAULT_START, AMDSMI_EVT_NOTIF_PAGE_FAULT_START},
|
||||
{RDC_EVNT_NOTIF_PAGE_FAULT_END, AMDSMI_EVT_NOTIF_PAGE_FAULT_END},
|
||||
{RDC_EVNT_NOTIF_QUEUE_EVICTION, AMDSMI_EVT_NOTIF_QUEUE_EVICTION},
|
||||
{RDC_EVNT_NOTIF_QUEUE_RESTORE, AMDSMI_EVT_NOTIF_QUEUE_RESTORE},
|
||||
{RDC_EVNT_NOTIF_UNMAP_FROM_GPU, AMDSMI_EVT_NOTIF_UNMAP_FROM_GPU},
|
||||
{RDC_EVNT_NOTIF_PROCESS_START, AMDSMI_EVT_NOTIF_PROCESS_START},
|
||||
{RDC_EVNT_NOTIF_PROCESS_END, AMDSMI_EVT_NOTIF_PROCESS_END},
|
||||
};
|
||||
static std::unordered_map<amdsmi_evt_notification_type_t, rdc_field_t> smi_event_notif_2_rdc_map = {
|
||||
{AMDSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT},
|
||||
@@ -53,7 +61,15 @@ static std::unordered_map<amdsmi_evt_notification_type_t, rdc_field_t> smi_event
|
||||
{AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE},
|
||||
{AMDSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET},
|
||||
{AMDSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET},
|
||||
{AMDSMI_EVT_NOTIF_RING_HANG, RDC_EVNT_NOTIF_RING_HANG},
|
||||
{AMDSMI_EVT_NOTIF_MIGRATE_START, RDC_EVNT_NOTIF_MIGRATE_START},
|
||||
{AMDSMI_EVT_NOTIF_MIGRATE_END, RDC_EVNT_NOTIF_MIGRATE_END},
|
||||
{AMDSMI_EVT_NOTIF_PAGE_FAULT_START, RDC_EVNT_NOTIF_PAGE_FAULT_START},
|
||||
{AMDSMI_EVT_NOTIF_PAGE_FAULT_END, RDC_EVNT_NOTIF_PAGE_FAULT_END},
|
||||
{AMDSMI_EVT_NOTIF_QUEUE_EVICTION, RDC_EVNT_NOTIF_QUEUE_EVICTION},
|
||||
{AMDSMI_EVT_NOTIF_QUEUE_RESTORE, RDC_EVNT_NOTIF_QUEUE_RESTORE},
|
||||
{AMDSMI_EVT_NOTIF_UNMAP_FROM_GPU, RDC_EVNT_NOTIF_UNMAP_FROM_GPU},
|
||||
{AMDSMI_EVT_NOTIF_PROCESS_START, RDC_EVNT_NOTIF_PROCESS_START},
|
||||
{AMDSMI_EVT_NOTIF_PROCESS_END, RDC_EVNT_NOTIF_PROCESS_END},
|
||||
};
|
||||
|
||||
// This const determines space allocated on stack for notification events.
|
||||
@@ -160,9 +176,12 @@ rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32
|
||||
|
||||
for (uint32_t i = 0; i < f_cnt; ++i) {
|
||||
assert(smi_event_notif_2_rdc_map.find(smi_events[i].event) != smi_event_notif_2_rdc_map.end());
|
||||
uint64_t bdfid;
|
||||
amdsmi_get_gpu_bdf_id(smi_events[i].processor_handle, &bdfid);
|
||||
events[i].gpu_id = bdfid;
|
||||
uint32_t gpu_id;
|
||||
ret = get_gpu_id_from_processor_handle(smi_events[i].processor_handle, &gpu_id);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
events[i].gpu_id = gpu_id;
|
||||
events[i].field.field_id = smi_event_notif_2_rdc_map[smi_events[i].event];
|
||||
events[i].field.status = RDC_ST_OK;
|
||||
events[i].field.ts = now;
|
||||
|
||||
@@ -73,6 +73,19 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, co
|
||||
return RDC_ST_NOT_FOUND;
|
||||
}
|
||||
|
||||
// Add process start/stop for all jobs
|
||||
rdc_group_info_t ginfo;
|
||||
result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
|
||||
if (result != RDC_ST_OK) {
|
||||
return result;
|
||||
}
|
||||
|
||||
for (uint32_t ix = 0; ix < ginfo.count; ++ix) {
|
||||
uint32_t gpu = ginfo.entity_ids[ix];
|
||||
fields_in_watch.emplace_back(gpu, RDC_EVNT_NOTIF_PROCESS_START);
|
||||
fields_in_watch.emplace_back(gpu, RDC_EVNT_NOTIF_PROCESS_END);
|
||||
}
|
||||
|
||||
JobWatchTableEntry jentry{group_id, fields_in_watch};
|
||||
do { //< lock guard for thread safe
|
||||
std::lock_guard<std::mutex> guard(watch_mutex_);
|
||||
@@ -80,11 +93,6 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, co
|
||||
} while (0);
|
||||
|
||||
rdc_field_group_info_t finfo;
|
||||
rdc_group_info_t ginfo;
|
||||
result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
|
||||
if (result != RDC_ST_OK) {
|
||||
return result;
|
||||
}
|
||||
|
||||
result = group_settings_->rdc_group_field_get_info(JOB_FIELD_ID, &finfo);
|
||||
if (result != RDC_ST_OK) {
|
||||
|
||||
@@ -136,6 +136,37 @@ amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * @brief Find the flat index of an AMD SMI processor handle.
 *
 * Sockets are visited in the order returned by get_socket_handles() and, inside
 * each socket, processors in the order returned by get_processor_handles(). The
 * reported index is the zero-based position of the handle in that walk.
 *
 * @param[in]  processor_handle Handle to look for.
 * @param[out] gpu_index        Receives the index on success; left untouched otherwise.
 * @return AMDSMI_STATUS_SUCCESS when the handle is found, AMDSMI_STATUS_INVAL
 *         when gpu_index is null, AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS when no
 *         processor matches, or the error returned by the enumeration helpers.
 */
amdsmi_status_t get_gpu_id_from_processor_handle(amdsmi_processor_handle processor_handle,
                                                 uint32_t* gpu_index) {
  if (gpu_index == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  std::vector<amdsmi_socket_handle> socket_list;
  amdsmi_status_t status = get_socket_handles(socket_list);
  if (status != AMDSMI_STATUS_SUCCESS) {
    return status;
  }

  // Running position across every processor of every socket visited so far.
  uint32_t position = 0;
  for (const amdsmi_socket_handle& socket : socket_list) {
    std::vector<amdsmi_processor_handle> handles;
    status = get_processor_handles(socket, handles);
    if (status != AMDSMI_STATUS_SUCCESS) {
      return status;
    }

    for (size_t i = 0; i < handles.size(); ++i, ++position) {
      if (handles[i] == processor_handle) {
        *gpu_index = position;
        return AMDSMI_STATUS_SUCCESS;
      }
    }
  }

  // Every socket was searched without finding the handle.
  return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
}
|
||||
|
||||
amdsmi_status_t get_processor_count(uint32_t& all_processor_count) {
|
||||
uint32_t total_processor_count = 0;
|
||||
uint32_t socket_count;
|
||||
|
||||
@@ -170,6 +170,18 @@ rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(const char job_id[64],
|
||||
copy_gpu_usage_info(reply.gpus(i), &(p_job_info->gpus[i]));
|
||||
}
|
||||
|
||||
p_job_info->num_processes = reply.num_processes();
|
||||
for (int i = 0; i < reply.num_processes(); i++) {
|
||||
const auto& proc_msg = reply.processes(i);
|
||||
p_job_info->processes[i].pid = proc_msg.pid();
|
||||
|
||||
strncpy_with_null(p_job_info->processes[i].process_name, proc_msg.process_name().c_str(),
|
||||
MAX_PROCESS_NAME - 1);
|
||||
|
||||
p_job_info->processes[i].start_time = proc_msg.start_time();
|
||||
p_job_info->processes[i].stop_time = proc_msg.stop_time();
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
|
||||
@@ -343,7 +343,7 @@ void RdciStatsSubSystem::process() {
|
||||
}
|
||||
|
||||
if (stats_ops_ == STATS_DISPLAY) {
|
||||
rdc_job_info_t job_info;
|
||||
rdc_job_info_t job_info{};
|
||||
result = rdc_job_get_stats(rdc_handle_, const_cast<char*>(job_id_.c_str()), &job_info);
|
||||
if (result != RDC_ST_OK) {
|
||||
throw RdcException(result, rdc_status_string(result));
|
||||
@@ -370,6 +370,30 @@ void RdciStatsSubSystem::process() {
|
||||
std::cout << "}";
|
||||
}
|
||||
}
|
||||
if (!is_json_output()) {
|
||||
std::cout << "| Processes Tracked | " << job_info.num_processes << "\n";
|
||||
for (uint32_t p = 0; p < job_info.num_processes; ++p) {
|
||||
auto& pr = job_info.processes[p];
|
||||
std::ostringstream oss;
|
||||
oss << "PID: " << pr.pid << ", Start: " << pr.start_time / 1000
|
||||
<< ", End: " << pr.stop_time / 1000;
|
||||
std::cout << "| " << std::setw(33) << std::left << pr.process_name << "| " << oss.str()
|
||||
<< "\n";
|
||||
}
|
||||
std::cout << "+----------------------------------+------------------------------------\n";
|
||||
} else {
|
||||
std::cout << ", \"processes_tracked\": " << job_info.num_processes;
|
||||
std::cout << ", \"processes\": [";
|
||||
for (uint32_t p = 0; p < job_info.num_processes; ++p) {
|
||||
auto& pr = job_info.processes[p];
|
||||
std::cout << (p ? "," : "") << "{"
|
||||
<< "\"pid\":" << pr.pid << ","
|
||||
<< "\"process_name\":\"" << pr.process_name << "\","
|
||||
<< "\"start_time\":" << pr.start_time / 1000 << ","
|
||||
<< "\"stop_time\":" << pr.stop_time / 1000 << "}";
|
||||
}
|
||||
std::cout << "]";
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -470,6 +470,15 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() {
|
||||
copy_gpu_usage_info(job_info.gpus[i], ginfo);
|
||||
}
|
||||
|
||||
reply->set_num_processes(job_info.num_processes);
|
||||
for (uint32_t i = 0; i < job_info.num_processes; i++) {
|
||||
::rdc::RdcProcessStatsInfo* pinfo = reply->add_processes();
|
||||
pinfo->set_pid(job_info.processes[i].pid);
|
||||
pinfo->set_process_name(job_info.processes[i].process_name);
|
||||
pinfo->set_start_time(job_info.processes[i].start_time);
|
||||
pinfo->set_stop_time(job_info.processes[i].stop_time);
|
||||
}
|
||||
|
||||
return ::grpc::Status::OK;
|
||||
}
|
||||
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user