Implement the rdc_lib API to support the job stats

Add functions to start and stop job recording.
Add a function to get the job stats for each GPU and a summary across multiple GPUs.
Add functions to remove jobs.

Add a class RdcLogger that controls the log level through the environment variable RDC_LOG.
This is similar to GRPC_VERBOSITY in gRPC. When customers run into issues, they can enable the
verbose log to help us troubleshoot.

Add -u support to rdci group, fieldgroup and dmon for connecting to rdcd without authentication.

Change-Id: I22c591823c1ee6485db106b911bed8271d1b2769


[ROCm/rdc commit: a547dc7efd]
This commit is contained in:
Bill(Shuzhou) Liu
2020-04-08 08:47:29 -04:00
committato da Chris Freehill
parent aef3d29925
commit 0813e7052f
26 ha cambiato i file con 805 aggiunte e 102 eliminazioni
+7
Vedi File
@@ -16,9 +16,16 @@ LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery -l
```
## Troubleshooting
Enable the debug log:
```
sudo RDC_LOG=DEBUG ./server/rdcd
```
Check the ssl connection in rdci:
```
rdcd_hostname=""  # Set this to the rdcd host you want to connect to
openssl s_client -connect $rdcd_hostname:50051 -cert /etc/rdc/client/certs/rdc_client_cert.pem -key /etc/rdc/client/private/rdc_client_cert.key -CAfile /etc/rdc/client/certs/rdc_cacert.pem
```
+2 -4
Vedi File
@@ -92,11 +92,9 @@ int main(int, char **) {
goto cleanup;
}
// (2) start the recording. Set the sample frequency to once per second, the
// max keep age to one hour and the maximum number of samples to
// keep to unlimited.
// (2) start the recording. Set the sample frequency to once per second.
result = rdc_job_start_stats(rdc_handle, group_id,
job_id, 1000000, 3600, 0);
job_id, 1000000);
if (result != RDC_ST_OK) {
std::cout << "Error start job stats. Return: "
<< rdc_status_string(result);
+31 -6
Vedi File
@@ -52,6 +52,7 @@ typedef enum {
RDC_ST_NOT_FOUND, //!< Cannot find the value
RDC_ST_CONFLICT, //!< Conflict with current state
RDC_ST_CLIENT_ERROR, //!< The RDC client error
RDC_ST_ALREADY_EXIST, //!< The item already exists
RDC_ST_MAX_LIMIT //!< Max limit recording for the object
} rdc_status_t;
@@ -371,15 +372,10 @@ rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle);
*
* @param[in] update_freq How often to update this field in usec.
*
* @param[in] max_keep_age How long to keep data for this field in seconds.
*
* @param[in] max_keep_samples Maximum number of samples to keep. 0=no limit.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle,
rdc_gpu_group_t group_id, char job_id[64], uint64_t update_freq,
double max_keep_age, uint32_t max_keep_samples);
rdc_gpu_group_t group_id, char job_id[64], uint64_t update_freq);
/**
* @brief Get the stats of the job using the job id.
@@ -415,6 +411,35 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64],
rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle,
char job_id[64]);
/**
* @brief Request RDC to stop tracking the job given by job_id
*
* @details After this call, you will no longer be able to call
* rdc_job_get_stats() on this job_id. But you will be able to reuse
* the job_id after this call.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] job_id The name of the job.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle,
char job_id[64]);
/**
* @brief Request RDC to stop tracking all the jobs
*
* @details After this call, you will no longer be able to call
* rdc_job_get_stats() on any job id. But you will be able to reuse
* any previously used job id after this call.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle);
/**
* @brief Request RDC to update all fields to be watched.
*
@@ -24,6 +24,7 @@ THE SOFTWARE.
#include <memory>
#include <utility>
#include <string>
#include <vector>
#include <map>
#include "rdc_lib/rdc_common.h"
@@ -31,6 +32,7 @@ THE SOFTWARE.
namespace amd {
namespace rdc {
typedef std::map<uint32_t, uint64_t> rdc_gpu_total_memory_t;
class RdcCacheManager {
public:
@@ -43,7 +45,19 @@ class RdcCacheManager {
const rdc_field_value& value) = 0;
virtual rdc_status_t evict_cache(uint32_t gpu_index, uint32_t field_id,
uint64_t max_keep_samples, double max_keep_age) = 0;
virtual uint32_t get_cache_size() = 0;
virtual std::string get_cache_stats() = 0;
virtual rdc_status_t rdc_job_get_stats(char jobId[64],
const rdc_gpu_total_memory_t& total_memory,
rdc_job_info_t* p_job_info) = 0;
virtual rdc_status_t rdc_job_start_stats(char jobId[64],
const rdc_group_info_t& group,
const rdc_field_group_info_t& finfo) = 0;
virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0;
virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index,
const std::string& job_id, const rdc_field_value& value) = 0;
virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
virtual ~RdcCacheManager() {}
};
@@ -59,7 +59,7 @@ class RdcGroupSettings {
};
typedef std::shared_ptr<RdcGroupSettings> RdcGroupSettingsPtr;
const uint32_t JOB_FIELD_ID = 0;
} // namespace rdc
} // namespace amd
+5 -5
Vedi File
@@ -33,12 +33,12 @@ class RdcHandler {
public:
// Job API
virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId,
char job_id[64], uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) = 0;
virtual rdc_status_t rdc_job_get_stats(char jobId[64],
char job_id[64], uint64_t update_freq) = 0;
virtual rdc_status_t rdc_job_get_stats(char jobId[64],
rdc_job_info_t* p_job_info)= 0;
virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0;
virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
// Discovery API
virtual rdc_status_t rdc_device_get_all(
@@ -0,0 +1,59 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RDC_LIB_RDCLOGGER_H_
#define RDC_LIB_RDCLOGGER_H_
#include <iostream>
#include <string>
#include <chrono> // NOLINT
namespace amd {
namespace rdc {
/**
 * @brief Small stream logger with a severity threshold.
 *
 * Messages go to the std::ostream supplied at construction. The threshold
 * (log_level_) is chosen by the constructor, which is defined outside this
 * header. The RDC_LOG macro is the intended front end of this class.
 */
class RdcLogger {
 public:
  /**
   * @brief Bind the logger to an output stream.
   *
   * @param[in] os Stream that receives the log lines. Only a reference is
   * stored, so the stream must outlive the logger.
   */
  explicit RdcLogger(std::ostream& os);

  /**
   * @brief Get the shared logger, a function-local static bound to std::cout
   * that is constructed on first use.
   */
  static RdcLogger& getLogger() {
    static RdcLogger logger(std::cout);
    return logger;
  }

  /**
   * @brief Tell whether a message of the given severity should be written.
   *
   * @retval true when severity is not above the configured log level.
   */
  bool should_log(uint32_t severity) const {
    return log_level_ >= severity;
  }

  /**
   * @brief Get the stream the log lines are written to.
   */
  std::ostream& get_ostream() const {
    return os_;
  }

  /**
   * @brief Build the prefix of a log line from the severity and the source
   * location of the message.
   */
  std::string get_log_header(uint32_t severity,
      const char* file, int line);

 private:
  std::ostream& os_;     //!< Destination of the log lines (not owned)
  uint32_t log_level_;   //!< Highest severity value that is still logged
};
} // namespace rdc
} // namespace amd
#endif // RDC_LIB_RDCLOGGER_H_
@@ -36,10 +36,10 @@ class RdcWatchTable {
virtual rdc_status_t rdc_field_update_all() = 0;
virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id,
char job_id[64]) = 0;
virtual rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t group_id,
uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) = 0;
char job_id[64], uint64_t update_freq) = 0;
virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, uint64_t update_freq,
@@ -23,7 +23,8 @@ THE SOFTWARE.
#define RDC_LIB_IMPL_RDCCACHEMANAGERIMPL_H_
#include <memory>
#include <mutex>
#include <mutex> // NOLINT(build/c++11)
#include <string>
#include <vector>
#include <map>
#include "rdc_lib/RdcCacheManager.h"
@@ -41,6 +42,29 @@ struct RdcCacheEntry {
typedef std::map<RdcFieldKey, std::vector<RdcCacheEntry>> RdcCacheSamples;
// Running aggregate of the samples recorded for one field of one GPU while
// a job is tracked. Updated by RdcCacheManagerImpl::rdc_update_job_stats().
struct FieldSummaryStats {
// Largest sample value folded in so far.
int64_t max_value;
// Smallest sample value folded in so far.
int64_t min_value;
// Sum of all sample values; divided by count to obtain the average.
int64_t total_value;
// Timestamp (rdc_field_value::ts) of the most recent sample.
uint64_t last_time;
// Number of samples folded in; 0 means no sample has arrived yet.
uint64_t count;
};
// Statistics collected for one GPU of a job.
struct GpuSummaryStats {
// Energy accumulated from RDC_FI_POWER_USAGE samples.
// NOTE(review): the scaling in rdc_update_job_stats() suggests joules --
// confirm against the units of the power field and of the timestamps.
uint64_t energy_consumed;
// Timestamp of a RDC_FI_POWER_USAGE sample, used as the reference point when
// computing the elapsed time for the energy accumulation.
uint64_t energy_last_time;
// <field_id, summary> for every field tracked on this GPU.
std::map<uint32_t, FieldSummaryStats> field_summaries;
};
// Per job entry: the recording window of the job plus the statistics of
// every GPU that takes part in it.
struct RdcJobStatsCacheEntry {
// Wall-clock time (std::time) taken when the job stats were started.
uint64_t start_time;
// Wall-clock time taken when the job was stopped; 0 while it is still running.
uint64_t end_time;
// <gpu_index, stats> for each GPU of the job's group.
std::map<uint32_t, GpuSummaryStats> gpu_stats;
};
// <job_id, job_stats>
typedef std::map<std::string, RdcJobStatsCacheEntry> RdcJobStatsCache;
class RdcCacheManagerImpl: public RdcCacheManager {
public:
@@ -53,10 +77,27 @@ class RdcCacheManagerImpl: public RdcCacheManager {
const rdc_field_value& value) override;
rdc_status_t evict_cache(uint32_t gpu_index, uint32_t field_id,
uint64_t max_keep_samples, double max_keep_age) override;
uint32_t get_cache_size() override;
std::string get_cache_stats() override;
rdc_status_t rdc_job_get_stats(char job_id[64],
const rdc_gpu_total_memory_t& total_memory,
rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_start_stats(char job_id[64],
const rdc_group_info_t& group,
const rdc_field_group_info_t& finfo) override;
rdc_status_t rdc_job_stop_stats(char job_id[64]) override;
rdc_status_t rdc_update_job_stats(uint32_t gpu_index,
const std::string& job_id,
const rdc_field_value& value) override;
rdc_status_t rdc_job_remove(char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
private:
void set_summary(const FieldSummaryStats & stats,
rdc_stats_summary_t& gpu, rdc_stats_summary_t& summary, // NOLINT
unsigned int adjuster);
RdcCacheSamples cache_samples_;
RdcJobStatsCache cache_jobs_;
std::mutex cache_mutex_;
};
@@ -36,11 +36,12 @@ class RdcEmbeddedHandler: public RdcHandler {
public:
// Job API
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId,
char job_id[64], uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) override;
rdc_status_t rdc_job_get_stats(char jobId[64],
char job_id[64], uint64_t update_freq) override;
rdc_status_t rdc_job_get_stats(char jobId[64],
rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_stop_stats(char job_id[64]) override;
rdc_status_t rdc_job_stop_stats(char job_id[64]) override;
rdc_status_t rdc_job_remove(char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
// Discovery API
rdc_status_t rdc_device_get_all(
@@ -61,8 +61,8 @@ class RdcGroupSettingsImpl: public RdcGroupSettings {
private:
std::map<rdc_gpu_group_t, rdc_group_info_t> gpu_group_;
std::map<rdc_field_grp_t, rdc_field_group_info_t> field_group_;
uint32_t cur_group_id_ = 0;
uint32_t cur_filed_group_id_ = 0;
uint32_t cur_group_id_ = 1;
uint32_t cur_field_group_id_ = 0;
std::mutex group_mutex_;
std::mutex field_group_mutex_;
};
@@ -33,11 +33,12 @@ class RdcStandaloneHandler: public RdcHandler {
public:
// Job RdcAPI
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId,
char job_id[64], uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) override;
rdc_status_t rdc_job_get_stats(char jobId[64],
char job_id[64], uint64_t update_freq) override;
rdc_status_t rdc_job_get_stats(char jobId[64],
rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_stop_stats(char job_id[64]) override;
rdc_status_t rdc_job_stop_stats(char job_id[64]) override;
rdc_status_t rdc_job_remove(char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
// Discovery RdcAPI
rdc_status_t rdc_device_get_all(
@@ -22,11 +22,12 @@ THE SOFTWARE.
#ifndef RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_
#define RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_
#include <string>
#include <map>
#include <vector>
#include <utility>
#include <memory>
#include <mutex>
#include <mutex> // NOLINT
#include <atomic>
#include "rdc_lib/RdcWatchTable.h"
#include "rdc_lib/RdcGroupSettings.h"
@@ -45,14 +46,18 @@ struct FieldSettings {
uint64_t last_update_time;
};
// Value type of RdcWatchTableImpl::job_watch_table_, i.e. the record kept
// for one job id.
struct JobWatchTableEntry {
// GPU group the job is associated with.
// NOTE(review): presumably the group passed to rdc_job_start_stats() --
// confirm in the implementation file.
uint32_t group_id;
std::vector<RdcFieldKey> fields; //< store fields for faster query
};
class RdcWatchTableImpl : public RdcWatchTable {
public:
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id,
char job_id[64]) override;
rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t group_id,
uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) override;
char job_id[64], uint64_t update_freq) override;
rdc_status_t rdc_job_stop_stats(char job_id[64]) override;
rdc_status_t rdc_job_remove(char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, uint64_t update_freq,
@@ -84,10 +89,16 @@ class RdcWatchTableImpl : public RdcWatchTable {
//!< Helper function to clean up the watch table and cache
void clean_up();
//!< Helper function for debug information in watch table and cache
void debug_status();
//!< Helper function to get the fields using the group and the field group.
rdc_status_t get_fields_from_group(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, std::vector<RdcFieldKey> & fields);
rdc_field_grp_t field_group_id,
std::vector<RdcFieldKey> & fields); // NOLINT
bool is_job_watch_field(uint32_t gpu_index, uint32_t field_id,
std::string& job_id) const; // NOLINT
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
@@ -96,6 +107,10 @@ class RdcWatchTableImpl : public RdcWatchTable {
//!< The watch table to store the watch settings.
std::map<RdcFieldKey, FieldSettings> watch_table_;
//!< Maps each job_id to its JobWatchTableEntry (GPU group id and fields)
std::map<std::string, JobWatchTableEntry> job_watch_table_;
//!< The settings for each field can be deduced from watch_table. But every
//!< rdc_field_update_all() call needs to deduce them. To improve the
//!< performance, the fields_to_watch_ is used to track the field settings.
+11 -5
Vedi File
@@ -24,12 +24,18 @@ THE SOFTWARE.
#define RDC_LIB_RDC_COMMON_H_
#include <iostream>
#define RDC_ERROR 0
#define RDC_INFO 1
#define RDC_DEBUG 2
#ifdef DEBUG
#define LOG_DEBUG(message) std::cout << message << std::endl
#else
#define LOG_DEBUG(message)
#endif
/**
 * @brief Write a log line through amd::rdc::RdcLogger when debug_level is
 * enabled.
 *
 * msg is streamed with operator<<, so several values may be chained with <<
 * inside the argument. It is only evaluated when should_log() accepts
 * debug_level. Each line starts with the header from get_log_header()
 * (built from debug_level, __FILE__ and __LINE__) and ends with std::endl.
 * The do { } while (0) wrapper makes the macro usable as a single statement.
 *
 * NOTE(review): the visible part of this header does not include
 * rdc_lib/RdcLogger.h; users of the macro appear to include it themselves.
 * The macro declares a local named logger, so msg should not refer to an
 * outer variable with that name.
 */
#define RDC_LOG(debug_level, msg) do { \
auto& logger = amd::rdc::RdcLogger::getLogger(); \
if (logger.should_log((debug_level))) { \
logger.get_ostream() << \
logger.get_log_header((debug_level), __FILE__, __LINE__) << \
msg << std::endl; \
} \
} while (0)
/**
* @brief The strncpy but with null terminated
+2
Vedi File
@@ -115,8 +115,10 @@ set(CMAKE_VERBOSE_MAKEFILE on)
set(BOOTSTRAP_LIB "rdc_bootstrap")
set(BOOTSTRAP_LIB_COMPONENT "lib${BOOTSTRAP_LIB}")
set(BOOTSTRAP_LIB_SRC_LIST "${SRC_DIR}/bootstrap/src/RdcBootStrap.cc")
set(BOOTSTRAP_LIB_SRC_LIST ${BOOTSTRAP_LIB_SRC_LIST} "${SRC_DIR}/bootstrap/src/RdcLogger.cc")
set(BOOTSTRAP_LIB_INC_LIST "${RDC_LIB_INC_DIR}/rdc/rdc.h")
set(BOOTSTRAP_LIB_INC_LIST ${BOOTSTRAP_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/rdc_common.h")
set(BOOTSTRAP_LIB_INC_LIST ${BOOTSTRAP_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcLogger.h")
set(BOOTSTRAP_LIB_INC_LIST ${BOOTSTRAP_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcHandler.h")
message("BOOTSTRAP_LIB_INC_LIST=${BOOTSTRAP_LIB_INC_LIST}")
@@ -24,6 +24,7 @@ THE SOFTWARE.
#include <map>
#include "rdc/rdc.h"
#include "rdc_lib/RdcHandler.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
static void* libHandler = nullptr;
@@ -96,7 +97,7 @@ rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode,
if (!libHandler) {
error = dlerror();
LOG_DEBUG("Fail to open librdc.so: " << error);
RDC_LOG(RDC_ERROR, "Fail to open librdc.so: " << error);
return RDC_ST_FAIL_LOAD_MODULE;
}
@@ -104,7 +105,8 @@ rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode,
dlsym(libHandler, "make_handler");
if (!func_make_handler) {
error = dlerror();
LOG_DEBUG("Fail to find function make_handler:" << error);
RDC_LOG(RDC_ERROR,
"Fail to find function make_handler:" << error);
return RDC_ST_FAIL_LOAD_MODULE;
}
@@ -144,15 +146,32 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64] ,
}
rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle,
rdc_gpu_group_t groupId, char job_id[64], uint64_t update_freq,
double max_keep_age, uint32_t max_keep_samples ) {
rdc_gpu_group_t groupId, char job_id[64],
uint64_t update_freq) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->
rdc_job_start_stats(groupId, job_id, update_freq,
max_keep_age, max_keep_samples);
rdc_job_start_stats(groupId, job_id, update_freq);
}
// C API entry point: forward the removal of one job to the handler that sits
// behind p_rdc_handle. A null handle yields RDC_ST_INVALID_HANDLER; otherwise
// the handler's own status is returned unchanged.
rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, char job_id[64]) {
  if (p_rdc_handle == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_job_remove(job_id);
}
// C API entry point: forward the removal of all jobs to the handler that sits
// behind p_rdc_handle. A null handle yields RDC_ST_INVALID_HANDLER; otherwise
// the handler's own status is returned unchanged.
rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle) {
  if (p_rdc_handle == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_job_remove_all();
}
@@ -344,6 +363,8 @@ const char* rdc_status_string(rdc_status_t result) {
return "The max limit reached";
case RDC_ST_CONFLICT:
return "Conflict with current state";
case RDC_ST_ALREADY_EXIST:
return "The value already exists";
case RDC_ST_CLIENT_ERROR:
return "RDC Client error";
default:
@@ -0,0 +1,78 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/RdcLogger.h"
#include <stdlib.h>
#include <string.h>
#include <sstream>
#include <iomanip>
#include <iostream>
#include <chrono> // NOLINT
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Bind the logger to os and pick the log level from the RDC_LOG environment
// variable: "DEBUG" and "INFO" select the matching level, anything else
// (including an unset variable) keeps the default of RDC_ERROR.
RdcLogger::RdcLogger(std::ostream& os)
    : os_(os), log_level_(RDC_ERROR) {
  const char* requested_level = getenv("RDC_LOG");
  if (requested_level == nullptr) {
    return;
  }

  if (strcmp(requested_level, "DEBUG") == 0) {
    log_level_ = RDC_DEBUG;
  } else if (strcmp(requested_level, "INFO") == 0) {
    log_level_ = RDC_INFO;
  }
}
// Build the prefix of a log line: the current time as seconds since the
// epoch with millisecond precision, the severity label and, when file is
// given, "<file name without directories>(<line>): ".
std::string RdcLogger::get_log_header(uint32_t severity,
    const char* file, int line) {
  const auto since_epoch =
      std::chrono::system_clock::now().time_since_epoch();
  const auto millis = std::chrono::duration_cast<std::chrono::milliseconds>(
      since_epoch).count();

  std::stringstream header;
  header << std::fixed << std::setprecision(3) << (millis/1000.0) << " ";

  switch (severity) {
    case RDC_DEBUG:
      header << "DEBUG ";
      break;
    case RDC_INFO:
      header << "INFO ";
      break;
    default:
      header << "ERROR ";
      break;
  }

  if (file == nullptr) {
    return header.str();
  }

  // Keep only the file name: the full path may be very long.
  std::string base_name(file);
  const auto last_slash = base_name.rfind('/');
  if (last_slash != std::string::npos) {
    base_name.erase(0, last_slash + 1);
  }
  header << base_name << "(" << line << "): ";

  return header.str();
}
} // namespace rdc
} // namespace amd
@@ -22,6 +22,8 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcCacheManagerImpl.h"
#include <sys/time.h>
#include <ctime>
#include <sstream>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
@@ -127,15 +129,26 @@ rdc_status_t RdcCacheManagerImpl::rdc_field_get_latest_value(
return RDC_ST_OK;
}
uint32_t RdcCacheManagerImpl::get_cache_size() {
uint32_t cache_size = 0;
std::string RdcCacheManagerImpl::get_cache_stats() {
std::stringstream strstream;
std::lock_guard<std::mutex> guard(cache_mutex_);
strstream << "Cache samples:";
auto cache_samples_ite = cache_samples_.begin();
for (; cache_samples_ite != cache_samples_.end(); cache_samples_ite++) {
cache_size+=cache_samples_ite->second.size();
strstream << "<" << cache_samples_ite->first.first << ","
<< cache_samples_ite->first.second << ":"
<< cache_samples_ite->second.size() << "> ";
}
return cache_size;
strstream <<" Job caches:";
auto job_ite = cache_jobs_.begin();
for ( ; job_ite != cache_jobs_.end(); job_ite++ ) {
strstream << "<" << job_ite->first << ":"
<< job_ite->second.gpu_stats.size() << "> ";
}
return strstream.str();
}
rdc_status_t RdcCacheManagerImpl::rdc_update_cache(uint32_t gpu_index,
@@ -162,5 +175,188 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_cache(uint32_t gpu_index,
return RDC_ST_OK;
}
// Drop the cached statistics of one job. Removing an unknown job id is not
// an error: the call always returns RDC_ST_OK.
rdc_status_t RdcCacheManagerImpl::rdc_job_remove(char job_id[64]) {
  const std::string job_key(job_id);

  std::lock_guard<std::mutex> lock(cache_mutex_);
  cache_jobs_.erase(job_key);
  return RDC_ST_OK;
}
// Drop the cached statistics of every job. Always returns RDC_ST_OK.
rdc_status_t RdcCacheManagerImpl::rdc_job_remove_all() {
  {
    std::lock_guard<std::mutex> lock(cache_mutex_);
    cache_jobs_.clear();
  }
  return RDC_ST_OK;
}
/**
 * @brief Fold one new sample into the running statistics of a job.
 *
 * @details The first sample of a field seeds max/min/total with its value.
 * Every later sample updates max, min, total and count. For
 * RDC_FI_POWER_USAGE the energy is accumulated as power multiplied by the
 * time elapsed since the previous power sample.
 *
 * @param[in] gpu_index The GPU the sample was taken from.
 *
 * @param[in] job_id The job whose statistics are updated.
 *
 * @param[in] value The sample; field_id, ts and value.l_int are used.
 *
 * @retval ::RDC_ST_OK is returned when the sample was recorded.
 * @retval ::RDC_ST_NOT_FOUND is returned when the job, the GPU within the
 * job, or the field within the GPU is not tracked.
 */
rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index,
    const std::string& job_id, const rdc_field_value& value) {
  std::lock_guard<std::mutex> guard(cache_mutex_);

  auto job_iter = cache_jobs_.find(job_id);
  if (job_iter == cache_jobs_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  auto gpu_iter = job_iter->second.gpu_stats.find(gpu_index);
  if (gpu_iter == job_iter->second.gpu_stats.end()) {
    return RDC_ST_NOT_FOUND;
  }
  GpuSummaryStats& gpu = gpu_iter->second;

  auto fsummary = gpu.field_summaries.find(value.field_id);
  if (fsummary == gpu.field_summaries.end()) {
    return RDC_ST_NOT_FOUND;
  }
  FieldSummaryStats& stats = fsummary->second;

  const int64_t sample = static_cast<int64_t>(value.value.l_int);
  const bool is_power = (value.field_id == RDC_FI_POWER_USAGE);

  if (stats.count == 0) {  // first item
    stats.count = 1;
    stats.max_value = sample;
    stats.min_value = sample;
    stats.total_value = sample;
    stats.last_time = value.ts;
    if (is_power) {
      // Start of the first energy integration interval.
      gpu.energy_last_time = value.ts;
    }
    return RDC_ST_OK;
  }

  if (is_power) {
    const uint64_t time_elapsed = value.ts - gpu.energy_last_time;
    // Stored in cache as microseconds and microwatts
    gpu.energy_consumed +=
        (time_elapsed * value.value.l_int)/(1000.0*1000000);
    // Advance the reference point; otherwise every later sample would
    // integrate again from the first power sample and over-count energy.
    gpu.energy_last_time = value.ts;
  }

  stats.max_value = std::max(stats.max_value, sample);
  stats.min_value = std::min(stats.min_value, sample);
  stats.total_value += sample;
  stats.last_time = value.ts;
  stats.count++;

  return RDC_ST_OK;
}
/**
 * @brief Convert the raw aggregate of one field into the per-GPU summary and
 * merge it into the job-wide summary.
 *
 * @param[in] stats The raw aggregate collected for the field.
 *
 * @param[out] gpu Receives max, min and average of this GPU, each divided
 * by adjuster.
 *
 * @param[in,out] summary Job-wide summary: max and min are merged, and the
 * GPU average is added to summary.average so the caller can divide by the
 * number of GPUs afterwards.
 *
 * @param[in] adjuster Divisor that scales the raw values to the reported
 * unit. When it is 0, or when no sample was recorded, both outputs are
 * left untouched.
 */
void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats,
    rdc_stats_summary_t & gpu, rdc_stats_summary_t& summary,
    unsigned int adjuster) {
  // No samples, or no usable scale: dividing would be meaningless (or a
  // division by zero), so report nothing for this field.
  if (stats.count == 0 || adjuster == 0) return;

  gpu.max_value = stats.max_value / adjuster;
  gpu.min_value = stats.min_value / adjuster;
  gpu.average = stats.total_value / stats.count / adjuster;

  summary.max_value = std::max(summary.max_value, gpu.max_value);
  summary.min_value = std::min(summary.min_value, gpu.min_value);

  //< save total for future average calculation.
  summary.average += gpu.average;
}
/**
 * @brief Fill p_job_info with the per-GPU statistics of a job and the
 * summary over all of its GPUs.
 *
 * @param[in] jobId The job to report.
 *
 * @param[in] total_memory <gpu_index, total memory> used to turn the memory
 * usage samples into a utilization. When a GPU is missing from the map, or
 * its total is too small to scale by, the memory utilization of that GPU is
 * skipped and an error is logged.
 *
 * @param[out] p_job_info Receives the statistics.
 *
 * @retval ::RDC_ST_OK is returned upon successful call.
 * @retval ::RDC_ST_NOT_FOUND is returned when the job is not tracked.
 * @retval ::RDC_ST_BAD_PARAMETER is returned when p_job_info is null.
 */
rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64],
    const rdc_gpu_total_memory_t& total_memory,
    rdc_job_info_t* p_job_info) {
  std::lock_guard<std::mutex> guard(cache_mutex_);
  auto job_stats = cache_jobs_.find(jobId);
  if (job_stats == cache_jobs_.end()) {
    return RDC_ST_NOT_FOUND;
  }
  if (p_job_info == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  //< Init the summary info
  RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " <<jobId);
  auto& summary_info = p_job_info->summary;
  summary_info.start_time = job_stats->second.start_time;
  if (job_stats->second.end_time == 0) {
    // The job has not been stopped yet: report the current time.
    summary_info.end_time = time(nullptr);
  } else {
    summary_info.end_time = job_stats->second.end_time;
  }

  summary_info.energy_consumed = 0;
  summary_info.max_gpu_memory_used = 0;
  // {max, min, average}: min starts at the largest value so that the first
  // std::min() in set_summary() replaces it.
  summary_info.power_usage = {0, std::numeric_limits<uint64_t>::max(), 0};
  summary_info.gpu_clock = {0, std::numeric_limits<uint64_t>::max(), 0};
  summary_info.gpu_utilization = {0, std::numeric_limits<uint64_t>::max(), 0};
  summary_info.memory_utilization = {0,
                  std::numeric_limits<uint64_t>::max(), 0};

  p_job_info->num_gpus = job_stats->second.gpu_stats.size();

  //< Populate information for each GPUs
  for (const auto& gpu_entry : job_stats->second.gpu_stats) {
    const auto gpu_index = gpu_entry.first;
    const GpuSummaryStats& gpu_stats = gpu_entry.second;

    auto & gpu_info = p_job_info->gpus[gpu_index];
    gpu_info.start_time = summary_info.start_time;
    gpu_info.end_time = summary_info.end_time;
    gpu_info.energy_consumed = gpu_stats.energy_consumed;
    summary_info.energy_consumed += gpu_info.energy_consumed;

    for (const auto& field_entry : gpu_stats.field_summaries) {
      const auto field_id = field_entry.first;
      const FieldSummaryStats& field_stats = field_entry.second;

      if (field_id == RDC_FI_POWER_USAGE) {
        set_summary(field_stats,
            gpu_info.power_usage, summary_info.power_usage, 1000000);
      } else if (field_id == RDC_FI_GPU_MEMORY_USAGE) {
        // Utilization is usage / (total / 100). Look the total up without
        // throwing, and refuse a scale of 0 to avoid dividing by zero.
        const auto tmemory = total_memory.find(gpu_index);
        if (tmemory != total_memory.end() && tmemory->second >= 100) {
          set_summary(field_stats, gpu_info.memory_utilization,
              summary_info.memory_utilization, tmemory->second/100);
        } else {
          RDC_LOG(RDC_ERROR, "Skip the memory utilization of GPU "
              << gpu_index << " as its total memory is not available");
        }
        gpu_info.max_gpu_memory_used = field_stats.max_value;
        summary_info.max_gpu_memory_used = std::max(
            summary_info.max_gpu_memory_used,
            gpu_info.max_gpu_memory_used);
      } else if (field_id == RDC_FI_GPU_SM_CLOCK) {
        set_summary(field_stats, gpu_info.gpu_clock,
            summary_info.gpu_clock, 1000000);
      } else if (field_id == RDC_FI_GPU_UTIL) {
        set_summary(field_stats, gpu_info.gpu_utilization,
            summary_info.gpu_utilization, 1);
      }
    }
  }

  // A job without GPUs has nothing to average; dividing would be a
  // division by zero.
  if (p_job_info->num_gpus == 0) {
    return RDC_ST_OK;
  }

  // Get the average of the summary
  summary_info.power_usage.average /= p_job_info->num_gpus;
  summary_info.gpu_clock.average /= p_job_info->num_gpus;
  summary_info.gpu_utilization.average /= p_job_info->num_gpus;
  summary_info.memory_utilization.average /= p_job_info->num_gpus;

  return RDC_ST_OK;
}
/**
 * @brief Create the cache entry of a job with empty statistics for every
 * GPU of the group and every field of the field group.
 *
 * @details The start time is taken from std::time() and the end time is set
 * to 0, which marks the job as still running. If an entry for job_id
 * already exists it is left untouched (std::map::insert does not overwrite)
 * and RDC_ST_OK is still returned.
 *
 * @param[in] job_id The name of the job.
 *
 * @param[in] ginfo The GPU group; count and entity_ids are used.
 *
 * @param[in] finfo The field group; count and field_ids are used.
 *
 * @retval ::RDC_ST_OK is always returned.
 */
rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64],
    const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo) {
  RdcJobStatsCacheEntry cacheEntry;
  cacheEntry.start_time = std::time(nullptr);
  cacheEntry.end_time = 0;

  // Every GPU starts from the same empty statistics, so build them once.
  GpuSummaryStats gstats;
  gstats.energy_consumed = 0;
  gstats.energy_last_time = 0;
  for (unsigned int j = 0; j < finfo.count; j++) {  // init fields
    FieldSummaryStats s;
    s.max_value = 0;
    s.min_value = 0;
    s.total_value = 0;
    s.last_time = 0;  // was left uninitialized before being copied
    s.count = 0;
    gstats.field_summaries.insert({finfo.field_ids[j], s});
  }

  for (unsigned int i = 0; i < ginfo.count; i++) {  // GPUs
    cacheEntry.gpu_stats.insert({ginfo.entity_ids[i], gstats});
  }

  std::lock_guard<std::mutex> guard(cache_mutex_);
  cache_jobs_.insert({job_id, cacheEntry});

  return RDC_ST_OK;
}
// Mark a job as finished by stamping its end time with the current time.
// Returns RDC_ST_NOT_FOUND when the job id is not in the cache.
rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64]) {
  std::lock_guard<std::mutex> lock(cache_mutex_);

  const auto entry = cache_jobs_.find(job_id);
  if (entry != cache_jobs_.end()) {
    entry->second.end_time = std::time(nullptr);
    return RDC_ST_OK;
  }

  return RDC_ST_NOT_FOUND;
}
} // namespace rdc
} // namespace amd
@@ -27,6 +27,7 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcCacheManagerImpl.h"
#include "rdc_lib/impl/RdcWatchTableImpl.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcException.h"
#include "rocm_smi/rocm_smi.h"
@@ -73,6 +74,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode):
, metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_,
METIC_UPDATE_FREQUENCY)) {
if (mode == RDC_OPERATION_MODE_AUTO) {
RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO");
metrics_updater_->start();
}
}
@@ -83,32 +85,49 @@ RdcEmbeddedHandler::~RdcEmbeddedHandler() {
// JOB API
rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId,
char job_id[64], uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) {
// TODO(bill_liu): implement
(void)(groupId);
(void)(job_id);
(void)(update_freq);
(void)(max_keep_age);
(void)(max_keep_samples);
return RDC_ST_OK;
char job_id[64], uint64_t update_freq) {
return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq);
}
rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64],
rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64],
rdc_job_info_t* p_job_info) {
// TODO(bill_liu): implement
(void)(job_id);
(void)(p_job_info);
return RDC_ST_OK;
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
uint32_t count = 0;
rdc_status_t status = rdc_device_get_all(
gpu_index_list, &count);
if (status != RDC_ST_OK) {
return status;
}
rdc_gpu_total_memory_t all_total_memory;
for (uint32_t i = 0; i < count ; i++) {
rdc_field_value total_memory;
status = metric_fetcher_->fetch_smi_field(gpu_index_list[i],
RDC_FI_GPU_MEMORY_TOTAL, &total_memory);
if (status != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Fail to get total memory of GPU "
<< gpu_index_list[i]);
return status;
}
all_total_memory.insert({gpu_index_list[i], total_memory.value.l_int});
}
return cache_mgr_->rdc_job_get_stats(job_id, all_total_memory, p_job_info);
}
rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64] ) {
// TODO(bill_liu): implement
(void)(job_id);
return RDC_ST_OK;
rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64]) {
return watch_table_->rdc_job_stop_stats(job_id);
}
rdc_status_t RdcEmbeddedHandler::rdc_job_remove(char job_id[64]) {
return watch_table_->rdc_job_remove(job_id);
}
rdc_status_t RdcEmbeddedHandler::rdc_job_remove_all() {
return watch_table_->rdc_job_remove_all();
}
// Discovery API
rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(
@@ -194,6 +213,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id,
}
if (!is_gpu_exist) {
RDC_LOG(RDC_INFO, "Fail to add GPU index " << gpu_index << " to group "
<< group_id <<" as the GPU index is invalid.");
return RDC_ST_NOT_FOUND;
}
@@ -211,6 +232,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_field_create(uint32_t num_field_ids,
if (num_field_ids <= RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) {
for (uint32_t i = 0; i < num_field_ids; i++) {
if (!metric_fetcher_->is_field_valid(field_ids[i])) {
RDC_LOG(RDC_INFO,
"Fail to create field group with unknown field id "
<< field_ids[i]);
return RDC_ST_NOT_SUPPORTED;
}
}
@@ -285,6 +309,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_get_latest_value(
return RDC_ST_BAD_PARAMETER;
}
if (!metric_fetcher_->is_field_valid(field)) {
RDC_LOG(RDC_INFO,
"Fail to get latest value with unknown field id "
<< field);
return RDC_ST_NOT_SUPPORTED;
}
return cache_mgr_->rdc_field_get_latest_value(gpu_index, field, value);
@@ -297,6 +324,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_get_value_since(uint32_t gpu_index,
return RDC_ST_BAD_PARAMETER;
}
if (!metric_fetcher_->is_field_valid(field)) {
RDC_LOG(RDC_INFO,
"Fail to get value since with unknown field id "
<< field);
return RDC_ST_NOT_SUPPORTED;
}
return cache_mgr_->rdc_field_get_value_since(gpu_index, field,
@@ -22,11 +22,20 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcGroupSettingsImpl.h"
#include <ctime>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
namespace amd {
namespace rdc {
// Register the built-in field group used for the job stats. The rest of the
// class treats the id JOB_FIELD_ID as this system group (it cannot be
// destroyed and is hidden from the id listing), so the group created here
// must receive exactly that id.
RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
  // Add the default job stats fields
  uint32_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE,
      RDC_FI_POWER_USAGE, RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL};
  char job_field_group[] = "JobStatsFields";
  rdc_field_grp_t fgid = JOB_FIELD_ID;

  const rdc_status_t status = rdc_group_field_create(
      sizeof(job_fields)/sizeof(uint32_t),
      job_fields, job_field_group, &fgid);

  // A constructor cannot return a status, so at least report the problem
  // instead of silently running without a usable job field group.
  if (status != RDC_ST_OK || fgid != JOB_FIELD_ID) {
    RDC_LOG(RDC_ERROR, "Fail to create the default job stats field group."
        << " Return: " << status << ", field group id: " << fgid);
  }
}
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create(
@@ -62,6 +71,8 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add(
// Check whether the index already exists
for (uint32_t i=0; i < ite->second.count; i++) {
if (ite->second.entity_ids[i] == gpu_index) {
RDC_LOG(RDC_INFO, "Fail to add " << gpu_index
<<" to GPU group " << groupId << " as it is already exists");
return RDC_ST_BAD_PARAMETER;
}
}
@@ -136,15 +147,19 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_field_create(
if (field_group_.size() >= RDC_MAX_NUM_FIELD_GROUPS) {
return RDC_ST_MAX_LIMIT;
}
field_group_.emplace(cur_filed_group_id_, finfo);
*rdc_field_group_id = cur_filed_group_id_;
cur_filed_group_id_++;
field_group_.emplace(cur_field_group_id_, finfo);
*rdc_field_group_id = cur_field_group_id_;
cur_field_group_id_++;
return RDC_ST_OK;
}
rdc_status_t RdcGroupSettingsImpl::rdc_group_field_destroy(
rdc_field_grp_t rdc_field_group_id) {
if (rdc_field_group_id == JOB_FIELD_ID) {
RDC_LOG(RDC_INFO, "Cannot delete system JOB_FIELD_ID field group");
return RDC_ST_BAD_PARAMETER;
}
std::lock_guard<std::mutex> guard(field_group_mutex_);
field_group_.erase(rdc_field_group_id);
return RDC_ST_OK;
@@ -183,6 +198,10 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_field_get_all_ids(
if (*count >= RDC_MAX_NUM_FIELD_GROUPS) {
return RDC_ST_MAX_LIMIT;
}
// Skip system defined JOB_FIELD_ID
if (ite->first == JOB_FIELD_ID) continue;
field_group_id_list[*count] = ite->first;
(*count)++;
}
@@ -22,10 +22,11 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
#include <sys/time.h>
#include <string.h>
#include <chrono>
#include <chrono> //NOLINT
#include <algorithm>
#include <vector>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
@@ -48,6 +49,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
uint64_t i64 = 0;
if (!is_field_valid(field_id)) {
RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id
<< " which is not supported");
return RDC_ST_NOT_SUPPORTED;
}
@@ -125,6 +128,27 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
break;
}
gettimeofday(&tv, NULL);
int64_t latency = static_cast<uint64_t>(tv.tv_sec)*1000+tv.tv_usec/1000
- value->ts;
if (value->status != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" <<
field_id_string(field_id) << " with rsmi error code "
<< value->status <<", latency " << latency);
} else if (value->type == INTEGER) {
RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" <<
field_id_string(field_id) << ":" << value->value.l_int
<< ", latency " << latency);
} else if (value->type == DOUBLE) {
RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" <<
field_id_string(field_id) << ":" << value->value.dbl
<< ", latency " << latency);
} else if (value->type == STRING) {
RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" <<
field_id_string(field_id) << ":" << value->value.str
<< ", latency " << latency);
}
return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
}
@@ -23,8 +23,10 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcWatchTableImpl.h"
#include <sys/time.h>
#include <ctime>
#include <sstream>
#include <algorithm>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
namespace amd {
namespace rdc {
@@ -39,24 +41,92 @@ RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
}
// Starts recording the stats of job_id for the GPUs in group_id.
//
// Steps: register the job together with the JOB_FIELD_ID fields of the group
// in job_watch_table_, start watching those fields with update_freq, then ask
// the cache manager to start the job stats.
//
// Returns RDC_ST_ALREADY_EXIST when job_id is already registered; otherwise
// the first non-OK status of the steps above, or the cache manager's status.
// When a step fails, the registration (and the watch, once started) is undone
// so that the same job_id can be started again later.
rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id,
                char job_id[64], uint64_t update_freq) {
    {  //< lock guard for thread safe
        std::lock_guard<std::mutex> guard(watch_mutex_);
        if (job_watch_table_.find(job_id) != job_watch_table_.end()) {
            return RDC_ST_ALREADY_EXIST;
        }
    }

    // Resolved without holding watch_mutex_, exactly where the lookup was
    // done before; a failure must not leave a half-built entry behind.
    std::vector<RdcFieldKey> fields_in_watch;
    rdc_status_t result = get_fields_from_group(group_id,
                    JOB_FIELD_ID, fields_in_watch);
    if (result != RDC_ST_OK) {
        return result;
    }

    {  //< the table is shared, so it is only modified under the lock
        std::lock_guard<std::mutex> guard(watch_mutex_);
        JobWatchTableEntry jentry {group_id, fields_in_watch};
        // insert() reports whether another caller registered the same job
        // while the lock was released above.
        if (!job_watch_table_.insert({job_id, jentry}).second) {
            return RDC_ST_ALREADY_EXIST;
        }
    }

    // Drops the registration made above.
    const auto unregister_job = [this, job_id]() {
        std::lock_guard<std::mutex> guard(watch_mutex_);
        job_watch_table_.erase(job_id);
    };

    // Called without holding watch_mutex_, as in rdc_job_stop_stats.
    result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0);
    if (result != RDC_ST_OK) {
        unregister_job();
        return result;
    }

    rdc_field_group_info_t finfo;
    rdc_group_info_t ginfo;
    result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
    if (result == RDC_ST_OK) {
        result = group_settings_->rdc_group_field_get_info(
                        JOB_FIELD_ID, &finfo);
    }
    if (result == RDC_ST_OK) {
        result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo);
    }

    if (result != RDC_ST_OK) {
        // Best-effort rollback; the original failure is what gets reported.
        rdc_field_unwatch(group_id, JOB_FIELD_ID);
        unregister_job();
    }
    return result;
}
rdc_status_t RdcWatchTableImpl::rdc_watch_job_fields(rdc_gpu_group_t group_id,
uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) {
// TODO(bill_liu): implement
(void)(group_id);
(void)(update_freq);
(void)(max_keep_age);
(void)(max_keep_samples);
return RDC_ST_OK;
// Stops recording the stats of job_id.
//
// Returns RDC_ST_NOT_FOUND when the job is not in job_watch_table_, the
// status of rdc_field_unwatch when it is not RDC_ST_OK (the job then stays
// registered), and otherwise the cache manager's rdc_job_stop_stats status.
rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64]) {
    uint32_t watched_group;
    {  // look the job up under the lock
        std::lock_guard<std::mutex> lock(watch_mutex_);
        const auto entry = job_watch_table_.find(job_id);
        if (entry == job_watch_table_.end()) {
            return RDC_ST_NOT_FOUND;
        }
        watched_group = entry->second.group_id;
    }

    // The lock is released before rdc_field_unwatch is called.
    const rdc_status_t unwatch_status =
        rdc_field_unwatch(watched_group, JOB_FIELD_ID);
    if (unwatch_status != RDC_ST_OK) {
        return unwatch_status;
    }

    {  // forget the job under the lock
        std::lock_guard<std::mutex> lock(watch_mutex_);
        job_watch_table_.erase(job_id);
    }

    return cache_mgr_->rdc_job_stop_stats(job_id);
}
// Stops the job and then removes it from the cache manager.
// The status of the stop is not inspected; the status returned is the one
// reported by the cache manager's rdc_job_remove.
rdc_status_t RdcWatchTableImpl::rdc_job_remove(char job_id[64]) {
    static_cast<void>(rdc_job_stop_stats(job_id));
    const rdc_status_t remove_status = cache_mgr_->rdc_job_remove(job_id);
    return remove_status;
}
// Stops every job currently in job_watch_table_ and then asks the cache
// manager to remove all jobs. The status of the individual stops is not
// inspected; the cache manager's status is returned.
rdc_status_t RdcWatchTableImpl::rdc_job_remove_all() {
    // Copy the job ids while holding the lock; rdc_job_stop_stats takes
    // watch_mutex_ itself, so it is called only after the lock is released.
    std::vector<std::string> job_ids;
    {
        std::lock_guard<std::mutex> lock(watch_mutex_);
        for (const auto& entry : job_watch_table_) {
            job_ids.push_back(entry.first);
        }
    }

    // rdc_job_stop_stats takes a non-const char array, hence the cast.
    for (const std::string& id : job_ids) {
        rdc_job_stop_stats(const_cast<char*>(id.c_str()));
    }

    return cache_mgr_->rdc_job_remove_all();
}
rdc_status_t RdcWatchTableImpl::get_fields_from_group(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, std::vector<RdcFieldKey> & fields) {
rdc_field_group_info_t finfo;
@@ -228,6 +298,21 @@ rdc_status_t RdcWatchTableImpl::rdc_field_unwatch(
return update_field_in_table_when_unwatch(ite->first);
}
bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index,
uint32_t field_id, std::string& job_id) const {
RdcFieldKey key{gpu_index, field_id};
for (auto ite = job_watch_table_.begin();
ite != job_watch_table_.end(); ite++) {
auto& fields = ite->second.fields;
if (std::find(fields.begin(), fields.end(), key) != fields.end()) {
job_id = ite->first;
return true;
}
}
return false;
}
rdc_status_t RdcWatchTableImpl::rdc_field_update_all() {
uint32_t items_fetched = 0;
@@ -251,13 +336,19 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() {
result = metric_fetcher_->fetch_smi_field(
fite->first.first, fite->first.second, &value);
if (result != RDC_ST_OK) {
LOG_DEBUG("Fail to fetch the field: " << rdc_status_string(result));
continue;
}
// Update the cache
cache_mgr_->rdc_update_cache(fite->first.first, value);
// Update the job stats cache
std::string job_id;
if (is_job_watch_field(fite->first.first, fite->first.second, job_id)) {
cache_mgr_->rdc_update_job_stats(fite->first.first, job_id, value);
}
// Update the last_update_time
gettimeofday(&tv, NULL);
now = static_cast<uint64_t>(tv.tv_sec)*1000+tv.tv_usec/1000;
@@ -303,6 +394,56 @@ void RdcWatchTableImpl::clean_up() {
++wite;
}
}
// Debug log every 30 seconds
if (now/1000%30 == 0) {
debug_status();
}
}
// Writes the state of the watch table to the log at RDC_DEBUG level: first a
// one-line summary with the size of each table and the cache stats, then one
// line per entry of watch_table_, job_watch_table_ and fields_to_watch_.
void RdcWatchTableImpl::debug_status() {
    RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size()
        << " watch_table_:" << watch_table_.size()
        << " job_watch_table_:" << job_watch_table_.size()
        << " cache stats:" << cache_mgr_->get_cache_stats());

    // Group/field-group watches and their settings.
    if (!watch_table_.empty()) {
        RDC_LOG(RDC_DEBUG, "watch table details:");
    }
    for (const auto& watch : watch_table_) {
        const auto& key = watch.first;
        const auto& settings = watch.second;
        RDC_LOG(RDC_DEBUG, key.first << "," << key.second
            << ": age:" << settings.max_keep_age << ", samples:"
            << settings.max_keep_samples << ", is_watching:"
            << settings.is_watching << ", last_update_time:"
            << settings.last_update_time <<", update_freq:"
            << settings.update_freq);
    }

    // Jobs with their group id and the list of <first,second> field keys.
    if (!job_watch_table_.empty()) {
        RDC_LOG(RDC_DEBUG, "job watch table details: ");
    }
    for (const auto& job : job_watch_table_) {
        std::ostringstream field_list;
        for (const auto& field_key : job.second.fields) {
            field_list << "<" << field_key.first << ","
                       << field_key.second << "> ";
        }
        RDC_LOG(RDC_DEBUG, job.first << ": " << job.second.group_id
            << " fields : "<< field_list.str());
    }

    // Individual fields and their settings.
    if (!fields_to_watch_.empty()) {
        RDC_LOG(RDC_DEBUG, "fields to watch details:");
    }
    for (const auto& field : fields_to_watch_) {
        const auto& key = field.first;
        const auto& settings = field.second;
        RDC_LOG(RDC_DEBUG, key.first << "," << key.second
            << ": age:" << settings.max_keep_age << ", samples:"
            << settings.max_keep_samples << ", is_watching:"
            << settings.is_watching << ", last_update_time:"
            << settings.last_update_time <<", update_freq:"
            << settings.update_freq);
    }
}
} // namespace rdc
@@ -62,19 +62,16 @@ rdc_status_t RdcStandaloneHandler::error_handle(::grpc::Status status,
// JOB RdcAPI
// Not implemented yet: every argument is discarded and RDC_ST_OK is returned
// unconditionally.
rdc_status_t RdcStandaloneHandler::rdc_job_start_stats(rdc_gpu_group_t groupId,
char job_id[64], uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) {
char job_id[64], uint64_t update_freq) {
// TODO(bill_liu): implement
// The casts to void only mark the parameters as deliberately unused.
(void)(groupId);
(void)(job_id);
(void)(update_freq);
(void)(max_keep_age);
(void)(max_keep_samples);
return RDC_ST_OK;
}
rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64],
rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64],
rdc_job_info_t* p_job_info) {
// TODO(bill_liu): implement
(void)(job_id);
@@ -82,13 +79,25 @@ rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64],
return RDC_ST_OK;
}
// Not implemented yet: job_id is discarded and RDC_ST_OK is returned
// unconditionally.
rdc_status_t RdcStandaloneHandler::rdc_job_stop_stats(char job_id[64] ) {
rdc_status_t RdcStandaloneHandler::rdc_job_stop_stats(char job_id[64]) {
// TODO(bill_liu): implement
// The cast to void only marks the parameter as deliberately unused.
(void)(job_id);
return RDC_ST_OK;
}
// Placeholder: job_id is not used and RDC_ST_OK is always returned.
rdc_status_t RdcStandaloneHandler::rdc_job_remove(char job_id[64]) {
    // TODO(bill_liu): implement
    static_cast<void>(job_id);  // marks the parameter as unused
    return RDC_ST_OK;
}
// Placeholder: does nothing and always returns RDC_ST_OK.
rdc_status_t RdcStandaloneHandler::rdc_job_remove_all() {
    // TODO(bill_liu): implement
    const rdc_status_t placeholder_status = RDC_ST_OK;
    return placeholder_status;
}
// Discovery RdcAPI
rdc_status_t RdcStandaloneHandler::rdc_device_get_all(
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) {
+10 -2
Vedi File
@@ -58,6 +58,7 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) {
const struct option long_options[] = {
{"host", required_argument, nullptr, HOST_OPTIONS },
{"help", optional_argument, nullptr, 'h' },
{"unauth", optional_argument, nullptr, 'u' },
{"list", optional_argument, nullptr, 'l' },
{"field-group-id", required_argument, nullptr, 'f' },
{"field-id", required_argument, nullptr, 'e' },
@@ -73,7 +74,7 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) {
std::string gpu_indexes;
std::string field_ids;
while ((opt = getopt_long(argc, argv, "hlf:g:c:d:e:i:",
while ((opt = getopt_long(argc, argv, "hluf:g:c:d:e:i:",
long_options, &option_index)) != -1) {
switch (opt) {
case HOST_OPTIONS:
@@ -82,9 +83,12 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) {
case 'h':
dmon_ops_ = DMON_HELP;
return;
case 'u':
use_auth_ = false;
break;
case 'l':
dmon_ops_ = DMON_LIST_FIELDS;
return;
break;
case 'f':
if (!IsNumber(optarg)) {
show_help();
@@ -130,6 +134,10 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) {
}
}
if (dmon_ops_ == DMON_LIST_FIELDS) {
return;
}
if (options_.find(OPTIONS_FIELD_GROUP_ID) == options_.end()) {
if (field_ids == "") {
show_help();
@@ -40,6 +40,7 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
const struct option long_options[] = {
{"host", required_argument, nullptr, HOST_OPTIONS },
{"help", optional_argument, nullptr, 'h' },
{"unauth", optional_argument, nullptr, 'u' },
{"list", optional_argument, nullptr, 'l' },
{"group", required_argument, nullptr, 'g'},
{"create", required_argument, nullptr, 'c' },
@@ -52,7 +53,7 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
int option_index = 0;
int opt = 0;
while ((opt = getopt_long(argc, argv, "hlif:c:g:d:",
while ((opt = getopt_long(argc, argv, "hluif:c:g:d:",
long_options, &option_index)) != -1) {
switch (opt) {
case HOST_OPTIONS:
@@ -61,6 +62,9 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
case 'h':
field_group_ops_ = FIELD_GROUP_HELP;
return;
case 'u':
use_auth_ = false;
break;
case 'l':
field_group_ops_ = FIELD_GROUP_LIST;
break;
@@ -40,6 +40,7 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
const struct option long_options[] = {
{"host", required_argument, nullptr, HOST_OPTIONS },
{"help", optional_argument, nullptr, 'h' },
{"unauth", optional_argument, nullptr, 'u' },
{"list", optional_argument, nullptr, 'l' },
{"group", required_argument, nullptr, 'g'},
{"create", required_argument, nullptr, 'c' },
@@ -52,7 +53,7 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
int option_index = 0;
int opt = 0;
while ((opt = getopt_long(argc, argv, "hlic:g:a:d:",
while ((opt = getopt_long(argc, argv, "hluic:g:a:d:",
long_options, &option_index)) != -1) {
switch (opt) {
case HOST_OPTIONS:
@@ -61,6 +62,9 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
case 'h':
group_ops_ = GROUP_HELP;
return;
case 'u':
use_auth_ = false;
break;
case 'l':
group_ops_ = GROUP_LIST;
break;