From 39f3d3af8a542d3af3281f9a0fbcce3ebcdf2bc4 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Tue, 21 Apr 2020 09:48:14 -0400 Subject: [PATCH] Implement the stats subsystem in rdci Add support for the stats subsystem in rdci Modify the dmon system to handle the case when no GPUs in a group Change-Id: I5a18e1201d24b5318b8e324a77551a757b108f25 [ROCm/rdc commit: 096dc2dadb54bb4940b05e0abcb026c177afd978] --- .../rdc_libs/rdc/src/RdcCacheManagerImpl.cc | 4 +- .../rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc | 18 +- projects/rdc/rdci/CMakeLists.txt | 1 + .../rdc/rdci/include/RdciStatsSubSystem.h | 64 +++++ projects/rdc/rdci/src/RdciDmonSubSystem.cc | 24 +- projects/rdc/rdci/src/RdciStatsSubSystem.cc | 260 ++++++++++++++++++ projects/rdc/rdci/src/rdci.cc | 3 + 7 files changed, 369 insertions(+), 5 deletions(-) create mode 100644 projects/rdc/rdci/include/RdciStatsSubSystem.h create mode 100644 projects/rdc/rdci/src/RdciStatsSubSystem.cc diff --git a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index 57523ae600..b8d503c9c1 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -257,7 +257,7 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], } //< Init the summary info - RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " <summary; summary_info.start_time = job_stats->second.start_time; if (job_stats->second.end_time == 0) { @@ -340,6 +340,8 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64], } std::lock_guard guard(cache_mutex_); + // Remove the old stats if it exists + cache_jobs_.erase(job_id); cache_jobs_.insert({job_id, cacheEntry}); return RDC_ST_OK; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc index 47bc11fb4c..7134a51992 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ 
b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -52,8 +52,20 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, std::vector fields_in_watch; rdc_status_t result = get_fields_from_group(group_id, JOB_FIELD_ID, fields_in_watch); + if (result != RDC_ST_OK) { + return result; + } + if (fields_in_watch.size() == 0) { + RDC_LOG(RDC_ERROR, "Fail to start job " << job_id <<". The group " + << group_id << " must contain at least one GPU."); + return RDC_ST_NOT_FOUND; + } + JobWatchTableEntry jentry {group_id, fields_in_watch}; - job_watch_table_.insert({job_id, jentry}); + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + job_watch_table_.insert({job_id, jentry}); + } while (0); result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0); if (result != RDC_ST_OK) { @@ -62,10 +74,12 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, rdc_field_group_info_t finfo; rdc_group_info_t ginfo; - result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); + result = group_settings_->rdc_group_gpu_get_info( + group_id, &ginfo); if (result != RDC_ST_OK) { return result; } + result = group_settings_->rdc_group_field_get_info(JOB_FIELD_ID, &finfo); if (result != RDC_ST_OK) { return result; diff --git a/projects/rdc/rdci/CMakeLists.txt b/projects/rdc/rdci/CMakeLists.txt index 477f258cc2..95e92eb835 100644 --- a/projects/rdc/rdci/CMakeLists.txt +++ b/projects/rdc/rdci/CMakeLists.txt @@ -75,6 +75,7 @@ set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciGroupSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciFieldGroupSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciDmonSubSystem.cc") +set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciStatsSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${PROJECT_SOURCE_DIR}/common/rdc_utils.cc") message("RDCI_SRC_LIST=${RDCI_SRC_LIST}") set(RDCI_EXE 
"rdci") diff --git a/projects/rdc/rdci/include/RdciStatsSubSystem.h b/projects/rdc/rdci/include/RdciStatsSubSystem.h new file mode 100644 index 0000000000..69e9bdf40a --- /dev/null +++ b/projects/rdc/rdci/include/RdciStatsSubSystem.h @@ -0,0 +1,64 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+#ifndef RDCI_INCLUDE_RDCISTATSSUBSYSTEM_H_
+#define RDCI_INCLUDE_RDCISTATSSUBSYSTEM_H_
+#include <cstdint>
+#include <string>
+#include "RdciSubSystem.h"
+
+namespace amd {
+namespace rdc {
+
+// rdci "stats" subsystem: starts, stops, displays and removes job statistics.
+class RdciStatsSubSystem: public RdciSubSystem {
+ public:
+  RdciStatsSubSystem();
+  ~RdciStatsSubSystem();
+  void parse_cmd_opts(int argc, char ** argv) override;  //< Reads the options
+  void process() override;  //< Runs the selected operation
+
+ private:
+  void show_help() const;
+  void show_job_stats(const rdc_gpu_usage_info_t& gpu_info) const;
+
+  // Operation selected on the command line.
+  enum OPERATIONS {
+    STATS_UNKNOWN = 0,  //< Nothing selected; process() prints the help
+    STATS_HELP,
+    STATS_START_RECORDING,
+    STATS_STOP_RECORDING,
+    STATS_DISPLAY,
+    STATS_REMOVE,
+    STATS_REMOVE_ALL
+  } stats_ops_ = STATS_UNKNOWN;  //< Initialized: process() reads it first
+
+  std::string job_id_;  //< Job named by -s, -x, -j or -r
+  uint32_t group_id_ = 0;  //< GPU group given with -g
+  bool is_verbose_ = false;  //< -v: also show the statistics of each GPU
+};
+
+
+}  // namespace rdc
+}  // namespace amd
+
+
+#endif  // RDCI_INCLUDE_RDCISTATSSUBSYSTEM_H_
diff --git a/projects/rdc/rdci/src/RdciDmonSubSystem.cc b/projects/rdc/rdci/src/RdciDmonSubSystem.cc
index c1618d785d..0c477996f7 100644
--- a/projects/rdc/rdci/src/RdciDmonSubSystem.cc
+++ b/projects/rdc/rdci/src/RdciDmonSubSystem.cc
@@ -309,12 +309,32 @@ void RdciDmonSubSystem::process() {
   result = rdc_group_gpu_get_info(rdc_handle_,
       options_[OPTIONS_GROUP_ID], &group_info);
   if (result != RDC_ST_OK) {
-    throw RdcException(result, rdc_status_string(result));
+    std::string error_msg = rdc_status_string(result);
+    if (result == RDC_ST_NOT_FOUND) {
+      error_msg = "Cannot find the group " +
+          std::to_string(options_[OPTIONS_GROUP_ID]);
+    }
+    throw RdcException(result, error_msg.c_str());
+  }
+  if (group_info.count == 0) {
+    throw RdcException(RDC_ST_NOT_FOUND, "The gpu group " +
+        std::to_string(options_[OPTIONS_GROUP_ID]) +
+        " must contain at least 1 GPU.");
   }
   result = rdc_group_field_get_info(rdc_handle_,
       options_[OPTIONS_FIELD_GROUP_ID], &field_info);
   if (result != RDC_ST_OK) {
-    throw RdcException(result, rdc_status_string(result));
+    std::string error_msg = rdc_status_string(result);
+    if (result == RDC_ST_NOT_FOUND) {
+      error_msg = "Cannot find the field group " +
+          std::to_string(options_[OPTIONS_FIELD_GROUP_ID]);
+    }
+    throw RdcException(result, error_msg.c_str());
+  }
+  if (field_info.count == 0) {
+    throw RdcException(RDC_ST_NOT_FOUND, "The field group " +
+        std::to_string(options_[OPTIONS_FIELD_GROUP_ID]) +
+        " must contain at least 1 field.");
   }
 
   // keep extra 1 minute data
diff --git a/projects/rdc/rdci/src/RdciStatsSubSystem.cc b/projects/rdc/rdci/src/RdciStatsSubSystem.cc
new file mode 100644
index 0000000000..608237932e
--- /dev/null
+++ b/projects/rdc/rdci/src/RdciStatsSubSystem.cc
@@ -0,0 +1,260 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "RdciStatsSubSystem.h"
+#include <getopt.h>
+#include <cstdint>
+#include <exception>
+#include <iostream>
+#include <string>
+#include "rdc_lib/rdc_common.h"
+#include "common/rdc_utils.h"
+#include "rdc/rdc.h"
+#include "rdc_lib/RdcException.h"
+
+namespace amd {
+namespace rdc {
+
+namespace {
+// Throws an RdcException carrying the status text unless status is RDC_ST_OK.
+void throw_on_error(rdc_status_t status) {
+  if (status != RDC_ST_OK) {
+    throw RdcException(status, rdc_status_string(status));
+  }
+}
+}  // namespace
+
+RdciStatsSubSystem::RdciStatsSubSystem()
+    : stats_ops_(STATS_UNKNOWN), group_id_(0) {}
+
+RdciStatsSubSystem::~RdciStatsSubSystem() {}
+
+// Parses the "rdci stats" options into the operation, job id, group id and
+// verbosity. Throws RdcException(RDC_ST_BAD_PARAMETER) on invalid options.
+void RdciStatsSubSystem::parse_cmd_opts(int argc, char ** argv) {
+  const int HOST_OPTIONS = 1000;
+  // Pure flags are declared no_argument so "--verbose=x" is rejected.
+  const struct option long_options[] = {
+    {"host", required_argument, nullptr, HOST_OPTIONS },
+    {"help", no_argument, nullptr, 'h' },
+    {"unauth", no_argument, nullptr, 'u' },
+    {"jstart", required_argument, nullptr, 's' },
+    {"jstop", required_argument, nullptr, 'x' },
+    {"job", required_argument, nullptr, 'j' },
+    {"jremove", required_argument, nullptr, 'r'},
+    {"jremoveall", no_argument, nullptr, 'a' },
+    {"verbose", no_argument, nullptr, 'v'},
+    {"group", required_argument, nullptr, 'g'},
+    { nullptr, 0 , nullptr, 0 }
+  };
+
+  bool is_group_id_set = false;
+  int option_index = 0;
+  int opt = 0;
+
+  while ((opt = getopt_long(argc, argv, "huvas:x:j:r:g:",
+      long_options, &option_index)) != -1) {
+    switch (opt) {
+      case HOST_OPTIONS:
+        ip_port_ = optarg;
+        break;
+      case 'h':
+        stats_ops_ = STATS_HELP;
+        return;
+      case 'u':
+        use_auth_ = false;
+        break;
+      case 's':
+        stats_ops_ = STATS_START_RECORDING;
+        job_id_ = optarg;
+        break;
+      case 'g': {
+        // Parse wide and range-check so an oversized id is reported as a
+        // bad parameter instead of escaping as std::out_of_range.
+        uint64_t parsed_group = UINT64_MAX;
+        try {
+          if (IsNumber(optarg)) parsed_group = std::stoull(optarg);
+        } catch (const std::exception&) {
+          parsed_group = UINT64_MAX;  // Rejected by the range check below
+        }
+        if (parsed_group > UINT32_MAX) {
+          show_help();
+          throw RdcException(RDC_ST_BAD_PARAMETER,
+              "The group id needs to be a number");
+        }
+        group_id_ = static_cast<uint32_t>(parsed_group);
+        is_group_id_set = true;
+        break;
+      }
+      case 'x':
+        stats_ops_ = STATS_STOP_RECORDING;
+        job_id_ = optarg;
+        break;
+      case 'j':
+        stats_ops_ = STATS_DISPLAY;
+        job_id_ = optarg;
+        break;
+      case 'v':
+        is_verbose_ = true;
+        break;
+      case 'r':
+        stats_ops_ = STATS_REMOVE;
+        job_id_ = optarg;
+        break;
+      case 'a':
+        stats_ops_ = STATS_REMOVE_ALL;
+        break;
+      default:
+        show_help();
+        throw RdcException(RDC_ST_BAD_PARAMETER,
+            "Unknown command line options");
+    }
+  }
+
+  if (stats_ops_ == STATS_START_RECORDING && !is_group_id_set) {
+    show_help();
+    throw RdcException(RDC_ST_BAD_PARAMETER,
+        "Need to specify the group id to start recording");
+  }
+}
+
+// Prints the usage and flag summary of the stats subsystem to stdout.
+void RdciStatsSubSystem::show_help() const {
+  std::cout << " stats -- Used to view job statistics.\n\n";
+  std::cout << "Usage\n";
+  std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] -s <jobId>"
+      << " -g <groupId>\n";
+  std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] -x <jobId>\n";
+  std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] [-v] "
+      << "-j <jobId>\n";
+  std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] -r <jobId>\n";
+  std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] -a\n";
+  std::cout << "\nFlags:\n";
+  show_common_usage();
+  std::cout << " -s --jstart <jobId> Start recording "
+      << "job statistics.\n";
+  std::cout << " -g --group <groupId> The GPU group to query "
+      << "on the specified host.\n";
+  std::cout << " -x --jstop <jobId> Stop recording "
+      << "job statistics.\n";
+  std::cout << " -j --job <jobId> Display "
+      << "job statistics.\n";
+  std::cout << " -v --verbose Show job information "
+      << "for each GPU.\n";
+  std::cout << " -r --jremove <jobId> Remove "
+      << "job statistics.\n";
+  std::cout << " -a --jremoveall Remove "
+      << "all job statistics.\n";
+}
+
+// Prints one statistics table for the given usage record to stdout.
+void RdciStatsSubSystem::show_job_stats(
+    const rdc_gpu_usage_info_t& gpu_info) const {
+  std::cout << "|------- Execution Stats ----------"
+      << "+------------------------------------\n";
+  std::cout << "| Start Time * | "
+      << gpu_info.start_time << "\n";
+  std::cout << "| End Time * | "
+      << gpu_info.end_time << "\n";
+  std::cout << "| Total Execution Time (sec) * | "
+      << (gpu_info.end_time - gpu_info.start_time) << "\n";
+  std::cout << "+------- Performance Stats --------"
+      << "+------------------------------------\n";
+  std::cout << "| Energy Consumed (Joules) | "
+      << gpu_info.energy_consumed << "\n";
+  std::cout << "| Power Usage (Watts) | " << "Max: "
+      << gpu_info.power_usage.max_value << " Min: "
+      << gpu_info.power_usage.min_value << " Avg: "
+      << gpu_info.power_usage.average << "\n";
+  std::cout << "| SM Clock (MHz) | " << "Max: "
+      << gpu_info.gpu_clock.max_value << " Min: "
+      << gpu_info.gpu_clock.min_value << " Avg: "
+      << gpu_info.gpu_clock.average << "\n";
+  std::cout << "| SM Utilization (%) | " << "Max: "
+      << gpu_info.gpu_utilization.max_value << " Min: "
+      << gpu_info.gpu_utilization.min_value << " Avg: "
+      << gpu_info.gpu_utilization.average << "\n";
+  std::cout << "| Max GPU Memory Used (bytes) * | "
+      << gpu_info.max_gpu_memory_used << "\n";
+  std::cout << "| Memory Utilization (%) | "
+      << "Max: " << gpu_info.memory_utilization.max_value
+      << " Min: " << gpu_info.memory_utilization.min_value
+      << " Avg: " << gpu_info.memory_utilization.average << "\n";
+  std::cout << "+----------------------------------"
+      << "+------------------------------------\n";
+}
+
+// Runs the operation chosen by parse_cmd_opts(); prints the help when none
+// was selected. Throws RdcException when an RDC call does not return OK.
+void RdciStatsSubSystem::process() {
+  if (stats_ops_ == STATS_HELP || stats_ops_ == STATS_UNKNOWN) {
+    show_help();
+    return;
+  }
+
+  // The rdc job calls take the job id through a non-const char pointer.
+  char* job_id = const_cast<char*>(job_id_.c_str());
+
+  if (stats_ops_ == STATS_START_RECORDING) {
+    // Record job every 1 second
+    throw_on_error(rdc_job_start_stats(rdc_handle_, group_id_,
+        job_id, 1000000));
+    std::cout << "Successfully started recording job "
+        << job_id_ << " with a group ID " << group_id_ << std::endl;
+    return;
+  }
+
+  if (stats_ops_ == STATS_STOP_RECORDING) {
+    throw_on_error(rdc_job_stop_stats(rdc_handle_, job_id));
+    std::cout << "Successfully stopped recording job " << job_id_ << std::endl;
+    return;
+  }
+
+  if (stats_ops_ == STATS_DISPLAY) {
+    rdc_job_info_t job_info;
+    throw_on_error(rdc_job_get_stats(rdc_handle_, job_id, &job_info));
+    std::cout << "| Summary \n";
+    show_job_stats(job_info.summary);
+    if (is_verbose_ == false) {
+      return;
+    }
+    for (uint32_t i = 0; i < job_info.num_gpus; i++) {
+      std::cout << "| GPU " << i << "\n";
+      show_job_stats(job_info.gpus[i]);
+    }
+    return;
+  }
+
+  if (stats_ops_ == STATS_REMOVE) {
+    throw_on_error(rdc_job_remove(rdc_handle_, job_id));
+    std::cout << "Successfully removed job " << job_id_ << std::endl;
+    return;
+  }
+
+  if (stats_ops_ == STATS_REMOVE_ALL) {
+    throw_on_error(rdc_job_remove_all(rdc_handle_));
+    std::cout << "Successfully removed all jobs\n";
+    return;
+  }
+}
+
+}  // namespace rdc
+}  // namespace amd
diff --git a/projects/rdc/rdci/src/rdci.cc b/projects/rdc/rdci/src/rdci.cc
index 6f53de0c63..63a9a1c457 100644
--- a/projects/rdc/rdci/src/rdci.cc
+++ b/projects/rdc/rdci/src/rdci.cc
@@ -29,6 +29,7 @@ THE SOFTWARE.
 #include "RdciDmonSubSystem.h"
 #include "RdciFieldGroupSubSystem.h"
 #include "RdciGroupSubSystem.h"
+#include "RdciStatsSubSystem.h"
 
 
 int main(int argc, char ** argv) {
@@ -52,6 +53,8 @@ int main(int argc, char ** argv) {
     subsystem.reset(new amd::rdc::RdciGroupSubSystem());
   } else if (subsystem_name == "fieldgroup") {
     subsystem.reset(new amd::rdc::RdciFieldGroupSubSystem());
+  } else if (subsystem_name == "stats") {
+    subsystem.reset(new amd::rdc::RdciStatsSubSystem());
   } else {
     std::cout << usage_help;
     exit(0);