diff --git a/projects/rdc/README.md b/projects/rdc/README.md index 5432f97fd0..b145a5d1d7 100644 --- a/projects/rdc/README.md +++ b/projects/rdc/README.md @@ -2,15 +2,17 @@ Radeon Data Center ## To run the rdcd and rdci from the build folder without authentication +Note: LD_LIBRARY_PATH is only required when RDC is not installed. ``` sudo LD_LIBRARY_PATH=$PWD/rdc_libs/ ./server/rdcd -u -LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery -u +LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery -u -l ``` ## To run the rdcd and rdci from the build folder with authentication +Note: LD_LIBRARY_PATH is only required when RDC is not installed. ``` sudo LD_LIBRARY_PATH=$PWD/rdc_libs/ ./server/rdcd -LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery +LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery -l ``` ## Troubleshooting diff --git a/projects/rdc/rdci/CMakeLists.txt b/projects/rdc/rdci/CMakeLists.txt index 067b9811cd..50a4732270 100644 --- a/projects/rdc/rdci/CMakeLists.txt +++ b/projects/rdc/rdci/CMakeLists.txt @@ -70,6 +70,9 @@ include_directories(${INC_DIR} ${PROJECT_SOURCE_DIR}/include set(RDCI_SRC_LIST "${SRC_DIR}/rdci.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciDiscoverySubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciSubSystem.cc") +set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciGroupSubSystem.cc") +set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciFieldGroupSubSystem.cc") +set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciDmonSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${PROJECT_SOURCE_DIR}/common/rdc_utils.cc") message("RDCI_SRC_LIST=${RDCI_SRC_LIST}") set(RDCI_EXE "rdci") diff --git a/projects/rdc/rdci/include/RdciDmonSubSystem.h b/projects/rdc/rdci/include/RdciDmonSubSystem.h new file mode 100644 index 0000000000..ae3bb296e3 --- /dev/null +++ b/projects/rdc/rdci/include/RdciDmonSubSystem.h @@ -0,0 +1,77 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef RDCI_INCLUDE_RDCIDMONSUBSYSTEM_H_ +#define RDCI_INCLUDE_RDCIDMONSUBSYSTEM_H_ +#include +#include +#include +#include "RdciSubSystem.h" + + +namespace amd { +namespace rdc { + +class RdciDmonSubSystem: public RdciSubSystem { + public: + RdciDmonSubSystem(); + ~RdciDmonSubSystem(); + void parse_cmd_opts(int argc, char ** argv) override; + void process() override; + + private: + void show_help() const; + void show_field_usage() const; + void clean_up(); + + void create_temp_group(); + void create_temp_field_group(); + + enum OPERATIONS { + DMON_UNKNOWN = 0, + DMON_HELP, + DMON_LIST_FIELDS, + DMON_MONITOR + } dmon_ops_; + + enum OPTIONS { + OPTIONS_UNKNOWN = 0, + OPTIONS_COUNT, + OPTIONS_DELAY, + OPTIONS_FIELD_GROUP_ID, + OPTIONS_GROUP_ID + }; + + std::map options_; + std::vector field_ids_; + std::vector gpu_indexes_; + bool need_cleanup_; + + static volatile sig_atomic_t is_terminating_; + static void set_terminating(int sig); +}; + + +} // namespace rdc +} // namespace amd + + +#endif // RDCI_INCLUDE_RDCIDMONSUBSYSTEM_H_ diff --git a/projects/rdc/rdci/include/RdciFieldGroupSubSystem.h b/projects/rdc/rdci/include/RdciFieldGroupSubSystem.h new file mode 100644 index 0000000000..05c264c2a3 --- /dev/null +++ b/projects/rdc/rdci/include/RdciFieldGroupSubSystem.h @@ -0,0 +1,59 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef RDCI_INCLUDE_RDCIFIELDGROUPSUBSYSTEM_H_ +#define RDCI_INCLUDE_RDCIFIELDGROUPSUBSYSTEM_H_ + +#include +#include "RdciSubSystem.h" + +namespace amd { +namespace rdc { + +class RdciFieldGroupSubSystem: public RdciSubSystem { + public: + RdciFieldGroupSubSystem(); + void parse_cmd_opts(int argc, char ** argv) override; + void process() override; + private: + void show_help() const; + + enum OPERATIONS { + FIELD_GROUP_UNKNOWN = 0, + FIELD_GROUP_HELP, + FIELD_GROUP_CREATE, + FIELD_GROUP_DELETE, + FIELD_GROUP_LIST, + FIELD_GROUP_INFO + } field_group_ops_; + + bool is_group_set_; + uint32_t group_id_; + std::string group_name_; + std::string field_ids_; +}; + + +} // namespace rdc +} // namespace amd + + +#endif // RDCI_INCLUDE_RDCIFIELDGROUPSUBSYSTEM_H_ diff --git a/projects/rdc/rdci/include/RdciGroupSubSystem.h b/projects/rdc/rdci/include/RdciGroupSubSystem.h new file mode 100644 index 0000000000..2db13558eb --- /dev/null +++ b/projects/rdc/rdci/include/RdciGroupSubSystem.h @@ -0,0 +1,62 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef RDCI_INCLUDE_RDCIGROUPSUBSYSTEM_H_ +#define RDCI_INCLUDE_RDCIGROUPSUBSYSTEM_H_ + +#include +#include +#include "RdciSubSystem.h" + +namespace amd { +namespace rdc { + +class RdciGroupSubSystem: public RdciSubSystem { + public: + RdciGroupSubSystem(); + void parse_cmd_opts(int argc, char ** argv) override; + void process() override; + + private: + void show_help() const; + + enum OPERATIONS { + GROUP_UNKNOWN = 0, + GROUP_HELP, + GROUP_CREATE, + GROUP_DELETE, + GROUP_LIST, + GROUP_ADD_GPUS, + GROUP_INFO + } group_ops_; + + bool is_group_set_; + uint32_t group_id_; + std::string group_name_; + std::string gpu_ids_; +}; + + +} // namespace rdc +} // namespace amd + + +#endif // RDCI_INCLUDE_RDCIGROUPSUBSYSTEM_H_ diff --git a/projects/rdc/rdci/include/RdciSubSystem.h b/projects/rdc/rdci/include/RdciSubSystem.h index 310f70a2a4..c5a6d845ab 100644 --- a/projects/rdc/rdci/include/RdciSubSystem.h +++ b/projects/rdc/rdci/include/RdciSubSystem.h @@ -24,6 +24,7 @@ THE SOFTWARE. #include #include +#include #include "rdc_lib/rdc_common.h" #include "rdc/rdc.h" @@ -39,6 +40,8 @@ class RdciSubSystem { virtual void process() = 0; virtual ~RdciSubSystem(); protected: + std::vector split_string(const std::string& s, + char delimiter) const; void show_common_usage() const; rdc_handle_t rdc_handle_; std::string ip_port_; diff --git a/projects/rdc/rdci/src/RdciDmonSubSystem.cc b/projects/rdc/rdci/src/RdciDmonSubSystem.cc new file mode 100644 index 0000000000..f073ba65c2 --- /dev/null +++ b/projects/rdc/rdci/src/RdciDmonSubSystem.cc @@ -0,0 +1,395 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "RdciDmonSubSystem.h" +#include +#include +#include +#include +#include +#include "rdc_lib/rdc_common.h" +#include "common/rdc_utils.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcException.h" + +namespace amd { +namespace rdc { + +// When ctrl-C the program, the SIGINT handler will set the is_terminating +// to notify the program to clean up the resources created by the subsystem. 
+volatile sig_atomic_t RdciDmonSubSystem::is_terminating_ = 0; + +RdciDmonSubSystem::RdciDmonSubSystem(): + dmon_ops_(DMON_MONITOR) + , need_cleanup_(false) { + signal(SIGINT, set_terminating); +} + +RdciDmonSubSystem::~RdciDmonSubSystem() { + clean_up(); +} + +void RdciDmonSubSystem::set_terminating(int sig) { + if (sig == SIGINT) { + is_terminating_ = 1; + } +} + +void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { + const int HOST_OPTIONS = 1000; + const struct option long_options[] = { + {"host", required_argument, nullptr, HOST_OPTIONS }, + {"help", optional_argument, nullptr, 'h' }, + {"list", optional_argument, nullptr, 'l' }, + {"field-group-id", required_argument, nullptr, 'f' }, + {"field-id", required_argument, nullptr, 'e' }, + {"gpu_index", required_argument, nullptr, 'i'}, + {"group-id", required_argument, nullptr, 'g' }, + {"count", required_argument, nullptr, 'c'}, + {"delay", required_argument, nullptr, 'd'}, + { nullptr, 0 , nullptr, 0 } + }; + + int option_index = 0; + int opt = 0; + std::string gpu_indexes; + std::string field_ids; + + while ((opt = getopt_long(argc, argv, "hlf:g:c:d:e:i:", + long_options, &option_index)) != -1) { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case 'h': + dmon_ops_ = DMON_HELP; + return; + case 'l': + dmon_ops_ = DMON_LIST_FIELDS; + return; + case 'f': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "The field group id needs to be a number"); + } + options_.insert({OPTIONS_FIELD_GROUP_ID, std::stoi(optarg)}); + break; + case 'e': + field_ids = optarg; + break; + case 'i': + gpu_indexes = optarg; + break; + case 'g': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "The group id needs to be a number"); + } + options_.insert({OPTIONS_GROUP_ID, std::stoi(optarg)}); + break; + case 'c': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "The count needs to be a 
number"); + } + options_.insert({OPTIONS_COUNT, std::stoi(optarg)}); + break; + case 'd': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "The delay needs to be a number"); + } + options_.insert({OPTIONS_DELAY, std::stoi(optarg)}); + break; + default: + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Unknown command line options"); + } + } + + if (options_.find(OPTIONS_FIELD_GROUP_ID) == options_.end()) { + if (field_ids == "") { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Need to specify the fields or field group id"); + } else { + std::vector vec_ids = split_string(field_ids, ','); + for (uint32_t i = 0; i < vec_ids.size(); i++) { + if (!IsNumber(vec_ids[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, "The field Id " + +vec_ids[i]+" needs to be a number"); + } + field_ids_.push_back(std::stoi(vec_ids[i])); + } + } + } + + if (options_.find(OPTIONS_GROUP_ID) == options_.end()) { + if (gpu_indexes == "") { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Need to specify the GPUs or group id"); + } else { + std::vector vec_ids = split_string(gpu_indexes, ','); + for (uint32_t i = 0; i < vec_ids.size(); i++) { + if (!IsNumber(vec_ids[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The GPU index "+vec_ids[i]+" needs to be a number"); + } + gpu_indexes_.push_back(std::stoi(vec_ids[i])); + } + } + } + + // Group and GPU index cannot co-exist + if (gpu_indexes != "" && + options_.find(OPTIONS_GROUP_ID) != options_.end()) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Use either the group or GPU indexes"); + } + + // Field group and field Ids cannot co-exist + if (field_ids != "" && + options_.find(OPTIONS_FIELD_GROUP_ID) != options_.end()) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Use either the field group or field IDs"); + } + + // Set default delay to 1 second + if (options_.find(OPTIONS_DELAY) == options_.end()) { + 
options_.insert({OPTIONS_DELAY, 1000}); + } + + // Set default count to max integer + if (options_.find(OPTIONS_COUNT) == options_.end()) { + options_.insert({OPTIONS_COUNT, std::numeric_limits::max()}); + } +} + +void RdciDmonSubSystem::show_help() const { + std::cout << " dmon -- Used to monitor GPUs and their stats.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci dmon [--host :port] [-u] -f " + << " -g \n"; + std::cout << " [-d ] [-c ]\n"; + std::cout << " rdci dmon [--host :port] [-u] -e " + << " -i \n"; + std::cout << " [-d ] [-c ]\n"; + std::cout << " rdci dmon [--host :port] [-u] -l \n"; + std::cout << "\nFlags:\n"; + show_common_usage(); + std::cout << " -f --field-group-id The field group " + << "to query on the specified host.\n"; + std::cout << " -g --group-id The GPU group to query " + << "on the specified host.\n"; + std::cout << " -c --count count Integer representing How" + << " many times to loop before exiting. [default = runs forever.]\n"; + std::cout << " -e --field-id fieldIds Comma-separated list " + << "of the field ids to monitor.\n"; + std::cout << " -i --gpu_index gpuIndexes Comma-separated list " + << "of the GPU index to monitor.\n"; + std::cout << " -d --delay delay How often to query RDC " + << "in milli seconds. 
[default = 1000 msec, " + << "Minimum value = 100 msec.]\n"; + std::cout << " -l --list List to look up the long " + << "names and descriptions of the field ids\n"; +} + +void RdciDmonSubSystem::create_temp_group() { + if (gpu_indexes_.size() == 0) { + return; + } + + const std::string group_name("rdci-dmon-group"); + rdc_gpu_group_t group_id; + rdc_status_t result = rdc_group_gpu_create(rdc_handle_, + RDC_GROUP_EMPTY, group_name.c_str(), &group_id); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to create the dmon group"); + } + need_cleanup_ = true; + + for (uint32_t i = 0; i < gpu_indexes_.size() ; i++) { + result = rdc_group_gpu_add(rdc_handle_, group_id, gpu_indexes_[i]); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to add " + + std::to_string(gpu_indexes_[i])+" to the dmon group."); + } + } + options_.insert({OPTIONS_GROUP_ID, group_id}); +} + + +void RdciDmonSubSystem::create_temp_field_group() { + if (field_ids_.size() == 0) { + return; + } + + const std::string field_group_name("rdci-dmon-field-group"); + rdc_field_grp_t group_id; + uint32_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; + for (uint32_t i = 0; i < field_ids_.size(); i++) { + field_ids[i] = field_ids_[i]; + } + + rdc_status_t result = rdc_group_field_create(rdc_handle_, + field_ids_.size(), &field_ids[0], field_group_name.c_str(), &group_id); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to create the dmon field group."); + } + + need_cleanup_ = true; + options_.insert({OPTIONS_FIELD_GROUP_ID, group_id}); +} + +void RdciDmonSubSystem::show_field_usage() const { + std::cout << "Supported fields Ids:\n"; + std::cout << "100 RDC_FI_GPU_SM_CLOCK: Current GPU clock frequencies.\n"; + std::cout << "150 RDC_FI_GPU_TEMP: GPU " + << "temperature in millidegrees Celcius.\n"; + std::cout << "155 RDC_FI_POWER_USAGE: Power usage in microwatts.\n"; + std::cout << "203 RDC_FI_GPU_UTIL: GPU busy percentage.\n"; + std::cout << "525 
RDC_FI_GPU_MEMORY_USAGE: Memory usage of the GPU " + << "instance in bytes.\n"; +} + +void RdciDmonSubSystem::process() { + if (dmon_ops_ == DMON_HELP || + dmon_ops_ == DMON_UNKNOWN) { + show_help(); + return; + } + + if (dmon_ops_ == DMON_LIST_FIELDS) { + show_field_usage(); + return; + } + + rdc_status_t result; + rdc_group_info_t group_info; + rdc_field_group_info_t field_info; + + // Create a temporary group/field if pass as GPU indexes or field ids + create_temp_group(); + create_temp_field_group(); + + result = rdc_group_gpu_get_info(rdc_handle_, + options_[OPTIONS_GROUP_ID], &group_info); + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); + } + result = rdc_group_field_get_info(rdc_handle_, + options_[OPTIONS_FIELD_GROUP_ID], &field_info); + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); + } + + // keep extra 1 minute data + double max_keep_age = options_[OPTIONS_DELAY]/1000.0 + 60; + const int max_keep_samples = 10; // keep only 10 samples + result = rdc_field_watch(rdc_handle_, + options_[OPTIONS_GROUP_ID], options_[OPTIONS_FIELD_GROUP_ID], + options_[OPTIONS_DELAY]*1000, max_keep_age, max_keep_samples); + need_cleanup_ = true; + std::cout << "GPU\t"; + for (uint32_t findex = 0; findex < field_info.count; findex++) { + std::cout << std::left << std::setw(20) + << field_id_string(field_info.field_ids[findex]); + } + std::cout << std::endl; + + for (uint32_t i = 0; i < options_[OPTIONS_COUNT]; i++) { + usleep(options_[OPTIONS_DELAY]*1000); + for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { + std::cout << group_info.entity_ids[gindex] << "\t"; + for (uint32_t findex = 0; findex < field_info.count; findex++) { + rdc_field_value value; + result = rdc_field_get_latest_value(rdc_handle_, + group_info.entity_ids[gindex], + field_info.field_ids[findex], &value); + if (result != RDC_ST_OK) { + std::cout << std::left << std::setw(20) << "error"; + } else { + if (value.type == 
INTEGER) { + std::cout << std::left << std::setw(20) + << value.value.l_int; + } else if (value.type == DOUBLE) { + std::cout << std::left << std::setw(20) + << value.value.dbl; + } else { + std::cout << std::left << std::setw(20) + << value.value.str; + } + } + + if (is_terminating_) { + clean_up(); + return; + } + } + std::cout << std::endl; + } + } + + clean_up(); +} + + +void RdciDmonSubSystem::clean_up() { + if (!need_cleanup_) { + return; + } + + // Not throw the errors in order to clean up all resources created + if (options_.find(OPTIONS_GROUP_ID) != options_.end() && + options_.find(OPTIONS_FIELD_GROUP_ID) != options_.end()) { + rdc_field_unwatch(rdc_handle_, options_[OPTIONS_GROUP_ID], + options_[OPTIONS_FIELD_GROUP_ID]); + } + + if (gpu_indexes_.size() != 0) { + auto group = options_.find(OPTIONS_GROUP_ID); + if (group != options_.end()) { + rdc_group_gpu_destroy(rdc_handle_, group->second); + } + } + + if (field_ids_.size() != 0) { + auto fgroup = options_.find(OPTIONS_FIELD_GROUP_ID); + if (fgroup != options_.end()) { + rdc_group_field_destroy(rdc_handle_, fgroup->second); + } + } + + need_cleanup_ = false; +} + +} // namespace rdc +} // namespace amd + + diff --git a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc new file mode 100644 index 0000000000..4d9b07d6c7 --- /dev/null +++ b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc @@ -0,0 +1,245 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include "RdciFieldGroupSubSystem.h" +#include +#include +#include "rdc_lib/rdc_common.h" +#include "common/rdc_utils.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcException.h" + +namespace amd { +namespace rdc { + +RdciFieldGroupSubSystem::RdciFieldGroupSubSystem(): + field_group_ops_(FIELD_GROUP_UNKNOWN) + , is_group_set_(false) { +} + +void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { + const int HOST_OPTIONS = 1000; + const struct option long_options[] = { + {"host", required_argument, nullptr, HOST_OPTIONS }, + {"help", optional_argument, nullptr, 'h' }, + {"list", optional_argument, nullptr, 'l' }, + {"group", required_argument, nullptr, 'g'}, + {"create", required_argument, nullptr, 'c' }, + {"fieldids", required_argument, nullptr, 'f'}, + {"info", optional_argument, nullptr, 'i' }, + {"delete", required_argument, nullptr, 'd' }, + { nullptr, 0 , nullptr, 0 } + }; + + int option_index = 0; + int opt = 0; + + while ((opt = getopt_long(argc, argv, "hlif:c:g:d:", + long_options, &option_index)) != -1) { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case 'h': + field_group_ops_ = FIELD_GROUP_HELP; + return; + case 'l': + field_group_ops_ = FIELD_GROUP_LIST; + break; + case 'f': + field_ids_ = optarg; + break; + case 'g': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "The group id needs to be a number"); + } + group_id_ = std::stoi(optarg); + is_group_set_ = true; + break; + case 'c': + field_group_ops_ = FIELD_GROUP_CREATE; + group_name_ = optarg; + break; + case 'i': + field_group_ops_ = FIELD_GROUP_INFO; + break; + case 'd': + field_group_ops_ = FIELD_GROUP_DELETE; + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "The group id needs to be a number"); + } + group_id_ = std::stoi(optarg); + is_group_set_ = true; + break; + default: + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Unknown command line options"); + } + } + + 
if (field_group_ops_ == FIELD_GROUP_UNKNOWN) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Must specify a valid operations"); + } +} + +void RdciFieldGroupSubSystem::show_help() const { + std::cout << " fieldgroup -- Used to create and maintain groups " + << "of field Ids.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci fieldgroup [--host :port] [-u] -l\n"; + std::cout << " rdci fieldgroup [--host :port] [-u] " + << "-c -f \n"; + std::cout << " rdci fieldgroup [--host :port] [-u] " + << "-g -i\n"; + std::cout << " rdci fieldgroup [--host :port] [-u] " + << "-d \n"; + std::cout << "\nFlags:\n"; + show_common_usage(); + std::cout << " -l --list " + << "List the field groups that currently exist for a host.\n"; + std::cout << " -g --group groupId " + << "The field group to query on the specified host.\n"; + std::cout << " -c --create groupName " + << "Create a field group on the remote host.\n"; + std::cout << " -f --fieldids fieldIds Comma-separated " + << "list of the field ids to add to a field group\n"; + std::cout << " -i --info " + << "Display the information for the specified group Id\n"; + std::cout << " -d --delete groupId " + << "Delete a field group on the remote host.\n"; +} + + +void RdciFieldGroupSubSystem::process() { + rdc_status_t result = RDC_ST_OK; + rdc_field_group_info_t group_info; + uint32_t count = 0; + switch (field_group_ops_) { + case FIELD_GROUP_HELP: + show_help(); + break; + case FIELD_GROUP_CREATE: + { + if (group_name_ == "") { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Must specify the group name when create a field group"); + } + std::vector fields = split_string(field_ids_, ','); + uint32_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; + for (uint32_t i = 0; i < fields.size(); i++) { + if (!IsNumber(fields[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The field Id "+fields[i]+" needs to be a number"); + } + field_ids[i] = std::stoi(fields[i]); + } + rdc_field_grp_t group_id; + result = 
rdc_group_field_create(rdc_handle_, fields.size(), + &field_ids[0], group_name_.c_str(), &group_id); + if (result == RDC_ST_OK) { + std::cout << "Successfully created a field group" + << " with a group ID " << group_id << std::endl; + return; + } + break; + } + case FIELD_GROUP_DELETE: + if (!is_group_set_) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Need to specify the group id to delete a group"); + } + result = rdc_group_field_destroy(rdc_handle_, group_id_); + if (result == RDC_ST_OK) { + std::cout << "Successfully deleted the field group " + << group_id_ << std::endl; + return; + } + break; + case FIELD_GROUP_LIST: + rdc_field_grp_t group_id_list[RDC_MAX_NUM_FIELD_GROUPS]; + result = rdc_group_field_get_all_ids( + rdc_handle_, group_id_list, &count); + if ( result != RDC_ST_OK) break; + + std::cout << count << " field group found.\n"; + std::cout << "GroupID\t" << "GroupName\t" << "FieldIds\n"; + for (uint32_t i = 0; i < count; i++) { + result = rdc_group_field_get_info( + rdc_handle_, group_id_list[i], &group_info); + if (result != RDC_ST_OK) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "Fail to get information for field group " + + std::to_string(group_id_list[i])); + } + + std::cout << group_id_list[i] << "\t" + << group_info.group_name << "\t\t"; + for (uint32_t j = 0; j < group_info.count; j++) { + std::cout << group_info.field_ids[j]; + if ( j < group_info.count -1 ) { + std::cout << ","; + } + } + std::cout << std::endl; + } + break; + case FIELD_GROUP_INFO: + if (!is_group_set_) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Need to specify the group id to show field group info"); + } + result = rdc_group_field_get_info( + rdc_handle_, group_id_, &group_info); + if (result == RDC_ST_OK) { + std::cout << "Group name: " << group_info.group_name + << std::endl; + std::cout << "Field Ids: "; + for (uint32_t i = 0; i < group_info.count; i++) { + std::cout << group_info.field_ids[i] << " "; + } + std::cout << std::endl; + 
return; + } + break; + default: + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command"); + } + + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); + } +} + + +} // namespace rdc +} // namespace amd + + diff --git a/projects/rdc/rdci/src/RdciGroupSubSystem.cc b/projects/rdc/rdci/src/RdciGroupSubSystem.cc new file mode 100644 index 0000000000..e8f58879cb --- /dev/null +++ b/projects/rdc/rdci/src/RdciGroupSubSystem.cc @@ -0,0 +1,260 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+#include "RdciGroupSubSystem.h"
+#include <getopt.h>
+#include <unistd.h>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "common/rdc_utils.h"
+#include "rdc_lib/rdc_common.h"
+#include "rdc/rdc.h"
+#include "rdc_lib/RdcException.h"
+
+namespace amd {
+namespace rdc {
+
+namespace {
+
+// Converts decimal text to an int. Returns false, leaving *value untouched,
+// when IsNumber() rejects the text or when std::stoi throws (for example
+// std::out_of_range for a digit string that does not fit in an int).
+bool to_int(const std::string& text, int* value) {
+    if (!IsNumber(text)) {
+        return false;
+    }
+    try {
+        *value = std::stoi(text);
+    } catch (const std::logic_error&) {
+        return false;
+    }
+    return true;
+}
+
+}  // namespace
+
+// Starts with no operation selected and no group id supplied.
+RdciGroupSubSystem::RdciGroupSubSystem():
+    group_ops_(GROUP_UNKNOWN)
+    , is_group_set_(false) {
+}
+
+// Parses the "rdci group" command line into group_ops_ and its operands
+// (ip_port_, group_id_, group_name_, gpu_ids_).
+// Throws RdcException(RDC_ST_BAD_PARAMETER) for an unknown option, for a
+// group id that is not a number fitting in an int, and when no operation
+// was selected. The usage text is printed before each of these throws.
+// NOTE(review): show_help() advertises -u, but neither the short nor the
+// long option table below accepts it; confirm where that flag is handled.
+void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
+    const int HOST_OPTIONS = 1000;
+    // --help, --list and --info take no value, matching the "hli" entries
+    // of the short option string passed to getopt_long() below.
+    const struct option long_options[] = {
+        {"host", required_argument, nullptr, HOST_OPTIONS },
+        {"help", no_argument, nullptr, 'h' },
+        {"list", no_argument, nullptr, 'l' },
+        {"group", required_argument, nullptr, 'g'},
+        {"create", required_argument, nullptr, 'c' },
+        {"add", required_argument, nullptr, 'a' },
+        {"info", no_argument, nullptr, 'i' },
+        {"delete", required_argument, nullptr, 'd' },
+        { nullptr, 0 , nullptr, 0 }
+    };
+
+    // Shared by -g and -d: record the group id or reject the option.
+    const auto set_group_id = [this](const char* text) {
+        int id = 0;
+        if (!to_int(text, &id)) {
+            show_help();
+            throw RdcException(RDC_ST_BAD_PARAMETER,
+                    "The group id needs to be a number");
+        }
+        group_id_ = id;
+        is_group_set_ = true;
+    };
+
+    int option_index = 0;
+    int opt = 0;
+
+    while ((opt = getopt_long(argc, argv, "hlic:g:a:d:",
+                long_options, &option_index)) != -1) {
+        switch (opt) {
+            case HOST_OPTIONS:
+                ip_port_ = optarg;
+                break;
+            case 'h':
+                // Help wins over anything that follows on the command line.
+                group_ops_ = GROUP_HELP;
+                return;
+            case 'l':
+                group_ops_ = GROUP_LIST;
+                break;
+            case 'g':
+                set_group_id(optarg);
+                break;
+            case 'c':
+                group_ops_ = GROUP_CREATE;
+                group_name_ = optarg;
+                break;
+            case 'a':
+                group_ops_ = GROUP_ADD_GPUS;
+                gpu_ids_ = optarg;
+                break;
+            case 'i':
+                group_ops_ = GROUP_INFO;
+                break;
+            case 'd':
+                group_ops_ = GROUP_DELETE;
+                set_group_id(optarg);
+                break;
+            default:
+                show_help();
+                throw RdcException(RDC_ST_BAD_PARAMETER,
+                        "Unknown command line options");
+        }
+    }
+
+    if (group_ops_ == GROUP_UNKNOWN) {
+        show_help();
+        throw RdcException(RDC_ST_BAD_PARAMETER,
+                "Must specify a valid operation");
+    }
+}
+
+// Prints the usage text of the group subsystem to stdout.
+void RdciGroupSubSystem::show_help() const {
+    std::cout << " group -- Used to create and maintain groups of GPUs.\n\n";
+    std::cout << "Usage\n";
+    std::cout << "  rdci group [--host <IP/FQDN>:port] [-u] -l\n";
+    std::cout << "  rdci group [--host <IP/FQDN>:port] [-u] -c <groupName>\n";
+    std::cout << "  rdci group [--host <IP/FQDN>:port] [-u] -g <groupId> "
+        << "[-a <gpuIndexes>]\n";
+    std::cout << "  rdci group [--host <IP/FQDN>:port] [-u] "
+        << "-g <groupId> [-i]\n";
+    std::cout << "  rdci group [--host <IP/FQDN>:port] [-u] -d <groupId>\n";
+    std::cout << "\nFlags:\n";
+    show_common_usage();
+    std::cout << "  -l  --list                     "
+        << "List the groups that currently exist for a host.\n";
+    std::cout << "  -g  --group      groupId       "
+        << "The GPU group to query on the specified host.\n";
+    std::cout << "  -c  --create     groupName     "
+        << "Create a group on the remote host.\n";
+    std::cout << "  -a  --add        gpuIndexes    "
+        << "Comma-separated list of the GPU indexes to add to the group.\n";
+    std::cout << "  -i  --info                     "
+        << "Display the information for the specified group Id\n";
+    std::cout << "  -d  --delete     groupId       "
+        << "Delete a group on the remote host.\n";
+}
+
+// Runs the operation selected by parse_cmd_opts() against rdc_handle_ and
+// prints the outcome to stdout. Throws RdcException when a required operand
+// is missing or malformed, or when an RDC call reports a failure; in the
+// latter case the exception carries the status returned by that call.
+void RdciGroupSubSystem::process() {
+    rdc_status_t result = RDC_ST_OK;
+    rdc_group_info_t group_info;
+    switch (group_ops_) {
+        case GROUP_HELP:
+            show_help();
+            break;
+        case GROUP_CREATE: {
+            if (group_name_.empty()) {
+                show_help();
+                throw RdcException(RDC_ST_BAD_PARAMETER,
+                    "Must specify the group name when creating a group");
+            }
+            rdc_gpu_group_t group_id;
+            result = rdc_group_gpu_create(rdc_handle_, RDC_GROUP_EMPTY,
+                    group_name_.c_str(), &group_id);
+            if (result == RDC_ST_OK) {
+                std::cout << "Successfully created group with a group ID "
+                    << group_id << std::endl;
+                return;
+            }
+            break;
+        }
+        case GROUP_DELETE:
+            if (!is_group_set_) {
+                show_help();
+                throw RdcException(RDC_ST_BAD_PARAMETER,
+                    "Need to specify the group id to delete a group");
+            }
+            result = rdc_group_gpu_destroy(rdc_handle_, group_id_);
+            if (result == RDC_ST_OK) {
+                std::cout << "Successfully deleted the group "
+                    << group_id_ << std::endl;
+                return;
+            }
+            break;
+        case GROUP_LIST: {
+            rdc_gpu_group_t group_id_list[RDC_MAX_NUM_GROUPS];
+            uint32_t count = 0;
+            result = rdc_group_get_all_ids(rdc_handle_, group_id_list, &count);
+            if (result != RDC_ST_OK) break;
+
+            std::cout << count << " group found.\n";
+            std::cout << "GroupID\t" << "GroupName\t" << "GPUIndex\n";
+            for (uint32_t i = 0; i < count; i++) {
+                result = rdc_group_gpu_get_info(rdc_handle_,
+                    group_id_list[i], &group_info);
+                if (result != RDC_ST_OK) {
+                    // Report the status the library actually returned.
+                    throw RdcException(result,
+                        "Fail to get information for group " +
+                        std::to_string(group_id_list[i]));
+                }
+
+                std::cout << group_id_list[i] << "\t"
+                    << group_info.group_name << "\t\t";
+                // Comma-separated, without a trailing comma.
+                for (uint32_t j = 0; j < group_info.count; j++) {
+                    std::cout << (j == 0 ? "" : ",")
+                        << group_info.entity_ids[j];
+                }
+                std::cout << std::endl;
+            }
+            break;
+        }
+        case GROUP_ADD_GPUS: {
+            if (!is_group_set_) {
+                show_help();
+                throw RdcException(RDC_ST_BAD_PARAMETER,
+                    "Need to specify the group id to add GPUs to a group");
+            }
+
+            const std::vector<std::string> gpu_ids =
+                split_string(gpu_ids_, ',');
+            // Without this check an empty list would fall through to the
+            // success message although nothing was added.
+            if (gpu_ids.empty()) {
+                throw RdcException(RDC_ST_BAD_PARAMETER,
+                    "Must specify at least one GPU index to add");
+            }
+            // Convert every index before adding any, so that a malformed
+            // entry cannot leave the group partially updated.
+            std::vector<int> gpu_indexes(gpu_ids.size(), 0);
+            for (size_t i = 0; i < gpu_ids.size(); i++) {
+                if (!to_int(gpu_ids[i], &gpu_indexes[i])) {
+                    throw RdcException(RDC_ST_BAD_PARAMETER,
+                        "The GPU Id "+gpu_ids[i]+" needs to be a number");
+                }
+            }
+            for (size_t i = 0; i < gpu_ids.size(); i++) {
+                result = rdc_group_gpu_add(rdc_handle_,
+                    group_id_, gpu_indexes[i]);
+                if (result != RDC_ST_OK) {
+                    throw RdcException(result, "Fail to add GPU " +
+                        gpu_ids[i] + " to the group");
+                }
+            }
+            std::cout << "Successfully added the GPU " << gpu_ids_
+                << " to group "<< group_id_ << std::endl;
+            return;
+        }
+        case GROUP_INFO:
+            if (!is_group_set_) {
+                show_help();
+                throw RdcException(RDC_ST_BAD_PARAMETER,
+                    "Need to specify the group id to show group info");
+            }
+            result = rdc_group_gpu_get_info(rdc_handle_,
+                    group_id_, &group_info);
+            if (result == RDC_ST_OK) {
+                std::cout << "Group name: "
+                    << group_info.group_name << std::endl;
+                std::cout << "Gpu indexes: ";
+                for (uint32_t i = 0; i < group_info.count; i++) {
+                    std::cout << group_info.entity_ids[i] << " ";
+                }
+                std::cout << std::endl;
+                return;
+            }
+            break;
+        default:
+            throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command");
+    }
+
+    // Any operation that fell out of the switch with a failure status is
+    // reported with the library's own description of that status.
+    if (result != RDC_ST_OK) {
+        throw RdcException(result, rdc_status_string(result));
+    }
+}
+
+}  // namespace rdc
+}  // namespace amd
diff --git a/projects/rdc/rdci/src/RdciSubSystem.cc b/projects/rdc/rdci/src/RdciSubSystem.cc
index 70f73b8a60..f52d8de544 100644
--- a/projects/rdc/rdci/src/RdciSubSystem.cc
+++ b/projects/rdc/rdci/src/RdciSubSystem.cc
@@ -20,6 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 #include "RdciSubSystem.h"
+#include <sstream>
 #include "rdc_lib/RdcException.h"
 #include "common/rdc_utils.h"
 
@@ -39,6 +40,20 @@ RdciSubSystem::RdciSubSystem():
     }
 }
 
+// Splits s at every occurrence of delimiter and returns the pieces in order.
+// Empty pieces between adjacent delimiters are kept; an empty input and a
+// trailing delimiter do not produce a final empty piece (std::getline).
+std::vector<std::string> RdciSubSystem::split_string(const std::string& s,
+        char delimiter) const {
+    std::vector<std::string> tokens;
+    std::string token;
+    std::istringstream tokenStream(s);
+    while (std::getline(tokenStream, token, delimiter)) {
+        tokens.push_back(token);
+    }
+    return tokens;
+}
+
 void RdciSubSystem::connect() {
     rdc_status_t status;
 
diff --git a/projects/rdc/rdci/src/rdci.cc b/projects/rdc/rdci/src/rdci.cc
index 967018b29d..6f53de0c63 100644
--- a/projects/rdc/rdci/src/rdci.cc
+++ b/projects/rdc/rdci/src/rdci.cc
@@ -26,6 +26,9 @@ THE SOFTWARE.
#include "rdc/rdc.h" #include "rdc_lib/RdcException.h" #include "RdciDiscoverySubSystem.h" +#include "RdciDmonSubSystem.h" +#include "RdciFieldGroupSubSystem.h" +#include "RdciGroupSubSystem.h" int main(int argc, char ** argv) { @@ -43,6 +46,12 @@ int main(int argc, char ** argv) { amd::rdc::RdciSubSystemPtr subsystem; if (subsystem_name == "discovery") { subsystem.reset(new amd::rdc::RdciDiscoverySubSystem()); + } else if (subsystem_name == "dmon") { + subsystem.reset(new amd::rdc::RdciDmonSubSystem()); + } else if (subsystem_name == "group") { + subsystem.reset(new amd::rdc::RdciGroupSubSystem()); + } else if (subsystem_name == "fieldgroup") { + subsystem.reset(new amd::rdc::RdciFieldGroupSubSystem()); } else { std::cout << usage_help; exit(0);