From cff2ac849099056deeeed5efb2c6aae6103983b2 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 16 May 2024 03:38:25 -0500 Subject: [PATCH] Add rocprofiler_example.cc and fix logging Change-Id: Ib3ed8754f314edc76ea56bfec9a645d720f8926d Signed-off-by: Galantsev, Dmitrii [ROCm/rdc commit: c7fcb1ad255c00707c43fd404fec34418ad2b4ab] --- projects/rdc/example/CMakeLists.txt | 6 + projects/rdc/example/rocprofiler_example.cc | 261 ++++++++++++++++++ .../rdc_modules/rdc_rocp/RdcRocpBase.cc | 29 +- 3 files changed, 277 insertions(+), 19 deletions(-) create mode 100644 projects/rdc/example/rocprofiler_example.cc diff --git a/projects/rdc/example/CMakeLists.txt b/projects/rdc/example/CMakeLists.txt index 959d0913d2..01f251d1dd 100755 --- a/projects/rdc/example/CMakeLists.txt +++ b/projects/rdc/example/CMakeLists.txt @@ -107,6 +107,12 @@ set(DIAGNOSTIC_EXAMPLE_EXE "diagnostic") add_executable(${DIAGNOSTIC_EXAMPLE_EXE} "${DIAGNOSTIC_EXAMPLE_SRC_LIST}") target_link_libraries(${DIAGNOSTIC_EXAMPLE_EXE} pthread dl rdc_bootstrap) +set(ROCPROFILER_EXAMPLE_SRC_LIST "rocprofiler_example.cc") +cmake_print_variables(ROCPROFILER_EXAMPLE_SRC_LIST) +set(ROCPROFILER_EXAMPLE_EXE "rocprofiler") +add_executable(${ROCPROFILER_EXAMPLE_EXE} "${ROCPROFILER_EXAMPLE_SRC_LIST}") +target_link_libraries(${ROCPROFILER_EXAMPLE_EXE} pthread dl rdc_bootstrap) + message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") message(" Finished Cmake Example ") message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") diff --git a/projects/rdc/example/rocprofiler_example.cc b/projects/rdc/example/rocprofiler_example.cc new file mode 100644 index 0000000000..561cda2a41 --- /dev/null +++ b/projects/rdc/example/rocprofiler_example.cc @@ -0,0 +1,261 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <unistd.h>
+
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "rdc/rdc.h"
+
+rdc_handle_t rdc_handle;
+rdc_status_t result;
+
+// Renders a field value as text based on value.type; other types yield "UNKNOWN".
+// Returns std::string by value: a string_view of the std::to_string() temporary would dangle.
+std::string value_to_string(rdc_field_value value) {
+  switch (value.type) {
+    case INTEGER:
+      return std::to_string(value.value.l_int);
+    case DOUBLE:
+      return std::to_string(value.value.dbl);
+    case STRING:
+      return value.value.str;
+    default:
+      return "UNKNOWN";
+  }
+}
+
+// Stops the embedded RDC engine, shuts RDC down and returns the last recorded status.
+rdc_status_t cleanup() {
+  std::cout << "Cleaning up.\n";
+  rdc_stop_embedded(rdc_handle);
+  rdc_shutdown();
+  return result;
+}
+
+// Runs the embedded-mode example end to end; the last RDC status becomes the return value.
+int run() {
+  char group_name[] = {"group1"};
+  char field_group_name[] = {"fieldgroup1"};
+  uint64_t since_timestamp = 0;
+  uint64_t next_timestamp = 0;
+  uint64_t start_timestamp = 0;
+  uint32_t count = 0;
+
+  std::cout << "Starting rdci in Embedded mode\n";
+
+  // Init the rdc
+  result = rdc_init(0);
+
+  if (result != RDC_ST_OK) {
+    std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
+    return cleanup();
+  }
+  std::cout << "RDC Initialized.\n";
+
+  result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result)
+              << std::endl;
+    return cleanup();
+  }
+
+  // Get the list of devices in the system
+  uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
+  result = rdc_device_get_all(rdc_handle, gpu_index_list, &count);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error to find devices on the system. Return: " << rdc_status_string(result);
+    return cleanup();
+  }
+  if (count == 0) {
+    std::cout << "No GPUs find on the sytem ";
+    return cleanup();
+  }
+  std::cout << count << " GPUs found in the system.\n";
+
+  // Create the group
+  rdc_gpu_group_t group_id = 0;
+  result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error creating group. Return: " << rdc_status_string(result);
+    return cleanup();
+  }
+  std::cout << "Created the GPU group " << group_id << std::endl;
+
+  // Only add one GPU
+  count = 1;
+  for (uint32_t i = 0; i < count; i++) {
+    result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]);  // Add GPU 0
+    if (result != RDC_ST_OK) {
+      std::cout << "Error adding group. Return: " << rdc_status_string(result);
+      return cleanup();
+    }
+    rdc_device_attributes_t attribute;
+    result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute);
+    if (result != RDC_ST_OK) {
+      std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result);
+      return cleanup();
+    }
+    std::cout << "Add GPU " << gpu_index_list[i] << ":" << attribute.device_name << " to group "
+              << group_id << std::endl;
+  }
+
+  // Create a field group to monitor multiple fields
+  rdc_field_grp_t field_group_id = 0;
+  std::vector<rdc_field_t> field_ids{};
+
+  field_ids.push_back(RDC_FI_GPU_MEMORY_USAGE);
+  field_ids.push_back(RDC_FI_POWER_USAGE);
+  field_ids.push_back(RDC_FI_PROF_CU_OCCUPANCY);
+  field_ids.push_back(RDC_FI_PROF_CU_UTILIZATION);
+  result = rdc_group_field_create(rdc_handle, field_ids.size(), field_ids.data(), field_group_name,
+                                  &field_group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error create field group, Return: " << rdc_status_string(result);
+    return cleanup();
+  }
+  std::cout << "Created the field group " << field_group_id << "\n";
+  std::cout << "fields: ";
+  for (auto field_id : field_ids) {
+    std::cout << "- " << field_id << "\n";
+  }
+
+  // Ask RDC to watch the fields and groups. The fields will be updated
+  // once per second, the max keep age is 1 minute and only 10 samples are kept.
+  result = rdc_field_watch(rdc_handle, group_id, field_group_id,
+                           static_cast<uint64_t>(1) * 1000 * 1000, 60, 10);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error watch group fields. Return: " << rdc_status_string(result);
+    return cleanup();
+  }
+  std::cout << "Start to watch group:" << group_id << ", field_group:" << field_group_id
+            << std::endl;
+  std::cout << "Sleep a few seconds before retreive the data ...\n";
+
+  // Since we are running the RDC_OPERATION_MODE_AUTO mode, the rdc_update_
+  // all_fields() will be called periodically at background. If running as
+  // RDC_OPERATION_MODE_MANUAL mode, we must call rdc_field_update_all()
+  // periodically to take samples.
+  usleep(5 * 1000 * 1000);  // sleep 5 seconds before fetch the stats
+
+  // Retrieve the field and group information from RDC
+  rdc_group_info_t group_info;
+  rdc_field_group_info_t field_info;
+  result = rdc_group_gpu_get_info(rdc_handle, group_id, &group_info);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error get gpu group info. Return: " << rdc_status_string(result);
+    return cleanup();
+  }
+  result = rdc_group_field_get_info(rdc_handle, field_group_id, &field_info);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error get field group info. Return: " << rdc_status_string(result);
+    return cleanup();
+  }
+
+  // Get the latest metrics
+  std::cout << "Get the latest metrics for group:" << group_id << " field_group:" << field_group_id
+            << std::endl;
+  std::cout << "time_stamp\t"
+            << "GPU_index\t"
+            << "field_name\t\t"
+            << "field_value\n";
+  for (uint32_t gindex = 0; gindex < group_info.count; gindex++) {
+    for (uint32_t findex = 0; findex < field_info.count; findex++) {
+      rdc_field_value value;
+      result = rdc_field_get_latest_value(rdc_handle, group_info.entity_ids[gindex],
+                                          field_info.field_ids[findex], &value);
+      if (result == RDC_ST_NOT_FOUND) {
+        continue;
+      }
+      if (result != RDC_ST_OK) {
+        std::cout << "Error get least value. Return: " << rdc_status_string(result);
+        return cleanup();
+      }
+      // value_to_string() renders integer, double and string values
+      std::cout << value.ts << "\t" << group_info.entity_ids[gindex] << "\t\t" << std::left
+                << std::setw(16) << field_id_string(value.field_id) << "\t"
+                << value_to_string(value) << std::endl;
+    }
+  }
+
+  // Stop watching the field group
+  result = rdc_field_unwatch(rdc_handle, group_id, field_group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error stop watch fields. Return: " << rdc_status_string(result);
+    return cleanup();
+  }
+  std::cout << "Stop watch group:" << group_id << ", field_group:" << field_group_id << std::endl;
+
+  // Get the history data last 10 seconds
+  std::cout << "Get last 10 seconds metrics for group:" << group_id
+            << " field_group:" << field_group_id << std::endl;
+  std::cout << "time_stamp\t"
+            << "GPU_index\t"
+            << "field_name\t\t"
+            << "field_value\n";
+  start_timestamp = static_cast<uint64_t>(time(nullptr) - 10) * 1000;
+  for (uint32_t gindex = 0; gindex < group_info.count; gindex++) {
+    for (uint32_t findex = 0; findex < field_info.count; findex++) {
+      since_timestamp = start_timestamp;
+      while (true) {
+        rdc_field_value value;
+        result = rdc_field_get_value_since(rdc_handle, group_info.entity_ids[gindex],
+                                           field_info.field_ids[findex], since_timestamp,
+                                           &next_timestamp, &value);
+        if (result == RDC_ST_NOT_FOUND) {
+          break;
+        }
+        if (result != RDC_ST_OK) {
+          std::cout << "Error get history data. Return: " << rdc_status_string(result);
+          return cleanup();
+        }
+        std::cout << value.ts << "\t" << group_info.entity_ids[gindex] << "\t\t" << std::left
+                  << std::setw(16) << field_id_string(value.field_id) << "\t"
+                  << value_to_string(value) << std::endl;
+        since_timestamp = next_timestamp;
+      }  // while
+    }    // for findex
+  }      // for gindex
+
+  // Delete the field group and GPU group
+  result = rdc_group_field_destroy(rdc_handle, field_group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error delete field group. Return: " << rdc_status_string(result);
+    return cleanup();
+  }
+  std::cout << "Deleted the field group " << field_group_id << std::endl;
+
+  result = rdc_group_gpu_destroy(rdc_handle, group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error delete GPU group. Return: " << rdc_status_string(result);
+    return cleanup();
+  }
+  std::cout << "Deleted the GPU group " << group_id << std::endl;
+
+  return cleanup();
+}
+
+int main(int, char**) { return run(); }
diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index 14f91d26af..0cf2a892ee 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -107,11 +107,15 @@ bool createHsaQueue(hsa_queue_t** queue, hsa_agent_t gpu_agent) { hsa_status_t status; status = hsa_queue_create(gpu_agent, 64, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, queue); - if (status != HSA_STATUS_SUCCESS) fprintf(stdout, "Queue creation failed"); + if (status != HSA_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Queue creation failed"); + } // TODO: warning: is it really required!! ?? status = hsa_amd_queue_set_priority(*queue, HSA_AMD_QUEUE_PRIORITY_HIGH); - if (status != HSA_STATUS_SUCCESS) fprintf(stdout, "HSA Queue Priority Set Failed"); + if (status != HSA_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "HSA Queue Priority Set Failed"); + } return (status == HSA_STATUS_SUCCESS); } @@ -140,11 +144,10 @@ int RdcRocpBase::run_profiler(const char* feature_name) { int mode = (ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_SINGLEGROUP); hsa_errno = rocprofiler_open(agent_arr.agents[i], features[i], features_count, &contexts[i], mode, &properties); - const char* error_string; + const char* error_string = nullptr; rocprofiler_error_string(&error_string); if (error_string != NULL) { - fprintf(stdout, "%s", error_string); - fflush(stdout); + RDC_LOG(RDC_ERROR, error_string); } assert(hsa_errno == HSA_STATUS_SUCCESS); } @@ -155,6 +158,7 @@ int RdcRocpBase::run_profiler(const char* feature_name) { } // this is the duration for which the counter increments from zero.
+ // TODO: Return error if sampling interval is lower than this value usleep(10000); for (int i = 0; i < dev_count; ++i) { @@ -163,8 +167,6 @@ int RdcRocpBase::run_profiler(const char* feature_name) { } for (int i = 0; i < dev_count; ++i) { - // printf("Iteration %d\n", loopcount++); - // fprintf(stdout, "------ Collecting Device[%d] -------\n", i); read_features(contexts[i], features_count); } @@ -205,20 +207,12 @@ RdcRocpBase::RdcRocpBase() { // populate monitored fields std::cout << "Size of counter_map_k: " << counter_map_k.size() << "\n"; - for (auto& k : counter_map_k) { - printf("metric %d = %s\n", k.first, k.second); - } for (auto& [k, v] : counter_map_k) { const char* str = v; metrics.emplace(std::make_pair(str, 0.0)); } assert(metrics.size() == counter_map_k.size()); - printf("Metric size %d\n", (int)metrics.size()); - for (auto& metric : metrics) { - printf("Metric: %s\n", metric.first); - } - hsa_status_t err = hsa_init(); if (err != HSA_STATUS_SUCCESS) { const char* errstr = nullptr; @@ -231,22 +225,19 @@ RdcRocpBase::RdcRocpBase() { if (errcode != 0) { return; } - printf("number of devices: %u\n", agent_arr.count); - printf("devices being profiled: %u\n", dev_count); for (int i = 0; i < dev_count; ++i) { int j = 0; for (auto& metric : metrics) { features[i][j].kind = (rocprofiler_feature_kind_t)ROCPROFILER_FEATURE_KIND_METRIC; features[i][j].name = metric.first; - printf("Metric[%d]: %s\n", j, features[i][j].name); j++; } } for (int i = 0; i < dev_count; ++i) { if (!createHsaQueue(&queues[i], agent_arr.agents[i])) { - fprintf(stdout, "can't create queues[%d]\n", i); + RDC_LOG(RDC_ERROR, "can't create queues[" << i << "]\n"); } } }