From 46cb82b75fe8a8cf766d8f7fc79175fdbed1f8a8 Mon Sep 17 00:00:00 2001 From: venkat1361 Date: Mon, 11 Nov 2024 07:07:28 -0600 Subject: [PATCH] Dimension support for reduce operator (#1147) * cache reference nodes * evaluation based on dim args * format * add dimensions for reduce operator * add dimensions for reduce operator * add dimensions for reduce operator docs * add dimensions for reduce operator. * refactor switch cases * Update CHANGELOG.md * updated doc with data example * updated doc with data example for reduce operation. * added fallthrough in switch case sum. * changelog.md * format * fix bug in constuct_test_data() [ROCm/rocprofiler-sdk commit: 472907a57646137fe6f93cdd99b79431dd38bd95] --- projects/rocprofiler-sdk/CHANGELOG.md | 2 + .../counter_collection_services.md | 51 +++++- .../rocprofiler-sdk/counters/evaluate_ast.cpp | 165 ++++++++++++------ .../counters/tests/evaluate_ast_test.cpp | 140 +++++++++++++++ 4 files changed, 294 insertions(+), 64 deletions(-) diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md index 9c5bb695f0..e25e51e0fe 100644 --- a/projects/rocprofiler-sdk/CHANGELOG.md +++ b/projects/rocprofiler-sdk/CHANGELOG.md @@ -142,6 +142,8 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec ### Added - Added support for select() operation in counter expression. +- Added reduce operation for counter expression wrt dimension. 
+ ### Changed ### Resolved issues diff --git a/projects/rocprofiler-sdk/source/docs/api-reference/counter_collection_services.md b/projects/rocprofiler-sdk/source/docs/api-reference/counter_collection_services.md index 1b89d7f152..85f3471d12 100644 --- a/projects/rocprofiler-sdk/source/docs/api-reference/counter_collection_services.md +++ b/projects/rocprofiler-sdk/source/docs/api-reference/counter_collection_services.md @@ -289,6 +289,50 @@ Expression: 100*reduce(GL2C_HIT,sum)/(reduce(GL2C_HIT,sum)+reduce(GL2C_MISS,sum) The reduce function reduces counter values across all dimensions such as shader engine, SIMD, and so on, to produce a single output value. This helps to collect and compare values across the entire device. Here are the common reduction operations: +- `sum`: Sums to create a single output. For example, `reduce(GL2C_HIT,sum)` sums all `GL2C_HIT` hardware register values. +- `avr`: Calculates the average across all dimensions. +- `min`: Selects the minimum value across all dimensions. +- `max`: Selects the maximum value across all dimensions. + +```yaml +expression: reduce(X,sum,[DIMENSION_XCC]) +``` +`reduce()` also supports dimension-wise reduction when a list of dimensions is provided as the third parameter. In the expression above, if `X` has three dimensions `DIMENSION_XCC`, `DIMENSION_SHADER_ARRAY`, and `DIMENSION_WGP`, the reduction combines the counter values that share the same `DIMENSION_SHADER_ARRAY` and `DIMENSION_WGP` indices, as shown below. + +Assume the sizes of the XCC, SHADER_ARRAY (SH), and WGP dimensions are 2, 4, and 4, respectively. 
+ +Raw Counter Data in 3D space: + +#### XCC[0]: +| |WGP[0]|WGP[1]|WGP[2]|WGP[3]| +|-------|------|------|------|------| +| SH[0] | 1 | 2 | 3 | 4 | +| SH[1] | 5 | 6 | 7 | 8 | +| SH[2] | 9 | 10 | 11 | 12 | +| SH[3] | 13 | 14 | 15 | 16 | + +#### XCC[1]: +| |WGP[0]|WGP[1]|WGP[2]|WGP[3]| +|-------|------|------|------|------| +| SH[0] | 1 | 2 | 3 | 4 | +| SH[1] | 5 | 6 | 7 | 8 | +| SH[2] | 9 | 10 | 11 | 12 | +| SH[3] | 13 | 14 | 15 | 16 | + +Reducing the XCC dimension with `sum` results in a 2D space with only the WGP and SH dimensions. + +| |WGP[0]|WGP[1]|WGP[2]|WGP[3]| +|-------|------|------|------|------| +| SH[0] | 2 | 4 | 6 | 8 | +| SH[1] | 10 | 12 | 14 | 16 | +| SH[2] | 18 | 20 | 22 | 24 | +| SH[3] | 26 | 28 | 30 | 32 | + +Similarly, `reduce(X,sum,[DIMENSION_XCC,DIMENSION_SHADER_ARRAY])` results in only the WGP dimension. + +| |WGP[0]|WGP[1]|WGP[2]|WGP[3]| +|-------|------|------|------|------| +| | 56 | 64 | 72 | 80 | ### Select Function @@ -333,13 +377,6 @@ similarly, for `select(Y, [DIMENSION_XCC=[0],DIMENSION_SHADER_ENGINE=[2]])` resu |-------|------|------|------|------| | | 9 | 10 | 11 | 12 | -### Accumulate Function - -- `sum`: Sums to create a single output. For example, `reduce(GL2C_HIT,sum)` sums all `GL2C_HIT` hardware register values. -- `avr`: Calculates the average across all dimensions. -- `min`: Selects minimum value across all dimensions. -- `max`: Selects the maximum value across all dimensions. 
- ### Accumulate function ```yaml diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp index 69b41f8448..9adb5ba38f 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp @@ -60,21 +60,17 @@ get_reduce_op_type_from_string(const std::string& op) return type; } -std::vector* -perform_reduction(ReduceOperation reduce_op, std::vector* input_array) +void +perform_reduction_to_single_instance(ReduceOperation reduce_op, + std::vector* input_array, + rocprofiler_record_counter_t* result) { - rocprofiler_record_counter_t result{.id = 0, - .counter_value = 0, - .dispatch_id = 0, - .user_data = {.value = 0}, - .agent_id = {.handle = 0}}; - if(input_array->empty()) return input_array; switch(reduce_op) { case REDUCE_NONE: break; case REDUCE_MIN: { - result = + *result = *std::min_element(input_array->begin(), input_array->end(), [](auto& a, auto& b) { return a.counter_value < b.counter_value; }); @@ -82,55 +78,92 @@ perform_reduction(ReduceOperation reduce_op, std::vectorbegin(), input_array->end(), [](auto& a, auto& b) { return a.counter_value < b.counter_value; }); break; } - case REDUCE_SUM: - { - result = std::accumulate(input_array->begin(), - input_array->end(), - rocprofiler_record_counter_t{.id = 0, - .counter_value = 0, - .dispatch_id = 0, - .user_data = {.value = 0}, - .agent_id = {.handle = 0}}, - [](auto& a, auto& b) { - return rocprofiler_record_counter_t{ - .id = a.id, - .counter_value = a.counter_value + b.counter_value, - .dispatch_id = a.dispatch_id, - .user_data = {.value = 0}, - .agent_id = {.handle = 0}}; - }); - break; - } + case REDUCE_SUM: [[fallthrough]]; case REDUCE_AVG: { - result = std::accumulate(input_array->begin(), - input_array->end(), - rocprofiler_record_counter_t{.id = 0, - .counter_value = 0, - .dispatch_id 
= 0, - .user_data = {.value = 0}, - .agent_id = {.handle = 0}}, - [](auto& a, auto& b) { - return rocprofiler_record_counter_t{ - .id = a.id, - .counter_value = a.counter_value + b.counter_value, - .dispatch_id = a.dispatch_id, - .user_data = {.value = 0}, - .agent_id = {.handle = 0}}; - }); - result.counter_value /= input_array->size(); + *result = std::accumulate(input_array->begin(), + input_array->end(), + rocprofiler_record_counter_t{.id = 0, + .counter_value = 0, + .dispatch_id = 0, + .user_data = {.value = 0}, + .agent_id = {.handle = 0}}, + [](auto& a, auto& b) { + return rocprofiler_record_counter_t{ + .id = a.id, + .counter_value = a.counter_value + b.counter_value, + .dispatch_id = a.dispatch_id, + .user_data = {.value = 0}, + .agent_id = {.handle = 0}}; + }); + if(reduce_op == REDUCE_AVG) + { + (*result).counter_value /= input_array->size(); + } break; } } +} + +std::vector* +perform_reduction( + ReduceOperation reduce_op, + std::vector* input_array, + const std::unordered_set& _reduce_dimension_set) +{ + if(input_array->empty()) return input_array; + if(_reduce_dimension_set.empty() || + _reduce_dimension_set.size() == ROCPROFILER_DIMENSION_LAST - 1) + { + rocprofiler_record_counter_t result{.id = 0, + .counter_value = 0, + .dispatch_id = 0, + .user_data = {.value = 0}, + .agent_id = {.handle = 0}}; + perform_reduction_to_single_instance(reduce_op, input_array, &result); + input_array->clear(); + input_array->push_back(result); + set_dim_in_rec(input_array->begin()->id, ROCPROFILER_DIMENSION_NONE, 0); + return input_array; + } + + std::unordered_map> rec_groups; + size_t bit_length = DIM_BIT_LENGTH / ROCPROFILER_DIMENSION_LAST; + + for(auto& rec : *input_array) + { + for(auto dim : _reduce_dimension_set) + { + int64_t mask_dim = (MAX_64 >> (64 - bit_length)) << ((dim - 1) * bit_length); + + rec.id = rec.id | mask_dim; + rec.id = rec.id ^ mask_dim; + } + rec_groups[rec.id].push_back(rec); + } + input_array->clear(); - input_array->push_back(result); - 
set_dim_in_rec(input_array->begin()->id, ROCPROFILER_DIMENSION_NONE, 0); + for(auto& rec_pair : rec_groups) + { + rocprofiler_record_counter_t result{.id = 0, + .counter_value = 0, + .dispatch_id = 0, + .user_data = {.value = 0}, + .agent_id = {.handle = 0}}; + + perform_reduction_to_single_instance(reduce_op, &rec_pair.second, &result); + input_array->push_back(result); + } + if(input_array->size() == 1) + { + set_dim_in_rec(input_array->begin()->id, ROCPROFILER_DIMENSION_NONE, 0); + } return input_array; } @@ -375,11 +408,30 @@ EvaluateAST::set_dimensions() break; case REDUCE_NODE: { - // Reduction down to a single instance supported for now. - _dimension_types = - std::vector{{dimension_map().at(ROCPROFILER_DIMENSION_INSTANCE), - 1, - ROCPROFILER_DIMENSION_INSTANCE}}; + if(_reduce_dimension_set.empty()) + { + _dimension_types = std::vector{ + {dimension_map().at(ROCPROFILER_DIMENSION_INSTANCE), + 1, + ROCPROFILER_DIMENSION_INSTANCE}}; + } + + else + { + _dimension_types = std::vector{ + {dimension_map().at(ROCPROFILER_DIMENSION_INSTANCE), + 1, + ROCPROFILER_DIMENSION_INSTANCE}}; + auto first = _children[0].set_dimensions(); + first.erase(std::remove_if(first.begin(), + first.end(), + [&](const MetricDimension& dim) { + return _reduce_dimension_set.find(dim.type()) != + _reduce_dimension_set.end(); + }), + first.end()); + if(!first.empty()) _dimension_types = first; + } } break; case SELECT_NODE: @@ -678,10 +730,6 @@ EvaluateAST::evaluate( if(r1->size() < r2->size()) swap(r1, r2); - cache.emplace_back(std::make_unique>()); - *cache.back() = *r1; - r1 = cache.back().get(); - CHECK(!r1->empty() && !r2->empty()); if(r2->size() == 1) @@ -758,6 +806,9 @@ EvaluateAST::evaluate( throw std::runtime_error( fmt::format("Unable to lookup results for metric {}", _metric.name())); + cache.emplace_back(std::make_unique>()); + *cache.back() = *result; + result = cache.back().get(); return result; } break; @@ -767,7 +818,7 @@ EvaluateAST::evaluate( if(_reduce_op == REDUCE_NONE) 
throw std::runtime_error(fmt::format("Invalid Second argument to reduce(): {}", static_cast(_reduce_op))); - return perform_reduction(_reduce_op, result); + return perform_reduction(_reduce_op, result, _reduce_dimension_set); } case SELECT_NODE: { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/evaluate_ast_test.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/evaluate_ast_test.cpp index a602a20ddb..a43509d0db 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/evaluate_ast_test.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/evaluate_ast_test.cpp @@ -33,7 +33,9 @@ #include "lib/rocprofiler-sdk/agent.hpp" #include "lib/rocprofiler-sdk/counters/evaluate_ast.hpp" #include "lib/rocprofiler-sdk/counters/id_decode.hpp" +#include "lib/rocprofiler-sdk/counters/metrics.hpp" #include "lib/rocprofiler-sdk/counters/parser/reader.hpp" +#include "rocprofiler-sdk/fwd.h" namespace { @@ -1366,3 +1368,141 @@ TEST(evatuate_ast, evaluate_select) } } } + +TEST(evaluate_ast, counter_reduction_dimension) +{ + using namespace rocprofiler::counters; + + size_t bit_length = DIM_BIT_LENGTH / ROCPROFILER_DIMENSION_LAST; + + auto get_base_rec_id = [](uint64_t counter_id) { + rocprofiler_counter_instance_id_t base_id = 0; + set_counter_in_rec(base_id, {.handle = counter_id}); + return base_id; + }; + + auto max_dim = [&](auto&& a) -> auto + { + std::unordered_map groups_dim; + std::vector result; + for(auto rec : a) + { + int64_t mask_dim = (MAX_64 >> (64 - bit_length)) << (bit_length * 0); + + rec.id = rec.id | mask_dim; + rec.id = rec.id ^ mask_dim; + if(groups_dim.find(rec.id) == groups_dim.end()) + { + groups_dim[rec.id] = rec; + } + else + { + groups_dim[rec.id].counter_value = + std::max(groups_dim[rec.id].counter_value, rec.counter_value); + } + } + for(auto& rec_pair : groups_dim) + { + result.push_back(rec_pair.second); + } + return result; + }; + + auto sum_dim = 
[&](auto&& a) -> auto + { + std::vector result; + double counter_value = 0; + result.push_back(a[0]); + set_dim_in_rec(result.begin()->id, ROCPROFILER_DIMENSION_NONE, 0); + for(auto& rec : a) + { + counter_value += rec.counter_value; + } + result.begin()->counter_value = counter_value; + return result; + }; + + std::unordered_map metrics = { + {"VOORHEES", Metric("gfx9", "VOORHEES", "a", "a", "a", "", "", 0)}, + {"KRUEGER", Metric("gfx9", "KRUEGER", "a", "a", "a", "", "", 1)}, + {"max_BATES", + Metric("gfx9", + "max_BATES", + "C", + "C", + "C", + "reduce(VOORHEES+KRUEGER,max, [DIMENSION_XCC])", + "", + 2)}, + {"sum_BATES", + Metric("gfx9", + "sum_BATES", + "C", + "C", + "C", + "reduce(VOORHEES+KRUEGER,sum, [DIMENSION_XCC, DIMENSION_AID])", + "", + 3)}}; + + std::unordered_map> base_counter_data = { + {"VOORHEES", + construct_test_data_dim( + get_base_rec_id(0), {ROCPROFILER_DIMENSION_XCC, ROCPROFILER_DIMENSION_AID}, 8)}, + {"KRUEGER", + construct_test_data_dim( + get_base_rec_id(1), {ROCPROFILER_DIMENSION_XCC, ROCPROFILER_DIMENSION_AID}, 8)}, + }; + + std::unordered_map> asts; + for(const auto& [val, metric] : metrics) + { + RawAST* ast = nullptr; + auto buf = yy_scan_string(metric.expression().empty() ? 
metric.name().c_str() + : metric.expression().c_str()); + yyparse(&ast); + ASSERT_TRUE(ast) << metric.expression() << " " << metric.name(); + asts.emplace("gfx9", std::unordered_map{}) + .first->second.emplace(val, + EvaluateAST({.handle = metric.id()}, metrics, *ast, "gfx9")); + yy_delete_buffer(buf); + delete ast; + } + + std::vector, int64_t>> + derived_counters = { + {"max_BATES", + max_dim(plus_vec(base_counter_data["VOORHEES"], base_counter_data["KRUEGER"])), + 2}, + {"sum_BATES", + sum_dim(plus_vec(base_counter_data["VOORHEES"], base_counter_data["KRUEGER"])), + 2}, + }; + + std::unordered_map> base_counter_decode; + for(const auto& [name, base_counter_v] : base_counter_data) + { + base_counter_decode[metrics[name].id()] = base_counter_v; + } + + for(auto& [name, expected, eval_count] : derived_counters) + { + ROCP_INFO << name; + auto eval_counters = + rocprofiler::counters::get_required_hardware_counters(asts, "gfx9", metrics[name]); + ASSERT_TRUE(eval_counters); + ASSERT_EQ(eval_counters->size(), eval_count); + std::vector>> cache; + asts.at("gfx9").at(name).expand_derived(asts.at("gfx9")); + auto ret = asts.at("gfx9").at(name).evaluate(base_counter_decode, cache); + EXPECT_EQ(ret->size(), expected.size()); + int pos = 0; + asts.at("gfx9").at(name).set_out_id(*ret); + for(const auto& v : *ret) + { + set_counter_in_rec(expected[pos].id, {.handle = metrics[name].id()}); + EXPECT_EQ(v.id, expected[pos].id); + EXPECT_FLOAT_EQ(v.counter_value, expected[pos].counter_value); + pos++; + } + } +}