[SWDEV-531904] - Added GPU Cache Read Tests (#464)

New:
- gpu_cache_read.h and gpu_cache_read.cc
- Test reads GPU cache info and asserts valid structure
Updated:
- integration_test.py
- Added test_gpu_cache_info(), which asserts that the returned cache info has a valid structure
- Changed test_get_gpu_compute_partition() to loop through all devices, whether an individual device passes or fails
Added:
- test_get_gpu_compute_partition_returns_string() to integration_test.py
- This test displays the current compute partition for each bdf

---------

Signed-off-by: Juan Castillo <juan.castillo@amd.com>
Signed-off-by: Castillo, Juan <Juan.Castillo@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Tento commit je obsažen v:
Castillo, Juan
2025-06-19 15:23:34 -05:00
odevzdal GitHub
rodič 57a749f457
revize 470c62f887
5 změnil soubory, kde provedl 236 přidání a 2 odebrání
+2 -2
Zobrazit soubor
@@ -866,10 +866,10 @@ int KFDNode::get_cache_info(rsmi_gpu_cache_info_t *info) {
info->num_cache_types = 0;
for (unsigned int cache_id = 0; cache_id < caches_count; cache_id++) {
const auto prop_file = f_path + std::to_string(cache_id) + "/properties";
std::string level = get_properties_from_file(prop_file, "level ");
try {
std::string level = get_properties_from_file(prop_file, "level ");
int cache_level = std::stoi(level);
if (cache_level < 0 ) continue;
if (cache_level < 0) continue;
std::string type = get_properties_from_file(prop_file, "type ");
int cache_type = std::stoi(type);
+133
Zobrazit soubor
@@ -0,0 +1,133 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "gpu_cache_read.h"
#include <gtest/gtest.h>
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include "../test_common.h"
#include "amd_smi/amdsmi.h"
#include "amd_smi/impl/amd_smi_utils.h"
#include "gpu_metrics_read.h"
#include "rocm_smi/rocm_smi_utils.h"
// Registers the human-readable title and description of this test case.
TestGPUCacheRead::TestGPUCacheRead() : TestBase() {
  const char *const title = "GPU Cache Read Test";
  const char *const description =
      "This test verifies the GPU cache "
      "read metrics using the AMD SMI library.";

  set_title(title);
  set_description(description);
}
// The destructor body is intentionally empty; no cleanup is performed here.
TestGPUCacheRead::~TestGPUCacheRead() {}
// Setup is delegated entirely to the base class; this test adds no extra
// preparation of its own.
void TestGPUCacheRead::SetUp() {
  TestBase::SetUp();
}
// Forwards to the base-class implementation without adding any output.
void TestGPUCacheRead::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
}
// Forwards to the base-class implementation; the per-device cache data is
// printed by Run() rather than here.
void TestGPUCacheRead::DisplayResults() const {
  TestBase::DisplayResults();
}
/**
 * @brief Finishes the GPU cache read test case.
 *
 * Overrides TestBase::Close. This test has no teardown work of its own, so
 * the call is simply forwarded to the base class so that the inherited
 * cleanup runs.
 */
void TestGPUCacheRead::Close() {
  TestBase::Close();
}
namespace {

// Builds a comma-separated, human-readable list of the cache property flags
// set in `props`, or "None" when none of the four known flags is set.
std::string CachePropertiesToString(uint32_t props) {
  std::string text;
  if (props & AMDSMI_CACHE_PROPERTY_DATA_CACHE) text += "Data Cache, ";
  if (props & AMDSMI_CACHE_PROPERTY_INST_CACHE) text += "Instruction Cache, ";
  if (props & AMDSMI_CACHE_PROPERTY_CPU_CACHE) text += "CPU Cache, ";
  if (props & AMDSMI_CACHE_PROPERTY_SIMD_CACHE) text += "SIMD Cache, ";
  if (text.empty()) return "None";
  text.erase(text.size() - 2);  // Drop the trailing ", ".
  return text;
}

}  // namespace

/**
 * @brief Runs the GPU cache read test.
 *
 * Overrides TestBase::Run. After the base-class Run(), the test is skipped if
 * setup failed. Otherwise, for every monitored device it calls
 * amdsmi_get_gpu_cache_info(), prints the returned status string, passes the
 * status to CHK_ERR_ASRT (presumably asserting success -- confirm against the
 * macro definition), and prints every reported cache entry.
 */
void TestGPUCacheRead::Run() {
  TestBase::Run();
  if (setup_failed_) {
    std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
    return;
  }

  for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
    PrintDeviceHeader(processor_handles_[i]);
    // std::to_string renders the index in decimal independently of the
    // stream's current base flag.
    std::cout << "Device #" << std::to_string(i) << "\n";
    IF_VERB(STANDARD) {
      std::cout << "\n\n";
      std::cout << "\t**GPU CACHE INFO: Using static struct (Backwards Compatibility):\n";
    }

    amdsmi_gpu_cache_info_t res = {};
    amdsmi_status_t err = amdsmi_get_gpu_cache_info(processor_handles_[i], &res);

    // Start from nullptr and fall back to a literal so that a failed or
    // no-op status lookup can never hand an indeterminate or null pointer to
    // operator<< (which would be undefined behaviour).
    const char *status_string = nullptr;
    amdsmi_status_code_to_string(err, &status_string);
    if (status_string == nullptr) {
      status_string = "<unknown status>";
    }
    std::cout << "\t\t** amdsmi_get_gpu_cache_info(): " << status_string << "\n";
    CHK_ERR_ASRT(err);

    std::cout << "\t\tnum_cache_types: " << std::dec << res.num_cache_types << "\n";

    // Never index past the fixed-size cache array, even if the reported
    // count is larger than the array can hold.
    const size_t cache_capacity = sizeof(res.cache) / sizeof(res.cache[0]);
    const size_t cache_count = std::min<size_t>(res.num_cache_types, cache_capacity);
    if (cache_count != res.num_cache_types) {
      std::cout << "\t\t** num_cache_types exceeds the " << cache_capacity
                << " entries available; only those are shown.\n";
    }

    for (size_t j = 0; j < cache_count; ++j) {
      std::cout << "\t\tCache Type " << j << ":\n";
      std::cout << "\t\t\tcache_level: " << res.cache[j].cache_level << "\n";
      std::cout << "\t\t\tcache_properties: (0x" << std::hex << res.cache[j].cache_properties
                << std::dec << ") ";
      std::cout << CachePropertiesToString(res.cache[j].cache_properties) << "\n";
      std::cout << "\t\t\tcache_size: " << res.cache[j].cache_size << " KB\n";
      std::cout << "\t\t\tmax_num_cu_shared: " << res.cache[j].max_num_cu_shared << "\n";
      std::cout << "\t\t\tnum_cache_instance: " << res.cache[j].num_cache_instance << "\n";
    }
  }
}
+50
Zobrazit soubor
@@ -0,0 +1,50 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef TESTS_AMD_SMI_TEST_FUNCTIONAL_GPU_CACHE_READ_H_
#define TESTS_AMD_SMI_TEST_FUNCTIONAL_GPU_CACHE_READ_H_
#include "../test_base.h"
// Functional test case that queries the GPU cache information of each
// monitored device and prints it.
class TestGPUCacheRead : public TestBase {
 public:
  // @Brief: Constructor; sets the title and description of the test case
  TestGPUCacheRead();

  // @Brief: Destructor for the TestGPUCacheRead test case
  virtual ~TestGPUCacheRead();

  // @Brief: Prepare the environment before the test runs
  virtual void SetUp();

  // @Brief: Execute the test: read and print the cache info of every device
  virtual void Run();

  // @Brief: Clean up after the test has run
  virtual void Close();

  // @Brief: Display results
  virtual void DisplayResults() const;

  // @Brief: Display information about what this test does
  virtual void DisplayTestInfo();
};
#endif // TESTS_AMD_SMI_TEST_FUNCTIONAL_GPU_CACHE_READ_H_
+6
Zobrazit soubor
@@ -67,6 +67,7 @@
#include "functional/init_shutdown_refcount.h"
#include "functional/memorypartition_read_write.h"
#include "functional/computepartition_read_write.h"
#include "functional/gpu_cache_read.h"
static AMDSMITstGlobals *sRSMIGlvalues = nullptr;
@@ -281,6 +282,11 @@ TEST(amdsmitstReadWrite, TestEvtNotifReadWrite) {
TestEvtNotifReadWrite tst;
RunGenericTest(&tst);
}
// Runs the GPU cache read test case as part of the read-only suite, using the
// same RunGenericTest driver as the neighbouring tests.
TEST(amdsmitstReadOnly, TestGPUCacheRead) {
  TestGPUCacheRead cache_read_test;
  RunGenericTest(&cache_read_test);
}
/*
TEST(amdsmitstReadOnly, TestConcurrentInit) {
TestConcurrentInit tst;
+45
Zobrazit soubor
@@ -212,6 +212,51 @@ class TestAmdSmiPythonInterface(unittest.TestCase):
print()
self.tearDown()
@handle_exceptions
def test_gpu_cache_info(self):
    """Query the cache info of every processor and check each cache entry.

    The test fails if amdsmi_get_gpu_cache_info raises for any processor.
    When the result is a dict, every item is printed and each entry under
    its 'cache' key must contain the expected fields; any other result is
    only printed.
    """
    self.setUp()
    print("\n\n###Test amdsmi_interface.amdsmi_get_gpu_cache_info")
    processors = amdsmi.amdsmi_get_processor_handles()
    self.assertGreaterEqual(len(processors), 1)
    self.assertLessEqual(len(processors), 32)

    # Fields every cache entry must expose, checked in this order.
    required_fields = ('cache_size', 'cache_level', 'num_cache_instance', 'max_num_cu_shared')

    for index, processor in enumerate(processors):
        bdf = amdsmi.amdsmi_get_gpu_device_bdf(processor)
        print("\n\n###Test Processor {}, bdf: {}".format(index, bdf))
        print("\n###Test amdsmi_interface.amdsmi_get_gpu_cache_info \n")
        try:
            cache_info = amdsmi.amdsmi_interface.amdsmi_get_gpu_cache_info(processor)
        except Exception as e:
            print(f" Exception in amdsmi_get_gpu_cache_info: {e}")
            self.fail(f"Test failed due to exception: {e}")

        if not isinstance(cache_info, dict):
            # Non-dict results are reported but not validated.
            print(" cache_info: {}".format(cache_info))
            continue

        for key, value in cache_info.items():
            print(f" {key}: {value}")
        for cache_entry in cache_info.get('cache', []):
            for field in required_fields:
                self.assertIn(field, cache_entry)
    print()
    self.tearDown()
@handle_exceptions
def test_get_gpu_compute_partition(self):
    """Check that each device's compute partition is a non-empty string.

    Every processor is visited. A device whose query raises is reported as
    not supported and skipped; a device that returns a value must return a
    non-empty str.
    """
    # Pair the explicit tearDown() at the end with an explicit setUp(), as
    # the neighbouring tests in this class do.
    self.setUp()
    processors = amdsmi.amdsmi_get_processor_handles()
    self.assertGreater(len(processors), 0)
    for processor in processors:
        bdf = amdsmi.amdsmi_get_gpu_device_bdf(processor)
        # Only the API call is guarded. Keeping the assertions outside the
        # try block stops `except Exception` from swallowing an
        # AssertionError and misreporting a bad result as "not supported".
        try:
            result = amdsmi.amdsmi_get_gpu_compute_partition(processor)
        except Exception as e:
            print(f"\nCompute partition not supported for handle {bdf}: {e}")
            continue
        self.assertIsInstance(result, str)
        self.assertGreater(len(result), 0)
        print(f"\nCompute partition for handle {bdf}: {result}")
    print("All compute partitions returned as strings successfully (or not supported).")
    self.tearDown()
def test_bdf_device_id(self):
self.setUp()
processors = amdsmi.amdsmi_get_processor_handles()