diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index 8379b93bcb..7f34f8e8dd 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -1095,6 +1095,7 @@ target_compile_options(rccl PRIVATE -Werror=deprecated-copy-with-user-provided-c target_compile_options(rccl PRIVATE -Wno-format-nonliteral) target_compile_options(rccl PRIVATE -Wno-unused-function) target_compile_options(rccl PRIVATE -fgpu-rdc) +target_compile_options(rccl PRIVATE -fvisibility=hidden) if(QUIET_WARNINGS) target_compile_options(rccl PRIVATE -Wno-invalid-offsetof) @@ -1118,8 +1119,7 @@ if(ENABLE_CODE_COVERAGE) message(STATUS "Code coverage is enabled with build type '${CMAKE_BUILD_TYPE}'.") target_compile_options(rccl PRIVATE - -fvisibility=default -Xarch_host -fprofile-instr-generate - -Xarch_host -fcoverage-mapping) + -Xarch_host -fprofile-instr-generate -Xarch_host -fcoverage-mapping) set(COVERAGE_SHARED_LINKER_FLAGS -fprofile-generate @@ -1133,14 +1133,6 @@ if(ENABLE_CODE_COVERAGE) target_link_options(rccl PRIVATE ${COVERAGE_SHARED_LINKER_FLAGS}) target_link_options(rccl PRIVATE ${COVERAGE_EXE_LINKER_FLAGS}) -elseif(BUILD_TESTS) # Enable default/hidden visibility based on build type and ROCM_VERSION - if (ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug") - target_compile_options(rccl PRIVATE -fvisibility=default) - else() - target_compile_options(rccl PRIVATE -fvisibility=hidden) - endif() -else() # Enable hidden visibility for library without tests/code coverage enabled - target_compile_options(rccl PRIVATE -fvisibility=hidden) endif() if (HAVE_KERNARG_PRELOAD) diff --git a/projects/rccl/src/misc/alt_rsmi.cc b/projects/rccl/src/misc/alt_rsmi.cc index d748d2618e..372d1f33d8 100644 --- a/projects/rccl/src/misc/alt_rsmi.cc +++ b/projects/rccl/src/misc/alt_rsmi.cc @@ -48,18 +48,29 @@ struct ARSMI_systemNode { std::string s_card; }; -static const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes"; +// Internal implementation details +// When compiled into the test binary (ARSMI_TEST_BUILD defined), these have external linkage +// so AltRsmiTestUtils.cpp can access them via extern declarations. +// When compiled into the production library (ARSMI_TEST_BUILD not defined), these are static. +#ifdef ARSMI_TEST_BUILD + // Test build: external linkage for cross-TU access + #define ARSMI_LOCAL thread_local +#else + // Production build: static/internal linkage + #define ARSMI_LOCAL static thread_local +#endif + +ARSMI_LOCAL const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes"; static const uint32_t kAmdGpuId = 0x1002; // Vector containing data about each node, ordered by bdf ID -static thread_local std::vector ARSMI_orderedNodes; +ARSMI_LOCAL std::vector ARSMI_orderedNodes; // 2-D matrix with link information between each pair of nodes. -static thread_local std::vector> ARSMI_orderedLinks; +ARSMI_LOCAL std::vector> ARSMI_orderedLinks; // Number of devices recognized -static thread_local int ARSMI_num_devices=-1; - +ARSMI_LOCAL int ARSMI_num_devices=-1; // Public API functions int ARSMI_init(void) diff --git a/projects/rccl/test/AltRsmiTests.cpp b/projects/rccl/test/AltRsmiTests.cpp new file mode 100644 index 0000000000..b1c12f090c --- /dev/null +++ b/projects/rccl/test/AltRsmiTests.cpp @@ -0,0 +1,1276 @@ +/************************************************************************* + * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "alt_rsmi.h" +#include "common/ProcessIsolatedTestRunner.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ============================================================================ +// Internal structures and variables from alt_rsmi.cc (TEST USE ONLY) +// ============================================================================ +// When alt_rsmi.cc is compiled with ARSMI_TEST_BUILD, internal variables +// have external linkage and can be accessed by test utilities + +struct ARSMI_systemNode { + uint32_t s_node_id = 0; + uint64_t s_gpu_id = 0; + uint64_t s_unique_id = 0; + uint64_t s_location_id = 0; + uint64_t s_bdf = 0; + uint64_t s_domain = 0; + uint8_t s_bus = 0; + uint8_t s_device = 0; + uint8_t s_function = 0; + uint8_t s_partition_id = 0; + std::string s_card; +}; + +// External declarations of internal variables from alt_rsmi.cc +extern thread_local const char *kKFDNodesPathRoot; +extern thread_local int ARSMI_num_devices; +extern thread_local std::vector ARSMI_orderedNodes; +extern thread_local std::vector> ARSMI_orderedLinks; + +// ============================================================================ +// Test utilities for manipulating alt_rsmi.cc internal state +// ============================================================================ +namespace AltRsmiTestUtils { + +// Storage for the test path to ensure the pointer remains valid +static std::string sTestNodesPath; + +// Set the KFD nodes path for testing +// This redirects file reads to test directories +static void SetNodesPath(const std::string& path) { + sTestNodesPath = path; + kKFDNodesPathRoot = sTestNodesPath.c_str(); +} + +// Reset ARSMI internal state between tests +// This ensures test isolation +static void ResetState() { + ARSMI_num_devices = -1; + ARSMI_orderedNodes.clear(); + ARSMI_orderedLinks.clear(); +} + +// Get current number of devices (for verification) +static int GetNumDevices() { + return ARSMI_num_devices; +} + +} // namespace AltRsmiTestUtils + +// Test paths for creating mock KFD filesystem +// Use std::filesystem::temp_directory_path() for portability +static const std::string kTestKFDBasePath = + std::filesystem::temp_directory_path().string() + "/test_kfd_arsmi"; +static const std::string kTestKFDPath = + std::filesystem::temp_directory_path().string() + "/test_kfd_arsmi/topology/nodes"; + +namespace RcclUnitTesting { + +// Helper functions for creating test filesystem structures +// All file operations are scoped to kTestKFDBasePath for safety +namespace { + // Internal helper to create directories recursively (operates on absolute paths) + int createDirectoryImpl(const std::string &path) { + size_t pos = 0; + std::string currentPath; + + // Iterate through each component of the path + while ((pos = path.find('/', pos)) != std::string::npos) { + currentPath = path.substr(0, pos++); + if (!currentPath.empty() && mkdir(currentPath.c_str(), 0700) == -1 && + errno != EEXIST) { + return -1; // Return error if directory creation fails + } + } + + // Create the final directory + if (mkdir(path.c_str(), 0700) == -1 && errno != EEXIST) { + return -1; // Return error if directory creation fails + } + + return 0; // Success + } + + // Internal helper to remove a directory recursively (operates on absolute paths) + int removeDirectoryImpl(const std::string &path) { + DIR *dir = opendir(path.c_str()); + if (!dir) { + // ENOENT means directory doesn't exist - not an error during cleanup + if (errno == ENOENT) { + return 0; + } + std::cerr << "Failed to open directory: " << path << " (errno: " << errno + << ")" << std::endl; + return -1; + } + + struct dirent *entry; + while ((entry = readdir(dir)) != nullptr) { + // Skip "." and ".." entries + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) { + continue; + } + + std::string fullPath = path + "/" + entry->d_name; + + // Check if the entry is a directory + struct stat entryStat; + if (stat(fullPath.c_str(), &entryStat) == -1) { + std::cerr << "Failed to stat: " << fullPath << " (errno: " << errno + << ")" << std::endl; + closedir(dir); + return -1; + } + + if (S_ISDIR(entryStat.st_mode)) { + // Recursively remove subdirectory + if (removeDirectoryImpl(fullPath) == -1) { + closedir(dir); + return -1; + } + } else { + // Remove file + if (unlink(fullPath.c_str()) == -1) { + std::cerr << "Failed to remove file: " << fullPath + << " (errno: " << errno << ")" << std::endl; + closedir(dir); + return -1; + } + } + } + + closedir(dir); + + // Remove the directory itself + if (rmdir(path.c_str()) == -1) { + std::cerr << "Failed to remove directory: " << path + << " (errno: " << errno << ")" << std::endl; + return -1; + } + + return 0; // Success + } + + // Validate that a path is within the test sandbox + bool isPathInSandbox(const std::string &path) { + // Must start with kTestKFDBasePath to be valid + return path.find(kTestKFDBasePath) == 0; + } + + // Create directory within test sandbox (relative to kTestKFDPath) + // Pass "" to create the base kTestKFDPath directory + int createDirectory(const std::string &relativePath = "") { + std::string fullPath = relativePath.empty() + ? std::string(kTestKFDPath) + : std::string(kTestKFDPath) + "/" + relativePath; + + if (!isPathInSandbox(fullPath)) { + std::cerr << "error: path " << fullPath << " is outside test sandbox" << std::endl; + return -1; + } + return createDirectoryImpl(fullPath); + } + + // Remove entire test sandbox directory + int removeTestSandbox() { + return removeDirectoryImpl(kTestKFDBasePath); + } + + // Create a file within test sandbox (relative to kTestKFDPath) + void createFile(const std::string &relativePath, const std::string &content) { + std::string fullPath = std::string(kTestKFDPath) + "/" + relativePath; + + if (!isPathInSandbox(fullPath)) { + std::cerr << "error: path " << fullPath << " is outside test sandbox" << std::endl; + return; + } + + std::ofstream file(fullPath); + if (!file) { + std::cerr << "Failed to create file: " << fullPath << ", errno: " << errno << std::endl; + return; + } + file << content; + file.close(); + } + + // Remove a file within test sandbox (relative to kTestKFDPath) + int removeFile(const std::string &relativePath) { + std::string fullPath = std::string(kTestKFDPath) + "/" + relativePath; + + if (!isPathInSandbox(fullPath)) { + std::cerr << "Security error: path " << fullPath << " is outside test sandbox" << std::endl; + return -1; + } + + if (unlink(fullPath.c_str()) == -1) { + std::cerr << "Failed to remove file: " << fullPath << " (errno: " << errno + << ")" << std::endl; + return -1; // Return error if file removal fails + } + return 0; // Success + } + + // Function to create the test directory structure and files + void setupTestFiles() { + createDirectory(); // Creates kTestKFDPath + + // Create node 0 with valid data + createDirectory("0"); + createFile("0/gpu_id", "4098\n"); + createFile("0/properties", "unique_id 16336014475442738425\n" + "location_id 23552\n" + "domain 0\n" + "vendor_id 4098\n"); + + createDirectory("0/io_links/0"); + createFile("0/io_links/0/properties", + "type 2\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 1\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 64000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + createDirectory("0/io_links/1"); + createFile("0/io_links/1/properties", + "type 11\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 1\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 50000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + createDirectory("1"); + createFile("1/gpu_id", "4098\n"); + createFile("1/properties", "unique_id 16336014475442738426\n" + "location_id 23553\n" + "domain 1\n" + "vendor_id 4098\n"); + + createDirectory("1/io_links/0"); + createFile("1/io_links/0/properties", + "type 2\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 1\n" + "node_to 0\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 32000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + createDirectory("1/io_links/1"); + createFile("1/io_links/1/properties", + "type 11\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 1\n" + "node_to 0\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 50000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist + createDirectory(std::to_string(invalid_dev_id) + "/io_links/"); + } + + // Common setup for all tests + void setupTestEnvironment() { + // Reset ARSMI state for test isolation + AltRsmiTestUtils::ResetState(); + + // Redirect kKFDNodesPathRoot to test directory + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + // Create the test directory structure + setupTestFiles(); + } + + // Common cleanup for all tests (not strictly necessary with process isolation, + // but good practice to clean up temp files) + void cleanupTestEnvironment() { + AltRsmiTestUtils::ResetState(); + removeTestSandbox(); + } + +} // anonymous namespace + +// Tests using process isolation for complete state isolation +TEST(AltRsmiTest, ARSMIInitDefault) { + RUN_ISOLATED_TEST( + "ARSMIInitDefault", + []() { + setupTestEnvironment(); + + int result = ARSMI_init(); + ASSERT_EQ(result, 0); + + // Verify that devices were discovered + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 2); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitMissingIoLinksPropertiesFile) { + RUN_ISOLATED_TEST( + "ARSMIInitMissingIoLinksPropertiesFile", + []() { + setupTestEnvironment(); + + // Remove properties file for io_links + removeFile("0/io_links/0/properties"); + removeFile("0/io_links/1/properties"); + + int result = ARSMI_init(); + ASSERT_EQ(result, 0); + + // Should still initialize successfully even with missing link properties + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_GT(num_devices, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitMissingNodeToProperty) { + RUN_ISOLATED_TEST( + "ARSMIInitMissingNodeToProperty", + []() { + setupTestEnvironment(); + + createFile("0/io_links/1/properties", + "type 2\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + // "node_to 0\n" // Missing node_to + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 64000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + ASSERT_EQ(result, 0); // Expect success even with missing node_to + + // Verify devices are still initialized + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_GT(num_devices, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitMissingWeightProperty) { + RUN_ISOLATED_TEST( + "ARSMIInitMissingWeightProperty", + []() { + setupTestEnvironment(); + + createFile("0/io_links/1/properties", + "type 2\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 1\n" + // "weight 21\n" // Missing weight + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 64000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + // returns 1 when weight property is missing + ASSERT_EQ(result, 1); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitMissingTypeProperty) { + RUN_ISOLATED_TEST( + "ARSMIInitMissingTypeProperty", + []() { + setupTestEnvironment(); + + createFile("0/io_links/1/properties", + // "type 5\n" // Missing type + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 1\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 0\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + // returns 1 when type property is missing + ASSERT_EQ(result, 1); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitTypePCIeProperty) { + RUN_ISOLATED_TEST( + "ARSMIInitTypePCIeProperty", + []() { + // Create a setup with ONLY PCIe links (type 2) to test PCIe specifically + removeTestSandbox(); + createDirectory(); + + // Create node 0 with PCIe-only link to node 1 + createDirectory("0"); + createFile("0/gpu_id", "4098\n"); + createFile("0/properties", "unique_id 100\n" + "location_id 23552\n" + "domain 0\n" + "vendor_id 4098\n"); + createDirectory("0/io_links/0"); + createFile("0/io_links/0/properties", + "type 2\n" // PCIe type + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 1\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 16000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + // Create node 1 with PCIe-only link to node 0 + createDirectory("1"); + createFile("1/gpu_id", "4098\n"); + createFile("1/properties", "unique_id 101\n" + "location_id 23553\n" + "domain 1\n" + "vendor_id 4098\n"); + createDirectory("1/io_links/0"); + createFile("1/io_links/0/properties", + "type 2\n" // PCIe type + "version_major 0\n" + "version_minor 0\n" + "node_from 1\n" + "node_to 0\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 16000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + AltRsmiTestUtils::ResetState(); + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + int result = ARSMI_init(); + ASSERT_EQ(result, 0); + + // Verify link info is correctly identified as PCIe + ARSMI_linkInfo info; + ASSERT_EQ(ARSMI_topo_get_link_info(0, 1, &info), 0); + ASSERT_EQ(info.type, ARSMI_IOLINK_TYPE_PCIEXPRESS); + ASSERT_EQ(info.src_node, 0); + ASSERT_EQ(info.dst_node, 1); + ASSERT_EQ(info.hops, 2); + ASSERT_EQ(info.weight, 21); + ASSERT_EQ(info.max_bandwidth, 16000); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitMissingMinBWProperty) { + RUN_ISOLATED_TEST( + "ARSMIInitMissingMinBWProperty", + []() { + setupTestEnvironment(); + + createFile("0/io_links/1/properties", + "type 11\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 1\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + // "min_bandwidth 0\n" // Missing min_bandwidth + "max_bandwidth 0\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + // returns 1 when min_bandwidth property is missing + ASSERT_EQ(result, 1); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitMissingMaxBWProperty) { + RUN_ISOLATED_TEST( + "ARSMIInitMissingMaxBWProperty", + []() { + setupTestEnvironment(); + + createFile("0/io_links/1/properties", + "type 5\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 1\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + // "max_bandwidth 0\n" // Missing max_bandwidth + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + // returns 1 when max_bandwidth property is missing + ASSERT_EQ(result, 1); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIGetNumDevicesUninitialized) { + RUN_ISOLATED_TEST( + "ARSMIGetNumDevicesUninitialized", + []() { + setupTestEnvironment(); + + // Verify ARSMI is uninitialized (ARSMI_num_devices == -1) + ASSERT_EQ(AltRsmiTestUtils::GetNumDevices(), -1); + + // Don't call ARSMI_init, let ARSMI_get_num_devices initialize + uint32_t num_devices = 0; + + int result = ARSMI_get_num_devices(&num_devices); + + // Verify that the function initializes successfully + ASSERT_EQ(result, 0); + + // Verify that auto-initialization occurred (ARSMI_num_devices >= 0) + ASSERT_GE(AltRsmiTestUtils::GetNumDevices(), 0); + + // Verify that the number of devices is correctly set + ASSERT_EQ(num_devices, 2); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIDevPciIdGetNullBdfId) { + RUN_ISOLATED_TEST( + "ARSMIDevPciIdGetNullBdfId", + []() { + setupTestEnvironment(); + + uint32_t device_index = 0; + int result = ARSMI_dev_pci_id_get(device_index, nullptr); + + ASSERT_EQ(result, EINVAL); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIDevPciIdGetValid) { + RUN_ISOLATED_TEST( + "ARSMIDevPciIdGetValid", + []() { + setupTestEnvironment(); + + uint32_t device_index = 0; + uint64_t bdfid = 0; + + int result = ARSMI_dev_pci_id_get(device_index, &bdfid); + + // Verify that the function succeeds + ASSERT_EQ(result, 0); + // BDF ID should be non-zero for valid devices + ASSERT_NE(bdfid, 0); + + cleanupTestEnvironment(); + } + ); +} + +// Tests covering invalid file/directory scenarios through public API +TEST(AltRsmiTest, ARSMIInitWithInvalidGpuIdData) { + RUN_ISOLATED_TEST( + "ARSMIInitWithInvalidGpuIdData", + []() { + // Create a gpu_id file with invalid (non-numeric) data + removeTestSandbox(); + createDirectory(); + createDirectory("0"); + createFile("0/gpu_id", "invalid_gpu_id"); + createFile("0/properties", "unique_id 12345\n" + "location_id 23552\n" + "domain 0\n" + "vendor_id 4098\n"); + + AltRsmiTestUtils::ResetState(); + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + int result = ARSMI_init(); + + // Init should handle invalid gpu_id gracefully + ASSERT_EQ(result, 0); + + // Should not discover any devices with invalid gpu_id + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitWithEmptyPropertiesFile) { + RUN_ISOLATED_TEST( + "ARSMIInitWithEmptyPropertiesFile", + []() { + // Create an empty properties file + removeTestSandbox(); + createDirectory(); + createDirectory("0"); + createFile("0/gpu_id", "4098\n"); + createFile("0/properties", ""); + + AltRsmiTestUtils::ResetState(); + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + int result = ARSMI_init(); + + // Should succeed but not discover devices with empty properties + ASSERT_EQ(result, 0); + + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitWithDirectoryInsteadOfPropertiesFile) { + RUN_ISOLATED_TEST( + "ARSMIInitWithDirectoryInsteadOfPropertiesFile", + []() { + // Create a directory instead of properties file + removeTestSandbox(); + createDirectory(); + createDirectory("0"); + createFile("0/gpu_id", "4098\n"); + createDirectory("0/properties"); + + AltRsmiTestUtils::ResetState(); + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + int result = ARSMI_init(); + + // Should handle this gracefully + ASSERT_EQ(result, 0); + + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitWithMissingVendorId) { + RUN_ISOLATED_TEST( + "ARSMIInitWithMissingVendorId", + []() { + // Create node without vendor_id + removeTestSandbox(); + createDirectory(); + createDirectory("0"); + createFile("0/gpu_id", "4098\n"); + createFile("0/properties", "unique_id 12345\n" + "location_id 23552\n" + "domain 0\n"); + // Missing vendor_id + + AltRsmiTestUtils::ResetState(); + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + int result = ARSMI_init(); + ASSERT_EQ(result, 0); + + // Should not discover devices without vendor_id + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitWithNonAMDVendorId) { + RUN_ISOLATED_TEST( + "ARSMIInitWithNonAMDVendorId", + []() { + // Create node with non-AMD vendor_id + removeTestSandbox(); + createDirectory(); + createDirectory("0"); + createFile("0/gpu_id", "4098\n"); + createFile("0/properties", "unique_id 12345\n" + "location_id 23552\n" + "domain 0\n" + "vendor_id 0x10DE\n"); // NVIDIA vendor ID + + AltRsmiTestUtils::ResetState(); + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + int result = ARSMI_init(); + ASSERT_EQ(result, 0); + + // Should not discover non-AMD devices + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ARSMIInitWithEmptyLinkPropertiesFile) { + RUN_ISOLATED_TEST( + "ARSMIInitWithEmptyLinkPropertiesFile", + []() { + setupTestEnvironment(); + + // Create setup but with empty link properties + createFile("0/io_links/0/properties", ""); + + int result = ARSMI_init(); + + // Should still initialize, just skip that link + ASSERT_EQ(result, 0); + + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_GT(num_devices, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, NullInfoPointer) { + RUN_ISOLATED_TEST( + "NullInfoPointer", + []() { + setupTestEnvironment(); + + int result = ARSMI_topo_get_link_info(0, 1, nullptr); + ASSERT_EQ(result, EINVAL); // Expect EINVAL for null `info` pointer + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, SourceDeviceIndexOutOfRange) { + RUN_ISOLATED_TEST( + "SourceDeviceIndexOutOfRange", + []() { + setupTestEnvironment(); + + ARSMI_linkInfo info; + // First initialize + ASSERT_EQ(ARSMI_init(), 0); + + int result = ARSMI_topo_get_link_info(999, 1, &info); // Invalid source index + ASSERT_EQ(result, EINVAL); // Expect EINVAL for out-of-range source index + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, DestinationDeviceIndexOutOfRange) { + RUN_ISOLATED_TEST( + "DestinationDeviceIndexOutOfRange", + []() { + setupTestEnvironment(); + + ARSMI_linkInfo info; + // First initialize + ASSERT_EQ(ARSMI_init(), 0); + + int result = ARSMI_topo_get_link_info(0, 999, &info); // Invalid destination index + ASSERT_EQ(result, EINVAL); // Expect EINVAL for out-of-range destination index + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, LinkInfoAutoInitializes) { + RUN_ISOLATED_TEST( + "LinkInfoAutoInitializes", + []() { + setupTestEnvironment(); + + // Test that ARSMI_topo_get_link_info auto-initializes if not already initialized + ARSMI_linkInfo info; + int result = ARSMI_topo_get_link_info(0, 0, &info); + + // Should succeed - auto-initialization should work with test data + ASSERT_EQ(result, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ValidLinkInfoBetweenDevices) { + RUN_ISOLATED_TEST( + "ValidLinkInfoBetweenDevices", + []() { + setupTestEnvironment(); + + // Initialize the system + ASSERT_EQ(ARSMI_init(), 0); + + ARSMI_linkInfo info; + int result = ARSMI_topo_get_link_info(0, 1, &info); + + // Should succeed + ASSERT_EQ(result, 0); + + // Verify link info contains reasonable values + ASSERT_EQ(info.src_node, 0); + ASSERT_EQ(info.dst_node, 1); + // Type should be XGMI (type 11 in properties) + ASSERT_EQ(info.type, ARSMI_IOLINK_TYPE_XGMI); + ASSERT_EQ(info.hops, 1); + ASSERT_EQ(info.weight, 21); + ASSERT_EQ(info.min_bandwidth, 0); + ASSERT_EQ(info.max_bandwidth, 50000); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, ValidLinkInfoSelfLink) { + RUN_ISOLATED_TEST( + "ValidLinkInfoSelfLink", + []() { + setupTestEnvironment(); + + // Initialize the system + ASSERT_EQ(ARSMI_init(), 0); + + ARSMI_linkInfo info; + int result = ARSMI_topo_get_link_info(0, 0, &info); + + // Should succeed - even self-links should return default values + ASSERT_EQ(result, 0); + + cleanupTestEnvironment(); + } + ); +} + +// TODO(alt_rsmi.cc): The current behavior of ARSMI_topo_get_link_info is questionable. +// When no direct link exists between devices, it silently returns success with +// fabricated default values (PCIe, 2 hops, weight 40). This can mislead callers +// into thinking a real link exists. The implementation should return an error +// code (e.g., ENOENT) when no link is defined. Once fixed, this test should be +// updated to verify the new behavior. +TEST(AltRsmiTest, LinkInfoWithNoDirectConnection) { + RUN_ISOLATED_TEST( + "LinkInfoWithNoDirectConnection", + []() { + // Setup with 2 nodes where they don't have direct XGMI connection + removeTestSandbox(); + createDirectory(); + + // Create node 0 + createDirectory("0"); + createFile("0/gpu_id", "4098\n"); + createFile("0/properties", + "unique_id 100\n" + "location_id 23552\n" + "domain 0\n" + "vendor_id 4098\n"); + // Create empty io_links directory (no actual links defined) + createDirectory("0/io_links"); + + // Create node 1 + createDirectory("1"); + createFile("1/gpu_id", "4098\n"); + createFile("1/properties", + "unique_id 101\n" + "location_id 23553\n" + "domain 1\n" + "vendor_id 4098\n"); + // Create empty io_links directory (no actual links defined) + createDirectory("1/io_links"); + + AltRsmiTestUtils::ResetState(); + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + ASSERT_EQ(ARSMI_init(), 0); + + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 2); + + // Try to get link info between the two devices (no direct link defined) + ARSMI_linkInfo info; + int result = ARSMI_topo_get_link_info(0, 1, &info); + + // Should succeed but return default values since no io_links are defined + ASSERT_EQ(result, 0); + + // When no direct link exists, src_node and dst_node remain as UINT_MAX + ASSERT_EQ(info.src_node, std::numeric_limits::max()); + ASSERT_EQ(info.dst_node, std::numeric_limits::max()); + + // Default values set + ASSERT_EQ(info.hops, 2); // Default hops + ASSERT_EQ(info.type, ARSMI_IOLINK_TYPE_PCIEXPRESS); // Default type + ASSERT_EQ(info.weight, 40); // Default weight + ASSERT_EQ(info.min_bandwidth, 0); // Default min_bandwidth + ASSERT_EQ(info.max_bandwidth, 0); // Default max_bandwidth + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, MultipleDevicesWithXGMILinks) { + RUN_ISOLATED_TEST( + "MultipleDevicesWithXGMILinks", + []() { + setupTestEnvironment(); + + // Test XGMI link type (type 11) + ASSERT_EQ(ARSMI_init(), 0); + + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 2); + + // Get link info for XGMI connection + ARSMI_linkInfo info; + ASSERT_EQ(ARSMI_topo_get_link_info(0, 1, &info), 0); + + // Verify XGMI properties + ASSERT_EQ(info.type, ARSMI_IOLINK_TYPE_XGMI); + ASSERT_EQ(info.hops, 1); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, LinkTypeUndefined) { + RUN_ISOLATED_TEST( + "LinkTypeUndefined", + []() { + setupTestEnvironment(); + + // Remove existing links and create setup with only undefined link type + removeFile("0/io_links/0/properties"); + removeFile("0/io_links/1/properties"); + removeFile("1/io_links/0/properties"); + removeFile("1/io_links/1/properties"); + + // Create link with undefined type (must be read last to not be overwritten) + createDirectory("0/io_links/2"); + createFile("0/io_links/2/properties", + "type 99\n" // Undefined type (not 2 or 11) + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 1\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 50000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + ASSERT_EQ(ARSMI_init(), 0); + + ARSMI_linkInfo info; + ASSERT_EQ(ARSMI_topo_get_link_info(0, 1, &info), 0); + + // Should have undefined type + ASSERT_EQ(info.type, ARSMI_IOLINK_TYPE_UNDEFINED); + ASSERT_EQ(info.hops, 0); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, DeviceOrderingByBDF) { + RUN_ISOLATED_TEST( + "DeviceOrderingByBDF", + []() { + setupTestEnvironment(); + + // Test that devices are ordered by BDF correctly + ASSERT_EQ(ARSMI_init(), 0); + + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 2); + + // Get BDF for both devices + uint64_t bdf0 = 0, bdf1 = 0; + ASSERT_EQ(ARSMI_dev_pci_id_get(0, &bdf0), 0); + ASSERT_EQ(ARSMI_dev_pci_id_get(1, &bdf1), 0); + + // BDFs should be ordered (lower BDF first) + // Based on test data: node0 (domain=0, location_id=23552) comes before + // node1 (domain=1, location_id=23553) + ASSERT_LT(bdf0, bdf1); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, FileExistsCheck) { + RUN_ISOLATED_TEST( + "FileExistsCheck", + []() { + // Test fileExists() indirectly by verifying behavior when files don't exist + // This covers the fileExists(char const*) internal function + + removeTestSandbox(); + createDirectory(); + + // Scenario 1: Node with missing gpu_id file - should be skipped + createDirectory("0"); + // Don't create gpu_id file - fileExists() will return false for it + createFile("0/properties", + "unique_id 100\n" + "location_id 23552\n" + "domain 0\n" + "vendor_id 4098\n"); + createDirectory("0/io_links"); + + // Scenario 2: Node with missing properties file - should be skipped + createDirectory("1"); + createFile("1/gpu_id", "4098\n"); + // Don't create properties file - fileExists() will return false for it + createDirectory("1/io_links"); + + // Scenario 3: Complete valid node + createDirectory("2"); + createFile("2/gpu_id", "4098\n"); + createFile("2/properties", + "unique_id 102\n" + "location_id 23554\n" + "domain 2\n" + "vendor_id 4098\n"); + createDirectory("2/io_links"); + + AltRsmiTestUtils::ResetState(); + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + ASSERT_EQ(ARSMI_init(), 0); + + // Only the complete node should be discovered (fileExists filtered out the others) + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 1); + + cleanupTestEnvironment(); + } + ); +} + +TEST(AltRsmiTest, BDFSortingLambda) { + RUN_ISOLATED_TEST( + "BDFSortingLambda", + []() { + // Test the BDF sorting lambda comparator in ARSMI_init() + // The lambda at line 183-186 sorts devices with the SAME unique_id by BDF + // Create multiple partitions (same unique_id) with different BDF values in REVERSE order + removeTestSandbox(); + createDirectory(); + + // Create 4 nodes with the SAME unique_id but different location_ids (which affects BDF) + // to exercise the lambda that sorts within the same unique_id group + const std::string same_unique_id = "12345678901234567890"; + + // Node 0: Highest BDF (will need to be moved to end by lambda) + createDirectory("0"); + createFile("0/gpu_id", "4098\n"); + createFile("0/properties", + "unique_id " + same_unique_id + "\n" + "location_id 4294967040\n" // Very high value for high BDF + "domain 3\n" + "vendor_id 4098\n"); + createDirectory("0/io_links"); + + // Node 1: Second highest BDF + createDirectory("1"); + createFile("1/gpu_id", "4098\n"); + createFile("1/properties", + "unique_id " + same_unique_id + "\n" + "location_id 16777216\n" + "domain 2\n" + "vendor_id 4098\n"); + createDirectory("1/io_links"); + + // Node 2: Second lowest BDF + createDirectory("2"); + createFile("2/gpu_id", "4098\n"); + createFile("2/properties", + "unique_id " + same_unique_id + "\n" + "location_id 65536\n" + "domain 1\n" + "vendor_id 4098\n"); + createDirectory("2/io_links"); + + // Node 3: Lowest BDF (should be sorted to first by lambda) + createDirectory("3"); + createFile("3/gpu_id", "4098\n"); + createFile("3/properties", + "unique_id " + same_unique_id + "\n" + "location_id 256\n" + "domain 0\n" + "vendor_id 4098\n"); + createDirectory("3/io_links"); + + AltRsmiTestUtils::ResetState(); + AltRsmiTestUtils::SetNodesPath(kTestKFDPath); + + ASSERT_EQ(ARSMI_init(), 0); + + // All 4 nodes have the same unique_id, so they're all partitions of the same device + // ARSMI_num_devices counts unique devices, but ARSMI_orderedNodes has all partitions + uint32_t num_devices = 0; + ASSERT_EQ(ARSMI_get_num_devices(&num_devices), 0); + ASSERT_EQ(num_devices, 4); // All 4 partitions should be counted + + // Access ARSMI_orderedNodes directly to verify the lambda sorted by s_bdf + ASSERT_EQ(ARSMI_orderedNodes.size(), 4); + + // The lambda should have sorted these by s_bdf within the unique_id group + // Verify ascending order + ASSERT_LT(ARSMI_orderedNodes[0].s_bdf, ARSMI_orderedNodes[1].s_bdf); + ASSERT_LT(ARSMI_orderedNodes[1].s_bdf, ARSMI_orderedNodes[2].s_bdf); + ASSERT_LT(ARSMI_orderedNodes[2].s_bdf, ARSMI_orderedNodes[3].s_bdf); + + // Verify the sort reordered them: node 3 should be first (lowest BDF) + ASSERT_EQ(ARSMI_orderedNodes[0].s_node_id, 3); // Node 3 has domain 0, location 256 + ASSERT_EQ(ARSMI_orderedNodes[1].s_node_id, 2); // Node 2 has domain 1, location 65536 + ASSERT_EQ(ARSMI_orderedNodes[2].s_node_id, 1); // Node 1 has domain 2, location 16777216 + ASSERT_EQ(ARSMI_orderedNodes[3].s_node_id, 0); // Node 0 has domain 3, location 4294967040 + + // Verify they all have the same unique_id + ASSERT_EQ(ARSMI_orderedNodes[0].s_unique_id, ARSMI_orderedNodes[1].s_unique_id); + ASSERT_EQ(ARSMI_orderedNodes[1].s_unique_id, ARSMI_orderedNodes[2].s_unique_id); + ASSERT_EQ(ARSMI_orderedNodes[2].s_unique_id, ARSMI_orderedNodes[3].s_unique_id); + + cleanupTestEnvironment(); + } + ); +} + +} // namespace RcclUnitTesting diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt index 6ee377974f..cf89bc28c4 100644 --- a/projects/rccl/test/CMakeLists.txt +++ b/projects/rccl/test/CMakeLists.txt @@ -236,6 +236,25 @@ if(BUILD_TESTS) list(APPEND RCCL_TEST_EXECUTABLES rccl-UnitTestsMPI) endif() + + # rccl-UnitTestsAltRsmi: Uses TEST BUILD alt_rsmi.cc (ARSMI_TEST_BUILD) + # This separate executable compiles alt_rsmi.cc with ARSMI_TEST_BUILD, + # enabling external linkage of internal variables so that + # tests can access and manipulate them for testing. + list(APPEND RCCL_TEST_EXECUTABLES rccl-UnitTestsAltRsmi) + + set(TEST_ALTRSMI_SOURCE_FILES + AltRsmiTests.cpp + ../src/misc/alt_rsmi.cc + common/main_fixtures.cpp + common/EnvVars.cpp + common/ProcessIsolatedTestRunner.cpp + ) + + add_executable(rccl-UnitTestsAltRsmi ${TEST_ALTRSMI_SOURCE_FILES}) + + # Define ARSMI_TEST_BUILD specifically for rccl-UnitTestsAltRsmi + target_compile_definitions(rccl-UnitTestsAltRsmi PRIVATE ARSMI_TEST_BUILD) endif() foreach(test_executable IN LISTS RCCL_TEST_EXECUTABLES)