diff --git a/CMakeLists.txt b/CMakeLists.txt index d3ff9da34d..c058cdfbd7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -715,6 +715,57 @@ foreach(SRC_FILE ${SRC_FILES}) endif() endforeach() +# Adding custom target to hipify all the source files +# This is required to make sure that all the hipified source files are +# available before compiling the unit tests executable(s) +add_custom_target(hipify_all DEPENDS ${HIP_SOURCES}) + +if (BUILD_TESTS) + if (ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug") + set(HIPIFY_SRC_DIR "${PROJECT_BINARY_DIR}/hipify/src") + set(REPLACE_SCRIPT "${CMAKE_SOURCE_DIR}/tools/scripts/replace_static.sh") + message ("Replacing static functions in ${HIPIFY_SRC_DIR} with ${REPLACE_SCRIPT} for unit tests") + # Create a list of files which needs to be modified to remove static + set(TEST_NONSTATIC_SOURCE_FILES + ${HIPIFY_SRC_DIR}/misc/alt_rsmi.cc + ) + # Create a custom command to backup the original files and remove static + # Always run replace script on hipified files, but preserve original backups + foreach(srcfile ${TEST_NONSTATIC_SOURCE_FILES}) + add_custom_command( + OUTPUT "${srcfile}.staticbak" + COMMAND bash -c "\ + ${CMAKE_COMMAND} -E echo 'Processing ${srcfile} for static replacement' && \ + if [ ! -f '${srcfile}.staticbak' ]; then \ + ${CMAKE_COMMAND} -E copy '${srcfile}' '${srcfile}.staticbak' && \ + ${CMAKE_COMMAND} -E echo 'Created backup: ${srcfile}.staticbak'; \ + fi && \ + ${CMAKE_COMMAND} -E echo 'Exposing internal functions/variables in ${srcfile}' && \ + ${REPLACE_SCRIPT} ${srcfile} --replace-vars --exclude-list=${exclude_list} 2>&1 \ + " + DEPENDS "${srcfile}" "${REPLACE_SCRIPT}" hipify_all + COMMENT "Removing static from ${srcfile} (backup preserved at ${srcfile}.staticbak)" + VERBATIM + ) + list(APPEND STATIC_BAK_FILES "${srcfile}.staticbak") + endforeach() + add_custom_target(replace_static_in_hipify ALL DEPENDS ${STATIC_BAK_FILES}) + add_dependencies(replace_static_in_hipify hipify_all) + else() + # Restore originals in the build directory if backup exists + foreach(srcfile ${HIPIFY_SRC_FILES}) + if(EXISTS "${srcfile}.staticbak") + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy "${srcfile}.staticbak" "${srcfile}" + ) + execute_process( + COMMAND ${CMAKE_COMMAND} -E remove "${srcfile}.staticbak" + ) + endif() + endforeach() + endif() +endif() + # Generate device/host tables and all the collective functions that are going to be in librccl.so #================================================================================================== find_package(Python3 COMPONENTS Interpreter REQUIRED) @@ -769,6 +820,11 @@ add_library(rccl ${HIP_SOURCES}) ## Set RCCL dependencies add_dependencies(rccl git_version_check) # Execute git_version_check during build +if (BUILD_TESTS AND ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug") + ## Set static replacement dependency for fixture unit tests + add_dependencies(rccl replace_static_in_hipify) +endif() + ## Set RCCL include directories target_include_directories(rccl PRIVATE ${PROJECT_BINARY_DIR}/include) # for generated rccl.h header target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src) # for hipfied headers @@ -986,7 +1042,7 @@ if (HAVE_PARALLEL_JOBS) target_compile_options(rccl PRIVATE -parallel-jobs=12) endif() -if (ROCM_VERSION VERSION_GREATER_EQUAL "60200") +if (ROCM_VERSION VERSION_GREATER_EQUAL "60200") target_compile_options(rccl PRIVATE --offload-compress) # Compress GPU code at compile time. target_link_libraries(rccl PRIVATE --offload-compress) # Compress GPU code at link time. message(STATUS "--offload-compress enabled - ROCm version >= 6.2.0") @@ -999,7 +1055,7 @@ target_compile_options(rccl PRIVATE -Werror=sometimes-uninitialized) target_compile_options(rccl PRIVATE -Wall) target_compile_options(rccl PRIVATE -Werror=deprecated-copy-with-user-provided-copy) target_compile_options(rccl PRIVATE -Wno-format-nonliteral) -target_compile_options(rccl PRIVATE -fgpu-rdc) +target_compile_options(rccl PRIVATE -fgpu-rdc) # Generate relocatable device code (required for extern __shared__) ## Set RCCL compile and linker options for unit tests and code coverage if(ENABLE_CODE_COVERAGE) diff --git a/install.sh b/install.sh index cd31682ca1..d67759e0d7 100755 --- a/install.sh +++ b/install.sh @@ -332,15 +332,25 @@ fi # Optionally, run RCCL-UnitTests, if they're enabled. if [[ "${run_tests}" == true ]]; then - if [[ -x "./test/rccl-UnitTests" ]]; then - if [[ "${run_tests_all}" == true ]]; then + if [[ ! -x "./test/rccl-UnitTests" ]]; then + echo "RCCL-UnitTests have not been built yet; Please re-run script with \"-t\" to build the binary." + exit 1 + fi + if [[ "${build_release}" == false && ! -x "./test/rccl-UnitTestsFixtures" ]]; then + echo "RCCL-UnitTestsFixtures have not been built yet; Please re-run script with \"-t\" to build the binary." + exit 1 + fi + if [[ "${run_tests_all}" == true ]]; then + if [[ -x "./test/rccl-UnitTests" ]]; then ./test/rccl-UnitTests - else - ./test/rccl-UnitTests --gtest_filter="AllReduce.*" + fi + if [[ "${build_release}" == false && -x "./test/rccl-UnitTestsFixtures" ]]; then + ./test/rccl-UnitTestsFixtures fi else - echo "RCCL-UnitTests have not been built yet; Please re-run script with \"-t\" to build RCCL-UnitTests." - exit 1 + if [[ -x "./test/rccl-UnitTests" ]]; then + ./test/rccl-UnitTests --gtest_filter="AllReduce.*" + fi fi fi diff --git a/src/misc/alt_rsmi.cc b/src/misc/alt_rsmi.cc index 0961e42beb..0dffe07cd5 100644 --- a/src/misc/alt_rsmi.cc +++ b/src/misc/alt_rsmi.cc @@ -201,10 +201,7 @@ int ARSMI_init(void) ARSMI_orderedNodes.push_back(sort_vecs[j][k]); } break; - found = true; } - if (found) - continue; } } diff --git a/test/AltRsmiTests.cpp b/test/AltRsmiTests.cpp new file mode 100644 index 0000000000..6bc0373366 --- /dev/null +++ b/test/AltRsmiTests.cpp @@ -0,0 +1,906 @@ +/************************************************************************* + * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "alt_rsmi.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ARSMI_systemNode { + uint32_t s_node_id = 0; + uint64_t s_gpu_id = 0; + uint64_t s_unique_id = 0; + uint64_t s_location_id = 0; + uint64_t s_bdf = 0; + uint64_t s_domain = 0; + uint8_t s_bus = 0; + uint8_t s_device = 0; + uint8_t s_function = 0; + uint8_t s_partition_id = 0; + std::string s_card; +}; + +const char *kPathDRMRoot = "/sys/class/drm"; +const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes"; +uint32_t kAmdGpuId = 0x1002; + +// Vector containing data about each node, ordered by bdf ID +thread_local std::vector ARSMI_orderedNodes; + +// 2-D matrix with link information between each pair of nodes. +thread_local std::vector> ARSMI_orderedLinks; + +// Number of devices recognized +thread_local int ARSMI_num_devices = -1; + +int getNodeIndex(uint32_t node_id); + +std::string DevicePath(uint32_t dev_id); + +int isRegularFile(std::string fname, bool *is_reg); + +bool isNumber(const std::string &s); + +int openNodeFile(uint32_t dev_id, std::string node_file, std::ifstream *fs); + +int countIoLinks(uint32_t dev_id); + +int openLinkFile(uint32_t dev_id, uint32_t target_id, std::string node_file, + std::ifstream *fs); +int readGpuId(uint32_t node_id, uint64_t *gpu_id); + +bool isNodeSupported(uint32_t node_indx); + +int getPropertyValue(std::string property, uint64_t *value, + std::map &properties); + +bool fileExists(char const *filename); + +int ARSMI_readDeviceProperties(uint32_t node_id, + std::map &properties); + +int ARSMI_readLinkProperties(uint32_t node_id, uint32_t target_node_id, + std::map &properties); + +// /sys/class/kfd/kfd/topology/nodes/*/properties +int read_node_properties(uint32_t node, std::string property_name, + uint64_t *val, + std::map &properties); + +// /sys/class/kfd/kfd/topology/nodes/*/io_links/*/properties +int read_link_properties(uint32_t node, uint32_t target, + std::string property_name, uint64_t *val, + std::map &properties); + +// /sys/class/kfd/kfd/topology/nodes/*/gpu_id +int getGpuId(uint32_t node, uint64_t *gpu_id); + +namespace RcclUnitTesting { + +class AltRsmiTest : public ::testing::Test { + +protected: + // Helper function to create directories recursively + int createDirectory(const std::string &path) { + size_t pos = 0; + std::string currentPath; + + // Iterate through each component of the path + while ((pos = path.find('/', pos)) != std::string::npos) { + currentPath = path.substr(0, pos++); + if (!currentPath.empty() && mkdir(currentPath.c_str(), 0700) == -1 && + errno != EEXIST) { + return -1; // Return error if directory creation fails + } + } + + // Create the final directory + if (mkdir(path.c_str(), 0700) == -1 && errno != EEXIST) { + return -1; // Return error if directory creation fails + } + + return 0; // Success + } + + // Helper function to remove a directory recursively + int removeDirectory(const std::string &path) { + DIR *dir = opendir(path.c_str()); + if (!dir) { + std::cerr << "Failed to open directory: " << path << " (errno: " << errno + << ")" << std::endl; + return -1; + } + + struct dirent *entry; + while ((entry = readdir(dir)) != nullptr) { + // Skip "." and ".." entries + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) { + continue; + } + + std::string fullPath = path + "/" + entry->d_name; + + // Check if the entry is a directory + struct stat entryStat; + if (stat(fullPath.c_str(), &entryStat) == -1) { + std::cerr << "Failed to stat: " << fullPath << " (errno: " << errno + << ")" << std::endl; + closedir(dir); + return -1; + } + + if (S_ISDIR(entryStat.st_mode)) { + // Recursively remove subdirectory + if (removeDirectory(fullPath) == -1) { + closedir(dir); + return -1; + } + } else { + // Remove file + if (unlink(fullPath.c_str()) == -1) { + std::cerr << "Failed to remove file: " << fullPath + << " (errno: " << errno << ")" << std::endl; + closedir(dir); + return -1; + } + } + } + + closedir(dir); + + // Remove the directory itself + if (rmdir(path.c_str()) == -1) { + std::cerr << "Failed to remove directory: " << path + << " (errno: " << errno << ")" << std::endl; + return -1; + } + + return 0; // Success + } + + // Helper function to create a file with content + void createFile(const std::string &path, const std::string &content) { + std::ofstream file(path); + if (!file) { + std::cerr << "Failed to create file: " << path << ", errno: " << errno << std::endl; + return; + } + file << content; + file.close(); + } + + // Helper function to remove a file + int removeFile(const std::string &path) { + if (unlink(path.c_str()) == -1) { + std::cerr << "Failed to remove file: " << path << " (errno: " << errno + << ")" << std::endl; + return -1; // Return error if file removal fails + } + return 0; // Success + } + + // Function to create the test directory structure and files + void setupTestFiles() { + const std::string basePath = "/tmp/test_kfd/topology/nodes"; + + createDirectory(basePath); + + // Create node 0 with valid data + createDirectory(basePath + "/0"); + createFile(basePath + "/0/gpu_id", "4098\n"); + createFile(basePath + "/0/properties", "unique_id 16336014475442738425\n" + "location_id 23552\n" + "domain 2\n" + "vendor_id 4098\n"); + + createDirectory(basePath + "/0/io_links/0"); + createFile(basePath + "/0/io_links/0/properties", + "type 5\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 1\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 0\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + createDirectory(basePath + "/0/io_links/1"); + createFile(basePath + "/0/io_links/1/properties", + "type 2\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 0\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 64000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + createDirectory(basePath + "/1"); + createFile(basePath + "/1/properties", "unique_id 16336014475442738426\n" + "location_id 23553\n" + "domain 1\n" + "vendor_id 4098\n"); + + createDirectory(basePath + "/1/io_links/0"); + createFile(basePath + "/1/io_links/0/properties", + "type 5\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 1\n" + "node_to 0\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 0\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + createDirectory(basePath + "/1/io_links/1"); + createFile(basePath + "/1/io_links/1/properties", + "type 2\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 1\n" + "node_to 0\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 0\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist + createDirectory(basePath + "/" + std::to_string(invalid_dev_id) + "/io_links/"); + + ARSMI_num_devices = 2; + ARSMI_systemNode node0 = {0, 0, 0, 0, 200, 0, 0, 0, 0, 0, ""}; + ARSMI_systemNode node1 = {1, 0, 0, 0, 100, 0, 0, 0, 0, 0, ""}; + + ARSMI_orderedNodes.clear(); + ARSMI_orderedNodes.push_back(node0); // Node 0 + ARSMI_orderedNodes.push_back(node1); // Node 1 + + ARSMI_orderedLinks.clear(); + ARSMI_orderedLinks.resize(2); + + // Link info from node 0 to node 0 and node 1 + ARSMI_orderedLinks[0].push_back({0, 0, 0, ARSMI_IOLINK_TYPE_UNDEFINED, 0, 0, 0}); // self-link + ARSMI_orderedLinks[0].push_back({0, 1, 1, ARSMI_IOLINK_TYPE_PCIEXPRESS, 40, 1000, 2000}); // 0->1 + + // Link info from node 1 to node 0 and node 1 + ARSMI_orderedLinks[1].push_back({1, 0, 1, ARSMI_IOLINK_TYPE_PCIEXPRESS, 40, 1000, 2000}); // 1->0 + ARSMI_orderedLinks[1].push_back({1, 1, 0, ARSMI_IOLINK_TYPE_UNDEFINED, 0, 0, 0}); // self-link + } + + void SetUp() override { + // Redirect kKFDNodesPathRoot to a temporary directory for testing + kKFDNodesPathRoot = "/tmp/test_kfd/topology/nodes"; + + // Create the test directory structure and files + setupTestFiles(); + } + + void TearDown() override { + // Clean up the temporary directory + removeDirectory("/tmp/test_kfd"); + } +}; + +TEST_F(AltRsmiTest, ARSMIInitDefault) { + ARSMI_num_devices = -1; // Force uninitialized state + int result = ARSMI_init(); + + ASSERT_EQ(result, 0); +} + +TEST_F(AltRsmiTest, ARSMIInitMissingIoLinksPropertiesFile) { + ARSMI_num_devices = -1; // Force uninitialized state + // Remove properties file for io_links + removeFile("/tmp/test_kfd/topology/nodes/0/io_links/0/properties"); + removeFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties"); + + int result = ARSMI_init(); + + ASSERT_EQ(result, 0); +} + +TEST_F(AltRsmiTest, ARSMIInitMissingNodeToProperty) { + ARSMI_num_devices = -1; // Force uninitialized state + createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties", + "type 2\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + // "node_to 0\n" // Missing node_to + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 64000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + + ASSERT_EQ(result, 0); // Expect success +} + +TEST_F(AltRsmiTest, ARSMIInitMissingWeightProperty) { + ARSMI_num_devices = -1; // Force uninitialized state + createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties", + "type 2\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 0\n" + // "weight 21\n" // Missing weight + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 64000\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + + ASSERT_NE(result, 0); // Expect non-zero error code +} + +TEST_F(AltRsmiTest, ARSMIInitMissingTypeProperty) { + ARSMI_num_devices = -1; // Force uninitialized state + createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties", + // "type 5\n" // Missing type + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 0\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + "max_bandwidth 0\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + + ASSERT_NE(result, 0); // Expect non-zero error code +} + +TEST_F(AltRsmiTest, ARSMIInitTypePCIeProperty) { + ARSMI_num_devices = -1; // Force uninitialized state + int result = ARSMI_init(); + + ASSERT_EQ(result, 0); +} + +TEST_F(AltRsmiTest, ARSMIInitMissingMinBWProperty) { + ARSMI_num_devices = -1; // Force uninitialized state + createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties", + "type 11\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 0\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + // "min_bandwidth 0\n" // Missing min_bandwidth + "max_bandwidth 0\n" + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + + ASSERT_NE(result, 0); // Expect non-zero error code +} + +TEST_F(AltRsmiTest, ARSMIInitMissingMaxBWProperty) { + ARSMI_num_devices = -1; // Force uninitialized state + createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties", + "type 5\n" + "version_major 0\n" + "version_minor 0\n" + "node_from 0\n" + "node_to 0\n" + "weight 21\n" + "min_latency 0\n" + "max_latency 0\n" + "min_bandwidth 0\n" + // "max_bandwidth 0\n" // Missing max_bandwidth + "recommended_transfer_size 0\n" + "recommended_sdma_engine_id_mask 0\n" + "flags 0\n"); + + int result = ARSMI_init(); + + ASSERT_NE(result, 0); // Expect non-zero error code +} + +TEST_F(AltRsmiTest, ARSMIGetNumDevicesUninitialized) { + ARSMI_num_devices = -1; // Force uninitialized state + uint32_t num_devices = 0; + + int result = ARSMI_get_num_devices(&num_devices); + + // Verify that the function initializes successfully + ASSERT_EQ(result, 0); + + // Verify that the number of devices is correctly set + ASSERT_EQ(num_devices, ARSMI_num_devices); +} + +TEST_F(AltRsmiTest, ARSMIDevPciIdGetNullBdfId) { + uint32_t device_index = 0; + int result = ARSMI_dev_pci_id_get(device_index, nullptr); + + ASSERT_EQ(result, EINVAL); +} + +TEST_F(AltRsmiTest, ARSMIDevPciIdGetUninitialized) { + ARSMI_num_devices = -1; // Force uninitialized state + uint32_t device_index = 0; + uint64_t bdfid = 0; + + // Fail to initialize the function + // kKFDNodesPathRoot = "/invalid/path/to/file"; + kKFDNodesPathRoot = "/invalid/path/to/file"; + + int result = ARSMI_dev_pci_id_get(device_index, &bdfid); + + // Verify that the function fails to initialize and returns an error + ASSERT_NE(result, 0); +} + +TEST_F(AltRsmiTest, GetNodeIndexInvalidNode) { + uint32_t invalid_node_id = 9999; // Node ID that doesn't exist + int result = getNodeIndex(invalid_node_id); + ASSERT_EQ(result, -1); // Expect -1 for invalid node +} + +TEST_F(AltRsmiTest, DevicePathInvalidDeviceId) { + uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist + std::string path = DevicePath(invalid_dev_id); + ASSERT_FALSE(path.empty()); // Path should still be constructed, but it won't + // point to a valid device +} + +TEST_F(AltRsmiTest, IsRegularFileInvalidPath) { + std::string invalid_path = "/invalid/path/to/file"; + bool is_reg = false; + int result = isRegularFile(invalid_path, &is_reg); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_FALSE(is_reg); // Expect is_reg to be false +} + +TEST_F(AltRsmiTest, IsNumberInvalidInput) { + ASSERT_FALSE(isNumber("abc123")); // Non-numeric string + ASSERT_FALSE(isNumber("")); // Empty string + ASSERT_FALSE(isNumber(" ")); // Whitespace string +} + +TEST_F(AltRsmiTest, OpenNodeFileInvalidPath) { + std::ifstream fs; + int result = openNodeFile(9999, "invalid_file", &fs); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_FALSE(fs.is_open()); // File stream should not be open +} + +TEST_F(AltRsmiTest, OpenNodeFileNotRegularFile) { + removeFile("/tmp/test_kfd/topology/nodes/0/properties"); + + // Create a directory instead of a regular file + createDirectory("/tmp/test_kfd/topology/nodes/0/properties"); + + std::ifstream fs; + int result = openNodeFile(0, "properties", &fs); + + // Verify that the function returns ENOENT + ASSERT_EQ(result, ENOENT); +} + +TEST_F(AltRsmiTest, OpenNodeFileInvalidNodeFile) { + uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist + std::ifstream fs; + int result = + openNodeFile(invalid_dev_id, "invalid_file", &fs); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_FALSE(fs.is_open()); // File stream should not be open +} + +TEST_F(AltRsmiTest, CountIoLinksInvalidDeviceId) { + uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist + int result = countIoLinks(invalid_dev_id); + ASSERT_EQ(result, 0); // Expect 0 links for an invalid device +} + +TEST_F(AltRsmiTest, OpenLinkFileInvalidLinkFile) { + uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist + uint32_t invalid_target_id = 9999; // Target ID that doesn't exist + std::ifstream fs; + int result = openLinkFile(invalid_dev_id, invalid_target_id, + "invalid_file", &fs); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_FALSE(fs.is_open()); // File stream should not be open +} + +TEST_F(AltRsmiTest, OpenLinkFileInvalidPath) { + std::ifstream fs; + int result = openLinkFile(9999, 9999, "invalid_file", &fs); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_FALSE(fs.is_open()); // File stream should not be open +} + +TEST_F(AltRsmiTest, OpenLinkFileNotRegularFile) { + removeFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties"); + + // Create a directory instead of a regular file + createDirectory("/tmp/test_kfd/topology/nodes/0/io_links/1/properties"); + + std::ifstream fs; + int result = openLinkFile(0, 1, "properties", &fs); + + // Verify that the function returns ENOENT + ASSERT_EQ(result, ENOENT); +} + +TEST_F(AltRsmiTest, GetGpuIdInvalidNode) { + uint64_t gpu_id = 0; + int result = getGpuId(9999, &gpu_id); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_EQ(gpu_id, 0); // GPU ID should not be modified +} + +TEST_F(AltRsmiTest, GetGpuIdInvalidId) { + uint64_t *gpu_id = nullptr; + int result = getGpuId(9999, gpu_id); + ASSERT_NE(result, 0); // Expect non-zero error code +} + +TEST_F(AltRsmiTest, ReadGpuIdInvalidNode) { + uint32_t invalid_node_id = 9999; // Node ID that doesn't exist + uint64_t gpu_id = 0; + int result = readGpuId(invalid_node_id, &gpu_id); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_EQ(gpu_id, 0); // GPU ID should not be modified +} + +TEST_F(AltRsmiTest, ReadGpuIdInvalidData) { + // Create the directory structure + removeDirectory("/tmp/test_kfd"); + createDirectory("/tmp/test_kfd/topology/nodes/0"); + + // Create a gpu_id file with invalid (non-numeric) data + std::ofstream gpu_id_file("/tmp/test_kfd/topology/nodes/0/gpu_id"); + gpu_id_file << "invalid_gpu_id"; // Non-numeric data + gpu_id_file.close(); + + uint64_t gpu_id = 0; + + // Call the readGpuId function + int result = readGpuId(0, &gpu_id); + + // Verify that the function returns ENXIO + ASSERT_EQ(result, ENXIO); +} + +TEST_F(AltRsmiTest, IsNodeSupportedInvalidNode) { + uint32_t invalid_node_id = 9999; // Node ID that doesn't exist + bool result = isNodeSupported(invalid_node_id); + ASSERT_FALSE(result); // Expect false for unsupported node +} + +TEST_F(AltRsmiTest, IsNodeSupportedEmptyFile) { + // Create an empty properties file + std::ofstream properties_file("/tmp/test_kfd/topology/nodes/0/properties"); + properties_file.close(); + + // Call the isNodeSupported function + bool result = isNodeSupported(0); + + // Verify that the function returns false for an empty file + ASSERT_FALSE(result); +} + +TEST_F(AltRsmiTest, GetPropertyValueInvalidProperty) { + std::map properties = {{"valid_property", 12345}}; + uint64_t value = 0; + int result = + getPropertyValue("invalid_property", &value, properties); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_EQ(value, 0); // Value should not be modified +} + +TEST_F(AltRsmiTest, GetPropertyValueNullValuePointer) { + std::map properties = {{"key1", 12345}}; + uint64_t *value = nullptr; + + // Call the function with a null value pointer + int result = getPropertyValue("key1", value, properties); + + // Verify that the function returns EINVAL + ASSERT_EQ(result, EINVAL); +} + +TEST_F(AltRsmiTest, GetPropertyValueEmptyPropertiesMap) { + std::map properties; // Empty map + uint64_t value = 0; + + // Call the function with an empty properties map + int result = getPropertyValue("key1", &value, properties); + + // Verify that the function returns EINVAL + ASSERT_EQ(result, EINVAL); +} + +TEST_F(AltRsmiTest, GetPropertyValueKeyNotFound) { + std::map properties = {{"key1", 12345}}; + uint64_t value = 0; + + // Call the function with a key that does not exist in the map + int result = getPropertyValue("key2", &value, properties); + + // Verify that the function returns EINVAL + ASSERT_EQ(result, EINVAL); +} + +TEST_F(AltRsmiTest, FileExistsInvalidPath) { + const char *invalid_path = "/invalid/path/to/file"; + bool result = fileExists(invalid_path); + ASSERT_FALSE(result); // Expect false for non-existent file +} + +TEST_F(AltRsmiTest, ARSMIReadDevicePropertiesInvalidNode) { + uint32_t invalid_node_id = 9999; // Node ID that doesn't exist + std::map properties; + int result = + ARSMI_readDeviceProperties(invalid_node_id, properties); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_TRUE(properties.empty()); // Properties map should remain empty +} + +TEST_F(AltRsmiTest, ARSMI_readDevicePropertiesNotRegularFile) { + // Clean up + removeFile("/tmp/test_kfd/topology/nodes/0/properties"); + + // Create a directory instead of a regular file + createDirectory("/tmp/test_kfd/topology/nodes/0/properties"); + + std::map properties = {{"unique_id", 12345}, + {"location_id", 67890}}; + + std::ifstream fs; + int result = ARSMI_readDeviceProperties(0, properties); + + // Verify that the function returns ENOENT + ASSERT_EQ(result, ENOENT); +} + +TEST_F(AltRsmiTest, ARSMI_readDevicePropertiesEmptyFile) { + createFile("/tmp/test_kfd/topology/nodes/0/properties", ""); + + std::map properties; + + std::ifstream fs; + int result = ARSMI_readDeviceProperties(0, properties); + + // Verify that the function handles empty lines correctly + ASSERT_EQ(result, ENOENT); +} + +TEST_F(AltRsmiTest, ARSMI_readDevicePropertiesTrailingEmptyLines) { + createFile("/tmp/test_kfd/topology/nodes/0/properties", "key1 101\n" + "key2 102\n" + " \n" + "\n"); + + std::map properties; + + std::ifstream fs; + int result = ARSMI_readDeviceProperties(0, properties); + + // Verify that the function handles empty lines correctly + ASSERT_EQ(result, 0); +} + +TEST_F(AltRsmiTest, ARSMIReadLinkPropertiesInvalidLink) { + uint32_t invalid_node_id = 9999; // Node ID that doesn't exist + uint32_t invalid_target_id = 9999; // Target ID that doesn't exist + std::map properties; + int result = ARSMI_readLinkProperties( + invalid_node_id, invalid_target_id, properties); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_TRUE(properties.empty()); // Properties map should remain empty +} + +TEST_F(AltRsmiTest, ARSMI_readLinkPropertiesNotRegularFile) { + // Clean up + removeFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties"); + + // Create a directory instead of a regular file + createDirectory("/tmp/test_kfd/topology/nodes/0/properties"); + + std::map properties = {{"unique_id", 12345}, + {"location_id", 67890}}; + + std::ifstream fs; + int result = ARSMI_readLinkProperties(0, 1, properties); + + // Verify that the function returns ENOENT + ASSERT_EQ(result, ENOENT); + + // Clean up + rmdir("/tmp/test_kfd/0/io_links/1/properties"); +} + +TEST_F(AltRsmiTest, ARSMI_readLinkPropertiesTrailingEmptyLine) { + + createFile("/tmp/test_kfd/topology/nodes/0/io_links/0/properties", + "key1 101\n" + "key2 102\n" + " \n"); + + std::map properties; + + std::ifstream fs; + int result = ARSMI_readLinkProperties(0, 0, properties); + + // Verify that the function handles empty lines correctly + ASSERT_EQ(result, 0); +} + +TEST_F(AltRsmiTest, ARSMI_readLinkPropertiesEmptyFile) { + createFile("/tmp/test_kfd/topology/nodes/0/io_links/0/properties", ""); + + std::map properties; + + std::ifstream fs; + int result = ARSMI_readLinkProperties(0, 0, properties); + + ASSERT_EQ(result, ENOENT); +} + +TEST_F(AltRsmiTest, ReadNodePropertiesInvalidProperty) { + std::map properties = {{"unique_id", 12345}, + {"location_id", 67890}}; + uint64_t value = 0; + + // Call the wrapper function with an invalid property name + int result = read_node_properties(0, "invalid_property", + &value, properties); + + // Verify that the function fails for an invalid property name + ASSERT_NE(result, 0); +} + +TEST_F(AltRsmiTest, ReadNodePropertiesInvalidNode) { + uint32_t invalid_node_id = 9999; // Node ID that doesn't exist + std::map properties; + uint64_t value = 0; + int result = read_node_properties( + invalid_node_id, "valid_property", &value, properties); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_EQ(value, 0); // Value should not be modified +} + +TEST_F(AltRsmiTest, ReadNodePropertiesInvalidPropertyValue) { + uint32_t invalid_node_id = 9999; // Node ID that doesn't exist + std::map properties; + uint64_t *value = nullptr; + int result = read_node_properties(invalid_node_id, "", value, + properties); + ASSERT_EQ(result, EINVAL); // Expect non-zero error code +} + +TEST_F(AltRsmiTest, ReadLinkPropertiesInvalidLink) { + uint32_t invalid_node_id = 9999; // Node ID that doesn't exist + uint32_t invalid_target_id = 9999; // Target ID that doesn't exist + std::map properties; + uint64_t value = 0; + int result = read_link_properties( + invalid_node_id, invalid_target_id, "valid_property", &value, properties); + ASSERT_NE(result, 0); // Expect non-zero error code + ASSERT_EQ(value, 0); // Value should not be modified +} + +TEST_F(AltRsmiTest, ReadLinkPropertiesInvalidPropertyValue) { + uint32_t invalid_node_id = 9999; // Node ID that doesn't exist + uint32_t invalid_target_id = 9999; // Target ID that doesn't exist + std::map properties; + uint64_t *value = nullptr; + int result = read_link_properties( + invalid_node_id, invalid_target_id, "", value, properties); + ASSERT_EQ(result, EINVAL); // Expect non-zero error code +} + +TEST_F(AltRsmiTest, NullInfoPointer) { + int result = ARSMI_topo_get_link_info(0, 1, nullptr); + ASSERT_EQ(result, EINVAL); // Expect EINVAL for null `info` pointer +} + +TEST_F(AltRsmiTest, SourceDeviceIndexOutOfRange) { + ARSMI_linkInfo info; + ARSMI_num_devices = + 2; // Simulate initialized state with two devices + int result = ARSMI_topo_get_link_info(999, 1, &info); // Invalid source index + ASSERT_EQ(result, EINVAL); // Expect EINVAL for out-of-range source index +} + +TEST_F(AltRsmiTest, DestinationDeviceIndexOutOfRange) { + ARSMI_linkInfo info; + ARSMI_num_devices = + 2; // Simulate initialized state with two devices + int result = + ARSMI_topo_get_link_info(0, 999, &info); // Invalid destination index + ASSERT_EQ(result, EINVAL); // Expect EINVAL for out-of-range destination index +} + +TEST_F(AltRsmiTest, UninitializedNumDevices) { + + kKFDNodesPathRoot = + "/tmp/invalid_path"; // Simulate invalid path + + ARSMI_linkInfo info; + ARSMI_num_devices = -1; // Simulate uninitialized state + int result = ARSMI_topo_get_link_info(0, 0, &info); + ASSERT_NE(result, 0); // Expect non-zero error code for uninitialized state +} + +TEST_F(AltRsmiTest, InvalidLinkInfo) { + + // Initialize ARSMI_orderedLinks with data not in order + ARSMI_orderedLinks = { + { + {1, 0, 0, ARSMI_IOLINK_TYPE_UNDEFINED, 0, 0, + 0}, // No link from Device 1 to itself + {1, 0, 1, ARSMI_IOLINK_TYPE_PCIEXPRESS, 40, 1000, 2000} + // Link from Device 1 to Device 0 + }, + { + {0, 1, 1, ARSMI_IOLINK_TYPE_PCIEXPRESS, 40, 1000, + 2000}, // Link from Device 0 to Device 1 + {0, 0, 0, ARSMI_IOLINK_TYPE_UNDEFINED, 0, 0, 0} + // No link from Device 0 to itself + }}; + + // Leave ARSMI_orderedLinks uninitialized + ARSMI_linkInfo info; + int result = ARSMI_topo_get_link_info(0, 1, &info); + ASSERT_EQ(info.hops, 2); // Expect default values for uninitialized link info + ASSERT_EQ(info.type, ARSMI_IOLINK_TYPE_PCIEXPRESS); + ASSERT_EQ(info.weight, 40); + ASSERT_EQ(info.min_bandwidth, 0); + ASSERT_EQ(info.max_bandwidth, 0); +} + +} // namespace RcclUnitTesting diff --git a/test/ArgCheckTests.cpp b/test/ArgCheckTests.cpp new file mode 100644 index 0000000000..dd9bc9bcf3 --- /dev/null +++ b/test/ArgCheckTests.cpp @@ -0,0 +1,327 @@ +/************************************************************************* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include + +#include "argcheck.h" +#include "comm.h" +#include + +class ArgCheckTest : public ::testing::Test { +protected: + ncclComm_t comm; + struct ncclInfo *info; + int *sendDevicePtr = nullptr; + int *recvDevicePtr = nullptr; + + // Helper function to set up valid ncclInfo for boundary testing + void SetupValidInfo() { + // Set up valid info structure + info->comm = comm; + info->root = 0; // Valid root + info->datatype = (ncclDataType_t)0; // Valid datatype + info->op = (ncclRedOp_t)0; // Valid reduction operation + info->coll = ncclFuncBroadcast; // Valid collective operation + info->sendbuff = nullptr; // Will be set per test if needed + info->recvbuff = nullptr; // Will be set per test if needed + info->count = 10; // Valid count + info->opName = "TestOp"; // Valid operation name + } + + // Helper function for tests requiring device memory + void SetupValidBufferWithDeviceMemory() { + // Set the active device to match comm->cudaDev + hipError_t errSetDevice = hipSetDevice(comm->cudaDev); + ASSERT_EQ(errSetDevice, hipSuccess); + + // Allocate device memory + hipError_t errSend = hipMalloc(&sendDevicePtr, sizeof(int)); + ASSERT_EQ(errSend, hipSuccess); + hipError_t errRecv = hipMalloc(&recvDevicePtr, sizeof(int)); + ASSERT_EQ(errRecv, hipSuccess); + + // Set device pointers + info->sendbuff = sendDevicePtr; + info->recvbuff = recvDevicePtr; + } + + // Helper to clean up device memory + void CleanupDeviceMemory() { + if (sendDevicePtr) { + hipFree(sendDevicePtr); + sendDevicePtr = nullptr; + } + if (recvDevicePtr) { + hipFree(recvDevicePtr); + recvDevicePtr = nullptr; + } + } + + void SetUp() override { + // Allocate and zero-initialize ncclComm as a pointer + comm = (struct ncclComm *)calloc(1, sizeof(struct ncclComm)); + ASSERT_NE(comm, nullptr) << "Failed to allocate ncclComm"; + + // Initialize the communicator with required fields + comm->cudaDev = 0; + comm->nRanks = 4; + comm->checkPointers = true; + comm->rank = 0; + + comm->startMagic = NCCL_MAGIC; + comm->endMagic = NCCL_MAGIC; + + // Verify the magic values were set correctly + ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic not set correctly"; + ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic not set correctly"; + + // Allocate and zero-initialize ncclInfo as a pointer + info = (ncclInfo *)calloc(1, sizeof(ncclInfo)); + ASSERT_NE(info, nullptr) << "Failed to allocate ncclInfo"; + + SetupValidInfo(); + + SetupValidBufferWithDeviceMemory(); + } + + void TearDown() override { + // Free the allocated memory + CleanupDeviceMemory(); + if (info) { + free(info); + info = nullptr; + } + if (comm) { + free(comm); + comm = nullptr; + } + } +}; + +TEST_F(ArgCheckTest, CudaPtrCheck_ValidPointer) { + int *devicePtr = nullptr; + hipError_t err = hipMalloc(&devicePtr, sizeof(int)); + ASSERT_EQ(err, hipSuccess); + + ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp"); + EXPECT_EQ(result, ncclSuccess); + + hipFree(devicePtr); +} + +TEST_F(ArgCheckTest, CudaPtrCheck_NullPointer) { + ncclResult_t result = CudaPtrCheck(nullptr, comm, "invalidPtr", "TestOp"); + EXPECT_EQ(result, ncclInvalidArgument); +} + +TEST_F(ArgCheckTest, CudaPtrCheck_DifferentDevicePointer) { + int *devicePtr = nullptr; + hipSetDevice(1); + hipError_t err = hipMalloc(&devicePtr, sizeof(int)); + ASSERT_EQ(err, hipSuccess); + + ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp"); + EXPECT_EQ(result, ncclInvalidArgument); + + hipFree(devicePtr); + hipSetDevice(comm->cudaDev); +} + +TEST_F(ArgCheckTest, CudaPtrCheck_HostMemoryPointer) { + // Test with host memory instead of device memory + int *hostPtr = (int *)malloc(sizeof(int)); + ASSERT_NE(hostPtr, nullptr) << "Failed to allocate host memory"; + + *hostPtr = 42; // Initialize the memory + + // This should fail because host memory is not device memory + ncclResult_t result = CudaPtrCheck(hostPtr, comm, "hostPtr", "TestOp"); + + // Host memory should be rejected by CudaPtrCheck + EXPECT_EQ(result, ncclInvalidArgument) + << "Host memory should be rejected by CudaPtrCheck"; + + free(hostPtr); +} + +TEST_F(ArgCheckTest, PtrCheck_ValidPointer) { + int value = 42; + ncclResult_t result = PtrCheck(&value, "TestOp", "value"); + ASSERT_EQ(result, ncclSuccess); +} + +TEST_F(ArgCheckTest, PtrCheck_NullPointer) { + ncclResult_t result = PtrCheck(nullptr, "TestOp", "value"); + ASSERT_EQ(result, ncclInvalidArgument); +} + +TEST_F(ArgCheckTest, CommCheck_ValidComm) { + comm->startMagic = NCCL_MAGIC; + comm->endMagic = NCCL_MAGIC; + + // Verify magic values are still correct (should be set in SetUp()) + ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic was corrupted"; + ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic was corrupted"; + + // Call CommCheck and verify the result + ncclResult_t result = CommCheck(comm, "TestOp", "testComm"); + EXPECT_EQ(result, ncclSuccess) << "Failed for valid communicator"; +} + +TEST_F(ArgCheckTest, CommCheck_NullComm) { + ncclResult_t result = CommCheck(nullptr, "TestOp", "comm"); + ASSERT_EQ(result, ncclInvalidArgument); +} + +TEST_F(ArgCheckTest, CommCheck_CorruptedStartMagic) { + // Corrupt only startMagic, keep endMagic valid + comm->startMagic = 1; // Corrupt startMagic + comm->endMagic = NCCL_MAGIC; // Keep endMagic valid + + // Call CommCheck and verify the result + ncclResult_t result = CommCheck(comm, "TestOp", "comm"); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted startMagic"; +} + +TEST_F(ArgCheckTest, CommCheck_CorruptedEndMagic) { + // Keep startMagic valid, corrupt only endMagic + comm->startMagic = NCCL_MAGIC; // Keep startMagic valid + comm->endMagic = 1; // Corrupt endMagic + + // Call CommCheck and verify the result + ncclResult_t result = CommCheck(comm, "TestOp", "comm"); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted endMagic"; +} + +TEST_F(ArgCheckTest, CommCheck_CorruptedBothMagics) { + // Corrupt both startMagic and endMagic + comm->startMagic = 1; // Corrupt startMagic + comm->endMagic = 1; // Corrupt endMagic + + // Call CommCheck and verify the result + ncclResult_t result = CommCheck(comm, "TestOp", "comm"); + EXPECT_EQ(result, ncclInvalidArgument) + << "Failed for corrupted both magic values"; +} + +TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_NegativeValue) { + info->root = -1; // Invalid root (< 0) + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root < 0"; +} + +TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_ExceedsNRanks) { + info->root = comm->nRanks; // Invalid root (>= nRanks) + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root >= nRanks"; +} + +TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_NegativeValue) { + info->datatype = (ncclDataType_t)-1; // Invalid datatype (< 0) + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype < 0"; +} + +TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_ExceedsMaxValue) { + info->datatype = + (ncclDataType_t)ncclNumTypes; // Invalid datatype (>= ncclNumTypes) + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclInvalidArgument) + << "Failed for invalid datatype >= ncclNumTypes"; +} + +TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_NegativeValue) { + info->op = (ncclRedOp_t)-1; // Invalid reduction operation (< 0) + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclInvalidArgument) + << "Failed for invalid reduction operation < 0"; +} + +TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_ExceedsMaxValue) { + info->op = + (ncclRedOp_t)ncclNumOps; // Invalid reduction operation (>= ncclNumOps) + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclInvalidArgument) + << "Failed for invalid reduction operation >= ncclNumOps"; +} + +TEST_F(ArgCheckTest, ArgsCheck_InvalidCommunicatorPointers) { + info->op = (ncclRedOp_t)0; // Valid reduction operation + if (info->sendbuff) { + hipFree((void *)info->sendbuff); + info->sendbuff = nullptr; // Invalid send buffer + } + if (info->recvbuff) { + hipFree((void *)info->recvbuff); + info->recvbuff = nullptr; // Invalid receive buffer + } + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclInvalidArgument) + << "Failed for invalid communicator pointers"; +} + +TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperationOutOfRange) { + info->op = (ncclRedOp_t)5; // Invalid reduction operation (out of range) + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclInvalidArgument) + << "Failed for invalid reduction operation"; +} + +TEST_F(ArgCheckTest, ArgsCheck_UserDefinedReductionOperationInvalid) { + // Test case: User-defined reduction operation with freeNext != -1 + info->op = (ncclRedOp_t)(ncclNumOps + + 1); // Set op to a user-defined reduction operation + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclInvalidArgument) + << "Failed for user-defined reduction operation with freeNext != -1"; +} + +TEST_F(ArgCheckTest, ArgsCheck_SendAndRecvFunction) { + info->recvbuff = + recvDevicePtr; // Use allocated device pointer for receive buffer + + // Test both ncclFuncSend and ncclFuncRecv + for (auto coll : {ncclFuncSend, ncclFuncRecv}) { + info->coll = coll; // Set the collective operation + + // Call ArgsCheck and verify the result + ncclResult_t result = ArgsCheck(info); + ASSERT_EQ(result, ncclSuccess) << "Failed for coll = " << coll; + } +} + +TEST_F(ArgCheckTest, ArgsCheck_CollNotReduce) { + // Case: info->coll != ncclFuncReduce + info->coll = ncclFuncBroadcast; // Set coll to ncclFuncBroadcast + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclSuccess) << "Failed for coll != ncclFuncReduce"; +} + +TEST_F(ArgCheckTest, ArgsCheck_ReduceCollWithRootRank) { + // Case: info->coll == ncclFuncReduce and info->comm->rank == info->root + info->coll = ncclFuncReduce; // Set coll to ncclFuncReduce + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclSuccess) + << "Failed for coll == ncclFuncReduce and rank == root"; +} + +TEST_F(ArgCheckTest, ArgsCheck_ReduceCollWithNonRootRank) { + comm->rank = 1; // Set rank to 1 (non-root) + + ncclResult_t result = ArgsCheck(info); + EXPECT_EQ(result, ncclSuccess) + << "Failed for coll == ncclFuncReduce and rank != root"; +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 31b14baf68..1d1a73a1d2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.12) if(BUILD_TESTS) option(OPENMP_TESTS_ENABLED "Enable OpenMP for unit tests" OFF) - + message("Building rccl unit tests (Installed in /test/rccl-UnitTests)") find_package(hsa-runtime64 PATHS /opt/rocm ) @@ -56,63 +56,96 @@ if(BUILD_TESTS) common/TestBed.cpp common/TestBedChild.cpp common/StandaloneUtils.cpp - ../src/misc/recorder.cc proxy_trace/ProxyTraceUnitTests.cpp - ../src/misc/proxy_trace/proxy_trace.cc ) - # Append a file if BUILD_TESTS is ON and build type is not Debug - # Visibility is hidden by default, so we need to explicitly add the recorder.cc file - # to the unit tests to ensure it is included for unit test execution - if(BUILD_TESTS AND NOT CMAKE_BUILD_TYPE MATCHES "Debug") + # Append file if build type is not Debug. + # Visibility is hidden by default, so we need to explicitly add the following file(s) + # to the unit tests to ensure it is included for the existing rccl-UnitTests execution + if(NOT CMAKE_BUILD_TYPE MATCHES "Debug") list(APPEND TEST_SOURCE_FILES ../src/misc/recorder.cc + ../src/misc/proxy_trace/proxy_trace.cc ) endif() + # Add rccl-UnitTests binary add_executable(rccl-UnitTests ${TEST_SOURCE_FILES}) - ## Set rccl-UnitTests include directories - target_include_directories(rccl-UnitTests PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS}) - target_include_directories(rccl-UnitTests PRIVATE ${PROJECT_BINARY_DIR}/include) # for generated rccl.h header - target_include_directories(rccl-UnitTests PRIVATE ${PROJECT_BINARY_DIR}/hipify/src/include) # for rccl_bfloat16.h - target_include_directories(rccl-UnitTests PRIVATE ${PROJECT_BINARY_DIR}/hipify/src/include/plugin) # for recorder tests + set(RCCL_TEST_EXECUTABLES rccl-UnitTests) - ## Set rccl-UnitTests compile definitions - if(LL128_ENABLED) - target_compile_definitions(rccl-UnitTests PRIVATE ENABLE_LL128) - endif() - if(OPENMP_TESTS_ENABLED) - target_compile_definitions(rccl-UnitTests PRIVATE ENABLE_OPENMP) - endif() - target_compile_definitions(rccl-UnitTests PRIVATE ROCM_PATH="${ROCM_PATH}") + # Create rccl-UnitTestsFixtures binary if ROCm version is 4.6.0 or greater + # and build type is Debug + if (ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug") - ## Set rccl-UnitTests compile definitions - if(OPENMP_TESTS_ENABLED) - target_compile_options(rccl-UnitTests PRIVATE "${OpenMP_CXX_FLAGS}") + set(TEST_FIXTURE_SOURCE_FILES + AltRsmiTests.cpp + ArgCheckTests.cpp + common/main_fixtures.cpp + common/EnvVars.cpp + ) + + # Add rccl-UnitTestsFixtures binary + add_executable(rccl-UnitTestsFixtures ${TEST_FIXTURE_SOURCE_FILES}) + + list(APPEND RCCL_TEST_EXECUTABLES rccl-UnitTestsFixtures) + + add_dependencies(rccl-UnitTestsFixtures replace_static_in_hipify) endif() - ## Set rccl-UnitTests linked libraries - target_link_libraries(rccl-UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES}) - target_link_libraries(rccl-UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64) - target_link_libraries(rccl-UnitTests PRIVATE Threads::Threads) - target_link_libraries(rccl-UnitTests PRIVATE dl) - target_link_libraries(rccl-UnitTests PRIVATE fmt::fmt-header-only) - if(OPENMP_TESTS_ENABLED) - target_link_libraries(rccl-UnitTests PRIVATE "${OpenMP_CXX_FLAGS}") - endif() + ## Set include directories for the target(s) + foreach(target ${RCCL_TEST_EXECUTABLES}) + target_include_directories(${target} PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS}) + target_include_directories(${target} PRIVATE ${PROJECT_BINARY_DIR}/include) # for generated rccl.h header + target_include_directories(${target} PRIVATE ${PROJECT_BINARY_DIR}/hipify/src/include) # for rccl_bfloat16.h + target_include_directories(${target} PRIVATE ${PROJECT_BINARY_DIR}/hipify/src/include/plugin) # for recorder tests + + # Get the compile definitions from the main rccl target + # These helps to keep the test compile definitions in sync with the main rccl target + # Also, all the structure layout remains the same across all the targets + get_target_property(RCCL_COMPILE_DEFINITIONS rccl COMPILE_DEFINITIONS) + if(RCCL_COMPILE_DEFINITIONS) + target_compile_definitions(${target} PRIVATE ${RCCL_COMPILE_DEFINITIONS}) + endif() + + # Also get interface compile definitions + get_target_property(RCCL_INTERFACE_COMPILE_DEFINITIONS rccl INTERFACE_COMPILE_DEFINITIONS) + if(RCCL_INTERFACE_COMPILE_DEFINITIONS) + target_compile_definitions(${target} PRIVATE ${RCCL_INTERFACE_COMPILE_DEFINITIONS}) + endif() + + ## Set compile definitions + if(LL128_ENABLED) + target_compile_definitions(${target} PRIVATE ENABLE_LL128) + endif() + if(OPENMP_TESTS_ENABLED) + target_compile_definitions(${target} PRIVATE ENABLE_OPENMP) + endif() + target_compile_definitions(${target} PRIVATE ROCM_PATH="${ROCM_PATH}") + + ## Set rccl unittests linked libraries + target_link_libraries(${target} PRIVATE ${GTEST_BOTH_LIBRARIES}) + target_link_libraries(${target} PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64) + target_link_libraries(${target} PRIVATE Threads::Threads) + target_link_libraries(${target} PRIVATE dl) + target_link_libraries(${target} PRIVATE fmt::fmt-header-only) + if(OPENMP_TESTS_ENABLED) + target_link_libraries(${target} PRIVATE "${OpenMP_CXX_FLAGS}") + endif() + + # Link rccl library + if(BUILD_SHARED_LIBS) + target_link_libraries(${target} PRIVATE rccl) + set_property(TARGET ${target} PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib;${CMAKE_BINARY_DIR}") + else() + add_dependencies(${target} rccl) + target_link_libraries(${target} PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/lib -L${ROCM_PATH}/rocm_smi/lib) + endif() + set_property(TARGET ${target} PROPERTY BUILD_RPATH "${CMAKE_BINARY_DIR};${ROCM_PATH}/lib") + + # Install the binary + rocm_install(TARGETS ${target} COMPONENT tests) + endforeach() - # rccl-UnitTests using static library of rccl requires passing rccl - # through -l and -L instead of command line input. - if(BUILD_SHARED_LIBS) - target_link_libraries(rccl-UnitTests PRIVATE rccl) - set_property(TARGET rccl-UnitTests PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib;${CMAKE_BINARY_DIR}") - else() - add_dependencies(rccl-UnitTests rccl) - target_link_libraries(rccl-UnitTests PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/lib -L${ROCM_PATH}/rocm_smi/lib) - endif() - set_property(TARGET rccl-UnitTests PROPERTY BUILD_RPATH "${CMAKE_BINARY_DIR};${ROCM_PATH}/lib") - rocm_install(TARGETS rccl-UnitTests COMPONENT tests) -else() - message("Not building rccl unit tests") endif() + diff --git a/test/common/StandaloneUtils.hpp b/test/common/StandaloneUtils.hpp index c2615a6cbd..c76957e8b9 100644 --- a/test/common/StandaloneUtils.hpp +++ b/test/common/StandaloneUtils.hpp @@ -25,7 +25,7 @@ } \ } while(0) -#define MAX_STACK_SIZE 570 +#define MAX_STACK_SIZE 640 #ifdef ENABLE_LL128 #define MAX_STACK_SIZE_gfx90a 360 diff --git a/test/common/main_fixtures.cpp b/test/common/main_fixtures.cpp new file mode 100644 index 0000000000..ab12a51173 --- /dev/null +++ b/test/common/main_fixtures.cpp @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + + #include + #include "EnvVars.hpp" + + int main(int argc, char **argv) + { + ::testing::InitGoogleTest(&argc, argv); + RcclUnitTesting::EnvVars ev; + ev.ShowConfig(); + int retCode = RUN_ALL_TESTS(); + + // Show timing information + + if (ev.showTiming) + { + size_t totalTimeMsec = 0; + fflush(stdout); + printf("[ TIMING ] %-20s: %-20s: %10s ms (%s)\n", "TEST SUITE", "TEST NAME", "TIME", "STATUS"); + auto unitTest = ::testing::UnitTest::GetInstance(); + for (int i = 0; i < unitTest->total_test_suite_count(); i++) + { + auto suiteInfo = unitTest->GetTestSuite(i); + if (!suiteInfo->should_run()) continue; + + for (int j = 0; j < suiteInfo->total_test_count(); j++) + { + auto testInfo = suiteInfo->GetTestInfo(j); + if (!testInfo->should_run()) continue; + auto testResult = testInfo->result(); + if (testResult->Skipped()) continue; + printf("[ TIMING ] %-20s: %-20s: %10.2f sec (%4s)\n", testInfo->test_suite_name(), testInfo->name(), testResult->elapsed_time() / 1000.0, testResult->Passed() ? "PASS" : "FAIL"); + } + printf("[ TIMING ] %-20s: %-20s: %10.2f sec (%4s)\n", suiteInfo->name(), "TOTAL", suiteInfo->elapsed_time() / 1000.0, suiteInfo->Passed() ? "PASS" : "FAIL"); + totalTimeMsec += suiteInfo->elapsed_time(); + } + printf("[ TIMING ] Total time: %10.2f minutes\n", totalTimeMsec / (60 * 1000.0)); + } + return retCode; + } + diff --git a/tools/scripts/replace_static.sh b/tools/scripts/replace_static.sh new file mode 100755 index 0000000000..58269756b9 --- /dev/null +++ b/tools/scripts/replace_static.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +# Usage: +# ./replace_static_functions.sh [--replace-vars] [--verbose] +# +# - Replaces all 'static' function definitions with non-static. +# - Replaces all 'static inline' with 'inline'. +# - If --replace-vars is given, also replaces 'static' at variable definitions. +# - If --verbose is given, shows a diff of the changes. + +set -e + +SOURCE_FILE="$1" +REPLACE_VARS=0 +VERBOSE=0 + +for arg in "$@"; do + if [[ "$arg" == "--replace-vars" ]]; then + REPLACE_VARS=1 + fi + if [[ "$arg" == "--verbose" ]]; then + VERBOSE=1 + fi +done + +if [[ ! -f "$SOURCE_FILE" ]]; then + echo "Source file '$SOURCE_FILE' not found!" + exit 1 +fi + +TMP_FILE="${SOURCE_FILE}.tmp.$$" + +# Regex explanation: +# \b : Word boundary, ensures 'static' and 'inline' are matched as whole words. +# static : Matches the literal word 'static'. +# [[:space:]]+ : Matches one or more whitespace characters (spaces, tabs, etc.) between 'static' and 'inline'. +# inline : Matches the literal word 'inline'. +# \b : Word boundary after 'inline'. +echo "[INFO] Replacing 'static inline' with 'inline' in $SOURCE_FILE" +sed -E 's/\bstatic[[:space:]]+inline\b/inline/g' "$SOURCE_FILE" > "$TMP_FILE" + +# Regex explanation: +# ^ : Start of the line. +# ([[:space:]]*) : Captures any leading whitespace at the start of the line (indentation). +# (inline[[:space:]]+|__device__[[:space:]]+|__forceinline__[[:space:]]+|__host__[[:space:]]+|__global__[[:space:]]+|)* : +# Matches zero or more occurrences of common C/C++/CUDA qualifiers (each followed by whitespace). +# ([[:space:]]*(...|)*) : The outer group allows for any combination/order of these qualifiers. +# static[[:space:]]+ : Matches the literal word 'static' followed by one or more spaces/tabs. +# \1 : In the replacement, refers to the leading whitespace and any qualifiers (without 'static'). +# +# Removes 'static' after any qualifiers before the function name +echo "[INFO] Replacing 'static' in function qualifiers in $SOURCE_FILE" +sed -E -i 's/^([[:space:]]*(inline[[:space:]]+|__device__[[:space:]]+|__forceinline__[[:space:]]+|__host__[[:space:]]+|__global__[[:space:]]+|)*)static[[:space:]]+/\1/g' "$TMP_FILE" + + +# Regex explanation: +# ^ : Start of the line. +# ([[:space:]]*) : Captures any leading whitespace at the start of the line. +# static : Matches the literal word 'static'. +# ([[:space:]]+) : Captures one or more spaces after 'static'. +if [[ "$REPLACE_VARS" == "1" ]]; then + echo "[INFO] Replacing 'static' at variable definitions in $SOURCE_FILE" + # This matches 'static' at the start of a line (possibly with spaces), followed by a type and a variable name + sed -E -i 's/^([[:space:]]*)static([[:space:]]+)/\1/g' "$TMP_FILE" +fi + +if [[ "$VERBOSE" == "1" ]]; then + echo "[INFO] Showing diff for changes:" + diff -u "$SOURCE_FILE" "$TMP_FILE" || true +fi + +mv "$TMP_FILE" "$SOURCE_FILE" +echo "Static function replacement complete for $SOURCE_FILE" +if [[ "$REPLACE_VARS" == "1" ]]; then + echo "Static variable replacement also performed." +fi +