Code coverage improvements (#1665)

* Increased max stack size to 640

* Added new binary for executing unit tests

Added new unit tests for argcheck.cc and alt_rsmi.cc files

Modified the method to execute unit tests to cover static methods
by using a bash script to convert static to non-static functions
and variables on the fly restricted to debug build type.
This commit is contained in:
Atul Kulkarni
2025-07-17 11:20:49 -05:00
committed by GitHub
orang tua ef6a54ba34
melakukan 275fdd43c1
9 mengubah file dengan 1527 tambahan dan 57 penghapusan
+58 -2
Melihat File
@@ -715,6 +715,57 @@ foreach(SRC_FILE ${SRC_FILES})
endif()
endforeach()
# Adding custom target to hipify all the source files
# This is required to make sure that all the hipified source files are
# available before compiling the unit tests executable(s)
add_custom_target(hipify_all DEPENDS ${HIP_SOURCES})
if (BUILD_TESTS)
if (ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug")
set(HIPIFY_SRC_DIR "${PROJECT_BINARY_DIR}/hipify/src")
set(REPLACE_SCRIPT "${CMAKE_SOURCE_DIR}/tools/scripts/replace_static.sh")
message ("Replacing static functions in ${HIPIFY_SRC_DIR} with ${REPLACE_SCRIPT} for unit tests")
# Create a list of files which needs to be modified to remove static
set(TEST_NONSTATIC_SOURCE_FILES
${HIPIFY_SRC_DIR}/misc/alt_rsmi.cc
)
# Create a custom command to backup the original files and remove static
# Always run replace script on hipified files, but preserve original backups
foreach(srcfile ${TEST_NONSTATIC_SOURCE_FILES})
add_custom_command(
OUTPUT "${srcfile}.staticbak"
COMMAND bash -c "\
${CMAKE_COMMAND} -E echo 'Processing ${srcfile} for static replacement' && \
if [ ! -f '${srcfile}.staticbak' ]; then \
${CMAKE_COMMAND} -E copy '${srcfile}' '${srcfile}.staticbak' && \
${CMAKE_COMMAND} -E echo 'Created backup: ${srcfile}.staticbak'; \
fi && \
${CMAKE_COMMAND} -E echo 'Exposing internal functions/variables in ${srcfile}' && \
${REPLACE_SCRIPT} ${srcfile} --replace-vars --exclude-list=${exclude_list} 2>&1 \
"
DEPENDS "${srcfile}" "${REPLACE_SCRIPT}" hipify_all
COMMENT "Removing static from ${srcfile} (backup preserved at ${srcfile}.staticbak)"
VERBATIM
)
list(APPEND STATIC_BAK_FILES "${srcfile}.staticbak")
endforeach()
add_custom_target(replace_static_in_hipify ALL DEPENDS ${STATIC_BAK_FILES})
add_dependencies(replace_static_in_hipify hipify_all)
else()
# Restore originals in the build directory if backup exists
foreach(srcfile ${HIPIFY_SRC_FILES})
if(EXISTS "${srcfile}.staticbak")
execute_process(
COMMAND ${CMAKE_COMMAND} -E copy "${srcfile}.staticbak" "${srcfile}"
)
execute_process(
COMMAND ${CMAKE_COMMAND} -E remove "${srcfile}.staticbak"
)
endif()
endforeach()
endif()
endif()
# Generate device/host tables and all the collective functions that are going to be in librccl.so
#==================================================================================================
find_package(Python3 COMPONENTS Interpreter REQUIRED)
@@ -769,6 +820,11 @@ add_library(rccl ${HIP_SOURCES})
## Set RCCL dependencies
add_dependencies(rccl git_version_check) # Execute git_version_check during build
if (BUILD_TESTS AND ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug")
## Set static replacement dependency for fixture unit tests
add_dependencies(rccl replace_static_in_hipify)
endif()
## Set RCCL include directories
target_include_directories(rccl PRIVATE ${PROJECT_BINARY_DIR}/include) # for generated rccl.h header
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src) # for hipfied headers
@@ -986,7 +1042,7 @@ if (HAVE_PARALLEL_JOBS)
target_compile_options(rccl PRIVATE -parallel-jobs=12)
endif()
if (ROCM_VERSION VERSION_GREATER_EQUAL "60200")
if (ROCM_VERSION VERSION_GREATER_EQUAL "60200")
target_compile_options(rccl PRIVATE --offload-compress) # Compress GPU code at compile time.
target_link_libraries(rccl PRIVATE --offload-compress) # Compress GPU code at link time.
message(STATUS "--offload-compress enabled - ROCm version >= 6.2.0")
@@ -999,7 +1055,7 @@ target_compile_options(rccl PRIVATE -Werror=sometimes-uninitialized)
target_compile_options(rccl PRIVATE -Wall)
target_compile_options(rccl PRIVATE -Werror=deprecated-copy-with-user-provided-copy)
target_compile_options(rccl PRIVATE -Wno-format-nonliteral)
target_compile_options(rccl PRIVATE -fgpu-rdc)
target_compile_options(rccl PRIVATE -fgpu-rdc) # Generate relocatable device code (required for extern __shared__)
## Set RCCL compile and linker options for unit tests and code coverage
if(ENABLE_CODE_COVERAGE)
+16 -6
Melihat File
@@ -332,15 +332,25 @@ fi
# Optionally, run RCCL-UnitTests, if they're enabled.
if [[ "${run_tests}" == true ]]; then
if [[ -x "./test/rccl-UnitTests" ]]; then
if [[ "${run_tests_all}" == true ]]; then
if [[ ! -x "./test/rccl-UnitTests" ]]; then
echo "RCCL-UnitTests have not been built yet; Please re-run script with \"-t\" to build the binary."
exit 1
fi
if [[ "${build_release}" == false && ! -x "./test/rccl-UnitTestsFixtures" ]]; then
echo "RCCL-UnitTestsFixtures have not been built yet; Please re-run script with \"-t\" to build the binary."
exit 1
fi
if [[ "${run_tests_all}" == true ]]; then
if [[ -x "./test/rccl-UnitTests" ]]; then
./test/rccl-UnitTests
else
./test/rccl-UnitTests --gtest_filter="AllReduce.*"
fi
if [[ "${build_release}" == false && -x "./test/rccl-UnitTestsFixtures" ]]; then
./test/rccl-UnitTestsFixtures
fi
else
echo "RCCL-UnitTests have not been built yet; Please re-run script with \"-t\" to build RCCL-UnitTests."
exit 1
if [[ -x "./test/rccl-UnitTests" ]]; then
./test/rccl-UnitTests --gtest_filter="AllReduce.*"
fi
fi
fi
-3
Melihat File
@@ -201,10 +201,7 @@ int ARSMI_init(void)
ARSMI_orderedNodes.push_back(sort_vecs[j][k]);
}
break;
found = true;
}
if (found)
continue;
}
}
+906
Melihat File
@@ -0,0 +1,906 @@
/*************************************************************************
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "alt_rsmi.h"
#include <cerrno>
#include <cstdio>
#include <dirent.h>
#include <fstream>
#include <gtest/gtest.h>
#include <iostream>
#include <map>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <vector>
struct ARSMI_systemNode {
uint32_t s_node_id = 0;
uint64_t s_gpu_id = 0;
uint64_t s_unique_id = 0;
uint64_t s_location_id = 0;
uint64_t s_bdf = 0;
uint64_t s_domain = 0;
uint8_t s_bus = 0;
uint8_t s_device = 0;
uint8_t s_function = 0;
uint8_t s_partition_id = 0;
std::string s_card;
};
const char *kPathDRMRoot = "/sys/class/drm";
const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes";
uint32_t kAmdGpuId = 0x1002;
// Vector containing data about each node, ordered by bdf ID
thread_local std::vector<ARSMI_systemNode> ARSMI_orderedNodes;
// 2-D matrix with link information between each pair of nodes.
thread_local std::vector<std::vector<ARSMI_linkInfo>> ARSMI_orderedLinks;
// Number of devices recognized
thread_local int ARSMI_num_devices = -1;
int getNodeIndex(uint32_t node_id);
std::string DevicePath(uint32_t dev_id);
int isRegularFile(std::string fname, bool *is_reg);
bool isNumber(const std::string &s);
int openNodeFile(uint32_t dev_id, std::string node_file, std::ifstream *fs);
int countIoLinks(uint32_t dev_id);
int openLinkFile(uint32_t dev_id, uint32_t target_id, std::string node_file,
std::ifstream *fs);
int readGpuId(uint32_t node_id, uint64_t *gpu_id);
bool isNodeSupported(uint32_t node_indx);
int getPropertyValue(std::string property, uint64_t *value,
std::map<std::string, uint64_t> &properties);
bool fileExists(char const *filename);
int ARSMI_readDeviceProperties(uint32_t node_id,
std::map<std::string, uint64_t> &properties);
int ARSMI_readLinkProperties(uint32_t node_id, uint32_t target_node_id,
std::map<std::string, uint64_t> &properties);
// /sys/class/kfd/kfd/topology/nodes/*/properties
int read_node_properties(uint32_t node, std::string property_name,
uint64_t *val,
std::map<std::string, uint64_t> &properties);
// /sys/class/kfd/kfd/topology/nodes/*/io_links/*/properties
int read_link_properties(uint32_t node, uint32_t target,
std::string property_name, uint64_t *val,
std::map<std::string, uint64_t> &properties);
// /sys/class/kfd/kfd/topology/nodes/*/gpu_id
int getGpuId(uint32_t node, uint64_t *gpu_id);
namespace RcclUnitTesting {
class AltRsmiTest : public ::testing::Test {
protected:
// Helper function to create directories recursively
int createDirectory(const std::string &path) {
size_t pos = 0;
std::string currentPath;
// Iterate through each component of the path
while ((pos = path.find('/', pos)) != std::string::npos) {
currentPath = path.substr(0, pos++);
if (!currentPath.empty() && mkdir(currentPath.c_str(), 0700) == -1 &&
errno != EEXIST) {
return -1; // Return error if directory creation fails
}
}
// Create the final directory
if (mkdir(path.c_str(), 0700) == -1 && errno != EEXIST) {
return -1; // Return error if directory creation fails
}
return 0; // Success
}
// Helper function to remove a directory recursively
int removeDirectory(const std::string &path) {
DIR *dir = opendir(path.c_str());
if (!dir) {
std::cerr << "Failed to open directory: " << path << " (errno: " << errno
<< ")" << std::endl;
return -1;
}
struct dirent *entry;
while ((entry = readdir(dir)) != nullptr) {
// Skip "." and ".." entries
if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
continue;
}
std::string fullPath = path + "/" + entry->d_name;
// Check if the entry is a directory
struct stat entryStat;
if (stat(fullPath.c_str(), &entryStat) == -1) {
std::cerr << "Failed to stat: " << fullPath << " (errno: " << errno
<< ")" << std::endl;
closedir(dir);
return -1;
}
if (S_ISDIR(entryStat.st_mode)) {
// Recursively remove subdirectory
if (removeDirectory(fullPath) == -1) {
closedir(dir);
return -1;
}
} else {
// Remove file
if (unlink(fullPath.c_str()) == -1) {
std::cerr << "Failed to remove file: " << fullPath
<< " (errno: " << errno << ")" << std::endl;
closedir(dir);
return -1;
}
}
}
closedir(dir);
// Remove the directory itself
if (rmdir(path.c_str()) == -1) {
std::cerr << "Failed to remove directory: " << path
<< " (errno: " << errno << ")" << std::endl;
return -1;
}
return 0; // Success
}
// Helper function to create a file with content
void createFile(const std::string &path, const std::string &content) {
std::ofstream file(path);
if (!file) {
std::cerr << "Failed to create file: " << path << ", errno: " << errno << std::endl;
return;
}
file << content;
file.close();
}
// Helper function to remove a file
int removeFile(const std::string &path) {
if (unlink(path.c_str()) == -1) {
std::cerr << "Failed to remove file: " << path << " (errno: " << errno
<< ")" << std::endl;
return -1; // Return error if file removal fails
}
return 0; // Success
}
// Function to create the test directory structure and files
void setupTestFiles() {
const std::string basePath = "/tmp/test_kfd/topology/nodes";
createDirectory(basePath);
// Create node 0 with valid data
createDirectory(basePath + "/0");
createFile(basePath + "/0/gpu_id", "4098\n");
createFile(basePath + "/0/properties", "unique_id 16336014475442738425\n"
"location_id 23552\n"
"domain 2\n"
"vendor_id 4098\n");
createDirectory(basePath + "/0/io_links/0");
createFile(basePath + "/0/io_links/0/properties",
"type 5\n"
"version_major 0\n"
"version_minor 0\n"
"node_from 0\n"
"node_to 1\n"
"weight 21\n"
"min_latency 0\n"
"max_latency 0\n"
"min_bandwidth 0\n"
"max_bandwidth 0\n"
"recommended_transfer_size 0\n"
"recommended_sdma_engine_id_mask 0\n"
"flags 0\n");
createDirectory(basePath + "/0/io_links/1");
createFile(basePath + "/0/io_links/1/properties",
"type 2\n"
"version_major 0\n"
"version_minor 0\n"
"node_from 0\n"
"node_to 0\n"
"weight 21\n"
"min_latency 0\n"
"max_latency 0\n"
"min_bandwidth 0\n"
"max_bandwidth 64000\n"
"recommended_transfer_size 0\n"
"recommended_sdma_engine_id_mask 0\n"
"flags 0\n");
createDirectory(basePath + "/1");
createFile(basePath + "/1/properties", "unique_id 16336014475442738426\n"
"location_id 23553\n"
"domain 1\n"
"vendor_id 4098\n");
createDirectory(basePath + "/1/io_links/0");
createFile(basePath + "/1/io_links/0/properties",
"type 5\n"
"version_major 0\n"
"version_minor 0\n"
"node_from 1\n"
"node_to 0\n"
"weight 21\n"
"min_latency 0\n"
"max_latency 0\n"
"min_bandwidth 0\n"
"max_bandwidth 0\n"
"recommended_transfer_size 0\n"
"recommended_sdma_engine_id_mask 0\n"
"flags 0\n");
createDirectory(basePath + "/1/io_links/1");
createFile(basePath + "/1/io_links/1/properties",
"type 2\n"
"version_major 0\n"
"version_minor 0\n"
"node_from 1\n"
"node_to 0\n"
"weight 21\n"
"min_latency 0\n"
"max_latency 0\n"
"min_bandwidth 0\n"
"max_bandwidth 0\n"
"recommended_transfer_size 0\n"
"recommended_sdma_engine_id_mask 0\n"
"flags 0\n");
uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist
createDirectory(basePath + "/" + std::to_string(invalid_dev_id) + "/io_links/");
ARSMI_num_devices = 2;
ARSMI_systemNode node0 = {0, 0, 0, 0, 200, 0, 0, 0, 0, 0, ""};
ARSMI_systemNode node1 = {1, 0, 0, 0, 100, 0, 0, 0, 0, 0, ""};
ARSMI_orderedNodes.clear();
ARSMI_orderedNodes.push_back(node0); // Node 0
ARSMI_orderedNodes.push_back(node1); // Node 1
ARSMI_orderedLinks.clear();
ARSMI_orderedLinks.resize(2);
// Link info from node 0 to node 0 and node 1
ARSMI_orderedLinks[0].push_back({0, 0, 0, ARSMI_IOLINK_TYPE_UNDEFINED, 0, 0, 0}); // self-link
ARSMI_orderedLinks[0].push_back({0, 1, 1, ARSMI_IOLINK_TYPE_PCIEXPRESS, 40, 1000, 2000}); // 0->1
// Link info from node 1 to node 0 and node 1
ARSMI_orderedLinks[1].push_back({1, 0, 1, ARSMI_IOLINK_TYPE_PCIEXPRESS, 40, 1000, 2000}); // 1->0
ARSMI_orderedLinks[1].push_back({1, 1, 0, ARSMI_IOLINK_TYPE_UNDEFINED, 0, 0, 0}); // self-link
}
void SetUp() override {
// Redirect kKFDNodesPathRoot to a temporary directory for testing
kKFDNodesPathRoot = "/tmp/test_kfd/topology/nodes";
// Create the test directory structure and files
setupTestFiles();
}
void TearDown() override {
// Clean up the temporary directory
removeDirectory("/tmp/test_kfd");
}
};
TEST_F(AltRsmiTest, ARSMIInitDefault) {
ARSMI_num_devices = -1; // Force uninitialized state
int result = ARSMI_init();
ASSERT_EQ(result, 0);
}
TEST_F(AltRsmiTest, ARSMIInitMissingIoLinksPropertiesFile) {
ARSMI_num_devices = -1; // Force uninitialized state
// Remove properties file for io_links
removeFile("/tmp/test_kfd/topology/nodes/0/io_links/0/properties");
removeFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties");
int result = ARSMI_init();
ASSERT_EQ(result, 0);
}
TEST_F(AltRsmiTest, ARSMIInitMissingNodeToProperty) {
ARSMI_num_devices = -1; // Force uninitialized state
createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties",
"type 2\n"
"version_major 0\n"
"version_minor 0\n"
"node_from 0\n"
// "node_to 0\n" // Missing node_to
"weight 21\n"
"min_latency 0\n"
"max_latency 0\n"
"min_bandwidth 0\n"
"max_bandwidth 64000\n"
"recommended_transfer_size 0\n"
"recommended_sdma_engine_id_mask 0\n"
"flags 0\n");
int result = ARSMI_init();
ASSERT_EQ(result, 0); // Expect success
}
TEST_F(AltRsmiTest, ARSMIInitMissingWeightProperty) {
ARSMI_num_devices = -1; // Force uninitialized state
createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties",
"type 2\n"
"version_major 0\n"
"version_minor 0\n"
"node_from 0\n"
"node_to 0\n"
// "weight 21\n" // Missing weight
"min_latency 0\n"
"max_latency 0\n"
"min_bandwidth 0\n"
"max_bandwidth 64000\n"
"recommended_transfer_size 0\n"
"recommended_sdma_engine_id_mask 0\n"
"flags 0\n");
int result = ARSMI_init();
ASSERT_NE(result, 0); // Expect non-zero error code
}
TEST_F(AltRsmiTest, ARSMIInitMissingTypeProperty) {
ARSMI_num_devices = -1; // Force uninitialized state
createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties",
// "type 5\n" // Missing type
"version_major 0\n"
"version_minor 0\n"
"node_from 0\n"
"node_to 0\n"
"weight 21\n"
"min_latency 0\n"
"max_latency 0\n"
"min_bandwidth 0\n"
"max_bandwidth 0\n"
"recommended_transfer_size 0\n"
"recommended_sdma_engine_id_mask 0\n"
"flags 0\n");
int result = ARSMI_init();
ASSERT_NE(result, 0); // Expect non-zero error code
}
TEST_F(AltRsmiTest, ARSMIInitTypePCIeProperty) {
ARSMI_num_devices = -1; // Force uninitialized state
int result = ARSMI_init();
ASSERT_EQ(result, 0);
}
TEST_F(AltRsmiTest, ARSMIInitMissingMinBWProperty) {
ARSMI_num_devices = -1; // Force uninitialized state
createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties",
"type 11\n"
"version_major 0\n"
"version_minor 0\n"
"node_from 0\n"
"node_to 0\n"
"weight 21\n"
"min_latency 0\n"
"max_latency 0\n"
// "min_bandwidth 0\n" // Missing min_bandwidth
"max_bandwidth 0\n"
"recommended_transfer_size 0\n"
"recommended_sdma_engine_id_mask 0\n"
"flags 0\n");
int result = ARSMI_init();
ASSERT_NE(result, 0); // Expect non-zero error code
}
TEST_F(AltRsmiTest, ARSMIInitMissingMaxBWProperty) {
ARSMI_num_devices = -1; // Force uninitialized state
createFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties",
"type 5\n"
"version_major 0\n"
"version_minor 0\n"
"node_from 0\n"
"node_to 0\n"
"weight 21\n"
"min_latency 0\n"
"max_latency 0\n"
"min_bandwidth 0\n"
// "max_bandwidth 0\n" // Missing max_bandwidth
"recommended_transfer_size 0\n"
"recommended_sdma_engine_id_mask 0\n"
"flags 0\n");
int result = ARSMI_init();
ASSERT_NE(result, 0); // Expect non-zero error code
}
TEST_F(AltRsmiTest, ARSMIGetNumDevicesUninitialized) {
ARSMI_num_devices = -1; // Force uninitialized state
uint32_t num_devices = 0;
int result = ARSMI_get_num_devices(&num_devices);
// Verify that the function initializes successfully
ASSERT_EQ(result, 0);
// Verify that the number of devices is correctly set
ASSERT_EQ(num_devices, ARSMI_num_devices);
}
TEST_F(AltRsmiTest, ARSMIDevPciIdGetNullBdfId) {
uint32_t device_index = 0;
int result = ARSMI_dev_pci_id_get(device_index, nullptr);
ASSERT_EQ(result, EINVAL);
}
TEST_F(AltRsmiTest, ARSMIDevPciIdGetUninitialized) {
ARSMI_num_devices = -1; // Force uninitialized state
uint32_t device_index = 0;
uint64_t bdfid = 0;
// Fail to initialize the function
// kKFDNodesPathRoot = "/invalid/path/to/file";
kKFDNodesPathRoot = "/invalid/path/to/file";
int result = ARSMI_dev_pci_id_get(device_index, &bdfid);
// Verify that the function fails to initialize and returns an error
ASSERT_NE(result, 0);
}
TEST_F(AltRsmiTest, GetNodeIndexInvalidNode) {
uint32_t invalid_node_id = 9999; // Node ID that doesn't exist
int result = getNodeIndex(invalid_node_id);
ASSERT_EQ(result, -1); // Expect -1 for invalid node
}
TEST_F(AltRsmiTest, DevicePathInvalidDeviceId) {
uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist
std::string path = DevicePath(invalid_dev_id);
ASSERT_FALSE(path.empty()); // Path should still be constructed, but it won't
// point to a valid device
}
TEST_F(AltRsmiTest, IsRegularFileInvalidPath) {
std::string invalid_path = "/invalid/path/to/file";
bool is_reg = false;
int result = isRegularFile(invalid_path, &is_reg);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_FALSE(is_reg); // Expect is_reg to be false
}
TEST_F(AltRsmiTest, IsNumberInvalidInput) {
ASSERT_FALSE(isNumber("abc123")); // Non-numeric string
ASSERT_FALSE(isNumber("")); // Empty string
ASSERT_FALSE(isNumber(" ")); // Whitespace string
}
TEST_F(AltRsmiTest, OpenNodeFileInvalidPath) {
std::ifstream fs;
int result = openNodeFile(9999, "invalid_file", &fs);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_FALSE(fs.is_open()); // File stream should not be open
}
TEST_F(AltRsmiTest, OpenNodeFileNotRegularFile) {
removeFile("/tmp/test_kfd/topology/nodes/0/properties");
// Create a directory instead of a regular file
createDirectory("/tmp/test_kfd/topology/nodes/0/properties");
std::ifstream fs;
int result = openNodeFile(0, "properties", &fs);
// Verify that the function returns ENOENT
ASSERT_EQ(result, ENOENT);
}
TEST_F(AltRsmiTest, OpenNodeFileInvalidNodeFile) {
uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist
std::ifstream fs;
int result =
openNodeFile(invalid_dev_id, "invalid_file", &fs);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_FALSE(fs.is_open()); // File stream should not be open
}
TEST_F(AltRsmiTest, CountIoLinksInvalidDeviceId) {
uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist
int result = countIoLinks(invalid_dev_id);
ASSERT_EQ(result, 0); // Expect 0 links for an invalid device
}
TEST_F(AltRsmiTest, OpenLinkFileInvalidLinkFile) {
uint32_t invalid_dev_id = 9999; // Device ID that doesn't exist
uint32_t invalid_target_id = 9999; // Target ID that doesn't exist
std::ifstream fs;
int result = openLinkFile(invalid_dev_id, invalid_target_id,
"invalid_file", &fs);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_FALSE(fs.is_open()); // File stream should not be open
}
TEST_F(AltRsmiTest, OpenLinkFileInvalidPath) {
std::ifstream fs;
int result = openLinkFile(9999, 9999, "invalid_file", &fs);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_FALSE(fs.is_open()); // File stream should not be open
}
TEST_F(AltRsmiTest, OpenLinkFileNotRegularFile) {
removeFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties");
// Create a directory instead of a regular file
createDirectory("/tmp/test_kfd/topology/nodes/0/io_links/1/properties");
std::ifstream fs;
int result = openLinkFile(0, 1, "properties", &fs);
// Verify that the function returns ENOENT
ASSERT_EQ(result, ENOENT);
}
TEST_F(AltRsmiTest, GetGpuIdInvalidNode) {
uint64_t gpu_id = 0;
int result = getGpuId(9999, &gpu_id);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_EQ(gpu_id, 0); // GPU ID should not be modified
}
TEST_F(AltRsmiTest, GetGpuIdInvalidId) {
uint64_t *gpu_id = nullptr;
int result = getGpuId(9999, gpu_id);
ASSERT_NE(result, 0); // Expect non-zero error code
}
TEST_F(AltRsmiTest, ReadGpuIdInvalidNode) {
uint32_t invalid_node_id = 9999; // Node ID that doesn't exist
uint64_t gpu_id = 0;
int result = readGpuId(invalid_node_id, &gpu_id);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_EQ(gpu_id, 0); // GPU ID should not be modified
}
TEST_F(AltRsmiTest, ReadGpuIdInvalidData) {
// Create the directory structure
removeDirectory("/tmp/test_kfd");
createDirectory("/tmp/test_kfd/topology/nodes/0");
// Create a gpu_id file with invalid (non-numeric) data
std::ofstream gpu_id_file("/tmp/test_kfd/topology/nodes/0/gpu_id");
gpu_id_file << "invalid_gpu_id"; // Non-numeric data
gpu_id_file.close();
uint64_t gpu_id = 0;
// Call the readGpuId function
int result = readGpuId(0, &gpu_id);
// Verify that the function returns ENXIO
ASSERT_EQ(result, ENXIO);
}
TEST_F(AltRsmiTest, IsNodeSupportedInvalidNode) {
uint32_t invalid_node_id = 9999; // Node ID that doesn't exist
bool result = isNodeSupported(invalid_node_id);
ASSERT_FALSE(result); // Expect false for unsupported node
}
TEST_F(AltRsmiTest, IsNodeSupportedEmptyFile) {
// Create an empty properties file
std::ofstream properties_file("/tmp/test_kfd/topology/nodes/0/properties");
properties_file.close();
// Call the isNodeSupported function
bool result = isNodeSupported(0);
// Verify that the function returns false for an empty file
ASSERT_FALSE(result);
}
TEST_F(AltRsmiTest, GetPropertyValueInvalidProperty) {
std::map<std::string, uint64_t> properties = {{"valid_property", 12345}};
uint64_t value = 0;
int result =
getPropertyValue("invalid_property", &value, properties);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_EQ(value, 0); // Value should not be modified
}
TEST_F(AltRsmiTest, GetPropertyValueNullValuePointer) {
std::map<std::string, uint64_t> properties = {{"key1", 12345}};
uint64_t *value = nullptr;
// Call the function with a null value pointer
int result = getPropertyValue("key1", value, properties);
// Verify that the function returns EINVAL
ASSERT_EQ(result, EINVAL);
}
TEST_F(AltRsmiTest, GetPropertyValueEmptyPropertiesMap) {
std::map<std::string, uint64_t> properties; // Empty map
uint64_t value = 0;
// Call the function with an empty properties map
int result = getPropertyValue("key1", &value, properties);
// Verify that the function returns EINVAL
ASSERT_EQ(result, EINVAL);
}
TEST_F(AltRsmiTest, GetPropertyValueKeyNotFound) {
std::map<std::string, uint64_t> properties = {{"key1", 12345}};
uint64_t value = 0;
// Call the function with a key that does not exist in the map
int result = getPropertyValue("key2", &value, properties);
// Verify that the function returns EINVAL
ASSERT_EQ(result, EINVAL);
}
TEST_F(AltRsmiTest, FileExistsInvalidPath) {
const char *invalid_path = "/invalid/path/to/file";
bool result = fileExists(invalid_path);
ASSERT_FALSE(result); // Expect false for non-existent file
}
TEST_F(AltRsmiTest, ARSMIReadDevicePropertiesInvalidNode) {
uint32_t invalid_node_id = 9999; // Node ID that doesn't exist
std::map<std::string, uint64_t> properties;
int result =
ARSMI_readDeviceProperties(invalid_node_id, properties);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_TRUE(properties.empty()); // Properties map should remain empty
}
TEST_F(AltRsmiTest, ARSMI_readDevicePropertiesNotRegularFile) {
// Clean up
removeFile("/tmp/test_kfd/topology/nodes/0/properties");
// Create a directory instead of a regular file
createDirectory("/tmp/test_kfd/topology/nodes/0/properties");
std::map<std::string, uint64_t> properties = {{"unique_id", 12345},
{"location_id", 67890}};
std::ifstream fs;
int result = ARSMI_readDeviceProperties(0, properties);
// Verify that the function returns ENOENT
ASSERT_EQ(result, ENOENT);
}
TEST_F(AltRsmiTest, ARSMI_readDevicePropertiesEmptyFile) {
createFile("/tmp/test_kfd/topology/nodes/0/properties", "");
std::map<std::string, uint64_t> properties;
std::ifstream fs;
int result = ARSMI_readDeviceProperties(0, properties);
// Verify that the function handles empty lines correctly
ASSERT_EQ(result, ENOENT);
}
TEST_F(AltRsmiTest, ARSMI_readDevicePropertiesTrailingEmptyLines) {
createFile("/tmp/test_kfd/topology/nodes/0/properties", "key1 101\n"
"key2 102\n"
" \n"
"\n");
std::map<std::string, uint64_t> properties;
std::ifstream fs;
int result = ARSMI_readDeviceProperties(0, properties);
// Verify that the function handles empty lines correctly
ASSERT_EQ(result, 0);
}
TEST_F(AltRsmiTest, ARSMIReadLinkPropertiesInvalidLink) {
uint32_t invalid_node_id = 9999; // Node ID that doesn't exist
uint32_t invalid_target_id = 9999; // Target ID that doesn't exist
std::map<std::string, uint64_t> properties;
int result = ARSMI_readLinkProperties(
invalid_node_id, invalid_target_id, properties);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_TRUE(properties.empty()); // Properties map should remain empty
}
TEST_F(AltRsmiTest, ARSMI_readLinkPropertiesNotRegularFile) {
// Clean up
removeFile("/tmp/test_kfd/topology/nodes/0/io_links/1/properties");
// Create a directory instead of a regular file
createDirectory("/tmp/test_kfd/topology/nodes/0/properties");
std::map<std::string, uint64_t> properties = {{"unique_id", 12345},
{"location_id", 67890}};
std::ifstream fs;
int result = ARSMI_readLinkProperties(0, 1, properties);
// Verify that the function returns ENOENT
ASSERT_EQ(result, ENOENT);
// Clean up
rmdir("/tmp/test_kfd/0/io_links/1/properties");
}
TEST_F(AltRsmiTest, ARSMI_readLinkPropertiesTrailingEmptyLine) {
createFile("/tmp/test_kfd/topology/nodes/0/io_links/0/properties",
"key1 101\n"
"key2 102\n"
" \n");
std::map<std::string, uint64_t> properties;
std::ifstream fs;
int result = ARSMI_readLinkProperties(0, 0, properties);
// Verify that the function handles empty lines correctly
ASSERT_EQ(result, 0);
}
TEST_F(AltRsmiTest, ARSMI_readLinkPropertiesEmptyFile) {
createFile("/tmp/test_kfd/topology/nodes/0/io_links/0/properties", "");
std::map<std::string, uint64_t> properties;
std::ifstream fs;
int result = ARSMI_readLinkProperties(0, 0, properties);
ASSERT_EQ(result, ENOENT);
}
TEST_F(AltRsmiTest, ReadNodePropertiesInvalidProperty) {
std::map<std::string, uint64_t> properties = {{"unique_id", 12345},
{"location_id", 67890}};
uint64_t value = 0;
// Call the wrapper function with an invalid property name
int result = read_node_properties(0, "invalid_property",
&value, properties);
// Verify that the function fails for an invalid property name
ASSERT_NE(result, 0);
}
TEST_F(AltRsmiTest, ReadNodePropertiesInvalidNode) {
uint32_t invalid_node_id = 9999; // Node ID that doesn't exist
std::map<std::string, uint64_t> properties;
uint64_t value = 0;
int result = read_node_properties(
invalid_node_id, "valid_property", &value, properties);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_EQ(value, 0); // Value should not be modified
}
TEST_F(AltRsmiTest, ReadNodePropertiesInvalidPropertyValue) {
uint32_t invalid_node_id = 9999; // Node ID that doesn't exist
std::map<std::string, uint64_t> properties;
uint64_t *value = nullptr;
int result = read_node_properties(invalid_node_id, "", value,
properties);
ASSERT_EQ(result, EINVAL); // Expect non-zero error code
}
TEST_F(AltRsmiTest, ReadLinkPropertiesInvalidLink) {
uint32_t invalid_node_id = 9999; // Node ID that doesn't exist
uint32_t invalid_target_id = 9999; // Target ID that doesn't exist
std::map<std::string, uint64_t> properties;
uint64_t value = 0;
int result = read_link_properties(
invalid_node_id, invalid_target_id, "valid_property", &value, properties);
ASSERT_NE(result, 0); // Expect non-zero error code
ASSERT_EQ(value, 0); // Value should not be modified
}
TEST_F(AltRsmiTest, ReadLinkPropertiesInvalidPropertyValue) {
uint32_t invalid_node_id = 9999; // Node ID that doesn't exist
uint32_t invalid_target_id = 9999; // Target ID that doesn't exist
std::map<std::string, uint64_t> properties;
uint64_t *value = nullptr;
int result = read_link_properties(
invalid_node_id, invalid_target_id, "", value, properties);
ASSERT_EQ(result, EINVAL); // Expect non-zero error code
}
TEST_F(AltRsmiTest, NullInfoPointer) {
int result = ARSMI_topo_get_link_info(0, 1, nullptr);
ASSERT_EQ(result, EINVAL); // Expect EINVAL for null `info` pointer
}
TEST_F(AltRsmiTest, SourceDeviceIndexOutOfRange) {
ARSMI_linkInfo info;
ARSMI_num_devices =
2; // Simulate initialized state with two devices
int result = ARSMI_topo_get_link_info(999, 1, &info); // Invalid source index
ASSERT_EQ(result, EINVAL); // Expect EINVAL for out-of-range source index
}
TEST_F(AltRsmiTest, DestinationDeviceIndexOutOfRange) {
ARSMI_linkInfo info;
ARSMI_num_devices =
2; // Simulate initialized state with two devices
int result =
ARSMI_topo_get_link_info(0, 999, &info); // Invalid destination index
ASSERT_EQ(result, EINVAL); // Expect EINVAL for out-of-range destination index
}
TEST_F(AltRsmiTest, UninitializedNumDevices) {
kKFDNodesPathRoot =
"/tmp/invalid_path"; // Simulate invalid path
ARSMI_linkInfo info;
ARSMI_num_devices = -1; // Simulate uninitialized state
int result = ARSMI_topo_get_link_info(0, 0, &info);
ASSERT_NE(result, 0); // Expect non-zero error code for uninitialized state
}
TEST_F(AltRsmiTest, InvalidLinkInfo) {
// Initialize ARSMI_orderedLinks with data not in order
ARSMI_orderedLinks = {
{
{1, 0, 0, ARSMI_IOLINK_TYPE_UNDEFINED, 0, 0,
0}, // No link from Device 1 to itself
{1, 0, 1, ARSMI_IOLINK_TYPE_PCIEXPRESS, 40, 1000, 2000}
// Link from Device 1 to Device 0
},
{
{0, 1, 1, ARSMI_IOLINK_TYPE_PCIEXPRESS, 40, 1000,
2000}, // Link from Device 0 to Device 1
{0, 0, 0, ARSMI_IOLINK_TYPE_UNDEFINED, 0, 0, 0}
// No link from Device 0 to itself
}};
// Leave ARSMI_orderedLinks uninitialized
ARSMI_linkInfo info;
int result = ARSMI_topo_get_link_info(0, 1, &info);
ASSERT_EQ(info.hops, 2); // Expect default values for uninitialized link info
ASSERT_EQ(info.type, ARSMI_IOLINK_TYPE_PCIEXPRESS);
ASSERT_EQ(info.weight, 40);
ASSERT_EQ(info.min_bandwidth, 0);
ASSERT_EQ(info.max_bandwidth, 0);
}
} // namespace RcclUnitTesting
+327
Melihat File
@@ -0,0 +1,327 @@
/*************************************************************************
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <gtest/gtest.h>
#include "argcheck.h"
#include "comm.h"
#include <hip/hip_runtime.h>
class ArgCheckTest : public ::testing::Test {
protected:
ncclComm_t comm;
struct ncclInfo *info;
int *sendDevicePtr = nullptr;
int *recvDevicePtr = nullptr;
// Helper function to set up valid ncclInfo for boundary testing
void SetupValidInfo() {
// Set up valid info structure
info->comm = comm;
info->root = 0; // Valid root
info->datatype = (ncclDataType_t)0; // Valid datatype
info->op = (ncclRedOp_t)0; // Valid reduction operation
info->coll = ncclFuncBroadcast; // Valid collective operation
info->sendbuff = nullptr; // Will be set per test if needed
info->recvbuff = nullptr; // Will be set per test if needed
info->count = 10; // Valid count
info->opName = "TestOp"; // Valid operation name
}
// Helper function for tests requiring device memory
void SetupValidBufferWithDeviceMemory() {
// Set the active device to match comm->cudaDev
hipError_t errSetDevice = hipSetDevice(comm->cudaDev);
ASSERT_EQ(errSetDevice, hipSuccess);
// Allocate device memory
hipError_t errSend = hipMalloc(&sendDevicePtr, sizeof(int));
ASSERT_EQ(errSend, hipSuccess);
hipError_t errRecv = hipMalloc(&recvDevicePtr, sizeof(int));
ASSERT_EQ(errRecv, hipSuccess);
// Set device pointers
info->sendbuff = sendDevicePtr;
info->recvbuff = recvDevicePtr;
}
// Helper to clean up device memory
void CleanupDeviceMemory() {
if (sendDevicePtr) {
hipFree(sendDevicePtr);
sendDevicePtr = nullptr;
}
if (recvDevicePtr) {
hipFree(recvDevicePtr);
recvDevicePtr = nullptr;
}
}
void SetUp() override {
// Allocate and zero-initialize ncclComm as a pointer
comm = (struct ncclComm *)calloc(1, sizeof(struct ncclComm));
ASSERT_NE(comm, nullptr) << "Failed to allocate ncclComm";
// Initialize the communicator with required fields
comm->cudaDev = 0;
comm->nRanks = 4;
comm->checkPointers = true;
comm->rank = 0;
comm->startMagic = NCCL_MAGIC;
comm->endMagic = NCCL_MAGIC;
// Verify the magic values were set correctly
ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic not set correctly";
ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic not set correctly";
// Allocate and zero-initialize ncclInfo as a pointer
info = (ncclInfo *)calloc(1, sizeof(ncclInfo));
ASSERT_NE(info, nullptr) << "Failed to allocate ncclInfo";
SetupValidInfo();
SetupValidBufferWithDeviceMemory();
}
void TearDown() override {
// Free the allocated memory
CleanupDeviceMemory();
if (info) {
free(info);
info = nullptr;
}
if (comm) {
free(comm);
comm = nullptr;
}
}
};
TEST_F(ArgCheckTest, CudaPtrCheck_ValidPointer) {
int *devicePtr = nullptr;
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
ASSERT_EQ(err, hipSuccess);
ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp");
EXPECT_EQ(result, ncclSuccess);
hipFree(devicePtr);
}
TEST_F(ArgCheckTest, CudaPtrCheck_NullPointer) {
ncclResult_t result = CudaPtrCheck(nullptr, comm, "invalidPtr", "TestOp");
EXPECT_EQ(result, ncclInvalidArgument);
}
TEST_F(ArgCheckTest, CudaPtrCheck_DifferentDevicePointer) {
int *devicePtr = nullptr;
hipSetDevice(1);
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
ASSERT_EQ(err, hipSuccess);
ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp");
EXPECT_EQ(result, ncclInvalidArgument);
hipFree(devicePtr);
hipSetDevice(comm->cudaDev);
}
TEST_F(ArgCheckTest, CudaPtrCheck_HostMemoryPointer) {
// Test with host memory instead of device memory
int *hostPtr = (int *)malloc(sizeof(int));
ASSERT_NE(hostPtr, nullptr) << "Failed to allocate host memory";
*hostPtr = 42; // Initialize the memory
// This should fail because host memory is not device memory
ncclResult_t result = CudaPtrCheck(hostPtr, comm, "hostPtr", "TestOp");
// Host memory should be rejected by CudaPtrCheck
EXPECT_EQ(result, ncclInvalidArgument)
<< "Host memory should be rejected by CudaPtrCheck";
free(hostPtr);
}
TEST_F(ArgCheckTest, PtrCheck_ValidPointer) {
int value = 42;
ncclResult_t result = PtrCheck(&value, "TestOp", "value");
ASSERT_EQ(result, ncclSuccess);
}
TEST_F(ArgCheckTest, PtrCheck_NullPointer) {
ncclResult_t result = PtrCheck(nullptr, "TestOp", "value");
ASSERT_EQ(result, ncclInvalidArgument);
}
TEST_F(ArgCheckTest, CommCheck_ValidComm) {
comm->startMagic = NCCL_MAGIC;
comm->endMagic = NCCL_MAGIC;
// Verify magic values are still correct (should be set in SetUp())
ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic was corrupted";
ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic was corrupted";
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(comm, "TestOp", "testComm");
EXPECT_EQ(result, ncclSuccess) << "Failed for valid communicator";
}
TEST_F(ArgCheckTest, CommCheck_NullComm) {
ncclResult_t result = CommCheck(nullptr, "TestOp", "comm");
ASSERT_EQ(result, ncclInvalidArgument);
}
TEST_F(ArgCheckTest, CommCheck_CorruptedStartMagic) {
// Corrupt only startMagic, keep endMagic valid
comm->startMagic = 1; // Corrupt startMagic
comm->endMagic = NCCL_MAGIC; // Keep endMagic valid
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(comm, "TestOp", "comm");
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted startMagic";
}
TEST_F(ArgCheckTest, CommCheck_CorruptedEndMagic) {
// Keep startMagic valid, corrupt only endMagic
comm->startMagic = NCCL_MAGIC; // Keep startMagic valid
comm->endMagic = 1; // Corrupt endMagic
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(comm, "TestOp", "comm");
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted endMagic";
}
TEST_F(ArgCheckTest, CommCheck_CorruptedBothMagics) {
// Corrupt both startMagic and endMagic
comm->startMagic = 1; // Corrupt startMagic
comm->endMagic = 1; // Corrupt endMagic
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(comm, "TestOp", "comm");
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for corrupted both magic values";
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_NegativeValue) {
info->root = -1; // Invalid root (< 0)
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root < 0";
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_ExceedsNRanks) {
info->root = comm->nRanks; // Invalid root (>= nRanks)
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root >= nRanks";
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_NegativeValue) {
info->datatype = (ncclDataType_t)-1; // Invalid datatype (< 0)
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype < 0";
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_ExceedsMaxValue) {
info->datatype =
(ncclDataType_t)ncclNumTypes; // Invalid datatype (>= ncclNumTypes)
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid datatype >= ncclNumTypes";
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_NegativeValue) {
info->op = (ncclRedOp_t)-1; // Invalid reduction operation (< 0)
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid reduction operation < 0";
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_ExceedsMaxValue) {
info->op =
(ncclRedOp_t)ncclNumOps; // Invalid reduction operation (>= ncclNumOps)
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid reduction operation >= ncclNumOps";
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidCommunicatorPointers) {
info->op = (ncclRedOp_t)0; // Valid reduction operation
if (info->sendbuff) {
hipFree((void *)info->sendbuff);
info->sendbuff = nullptr; // Invalid send buffer
}
if (info->recvbuff) {
hipFree((void *)info->recvbuff);
info->recvbuff = nullptr; // Invalid receive buffer
}
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid communicator pointers";
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperationOutOfRange) {
info->op = (ncclRedOp_t)5; // Invalid reduction operation (out of range)
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid reduction operation";
}
TEST_F(ArgCheckTest, ArgsCheck_UserDefinedReductionOperationInvalid) {
// Test case: User-defined reduction operation with freeNext != -1
info->op = (ncclRedOp_t)(ncclNumOps +
1); // Set op to a user-defined reduction operation
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for user-defined reduction operation with freeNext != -1";
}
TEST_F(ArgCheckTest, ArgsCheck_SendAndRecvFunction) {
info->recvbuff =
recvDevicePtr; // Use allocated device pointer for receive buffer
// Test both ncclFuncSend and ncclFuncRecv
for (auto coll : {ncclFuncSend, ncclFuncRecv}) {
info->coll = coll; // Set the collective operation
// Call ArgsCheck and verify the result
ncclResult_t result = ArgsCheck(info);
ASSERT_EQ(result, ncclSuccess) << "Failed for coll = " << coll;
}
}
TEST_F(ArgCheckTest, ArgsCheck_CollNotReduce) {
// Case: info->coll != ncclFuncReduce
info->coll = ncclFuncBroadcast; // Set coll to ncclFuncBroadcast
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclSuccess) << "Failed for coll != ncclFuncReduce";
}
TEST_F(ArgCheckTest, ArgsCheck_ReduceCollWithRootRank) {
// Case: info->coll == ncclFuncReduce and info->comm->rank == info->root
info->coll = ncclFuncReduce; // Set coll to ncclFuncReduce
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclSuccess)
<< "Failed for coll == ncclFuncReduce and rank == root";
}
TEST_F(ArgCheckTest, ArgsCheck_ReduceCollWithNonRootRank) {
comm->rank = 1; // Set rank to 1 (non-root)
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclSuccess)
<< "Failed for coll == ncclFuncReduce and rank != root";
}
+78 -45
Melihat File
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.12)
if(BUILD_TESTS)
option(OPENMP_TESTS_ENABLED "Enable OpenMP for unit tests" OFF)
message("Building rccl unit tests (Installed in /test/rccl-UnitTests)")
find_package(hsa-runtime64 PATHS /opt/rocm )
@@ -56,63 +56,96 @@ if(BUILD_TESTS)
common/TestBed.cpp
common/TestBedChild.cpp
common/StandaloneUtils.cpp
../src/misc/recorder.cc
proxy_trace/ProxyTraceUnitTests.cpp
../src/misc/proxy_trace/proxy_trace.cc
)
# Append a file if BUILD_TESTS is ON and build type is not Debug
# Visibility is hidden by default, so we need to explicitly add the recorder.cc file
# to the unit tests to ensure it is included for unit test execution
if(BUILD_TESTS AND NOT CMAKE_BUILD_TYPE MATCHES "Debug")
# Append file if build type is not Debug.
# Visibility is hidden by default, so we need to explicitly add the following file(s)
# to the unit tests to ensure it is included for the existing rccl-UnitTests execution
if(NOT CMAKE_BUILD_TYPE MATCHES "Debug")
list(APPEND TEST_SOURCE_FILES
../src/misc/recorder.cc
../src/misc/proxy_trace/proxy_trace.cc
)
endif()
# Add rccl-UnitTests binary
add_executable(rccl-UnitTests ${TEST_SOURCE_FILES})
## Set rccl-UnitTests include directories
target_include_directories(rccl-UnitTests PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})
target_include_directories(rccl-UnitTests PRIVATE ${PROJECT_BINARY_DIR}/include) # for generated rccl.h header
target_include_directories(rccl-UnitTests PRIVATE ${PROJECT_BINARY_DIR}/hipify/src/include) # for rccl_bfloat16.h
target_include_directories(rccl-UnitTests PRIVATE ${PROJECT_BINARY_DIR}/hipify/src/include/plugin) # for recorder tests
set(RCCL_TEST_EXECUTABLES rccl-UnitTests)
## Set rccl-UnitTests compile definitions
if(LL128_ENABLED)
target_compile_definitions(rccl-UnitTests PRIVATE ENABLE_LL128)
endif()
if(OPENMP_TESTS_ENABLED)
target_compile_definitions(rccl-UnitTests PRIVATE ENABLE_OPENMP)
endif()
target_compile_definitions(rccl-UnitTests PRIVATE ROCM_PATH="${ROCM_PATH}")
# Create rccl-UnitTestsFixtures binary if ROCm version is 4.6.0 or greater
# and build type is Debug
if (ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug")
## Set rccl-UnitTests compile definitions
if(OPENMP_TESTS_ENABLED)
target_compile_options(rccl-UnitTests PRIVATE "${OpenMP_CXX_FLAGS}")
set(TEST_FIXTURE_SOURCE_FILES
AltRsmiTests.cpp
ArgCheckTests.cpp
common/main_fixtures.cpp
common/EnvVars.cpp
)
# Add rccl-UnitTestsFixtures binary
add_executable(rccl-UnitTestsFixtures ${TEST_FIXTURE_SOURCE_FILES})
list(APPEND RCCL_TEST_EXECUTABLES rccl-UnitTestsFixtures)
add_dependencies(rccl-UnitTestsFixtures replace_static_in_hipify)
endif()
## Set rccl-UnitTests linked libraries
target_link_libraries(rccl-UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES})
target_link_libraries(rccl-UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64)
target_link_libraries(rccl-UnitTests PRIVATE Threads::Threads)
target_link_libraries(rccl-UnitTests PRIVATE dl)
target_link_libraries(rccl-UnitTests PRIVATE fmt::fmt-header-only)
if(OPENMP_TESTS_ENABLED)
target_link_libraries(rccl-UnitTests PRIVATE "${OpenMP_CXX_FLAGS}")
endif()
## Set include directories for the target(s)
foreach(target ${RCCL_TEST_EXECUTABLES})
target_include_directories(${target} PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})
target_include_directories(${target} PRIVATE ${PROJECT_BINARY_DIR}/include) # for generated rccl.h header
target_include_directories(${target} PRIVATE ${PROJECT_BINARY_DIR}/hipify/src/include) # for rccl_bfloat16.h
target_include_directories(${target} PRIVATE ${PROJECT_BINARY_DIR}/hipify/src/include/plugin) # for recorder tests
# Get the compile definitions from the main rccl target
# These helps to keep the test compile definitions in sync with the main rccl target
# Also, all the structure layout remains the same across all the targets
get_target_property(RCCL_COMPILE_DEFINITIONS rccl COMPILE_DEFINITIONS)
if(RCCL_COMPILE_DEFINITIONS)
target_compile_definitions(${target} PRIVATE ${RCCL_COMPILE_DEFINITIONS})
endif()
# Also get interface compile definitions
get_target_property(RCCL_INTERFACE_COMPILE_DEFINITIONS rccl INTERFACE_COMPILE_DEFINITIONS)
if(RCCL_INTERFACE_COMPILE_DEFINITIONS)
target_compile_definitions(${target} PRIVATE ${RCCL_INTERFACE_COMPILE_DEFINITIONS})
endif()
## Set compile definitions
if(LL128_ENABLED)
target_compile_definitions(${target} PRIVATE ENABLE_LL128)
endif()
if(OPENMP_TESTS_ENABLED)
target_compile_definitions(${target} PRIVATE ENABLE_OPENMP)
endif()
target_compile_definitions(${target} PRIVATE ROCM_PATH="${ROCM_PATH}")
## Set rccl unittests linked libraries
target_link_libraries(${target} PRIVATE ${GTEST_BOTH_LIBRARIES})
target_link_libraries(${target} PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64)
target_link_libraries(${target} PRIVATE Threads::Threads)
target_link_libraries(${target} PRIVATE dl)
target_link_libraries(${target} PRIVATE fmt::fmt-header-only)
if(OPENMP_TESTS_ENABLED)
target_link_libraries(${target} PRIVATE "${OpenMP_CXX_FLAGS}")
endif()
# Link rccl library
if(BUILD_SHARED_LIBS)
target_link_libraries(${target} PRIVATE rccl)
set_property(TARGET ${target} PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib;${CMAKE_BINARY_DIR}")
else()
add_dependencies(${target} rccl)
target_link_libraries(${target} PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/lib -L${ROCM_PATH}/rocm_smi/lib)
endif()
set_property(TARGET ${target} PROPERTY BUILD_RPATH "${CMAKE_BINARY_DIR};${ROCM_PATH}/lib")
# Install the binary
rocm_install(TARGETS ${target} COMPONENT tests)
endforeach()
# rccl-UnitTests using static library of rccl requires passing rccl
# through -l and -L instead of command line input.
if(BUILD_SHARED_LIBS)
target_link_libraries(rccl-UnitTests PRIVATE rccl)
set_property(TARGET rccl-UnitTests PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib;${CMAKE_BINARY_DIR}")
else()
add_dependencies(rccl-UnitTests rccl)
target_link_libraries(rccl-UnitTests PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/lib -L${ROCM_PATH}/rocm_smi/lib)
endif()
set_property(TARGET rccl-UnitTests PROPERTY BUILD_RPATH "${CMAKE_BINARY_DIR};${ROCM_PATH}/lib")
rocm_install(TARGETS rccl-UnitTests COMPONENT tests)
else()
message("Not building rccl unit tests")
endif()
+1 -1
Melihat File
@@ -25,7 +25,7 @@
} \
} while(0)
#define MAX_STACK_SIZE 570
#define MAX_STACK_SIZE 640
#ifdef ENABLE_LL128
#define MAX_STACK_SIZE_gfx90a 360
+45
Melihat File
@@ -0,0 +1,45 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <gtest/gtest.h>
#include "EnvVars.hpp"
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);
RcclUnitTesting::EnvVars ev;
ev.ShowConfig();
int retCode = RUN_ALL_TESTS();
// Show timing information
if (ev.showTiming)
{
size_t totalTimeMsec = 0;
fflush(stdout);
printf("[ TIMING ] %-20s: %-20s: %10s ms (%s)\n", "TEST SUITE", "TEST NAME", "TIME", "STATUS");
auto unitTest = ::testing::UnitTest::GetInstance();
for (int i = 0; i < unitTest->total_test_suite_count(); i++)
{
auto suiteInfo = unitTest->GetTestSuite(i);
if (!suiteInfo->should_run()) continue;
for (int j = 0; j < suiteInfo->total_test_count(); j++)
{
auto testInfo = suiteInfo->GetTestInfo(j);
if (!testInfo->should_run()) continue;
auto testResult = testInfo->result();
if (testResult->Skipped()) continue;
printf("[ TIMING ] %-20s: %-20s: %10.2f sec (%4s)\n", testInfo->test_suite_name(), testInfo->name(), testResult->elapsed_time() / 1000.0, testResult->Passed() ? "PASS" : "FAIL");
}
printf("[ TIMING ] %-20s: %-20s: %10.2f sec (%4s)\n", suiteInfo->name(), "TOTAL", suiteInfo->elapsed_time() / 1000.0, suiteInfo->Passed() ? "PASS" : "FAIL");
totalTimeMsec += suiteInfo->elapsed_time();
}
printf("[ TIMING ] Total time: %10.2f minutes\n", totalTimeMsec / (60 * 1000.0));
}
return retCode;
}
+96
Melihat File
@@ -0,0 +1,96 @@
#!/bin/bash
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Usage:
# ./replace_static_functions.sh <source_file> [--replace-vars] [--verbose]
#
# - Replaces all 'static' function definitions with non-static.
# - Replaces all 'static inline' with 'inline'.
# - If --replace-vars is given, also replaces 'static' at variable definitions.
# - If --verbose is given, shows a diff of the changes.
set -e
SOURCE_FILE="$1"
REPLACE_VARS=0
VERBOSE=0
for arg in "$@"; do
if [[ "$arg" == "--replace-vars" ]]; then
REPLACE_VARS=1
fi
if [[ "$arg" == "--verbose" ]]; then
VERBOSE=1
fi
done
if [[ ! -f "$SOURCE_FILE" ]]; then
echo "Source file '$SOURCE_FILE' not found!"
exit 1
fi
TMP_FILE="${SOURCE_FILE}.tmp.$$"
# Regex explanation:
# \b : Word boundary, ensures 'static' and 'inline' are matched as whole words.
# static : Matches the literal word 'static'.
# [[:space:]]+ : Matches one or more whitespace characters (spaces, tabs, etc.) between 'static' and 'inline'.
# inline : Matches the literal word 'inline'.
# \b : Word boundary after 'inline'.
echo "[INFO] Replacing 'static inline' with 'inline' in $SOURCE_FILE"
sed -E 's/\bstatic[[:space:]]+inline\b/inline/g' "$SOURCE_FILE" > "$TMP_FILE"
# Regex explanation:
# ^ : Start of the line.
# ([[:space:]]*) : Captures any leading whitespace at the start of the line (indentation).
# (inline[[:space:]]+|__device__[[:space:]]+|__forceinline__[[:space:]]+|__host__[[:space:]]+|__global__[[:space:]]+|)* :
# Matches zero or more occurrences of common C/C++/CUDA qualifiers (each followed by whitespace).
# ([[:space:]]*(...|)*) : The outer group allows for any combination/order of these qualifiers.
# static[[:space:]]+ : Matches the literal word 'static' followed by one or more spaces/tabs.
# \1 : In the replacement, refers to the leading whitespace and any qualifiers (without 'static').
#
# Removes 'static' after any qualifiers before the function name
echo "[INFO] Replacing 'static' in function qualifiers in $SOURCE_FILE"
sed -E -i 's/^([[:space:]]*(inline[[:space:]]+|__device__[[:space:]]+|__forceinline__[[:space:]]+|__host__[[:space:]]+|__global__[[:space:]]+|)*)static[[:space:]]+/\1/g' "$TMP_FILE"
# Regex explanation:
# ^ : Start of the line.
# ([[:space:]]*) : Captures any leading whitespace at the start of the line.
# static : Matches the literal word 'static'.
# ([[:space:]]+) : Captures one or more spaces after 'static'.
if [[ "$REPLACE_VARS" == "1" ]]; then
echo "[INFO] Replacing 'static' at variable definitions in $SOURCE_FILE"
# This matches 'static' at the start of a line (possibly with spaces), followed by a type and a variable name
sed -E -i 's/^([[:space:]]*)static([[:space:]]+)/\1/g' "$TMP_FILE"
fi
if [[ "$VERBOSE" == "1" ]]; then
echo "[INFO] Showing diff for changes:"
diff -u "$SOURCE_FILE" "$TMP_FILE" || true
fi
mv "$TMP_FILE" "$SOURCE_FILE"
echo "Static function replacement complete for $SOURCE_FILE"
if [[ "$REPLACE_VARS" == "1" ]]; then
echo "Static variable replacement also performed."
fi