Файли
Atul Kulkarni 142860442a Enable MPI support to execute MPI specific unit/functional tests (#1996)
* Added MPI support to execute unit/functional tests

Update node and process validation
Updated node detection count and modified validation method
Update validation logic to include max procs and nodes

* Address review comments

* Fix warnings

* Added a new NET transport test and clean up

* Added MPI test logging mechanism

* Decoupled GTest framework

* Added Net IB functional tests

* Updated with resource guards

* Added NET IB tests and refactored code

* Update P2pWorkflow test

* Update documentation

* Add MPI_TESTS_ENABLED guard to the file

* Fix Shm and NetIB tests

* Applied refactoring and cleanup

* Replaced BufferGuard with AutoGuard

* Modified test debug logging

* Use macro to reduce NcclTypeTraits code duplication

- Replace repetitive template specializations with a single
  DEFINE_NCCL_TYPE_TRAIT macro
- Use stringification operator (#) to auto-generate type name strings
- Add #undef to keep macro from polluting namespace
- Makes adding new type mappings trivial

* Unify buffer initialization with generic pattern function

- Remove initializeBufferWithCustomPattern
- Make initializeBufferWithPattern generic with PatternFunc template param
- Now single function handles all patterns via lambda injection
- Updated all test files to use lambdas for pattern generation
- Pattern logic now visible at call site (self-documenting)

* Unify buffer verification with pluggable pattern function

- Remove verifyBufferWithCustomCheck
- Make verifyBufferData generic with PatternFunc template param
- Single function handles all verification patterns via lambda injection
- Updated all test files to use lambdas
- Better defaults: num_samples=0 means verify all elements
- Pattern logic now visible at call site (self-documenting)

* Docs: Add DeviceBufferHelpers section to MPITestRunner.md

- Document new refactored buffer initialization/verification API
- Explain pluggable pattern functions with lambda examples
- Show type mapping and automatic float/int comparison
- Include migration guide from old API to new unified functions
- Demonstrate best practices with real-world examples
- Reference recent refactoring commits (macro-based type traits)

* Docs: Update documentation and examples

- Update on DeviceBufferHelpers
- Update examples using DeviceBufferHelpers methods, e.g. data verification

* Address review comment.

- Replace manual pattern generation loop with initializeBufferWithPattern call
- Use downloadBuffer to get host copy instead of manual hipMemcpy

* Remove non-existent dependency

* Remove duplicate testcase

* Code cleanup in test files

* Moved common constants to base class

[ROCm/rccl commit: 29e1567b95]
2025-12-06 16:05:37 -06:00

382 рядки
12 KiB
C++

/*************************************************************************
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#pragma once
#include "nccl.h"
#include <cmath>
#include <hip/hip_runtime.h>
#include <type_traits>
#include <vector>
/**
* @file DeviceBufferHelpers.hpp
* @brief Template-based device buffer utilities for RCCL tests
*
* Provides type-safe, reusable functions for device buffer operations:
* - Initialization with test patterns (Host -> Device)
* - Host <-> Device transfers
* - Data verification (Device -> Host)
* - NCCL datatype mapping
*
* NOTE: All functions expect DEVICE memory pointers allocated with hipMalloc().
* For host memory operations, use direct CPU operations instead.
*/
namespace RCCLTestHelpers
{
// ============================================================================
// NCCL Datatype Mapping
// ============================================================================
/**
* @brief Maps C++ types to NCCL data types at compile time
* @tparam T C++ data type
*/
template<typename T>
struct NcclTypeTraits;
/**
* @brief Macro to define NcclTypeTraits specializations
*
* ncclDataType_t mapping and the string name using the stringification
* operator (#) for each supported type.
*
* @param cpp_type The C++ type (e.g., uint64_t, float)
* @param nccl_type The corresponding NCCL type (e.g., ncclUint64, ncclFloat)
*/
#define DEFINE_NCCL_TYPE_TRAIT(cpp_type, nccl_type) \
template<> \
struct NcclTypeTraits<cpp_type> \
{ \
static constexpr ncclDataType_t value = nccl_type; \
static constexpr const char* name = #cpp_type; \
}
// Define all supported type mappings
DEFINE_NCCL_TYPE_TRAIT(float, ncclFloat);
DEFINE_NCCL_TYPE_TRAIT(double, ncclDouble);
DEFINE_NCCL_TYPE_TRAIT(int8_t, ncclInt8);
DEFINE_NCCL_TYPE_TRAIT(uint8_t, ncclUint8);
DEFINE_NCCL_TYPE_TRAIT(int32_t, ncclInt32);
DEFINE_NCCL_TYPE_TRAIT(uint32_t, ncclUint32);
DEFINE_NCCL_TYPE_TRAIT(int64_t, ncclInt64);
DEFINE_NCCL_TYPE_TRAIT(uint64_t, ncclUint64);
// Undefine macro to avoid polluting namespace
#undef DEFINE_NCCL_TYPE_TRAIT
/**
* @brief Helper function to get NCCL datatype for a C++ type
* @tparam T C++ data type
* @return Corresponding ncclDataType_t
*/
template<typename T>
constexpr ncclDataType_t getNcclDataType()
{
return NcclTypeTraits<T>::value;
}
/**
* @brief Helper function to get type name string
* @tparam T C++ data type
* @return Type name as string
*/
template<typename T>
constexpr const char* getTypeName()
{
return NcclTypeTraits<T>::name;
}
// ============================================================================
// Device Buffer Initialization
// ============================================================================
/**
* @brief Initialize device buffer with pattern function
*
* Generic function that allows any pattern generation via lambda or function pointer.
*
* Example usage:
* @code
* // Rank-based pattern: rank * multiplier + index
* initializeBufferWithPattern<float>(buffer, size,
* [rank, multiplier](size_t i) { return rank * multiplier + i; });
*
* // Constant value pattern
* initializeBufferWithPattern<int>(buffer, size,
* [](size_t i) { return 42; });
*
* // Custom pattern
* initializeBufferWithPattern<double>(buffer, size,
* [](size_t i) { return std::sin(i * 0.1); });
* @endcode
*
* @tparam T Element type (float, int, etc.)
* @tparam PatternFunc Callable type (lambda, function pointer, functor)
* @param device_buffer Device memory pointer (from hipMalloc)
* @param num_elements Number of elements
* @param pattern_func Function that generates value for each index: T pattern_func(size_t index)
* @return hipError_t from hipMemcpy, or hipSuccess
*/
template<typename T, typename PatternFunc>
hipError_t initializeBufferWithPattern(void* device_buffer,
size_t num_elements,
PatternFunc pattern_func)
{
if(!device_buffer || num_elements == 0)
{
return hipErrorInvalidValue;
}
std::vector<T> host_data(num_elements);
for(size_t i = 0; i < num_elements; i++)
{
host_data[i] = pattern_func(i);
}
return hipMemcpy(device_buffer,
host_data.data(),
num_elements * sizeof(T),
hipMemcpyHostToDevice);
}
/**
* @brief Zero-initialize device buffer
*
* @tparam T Element type
* @param device_buffer Device memory pointer (from hipMalloc)
* @param num_elements Number of elements
* @return hipError_t from hipMemset
*/
template<typename T>
hipError_t zeroInitializeBuffer(void* device_buffer, size_t num_elements)
{
if(!device_buffer || num_elements == 0)
{
return hipErrorInvalidValue;
}
return hipMemset(device_buffer, 0, num_elements * sizeof(T));
}
// ============================================================================
// Device Buffer Verification
// ============================================================================
/**
* @brief Verify device buffer data with pattern function
*
* Generic function that allows any verification pattern via lambda or function pointer.
* Downloads data from device and verifies elements against expected values.
* Uses appropriate comparison for floating-point vs integer types.
*
* Example usage:
* @code
* // Rank-based pattern verification: rank * multiplier + index
* verifyBufferData<float>(buffer, size,
* [rank, multiplier](size_t i) { return rank * multiplier + i; },
* num_samples, tolerance);
*
* // Constant value verification
* verifyBufferData<int>(buffer, size,
* [](size_t i) { return 42; });
*
* // Custom pattern verification
* verifyBufferData<double>(buffer, size,
* [](size_t i) { return std::sin(i * 0.1); },
* size, 1e-6); // verify all elements with tighter tolerance
* @endcode
*
* @tparam T Element type
* @tparam PatternFunc Callable type (lambda, function pointer, functor)
* @param device_buffer Device memory pointer (from hipMalloc)
* @param num_elements Total number of elements in buffer
* @param pattern_func Function that generates expected value for each index: T pattern_func(size_t index)
* @param num_samples Number of elements to verify (default: all, capped at num_elements)
* @param tolerance Tolerance for floating-point comparison (default: 1e-5, ignored for integer types)
* @param[out] first_error_index If verification fails, set to index of first mismatch
* @param[out] expected_value If verification fails, set to expected value
* @param[out] actual_value If verification fails, set to actual value
* @return true if all samples match, false otherwise
*/
template<typename T, typename PatternFunc>
bool verifyBufferData(const void* device_buffer,
size_t num_elements,
PatternFunc pattern_func,
size_t num_samples = 0, // 0 means verify all
double tolerance = 1e-5,
size_t* first_error_index = nullptr,
T* expected_value = nullptr,
T* actual_value = nullptr)
{
if(!device_buffer || num_elements == 0)
{
return false;
}
// Default to verifying all elements if num_samples is 0
if(num_samples == 0)
{
num_samples = num_elements;
}
else
{
// Cap num_samples at num_elements
num_samples = std::min(num_samples, num_elements);
}
// Download data from device
std::vector<T> host_data(num_elements);
hipError_t err = hipMemcpy(host_data.data(),
device_buffer,
num_elements * sizeof(T),
hipMemcpyDeviceToHost);
if(err != hipSuccess)
{
return false;
}
// Verify samples
for(size_t i = 0; i < num_samples; i++)
{
T expected = pattern_func(i);
T actual = host_data[i];
bool matches = false;
// Use appropriate comparison based on type
if constexpr(std::is_floating_point_v<T>)
{
// Floating-point: use tolerance-based comparison
matches = (std::abs(actual - expected) <= tolerance);
}
else
{
// Integer: exact comparison
matches = (actual == expected);
}
if(!matches)
{
// Record error details
if(first_error_index)
*first_error_index = i;
if(expected_value)
*expected_value = expected;
if(actual_value)
*actual_value = actual;
return false;
}
}
return true;
}
// ============================================================================
// Combined Operations
// ============================================================================
// Forward declaration for downloadBuffer (used in allocateAndInitialize)
template<typename T>
std::pair<hipError_t, std::vector<T>> downloadBuffer(const void* device_buffer, size_t num_elements);
/**
* @brief Allocate, initialize, and return RAII-guarded device buffers
*
* Convenience function that combines allocation and initialization.
* Returns host vector for later verification if needed.
*
* @tparam T Element type
* @param[out] device_buffer Pointer to receive device buffer address
* @param num_elements Number of elements
* @param rank MPI rank for pattern generation
* @param multiplier Pattern multiplier
* @return std::pair<hipError_t, std::vector<T>> - error code and host data copy
*/
template<typename T>
std::pair<hipError_t, std::vector<T>> allocateAndInitialize(void** device_buffer,
size_t num_elements,
int rank,
int multiplier = 1000)
{
if(!device_buffer)
{
return {hipErrorInvalidValue, {}};
}
// Allocate device memory
hipError_t err = hipMalloc(device_buffer, num_elements * sizeof(T));
if(err != hipSuccess)
{
return {err, {}};
}
// Initialize using generic pattern function
err = initializeBufferWithPattern<T>(
*device_buffer, num_elements,
[rank, multiplier](size_t i) { return static_cast<T>(rank * multiplier + i); });
if(err != hipSuccess)
{
return {err, {}};
}
// Download and return host copy for verification
return downloadBuffer<T>(*device_buffer, num_elements);
}
/**
* @brief Copy data from one device buffer to another
*
* @tparam T Element type (used for size calculation)
* @param dst Destination device buffer (from hipMalloc)
* @param src Source device buffer (from hipMalloc)
* @param num_elements Number of elements to copy
* @return hipError_t from hipMemcpy
*/
template<typename T>
hipError_t copyDeviceBuffer(void* dst, const void* src, size_t num_elements)
{
if(!dst || !src || num_elements == 0)
{
return hipErrorInvalidValue;
}
return hipMemcpy(dst, src, num_elements * sizeof(T), hipMemcpyDeviceToDevice);
}
/**
* @brief Download device buffer to host vector
*
* @tparam T Element type
* @param device_buffer Device memory pointer (from hipMalloc)
* @param num_elements Number of elements
* @return std::pair<hipError_t, std::vector<T>> - error code and host data
*/
template<typename T>
std::pair<hipError_t, std::vector<T>> downloadBuffer(const void* device_buffer, size_t num_elements)
{
std::vector<T> host_data(num_elements);
if(!device_buffer || num_elements == 0)
{
return {hipErrorInvalidValue, {}};
}
hipError_t err = hipMemcpy(host_data.data(),
device_buffer,
num_elements * sizeof(T),
hipMemcpyDeviceToHost);
return {err, std::move(host_data)};
}
} // namespace RCCLTestHelpers