ファイル
Atul Kulkarni 142860442a Enable MPI support to execute MPI specific unit/functional tests (#1996)
* Added MPI support to execute unit/functional tests

Update node and process validation
Updated node detection count and modified validation method
Update validation logic to include max procs and nodes

* Address review comments

* Fix warnings

* Added a new NET transport test and clean up

* Added MPI test logging mechanism

* Decoupled GTest framework

* Added Net IB functional tests

* Updated with resource guards

* Added NET IB tests and refactored code

* Update P2pWorkflow test

* Update documentation

* Add MPI_TESTS_ENABLED guard to the file

* Fix Shm and NetIB tests

* Applied refactoring and cleanup

* Replaced BufferGuard with AutoGuard

* Modified test debug logging

* Use macro to reduce NcclTypeTraits code duplication

- Replace repetitive template specializations with a single
  DEFINE_NCCL_TYPE_TRAIT macro
- Use stringification operator (#) to auto-generate type name strings
- Add #undef to keep macro from polluting namespace
- Makes adding new type mappings trivial

* Unify buffer initialization with generic pattern function

- Remove initializeBufferWithCustomPattern
- Make initializeBufferWithPattern generic with PatternFunc template param
- Now single function handles all patterns via lambda injection
- Updated all test files to use lambdas for pattern generation
- Pattern logic now visible at call site (self-documenting)

* Unify buffer verification with pluggable pattern function

- Remove verifyBufferWithCustomCheck
- Make verifyBufferData generic with PatternFunc template param
- Single function handles all verification patterns via lambda injection
- Updated all test files to use lambdas
- Better defaults: num_samples=0 means verify all elements
- Pattern logic now visible at call site (self-documenting)

* Docs: Add DeviceBufferHelpers section to MPITestRunner.md

- Document new refactored buffer initialization/verification API
- Explain pluggable pattern functions with lambda examples
- Show type mapping and automatic float/int comparison
- Include migration guide from old API to new unified functions
- Demonstrate best practices with real-world examples
- Reference recent refactoring commits (macro-based type traits)

* Docs: Update documentation and examples

- Update on DeviceBufferHelpers
- Update examples using DeviceBufferHelpers methods, e.g. data verification

* Address review comment.

- Replace manual pattern generation loop with initializeBufferWithPattern call
- Use downloadBuffer to get host copy instead of manual hipMemcpy

* Remove non-existent dependency

* Remove duplicate testcase

* Code cleanup in test files

* Moved common constants to base class

[ROCm/rccl commit: 29e1567b95]
2025-12-06 16:05:37 -06:00

456 行
12 KiB
C++

/*************************************************************************
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#pragma once
#include "nccl.h"
#include "net.h"
#include "transport.h"
#include <cstdio>
#include <cstdlib>
#include <hip/hip_runtime.h>
#include <utility>
/**
* @file ResourceGuards.hpp
* @brief Comprehensive RAII resource guards for automatic cleanup in tests
*
* Provides all RAII guard types for automatic resource management:
* - ScopeGuard: Generic cleanup for any action (with lambdas)
* - AutoGuard: Typed guards for resources with simple cleanup functions
* - ResourceGuard: Typed guards for resources with stateful deleters
* - Specialized guards: NcclRegHandleGuard, etc.
*
* Guards ensure cleanup even when ASSERT_* fails in tests.
* See MPITestRunner.md for detailed usage documentation.
*/
namespace RCCLTestGuards
{
// ============================================================================
// ScopeGuard - Generic cleanup for arbitrary actions
// ============================================================================
/**
 * @class ScopeGuard
 * @brief Runs a user-supplied callable when the guard goes out of scope
 *
 * The callable is invoked from the destructor unless the guard has been
 * dismissed. Because the destructor is noexcept, a callable that throws
 * terminates the program.
 *
 * Guards are move-only. Moving transfers the pending cleanup to the
 * destination and disarms the source. Move-assignment first runs the
 * destination's own pending cleanup and requires Func to be move-assignable.
 *
 * @par Example:
 * @code
 * void* buffer = nullptr;
 * hipMalloc(&buffer, size);
 * auto guard = makeScopeGuard([&]() { if(buffer) hipFree(buffer); });
 * // cleanup runs when `guard` leaves scope
 * @endcode
 *
 * @tparam Func Callable type invocable with no arguments
 */
template<typename Func>
class ScopeGuard
{
public:
    /// Takes ownership of the callable; the guard starts armed.
    explicit ScopeGuard(Func f) noexcept : action_(std::move(f)), armed_(true) {}

    ScopeGuard(const ScopeGuard&)            = delete;
    ScopeGuard& operator=(const ScopeGuard&) = delete;

    /// Steals the callable and the armed state; the source is left disarmed.
    ScopeGuard(ScopeGuard&& other) noexcept
        : action_(std::move(other.action_)), armed_(std::exchange(other.armed_, false))
    {}

    /// Runs this guard's pending cleanup (if armed), then adopts other's.
    ScopeGuard& operator=(ScopeGuard&& other) noexcept
    {
        if(this == &other)
        {
            return *this;
        }
        fire();
        action_ = std::move(other.action_);
        armed_  = std::exchange(other.armed_, false);
        return *this;
    }

    ~ScopeGuard() noexcept { fire(); }

    /// Cancel the cleanup (e.g. after handing the resource to someone else).
    void dismiss() noexcept { armed_ = false; }

    /// Re-enable a previously dismissed cleanup.
    void restore() noexcept { armed_ = true; }

private:
    /// Invoke the callable only while the guard is armed.
    void fire()
    {
        if(armed_)
        {
            action_();
        }
    }

    Func action_; ///< Callable executed on scope exit
    bool armed_;  ///< False once dismissed or moved-from
};
/**
 * @brief Build a ScopeGuard without spelling out the callable's type
 *
 * @par Example:
 * @code
 * auto guard = makeScopeGuard([&]() { cleanup(); });
 * @endcode
 *
 * @tparam Func Callable type, deduced from the argument
 * @param f Callable to run on scope exit; it is moved into the guard
 * @return An armed ScopeGuard owning @p f
 */
template<typename Func>
ScopeGuard<Func> makeScopeGuard(Func f)
{
    using Guard = ScopeGuard<Func>;
    return Guard{std::move(f)};
}
/**
* @def SCOPE_EXIT
* @brief Convenience macro that declares an anonymous scope guard
*
* Expands to a local variable named scope_guard_<line> (built from __LINE__)
* that holds a ScopeGuard created by RCCLTestGuards::makeScopeGuard. The
* statement passed as @p code is wrapped in a lambda that captures by
* reference ([&]) and runs when the enclosing scope exits.
*
* Notes:
* - The variable name depends only on __LINE__, so two SCOPE_EXIT uses on the
*   same source line produce the same identifier.
* - @p code is a single macro argument; a comma outside parentheses would be
*   parsed as an argument separator.
* - SCOPE_EXIT_CONCAT / SCOPE_EXIT_CONCAT_IMPL are the usual two-level
*   token-pasting helpers that let __LINE__ expand before ## is applied.
*
* @par Example:
* @code
* void* buffer = nullptr;
* hipMalloc(&buffer, size);
* SCOPE_EXIT(if(buffer) hipFree(buffer));
* @endcode
*/
#define SCOPE_EXIT_CONCAT_IMPL(a, b) a##b
#define SCOPE_EXIT_CONCAT(a, b) SCOPE_EXIT_CONCAT_IMPL(a, b)
#define SCOPE_EXIT(code) \
auto SCOPE_EXIT_CONCAT(scope_guard_, __LINE__) = RCCLTestGuards::makeScopeGuard([&]() { code; })
// ============================================================================
// AutoGuard & ResourceGuard - Typed resource management
// ============================================================================
/**
 * @class AutoGuard
 * @brief RAII guard whose cleanup function is a non-type template parameter
 *
 * Uses C++17 `auto` template parameters to bind the cleanup function at
 * compile time, so no deleter object has to be stored. The destructor calls
 * `DeleterFunc(resource)` when the guard has not been dismissed and the
 * handle converts to `true`.
 *
 * The guard is move-only. Moving transfers responsibility for cleanup to the
 * destination and leaves the source dismissed.
 *
 * @tparam T           Resource handle type (must be testable in a boolean context)
 * @tparam DeleterFunc Callable constant invoked as `DeleterFunc(handle)`
 */
template<typename T, auto DeleterFunc>
class AutoGuard
{
private:
    T    resource_;  ///< Handle being guarded
    bool dismissed_; ///< True when cleanup must be skipped

    /// Run the deleter if this guard is still responsible for a live handle.
    void cleanup()
    {
        if(!dismissed_ && resource_)
        {
            DeleterFunc(resource_);
        }
    }

public:
    /// Start guarding @p resource (a default-constructed handle means "empty").
    explicit AutoGuard(T resource = T{}) : resource_(resource), dismissed_(false) {}

    ~AutoGuard()
    {
        cleanup();
    }

    /// Current handle value (not cleared by dismiss()/release()/move).
    T get() const
    {
        return resource_;
    }

    /// Address of the stored handle, for APIs that fill in an out-parameter.
    T* ptr()
    {
        return &resource_;
    }

    /// Overwrite the stored handle. The previous handle is NOT cleaned up and
    /// the dismissed state is left unchanged; use reset() to do both.
    void set(T resource)
    {
        resource_ = resource;
    }

    /**
     * @brief Replace the guarded handle, cleaning up the old one
     *
     * Mirrors ResourceGuard::reset(): if the guard is armed and holds a live
     * handle different from @p resource, the deleter runs on the old handle.
     * The guard is armed again afterwards, so a dismissed/released/moved-from
     * guard can be reused.
     *
     * @param resource New handle to guard (defaults to an empty handle)
     */
    void reset(T resource = T{})
    {
        if(!dismissed_ && resource_ && resource_ != resource)
        {
            DeleterFunc(resource_);
        }
        resource_  = resource;
        dismissed_ = false;
    }

    /// Give up responsibility for cleanup; the handle value is kept.
    void dismiss()
    {
        dismissed_ = true;
    }

    /// Give up responsibility for cleanup and hand the handle to the caller.
    T release()
    {
        dismissed_ = true;
        return resource_;
    }

    AutoGuard(const AutoGuard&)            = delete;
    AutoGuard& operator=(const AutoGuard&) = delete;

    /// Take over other's handle and state; other is left dismissed.
    AutoGuard(AutoGuard&& other) noexcept
        : resource_(other.resource_), dismissed_(std::exchange(other.dismissed_, true))
    {}

    /// Clean up the current handle (if armed), then take over other's.
    AutoGuard& operator=(AutoGuard&& other) noexcept
    {
        if(this != &other)
        {
            cleanup();
            resource_  = other.resource_;
            dismissed_ = std::exchange(other.dismissed_, true);
        }
        return *this;
    }
};
/**
 * @class ResourceGuard
 * @brief RAII guard that stores a deleter object next to the handle
 *
 * Intended for cleanup that needs extra context carried by the deleter
 * (a functor with state). For plain cleanup functions AutoGuard<T, func>
 * is the lighter option.
 *
 * The destructor calls `deleter(handle)` when the guard still owns the
 * handle and the handle converts to `true`. The guard is move-only; a
 * moved-from guard no longer owns anything.
 *
 * @tparam T       Resource handle type
 * @tparam Deleter Functor type invoked as `deleter(handle)`
 */
template<typename T, typename Deleter>
class ResourceGuard
{
public:
    /**
     * @param resource Handle to guard (may be null/zero)
     * @param deleter  Cleanup functor, moved into the guard
     */
    explicit ResourceGuard(T resource = T{}, Deleter deleter = Deleter{})
        : resource_(resource), deleter_(std::move(deleter)), owns_(true)
    {}

    ResourceGuard(const ResourceGuard&)            = delete;
    ResourceGuard& operator=(const ResourceGuard&) = delete;

    /// Take over other's handle, deleter and ownership flag.
    ResourceGuard(ResourceGuard&& other) noexcept
        : resource_(other.resource_)
        , deleter_(std::move(other.deleter_))
        , owns_(std::exchange(other.owns_, false))
    {}

    /// Dispose of the currently owned handle, then take over other's.
    ResourceGuard& operator=(ResourceGuard&& other) noexcept
    {
        if(this == &other)
        {
            return *this;
        }
        disposeIfOwned();
        resource_ = other.resource_;
        deleter_  = std::move(other.deleter_);
        owns_     = std::exchange(other.owns_, false);
        return *this;
    }

    /// Cleans up the handle if it is still owned and non-null.
    ~ResourceGuard()
    {
        disposeIfOwned();
    }

    /// Current handle value.
    T get() const
    {
        return resource_;
    }

    /// Address of the stored handle, for APIs that write an out-parameter.
    T* ptr()
    {
        return &resource_;
    }

    /// Overwrite the handle without cleaning up the previous one.
    void set(T resource)
    {
        resource_ = resource;
    }

    /**
     * @brief Swap in a new handle and (re)take ownership
     *
     * The old handle is passed to the deleter only when it is owned,
     * non-null and different from @p resource.
     *
     * @param resource New handle (may be null/zero)
     */
    void reset(T resource = T{})
    {
        const bool replacingLiveHandle = owns_ && resource_ && resource_ != resource;
        if(replacingLiveHandle)
        {
            deleter_(resource_);
        }
        owns_     = true;
        resource_ = resource;
    }

    /// Drop ownership and return the handle; the caller must clean it up.
    T release()
    {
        owns_ = false;
        return resource_;
    }

private:
    /// Invoke the deleter when this guard owns a non-null handle.
    void disposeIfOwned()
    {
        if(owns_ && resource_)
        {
            deleter_(resource_);
        }
    }

    T       resource_; ///< Guarded handle
    Deleter deleter_;  ///< Cleanup functor
    bool    owns_;     ///< False after release() or after being moved from
};
// Note: Simple stateless deleters are replaced by wrapper functions + AutoGuard.
// Only stateful deleters that need additional context are kept here.
// Common deleters (NCCL-specific, used across many tests)
struct NcclRegHandleDeleter
{
ncclComm_t comm;
explicit NcclRegHandleDeleter(ncclComm_t c = nullptr) : comm(c) {}
void operator()(void* reg_handle) const
{
if(reg_handle && comm)
{
ncclCommDeregister(comm, reg_handle);
}
}
};
// Wrapper functions for AutoGuard (void-returning cleanup functions)
inline void hipFreeWrapper(void* ptr)
{
if(ptr)
{
hipError_t err = hipFree(ptr);
if(err != hipSuccess)
{
fprintf(stderr,
"WARNING: hipFree failed in destructor: %s (ptr=%p)\n",
hipGetErrorString(err),
ptr);
}
}
}
/// Void-returning adapter around hipStreamDestroy for use as an AutoGuard deleter.
/// Null streams are ignored; a failing destroy is reported on stderr.
inline void hipStreamDestroyWrapper(hipStream_t stream)
{
    if(!stream)
    {
        return;
    }

    const hipError_t status = hipStreamDestroy(stream);
    if(status == hipSuccess)
    {
        return;
    }

    std::fprintf(stderr,
                 "WARNING: hipStreamDestroy failed in destructor: %s (stream=%p)\n",
                 hipGetErrorString(status),
                 static_cast<void*>(stream));
}
/// Void-returning adapter around hipEventDestroy for use as an AutoGuard deleter.
/// Null events are ignored; a failing destroy is reported on stderr.
inline void hipEventDestroyWrapper(hipEvent_t event)
{
    if(!event)
    {
        return;
    }

    const hipError_t status = hipEventDestroy(event);
    if(status == hipSuccess)
    {
        return;
    }

    std::fprintf(stderr,
                 "WARNING: hipEventDestroy failed in destructor: %s (event=%p)\n",
                 hipGetErrorString(status),
                 static_cast<void*>(event));
}
/// Void-returning adapter around ncclCommDestroy for use as an AutoGuard deleter.
/// Null communicators are ignored; a failing destroy is reported on stderr.
inline void ncclCommDestroyWrapper(ncclComm_t comm)
{
    if(!comm)
    {
        return;
    }

    const ncclResult_t status = ncclCommDestroy(comm);
    if(status == ncclSuccess)
    {
        return;
    }

    std::fprintf(stderr,
                 "WARNING: ncclCommDestroy failed in destructor: %s (comm=%p)\n",
                 ncclGetErrorString(status),
                 static_cast<void*>(comm));
}
/// Adapter around free() for use as an AutoGuard deleter; null is a no-op.
inline void freeWrapper(void* ptr)
{
    if(ptr == nullptr)
    {
        return;
    }
    std::free(ptr);
}
// Type aliases for AutoGuard-based guards.
// Each alias binds a handle type to the wrapper function that cleans it up.
// Guards a void* and releases it with freeWrapper (which calls free()).
using HostBufferAutoGuard = AutoGuard<void*, freeWrapper>;
// Guards a void* and releases it with hipFreeWrapper (which calls hipFree()).
using DeviceBufferAutoGuard = AutoGuard<void*, hipFreeWrapper>;
// Guards a hipStream_t and destroys it with hipStreamDestroyWrapper.
using HipStreamAutoGuard = AutoGuard<hipStream_t, hipStreamDestroyWrapper>;
// Guards a hipEvent_t and destroys it with hipEventDestroyWrapper.
using HipEventAutoGuard = AutoGuard<hipEvent_t, hipEventDestroyWrapper>;
// Guards an ncclComm_t and destroys it with ncclCommDestroyWrapper.
using NcclCommAutoGuard = AutoGuard<ncclComm_t, ncclCommDestroyWrapper>;
// Type aliases for ResourceGuard-based guards (common/NCCL-specific).
// Guards a registration handle (void*); NcclRegHandleDeleter carries the
// communicator needed to deregister it.
using NcclRegHandleGuard = ResourceGuard<void*, NcclRegHandleDeleter>;
// Factory methods for ResourceGuard
/// Build a ResourceGuard with both template arguments deduced.
/// @param resource Handle to guard
/// @param deleter  Cleanup functor, moved into the guard
template<typename T, typename Deleter>
inline auto makeGuard(T resource, Deleter deleter) -> ResourceGuard<T, Deleter>
{
    using Guard = ResourceGuard<T, Deleter>;
    return Guard{resource, std::move(deleter)};
}
/// Build a guard that deregisters @p handle from @p comm when it is destroyed.
inline NcclRegHandleGuard makeRegHandleGuard(void* handle, ncclComm_t comm)
{
    NcclRegHandleDeleter deregister(comm);
    return NcclRegHandleGuard(handle, std::move(deregister));
}
/// Build a ResourceGuard from a handle and a caller-supplied deleter
/// (same construction as makeGuard).
/// @param resource Handle to guard
/// @param deleter  Cleanup functor, moved into the guard
template<typename T, typename Deleter>
inline auto makeCustomGuard(T resource, Deleter deleter) -> ResourceGuard<T, Deleter>
{
    using Guard = ResourceGuard<T, Deleter>;
    return Guard{resource, std::move(deleter)};
}
// Factory methods for AutoGuard
/// Build an AutoGuard for @p resource; both template arguments are given
/// explicitly by the caller, e.g. makeAutoGuard<void*, hipFreeWrapper>(p).
template<typename T, auto DeleterFunc>
inline AutoGuard<T, DeleterFunc> makeAutoGuard(T resource)
{
    using Guard = AutoGuard<T, DeleterFunc>;
    return Guard{resource};
}
/// Wrap a host buffer in a guard that releases it through freeWrapper.
inline HostBufferAutoGuard makeHostBufferAutoGuard(void* buffer)
{
    return HostBufferAutoGuard{buffer};
}
/// Wrap a device buffer in a guard that releases it through hipFreeWrapper.
inline DeviceBufferAutoGuard makeDeviceBufferAutoGuard(void* buffer)
{
    return DeviceBufferAutoGuard{buffer};
}
/// Wrap a HIP stream in a guard that destroys it through hipStreamDestroyWrapper.
inline HipStreamAutoGuard makeStreamAutoGuard(hipStream_t stream)
{
    return HipStreamAutoGuard{stream};
}
/// Wrap a HIP event in a guard that destroys it through hipEventDestroyWrapper.
inline HipEventAutoGuard makeEventAutoGuard(hipEvent_t event)
{
    return HipEventAutoGuard{event};
}
/// Wrap an NCCL communicator in a guard that destroys it through ncclCommDestroyWrapper.
inline NcclCommAutoGuard makeCommAutoGuard(ncclComm_t comm)
{
    return NcclCommAutoGuard{comm};
}
} // namespace RCCLTestGuards