Files
Wenkai Du 75a69211a0 Add all_reduce_bias_perf to support All Reduce with Bias (#130)
Use dynamic symbol loading of ncclAllReduceWithBias

Co-authored-by: mberenjk <146776561+mberenjk@users.noreply.github.com>

[ROCm/rccl-tests commit: db6ea5a594]
2025-10-13 16:09:10 -05:00

78 baris
2.9 KiB
C++

/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef _d41d8cd98f00b204e9800998ecf8427e
#define _d41d8cd98f00b204e9800998ecf8427e
#include <cuda_runtime.h>
#include <stdint.h>
/* Routines for launching kernels that verify reduction results. A significant
* feature of these routines is they carefully craft floating point input
* to produce exactly predictable output.
*
* int elt_ty: actually just a ncclDataType_t
*
* int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically
* created, these are encoded as the value ncclNumOps and their scalar is
* assumed to be `ncclVerifiablePremulScalar(rank_me)`
*
* uint64_t seed: arbitrary 64-bits to use in seeding the random values
*
* intptr_t elt_ix0: index of first element pointed to by elts when generating
* random values. This makes it possible to generate subsequences independently
* as well as in aggregate.
*
* int rank_n: Number of contributions into the reduction. Non-reduction
* collectives like broadcast, gather, etc will always set this to one.
*
* int rank_me: Index of this contribution
*/
// Use this as the local scalar for PreMulSum ops
template<typename T>
__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
return T(rank_me%2 == 0 ? 1.0f : 2.0f);
}
// Enqueue kernel to generate data which is to be reduced.
hipError_t ncclVerifiablePrepareInput(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
);
// Enqueue kernel to generate expected results of reduction.
hipError_t ncclVerifiablePrepareExpected(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
);
// Enqueue kernel to verify reduced data matches expectation. The number of
// failed elements is written to bad_elt_n which must be in cudaHost memory.
// If `expected == nullptr` then the expected results are generated on-the-fly
// which can be costly. Thus if you plan to run the same reduction multiple
// times it is advantageous to precompute the expected values with
// ncclVerifiablePrepareExpected and pass them as `expected` here.
hipError_t ncclVerifiableVerify(
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, cudaStream_t stream
);
// Enqueue kernel that applies bias to expected results
void ncclVerifiableApplyBias(
void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
cudaStream_t stream
);
#ifdef NCCL_VERIFIABLE_SELF_TEST
void ncclVerifiableLaunchSelfTest();
#endif
#endif