3e62ceddc5
* Adding experimental clique-based kernels (opt-in only) Co-authored-by: Stanley Tsang <stanley.tsang@amd.com> Co-authored-by: Gilbert Lee <gilbert.lee@amd.com> Co-authored-by: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Co-authored-by: Stanley Tsang <stanley.tsang@amd.com> Co-authored-by: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
472 lines
20 KiB
C++
472 lines
20 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
#ifndef CORRECTNESSTEST_HPP
|
|
#define CORRECTNESSTEST_HPP
|
|
|
|
#include <cstdio>
|
|
#include <tuple>
|
|
#include <vector>
|
|
#include <gtest/gtest.h>
|
|
#include "rccl.h"
|
|
#include "../include/rccl_bfloat16.h"
|
|
|
|
#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess)
|
|
#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess)
|
|
|
|
#define MAX_ENV_TOKENS 16
|
|
|
|
namespace CorrectnessTests
|
|
{
|
|
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollSendRecv } ncclFunc_t;
|
|
typedef enum { ncclInputBuffer, ncclOutputBuffer } ncclBufferType_t;
|
|
|
|
// Performs the various basic reduction operations
|
|
template <typename T>
|
|
T ReduceOp(ncclRedOp_t const op, T const A, T const B)
|
|
{
|
|
switch (op)
|
|
{
|
|
case ncclSum: return A + B;
|
|
case ncclProd: return A * B;
|
|
case ncclMax: return std::max(A, B);
|
|
case ncclMin: return std::min(A, B);
|
|
default:
|
|
fprintf(stderr, "[ERROR] Unsupported reduction operator (%d)\n", op);
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
// Returns the number of bytes per element for each supported datatype
|
|
static int DataTypeToBytes(ncclDataType_t const dataType)
|
|
{
|
|
switch (dataType)
|
|
{
|
|
case ncclInt8: return 1;
|
|
case ncclUint8: return 1;
|
|
case ncclInt32: return 4;
|
|
case ncclUint32: return 4;
|
|
case ncclInt64: return 8;
|
|
case ncclUint64: return 8;
|
|
case ncclFloat16: return 2;
|
|
case ncclFloat32: return 4;
|
|
case ncclFloat64: return 8;
|
|
case ncclBfloat16: return 2;
|
|
default:
|
|
fprintf(stderr, "[ERROR] Unsupported datatype (%d)\n", dataType);
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
// Encapsulates all the memory used per devices for collectives, as well as reference results
|
|
struct Dataset
|
|
{
|
|
int numDevices; // Number of devices participating
|
|
size_t numElements; // Number of elements per array
|
|
ncclDataType_t dataType; // Data type of each input/output pointer
|
|
bool inPlace; // Whether or not output pointers are same as input pointers
|
|
ncclFunc_t function; // Buffer sizes are different in case of gather, scatter and all to all
|
|
|
|
std::vector<void *> inputs; // Input pointers (1 per device)
|
|
std::vector<void *> outputs; // Output pointers (1 per device)
|
|
// May be identical to input pointers for in-place tests
|
|
std::vector<void *> expected; // Expected output (1 per device)
|
|
|
|
size_t NumBytes() const
|
|
{
|
|
return numElements * DataTypeToBytes(dataType);
|
|
}
|
|
|
|
size_t NumBytes(ncclBufferType_t bufferType) const
|
|
{
|
|
if ((function == ncclCollGather && (bufferType == ncclOutputBuffer || inPlace == true)) ||
|
|
(function == ncclCollScatter && bufferType == ncclInputBuffer) ||
|
|
function == ncclCollAllToAll)
|
|
return numElements * DataTypeToBytes(dataType) * numDevices;
|
|
return numElements * DataTypeToBytes(dataType);
|
|
}
|
|
|
|
void Initialize(int const numDevices_,
|
|
size_t const numElements_,
|
|
ncclDataType_t const dataType_,
|
|
bool const inPlace_,
|
|
ncclFunc_t const func_ = ncclCollBroadcast)
|
|
{
|
|
numDevices = numDevices_;
|
|
numElements = numElements_;
|
|
dataType = dataType_;
|
|
inPlace = inPlace_;
|
|
function = func_;
|
|
|
|
inputs.resize(numDevices);
|
|
outputs.resize(numDevices);
|
|
expected.resize(numDevices);
|
|
|
|
// Allocate per-device memory
|
|
for (int i = 0; i < numDevices; i++)
|
|
{
|
|
HIP_CALL(hipSetDevice(i));
|
|
HIP_CALL(hipMalloc((void **)&inputs[i], NumBytes(ncclInputBuffer)));
|
|
if (inPlace)
|
|
outputs[i] = inputs[i];
|
|
else
|
|
HIP_CALL(hipMalloc((void **)&outputs[i], NumBytes(ncclOutputBuffer)));
|
|
|
|
expected[i] = malloc(NumBytes(ncclOutputBuffer));
|
|
}
|
|
}
|
|
|
|
// Explicit memory release to avoid double-free from subDatasets
|
|
void Release()
|
|
{
|
|
for (int i = 0; i < numDevices; i++)
|
|
{
|
|
if (!inPlace) hipFree(outputs[i]);
|
|
hipFree(inputs[i]);
|
|
free(expected[i]);
|
|
}
|
|
|
|
outputs.clear();
|
|
}
|
|
|
|
// Creates a dataset by pointing to an existing dataset
|
|
// Primarily to allow for testing with different starting byte-alignments
|
|
void ExtractSubDataset(size_t const startElement,
|
|
size_t const lastElement,
|
|
Dataset& subDataset)
|
|
{
|
|
ASSERT_LE(startElement, lastElement);
|
|
ASSERT_LT(lastElement, numElements);
|
|
|
|
subDataset.numDevices = numDevices;
|
|
subDataset.numElements = lastElement - startElement + 1;
|
|
subDataset.dataType = dataType;
|
|
subDataset.inPlace = inPlace;
|
|
|
|
subDataset.inputs.resize(numDevices);
|
|
subDataset.outputs.resize(numDevices);
|
|
subDataset.expected.resize(numDevices);
|
|
|
|
size_t const byteOffset = (startElement * DataTypeToBytes(dataType));
|
|
for (int i = 0; i < numDevices; i++)
|
|
{
|
|
subDataset.inputs[i] = (int8_t *)inputs[i] + byteOffset;
|
|
subDataset.outputs[i] = (int8_t *)outputs[i] + byteOffset;
|
|
subDataset.expected[i] = (int8_t *)expected[i] + byteOffset;
|
|
}
|
|
}
|
|
};
|
|
|
|
typedef std::tuple<ncclRedOp_t /* op */,
|
|
ncclDataType_t /* dataType */,
|
|
size_t /* numElements */,
|
|
int /* numDevices */,
|
|
bool /* inPlace */,
|
|
const char* /* envVals */> TestTuple;
|
|
|
|
// Base class for each collective test
|
|
// - Each test is instantiated with a different TestTuple
|
|
class CorrectnessTest : public testing::TestWithParam<TestTuple>
|
|
{
|
|
public:
|
|
struct PrintToStringParamName
|
|
{
|
|
std::string operator()(const testing::TestParamInfo<CorrectnessTest::ParamType>& info)
|
|
{
|
|
std::string name;
|
|
|
|
name += opStrings[std::get<0>(info.param)] + "_";
|
|
name += dataTypeStrings[std::get<1>(info.param)] + "_";
|
|
name += std::to_string(std::get<2>(info.param)) + "elements_";
|
|
name += std::to_string(std::get<3>(info.param)) + "devices_";
|
|
name += std::get<4>(info.param) == true ? "inplace_" : "outofplace_";
|
|
std::string envVars = std::string(std::get<5>(info.param));
|
|
std::replace(envVars.begin(), envVars.end(), '=', '_');
|
|
name += envVars;
|
|
|
|
return name;
|
|
}
|
|
|
|
std::map<ncclRedOp_t, std::string> opStrings
|
|
{
|
|
{ncclSum, "sum"},
|
|
{ncclProd, "prod"},
|
|
{ncclMax, "max"},
|
|
{ncclMin, "min"}
|
|
};
|
|
|
|
std::map<ncclDataType_t, std::string> dataTypeStrings
|
|
{
|
|
{ncclInt8, "int8"},
|
|
{ncclChar, "char"},
|
|
{ncclUint8, "uint8"},
|
|
{ncclInt32, "int32"},
|
|
{ncclInt, "int"},
|
|
{ncclUint32, "uint32"},
|
|
{ncclInt64, "int64"},
|
|
{ncclUint64, "uint64"},
|
|
{ncclFloat16, "float16"},
|
|
{ncclHalf, "half"},
|
|
{ncclFloat32, "float32"},
|
|
{ncclFloat64, "float64"},
|
|
{ncclDouble, "double"},
|
|
{ncclBfloat16, "bfloat16"}
|
|
};
|
|
};
|
|
protected:
|
|
|
|
// This code is called per test-tuple
|
|
void SetUp() override
|
|
{
|
|
// Make the test tuple parameters accessible
|
|
std::tie(op, dataType, numElements, numDevices, inPlace, envVals) = GetParam();
|
|
|
|
envString = 0;
|
|
numTokens = 0;
|
|
if (strcmp(envVals, "")) {
|
|
// enable RCCL env vars testing
|
|
setenv("RCCL_TEST_ENV_VARS", "ENABLE", 1);
|
|
envString = strdup(envVals);
|
|
tokens[numTokens] = strtok(envString, "=, ");
|
|
numTokens++;
|
|
while (tokens[numTokens-1] != NULL && numTokens < MAX_ENV_TOKENS)
|
|
tokens[numTokens++] = strtok(NULL, "=, ");
|
|
for (int i = 0; i < numTokens/2; i++) {
|
|
char *val = getenv(tokens[i*2]);
|
|
if (val)
|
|
savedEnv[i] = strdup(val);
|
|
else
|
|
savedEnv[i] = 0;
|
|
setenv(tokens[i*2], tokens[i*2+1], 1);
|
|
fprintf(stdout, "[ ] setting environmental variable %s to %s\n", tokens[i*2], getenv(tokens[i*2]));
|
|
}
|
|
}
|
|
|
|
// Collect the number of available GPUs
|
|
HIP_CALL(hipGetDeviceCount(&numDevicesAvailable));
|
|
|
|
// Only proceed with testing if there are enough GPUs
|
|
if (numDevices > numDevicesAvailable)
|
|
{
|
|
fprintf(stdout, "[ SKIPPED ] Test requires %d devices (only %d available)\n",
|
|
numDevices, numDevicesAvailable);
|
|
|
|
// Modify the number of devices so that tear-down doesn't occur
|
|
// This is temporary until GTEST_SKIP() becomes available
|
|
numDevices = 0;
|
|
numDevicesAvailable = -1;
|
|
return;
|
|
}
|
|
|
|
// Initialize communicators
|
|
comms.resize(numDevices);
|
|
NCCL_CALL(ncclCommInitAll(comms.data(), numDevices, NULL));
|
|
|
|
// Create streams
|
|
streams.resize(numDevices);
|
|
for (int i = 0; i < numDevices; i++)
|
|
{
|
|
HIP_CALL(hipSetDevice(i));
|
|
HIP_CALL(hipStreamCreate(&streams[i]));
|
|
}
|
|
}
|
|
|
|
// Clean up per TestTuple
|
|
void TearDown() override
|
|
{
|
|
// Release communicators and streams
|
|
for (int i = 0; i < numDevices; i++)
|
|
{
|
|
NCCL_CALL(ncclCommDestroy(comms[i]));
|
|
HIP_CALL(hipStreamDestroy(streams[i]));
|
|
}
|
|
// Restore env vars after tests
|
|
for (int i = 0; i < numTokens/2; i++) {
|
|
if (savedEnv[i]) {
|
|
setenv(tokens[i*2], savedEnv[i], 1);
|
|
fprintf(stdout, "[ ] restored environmental variable %s to %s\n", tokens[i*2], getenv(tokens[i*2]));
|
|
free(savedEnv[i]);
|
|
}
|
|
else {
|
|
unsetenv(tokens[i*2]);
|
|
fprintf(stdout, "[ ] removed environmental variable %s\n", tokens[i*2]);
|
|
}
|
|
}
|
|
// Cleanup
|
|
unsetenv("RCCL_TEST_ENV_VARS");
|
|
free(envString);
|
|
}
|
|
|
|
void FillDatasetWithPattern(Dataset& dataset)
|
|
{
|
|
int8_t* arrayI1 = (int8_t *)malloc(dataset.NumBytes(ncclInputBuffer));
|
|
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
|
int32_t* arrayI4 = (int32_t *)arrayI1;
|
|
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
|
int64_t* arrayI8 = (int64_t *)arrayI1;
|
|
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
|
float* arrayF4 = (float *)arrayI1;
|
|
double* arrayF8 = (double *)arrayI1;
|
|
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
|
|
|
// NOTE: Currently half-precision float tests are unsupported due to half being supported
|
|
// on GPU only and not host
|
|
|
|
// Fills input data[i][j] with (i + j) % 6
|
|
// - Keeping range small to reduce likelihood of overflow
|
|
// - Sticking with floating points values that are perfectly representable
|
|
for (int i = 0; i < dataset.numDevices; i++)
|
|
{
|
|
for (int j = 0; j < dataset.NumBytes(ncclInputBuffer)/DataTypeToBytes(dataset.dataType); j++)
|
|
{
|
|
int valueI = (i + j) % 6;
|
|
float valueF = (float)valueI;
|
|
|
|
switch (dataset.dataType)
|
|
{
|
|
case ncclInt8: arrayI1[j] = valueI; break;
|
|
case ncclUint8: arrayU1[j] = valueI; break;
|
|
case ncclInt32: arrayI4[j] = valueI; break;
|
|
case ncclUint32: arrayU4[j] = valueI; break;
|
|
case ncclInt64: arrayI8[j] = valueI; break;
|
|
case ncclUint64: arrayU8[j] = valueI; break;
|
|
case ncclFloat32: arrayF4[j] = valueF; break;
|
|
case ncclFloat64: arrayF8[j] = valueF; break;
|
|
case ncclBfloat16: arrayB2[j] = rccl_bfloat16(valueF); break;
|
|
default:
|
|
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
HIP_CALL(hipSetDevice(i));
|
|
HIP_CALL(hipMemcpy(dataset.inputs[i], arrayI1, dataset.NumBytes(ncclInputBuffer), hipMemcpyHostToDevice));
|
|
|
|
// Fills output data[i][j] with 0 (if not inplace)
|
|
if (!dataset.inPlace)
|
|
HIP_CALL(hipMemset(dataset.outputs[i], 0, dataset.NumBytes(ncclOutputBuffer)));
|
|
}
|
|
|
|
free(arrayI1);
|
|
}
|
|
|
|
void Synchronize() const
|
|
{
|
|
// Wait for reduction to complete
|
|
for (int i = 0; i < numDevices; i++)
|
|
{
|
|
HIP_CALL(hipSetDevice(i));
|
|
HIP_CALL(hipStreamSynchronize(streams[i]));
|
|
}
|
|
}
|
|
|
|
void ValidateResults(Dataset const& dataset, int root = 0) const
|
|
{
|
|
int8_t* outputI1 = (int8_t *)malloc(dataset.NumBytes(ncclOutputBuffer));
|
|
uint8_t* outputU1 = (uint8_t *)outputI1;
|
|
int32_t* outputI4 = (int32_t *)outputI1;
|
|
uint32_t* outputU4 = (uint32_t *)outputI1;
|
|
int64_t* outputI8 = (int64_t *)outputI1;
|
|
uint64_t* outputU8 = (uint64_t *)outputI1;
|
|
float* outputF4 = (float *)outputI1;
|
|
double* outputF8 = (double *)outputI1;
|
|
rccl_bfloat16* outputB2 = (rccl_bfloat16 *)outputI1;
|
|
|
|
bool isMatch = true;
|
|
|
|
// Loop over each device's output and compare it to the expected output
|
|
// (Each collective operation computes its own expected results)
|
|
for (int i = 0; i < dataset.numDevices && isMatch; i++)
|
|
{
|
|
// only output on root rank is valid for gather collective
|
|
if (dataset.function == ncclCollGather && i != root)
|
|
continue;
|
|
HIP_CALL(hipMemcpy(outputI1, dataset.outputs[i], dataset.NumBytes(ncclOutputBuffer), hipMemcpyDeviceToHost));
|
|
|
|
int8_t* expectedI1 = (int8_t *)dataset.expected[i];
|
|
uint8_t* expectedU1 = (uint8_t *)expectedI1;
|
|
int32_t* expectedI4 = (int32_t *)expectedI1;
|
|
uint32_t* expectedU4 = (uint32_t *)expectedI1;
|
|
int64_t* expectedI8 = (int64_t *)expectedI1;
|
|
uint64_t* expectedU8 = (uint64_t *)expectedI1;
|
|
float* expectedF4 = (float *)expectedI1;
|
|
double* expectedF8 = (double *)expectedI1;
|
|
rccl_bfloat16* expectedB2 = (rccl_bfloat16 *)expectedI1;
|
|
|
|
for (int j = 0; j < dataset.numElements && isMatch; j++)
|
|
{
|
|
switch (dataset.dataType)
|
|
{
|
|
case ncclInt8: isMatch &= (outputI1[j] == expectedI1[j]); break;
|
|
case ncclUint8: isMatch &= (outputU1[j] == expectedU1[j]); break;
|
|
case ncclInt32: isMatch &= (outputI4[j] == expectedI4[j]); break;
|
|
case ncclUint32: isMatch &= (outputU4[j] == expectedU4[j]); break;
|
|
case ncclInt64: isMatch &= (outputI8[j] == expectedI8[j]); break;
|
|
case ncclUint64: isMatch &= (outputU8[j] == expectedU8[j]); break;
|
|
case ncclFloat32: isMatch &= (outputF4[j] == expectedF4[j]); break;
|
|
case ncclFloat64: isMatch &= (outputF8[j] == expectedF8[j]); break;
|
|
case ncclBfloat16: isMatch &= (outputB2[j] == expectedB2[j]); break;
|
|
default:
|
|
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
|
exit(0);
|
|
}
|
|
|
|
if (!isMatch)
|
|
{
|
|
switch (dataset.dataType)
|
|
{
|
|
case ncclInt8:
|
|
printf("Expected %d. Output %d on device %d[%d]\n", expectedI1[j], outputI1[j], i, j);
|
|
break;
|
|
case ncclUint8:
|
|
printf("Expected %u. Output %u on device %d[%d]\n", expectedU1[j], outputU1[j], i, j); break;
|
|
case ncclInt32:
|
|
printf("Expected %d. Output %d on device %d[%d]\n", expectedI4[j], outputI4[j], i, j); break;
|
|
case ncclUint32:
|
|
printf("Expected %u. Output %u on device %d[%d]\n", expectedU4[j], outputU4[j], i, j); break;
|
|
case ncclInt64:
|
|
printf("Expected %ld. Output %ld on device %d[%d]\n", expectedI8[j], outputI8[j], i, j); break;
|
|
case ncclUint64:
|
|
printf("Expected %lu. Output %lu on device %d[%d]\n", expectedU8[j], outputU8[j], i, j); break;
|
|
case ncclFloat32:
|
|
printf("Expected %f. Output %f on device %d[%d]\n", expectedF4[j], outputF4[j], i, j); break;
|
|
case ncclFloat64:
|
|
printf("Expected %lf. Output %lf on device %d[%d]\n", expectedF8[j], outputF8[j], i, j); break;
|
|
case ncclBfloat16:
|
|
printf("Expected %f. Output %f on device %d[%d]\n", (float)expectedB2[j], (float)outputB2[j], i, j); break;
|
|
default:
|
|
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
|
exit(0);
|
|
}
|
|
}
|
|
}
|
|
ASSERT_EQ(isMatch, true);
|
|
}
|
|
free(outputI1);
|
|
}
|
|
|
|
// Passed in parameters from TestTuple
|
|
ncclRedOp_t op;
|
|
ncclDataType_t dataType;
|
|
size_t numElements;
|
|
int numDevices;
|
|
bool inPlace;
|
|
const char* envVals;
|
|
|
|
int numDevicesAvailable;
|
|
std::vector<ncclComm_t> comms;
|
|
std::vector<hipStream_t> streams;
|
|
|
|
// internal only
|
|
char* envString;
|
|
char* tokens[MAX_ENV_TOKENS];
|
|
int numTokens;
|
|
char* savedEnv[MAX_ENV_TOKENS/2];
|
|
};
|
|
|
|
}
|
|
|
|
#endif
|