Files
rocm-systems/test/CorrectnessTest.hpp
T
gilbertlee-amd 3e62ceddc5 Clique kernel support (#295) (#15)
* Adding experimental clique-based kernels (opt-in only)

Co-authored-by: Stanley Tsang <stanley.tsang@amd.com>
Co-authored-by: Gilbert Lee <gilbert.lee@amd.com>
Co-authored-by: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>

Co-authored-by: Stanley Tsang <stanley.tsang@amd.com>
Co-authored-by: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
2021-01-28 09:45:01 -07:00

472 lines
20 KiB
C++

/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef CORRECTNESSTEST_HPP
#define CORRECTNESSTEST_HPP
#include <cstdio>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "rccl.h"
#include "../include/rccl_bfloat16.h"
#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess)
#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess)
#define MAX_ENV_TOKENS 16
namespace CorrectnessTests
{
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollSendRecv } ncclFunc_t;
typedef enum { ncclInputBuffer, ncclOutputBuffer } ncclBufferType_t;
// Performs the various basic reduction operations
template <typename T>
T ReduceOp(ncclRedOp_t const op, T const A, T const B)
{
switch (op)
{
case ncclSum: return A + B;
case ncclProd: return A * B;
case ncclMax: return std::max(A, B);
case ncclMin: return std::min(A, B);
default:
fprintf(stderr, "[ERROR] Unsupported reduction operator (%d)\n", op);
exit(0);
}
}
// Returns the number of bytes per element for each supported datatype
static int DataTypeToBytes(ncclDataType_t const dataType)
{
switch (dataType)
{
case ncclInt8: return 1;
case ncclUint8: return 1;
case ncclInt32: return 4;
case ncclUint32: return 4;
case ncclInt64: return 8;
case ncclUint64: return 8;
case ncclFloat16: return 2;
case ncclFloat32: return 4;
case ncclFloat64: return 8;
case ncclBfloat16: return 2;
default:
fprintf(stderr, "[ERROR] Unsupported datatype (%d)\n", dataType);
exit(0);
}
}
// Encapsulates all the memory used per devices for collectives, as well as reference results
struct Dataset
{
int numDevices; // Number of devices participating
size_t numElements; // Number of elements per array
ncclDataType_t dataType; // Data type of each input/output pointer
bool inPlace; // Whether or not output pointers are same as input pointers
ncclFunc_t function; // Buffer sizes are different in case of gather, scatter and all to all
std::vector<void *> inputs; // Input pointers (1 per device)
std::vector<void *> outputs; // Output pointers (1 per device)
// May be identical to input pointers for in-place tests
std::vector<void *> expected; // Expected output (1 per device)
size_t NumBytes() const
{
return numElements * DataTypeToBytes(dataType);
}
size_t NumBytes(ncclBufferType_t bufferType) const
{
if ((function == ncclCollGather && (bufferType == ncclOutputBuffer || inPlace == true)) ||
(function == ncclCollScatter && bufferType == ncclInputBuffer) ||
function == ncclCollAllToAll)
return numElements * DataTypeToBytes(dataType) * numDevices;
return numElements * DataTypeToBytes(dataType);
}
void Initialize(int const numDevices_,
size_t const numElements_,
ncclDataType_t const dataType_,
bool const inPlace_,
ncclFunc_t const func_ = ncclCollBroadcast)
{
numDevices = numDevices_;
numElements = numElements_;
dataType = dataType_;
inPlace = inPlace_;
function = func_;
inputs.resize(numDevices);
outputs.resize(numDevices);
expected.resize(numDevices);
// Allocate per-device memory
for (int i = 0; i < numDevices; i++)
{
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipMalloc((void **)&inputs[i], NumBytes(ncclInputBuffer)));
if (inPlace)
outputs[i] = inputs[i];
else
HIP_CALL(hipMalloc((void **)&outputs[i], NumBytes(ncclOutputBuffer)));
expected[i] = malloc(NumBytes(ncclOutputBuffer));
}
}
// Explicit memory release to avoid double-free from subDatasets
void Release()
{
for (int i = 0; i < numDevices; i++)
{
if (!inPlace) hipFree(outputs[i]);
hipFree(inputs[i]);
free(expected[i]);
}
outputs.clear();
}
// Creates a dataset by pointing to an existing dataset
// Primarily to allow for testing with different starting byte-alignments
void ExtractSubDataset(size_t const startElement,
size_t const lastElement,
Dataset& subDataset)
{
ASSERT_LE(startElement, lastElement);
ASSERT_LT(lastElement, numElements);
subDataset.numDevices = numDevices;
subDataset.numElements = lastElement - startElement + 1;
subDataset.dataType = dataType;
subDataset.inPlace = inPlace;
subDataset.inputs.resize(numDevices);
subDataset.outputs.resize(numDevices);
subDataset.expected.resize(numDevices);
size_t const byteOffset = (startElement * DataTypeToBytes(dataType));
for (int i = 0; i < numDevices; i++)
{
subDataset.inputs[i] = (int8_t *)inputs[i] + byteOffset;
subDataset.outputs[i] = (int8_t *)outputs[i] + byteOffset;
subDataset.expected[i] = (int8_t *)expected[i] + byteOffset;
}
}
};
typedef std::tuple<ncclRedOp_t /* op */,
ncclDataType_t /* dataType */,
size_t /* numElements */,
int /* numDevices */,
bool /* inPlace */,
const char* /* envVals */> TestTuple;
// Base class for each collective test
// - Each test is instantiated with a different TestTuple
class CorrectnessTest : public testing::TestWithParam<TestTuple>
{
public:
struct PrintToStringParamName
{
std::string operator()(const testing::TestParamInfo<CorrectnessTest::ParamType>& info)
{
std::string name;
name += opStrings[std::get<0>(info.param)] + "_";
name += dataTypeStrings[std::get<1>(info.param)] + "_";
name += std::to_string(std::get<2>(info.param)) + "elements_";
name += std::to_string(std::get<3>(info.param)) + "devices_";
name += std::get<4>(info.param) == true ? "inplace_" : "outofplace_";
std::string envVars = std::string(std::get<5>(info.param));
std::replace(envVars.begin(), envVars.end(), '=', '_');
name += envVars;
return name;
}
std::map<ncclRedOp_t, std::string> opStrings
{
{ncclSum, "sum"},
{ncclProd, "prod"},
{ncclMax, "max"},
{ncclMin, "min"}
};
std::map<ncclDataType_t, std::string> dataTypeStrings
{
{ncclInt8, "int8"},
{ncclChar, "char"},
{ncclUint8, "uint8"},
{ncclInt32, "int32"},
{ncclInt, "int"},
{ncclUint32, "uint32"},
{ncclInt64, "int64"},
{ncclUint64, "uint64"},
{ncclFloat16, "float16"},
{ncclHalf, "half"},
{ncclFloat32, "float32"},
{ncclFloat64, "float64"},
{ncclDouble, "double"},
{ncclBfloat16, "bfloat16"}
};
};
protected:
// This code is called per test-tuple
void SetUp() override
{
// Make the test tuple parameters accessible
std::tie(op, dataType, numElements, numDevices, inPlace, envVals) = GetParam();
envString = 0;
numTokens = 0;
if (strcmp(envVals, "")) {
// enable RCCL env vars testing
setenv("RCCL_TEST_ENV_VARS", "ENABLE", 1);
envString = strdup(envVals);
tokens[numTokens] = strtok(envString, "=, ");
numTokens++;
while (tokens[numTokens-1] != NULL && numTokens < MAX_ENV_TOKENS)
tokens[numTokens++] = strtok(NULL, "=, ");
for (int i = 0; i < numTokens/2; i++) {
char *val = getenv(tokens[i*2]);
if (val)
savedEnv[i] = strdup(val);
else
savedEnv[i] = 0;
setenv(tokens[i*2], tokens[i*2+1], 1);
fprintf(stdout, "[ ] setting environmental variable %s to %s\n", tokens[i*2], getenv(tokens[i*2]));
}
}
// Collect the number of available GPUs
HIP_CALL(hipGetDeviceCount(&numDevicesAvailable));
// Only proceed with testing if there are enough GPUs
if (numDevices > numDevicesAvailable)
{
fprintf(stdout, "[ SKIPPED ] Test requires %d devices (only %d available)\n",
numDevices, numDevicesAvailable);
// Modify the number of devices so that tear-down doesn't occur
// This is temporary until GTEST_SKIP() becomes available
numDevices = 0;
numDevicesAvailable = -1;
return;
}
// Initialize communicators
comms.resize(numDevices);
NCCL_CALL(ncclCommInitAll(comms.data(), numDevices, NULL));
// Create streams
streams.resize(numDevices);
for (int i = 0; i < numDevices; i++)
{
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipStreamCreate(&streams[i]));
}
}
// Clean up per TestTuple
void TearDown() override
{
// Release communicators and streams
for (int i = 0; i < numDevices; i++)
{
NCCL_CALL(ncclCommDestroy(comms[i]));
HIP_CALL(hipStreamDestroy(streams[i]));
}
// Restore env vars after tests
for (int i = 0; i < numTokens/2; i++) {
if (savedEnv[i]) {
setenv(tokens[i*2], savedEnv[i], 1);
fprintf(stdout, "[ ] restored environmental variable %s to %s\n", tokens[i*2], getenv(tokens[i*2]));
free(savedEnv[i]);
}
else {
unsetenv(tokens[i*2]);
fprintf(stdout, "[ ] removed environmental variable %s\n", tokens[i*2]);
}
}
// Cleanup
unsetenv("RCCL_TEST_ENV_VARS");
free(envString);
}
void FillDatasetWithPattern(Dataset& dataset)
{
int8_t* arrayI1 = (int8_t *)malloc(dataset.NumBytes(ncclInputBuffer));
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
// NOTE: Currently half-precision float tests are unsupported due to half being supported
// on GPU only and not host
// Fills input data[i][j] with (i + j) % 6
// - Keeping range small to reduce likelihood of overflow
// - Sticking with floating points values that are perfectly representable
for (int i = 0; i < dataset.numDevices; i++)
{
for (int j = 0; j < dataset.NumBytes(ncclInputBuffer)/DataTypeToBytes(dataset.dataType); j++)
{
int valueI = (i + j) % 6;
float valueF = (float)valueI;
switch (dataset.dataType)
{
case ncclInt8: arrayI1[j] = valueI; break;
case ncclUint8: arrayU1[j] = valueI; break;
case ncclInt32: arrayI4[j] = valueI; break;
case ncclUint32: arrayU4[j] = valueI; break;
case ncclInt64: arrayI8[j] = valueI; break;
case ncclUint64: arrayU8[j] = valueI; break;
case ncclFloat32: arrayF4[j] = valueF; break;
case ncclFloat64: arrayF8[j] = valueF; break;
case ncclBfloat16: arrayB2[j] = rccl_bfloat16(valueF); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipMemcpy(dataset.inputs[i], arrayI1, dataset.NumBytes(ncclInputBuffer), hipMemcpyHostToDevice));
// Fills output data[i][j] with 0 (if not inplace)
if (!dataset.inPlace)
HIP_CALL(hipMemset(dataset.outputs[i], 0, dataset.NumBytes(ncclOutputBuffer)));
}
free(arrayI1);
}
void Synchronize() const
{
// Wait for reduction to complete
for (int i = 0; i < numDevices; i++)
{
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipStreamSynchronize(streams[i]));
}
}
void ValidateResults(Dataset const& dataset, int root = 0) const
{
int8_t* outputI1 = (int8_t *)malloc(dataset.NumBytes(ncclOutputBuffer));
uint8_t* outputU1 = (uint8_t *)outputI1;
int32_t* outputI4 = (int32_t *)outputI1;
uint32_t* outputU4 = (uint32_t *)outputI1;
int64_t* outputI8 = (int64_t *)outputI1;
uint64_t* outputU8 = (uint64_t *)outputI1;
float* outputF4 = (float *)outputI1;
double* outputF8 = (double *)outputI1;
rccl_bfloat16* outputB2 = (rccl_bfloat16 *)outputI1;
bool isMatch = true;
// Loop over each device's output and compare it to the expected output
// (Each collective operation computes its own expected results)
for (int i = 0; i < dataset.numDevices && isMatch; i++)
{
// only output on root rank is valid for gather collective
if (dataset.function == ncclCollGather && i != root)
continue;
HIP_CALL(hipMemcpy(outputI1, dataset.outputs[i], dataset.NumBytes(ncclOutputBuffer), hipMemcpyDeviceToHost));
int8_t* expectedI1 = (int8_t *)dataset.expected[i];
uint8_t* expectedU1 = (uint8_t *)expectedI1;
int32_t* expectedI4 = (int32_t *)expectedI1;
uint32_t* expectedU4 = (uint32_t *)expectedI1;
int64_t* expectedI8 = (int64_t *)expectedI1;
uint64_t* expectedU8 = (uint64_t *)expectedI1;
float* expectedF4 = (float *)expectedI1;
double* expectedF8 = (double *)expectedI1;
rccl_bfloat16* expectedB2 = (rccl_bfloat16 *)expectedI1;
for (int j = 0; j < dataset.numElements && isMatch; j++)
{
switch (dataset.dataType)
{
case ncclInt8: isMatch &= (outputI1[j] == expectedI1[j]); break;
case ncclUint8: isMatch &= (outputU1[j] == expectedU1[j]); break;
case ncclInt32: isMatch &= (outputI4[j] == expectedI4[j]); break;
case ncclUint32: isMatch &= (outputU4[j] == expectedU4[j]); break;
case ncclInt64: isMatch &= (outputI8[j] == expectedI8[j]); break;
case ncclUint64: isMatch &= (outputU8[j] == expectedU8[j]); break;
case ncclFloat32: isMatch &= (outputF4[j] == expectedF4[j]); break;
case ncclFloat64: isMatch &= (outputF8[j] == expectedF8[j]); break;
case ncclBfloat16: isMatch &= (outputB2[j] == expectedB2[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
if (!isMatch)
{
switch (dataset.dataType)
{
case ncclInt8:
printf("Expected %d. Output %d on device %d[%d]\n", expectedI1[j], outputI1[j], i, j);
break;
case ncclUint8:
printf("Expected %u. Output %u on device %d[%d]\n", expectedU1[j], outputU1[j], i, j); break;
case ncclInt32:
printf("Expected %d. Output %d on device %d[%d]\n", expectedI4[j], outputI4[j], i, j); break;
case ncclUint32:
printf("Expected %u. Output %u on device %d[%d]\n", expectedU4[j], outputU4[j], i, j); break;
case ncclInt64:
printf("Expected %ld. Output %ld on device %d[%d]\n", expectedI8[j], outputI8[j], i, j); break;
case ncclUint64:
printf("Expected %lu. Output %lu on device %d[%d]\n", expectedU8[j], outputU8[j], i, j); break;
case ncclFloat32:
printf("Expected %f. Output %f on device %d[%d]\n", expectedF4[j], outputF4[j], i, j); break;
case ncclFloat64:
printf("Expected %lf. Output %lf on device %d[%d]\n", expectedF8[j], outputF8[j], i, j); break;
case ncclBfloat16:
printf("Expected %f. Output %f on device %d[%d]\n", (float)expectedB2[j], (float)outputB2[j], i, j); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
ASSERT_EQ(isMatch, true);
}
free(outputI1);
}
// Passed in parameters from TestTuple
ncclRedOp_t op;
ncclDataType_t dataType;
size_t numElements;
int numDevices;
bool inPlace;
const char* envVals;
int numDevicesAvailable;
std::vector<ncclComm_t> comms;
std::vector<hipStream_t> streams;
// internal only
char* envString;
char* tokens[MAX_ENV_TOKENS];
int numTokens;
char* savedEnv[MAX_ENV_TOKENS/2];
};
}
#endif