Files
rocm-systems/test/CorrectnessTest.hpp
T
2020-01-29 15:28:08 -08:00

370 строки
15 KiB
C++

/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef CORRECTNESSTEST_HPP
#define CORRECTNESSTEST_HPP
#include <cstdio>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "rccl.h"
#include "../include/rccl_bfloat16.h"
#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess)
#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess)
namespace CorrectnessTests
{
// Performs the various basic reduction operations
template <typename T>
T ReduceOp(ncclRedOp_t const op, T const A, T const B)
{
switch (op)
{
case ncclSum: return A + B;
case ncclProd: return A * B;
case ncclMax: return std::max(A, B);
case ncclMin: return std::min(A, B);
default:
fprintf(stderr, "[ERROR] Unsupported reduction operator (%d)\n", op);
exit(0);
}
}
// Returns the number of bytes per element for each supported datatype
static int DataTypeToBytes(ncclDataType_t const dataType)
{
switch (dataType)
{
case ncclInt8: return 1;
case ncclUint8: return 1;
case ncclInt32: return 4;
case ncclUint32: return 4;
case ncclInt64: return 8;
case ncclUint64: return 8;
case ncclFloat16: return 2;
case ncclFloat32: return 4;
case ncclFloat64: return 8;
case ncclBfloat16: return 2;
default:
fprintf(stderr, "[ERROR] Unsupported datatype (%d)\n", dataType);
exit(0);
}
}
// Encapsulates all the memory used per devices for collectives, as well as reference results
struct Dataset
{
int numDevices; // Number of devices participating
size_t numElements; // Number of elements per array
ncclDataType_t dataType; // Data type of each input/output pointer
bool inPlace; // Whether or not output pointers are same as input pointers
std::vector<void *> inputs; // Input pointers (1 per device)
std::vector<void *> outputs; // Output pointers (1 per device)
// May be identical to input pointers for in-place tests
std::vector<void *> expected; // Expected output (1 per device)
size_t NumBytes() const
{
return numElements * DataTypeToBytes(dataType);
}
void Initialize(int const numDevices_,
size_t const numElements_,
ncclDataType_t const dataType_,
bool const inPlace_)
{
numDevices = numDevices_;
numElements = numElements_;
dataType = dataType_;
inPlace = inPlace_;
inputs.resize(numDevices);
outputs.resize(numDevices);
expected.resize(numDevices);
// Allocate per-device memory
size_t const numBytes = NumBytes();
for (int i = 0; i < numDevices; i++)
{
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipMalloc((void **)&inputs[i], numBytes));
if (inPlace)
outputs[i] = inputs[i];
else
HIP_CALL(hipMalloc((void **)&outputs[i], numBytes));
expected[i] = malloc(numBytes);
}
}
// Explicit memory release to avoid double-free from subDatasets
void Release()
{
for (int i = 0; i < outputs.size(); i++)
{
if (!inPlace) hipFree(outputs[i]);
hipFree(inputs[i]);
free(expected[i]);
}
outputs.clear();
}
// Creates a dataset by pointing to an existing dataset
// Primarily to allow for testing with different starting byte-alignments
void ExtractSubDataset(size_t const startElement,
size_t const lastElement,
Dataset& subDataset)
{
ASSERT_LE(startElement, lastElement);
ASSERT_LT(lastElement, numElements);
subDataset.numDevices = numDevices;
subDataset.numElements = lastElement - startElement + 1;
subDataset.dataType = dataType;
subDataset.inPlace = inPlace;
subDataset.inputs.resize(numDevices);
subDataset.outputs.resize(numDevices);
subDataset.expected.resize(numDevices);
size_t const byteOffset = (startElement * DataTypeToBytes(dataType));
for (int i = 0; i < numDevices; i++)
{
subDataset.inputs[i] = (int8_t *)inputs[i] + byteOffset;
subDataset.outputs[i] = (int8_t *)outputs[i] + byteOffset;
subDataset.expected[i] = (int8_t *)expected[i] + byteOffset;
}
}
};
typedef std::tuple<ncclRedOp_t /* op */,
ncclDataType_t /* dataType */,
size_t /* numElements */,
int /* numDevices */,
bool /* inPlace */> TestTuple;
// Base class for each collective test
// - Each test is instantiated with a different TestTuple
class CorrectnessTest : public testing::TestWithParam<TestTuple>
{
protected:
// This code is called per test-tuple
void SetUp() override
{
// Check for fine-grained env variable (otherwise will hang)
if (!getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
{
printf("Must set HSA_FORCE_FINE_GRAIN_PCIE=1 prior to execution\n");
exit(0);
}
// Make the test tuple parameters accessible
std::tie(op, dataType, numElements, numDevices, inPlace) = GetParam();
// Collect the number of available GPUs
HIP_CALL(hipGetDeviceCount(&numDevicesAvailable));
// Only proceed with testing if there are enough GPUs
if (numDevices > numDevicesAvailable)
{
fprintf(stdout, "[ SKIPPED ] Test requires %d devices (only %d available)\n",
numDevices, numDevicesAvailable);
// Modify the number of devices so that tear-down doesn't occur
// This is temporary until GTEST_SKIP() becomes available
numDevices = 0;
numDevicesAvailable = -1;
return;
}
// Initialize communicators
comms.resize(numDevices);
NCCL_CALL(ncclCommInitAll(comms.data(), numDevices, NULL));
// Create streams
streams.resize(numDevices);
for (int i = 0; i < numDevices; i++)
{
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipStreamCreate(&streams[i]));
}
}
// Clean up per TestTuple
void TearDown() override
{
// Release communicators and streams
for (int i = 0; i < numDevices; i++)
{
NCCL_CALL(ncclCommDestroy(comms[i]));
HIP_CALL(hipStreamDestroy(streams[i]));
}
}
void FillDatasetWithPattern(Dataset& dataset)
{
int8_t* arrayI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
// NOTE: Currently half-precision float tests are unsupported due to half being supported
// on GPU only and not host
// Fills input data[i][j] with (i + j) % 6
// - Keeping range small to reduce likelihood of overflow
// - Sticking with floating points values that are perfectly representable
for (int i = 0; i < dataset.numDevices; i++)
{
for (int j = 0; j < dataset.numElements; j++)
{
int valueI = (i + j) % 6;
float valueF = (float)valueI;
switch (dataset.dataType)
{
case ncclInt8: arrayI1[j] = valueI; break;
case ncclUint8: arrayU1[j] = valueI; break;
case ncclInt32: arrayI4[j] = valueI; break;
case ncclUint32: arrayU4[j] = valueI; break;
case ncclInt64: arrayI8[j] = valueI; break;
case ncclUint64: arrayU8[j] = valueI; break;
case ncclFloat32: arrayF4[j] = valueF; break;
case ncclFloat64: arrayF8[j] = valueF; break;
case ncclBfloat16: arrayB2[j] = rccl_bfloat16(valueF); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipMemcpy(dataset.inputs[i], arrayI1, dataset.NumBytes(), hipMemcpyHostToDevice));
// Fills output data[i][j] with 0 (if not inplace)
if (!dataset.inPlace)
HIP_CALL(hipMemset(dataset.outputs[i], 0, dataset.NumBytes()));
}
free(arrayI1);
}
void Synchronize() const
{
// Wait for reduction to complete
for (int i = 0; i < numDevices; i++)
{
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipStreamSynchronize(streams[i]));
}
}
void ValidateResults(Dataset const& dataset) const
{
int8_t* outputI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* outputU1 = (uint8_t *)outputI1;
int32_t* outputI4 = (int32_t *)outputI1;
uint32_t* outputU4 = (uint32_t *)outputI1;
int64_t* outputI8 = (int64_t *)outputI1;
uint64_t* outputU8 = (uint64_t *)outputI1;
float* outputF4 = (float *)outputI1;
double* outputF8 = (double *)outputI1;
rccl_bfloat16* outputB2 = (rccl_bfloat16 *)outputI1;
bool isMatch = true;
// Loop over each device's output and compare it to the expected output
// (Each collective operation computes its own expected results)
for (int i = 0; i < dataset.numDevices && isMatch; i++)
{
HIP_CALL(hipMemcpy(outputI1, dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost));
int8_t* expectedI1 = (int8_t *)dataset.expected[i];
uint8_t* expectedU1 = (uint8_t *)expectedI1;
int32_t* expectedI4 = (int32_t *)expectedI1;
uint32_t* expectedU4 = (uint32_t *)expectedI1;
int64_t* expectedI8 = (int64_t *)expectedI1;
uint64_t* expectedU8 = (uint64_t *)expectedI1;
float* expectedF4 = (float *)expectedI1;
double* expectedF8 = (double *)expectedI1;
rccl_bfloat16* expectedB2 = (rccl_bfloat16 *)expectedI1;
for (int j = 0; j < dataset.numElements && isMatch; j++)
{
switch (dataset.dataType)
{
case ncclInt8: isMatch &= (outputI1[j] == expectedI1[j]); break;
case ncclUint8: isMatch &= (outputU1[j] == expectedU1[j]); break;
case ncclInt32: isMatch &= (outputI4[j] == expectedI4[j]); break;
case ncclUint32: isMatch &= (outputU4[j] == expectedU4[j]); break;
case ncclInt64: isMatch &= (outputI8[j] == expectedI8[j]); break;
case ncclUint64: isMatch &= (outputU8[j] == expectedU8[j]); break;
case ncclFloat32: isMatch &= (outputF4[j] == expectedF4[j]); break;
case ncclFloat64: isMatch &= (outputF8[j] == expectedF8[j]); break;
case ncclBfloat16: isMatch &= (outputB2[j] == expectedB2[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
if (!isMatch)
{
switch (dataset.dataType)
{
case ncclInt8:
printf("Expected %d. Output %d on device %d[%d]\n", outputI1[j], expectedI1[j], i, j); break;
case ncclUint8:
printf("Expected %u. Output %u on device %d[%d]\n", outputU1[j], expectedU1[j], i, j); break;
case ncclInt32:
printf("Expected %d. Output %d on device %d[%d]\n", outputI4[j], expectedI4[j], i, j); break;
case ncclUint32:
printf("Expected %u. Output %u on device %d[%d]\n", outputU4[j], expectedU4[j], i, j); break;
case ncclInt64:
printf("Expected %ld. Output %ld on device %d[%d]\n", outputI8[j], expectedI8[j], i, j); break;
case ncclUint64:
printf("Expected %lu. Output %lu on device %d[%d]\n", outputU8[j], expectedU8[j], i, j); break;
case ncclFloat32:
printf("Expected %f. Output %f on device %d[%d]\n", outputF4[j], expectedF4[j], i, j); break;
case ncclFloat64:
printf("Expected %lf. Output %lf on device %d[%d]\n", outputF8[j], expectedF8[j], i, j); break;
case ncclBfloat16:
printf("Expected %f. Output %f on device %d[%d]\n", (float)outputB2[j], (float)expectedB2[j], i, j); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
ASSERT_EQ(isMatch, true);
}
}
// Passed in parameters from TestTuple
ncclRedOp_t op;
ncclDataType_t dataType;
size_t numElements;
int numDevices;
bool inPlace;
int numDevicesAvailable;
std::vector<ncclComm_t> comms;
std::vector<hipStream_t> streams;
};
}
#endif