SWDEV-361383 - Fixing compilation issue for stress test files under catch2 framework (#2998)

Change-Id: I4253a6cdb2c10bcb0021f869b7b3c4f1acd28787

[ROCm/hip-tests commit: 339a51dcaf]
This commit is contained in:
ROCm CI Service Account
2022-10-19 16:52:24 +05:30
committed by GitHub
parent 4ea956e684
commit ee95d39a40
7 changed files with 56 additions and 55 deletions
@@ -349,7 +349,7 @@ static bool TestMemoryAllocationInLoop(int test_type,
}
if (!bPassed) break;
}
hipFree(outputVec_d);
HIP_CHECK(hipFree(outputVec_d));
free(outputVec_h);
return bPassed;
}
@@ -158,7 +158,7 @@ TEST_CASE("Stress_hipMallocManaged_MultiSize") {
hipStream_t strm;
HIP_CHECK(hipStreamCreate(&strm));
dim3 dimBlock(blockSize, 1, 1);
for (int i = 1; i < (1024*1024); ++i) {
for (int i = 1; i < (1024*100); ++i) {
HIP_CHECK(hipMallocManaged(&Hmm1, i));
HIP_CHECK(hipMallocManaged(&Hmm2, i));
for (int j = 0; j < i; ++j) {
@@ -57,9 +57,8 @@ static int HmmAttrPrint() {
return managed;
}
static void ReleaseResource(int *Hmm, int *Hmm1, hipStream_t *strm) {
static void ReleaseResource(int *Hmm, hipStream_t *strm) {
HIP_CHECK(hipFree(Hmm));
HIP_CHECK(hipFree(Hmm1));
HIP_CHECK(hipStreamDestroy(*strm));
}
@@ -70,11 +69,10 @@ static void ReleaseResource(int *Hmm, int *Hmm1, hipStream_t *strm) {
TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") {
int MangdMem = HmmAttrPrint();
if (MangdMem == 1) {
int *Hmm = nullptr, *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4);
int *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4);
int InitVal = 123, NumElms = MemSz/4;
bool IfTestPassed = true;
HIP_CHECK(hipGetDeviceCount(&NumDevs));
HIP_CHECK(hipMallocManaged(&Hmm, MemSz));
HIP_CHECK(hipMallocManaged(&Hmm1, MemSz));
for (int i = 0; i < NumElms; ++i) {
Hmm1[i] = InitVal;
@@ -93,44 +91,40 @@ TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") {
// Prefetching memory from i to j
HIP_CHECK(hipMemPrefetchAsync(Hmm1, MemSz, j, strm));
HIP_CHECK(hipStreamSynchronize(strm));
MemPrftchAsyncKernel<<<(NumElms/32), 32, 0, strm>>>(Hmm, Hmm1, NumElms);
MemPrftchAsyncKernel1<<<(NumElms/32), 32, 0, strm>>>(Hmm1, NumElms);
HIP_CHECK(hipStreamSynchronize(strm));
// Verifying the result
for (int m = 0; m < NumElms; ++m) {
if (Hmm[m] != (InitVal * InitVal)) {
if (Hmm1[m] != (InitVal * InitVal)) {
IfTestPassed = false;
}
}
if (!IfTestPassed) {
ReleaseResource(Hmm, Hmm1, &strm);
ReleaseResource(Hmm1, &strm);
INFO("Did not find expected value!");
REQUIRE(false);
}
// Resetting the values in Hmm
HIP_CHECK(hipMemset(Hmm, 0, MemSz));
// Prefetching memory from j to i
HIP_CHECK(hipMemPrefetchAsync(Hmm1, MemSz, i, strm));
HIP_CHECK(hipStreamSynchronize(strm));
MemPrftchAsyncKernel<<<(NumElms/32), 32, 0, strm>>>(Hmm, Hmm1, NumElms);
MemPrftchAsyncKernel1<<<(NumElms/32), 32, 0, strm>>>(Hmm1, NumElms);
HIP_CHECK(hipStreamSynchronize(strm));
// Verifying the result
for (int m = 0; m < NumElms; ++m) {
if (Hmm[m] != (InitVal * InitVal)) {
if (Hmm1[m] != (InitVal * InitVal)) {
IfTestPassed = false;
}
}
if (!IfTestPassed) {
ReleaseResource(Hmm, Hmm1, &strm);
ReleaseResource(Hmm1, &strm);
INFO("Did not find expected value!");
REQUIRE(false);
}
// Resetting the values in Hmm
HIP_CHECK(hipMemset(Hmm, 0, MemSz));
HIP_CHECK(hipStreamDestroy(strm));
}
}
// Releasing the resources in case all the scenarios passed
HIP_CHECK(hipFree(Hmm));
HIP_CHECK(hipFree(Hmm1));
} else {
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
@@ -72,7 +72,7 @@ void Memcpy_And_verify(int NUM_ELM) {
for (int i = 0; i < Available_Gpus; ++i) {
for (int j = i+1; j < Available_Gpus; ++j) {
canAccessPeer = 0;
hipDeviceCanAccessPeer(&canAccessPeer, i, j);
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
if (canAccessPeer) {
HIP_CHECK(hipMemcpy(A_d[j], A_d[i], NUM_ELM * sizeof(TestType),
hipMemcpyDefault));
@@ -122,7 +122,7 @@ void Memcpy_And_verify(int NUM_ELM) {
int canAccessPeer = 0;
for (int i = 0; i < Available_Gpus; ++i) {
for (int j = i+1; j < Available_Gpus; ++j) {
hipDeviceCanAccessPeer(&canAccessPeer, i, j);
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
if (canAccessPeer) {
HIP_CHECK(hipMemcpyHtoD(hipDeviceptr_t(A_d[i]),
A_h, NUM_ELM * sizeof(TestType)));
@@ -165,7 +165,7 @@ void Memcpy_And_verify(int NUM_ELM) {
for (int i = 0; i < Available_Gpus; ++i) {
for (int j = i+1; j < Available_Gpus; ++j) {
canAccessPeer = 0;
hipDeviceCanAccessPeer(&canAccessPeer, i, j);
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
if (canAccessPeer) {
HIP_CHECK(hipMemcpyAsync(A_d[j], A_d[i],
NUM_ELM * sizeof(TestType),
@@ -219,7 +219,7 @@ void Memcpy_And_verify(int NUM_ELM) {
for (int i = 0; i < Available_Gpus; ++i) {
for (int j = i+1; j < Available_Gpus; ++j) {
canAccessPeer = 0;
hipDeviceCanAccessPeer(&canAccessPeer, i, j);
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
if (canAccessPeer) {
HIP_CHECK(hipSetDevice(j));
HIP_CHECK(hipMemcpyDtoDAsync(hipDeviceptr_t(A_d[j]),
@@ -5,25 +5,25 @@ TEST_CASE("Stress_hipMalloc", "DifferentSizes") {
SECTION("Size 10") {
auto res = hipMalloc(&d_a, sizeof(10));
REQUIRE(res == hipSuccess);
hipFree(d_a);
HIP_CHECK(hipFree(d_a));
d_a = nullptr;
}
SECTION("Size 100") {
auto res = hipMalloc(&d_a, sizeof(100));
REQUIRE(res == hipSuccess);
hipFree(d_a);
HIP_CHECK(hipFree(d_a));
d_a = nullptr;
}
SECTION("Size 1000") {
auto res = hipMalloc(&d_a, sizeof(1000));
REQUIRE(res == hipSuccess);
hipFree(d_a);
HIP_CHECK(hipFree(d_a));
d_a = nullptr;
}
SECTION("Size 10000") {
auto res = hipMalloc(&d_a, sizeof(10000));
REQUIRE(res == hipSuccess);
hipFree(d_a);
HIP_CHECK(hipFree(d_a));
d_a = nullptr;
}
SECTION("Size MAX") {
@@ -31,4 +31,4 @@ TEST_CASE("Stress_hipMalloc", "DifferentSizes") {
REQUIRE(res == hipErrorOutOfMemory);
d_a = nullptr;
}
}
}
@@ -494,7 +494,7 @@ TEST_CASE("Stress_printf_ComplexKernelMultStreamMultGpu") {
unsigned int print_limit = 4; // = 4 GB
uint32_t iterCount = 1;
int numOfGPUs = 0;
hipGetDeviceCount(&numOfGPUs);
HIP_CHECK(hipGetDeviceCount(&numOfGPUs));
if (numOfGPUs < 2) {
printf("Skipping test because numOfGPUs < 2\n");
return;
@@ -26,11 +26,13 @@ THE SOFTWARE.
#include <random>
#include <thread>
__global__ void addVal(unsigned long long* ptr, size_t index, unsigned long long val) {
__global__ void addVal(unsigned long long* ptr, size_t index,
unsigned long long val) {
atomicAdd(ptr + index, val);
}
// Create a copy constructible AtomicWrap around std::atomic so that we can put it in a vector
// Create a copy constructible AtomicWrap around std::atomic so that
// we can put it in a vector
template <typename T> struct AtomicWrap {
std::atomic<T> data;
@@ -68,18 +70,19 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") {
constexpr size_t maxWork = 10000;
constexpr size_t maxVal = 10;
std::uniform_int_distribution<std::mt19937::result_type> genIndex(0, hwThreads - 1);
std::uniform_int_distribution<std::mt19937::result_type> genIndex(0,
hwThreads - 1);
std::uniform_int_distribution<std::mt19937::result_type> genWork(0, maxWork);
std::uniform_int_distribution<std::mt19937::result_type> genVal(0, maxVal);
auto enqueueKernelThread = [&](hipStream_t stream) {
auto iter = genWork(engine); // Generate work to be done via thread
for (auto i = 0; i < iter; i++) {
for (unsigned long i = 0; i < iter; i++) {
auto index = genIndex(engine); // Generate Index to add to
auto val = genVal(engine); // Generate value to add to the destination
auto val = genVal(engine); // Generate value to add to the destination
hostData[index].data += val; // Replicate it on host
addVal<<<1, 1, 0, stream>>>(dPtr, static_cast<size_t>(index),
static_cast<unsigned long long>(val)); // And on device
static_cast<unsigned long long>(val)); // And on device
}
};
@@ -101,8 +104,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") {
HIP_CHECK(hipStreamDestroy(stream));
auto hPtr = std::make_unique<unsigned long long[]>(hwThreads);
HIP_CHECK(
hipMemcpy(hPtr.get(), dPtr, sizeof(unsigned long long) * hwThreads, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(hPtr.get(), dPtr, sizeof(unsigned long long) * hwThreads,
hipMemcpyDeviceToHost));
HIP_CHECK(hipFree(dPtr));
@@ -113,7 +116,7 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") {
}
}
__global__ void doOperation(int* dPtr, size_t size, int val) {
__global__ void doOperation(int* dPtr, int val) {
auto i = threadIdx.x;
atomicAdd(dPtr + i, val);
}
@@ -135,14 +138,15 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
std::vector<hipStream_t> streamPool{};
streamPool.reserve(deviceCount * streamPerGPU);
std::map<hipStream_t, int*> streamToDeviceMemory; // Map of stream and device memory
std::map<hipStream_t, AtomicWrap<int>> streamToHostMemory; // Map of stream and host result
std::map<hipStream_t, size_t> streamToDeviceIndex; // Map of stream and device it was created on
// Map of stream and device memory
std::map<hipStream_t, int*> streamToDeviceMemory;
// Map of stream and host result
std::map<hipStream_t, AtomicWrap<int>> streamToHostMemory;
// Map of stream and device it was created on
std::map<hipStream_t, size_t> streamToDeviceIndex;
constexpr size_t size = 1024;
for (size_t i = 0; i < deviceCount; i++) {
for (int i = 0; i < deviceCount; i++) {
HIP_CHECK(hipSetDevice(i));
for (size_t j = 0; j < streamPerGPU; j++) {
@@ -155,8 +159,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
HIP_CHECK(hipMalloc(&dPtr, sizeof(int) * size));
REQUIRE(dPtr != nullptr);
HIP_CHECK(hipMemset(dPtr, 0, sizeof(int) * size));
streamToDeviceMemory[stream] = dPtr; // All streams work on exclusive memory
// All streams work on exclusive memory
streamToDeviceMemory[stream] = dPtr;
streamToHostMemory[stream] = AtomicWrap<int>(0); // CPU result
@@ -171,8 +175,10 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
std::random_device device;
std::mt19937 engine(device());
std::uniform_int_distribution<std::mt19937::result_type> genVal(-maxVal, maxVal);
std::uniform_int_distribution<std::mt19937::result_type> genStream(0, streamPool.size() - 1);
std::uniform_int_distribution<std::mt19937::result_type> genVal(-maxVal,
maxVal);
std::uniform_int_distribution<std::mt19937::result_type> genStream(0,
streamPool.size() - 1);
#if HT_NVIDIA
std::mutex ness; // On nvidia, current device needs to match stream's device
@@ -183,7 +189,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
#if HT_NVIDIA
std::unique_lock<std::mutex> lock(ness); // Lock on creation
#endif
hipStream_t stream = streamPool[genStream(engine)]; // Get a random stream
// Get a random stream
hipStream_t stream = streamPool[genStream(engine)];
// TODO use HIP_CHECK_THREAD when PR#2664 is merged
if (hipSuccess != hipSetDevice(streamToDeviceIndex[stream])) {
@@ -191,11 +198,10 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
}
int val = genVal(engine); // Generate Value to add/sub to
streamToHostMemory[stream].data.fetch_add(val); // Replicate result on CPU
// Replicate result on CPU
streamToHostMemory[stream].data.fetch_add(val);
auto dPtr = streamToDeviceMemory[stream];
doOperation<<<1, 1024, 0, stream>>>(dPtr, size,
val); // On GPU
doOperation<<<1, 1024, 0, stream>>>(dPtr, val); // On GPU
}
};
@@ -219,13 +225,14 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
for (auto& i : streamPool) {
HIP_CHECK(hipStreamSynchronize(i));
auto dResult = std::make_unique<int[]>(size);
HIP_CHECK(hipMemcpy(dResult.get(), streamToDeviceMemory[i], sizeof(int) * size,
hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(dResult.get(), streamToDeviceMemory[i],
sizeof(int) * size, hipMemcpyDeviceToHost));
HIP_CHECK(hipFree(streamToDeviceMemory[i]));
HIP_CHECK(hipStreamDestroy(i));
auto res = streamToHostMemory[i].data.load();
INFO("Matching CPU: " << res << " GPU: " << dResult[0] << " Dev Ptr: "
<< streamToDeviceMemory[i] << " on Device: " << streamToDeviceIndex[i]);
REQUIRE(std::all_of(dResult.get(), dResult.get() + size, [=](int r) { return r == res; }));
<< streamToDeviceMemory[i] << " on Device: " << streamToDeviceIndex[i]);
REQUIRE(std::all_of(dResult.get(), dResult.get() + size,
[=](int r) { return r == res; }));
}
}