SWDEV-361383 - Fixing compilation issue for stress test files under catch2 framework (#2998)
Change-Id: I4253a6cdb2c10bcb0021f869b7b3c4f1acd28787
[ROCm/hip-tests commit: 339a51dcaf]
This commit is contained in:
committato da
GitHub
parent
4ea956e684
commit
ee95d39a40
@@ -349,7 +349,7 @@ static bool TestMemoryAllocationInLoop(int test_type,
|
||||
}
|
||||
if (!bPassed) break;
|
||||
}
|
||||
hipFree(outputVec_d);
|
||||
HIP_CHECK(hipFree(outputVec_d));
|
||||
free(outputVec_h);
|
||||
return bPassed;
|
||||
}
|
||||
|
||||
@@ -158,7 +158,7 @@ TEST_CASE("Stress_hipMallocManaged_MultiSize") {
|
||||
hipStream_t strm;
|
||||
HIP_CHECK(hipStreamCreate(&strm));
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
for (int i = 1; i < (1024*1024); ++i) {
|
||||
for (int i = 1; i < (1024*100); ++i) {
|
||||
HIP_CHECK(hipMallocManaged(&Hmm1, i));
|
||||
HIP_CHECK(hipMallocManaged(&Hmm2, i));
|
||||
for (int j = 0; j < i; ++j) {
|
||||
|
||||
@@ -57,9 +57,8 @@ static int HmmAttrPrint() {
|
||||
return managed;
|
||||
}
|
||||
|
||||
static void ReleaseResource(int *Hmm, int *Hmm1, hipStream_t *strm) {
|
||||
static void ReleaseResource(int *Hmm, hipStream_t *strm) {
|
||||
HIP_CHECK(hipFree(Hmm));
|
||||
HIP_CHECK(hipFree(Hmm1));
|
||||
HIP_CHECK(hipStreamDestroy(*strm));
|
||||
}
|
||||
|
||||
@@ -70,11 +69,10 @@ static void ReleaseResource(int *Hmm, int *Hmm1, hipStream_t *strm) {
|
||||
TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") {
|
||||
int MangdMem = HmmAttrPrint();
|
||||
if (MangdMem == 1) {
|
||||
int *Hmm = nullptr, *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4);
|
||||
int *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4);
|
||||
int InitVal = 123, NumElms = MemSz/4;
|
||||
bool IfTestPassed = true;
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevs));
|
||||
HIP_CHECK(hipMallocManaged(&Hmm, MemSz));
|
||||
HIP_CHECK(hipMallocManaged(&Hmm1, MemSz));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
Hmm1[i] = InitVal;
|
||||
@@ -93,44 +91,40 @@ TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") {
|
||||
// Prefetching memory from i to j
|
||||
HIP_CHECK(hipMemPrefetchAsync(Hmm1, MemSz, j, strm));
|
||||
HIP_CHECK(hipStreamSynchronize(strm));
|
||||
MemPrftchAsyncKernel<<<(NumElms/32), 32, 0, strm>>>(Hmm, Hmm1, NumElms);
|
||||
MemPrftchAsyncKernel1<<<(NumElms/32), 32, 0, strm>>>(Hmm1, NumElms);
|
||||
HIP_CHECK(hipStreamSynchronize(strm));
|
||||
// Verifying the result
|
||||
for (int m = 0; m < NumElms; ++m) {
|
||||
if (Hmm[m] != (InitVal * InitVal)) {
|
||||
if (Hmm1[m] != (InitVal * InitVal)) {
|
||||
IfTestPassed = false;
|
||||
}
|
||||
}
|
||||
if (!IfTestPassed) {
|
||||
ReleaseResource(Hmm, Hmm1, &strm);
|
||||
ReleaseResource(Hmm1, &strm);
|
||||
INFO("Did not find expected value!");
|
||||
REQUIRE(false);
|
||||
}
|
||||
// Resetting the values in Hmm
|
||||
HIP_CHECK(hipMemset(Hmm, 0, MemSz));
|
||||
// Prefetching memory from j to i
|
||||
HIP_CHECK(hipMemPrefetchAsync(Hmm1, MemSz, i, strm));
|
||||
HIP_CHECK(hipStreamSynchronize(strm));
|
||||
MemPrftchAsyncKernel<<<(NumElms/32), 32, 0, strm>>>(Hmm, Hmm1, NumElms);
|
||||
MemPrftchAsyncKernel1<<<(NumElms/32), 32, 0, strm>>>(Hmm1, NumElms);
|
||||
HIP_CHECK(hipStreamSynchronize(strm));
|
||||
// Verifying the result
|
||||
for (int m = 0; m < NumElms; ++m) {
|
||||
if (Hmm[m] != (InitVal * InitVal)) {
|
||||
if (Hmm1[m] != (InitVal * InitVal)) {
|
||||
IfTestPassed = false;
|
||||
}
|
||||
}
|
||||
if (!IfTestPassed) {
|
||||
ReleaseResource(Hmm, Hmm1, &strm);
|
||||
ReleaseResource(Hmm1, &strm);
|
||||
INFO("Did not find expected value!");
|
||||
REQUIRE(false);
|
||||
}
|
||||
// Resetting the values in Hmm
|
||||
HIP_CHECK(hipMemset(Hmm, 0, MemSz));
|
||||
|
||||
HIP_CHECK(hipStreamDestroy(strm));
|
||||
}
|
||||
}
|
||||
// Releasing the resources in case all the scenarios passed
|
||||
HIP_CHECK(hipFree(Hmm));
|
||||
HIP_CHECK(hipFree(Hmm1));
|
||||
} else {
|
||||
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
|
||||
|
||||
@@ -72,7 +72,7 @@ void Memcpy_And_verify(int NUM_ELM) {
|
||||
for (int i = 0; i < Available_Gpus; ++i) {
|
||||
for (int j = i+1; j < Available_Gpus; ++j) {
|
||||
canAccessPeer = 0;
|
||||
hipDeviceCanAccessPeer(&canAccessPeer, i, j);
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
|
||||
if (canAccessPeer) {
|
||||
HIP_CHECK(hipMemcpy(A_d[j], A_d[i], NUM_ELM * sizeof(TestType),
|
||||
hipMemcpyDefault));
|
||||
@@ -122,7 +122,7 @@ void Memcpy_And_verify(int NUM_ELM) {
|
||||
int canAccessPeer = 0;
|
||||
for (int i = 0; i < Available_Gpus; ++i) {
|
||||
for (int j = i+1; j < Available_Gpus; ++j) {
|
||||
hipDeviceCanAccessPeer(&canAccessPeer, i, j);
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
|
||||
if (canAccessPeer) {
|
||||
HIP_CHECK(hipMemcpyHtoD(hipDeviceptr_t(A_d[i]),
|
||||
A_h, NUM_ELM * sizeof(TestType)));
|
||||
@@ -165,7 +165,7 @@ void Memcpy_And_verify(int NUM_ELM) {
|
||||
for (int i = 0; i < Available_Gpus; ++i) {
|
||||
for (int j = i+1; j < Available_Gpus; ++j) {
|
||||
canAccessPeer = 0;
|
||||
hipDeviceCanAccessPeer(&canAccessPeer, i, j);
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
|
||||
if (canAccessPeer) {
|
||||
HIP_CHECK(hipMemcpyAsync(A_d[j], A_d[i],
|
||||
NUM_ELM * sizeof(TestType),
|
||||
@@ -219,7 +219,7 @@ void Memcpy_And_verify(int NUM_ELM) {
|
||||
for (int i = 0; i < Available_Gpus; ++i) {
|
||||
for (int j = i+1; j < Available_Gpus; ++j) {
|
||||
canAccessPeer = 0;
|
||||
hipDeviceCanAccessPeer(&canAccessPeer, i, j);
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
|
||||
if (canAccessPeer) {
|
||||
HIP_CHECK(hipSetDevice(j));
|
||||
HIP_CHECK(hipMemcpyDtoDAsync(hipDeviceptr_t(A_d[j]),
|
||||
|
||||
@@ -5,25 +5,25 @@ TEST_CASE("Stress_hipMalloc", "DifferentSizes") {
|
||||
SECTION("Size 10") {
|
||||
auto res = hipMalloc(&d_a, sizeof(10));
|
||||
REQUIRE(res == hipSuccess);
|
||||
hipFree(d_a);
|
||||
HIP_CHECK(hipFree(d_a));
|
||||
d_a = nullptr;
|
||||
}
|
||||
SECTION("Size 100") {
|
||||
auto res = hipMalloc(&d_a, sizeof(100));
|
||||
REQUIRE(res == hipSuccess);
|
||||
hipFree(d_a);
|
||||
HIP_CHECK(hipFree(d_a));
|
||||
d_a = nullptr;
|
||||
}
|
||||
SECTION("Size 1000") {
|
||||
auto res = hipMalloc(&d_a, sizeof(1000));
|
||||
REQUIRE(res == hipSuccess);
|
||||
hipFree(d_a);
|
||||
HIP_CHECK(hipFree(d_a));
|
||||
d_a = nullptr;
|
||||
}
|
||||
SECTION("Size 10000") {
|
||||
auto res = hipMalloc(&d_a, sizeof(10000));
|
||||
REQUIRE(res == hipSuccess);
|
||||
hipFree(d_a);
|
||||
HIP_CHECK(hipFree(d_a));
|
||||
d_a = nullptr;
|
||||
}
|
||||
SECTION("Size MAX") {
|
||||
@@ -31,4 +31,4 @@ TEST_CASE("Stress_hipMalloc", "DifferentSizes") {
|
||||
REQUIRE(res == hipErrorOutOfMemory);
|
||||
d_a = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -494,7 +494,7 @@ TEST_CASE("Stress_printf_ComplexKernelMultStreamMultGpu") {
|
||||
unsigned int print_limit = 4; // = 4 GB
|
||||
uint32_t iterCount = 1;
|
||||
int numOfGPUs = 0;
|
||||
hipGetDeviceCount(&numOfGPUs);
|
||||
HIP_CHECK(hipGetDeviceCount(&numOfGPUs));
|
||||
if (numOfGPUs < 2) {
|
||||
printf("Skipping test because numOfGPUs < 2\n");
|
||||
return;
|
||||
|
||||
@@ -26,11 +26,13 @@ THE SOFTWARE.
|
||||
#include <random>
|
||||
#include <thread>
|
||||
|
||||
__global__ void addVal(unsigned long long* ptr, size_t index, unsigned long long val) {
|
||||
__global__ void addVal(unsigned long long* ptr, size_t index,
|
||||
unsigned long long val) {
|
||||
atomicAdd(ptr + index, val);
|
||||
}
|
||||
|
||||
// Create a copy constructible AtomicWrap around std::atomic so that we can put it in a vector
|
||||
// Create a copy constructible AtomicWrap around std::atomic so that
|
||||
// we can put it in a vector
|
||||
template <typename T> struct AtomicWrap {
|
||||
std::atomic<T> data;
|
||||
|
||||
@@ -68,18 +70,19 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") {
|
||||
constexpr size_t maxWork = 10000;
|
||||
constexpr size_t maxVal = 10;
|
||||
|
||||
std::uniform_int_distribution<std::mt19937::result_type> genIndex(0, hwThreads - 1);
|
||||
std::uniform_int_distribution<std::mt19937::result_type> genIndex(0,
|
||||
hwThreads - 1);
|
||||
std::uniform_int_distribution<std::mt19937::result_type> genWork(0, maxWork);
|
||||
std::uniform_int_distribution<std::mt19937::result_type> genVal(0, maxVal);
|
||||
|
||||
auto enqueueKernelThread = [&](hipStream_t stream) {
|
||||
auto iter = genWork(engine); // Generate work to be done via thread
|
||||
for (auto i = 0; i < iter; i++) {
|
||||
for (unsigned long i = 0; i < iter; i++) {
|
||||
auto index = genIndex(engine); // Generate Index to add to
|
||||
auto val = genVal(engine); // Generate value to add to the destination
|
||||
auto val = genVal(engine); // Generate value to add to the destination
|
||||
hostData[index].data += val; // Replicate it on host
|
||||
addVal<<<1, 1, 0, stream>>>(dPtr, static_cast<size_t>(index),
|
||||
static_cast<unsigned long long>(val)); // And on device
|
||||
static_cast<unsigned long long>(val)); // And on device
|
||||
}
|
||||
};
|
||||
|
||||
@@ -101,8 +104,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") {
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
|
||||
auto hPtr = std::make_unique<unsigned long long[]>(hwThreads);
|
||||
HIP_CHECK(
|
||||
hipMemcpy(hPtr.get(), dPtr, sizeof(unsigned long long) * hwThreads, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(hPtr.get(), dPtr, sizeof(unsigned long long) * hwThreads,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
HIP_CHECK(hipFree(dPtr));
|
||||
|
||||
@@ -113,7 +116,7 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void doOperation(int* dPtr, size_t size, int val) {
|
||||
__global__ void doOperation(int* dPtr, int val) {
|
||||
auto i = threadIdx.x;
|
||||
atomicAdd(dPtr + i, val);
|
||||
}
|
||||
@@ -135,14 +138,15 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
|
||||
|
||||
std::vector<hipStream_t> streamPool{};
|
||||
streamPool.reserve(deviceCount * streamPerGPU);
|
||||
|
||||
std::map<hipStream_t, int*> streamToDeviceMemory; // Map of stream and device memory
|
||||
std::map<hipStream_t, AtomicWrap<int>> streamToHostMemory; // Map of stream and host result
|
||||
std::map<hipStream_t, size_t> streamToDeviceIndex; // Map of stream and device it was created on
|
||||
|
||||
// Map of stream and device memory
|
||||
std::map<hipStream_t, int*> streamToDeviceMemory;
|
||||
// Map of stream and host result
|
||||
std::map<hipStream_t, AtomicWrap<int>> streamToHostMemory;
|
||||
// Map of stream and device it was created on
|
||||
std::map<hipStream_t, size_t> streamToDeviceIndex;
|
||||
constexpr size_t size = 1024;
|
||||
|
||||
for (size_t i = 0; i < deviceCount; i++) {
|
||||
for (int i = 0; i < deviceCount; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
|
||||
for (size_t j = 0; j < streamPerGPU; j++) {
|
||||
@@ -155,8 +159,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
|
||||
HIP_CHECK(hipMalloc(&dPtr, sizeof(int) * size));
|
||||
REQUIRE(dPtr != nullptr);
|
||||
HIP_CHECK(hipMemset(dPtr, 0, sizeof(int) * size));
|
||||
|
||||
streamToDeviceMemory[stream] = dPtr; // All streams work on exclusive memory
|
||||
// All streams work on exclusive memory
|
||||
streamToDeviceMemory[stream] = dPtr;
|
||||
|
||||
streamToHostMemory[stream] = AtomicWrap<int>(0); // CPU result
|
||||
|
||||
@@ -171,8 +175,10 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
|
||||
std::random_device device;
|
||||
std::mt19937 engine(device());
|
||||
|
||||
std::uniform_int_distribution<std::mt19937::result_type> genVal(-maxVal, maxVal);
|
||||
std::uniform_int_distribution<std::mt19937::result_type> genStream(0, streamPool.size() - 1);
|
||||
std::uniform_int_distribution<std::mt19937::result_type> genVal(-maxVal,
|
||||
maxVal);
|
||||
std::uniform_int_distribution<std::mt19937::result_type> genStream(0,
|
||||
streamPool.size() - 1);
|
||||
|
||||
#if HT_NVIDIA
|
||||
std::mutex ness; // On nvidia, current device needs to match stream's device
|
||||
@@ -183,7 +189,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
|
||||
#if HT_NVIDIA
|
||||
std::unique_lock<std::mutex> lock(ness); // Lock on creation
|
||||
#endif
|
||||
hipStream_t stream = streamPool[genStream(engine)]; // Get a random stream
|
||||
// Get a random stream
|
||||
hipStream_t stream = streamPool[genStream(engine)];
|
||||
|
||||
// TODO use HIP_CHECK_THREAD when PR#2664 is merged
|
||||
if (hipSuccess != hipSetDevice(streamToDeviceIndex[stream])) {
|
||||
@@ -191,11 +198,10 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
|
||||
}
|
||||
|
||||
int val = genVal(engine); // Generate Value to add/sub to
|
||||
|
||||
streamToHostMemory[stream].data.fetch_add(val); // Replicate result on CPU
|
||||
// Replicate result on CPU
|
||||
streamToHostMemory[stream].data.fetch_add(val);
|
||||
auto dPtr = streamToDeviceMemory[stream];
|
||||
doOperation<<<1, 1024, 0, stream>>>(dPtr, size,
|
||||
val); // On GPU
|
||||
doOperation<<<1, 1024, 0, stream>>>(dPtr, val); // On GPU
|
||||
}
|
||||
};
|
||||
|
||||
@@ -219,13 +225,14 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
|
||||
for (auto& i : streamPool) {
|
||||
HIP_CHECK(hipStreamSynchronize(i));
|
||||
auto dResult = std::make_unique<int[]>(size);
|
||||
HIP_CHECK(hipMemcpy(dResult.get(), streamToDeviceMemory[i], sizeof(int) * size,
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(dResult.get(), streamToDeviceMemory[i],
|
||||
sizeof(int) * size, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipFree(streamToDeviceMemory[i]));
|
||||
HIP_CHECK(hipStreamDestroy(i));
|
||||
auto res = streamToHostMemory[i].data.load();
|
||||
INFO("Matching CPU: " << res << " GPU: " << dResult[0] << " Dev Ptr: "
|
||||
<< streamToDeviceMemory[i] << " on Device: " << streamToDeviceIndex[i]);
|
||||
REQUIRE(std::all_of(dResult.get(), dResult.get() + size, [=](int r) { return r == res; }));
|
||||
<< streamToDeviceMemory[i] << " on Device: " << streamToDeviceIndex[i]);
|
||||
REQUIRE(std::all_of(dResult.get(), dResult.get() + size,
|
||||
[=](int r) { return r == res; }));
|
||||
}
|
||||
}
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user