diff --git a/projects/hip-tests/catch/stress/deviceallocation/Stress_deviceAllocationStress.cc b/projects/hip-tests/catch/stress/deviceallocation/Stress_deviceAllocationStress.cc index e60bf66bb4..5e6c51216e 100644 --- a/projects/hip-tests/catch/stress/deviceallocation/Stress_deviceAllocationStress.cc +++ b/projects/hip-tests/catch/stress/deviceallocation/Stress_deviceAllocationStress.cc @@ -349,7 +349,7 @@ static bool TestMemoryAllocationInLoop(int test_type, } if (!bPassed) break; } - hipFree(outputVec_d); + HIP_CHECK(hipFree(outputVec_d)); free(outputVec_h); return bPassed; } diff --git a/projects/hip-tests/catch/stress/memory/hipMallocManagedStress.cc b/projects/hip-tests/catch/stress/memory/hipMallocManagedStress.cc index 8a11ab35ef..87950a4f66 100644 --- a/projects/hip-tests/catch/stress/memory/hipMallocManagedStress.cc +++ b/projects/hip-tests/catch/stress/memory/hipMallocManagedStress.cc @@ -158,7 +158,7 @@ TEST_CASE("Stress_hipMallocManaged_MultiSize") { hipStream_t strm; HIP_CHECK(hipStreamCreate(&strm)); dim3 dimBlock(blockSize, 1, 1); - for (int i = 1; i < (1024*1024); ++i) { + for (int i = 1; i < (1024*100); ++i) { HIP_CHECK(hipMallocManaged(&Hmm1, i)); HIP_CHECK(hipMallocManaged(&Hmm2, i)); for (int j = 0; j < i; ++j) { diff --git a/projects/hip-tests/catch/stress/memory/hipMemPrftchAsyncStressTst.cc b/projects/hip-tests/catch/stress/memory/hipMemPrftchAsyncStressTst.cc index 0a159b8dbb..a551721b40 100644 --- a/projects/hip-tests/catch/stress/memory/hipMemPrftchAsyncStressTst.cc +++ b/projects/hip-tests/catch/stress/memory/hipMemPrftchAsyncStressTst.cc @@ -57,9 +57,8 @@ static int HmmAttrPrint() { return managed; } -static void ReleaseResource(int *Hmm, int *Hmm1, hipStream_t *strm) { +static void ReleaseResource(int *Hmm, hipStream_t *strm) { HIP_CHECK(hipFree(Hmm)); - HIP_CHECK(hipFree(Hmm1)); HIP_CHECK(hipStreamDestroy(*strm)); } @@ -70,11 +69,10 @@ static void ReleaseResource(int *Hmm, int *Hmm1, hipStream_t *strm) { TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") { int MangdMem = HmmAttrPrint(); if (MangdMem == 1) { - int *Hmm = nullptr, *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4); + int *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4); int InitVal = 123, NumElms = MemSz/4; bool IfTestPassed = true; HIP_CHECK(hipGetDeviceCount(&NumDevs)); - HIP_CHECK(hipMallocManaged(&Hmm, MemSz)); HIP_CHECK(hipMallocManaged(&Hmm1, MemSz)); for (int i = 0; i < NumElms; ++i) { Hmm1[i] = InitVal; @@ -93,44 +91,40 @@ TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") { // Prefetching memory from i to j HIP_CHECK(hipMemPrefetchAsync(Hmm1, MemSz, j, strm)); HIP_CHECK(hipStreamSynchronize(strm)); - MemPrftchAsyncKernel<<<(NumElms/32), 32, 0, strm>>>(Hmm, Hmm1, NumElms); + MemPrftchAsyncKernel1<<<(NumElms/32), 32, 0, strm>>>(Hmm1, NumElms); HIP_CHECK(hipStreamSynchronize(strm)); // Verifying the result for (int m = 0; m < NumElms; ++m) { - if (Hmm[m] != (InitVal * InitVal)) { + if (Hmm1[m] != (InitVal * InitVal)) { IfTestPassed = false; } } if (!IfTestPassed) { - ReleaseResource(Hmm, Hmm1, &strm); + ReleaseResource(Hmm1, &strm); INFO("Did not find expected value!"); REQUIRE(false); } - // Resetting the values in Hmm - HIP_CHECK(hipMemset(Hmm, 0, MemSz)); // Prefetching memory from j to i HIP_CHECK(hipMemPrefetchAsync(Hmm1, MemSz, i, strm)); HIP_CHECK(hipStreamSynchronize(strm)); - MemPrftchAsyncKernel<<<(NumElms/32), 32, 0, strm>>>(Hmm, Hmm1, NumElms); + MemPrftchAsyncKernel1<<<(NumElms/32), 32, 0, strm>>>(Hmm1, NumElms); HIP_CHECK(hipStreamSynchronize(strm)); // Verifying the result for (int m = 0; m < NumElms; ++m) { - if (Hmm[m] != (InitVal * InitVal)) { + if (Hmm1[m] != (InitVal * InitVal)) { IfTestPassed = false; } } if (!IfTestPassed) { - ReleaseResource(Hmm, Hmm1, &strm); + ReleaseResource(Hmm1, &strm); INFO("Did not find expected value!"); REQUIRE(false); } - // Resetting the values in Hmm - HIP_CHECK(hipMemset(Hmm, 0, MemSz)); + HIP_CHECK(hipStreamDestroy(strm)); } } // Releasing the resources in case all the scenarios passed - HIP_CHECK(hipFree(Hmm)); HIP_CHECK(hipFree(Hmm1)); } else { SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory " diff --git a/projects/hip-tests/catch/stress/memory/hipMemcpyMThreadMSize.cc b/projects/hip-tests/catch/stress/memory/hipMemcpyMThreadMSize.cc index 3b0e8b7ee1..301f3225ec 100644 --- a/projects/hip-tests/catch/stress/memory/hipMemcpyMThreadMSize.cc +++ b/projects/hip-tests/catch/stress/memory/hipMemcpyMThreadMSize.cc @@ -72,7 +72,7 @@ void Memcpy_And_verify(int NUM_ELM) { for (int i = 0; i < Available_Gpus; ++i) { for (int j = i+1; j < Available_Gpus; ++j) { canAccessPeer = 0; - hipDeviceCanAccessPeer(&canAccessPeer, i, j); + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j)); if (canAccessPeer) { HIP_CHECK(hipMemcpy(A_d[j], A_d[i], NUM_ELM * sizeof(TestType), hipMemcpyDefault)); @@ -122,7 +122,7 @@ void Memcpy_And_verify(int NUM_ELM) { int canAccessPeer = 0; for (int i = 0; i < Available_Gpus; ++i) { for (int j = i+1; j < Available_Gpus; ++j) { - hipDeviceCanAccessPeer(&canAccessPeer, i, j); + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j)); if (canAccessPeer) { HIP_CHECK(hipMemcpyHtoD(hipDeviceptr_t(A_d[i]), A_h, NUM_ELM * sizeof(TestType))); @@ -165,7 +165,7 @@ void Memcpy_And_verify(int NUM_ELM) { for (int i = 0; i < Available_Gpus; ++i) { for (int j = i+1; j < Available_Gpus; ++j) { canAccessPeer = 0; - hipDeviceCanAccessPeer(&canAccessPeer, i, j); + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j)); if (canAccessPeer) { HIP_CHECK(hipMemcpyAsync(A_d[j], A_d[i], NUM_ELM * sizeof(TestType), @@ -219,7 +219,7 @@ void Memcpy_And_verify(int NUM_ELM) { for (int i = 0; i < Available_Gpus; ++i) { for (int j = i+1; j < Available_Gpus; ++j) { canAccessPeer = 0; - hipDeviceCanAccessPeer(&canAccessPeer, i, j); + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j)); if (canAccessPeer) { HIP_CHECK(hipSetDevice(j)); HIP_CHECK(hipMemcpyDtoDAsync(hipDeviceptr_t(A_d[j]), diff --git a/projects/hip-tests/catch/stress/memory/memcpy.cc b/projects/hip-tests/catch/stress/memory/memcpy.cc index 21957e0fdb..7ba95b4e77 100644 --- a/projects/hip-tests/catch/stress/memory/memcpy.cc +++ b/projects/hip-tests/catch/stress/memory/memcpy.cc @@ -5,25 +5,25 @@ TEST_CASE("Stress_hipMalloc", "DifferentSizes") { SECTION("Size 10") { auto res = hipMalloc(&d_a, sizeof(10)); REQUIRE(res == hipSuccess); - hipFree(d_a); + HIP_CHECK(hipFree(d_a)); d_a = nullptr; } SECTION("Size 100") { auto res = hipMalloc(&d_a, sizeof(100)); REQUIRE(res == hipSuccess); - hipFree(d_a); + HIP_CHECK(hipFree(d_a)); d_a = nullptr; } SECTION("Size 1000") { auto res = hipMalloc(&d_a, sizeof(1000)); REQUIRE(res == hipSuccess); - hipFree(d_a); + HIP_CHECK(hipFree(d_a)); d_a = nullptr; } SECTION("Size 10000") { auto res = hipMalloc(&d_a, sizeof(10000)); REQUIRE(res == hipSuccess); - hipFree(d_a); + HIP_CHECK(hipFree(d_a)); d_a = nullptr; } SECTION("Size MAX") { @@ -31,4 +31,4 @@ TEST_CASE("Stress_hipMalloc", "DifferentSizes") { REQUIRE(res == hipErrorOutOfMemory); d_a = nullptr; } -} \ No newline at end of file +} diff --git a/projects/hip-tests/catch/stress/printf/Stress_printf_ComplexKernels.cc b/projects/hip-tests/catch/stress/printf/Stress_printf_ComplexKernels.cc index 46bb6f74b1..50ee409475 100644 --- a/projects/hip-tests/catch/stress/printf/Stress_printf_ComplexKernels.cc +++ b/projects/hip-tests/catch/stress/printf/Stress_printf_ComplexKernels.cc @@ -494,7 +494,7 @@ TEST_CASE("Stress_printf_ComplexKernelMultStreamMultGpu") { unsigned int print_limit = 4; // = 4 GB uint32_t iterCount = 1; int numOfGPUs = 0; - hipGetDeviceCount(&numOfGPUs); + HIP_CHECK(hipGetDeviceCount(&numOfGPUs)); if (numOfGPUs < 2) { printf("Skipping test because numOfGPUs < 2\n"); return; diff --git a/projects/hip-tests/catch/stress/stream/streamEnqueue.cc b/projects/hip-tests/catch/stress/stream/streamEnqueue.cc index c44ff96afc..9bbb641930 100644 --- a/projects/hip-tests/catch/stress/stream/streamEnqueue.cc +++ b/projects/hip-tests/catch/stress/stream/streamEnqueue.cc @@ -26,11 +26,13 @@ THE SOFTWARE. #include #include -__global__ void addVal(unsigned long long* ptr, size_t index, unsigned long long val) { +__global__ void addVal(unsigned long long* ptr, size_t index, + unsigned long long val) { atomicAdd(ptr + index, val); } -// Create a copy constructible AtomicWrap around std::atomic so that we can put it in a vector +// Create a copy constructible AtomicWrap around std::atomic so that +// we can put it in a vector template struct AtomicWrap { std::atomic data; @@ -68,18 +70,19 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") { constexpr size_t maxWork = 10000; constexpr size_t maxVal = 10; - std::uniform_int_distribution genIndex(0, hwThreads - 1); + std::uniform_int_distribution genIndex(0, + hwThreads - 1); std::uniform_int_distribution genWork(0, maxWork); std::uniform_int_distribution genVal(0, maxVal); auto enqueueKernelThread = [&](hipStream_t stream) { auto iter = genWork(engine); // Generate work to be done via thread - for (auto i = 0; i < iter; i++) { + for (unsigned long i = 0; i < iter; i++) { auto index = genIndex(engine); // Generate Index to add to - auto val = genVal(engine); // Generate value to add to the destination + auto val = genVal(engine); // Generate value to add to the destination hostData[index].data += val; // Replicate it on host addVal<<<1, 1, 0, stream>>>(dPtr, static_cast(index), - static_cast(val)); // And on device + static_cast(val)); // And on device } }; @@ -101,8 +104,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") { HIP_CHECK(hipStreamDestroy(stream)); auto hPtr = std::make_unique(hwThreads); - HIP_CHECK( - hipMemcpy(hPtr.get(), dPtr, sizeof(unsigned long long) * hwThreads, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(hPtr.get(), dPtr, sizeof(unsigned long long) * hwThreads, + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(dPtr)); @@ -113,7 +116,7 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") { } } -__global__ void doOperation(int* dPtr, size_t size, int val) { +__global__ void doOperation(int* dPtr, int val) { auto i = threadIdx.x; atomicAdd(dPtr + i, val); } @@ -135,14 +138,15 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") { std::vector streamPool{}; streamPool.reserve(deviceCount * streamPerGPU); - - std::map streamToDeviceMemory; // Map of stream and device memory - std::map> streamToHostMemory; // Map of stream and host result - std::map streamToDeviceIndex; // Map of stream and device it was created on - + // Map of stream and device memory + std::map streamToDeviceMemory; + // Map of stream and host result + std::map> streamToHostMemory; + // Map of stream and device it was created on + std::map streamToDeviceIndex; constexpr size_t size = 1024; - for (size_t i = 0; i < deviceCount; i++) { + for (int i = 0; i < deviceCount; i++) { HIP_CHECK(hipSetDevice(i)); for (size_t j = 0; j < streamPerGPU; j++) { @@ -155,8 +159,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") { HIP_CHECK(hipMalloc(&dPtr, sizeof(int) * size)); REQUIRE(dPtr != nullptr); HIP_CHECK(hipMemset(dPtr, 0, sizeof(int) * size)); - - streamToDeviceMemory[stream] = dPtr; // All streams work on exclusive memory + // All streams work on exclusive memory + streamToDeviceMemory[stream] = dPtr; streamToHostMemory[stream] = AtomicWrap(0); // CPU result @@ -171,8 +175,10 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") { std::random_device device; std::mt19937 engine(device()); - std::uniform_int_distribution genVal(-maxVal, maxVal); - std::uniform_int_distribution genStream(0, streamPool.size() - 1); + std::uniform_int_distribution genVal(-maxVal, + maxVal); + std::uniform_int_distribution genStream(0, + streamPool.size() - 1); #if HT_NVIDIA std::mutex ness; // On nvidia, current device needs to match stream's device @@ -183,7 +189,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") { #if HT_NVIDIA std::unique_lock lock(ness); // Lock on creation #endif - hipStream_t stream = streamPool[genStream(engine)]; // Get a random stream + // Get a random stream + hipStream_t stream = streamPool[genStream(engine)]; // TODO use HIP_CHECK_THREAD when PR#2664 is merged if (hipSuccess != hipSetDevice(streamToDeviceIndex[stream])) { @@ -191,11 +198,10 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") { } int val = genVal(engine); // Generate Value to add/sub to - - streamToHostMemory[stream].data.fetch_add(val); // Replicate result on CPU + // Replicate result on CPU + streamToHostMemory[stream].data.fetch_add(val); auto dPtr = streamToDeviceMemory[stream]; - doOperation<<<1, 1024, 0, stream>>>(dPtr, size, - val); // On GPU + doOperation<<<1, 1024, 0, stream>>>(dPtr, val); // On GPU } }; @@ -219,13 +225,14 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") { for (auto& i : streamPool) { HIP_CHECK(hipStreamSynchronize(i)); auto dResult = std::make_unique(size); - HIP_CHECK(hipMemcpy(dResult.get(), streamToDeviceMemory[i], sizeof(int) * size, - hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(dResult.get(), streamToDeviceMemory[i], + sizeof(int) * size, hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(streamToDeviceMemory[i])); HIP_CHECK(hipStreamDestroy(i)); auto res = streamToHostMemory[i].data.load(); INFO("Matching CPU: " << res << " GPU: " << dResult[0] << " Dev Ptr: " - << streamToDeviceMemory[i] << " on Device: " << streamToDeviceIndex[i]); - REQUIRE(std::all_of(dResult.get(), dResult.get() + size, [=](int r) { return r == res; })); + << streamToDeviceMemory[i] << " on Device: " << streamToDeviceIndex[i]); + REQUIRE(std::all_of(dResult.get(), dResult.get() + size, + [=](int r) { return r == res; })); } }