From bc851a78ac0dff1acff0ea5963f60ebc44299459 Mon Sep 17 00:00:00 2001 From: sumanthtg <90063301+sumanthtg@users.noreply.github.com> Date: Tue, 14 Sep 2021 13:38:07 +0530 Subject: [PATCH] SWDEV-292643 - [dtest] Catch2 unit tests for stream management apis. (#2343) Catch2 tests for : hipStreamAddCallback, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipExtStreamCreateWithCUMask Change-Id: Ia99c06b1e97fc945f1a740e47710f4dcd70f38cd [ROCm/hip commit: 2a17c5662df00bf028b6f9e4b3c58273d4ecdf12] --- .../tests/catch/unit/stream/CMakeLists.txt | 18 + .../unit/stream/hipStreamACb_MultiThread.cc | 168 +++++ .../catch/unit/stream/hipStreamAddCallback.cc | 231 +++++++ .../unit/stream/hipStreamCreateWithFlags.cc | 39 ++ .../stream/hipStreamCreateWithPriority.cc | 605 ++++++++++++++++++ .../catch/unit/stream/hipStreamWithCUMask.cc | 350 ++++++++++ 6 files changed, 1411 insertions(+) create mode 100644 projects/hip/tests/catch/unit/stream/hipStreamACb_MultiThread.cc create mode 100644 projects/hip/tests/catch/unit/stream/hipStreamAddCallback.cc create mode 100644 projects/hip/tests/catch/unit/stream/hipStreamCreateWithFlags.cc create mode 100644 projects/hip/tests/catch/unit/stream/hipStreamCreateWithPriority.cc create mode 100644 projects/hip/tests/catch/unit/stream/hipStreamWithCUMask.cc diff --git a/projects/hip/tests/catch/unit/stream/CMakeLists.txt b/projects/hip/tests/catch/unit/stream/CMakeLists.txt index 4b8e57a112..14c05e7f74 100644 --- a/projects/hip/tests/catch/unit/stream/CMakeLists.txt +++ b/projects/hip/tests/catch/unit/stream/CMakeLists.txt @@ -1,9 +1,27 @@ +if(HIP_PLATFORM MATCHES "amd") set(TEST_SRC hipStreamCreate.cc hipStreamGetFlags.cc hipStreamGetPriority.cc hipMultiStream.cc + hipStreamACb_MultiThread.cc + hipStreamAddCallback.cc + hipStreamCreateWithFlags.cc + hipStreamCreateWithPriority.cc + hipStreamWithCUMask.cc ) +else() +set(TEST_SRC + hipStreamCreate.cc + hipStreamGetFlags.cc + hipStreamGetPriority.cc + hipMultiStream.cc + hipStreamACb_MultiThread.cc + hipStreamAddCallback.cc + hipStreamCreateWithFlags.cc + hipStreamCreateWithPriority.cc +) +endif() # Create shared lib of all tests add_library(StreamTest SHARED EXCLUDE_FROM_ALL ${TEST_SRC}) diff --git a/projects/hip/tests/catch/unit/stream/hipStreamACb_MultiThread.cc b/projects/hip/tests/catch/unit/stream/hipStreamACb_MultiThread.cc new file mode 100644 index 0000000000..07b999d72a --- /dev/null +++ b/projects/hip/tests/catch/unit/stream/hipStreamACb_MultiThread.cc @@ -0,0 +1,168 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** +Testcase Scenario : +Validate behaviour of HIP when multiple hipStreaAddCallback() are called over +multiple Threads. +*/ + +#include +#include + +static constexpr size_t N = 4096; +static constexpr int numThreads = 1000; +static std::atomic Cb_count{0}, Data_mismatch{0}; +static hipStream_t mystream; +static float *A1_h, *C1_h; + +#if HT_AMD +#define HIPRT_CB +#endif + +static __global__ void device_function(float* C_d, float* A_d, size_t Num) { + size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = gputhread; i < Num; i += stride) { + C_d[i] = A_d[i] * A_d[i]; + } + + // Delay thread 1 only in the GPU + if (gputhread == 1) { + uint64_t wait_t = 3200000000, start = clock64(), cur; + do { + cur = clock64() - start; + } while (cur < wait_t); + } +} + + +static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status, + void* userData) { + HIPASSERT(stream == mystream); + HIPASSERT(userData == nullptr); + HIPCHECK(status); + + for (size_t i = 0; i < N; i++) { + // Validate the data and update Data_mismatch + if (C1_h[i] != A1_h[i] * A1_h[i]) { + Data_mismatch++; + } + } + + // Increment the Cb_count to indicate that the callback is processed. + ++Cb_count; +} + +static void HIPRT_CB Thread2_Callback(hipStream_t stream, hipError_t status, + void* userData) { + HIPASSERT(stream == mystream); + HIPASSERT(userData == nullptr); + HIPCHECK(status); + + for (size_t i = 0; i < N; i++) { + // Validate the data and update Data_mismatch + if (C1_h[i] != A1_h[i] * A1_h[i]) { + Data_mismatch++; + } + } + + // Increment the Cb_count to indicate that the callback is processed. + ++Cb_count; +} + +void Thread1_func() { + HIPCHECK(hipStreamAddCallback(mystream, Thread1_Callback, nullptr, 0)); +} + +void Thread2_func() { + HIPCHECK(hipStreamAddCallback(mystream, Thread2_Callback, nullptr, 0)); +} + +/** + Test multiple hipStreamAddCallback() called over + multiple Threads. + */ +TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") { + float *A_d, *C_d; + size_t Nbytes = (N) * sizeof(float); + constexpr float Phi = 1.618f; + + A1_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A1_h != nullptr); + C1_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(C1_h != nullptr); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A1_h[i] = Phi + i; + } + + HIP_CHECK(hipMalloc(&A_d, Nbytes)); + HIP_CHECK(hipMalloc(&C_d, Nbytes)); + + HIP_CHECK( + hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking)); + + HIP_CHECK( + hipMemcpyAsync(A_d, A1_h, Nbytes, hipMemcpyHostToDevice, + mystream)); + + constexpr unsigned threadsPerBlock = 256; + constexpr unsigned blocks = (N + 255)/threadsPerBlock; + + hipLaunchKernelGGL((device_function), dim3(blocks), + dim3(threadsPerBlock), 0, + mystream, C_d, A_d, N); + + HIP_CHECK( + hipMemcpyAsync(C1_h, C_d, Nbytes, + hipMemcpyDeviceToHost, mystream)); + + std::thread *T = new std::thread[numThreads]; + for (int i = 0; i < numThreads; i++) { + // Use different callback for every even thread + // The callbacks will be added to same stream from different threads + if ((i%2) == 0) + T[i] = std::thread(Thread1_func); + else + T[i] = std::thread(Thread2_func); + } + + // Wait until all the threads finish their execution + for (int i = 0; i < numThreads; i++) { + T[i].join(); + } + + HIP_CHECK(hipStreamSynchronize(mystream)); + HIP_CHECK(hipStreamDestroy(mystream)); + + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipFree(C_d)); + + free(A1_h); + free(C1_h); + + // Cb_count should match total number of callbacks added from both threads + // Data_mismatch will be updated if there is problem in data validation + REQUIRE(Cb_count.load() == numThreads); + REQUIRE(Data_mismatch.load() == 0); + delete[] T; +} diff --git a/projects/hip/tests/catch/unit/stream/hipStreamAddCallback.cc b/projects/hip/tests/catch/unit/stream/hipStreamAddCallback.cc new file mode 100644 index 0000000000..bc16eda3bd --- /dev/null +++ b/projects/hip/tests/catch/unit/stream/hipStreamAddCallback.cc @@ -0,0 +1,231 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** +Testcase Scenarios : + 1) Validates parameter list of hipStreamAddCallback. + 2) Validates hipStreamAddCallback functionality with default stream. + 3) Validates hipStreamAddCallback functionality with defined stream. +*/ + +#include +#include +#include + +#define UNUSED(expr) do { (void)(expr); } while (0) + +#ifdef __HIP_PLATFORM_AMD__ +#define HIPRT_CB +#endif + +namespace hipStreaAddCallbackTest { +size_t NSize = 4 * 1024 * 1024; +float *A_h, *C_h; +bool gcbDone = false; +bool gPassed = true; +void *ptr0xff = reinterpret_cast(0xffffffff); +void *gusrptr; +hipStream_t gstream; + +void HIPRT_CB Callback(hipStream_t stream, hipError_t status, + void* userData) { + UNUSED(stream); + HIP_CHECK(status); + REQUIRE(userData == NULL); + gPassed = true; + for (size_t i = 0; i < NSize; i++) { + if (C_h[i] != A_h[i] * A_h[i]) { + UNSCOPED_INFO("Data mismatch :" << i); + gPassed = false; + break; + } + } + gcbDone = true; +} +/** + * Validates functionality of hipStreamAddCallback with default/created stream. + */ +bool testStreamCallbackFunctionality(bool isDefault) { + float *A_d, *C_d; + size_t Nbytes = NSize * sizeof(float); + gcbDone = false; + A_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != nullptr); + C_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(C_h != nullptr); + + // Fill with Phi + i + for (size_t i = 0; i < NSize; i++) { + A_h[i] = 1.618f + i; + } + + HIP_CHECK(hipMalloc(&A_d, Nbytes)); + HIP_CHECK(hipMalloc(&C_d, Nbytes)); + if (isDefault) { + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, + 0)); + + const unsigned blocks = 512; + const unsigned threadsPerBlock = 256; + hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks), + dim3(threadsPerBlock), 0, 0, A_d, C_d, NSize); + + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, + 0)); + HIP_CHECK(hipStreamAddCallback(0, Callback, nullptr, 0)); + while (!gcbDone) usleep(100000); // Sleep for 100 ms + } else { + hipStream_t mystream; + HIP_CHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking)); + + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, + mystream)); + + const unsigned blocks = 512; + const unsigned threadsPerBlock = 256; + hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks), + dim3(threadsPerBlock), 0, mystream, A_d, C_d, NSize); + + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, + mystream)); + HIP_CHECK(hipStreamAddCallback(mystream, Callback, nullptr, 0)); + while (!gcbDone) usleep(100000); // Sleep for 100 ms + HIP_CHECK(hipStreamDestroy(mystream)); + } + HIP_CHECK(hipFree(reinterpret_cast(C_d))); + HIP_CHECK(hipFree(reinterpret_cast(A_d))); + free(C_h); + free(A_h); + return gPassed; +} +/** + * Scenario1: Validates if callback = nullptr returns error code for created stream. + * Scenario2: Validates if callback = nullptr returns error code for default stream. + * Scenario3: Validates if flag != 0 returns error code for created stream. + * Scenario4: Validates if flag != 0 returns error code for default stream. + * Scenario5: Validates if userData pointer is passed properly to callback. + * Scenario6: Validates if stream value is passed properly to callback. + */ +void Callback_ChkUsrdataPtr(hipStream_t stream, hipError_t status, + void* userData) { + REQUIRE(stream == gstream); + HIP_CHECK(status); + gPassed = true; + if (gusrptr != userData) { + gPassed = false; + } + gcbDone = true; +} + +void Callback_ChkStreamValue(hipStream_t stream, hipError_t status, + void* userData) { + REQUIRE(userData == nullptr); + HIP_CHECK(status); + gPassed = true; + if (stream != gstream) { + gPassed = false; + } + gcbDone = true; +} +} // namespace hipStreaAddCallbackTest + + +using hipStreaAddCallbackTest::gcbDone; +using hipStreaAddCallbackTest::gPassed; +using hipStreaAddCallbackTest::ptr0xff; +using hipStreaAddCallbackTest::gusrptr; +using hipStreaAddCallbackTest::gstream; +using hipStreaAddCallbackTest::testStreamCallbackFunctionality; +using hipStreaAddCallbackTest::Callback; +using hipStreaAddCallbackTest::Callback_ChkUsrdataPtr; +using hipStreaAddCallbackTest::Callback_ChkStreamValue; + + +/* + * Validates parameter list of hipStreamAddCallback. + */ +TEST_CASE("Unit_hipStreamAddCallback_ParamTst") { + hipStream_t mystream; + HIP_CHECK(hipStreamCreate(&mystream)); + // Scenario1 + SECTION("callback is nullptr") { + REQUIRE_FALSE(hipSuccess == hipStreamAddCallback(mystream, nullptr, + nullptr, 0)); + } + // Scenario2 + SECTION("stream is default") { + REQUIRE_FALSE(hipSuccess == hipStreamAddCallback(0, nullptr, + nullptr, 0)); + } + // Scenario3 + SECTION("flag is nonzero for non-default stream") { + REQUIRE_FALSE(hipSuccess == hipStreamAddCallback(mystream, Callback, + nullptr, 10)); + } + // Scenario4 + SECTION("flag is nonzero for default stream") { + REQUIRE_FALSE(hipSuccess == hipStreamAddCallback(0, Callback, + nullptr, 10)); + } + // Scenario5 + SECTION("userData pointer value validation") { + gstream = mystream; + gusrptr = ptr0xff; + gPassed = true; + gcbDone = false; + HIP_CHECK(hipStreamAddCallback(mystream, Callback_ChkUsrdataPtr, + gusrptr, 0)); + while (!gcbDone) { + usleep(100000); // Sleep for 100 ms + } + REQUIRE_FALSE(!gPassed); + } + // Scenario6 + SECTION("stream value validation") { + gstream = mystream; + gPassed = true; + gcbDone = false; + HIP_CHECK(hipStreamAddCallback(mystream, Callback_ChkStreamValue, + nullptr, 0)); + while (!gcbDone) { + usleep(100000); // Sleep for 100 ms + } + REQUIRE_FALSE(!gPassed); + } + HIP_CHECK(hipStreamDestroy(mystream)); +} + +/* + * Validates hipStreamAddCallback functionality with default stream. + */ +TEST_CASE("Unit_hipStreamAddCallback_WithDefaultStream") { + bool TestPassed = true; + TestPassed = testStreamCallbackFunctionality(true); + REQUIRE(TestPassed); +} + +/* + * Validates hipStreamAddCallback functionality with defined stream. + */ +TEST_CASE("Unit_hipStreamAddCallback_WithCreatedStream") { + bool TestPassed = true; + TestPassed = testStreamCallbackFunctionality(false); + REQUIRE(TestPassed); +} + diff --git a/projects/hip/tests/catch/unit/stream/hipStreamCreateWithFlags.cc b/projects/hip/tests/catch/unit/stream/hipStreamCreateWithFlags.cc new file mode 100644 index 0000000000..95606f418a --- /dev/null +++ b/projects/hip/tests/catch/unit/stream/hipStreamCreateWithFlags.cc @@ -0,0 +1,39 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** +Testcase Scenarios : +1) Validates functionality of hipStreamCreateWithFlags when stream = nullptr. +2) Validates functionality of hipStreamCreateWithFlags when flag = 0xffffffff. +*/ + +#include + + +TEST_CASE("Unit_hipStreamCreateWithFlags_ArgValidation") { + // stream = nullptr test + SECTION("stream is nullptr") { + REQUIRE(hipStreamCreateWithFlags(nullptr, hipStreamDefault) != hipSuccess); + } + // flag value invalid test + SECTION("flag value invalid") { + hipStream_t stream; + REQUIRE(hipStreamCreateWithFlags(&stream, 0xffffffff) != hipSuccess); + } +} diff --git a/projects/hip/tests/catch/unit/stream/hipStreamCreateWithPriority.cc b/projects/hip/tests/catch/unit/stream/hipStreamCreateWithPriority.cc new file mode 100644 index 0000000000..2a75684b50 --- /dev/null +++ b/projects/hip/tests/catch/unit/stream/hipStreamCreateWithPriority.cc @@ -0,0 +1,605 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** +Testcase Scenarios : + +1)Create streams with default flag for all available priority levels and +queue tasks in each of these streams, perform device synchronize and validate behavior. + +2)Create streams with non-blocking flag for all available priority levels and +queue tasks in each of these streams, perform stream synchronize and validate behavior. + +3)Create streams with default flag for all available priority levels and +queue tasks in each of these streams, perform stream synchronize and validate behavior. + +4)Create streams with non-blocking flag for all available priority levels and +queue tasks in each of these streams, perform device synchronize and validate behavior. + +5)Create a stream for each priority level with default flag, Launch memcpy and kernel +tasks on these streams from multiple threads. Validate all the results. + +6)Create a stream for each priority level with non-blocking flag, Launch memcpy and +kernel tasks on these streams from multiple threads. Validate all the results. + +7) Validate negative scenarios for hipStreamCreateWithPriority api. + +8) Validate stream priorities with event after classifying them as low, medium, high. +*/ + +#include +#include +#include +#include + +#define MEMCPYSIZE 64*1024*1024 +#define MEMCPYSIZE2 1024*1024 +#define NUMITERS 2 +#define GRIDSIZE 1024 +#define BLOCKSIZE 256 +#define TOTALTHREADS 16 + +namespace hipStreamCreateWithPriorityTest { + +std::atomic g_thTestPassed(1); +// helper rountine to initialize memory +template +void mem_init(T* buf, size_t n) { + for (size_t i = 0; i < n; i++) { + buf[i] = i; + } +} + +// kernel to copy n elements from src to dst +template +__global__ void memcpy_kernel(T* dst, T* src, size_t n) { + int num = gridDim.x * blockDim.x; + int id = blockDim.x * blockIdx.x + threadIdx.x; + + for (size_t i = id; i < n; i += num) { + dst[i] = src[i]; + } +} + +/** + * Scenario: Create a stream for all available priority levels + * and queue tasks in each of these streams and default stream. + * Validate the calculated results. + */ +void funcTestsForAllPriorityLevelsWrtNullStrm(unsigned int flags, + bool deviceSynchronize) { + int priority; + int priority_low{}; + int priority_high{}; + size_t size = MEMCPYSIZE2*sizeof(int); + // Test is to get the Stream Priority Range + HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_low, &priority_high)); + + // Check if priorities are indeed supported + if (priority_low == priority_high) { + WARN("Stream priority range not supported. Skipping test."); + return; + } + + int numOfPriorities = priority_low - priority_high; + INFO("numOfPriorities = " << numOfPriorities); + const int arr_size = numOfPriorities + 1; + // 0 idx is for default stream + hipStream_t *stream = reinterpret_cast( + malloc(arr_size*sizeof(hipStream_t))); + REQUIRE(stream != nullptr); + stream[0] = 0; + int count = 1; + // Create a stream for each of the priority levels + for (priority = priority_high; priority < priority_low; priority++) { + HIP_CHECK(hipStreamCreateWithPriority(&stream[count++], + flags, priority)); + } + // Allocate memory + int **A_d = reinterpret_cast(malloc(arr_size*sizeof(int*))); + int **C_d = reinterpret_cast(malloc(arr_size*sizeof(int*))); + int **A_h = reinterpret_cast(malloc(arr_size*sizeof(int*))); + int **C_h = reinterpret_cast(malloc(arr_size*sizeof(int*))); + + REQUIRE(A_d != nullptr); + REQUIRE(C_d != nullptr); + REQUIRE(A_h != nullptr); + REQUIRE(C_h != nullptr); + + for (int idx = 0; idx < arr_size; idx++) { + A_h[idx] = reinterpret_cast(malloc(size)); + REQUIRE(A_h[idx] != nullptr); + C_h[idx] = reinterpret_cast(malloc(size)); + REQUIRE(C_h[idx] != nullptr); + HIP_CHECK(hipMalloc(&A_d[idx], size)); + HIP_CHECK(hipMalloc(&C_d[idx], size)); + } + + // Initialize host memory + constexpr int initVal = 2; + for (int idx = 0; idx < arr_size; idx++) { + for (int idy = 0; idy < MEMCPYSIZE2; idy++) { + A_h[idx][idy] = initVal; + } + } + + // Launch task on each stream + for (int idx = 0; idx < arr_size; idx++) { + HIP_CHECK(hipMemcpyAsync(A_d[idx], A_h[idx], size, + hipMemcpyHostToDevice, stream[idx])); + hipLaunchKernelGGL((HipTest::vector_square), dim3(GRIDSIZE), + dim3(BLOCKSIZE), 0, stream[idx], A_d[idx], + C_d[idx], MEMCPYSIZE2); + HIP_CHECK(hipMemcpyAsync(C_h[idx], C_d[idx], size, + hipMemcpyDeviceToHost, stream[idx])); + } + + if (deviceSynchronize) { + HIP_CHECK(hipDeviceSynchronize()); + } + + // Validate the output of each queue + for (int idx = 0; idx < arr_size; idx++) { + if (!deviceSynchronize) { + HIP_CHECK(hipStreamSynchronize(stream[idx])); + } + for (int idy = 0; idy < MEMCPYSIZE2; idy++) { + if (C_h[idx][idy] != A_h[idx][idy] * A_h[idx][idy]) { + INFO("Data mismatch at idx:" << idx << " idy:" << idy); + REQUIRE(false); + } + } + } + + // Deallocate memory + for (int idx = 0; idx < arr_size; idx++) { + HIP_CHECK(hipFree(reinterpret_cast(C_d[idx]))); + HIP_CHECK(hipFree(reinterpret_cast(A_d[idx]))); + free(C_h[idx]); + free(A_h[idx]); + } + + // Destroy the stream for each of the priority levels + count = 1; + for (priority = priority_high; priority < priority_low; priority++) { + HIP_CHECK(hipStreamDestroy(stream[count++])); + } + free(stream); + free(A_d); + free(C_d); + free(A_h); + free(C_h); +} + +/** + * Scenario: Queue tasks in each of these streams and default stream. + * Validate the calculated results. + */ +void queueTasksInStreams(std::vector *stream, + const int arrsize) { + size_t size = MEMCPYSIZE2*sizeof(int); + // Allocate memory + int **A_d = reinterpret_cast(malloc(arrsize*sizeof(int *))); + int **C_d = reinterpret_cast(malloc(arrsize*sizeof(int *))); + int **A_h = reinterpret_cast(malloc(arrsize*sizeof(int *))); + int **C_h = reinterpret_cast(malloc(arrsize*sizeof(int *))); + + HIPASSERT(A_d != nullptr); + HIPASSERT(C_d != nullptr); + HIPASSERT(A_h != nullptr); + HIPASSERT(C_h != nullptr); + + for (int idx = 0; idx < arrsize; idx++) { + A_h[idx] = reinterpret_cast(malloc(size)); + HIPASSERT(A_h[idx] != nullptr); + C_h[idx] = reinterpret_cast(malloc(size)); + HIPASSERT(C_h[idx] != nullptr); + HIPCHECK(hipMalloc(&A_d[idx], size)); + HIPCHECK(hipMalloc(&C_d[idx], size)); + } + // Initialize host memory + constexpr int initVal = 2; + for (int idx = 0; idx < arrsize; idx++) { + for (int idy = 0; idy < MEMCPYSIZE2; idy++) { + A_h[idx][idy] = initVal; + } + } + // Launch task on each stream + for (int idx = 0; idx < arrsize; idx++) { + HIPCHECK(hipMemcpyAsync(A_d[idx], A_h[idx], size, + hipMemcpyHostToDevice, (*stream)[idx])); + hipLaunchKernelGGL((HipTest::vector_square), dim3(GRIDSIZE), + dim3(BLOCKSIZE), 0, (*stream)[idx], A_d[idx], + C_d[idx], MEMCPYSIZE2); + HIPCHECK(hipMemcpyAsync(C_h[idx], C_d[idx], size, + hipMemcpyDeviceToHost, (*stream)[idx])); + } + + bool isPassed = true; + // Validate the output of each queue + for (int idx = 0; idx < arrsize; idx++) { + HIPCHECK(hipStreamSynchronize((*stream)[idx])); + for (int idy = 0; idy < MEMCPYSIZE2; idy++) { + if (C_h[idx][idy] != A_h[idx][idy] * A_h[idx][idy]) { + UNSCOPED_INFO("Data mismatch at idx:" << idx << " idy:" << idy); + isPassed = false; + break; + } + } + if (false == isPassed) break; + } + // Deallocate memory + for (int idx = 0; idx < arrsize; idx++) { + HIPCHECK(hipFree(reinterpret_cast(C_d[idx]))); + HIPCHECK(hipFree(reinterpret_cast(A_d[idx]))); + free(C_h[idx]); + free(A_h[idx]); + } + free(A_d); + free(C_d); + free(A_h); + free(C_h); + g_thTestPassed &= static_cast(isPassed); +} + +/** + * Scenario: + * Common streams used across multiple threads:Create a stream for each + * priority level (flag = hipStreamDefault/hipStreamNonBlocking) + * and 1 default stream. + * Launch memcpy and kernel tasks on these streams from multiple threads + * (use 16 threads). Validate all the results. +*/ +bool runFuncTestsForAllPriorityLevelsMultThread(unsigned int flags) { + bool TestPassed = true; + std::thread T[TOTALTHREADS]; + int priority; + int priority_low; + int priority_high; + // Test is to get the Stream Priority Range + HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_low, &priority_high)); + + // Check if priorities are indeed supported + if (priority_low == priority_high) { + WARN("Stream priority range not supported. Skipping test."); + return true; + } + + int numOfPriorities = priority_low - priority_high; + INFO("numOfPriorities : " << numOfPriorities); + + // 0 idx is for default stream + std::vector stream(numOfPriorities + 1); + stream[0] = 0; + + // Create a stream for each of the priority levels + int count = 1; + for (priority = priority_high; priority < priority_low; priority++) { + HIP_CHECK(hipStreamCreateWithPriority(&stream[count++], flags, + priority)); + } + + for (int i = 0; i < TOTALTHREADS; i++) { + T[i] = std::thread(queueTasksInStreams, + &stream, numOfPriorities + 1); + } + + for (int i=0; i < TOTALTHREADS; i++) { + T[i].join(); + } + if (g_thTestPassed) { + TestPassed = true; + } else { + TestPassed = false; + } + + // Destroy the stream for each of the priority levels + count = 1; + for (priority = priority_high; priority < priority_low; priority++) { + HIP_CHECK(hipStreamDestroy(stream[count++])); + } + return TestPassed; +} + + +template +bool validateStreamPrioritiesWithEvents() { + size_t size = NUMITERS*MEMCPYSIZE; + + // get the range of priorities available + #define OP(x) \ + int priority_##x; \ + bool enable_priority_##x = false; + OP(low) + OP(normal) + OP(high) + #undef OP + HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_low, &priority_high)); + + INFO("HIP stream priority range - low: " << priority_low << ",high: " + << priority_high << ",normal: " + << (priority_low + priority_high)/2); + // Check if priorities are indeed supported + if (priority_low == priority_high) { + WARN("Stream priority range not supported. Skipping test."); + return true; + } + + // Enable/disable priorities based on number of available priority levels + enable_priority_low = true; + enable_priority_high = true; + if ((priority_low - priority_high) > 1) { + enable_priority_normal = true; + } + if (enable_priority_normal) { + priority_normal = ((priority_low + priority_high) / 2); + } + // create streams with highest and lowest available priorities + #define OP(x)\ + hipStream_t stream_##x;\ + if (enable_priority_##x) {\ + HIP_CHECK(hipStreamCreateWithPriority(&stream_##x, \ + hipStreamDefault, priority_##x));\ + } + OP(low) + OP(normal) + OP(high) + #undef OP + + // allocate and initialise host source and destination buffers + #define OP(x) \ + T* src_h_##x; \ + T* dst_h_##x; \ + if (enable_priority_##x) { \ + src_h_##x = reinterpret_cast(malloc(size)); \ + REQUIRE(src_h_##x != nullptr); \ + mem_init(src_h_##x, (size / sizeof(T))); \ + dst_h_##x = reinterpret_cast(malloc(size)); \ + REQUIRE(dst_h_##x != nullptr); \ + memset(dst_h_##x, 0, size); \ + } + OP(low) + OP(normal) + OP(high) + #undef OP + + // allocate and initialize device source and destination buffers + #define OP(x) \ + T* src_d_##x; \ + T* dst_d_##x; \ + if (enable_priority_##x) { \ + HIP_CHECK(hipMalloc(&src_d_##x, size)); \ + HIP_CHECK( \ + hipMemcpy(src_d_##x, src_h_##x, size, hipMemcpyHostToDevice)); \ + HIP_CHECK(hipMalloc(&dst_d_##x, size)); \ + } + OP(low) + OP(normal) + OP(high) + #undef OP + + // create events for measuring time spent in kernel execution + #define OP(x) \ + hipEvent_t event_start_##x; \ + hipEvent_t event_end_##x; \ + if (enable_priority_##x) { \ + HIP_CHECK(hipEventCreate(&event_start_##x)); \ + HIP_CHECK(hipEventCreate(&event_end_##x)); \ + } + OP(low) + OP(normal) + OP(high) + #undef OP + + // record start events for each of the priority streams + #define OP(x) \ + if (enable_priority_##x) { \ + HIP_CHECK(\ + hipEventRecord(event_start_##x, stream_##x)); \ + } + OP(low) + OP(normal) + OP(high) + #undef OP + + // launch kernels repeatedly on each of the prioritiy streams + for (int i = 0; i < static_cast(size); i += MEMCPYSIZE) { + int j = i / sizeof(T); + #define OP(x) \ + if (enable_priority_##x) { \ + hipLaunchKernelGGL((memcpy_kernel), dim3(GRIDSIZE), \ + dim3(BLOCKSIZE), 0, stream_##x, dst_d_##x + j, src_d_##x + j, \ + (MEMCPYSIZE / sizeof(T))); \ + } + OP(low) + OP(normal) + OP(high) + #undef OP + } + + // record end events for each of the priority streams + #define OP(x) \ + if (enable_priority_##x) { \ + HIP_CHECK(hipEventRecord(event_end_##x, stream_##x)); \ + } + OP(low) + OP(normal) + OP(high) + #undef OP + + // synchronize events for each of the priority streams + #define OP(x) \ + if (enable_priority_##x) { \ + HIP_CHECK(hipEventSynchronize(event_end_##x)); \ + } + OP(low) + OP(normal) + OP(high) + #undef OP + + // compute time spent for memcpy in each stream + #define OP(x) \ + float time_spent_##x; \ + if (enable_priority_##x) { \ + HIP_CHECK(hipEventElapsedTime(&time_spent_##x, \ + event_start_##x, event_end_##x)); \ + INFO("time spent for memcpy in " << #x << \ + " priority stream: " << time_spent_##x << " ms"); \ + } + OP(low) + OP(normal) + OP(high) + #undef OP + + // sanity check + #define OP(x) \ + if (enable_priority_##x) { \ + HIP_CHECK(hipMemcpy(dst_h_##x, dst_d_##x, size, \ + hipMemcpyDeviceToHost)); \ + if (memcmp(dst_h_##x, src_h_##x, size) != 0) { \ + REQUIRE(false); \ + } \ + } + OP(low) + OP(normal) + OP(high) + #undef OP + + // validate that stream priorities are working as expected + #define OP(x, y) \ + if (enable_priority_##x && enable_priority_##y) { \ + if ((1.05f * time_spent_##x) < time_spent_##y) { \ + INFO("time_spent_##x : " << time_spent_##x << \ + "time_spent_##y : " << time_spent_##y); \ + REQUIRE(false); \ + } \ + } + OP(low, normal) + OP(normal, high) + OP(low, high) + #undef OP + + return true; +} + +} // namespace hipStreamCreateWithPriorityTest + + + +/** +Tests following scenarios. + 1)Create streams with default flag for all available priority levels and + queue tasks in each of these streams, perform device synchronize and validate behavior. + 2)Create streams with non-blocking flag for all available priority levels and + queue tasks in each of these streams, perform stream synchronize and validate behavior. + 3)Create streams with default flag for all available priority levels and + queue tasks in each of these streams, perform stream synchronize and validate behavior. + 4)Create streams with non-blocking flag for all available priority levels and + queue tasks in each of these streams, perform device synchronize and validate behavior. +*/ +TEST_CASE("Unit_hipStreamCreateWithPriority_FunctionalForAllPriorities") { + SECTION("Default flag and device synchronize") { + hipStreamCreateWithPriorityTest:: + funcTestsForAllPriorityLevelsWrtNullStrm(hipStreamDefault, true); + } + + SECTION("Stream non-blocking flag and stream synchronize") { + hipStreamCreateWithPriorityTest:: + funcTestsForAllPriorityLevelsWrtNullStrm(hipStreamNonBlocking, false); + } + + SECTION("Default flag and stream synchronize") { + hipStreamCreateWithPriorityTest:: + funcTestsForAllPriorityLevelsWrtNullStrm(hipStreamDefault, false); + } + + SECTION("Stream non-blocking flag and device synchronize") { + hipStreamCreateWithPriorityTest:: + funcTestsForAllPriorityLevelsWrtNullStrm(hipStreamNonBlocking, true); + } +} + +/** + * Create a stream for each priority level with default flag, Launch memcpy and kernel + * tasks on these streams from multiple threads. Validate all the results. + */ +TEST_CASE("Unit_hipStreamCreateWithPriority_MulthreadDefaultflag") { + bool TestPassed = true; + TestPassed = hipStreamCreateWithPriorityTest:: + runFuncTestsForAllPriorityLevelsMultThread(hipStreamDefault); + REQUIRE(TestPassed); +} + +/** + * Create a stream for each priority level with non-blocking flag, Launch memcpy and + * kernel tasks on these streams from multiple threads. Validate all the results. + */ +TEST_CASE("Unit_hipStreamCreateWithPriority_MulthreadNonblockingflag") { + bool TestPassed = true; + TestPassed = hipStreamCreateWithPriorityTest:: + runFuncTestsForAllPriorityLevelsMultThread(hipStreamNonBlocking); + REQUIRE(TestPassed); +} + + +/** + * Scenario1: Validates functionality of hipStreamCreateWithPriority when + stream = nullptr. + * Scenario2: Validates functionality of hipStreamCreateWithPriority when + flag = 0xffffffff. +*/ +TEST_CASE("Unit_hipStreamCreateWithPriority_NegTst") { + hipStream_t stream; + int priority_low; + int priority_high; + hipError_t ret; + + // Test is to get the Stream Priority Range + HIP_CHECK( + hipDeviceGetStreamPriorityRange(&priority_low, &priority_high)); + // Check if priorities are indeed supported + if (priority_low == priority_high) { + WARN("Stream priority range not supported. Skipping test."); + return; // exit the test since priorities are not supported + } + + SECTION("stream = nullptr test") { + ret = hipStreamCreateWithPriority(nullptr, hipStreamDefault, + priority_low); + + REQUIRE(ret != hipSuccess); + } + + SECTION("flag value invalid test") { + ret = hipStreamCreateWithPriority(&stream, 0xffffffff, + priority_low); + + REQUIRE(ret != hipSuccess); + } +} + +/** + * Validate stream priorities with event after classifying them as low, medium and high. + */ +TEST_CASE("Unit_hipStreamCreateWithPriority_ValidateWithEvents") { + bool TestPassed = true; + TestPassed = hipStreamCreateWithPriorityTest:: + validateStreamPrioritiesWithEvents(); + REQUIRE(TestPassed); +} diff --git a/projects/hip/tests/catch/unit/stream/hipStreamWithCUMask.cc b/projects/hip/tests/catch/unit/stream/hipStreamWithCUMask.cc new file mode 100644 index 0000000000..f78ef9ee2c --- /dev/null +++ b/projects/hip/tests/catch/unit/stream/hipStreamWithCUMask.cc @@ -0,0 +1,350 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** +Testcase Scenarios : + +1)Validates functionality of hipStreamAddCallback with created stream. + +2)Validates functionality of stream with cu mask. + +3)Create a stream with all CU masks disabled (0x00000000). +Verify that default CU mask is set for the stream. + +4)Size is greater than physical CU number. In this case the extra elements +are ignored and hipExtStreamCreateWithCUMask must return hipSuccess. + +5)Negative Testing of hipExtStreamCreateWithCUMask. +*/ + + +#include +#include +#include +#include +#include + +#define NUM_CU_PARTITIONS 4 +#define CONSTANT 1.618f +#define SIZE_INBYTES_OF_MB (1024*1024) +#define GRIDSIZE 512 +#define BLOCKSIZE 256 +#define ZERO_MASK 0x00000000 + + +namespace hipExtStreamCreateWithCUMaskTest { + +float *A_h, *C_h; +bool cbDone = false; +bool isPassed = true; +size_t N = 4 * SIZE_INBYTES_OF_MB; + + +// Make a default CU mask bit-array where all CUs are active +// this default mask is expected to be returned when there is no +// custom or global CU mask defined. +void createDefaultCUMask(std::vector *pdefaultCUMask, + int numOfCUs) { + uint32_t temp = 0; + uint32_t bit_index = 0; + for (int i = 0; i < numOfCUs; i++) { + temp |= 1UL << bit_index; + if (bit_index >= 32) { + (*pdefaultCUMask).push_back(temp); + temp = 0; + bit_index = 0; + temp |= 1UL << bit_index; + } + bit_index += 1; + } + if (bit_index != 0) { + (*pdefaultCUMask).push_back(temp); + } +} +// Create masks of disabled CU masks. +void createDisabledCUMask(std::vector *pdisabledCUMask, + int numOfCUs) { + uint32_t temp = ZERO_MASK; + uint32_t bit_index = 0; + for (int i = 0; i < numOfCUs; i++) { + if (bit_index >= 32) { + (*pdisabledCUMask).push_back(temp); + temp = ZERO_MASK; + bit_index = 0; + } + bit_index += 1; + } + if (bit_index != 0) { + (*pdisabledCUMask).push_back(temp); + } +} + +void Callback(hipStream_t stream, hipError_t status, + void* userData) { + isPassed = true; + stream = 0; + HIP_CHECK(status); + REQUIRE(userData == nullptr); + for (size_t i = 0; i < N; i++) { + if (C_h[i] != A_h[i] * A_h[i]) { + UNSCOPED_INFO("Data mismatch at index: " << i); + isPassed = false; + break; + } + } + cbDone = true; +} +} // namespace hipExtStreamCreateWithCUMaskTest + +using hipExtStreamCreateWithCUMaskTest::A_h; +using hipExtStreamCreateWithCUMaskTest::C_h; +using hipExtStreamCreateWithCUMaskTest::cbDone; +using hipExtStreamCreateWithCUMaskTest::isPassed; +using hipExtStreamCreateWithCUMaskTest::N; +using hipExtStreamCreateWithCUMaskTest::createDefaultCUMask; +using hipExtStreamCreateWithCUMaskTest::createDisabledCUMask; +using hipExtStreamCreateWithCUMaskTest::Callback; + + +/** + * Scenario: Validates functionality of hipStreamAddCallback with created stream. + */ +TEST_CASE("Unit_hipExtStreamCreateWithCUMask_ValidateCallbackFunc") { + float *A_d, *C_d; + size_t Nbytes = N * sizeof(float); + cbDone = false; + A_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != nullptr); + C_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(C_h != nullptr); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = CONSTANT + i; + } + + HIP_CHECK(hipMalloc(&A_d, Nbytes)); + HIP_CHECK(hipMalloc(&C_d, Nbytes)); + + hipStream_t mystream; + std::vector defaultCUMask; + HIP_CHECK(hipSetDevice(0)); + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, 0)); + createDefaultCUMask(&defaultCUMask, props.multiProcessorCount); + + hipExtStreamCreateWithCUMask(&mystream, defaultCUMask.size(), + defaultCUMask.data()); + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, + mystream)); + const unsigned blocks = GRIDSIZE; + const unsigned threadsPerBlock = BLOCKSIZE; + hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks), + dim3(threadsPerBlock), 0, mystream, A_d, C_d, N); + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, + mystream)); + HIP_CHECK(hipStreamAddCallback(mystream, Callback, nullptr, 0)); + while (!cbDone) usleep(100000); // Sleep for 100 ms + HIP_CHECK(hipStreamDestroy(mystream)); + HIP_CHECK(hipFree(reinterpret_cast(C_d))); + HIP_CHECK(hipFree(reinterpret_cast(A_d))); + free(C_h); + free(A_h); + REQUIRE(isPassed == true); +} + +/** + * Scenario: Validates functionality of stream with cu mask. + */ +TEST_CASE("Unit_hipExtStreamCreateWithCUMask_Functionality") { + const int KNumPartition = NUM_CU_PARTITIONS; + float *dA[KNumPartition], *dC[KNumPartition]; + float *hA, *hC; + size_t N = 25*SIZE_INBYTES_OF_MB; + size_t Nbytes = N * sizeof(float); + std::vector streams(KNumPartition); + std::vector> cuMasks(KNumPartition); + std::stringstream ss[KNumPartition]; + + int nGpu = 0; + HIP_CHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + WARN("Didn't find any GPU! skipping the test!"); + return; + } + + static int device = 0; + HIP_CHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, device)); + INFO("info: running on bus " << "0x" << props.pciBusID << " " + << props.name << " with " << props.multiProcessorCount << " CUs"); + + hA = new float[Nbytes]; + REQUIRE(hA != nullptr); + hC = new float[Nbytes]; + REQUIRE(hC != nullptr); + for (size_t i = 0; i < N; i++) { + hA[i] = CONSTANT + i; + } + + for (int np = 0; np < KNumPartition; np++) { + HIP_CHECK(hipMalloc(&dA[np], Nbytes)); + HIP_CHECK(hipMalloc(&dC[np], Nbytes)); + // make unique CU masks in the multiple of dwords for each stream + uint32_t temp = 0; + uint32_t bit_index = np; + for (int i = np; i < props.multiProcessorCount; i = i + 4) { + temp |= 1UL << bit_index; + if (bit_index >= 32) { + cuMasks[np].push_back(temp); + temp = 0; + bit_index = np; + temp |= 1UL << bit_index; + } + bit_index += 4; + } + if (bit_index != 0) { + cuMasks[np].push_back(temp); + } + + HIP_CHECK(hipExtStreamCreateWithCUMask(&streams[np], cuMasks[np].size(), + cuMasks[np].data())); + + HIP_CHECK(hipMemcpy(dA[np], hA, Nbytes, hipMemcpyHostToDevice)); + + ss[np] << std::hex; + for (int i = cuMasks[np].size() - 1; i >= 0; i--) { + ss[np] << cuMasks[np][i]; + } + } + + const unsigned blocks = GRIDSIZE; + const unsigned threadsPerBlock = BLOCKSIZE; + + auto single_start = std::chrono::steady_clock::now(); + INFO("info: launch 'vector_square' kernel on one stream " << + streams[0] << " with CU mask: 0x" << ss[0].str().c_str()); + + hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), + dim3(threadsPerBlock), 0, streams[0], dA[0], dC[0], N); + hipDeviceSynchronize(); + + auto single_end = std::chrono::steady_clock::now(); + std::chrono::duration single_kernel_time = single_end - single_start; + + HIP_CHECK(hipMemcpy(hC, dC[0], Nbytes, hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < N; i++) { + REQUIRE(hC[i] == (hA[i] * hA[i])); + } + + INFO("info: launch 'vector_square' kernel on " + << KNumPartition << " streams:"); + auto all_start = std::chrono::steady_clock::now(); + for (int np = 0; np < KNumPartition; np++) { + INFO("info: launch 'vector_square' kernel on the stream " + << streams[np] << " with CU mask: 0x" << ss[np].str().c_str()); + hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), + dim3(threadsPerBlock), 0, streams[np], dA[np], dC[np], N); + } + hipDeviceSynchronize(); + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + + for (int np = 0; np < KNumPartition; np++) { + HIP_CHECK(hipMemcpy(hC, dC[np], Nbytes, hipMemcpyDeviceToHost)); + for (size_t i = 0; i < N; i++) { + REQUIRE(hC[i] == (hA[i] * hA[i])); + } + } + + INFO("info: kernel launched on one stream took: " << + single_kernel_time.count() << " seconds"); + INFO("info: kernels launched on " << KNumPartition << + " streams took: " << all_kernel_time.count() << " seconds"); + INFO("info: launching kernels on " << KNumPartition << + " streams asynchronously is " << + single_kernel_time.count() / (all_kernel_time.count() / KNumPartition) + << " times faster per stream than launching on one stream alone"); + + delete [] hA; + delete [] hC; + for (int np = 0; np < KNumPartition; np++) { + hipFree(dC[np]); + hipFree(dA[np]); + HIP_CHECK(hipStreamDestroy(streams[np])); + } +} + +/** + * Scenario: Create a stream with all CU masks disabled (0x00000000). + * Verify that default CU mask is set for the stream. + */ +TEST_CASE("Unit_hipExtStreamCreateWithCUMask_AllCUsMasked") { + HIP_CHECK(hipSetDevice(0)); + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, 0)); + // make a CU mask with all CUs disabled. + std::vector allCUDisabled; + createDisabledCUMask(&allCUDisabled, props.multiProcessorCount); + hipStream_t stream; + HIP_CHECK(hipExtStreamCreateWithCUMask(&stream, allCUDisabled.size(), + allCUDisabled.data())); + // Verify whether default CU mask is set for the stream. + uint32_t size = (props.multiProcessorCount / 32) + 1; + std::vector cuMask(size); + std::vector defaultCUMask; + createDefaultCUMask(&defaultCUMask, props.multiProcessorCount); + HIP_CHECK(hipExtStreamGetCUMask(stream, cuMask.size(), &cuMask[0])); + for (int i = 0; i < static_cast(defaultCUMask.size()); i++) { + REQUIRE(defaultCUMask[i] == cuMask[i]); + } + HIP_CHECK(hipStreamDestroy(stream)); +} + +/** + * Scenario: Negative Testing of hipExtStreamCreateWithCUMask. + */ +TEST_CASE("Unit_hipExtStreamCreateWithCUMask_NegTst") { + std::vector defaultCUMask; + REQUIRE(hipSuccess == hipSetDevice(0)); + hipDeviceProp_t props; + REQUIRE(hipSuccess == hipGetDeviceProperties(&props, 0)); + createDefaultCUMask(&defaultCUMask, props.multiProcessorCount); + hipStream_t stream; + // Negative Scenario 1: stream = nullptr + SECTION("stream is nullptr") { + REQUIRE_FALSE(hipExtStreamCreateWithCUMask(nullptr, + defaultCUMask.size(), + defaultCUMask.data()) == hipSuccess); + } + // Negative Scenario 2: cuMaskSize = 0 + SECTION("cuMaskSize is 0") { + REQUIRE_FALSE(hipExtStreamCreateWithCUMask(&stream, 0, + defaultCUMask.data()) == hipSuccess); + } + // Negative Scenario 3: cuMask = nullptr + SECTION("cuMask is nullptr") { + REQUIRE_FALSE(hipExtStreamCreateWithCUMask(&stream, + defaultCUMask.size(), + nullptr) == hipSuccess); + } +}