From 798c4e72552105be124b1d620cd3bc064aa39bf3 Mon Sep 17 00:00:00 2001 From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com> Date: Tue, 28 Feb 2023 05:24:03 +0530 Subject: [PATCH] SWDEV-326250 - Adding new set 1 of test cases to test compiler flag -fgpu-default-stream=per-thread (#176) Change-Id: I1a13ebdf366913ce938b3891cfdc549c6c618569 --- catch/unit/streamperthread/CMakeLists.txt | 9 + .../hipStreamPerThrdCompilerOptn.cc | 793 ++++++++++++++++++ 2 files changed, 802 insertions(+) create mode 100644 catch/unit/streamperthread/hipStreamPerThrdCompilerOptn.cc diff --git a/catch/unit/streamperthread/CMakeLists.txt b/catch/unit/streamperthread/CMakeLists.txt index dd5b54dabc..e185942712 100644 --- a/catch/unit/streamperthread/CMakeLists.txt +++ b/catch/unit/streamperthread/CMakeLists.txt @@ -5,8 +5,17 @@ set(TEST_SRC hipStreamPerThread_MultiThread.cc hipStreamPerThread_DeviceReset.cc hipStreamPerThrdTsts.cc + hipStreamPerThrdCompilerOptn.cc ) +if(HIP_PLATFORM MATCHES "amd") + set_source_files_properties(hipStreamPerThrdCompilerOptn.cc PROPERTIES COMPILE_OPTIONS "-fgpu-default-stream=per-thread") +endif() + +if(HIP_PLATFORM MATCHES "nvidia") + set_source_files_properties(hipStreamPerThrdCompilerOptn.cc PROPERTIES COMPILE_OPTIONS "--default-stream=per-thread") +endif() + hip_add_exe_to_target(NAME StreamPerThreadTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME build_tests) diff --git a/catch/unit/streamperthread/hipStreamPerThrdCompilerOptn.cc b/catch/unit/streamperthread/hipStreamPerThrdCompilerOptn.cc new file mode 100644 index 0000000000..15e15a38bf --- /dev/null +++ b/catch/unit/streamperthread/hipStreamPerThrdCompilerOptn.cc @@ -0,0 +1,793 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* This file will be compiled using the compiler option given below. + The functions will be calculating the time taken to execute under + the influence of the compiler flag: + -fgpu-default-stream=per-thread +*/ + +#include +#include +#include "hip/hip_cooperative_groups.h" + +namespace cg = cooperative_groups; + +namespace DefltStrmPT { + int64_t N = 1024 * 1024 * 100; + int64_t Sz = N * sizeof(int64_t); + int64_t *DevA, *HstA, *HstRes; + int64_t OneMB = 1024 * 1024; + int64_t OneMBSz = OneMB * sizeof(int64_t); + hipStream_t Strm; + int clockrate, CONST = 123; + size_t numH = 1024, numW = 1024; + size_t pitch_A, width = numW * sizeof(int64_t); + size_t sizeElements = width * numH; + size_t elements = numW * numH; +} // namespace DefltStrmPT + +__device__ int64_t globalInDStrmPT[1024 * 1024]; +__device__ int SigComplte = 0; + +// Kernel codes +__global__ void DefltStrmPT_Square(int64_t *C_d, int64_t N) { + int64_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int64_t stride = hipBlockDim_x * hipGridDim_x; + + for (int64_t i = offset; i < N; i += stride) { + C_d[i] = C_d[i] * C_d[i]; + } +} + +__global__ void Wait_Kernel3(int clockrate, uint64_t WaitSecs, + int PassSignal = 0) { + uint64_t num_cycles = WaitSecs * clockrate * 1000; + uint64_t start = clock64(), cycles = 0; + while (cycles < num_cycles) { + cycles = clock64() - start; + } + if (PassSignal) { + SigComplte = 1; + } +} + +__global__ void DefltStrmPT_Test_gws(uint* buf, uint bufSize, + int64_t* tmpBuf, int64_t* result) { + extern __shared__ int64_t tmp[]; + uint offset = blockIdx.x * blockDim.x + threadIdx.x; + uint stride = gridDim.x * blockDim.x; + cg::grid_group gg = cg::this_grid(); + + int64_t sum = 0; + for (uint i = offset; i < bufSize; i += stride) { + sum += buf[i]; + } + tmp[threadIdx.x] = sum; + + __syncthreads(); + + if (threadIdx.x == 0) { + sum = 0; + for (uint i = 0; i < blockDim.x; i++) { + sum += tmp[i]; + } + tmpBuf[blockIdx.x] = sum; + } + + gg.sync(); + + if (offset == 0) { + for (uint i = 1; i < gridDim.x; ++i) { + sum += tmpBuf[i]; + } + *result = sum; + } +} + +float DefaultPT2_Memcpy_MemSet(int CpyAsync, int MemSetAsync) { + bool IfTstPassed = true; + DefltStrmPT::HstA = reinterpret_cast (malloc(DefltStrmPT::Sz)); + DefltStrmPT::HstRes = reinterpret_cast (malloc(DefltStrmPT::Sz)); + HIP_CHECK(hipDeviceGetAttribute(&(DefltStrmPT::clockrate), + hipDeviceAttributeMemoryClockRate, 0)); + HIP_CHECK(hipMalloc(&(DefltStrmPT::DevA), DefltStrmPT::Sz)); + for (int64_t i = 0; i < DefltStrmPT::N; ++i) { + DefltStrmPT::HstA[i] = DefltStrmPT::CONST; + } + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + if (CpyAsync) { + HIP_CHECK(hipMemcpyAsync(DefltStrmPT::DevA, DefltStrmPT::HstA, + DefltStrmPT::Sz, hipMemcpyHostToDevice, + DefltStrmPT::Strm)); + HIP_CHECK(hipStreamSynchronize(DefltStrmPT::Strm)); + } else { + HIP_CHECK(hipMemcpy(DefltStrmPT::DevA, DefltStrmPT::HstA, + DefltStrmPT::Sz, hipMemcpyHostToDevice)); + } + DefltStrmPT_Square<<<(DefltStrmPT::N/256 + 1), 256, 0, DefltStrmPT::Strm>>> + (DefltStrmPT::DevA, DefltStrmPT::N); + HIP_CHECK(hipStreamSynchronize(DefltStrmPT::Strm)); + HIP_CHECK(hipMemcpy(DefltStrmPT::HstRes, DefltStrmPT::DevA, + DefltStrmPT::Sz, hipMemcpyDeviceToHost)); + // Verifying the result + for (int64_t i = 0; i < DefltStrmPT::N; ++i) { + if (DefltStrmPT::HstRes[i] != + (DefltStrmPT::HstA[i] * DefltStrmPT::HstA[i])) { + IfTstPassed = false; + } + } + if (MemSetAsync) { + HIP_CHECK(hipMemsetAsync(DefltStrmPT::DevA, 0, DefltStrmPT::Sz, + DefltStrmPT::Strm)); + HIP_CHECK(hipStreamSynchronize(DefltStrmPT::Strm)); + } else { + HIP_CHECK(hipMemset(DefltStrmPT::DevA, 0, + DefltStrmPT::Sz)); + } + // Copying the device memory to host to check if Memset is successful + HIP_CHECK(hipMemcpy(DefltStrmPT::HstA, DefltStrmPT::DevA, + DefltStrmPT::Sz, hipMemcpyDeviceToHost)); + // verifying if memset was successful + for (int64_t i = 0; i < DefltStrmPT::N; ++i) { + if (DefltStrmPT::HstA[i] != 0) { + IfTstPassed = false; + } + } + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); + HIP_CHECK(hipFree(DefltStrmPT::DevA)); + free(DefltStrmPT::HstA); + free(DefltStrmPT::HstRes); + return IfTstPassed; +} + +void DefaultPT2_Memset2D(int Async) { + constexpr int memsetval = 0x24; + constexpr size_t numH = 256; + constexpr size_t numW = 256; + size_t pitch_A; + size_t width = numW * sizeof(char); + size_t sizeElements = width * numH; + size_t elements = numW * numH; + char *A_d, *A_h; + + HIP_CHECK(hipMallocPitch(reinterpret_cast(&A_d), &pitch_A, width, + numH)); + A_h = reinterpret_cast(malloc(sizeElements)); + REQUIRE(A_h != nullptr); + + for (size_t i = 0; i < elements; i++) { + A_h[i] = 1; + } + + if (Async) { + hipStream_t Strm; + HIP_CHECK(hipStreamCreate(&Strm)); + HIP_CHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, Strm)); + HIP_CHECK(hipStreamSynchronize(Strm)); + HIP_CHECK(hipStreamDestroy(Strm)); + } else { + HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH)); + } + HIP_CHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, + hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < elements; i++) { + if (A_h[i] != memsetval) { + INFO("Memset2D mismatch at index:" << i << " computed:" + << A_h[i] << " memsetval:" << memsetval); + REQUIRE(false); + } + } + + HIP_CHECK(hipFree(A_d)); + free(A_h); +} + + +void PerThrdDefltStrm_Memset3D(int Async) { + constexpr int memsetval = 0x22; + constexpr size_t numH = 256; + constexpr size_t numW = 256; + constexpr size_t depth = 10; + size_t width = numW * sizeof(char); + size_t sizeElements = width * numH * depth; + size_t elements = numW * numH * depth; + char *A_h; + + hipExtent extent = make_hipExtent(width, numH, depth); + hipPitchedPtr devPitchedPtr; + + HIP_CHECK(hipMalloc3D(&devPitchedPtr, extent)); + A_h = reinterpret_cast(malloc(sizeElements)); + REQUIRE(A_h != nullptr); + + for (size_t i = 0; i < elements; i++) { + A_h[i] = 1; + } + + if (Async) { + hipStream_t Strm; + HIP_CHECK(hipStreamCreate(&Strm)); + HIP_CHECK(hipMemset3DAsync(devPitchedPtr, memsetval, extent, Strm)); + HIP_CHECK(hipStreamSynchronize(Strm)); + HIP_CHECK(hipStreamDestroy(Strm)); + } else { + HIP_CHECK(hipMemset3D(devPitchedPtr, memsetval, extent)); + } + hipMemcpy3DParms myparms{}; + myparms.srcPos = make_hipPos(0, 0, 0); + myparms.dstPos = make_hipPos(0, 0, 0); + myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH); + myparms.srcPtr = devPitchedPtr; + myparms.extent = extent; +#if HT_NVIDIA + myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost); +#else + myparms.kind = hipMemcpyDeviceToHost; +#endif + HIP_CHECK(hipMemcpy3D(&myparms)); + + for (size_t i = 0; i < elements; i++) { + if (A_h[i] != memsetval) { + INFO("Memset3D mismatch at index:" << i << " computed:" + << A_h[i] << " memsetval:" << memsetval); + REQUIRE(false); + } + } + HIP_CHECK(hipFree(devPitchedPtr.ptr)); + free(A_h); +} + + +void DefaultPT2_StrmQuery() { + HIP_CHECK(hipDeviceGetAttribute(&(DefltStrmPT::clockrate), + hipDeviceAttributeMemoryClockRate, 0)); + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + // StreamQuery with null stream + Wait_Kernel3<<<1, 1>>>(DefltStrmPT::clockrate, 3); + REQUIRE((hipErrorNotReady == hipStreamQuery(0))); + // StreamQuery with user created stream + Wait_Kernel3<<<1, 1, 0, DefltStrmPT::Strm>>>(DefltStrmPT::clockrate, 3); + REQUIRE((hipErrorNotReady == hipStreamQuery(DefltStrmPT::Strm))); + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); +} + + +void DefaultPT2_StreamSync() { + HIP_CHECK(hipDeviceGetAttribute(&(DefltStrmPT::clockrate), + hipDeviceAttributeMemoryClockRate, 0)); + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + // Calling hipStreamSync on user created stream object + Wait_Kernel3<<<1, 1, 0, DefltStrmPT::Strm>>>(DefltStrmPT::clockrate, 1); + HIP_CHECK(hipStreamSynchronize(DefltStrmPT::Strm)); + // Calling hipStreamSync on null stream + Wait_Kernel3<<<1, 1>>>(DefltStrmPT::clockrate, 1); + HIP_CHECK(hipStreamSynchronize(0)); + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); +} + + +void DefaultPT2_StrmWaitEvent() { + hipEvent_t evt; + hipStream_t Strm1; + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + HIP_CHECK(hipStreamCreate(&Strm1)); + HIP_CHECK(hipEventCreate(&evt)); + Wait_Kernel3<<<1, 1, 0, DefltStrmPT::Strm>>>(DefltStrmPT::clockrate, 3, 1); + HIP_CHECK(hipEventRecord(evt, DefltStrmPT::Strm)); + HIP_CHECK(hipStreamWaitEvent(Strm1, evt, 0)); + Wait_Kernel3<<<1, 1, 0, Strm1>>>(DefltStrmPT::clockrate, 1); + // By the time control reaches the below point SigComplte is expected + // to be still zero + if (SigComplte) { + REQUIRE(false); + } + HIP_CHECK(hipStreamSynchronize(Strm1)); + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); + HIP_CHECK(hipStreamDestroy(Strm1)); + HIP_CHECK(hipEventDestroy(evt)); +} + +void DefaultPT2_EvtQuery() { + hipEvent_t evt, evt1; + hipError_t err; + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + HIP_CHECK(hipEventCreate(&evt)); + HIP_CHECK(hipEventCreate(&evt1)); + Wait_Kernel3<<<1, 1, 0, DefltStrmPT::Strm>>>(DefltStrmPT::clockrate, 3); + HIP_CHECK(hipEventRecord(evt, DefltStrmPT::Strm)); + err = hipEventQuery(evt); + if (err != hipErrorNotReady) { + REQUIRE(false); + } + // Testing for Null or default stream + HIP_CHECK(hipEventRecord(evt1, 0)); + std::chrono::time_point start = std::chrono::steady_clock::now(); + int Got_hipSuccess = 0; // 0 for no, 1 for yes + while (true) { + err = hipEventQuery(evt1); + if (err == hipSuccess) { + Got_hipSuccess = 1; + break; + } + if (std::chrono::steady_clock::now() - start > std::chrono::seconds(60)) { + break; + } + } + if (!Got_hipSuccess) { + REQUIRE(false); + } + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); + HIP_CHECK(hipEventDestroy(evt)); + HIP_CHECK(hipEventDestroy(evt1)); +} + + +void Default_LaunchKernel(int NullStrm) { + DefltStrmPT::N = DefltStrmPT::N/4; + DefltStrmPT::Sz = DefltStrmPT::N * sizeof(int64_t); + DefltStrmPT::HstA = reinterpret_cast (malloc(DefltStrmPT::Sz)); + HIP_CHECK(hipMalloc(&(DefltStrmPT::DevA), DefltStrmPT::Sz)); + for (int64_t i = 0; i < DefltStrmPT::N; ++i) { + DefltStrmPT::HstA[i] = DefltStrmPT::CONST; + } + HIP_CHECK(hipMemcpy(DefltStrmPT::DevA, DefltStrmPT::HstA, DefltStrmPT::Sz, + hipMemcpyHostToDevice)); + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + unsigned ThrdsPerBlk = 32; + unsigned Blocks = ((DefltStrmPT::N + ThrdsPerBlk - 1)/ThrdsPerBlk); + void *Args[] = {&(DefltStrmPT::DevA), &(DefltStrmPT::N)}; + // launch Kernel + if (NullStrm) { + HIP_CHECK(hipLaunchKernel((const void*)DefltStrmPT_Square, + dim3(Blocks, 1, 1), dim3(ThrdsPerBlk, 1, 1), Args, 0, + 0)); + HIP_CHECK(hipStreamSynchronize(0)); + } else { + HIP_CHECK(hipLaunchKernel((const void*)DefltStrmPT_Square, + dim3(Blocks, 1, 1), dim3(ThrdsPerBlk, 1, 1), Args, 0, + DefltStrmPT::Strm)); + HIP_CHECK(hipStreamSynchronize(DefltStrmPT::Strm)); + } + HIP_CHECK(hipMemcpy(DefltStrmPT::HstA, DefltStrmPT::DevA, DefltStrmPT::Sz, + hipMemcpyDeviceToHost)); + for (int64_t i = 0; i < DefltStrmPT::N; ++i) { + if (DefltStrmPT::HstA[i] != (DefltStrmPT::CONST * DefltStrmPT::CONST)) { + REQUIRE(false); + } + } + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); + HIP_CHECK(hipFree(DefltStrmPT::DevA)); + free(DefltStrmPT::HstA); +} + + +void DefaultPT2_LaunchCooperativeKernel(int NullStrm) { + bool IfTestPassed = true; + uint32_t *dA; + int64_t *dB, *dC; + uint32_t BufferSizeInDwords = 448 * 1024 * 1024; + uint32_t* init = new uint32_t[BufferSizeInDwords]; + for (uint32_t i = 0; i < BufferSizeInDwords; ++i) { + init[i] = i; + } + size_t SIZE = BufferSizeInDwords * sizeof(uint); + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + hipDeviceProp_t deviceProp; + HIP_CHECK(hipGetDeviceProperties(&deviceProp, 0)); + HIPCHECK(hipMalloc(reinterpret_cast(&dA), SIZE)); + HIPCHECK(hipHostMalloc(reinterpret_cast(&dC), sizeof(int64_t))); + HIPCHECK(hipMemcpy(dA, init, SIZE, hipMemcpyHostToDevice)); + + dim3 dimBlock = dim3(1); + dim3 dimGrid = dim3(1); + int numBlocks = 0; + dimBlock.x = 32; + // Calculate the device occupancy to know how many blocks can be run + // concurrently + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, + DefltStrmPT_Test_gws, + dimBlock.x * dimBlock.y * dimBlock.z, + dimBlock.x * sizeof(int64_t))); + dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32); + HIPCHECK(hipMalloc(reinterpret_cast(&dB), + dimGrid.x * sizeof(int64_t))); + + void *params[4]; + params[0] = reinterpret_cast(&dA); + params[1] = reinterpret_cast(&BufferSizeInDwords); + params[2] = reinterpret_cast(&dB); + params[3] = reinterpret_cast(&dC); + if (NullStrm) { + HIPCHECK(hipLaunchCooperativeKernel( + reinterpret_cast(DefltStrmPT_Test_gws), + dimGrid, dimBlock, params, dimBlock.x * sizeof(int64_t), 0)); + HIP_CHECK(hipStreamSynchronize(0)); + } else { + HIPCHECK(hipLaunchCooperativeKernel( + reinterpret_cast(DefltStrmPT_Test_gws), + dimGrid, dimBlock, params, dimBlock.x * sizeof(int64_t), + DefltStrmPT::Strm)); + HIP_CHECK(hipStreamSynchronize(DefltStrmPT::Strm)); + } + HIPCHECK(hipMemcpy(init, dC, sizeof(int64_t), hipMemcpyDeviceToHost)); + + if (*dC != (((int64_t)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2)) { + std::cout << "Data validation failed for grid size = " << dimGrid.x << + " and block size = " << dimBlock.x << "\n"; + std::cout << "Test failed! \n"; + IfTestPassed = false; + } + HIPCHECK(hipStreamDestroy(DefltStrmPT::Strm)); + HIPCHECK(hipHostFree(dC)); + HIPCHECK(hipFree(dB)); + HIPCHECK(hipFree(dA)); + delete [] init; + REQUIRE(IfTestPassed); +} + + +void DefaultPT2_StrmGetFlag() { + HIP_CHECK(hipStreamCreateWithFlags(&(DefltStrmPT::Strm), hipStreamDefault)); + unsigned int flag = 9999; + HIP_CHECK(hipStreamGetFlags(DefltStrmPT::Strm, &flag)); + if (flag != 0) { + INFO("Expected flag value: 0, Received flag value: %u\n" << flag); + REQUIRE(false); + } + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); + HIP_CHECK(hipStreamCreateWithFlags(&(DefltStrmPT::Strm), + hipStreamNonBlocking)); + flag = 9999; + HIP_CHECK(hipStreamGetFlags(DefltStrmPT::Strm, &flag)); + if (flag != 1) { + INFO("Expected flag value: 1, Received flag value: %u\n" << flag); + REQUIRE(false); + } + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); +} + +void DefaultPT2_StrmGetPriority() { + int low, high, ObsrvdPriority; + HIP_CHECK(hipDeviceGetStreamPriorityRange(&low, &high)); + INFO("Lowest possible priority: %d\n" << low); + INFO("Highest possible priority: %d\n" << high); + INFO("Creating streams with flag hipStreamNonBlocking\n"); + // hipStrmFlg = 0 = hipStreamDefault + // hipStrmFlg = 1 = hipStreamNonBlocking + for (int hipStrmFlg = 0; hipStrmFlg < 2; ++hipStrmFlg) { + for (int Priority = low; Priority <= high; ++Priority) { + if (hipStrmFlg == 0) { + HIP_CHECK(hipStreamCreateWithPriority(&(DefltStrmPT::Strm), + hipStreamDefault, Priority)); + } else { + HIP_CHECK(hipStreamCreateWithPriority(&(DefltStrmPT::Strm), + hipStreamNonBlocking, Priority)); + } + HIP_CHECK(hipStreamGetPriority(DefltStrmPT::Strm, &ObsrvdPriority)); + if (ObsrvdPriority != Priority) { + INFO("Expected priority: %d" << Priority << " Observed Priority: %d\n" + << ObsrvdPriority); + INFO("Test Failed!\n\n"); + REQUIRE(false); + } + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); + } + } + INFO("Checking priority on null stream!!\n"); + HIP_CHECK(hipStreamGetPriority(0, &ObsrvdPriority)); + if (ObsrvdPriority != 0) { + INFO("Expected priority: 0, Observed Priority: %d\n" + << ObsrvdPriority); + INFO("Test Failed!\n\n"); + REQUIRE(false); + } +} + + +void DefaultPT2_hipMemcpyFromSymbol() { + int64_t *Hst = nullptr; + HIP_CHECK(hipHostMalloc(&(Hst), DefltStrmPT::OneMBSz)); + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + for (int i = 0; i < DefltStrmPT::OneMB; ++i) { + Hst[i] = DefltStrmPT::CONST; + } + HIP_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(globalInDStrmPT), Hst, + DefltStrmPT::OneMBSz, 0, hipMemcpyHostToDevice)); + for (int i = 0; i < DefltStrmPT::OneMB; ++i) { + Hst[i] = 0; + } + HIP_CHECK(hipMemcpyFromSymbol(Hst, HIP_SYMBOL(globalInDStrmPT), + DefltStrmPT::OneMBSz, 0, hipMemcpyDeviceToHost)); + for (int i = 0; i < DefltStrmPT::OneMB; ++i) { + if (Hst[i] != DefltStrmPT::CONST) { + REQUIRE(false); + } + } + HIP_CHECK(hipHostFree(Hst)); + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); +} + +void DefaultPT2_hipMemcpy2D(int Async) { + DefltStrmPT::numH = 1024; + DefltStrmPT::numW = 1024; + DefltStrmPT::width = DefltStrmPT::numW * sizeof(int64_t); + HIP_CHECK(hipHostMalloc(&(DefltStrmPT::HstA), + (DefltStrmPT::numH * DefltStrmPT::numW * sizeof(int64_t)))); + HIP_CHECK(hipHostMalloc(&(DefltStrmPT::HstRes), + (DefltStrmPT::numH * DefltStrmPT::numW * sizeof(int64_t)))); + DefltStrmPT::width = DefltStrmPT::numW * sizeof(int64_t); + for (size_t row = 0; row < DefltStrmPT::numH; ++row) { + for (size_t column = 0; column < DefltStrmPT::numW; ++column) { + DefltStrmPT::HstA[(row * DefltStrmPT::numW) + column] = + DefltStrmPT::CONST; + DefltStrmPT::HstRes[(row * DefltStrmPT::numW) + column] = 0; + } + } + HIP_CHECK(hipMallocPitch(reinterpret_cast(&(DefltStrmPT::DevA)), + &(DefltStrmPT::pitch_A), DefltStrmPT::width, DefltStrmPT::numH)); + if (Async) { + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + HIP_CHECK(hipMemcpy2DAsync(DefltStrmPT::DevA, DefltStrmPT::pitch_A, + DefltStrmPT::HstA, DefltStrmPT::numW*sizeof(int64_t), + DefltStrmPT::numW*sizeof(int64_t), DefltStrmPT::numH, + hipMemcpyHostToDevice, DefltStrmPT::Strm)); + HIP_CHECK(hipMemcpy2DAsync(DefltStrmPT::HstRes, + DefltStrmPT::numW*sizeof(int64_t), + DefltStrmPT::DevA, DefltStrmPT::pitch_A, + DefltStrmPT::numW*sizeof(int64_t), DefltStrmPT::numH, + hipMemcpyDeviceToHost, DefltStrmPT::Strm)); + HIP_CHECK(hipStreamSynchronize(DefltStrmPT::Strm)); + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); + } else { + HIP_CHECK(hipMemcpy2D(DefltStrmPT::DevA, DefltStrmPT::pitch_A, + DefltStrmPT::HstA, DefltStrmPT::numW*sizeof(int64_t), + DefltStrmPT::numW*sizeof(int64_t), DefltStrmPT::numH, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy2D(DefltStrmPT::HstRes, + DefltStrmPT::numW*sizeof(int64_t), DefltStrmPT::DevA, + DefltStrmPT::pitch_A, DefltStrmPT::numW*sizeof(int64_t), + DefltStrmPT::numH, hipMemcpyDeviceToHost)); + } + for (size_t row = 0; row < DefltStrmPT::numH; ++row) { + for (size_t column = 0; column < DefltStrmPT::numW; ++column) { + if (DefltStrmPT::HstRes[(row * DefltStrmPT::numW) + column] + != DefltStrmPT::CONST) { + REQUIRE(false); + } + } + } + HIP_CHECK(hipFree(DefltStrmPT::DevA)); + HIP_CHECK(hipHostFree(DefltStrmPT::HstA)); +} + + +void DefaultPT2_hipMemcpy2DToArray() { + hipArray *Dptr = nullptr; + float *Hptr = nullptr, *HRes = nullptr; + DefltStrmPT::numH = 1024; + DefltStrmPT::numW = 1024; + DefltStrmPT::width = DefltStrmPT::numW * sizeof(float); + Hptr = new float[DefltStrmPT::width * DefltStrmPT::numH]; + HRes = new float[DefltStrmPT::width * DefltStrmPT::numH]; + for (size_t i = 0; i < DefltStrmPT::width * DefltStrmPT::numH; ++i) { + Hptr[i] = DefltStrmPT::CONST; + } + hipChannelFormatDesc desc = hipCreateChannelDesc(); + HIP_CHECK(hipMallocArray(&(Dptr), &desc, DefltStrmPT::numW, + DefltStrmPT::numH, hipArrayDefault)); + HIP_CHECK(hipMemcpy2DToArray(Dptr, 0, 0, Hptr, DefltStrmPT::width, + DefltStrmPT::width, DefltStrmPT::numH, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy2DFromArray(HRes, DefltStrmPT::width, Dptr, 0, 0, + DefltStrmPT::width, DefltStrmPT::numH, hipMemcpyDeviceToHost)); + // verifying the result + for (size_t i = 0; i < DefltStrmPT::numW * DefltStrmPT::numH; ++i) { + if (HRes[i] != DefltStrmPT::CONST) { + REQUIRE(false); + } + } + delete[] Hptr; + delete[] HRes; + HIP_CHECK(hipFreeArray(Dptr)); +} + + +float DefaultPT2_hipMemcpy2DFromArray() { + hipArray *Dptr = nullptr; + float *Hptr_A = nullptr, *Hptr_B = nullptr; + DefltStrmPT::numH = 1024; + DefltStrmPT::numW = 1024; + DefltStrmPT::width = DefltStrmPT::numW * sizeof(float); + HIP_CHECK(hipDeviceGetAttribute(&(DefltStrmPT::clockrate), + hipDeviceAttributeMemoryClockRate, 0)); + HIP_CHECK(hipHostMalloc(&(Hptr_A), + (DefltStrmPT::width * DefltStrmPT::numH * sizeof(float)))); + HIP_CHECK(hipHostMalloc(&(Hptr_B), + (DefltStrmPT::width * DefltStrmPT::numH * sizeof(float)))); + for (size_t i = 0; i < (DefltStrmPT::width * DefltStrmPT::numH); ++i) { + Hptr_A[i] = DefltStrmPT::CONST; + } + hipChannelFormatDesc desc = hipCreateChannelDesc(); + HIP_CHECK(hipMallocArray(&(Dptr), &desc, DefltStrmPT::numW, + DefltStrmPT::numH, hipArrayDefault)); + HIP_CHECK(hipStreamCreate(&(DefltStrmPT::Strm))); + HIP_CHECK(hipMemcpy2DToArray(Dptr, 0, 0, Hptr_A, DefltStrmPT::width, + DefltStrmPT::width, DefltStrmPT::numH, + hipMemcpyHostToDevice)); + Wait_Kernel3 <<< 1, 1, 0, DefltStrmPT::Strm >>> (DefltStrmPT::clockrate, + 1); + HIP_CHECK(hipMemcpy2DFromArray(Hptr_B, DefltStrmPT::width, Dptr, 0, 0, + DefltStrmPT::width, DefltStrmPT::numH, hipMemcpyDeviceToHost)); + HIP_CHECK(hipStreamDestroy(DefltStrmPT::Strm)); + HIP_CHECK(hipFreeArray(Dptr)); + HIP_CHECK(hipHostFree(Hptr_A)); + HIP_CHECK(hipHostFree(Hptr_B)); + return true; +} + +void DefaultPT2_hipMemcpy3D() { + int width = 8, height = 8, depth = 8; + unsigned int size = width * height * depth * sizeof(float); + float* Hptr = reinterpret_cast(malloc(size)); + float* HRes = reinterpret_cast(malloc(size)); + memset(Hptr, 0, size); + memset(HRes, 0, size); + hipExtent extent = make_hipExtent(width, height, depth); + + for (int i = 0; i < depth; i++) { + for (int j = 0; j < height; j++) { + for (int k = 0; k < width; k++) { + Hptr[i*width*height + j*width +k] = i*width*height + j*width + k; + } + } + } + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(float)*8, 0, + 0, 0, hipChannelFormatKindFloat); + hipArray *arr; + + HIP_CHECK(hipMalloc3DArray(&arr, &channelDesc, + make_hipExtent(width, height, depth), hipArrayDefault)); + hipMemcpy3DParms myparms{0, {0, 0, 0}, {0, 0, 0, 0}, 0, {0, 0, 0}, + {0, 0, 0, 0}, {0, 0, 0}, hipMemcpyDefault}; + myparms.srcPos = make_hipPos(0, 0, 0); + myparms.dstPos = make_hipPos(0, 0, 0); + myparms.srcPtr = make_hipPitchedPtr(Hptr, width * sizeof(float), width, + height); + myparms.dstArray = arr; + myparms.extent = extent; + +#ifdef __HIP_PLATFORM_NVIDIA__ + myparms.kind = cudaMemcpyHostToDevice; +#else + myparms.kind = hipMemcpyHostToDevice; +#endif + // Host to Device copy + HIP_CHECK(hipMemcpy3D(&myparms)); + + // Device to Host copy + memset(&myparms, 0x0, sizeof(hipMemcpy3DParms)); + myparms.srcPos = make_hipPos(0, 0, 0); + myparms.dstPos = make_hipPos(0, 0, 0); + myparms.dstPtr = make_hipPitchedPtr(HRes, width * sizeof(float), width, + height); + myparms.srcArray = arr; + myparms.extent = extent; +#ifdef __HIP_PLATFORM_NVIDIA__ + myparms.kind = cudaMemcpyDeviceToHost; +#else + myparms.kind = hipMemcpyDeviceToHost; +#endif + HIP_CHECK(hipMemcpy3D(&myparms)); + + for (int i = 0; i < depth; i++) { + for (int j = 0; j < height; j++) { + for (int k = 0; k < width; k++) { + if (HRes[i*width*height + j*width +k] != i*width*height + j*width + k) { + REQUIRE(false); + } + } + } + } + HIP_CHECK(hipFreeArray(arr)); + free(Hptr); + free(HRes); +} + + +TEST_CASE("Unit_hipStrmPerThrdDefault") { + SECTION("Testing hipMemset/Memcpy() and their async version") { + REQUIRE(DefaultPT2_Memcpy_MemSet(1, 0)); + REQUIRE(DefaultPT2_Memcpy_MemSet(1, 1)); + REQUIRE(DefaultPT2_Memcpy_MemSet(0, 1)); + REQUIRE(DefaultPT2_Memcpy_MemSet(0, 0)); + } + SECTION("Testing hipMemset2D() and its async version") { + DefaultPT2_Memset2D(0); + DefaultPT2_Memset2D(1); + } + + SECTION("Testing_hipMemset3D() and its async version") { + PerThrdDefltStrm_Memset3D(0); + PerThrdDefltStrm_Memset3D(1); + } + + SECTION("Testing_hipStreamQuery()") { + DefaultPT2_StrmQuery(); + } + + SECTION("Testing_hipStreamSynchronize()") { + DefaultPT2_StreamSync(); + } + + SECTION("Testing_hipLaunchKernel()") { + // launch with null stream + Default_LaunchKernel(1); + // launch with user created stream + Default_LaunchKernel(0); + } + + hipDeviceProp_t deviceProp; + HIP_CHECK(hipGetDeviceProperties(&deviceProp, 0)); + if (deviceProp.cooperativeLaunch) { + SECTION("Testing_hipLaunchCooperativeKernel()") { + // launching hipLaunchCooperativeKernel() with Null stream + DefaultPT2_LaunchCooperativeKernel(1); + // launching hipLaunchCooperativeKernel() with user created stream + DefaultPT2_LaunchCooperativeKernel(0); + } + } else { + INFO("Cooperative Launch feature is not supported, therefore skipping"); + INFO(" the test Testing_hipLaunchCooperativeKernel()"); + } + + SECTION("Testing_StrmWaitEvent()") { + DefaultPT2_StrmWaitEvent(); + } + + SECTION("Testing_hipStreamGetFlag()") { + DefaultPT2_StrmGetFlag(); + } + + SECTION("Testing_hipStreamGetPriority()") { + DefaultPT2_StrmGetPriority(); + } + + SECTION("Testing_hipMemcpyFrom/To/Symbol()") { + DefaultPT2_hipMemcpyFromSymbol(); + } + + SECTION("Testing_hipMemcpy2D() & its Async version") { + DefaultPT2_hipMemcpy2D(0); + DefaultPT2_hipMemcpy2D(1); + } + + SECTION("Testing_hipMemcpy2DToArray()") { + DefaultPT2_hipMemcpy2DToArray(); + } + + SECTION("Testing_hipMemcpy3D()") { + DefaultPT2_hipMemcpy3D(); + } + + SECTION("Testing_hipEventQuery()") { + DefaultPT2_EvtQuery(); + } +}