diff --git a/catch/unit/streamperthread/hipStreamPerThrdTsts.cc b/catch/unit/streamperthread/hipStreamPerThrdTsts.cc index 418e5332ab..95f5c27fd5 100644 --- a/catch/unit/streamperthread/hipStreamPerThrdTsts.cc +++ b/catch/unit/streamperthread/hipStreamPerThrdTsts.cc @@ -155,9 +155,7 @@ __global__ void test_gwsPerThrd(uint* buf, uint bufSize, int64_t* tmpBuf, } } #endif -static const uint BufferSizeInDwords = 256 * 1024 * 1024; -static constexpr uint NumKernelArgs = 4; -static constexpr uint MaxGPUs = 8; + // callback function static void HIPRT_CB CallBackFunctn(hipStream_t strm, hipError_t err, void *ChkVal) { @@ -472,126 +470,3 @@ TEST_CASE("Unit_hipStreamPerThread_CoopLaunch") { } } } - -/* Testing hipLaunchCooperativeKernelMultiDevice() with hipStreamPerThread*/ -#if HT_AMD -TEST_CASE("Unit_hipStreamPerThread_CoopLaunchMDev") { - uint* dA[MaxGPUs]; - int64_t* dB[MaxGPUs]; - int64_t* dC; - - uint32_t* init = new uint32_t[BufferSizeInDwords]; - for (uint32_t i = 0; i < BufferSizeInDwords; ++i) { - init[i] = i; - } - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - size_t copySizeInDwords = BufferSizeInDwords / nGpu; - hipDeviceProp_t deviceProp[MaxGPUs]; - - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - // Calculate the device occupancy to know how many blocks can be - // run concurrently - HIP_CHECK(hipGetDeviceProperties(&deviceProp[i], 0)); - if (!deviceProp[i].cooperativeMultiDeviceLaunch) { - WARN("Device doesn't support cooperative launch!"); - return; - } - size_t SIZE = copySizeInDwords * sizeof(uint); - - HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dA[i]), SIZE)); - HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dB[i]), - 64 * deviceProp[i].multiProcessorCount * sizeof(int64_t))); - if (i == 0) { - HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(&dC), - (nGpu + 1) * sizeof(int64_t))); - } - HIPCHECK(hipMemcpy(dA[i], &init[i * copySizeInDwords] , SIZE, - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - } - - dim3 dimBlock; - dim3 dimGrid; 
- dimGrid.x = 1; - dimGrid.y = 1; - dimGrid.z = 1; - dimBlock.x = 64; - dimBlock.y = 1; - dimBlock.z = 1; - - int numBlocks = 0; - uint workgroups[3] = {64, 128, 256}; - - hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; - std::time_t end_time; - double time = 0; - for (uint set = 0; set < 3; ++set) { - void* args[MaxGPUs * NumKernelArgs]; - WARN("---------- Test#" << set << ", size: "<< BufferSizeInDwords << - " dwords ---------------\n"); - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - dimBlock.x = workgroups[set]; - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, - test_gwsPerThrd, dimBlock.x * dimBlock.y * dimBlock.z, - dimBlock.x * sizeof(int64_t))); - - WARN("GPU(" << i << ") Block size: " << dimBlock.x << - " Num blocks per CU: " << numBlocks << "\n"); - - dimGrid.x = deviceProp[i].multiProcessorCount * (std::min)(numBlocks, 32); - - args[i * NumKernelArgs] = reinterpret_cast<void*>(&dA[i]); - args[i * NumKernelArgs + 1] = reinterpret_cast<void*>(&copySizeInDwords); - args[i * NumKernelArgs + 2] = reinterpret_cast<void*>(&dB[i]); - args[i * NumKernelArgs + 3] = reinterpret_cast<void*>(&dC); - - launchParamsList[i].func = reinterpret_cast<void*>(test_gwsPerThrd); - launchParamsList[i].gridDim = dimGrid; - launchParamsList[i].blockDim = dimBlock; - launchParamsList[i].sharedMem = dimBlock.x * sizeof(int64_t); - - launchParamsList[i].stream = hipStreamPerThread; - launchParamsList[i].args = &args[i * NumKernelArgs]; - } - - system_clock::time_point start = system_clock::now(); - HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0)); - for (int i = 0; i < nGpu; i++) { - HIP_CHECK(hipSetDevice(i)); - HIP_CHECK(hipDeviceSynchronize()); - } - system_clock::time_point end = system_clock::now(); - std::chrono::duration<double> elapsed_seconds = end - start; - end_time = std::chrono::system_clock::to_time_t(end); - - time += elapsed_seconds.count(); - - size_t processedDwords = copySizeInDwords * nGpu; - if (*dC != (((int64_t)(processedDwords) 
* (processedDwords - 1)) / 2)) { - WARN("Data validation failed ("<< *dC << " != " << - (((int64_t)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2) << - ") for grid size = " << dimGrid.x << " and block size = " << - dimBlock.x << "\n"); - WARN("Test failed!"); - } - } - - delete [] launchParamsList; - - WARN("finished computation at " << std::ctime(&end_time)); - WARN("elapsed time: " << time << "s\n"); - - HIP_CHECK(hipSetDevice(0)); - HIP_CHECK(hipFree(dC)); - for (int i = 0; i < nGpu; i++) { - HIP_CHECK(hipFree(dA[i])); - HIP_CHECK(hipFree(dB[i])); - } - delete [] init; -} -#endif