diff --git a/projects/hip-tests/catch/stress/CMakeLists.txt b/projects/hip-tests/catch/stress/CMakeLists.txt index 681f7e95c4..8aea253b33 100644 --- a/projects/hip-tests/catch/stress/CMakeLists.txt +++ b/projects/hip-tests/catch/stress/CMakeLists.txt @@ -1,7 +1,13 @@ add_custom_target(build_stress_test) -add_executable(StressTest EXCLUDE_FROM_ALL ../hipTestMain/main.cc) +add_executable(StressTest EXCLUDE_FROM_ALL ../hipTestMain/main.cc ../hipTestMain/hip_test_context.cc) +set_property(TARGET StressTest PROPERTY CXX_STANDARD 17) +target_link_libraries(StressTest PRIVATE stdc++fs) add_dependencies(build_stress_test StressTest) add_custom_target(stress_test COMMAND StressTest) -#add_dependencies(stress_test build_stress_test) add_subdirectory(memory) -target_link_libraries(StressTest PRIVATE memory ht_context) +if(HIP_PLATFORM MATCHES "amd") +add_subdirectory(printf) +add_subdirectory(stream) +target_link_libraries(StressTest PRIVATE printf stream) +endif() +target_link_libraries(StressTest PRIVATE memory) diff --git a/projects/hip-tests/catch/stress/printf/CMakeLists.txt b/projects/hip-tests/catch/stress/printf/CMakeLists.txt new file mode 100644 index 0000000000..4a7a915658 --- /dev/null +++ b/projects/hip-tests/catch/stress/printf/CMakeLists.txt @@ -0,0 +1,11 @@ +# Common Tests - Test independent of all platforms +set(TEST_SRC + Stress_printf_ComplexKernels.cc + Stress_printf_SimpleKernels.cc +) + +# Create shared lib of all tests +add_library(printf SHARED EXCLUDE_FROM_ALL ${TEST_SRC}) + +# Add dependency on build_tests to build it on this custom target +add_dependencies(build_stress_test printf) diff --git a/projects/hip-tests/catch/stress/printf/Stress_printf_ComplexKernels.cc b/projects/hip-tests/catch/stress/printf/Stress_printf_ComplexKernels.cc new file mode 100644 index 0000000000..59c7f3ecfa --- /dev/null +++ b/projects/hip-tests/catch/stress/printf/Stress_printf_ComplexKernels.cc @@ -0,0 +1,511 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#ifdef __linux__ +#include "printf_common.h" +#endif +#include + +#define MAX_BLOCK_SIZE 523 +#define MAX_GRID_SIZE 503 +#define CHUNK_SIZE 1024 +#define NUM_STREAM 4 +#define CONST_WEIGHTING_FACT1 7 +#define CONST_WEIGHTING_FACT2 5 + +namespace hipPrintfStressTest { +struct printInfo { + uint32_t printSizeinBytes, lineCount; +}; + +__device__ __host__ struct printInfo startPrint(uint32_t tid, + uint32_t iterCount, uint32_t *a, uint32_t *b) { + uint32_t printSize = 0; + uint32_t lineCount = 0; + // The 2nd modulus operand is arbitrarily chosen as 7 below to + // diversify the printf output as much as possible while also being + // a prime number. This number is fixed to 7 and should not be changed. + uint32_t mod = tid % 7; + // Perform some calculations and print the values. + uint32_t uiresult; + int32_t iresult; + float fresult; + for (uint32_t count = 0; count < iterCount; count++) { + if (0 == mod) { + // Perform Vector Multiplication a(i)*b(i) + // Print both tid and result + uiresult = a[tid]*b[tid]; + printSize += + printf("tid %u: Value of result=%u or %x\n", + tid, uiresult, uiresult); + lineCount++; + } else if (1 == mod) { + // Perform Array Addition a(i) + b(i) + // Print both tid and result + uiresult = a[tid] + b[tid]; + printSize += + printf("tid %u: Value of result=%u or %x \n", + tid, uiresult, uiresult); + lineCount++; + } else if (2 == mod) { + // Perform Array Subtraction a(i) - b(i) + // Print both tid and result (as both int, uint) + iresult = a[tid] - b[tid]; + printSize += + printf("tid %u: Value of result=%d or %x\n", + tid, iresult, iresult); + lineCount++; + } else if (3 == mod) { + // Perform Sum of Squares a(i)*a(i) + b(i)*b(i) + // Print both tid and result + uiresult = a[tid]*a[tid] + b[tid]*b[tid]; + printSize += + printf("tid %u: Value of result=%u or %x\n", + tid, uiresult, uiresult); + lineCount++; + } else if (4 == mod) { + // Perform (a(i)*a(i) + b(i)*b(i))/a(i)*b(i) + // Print both tid and result (in float upto 2 decimal precision) + fresult = (a[tid]*a[tid] + b[tid]*b[tid])/(a[tid]*b[tid]); + printSize += + printf("tid %u: Value of result[%d] = %.2f or %.2e\n", + tid, tid, fresult, fresult); + lineCount++; + } else if (5 == mod) { + // Perform (a(i)*a(i) - b(i)*b(i))/a(i)*b(i) + // Print both tid and result (in float upto 4 decimal precision) + fresult = (a[tid]*a[tid] - b[tid]*b[tid])/(a[tid]*b[tid]); + printSize += + printf("tid %u: Value of result[%d] = %.4f or %.4e \n", + tid, tid, fresult, fresult); + lineCount++; + } else if (6 == mod) { + // Perform (a(i)*a(i) + b(i)*b(i))/(a(i)*a(i) - b(i)*b(i)) + // Print both tid and result (in float upto 6 decimal precision) + fresult = (a[tid]*a[tid] + b[tid]*b[tid])/ + (a[tid]*a[tid] - b[tid]*b[tid]); + printSize += + printf("tid %u: Value of result[%d] = %.6f or %.6e \n", + tid, tid, fresult, fresult); + lineCount++; + } + // Print a random character string of variable size + // and number. + const char* msg; + for (int i = 0; i < 12; i++) { + int imod = (i % 6); + if (0 == imod) { + msg = "jhwehde2hl"; + } else if (1 == imod) { + msg = "jhwehde2hlmc,prmlsl4"; + } else if (2 == imod) { + msg = "xkdojdewnd34dMMnl2o4AAdeBEjbX0"; + } else if (3 == imod) { + msg = "mcropkaA234dmelmfhja44ndalomkfokdMDFK328"; + } else if (4 == imod) { + msg = + "udnekc8939MDkdnjj3knsdlmnekdlgJNls328419i905409dfm"; + } else if (5 == imod) { + msg = + "lfjweknm4349u34sdlk09j3mAADDSDkeffe575675fdvfLKMWMORMFREKLkl"; + } + printSize += printf("tid %u: %s imod = %d \n", tid, msg, imod); + lineCount++; + } + // Print a long string with data + msg = + "jheku83290dnmnd##9u9BJKHFJLKsMMMMdkejwejjj232indnfdmsnndnsdn****bsXxZz"; + float pi = 3.141592; + uint32_t unum = 123456789; + int32_t inum = -123456789; + printSize += + printf("%s,%d,%s,%u,%s,%x,%s,%f,%s,%e\n", + msg, inum, msg, unum, msg, unum, msg, pi, msg, pi); + lineCount++; + // Print different data types using different specifiers + float fmaxvalue = std::numeric_limits::max(); + float fminvalue = std::numeric_limits::min(); + double dmaxvalue = std::numeric_limits::max(); + double dminvalue = std::numeric_limits::min(); + printSize += + printf("%f, %f, %e, %e \n", fmaxvalue, fminvalue, fmaxvalue, fminvalue); + printSize += + printf("%f, %f, %e, %e \n", dmaxvalue, dminvalue, dmaxvalue, dminvalue); + printSize += + printf("%a, %a, %A, %A \n", fmaxvalue, fminvalue, fmaxvalue, fminvalue); + printSize += + printf("%a, %a, %A, %A \n", dmaxvalue, dminvalue, dmaxvalue, dminvalue); + lineCount+=4; + size_t size_tmaxvalue = std::numeric_limits::max(); + size_t size_tminvalue = std::numeric_limits::min(); + long long llmaxvalue = std::numeric_limits::max(); + long long llminvalue = std::numeric_limits::min(); + unsigned long long ullmaxvalue = + std::numeric_limits::max(); + unsigned long long ullminvalue = + std::numeric_limits::min(); + long lmaxvalue = std::numeric_limits::max(); + long lminvalue = std::numeric_limits::min(); + unsigned long ulmaxvalue = std::numeric_limits::max(); + unsigned long ulminvalue = std::numeric_limits::min(); + short smaxvalue = std::numeric_limits::max(); + short sminvalue = std::numeric_limits::min(); + unsigned short usmaxvalue = std::numeric_limits::max(); + unsigned short usminvalue = std::numeric_limits::min(); + char cmaxvalue = std::numeric_limits::max(); + char cminvalue = std::numeric_limits::min(); + unsigned char ucmaxvalue = std::numeric_limits::max(); + unsigned char ucminvalue = std::numeric_limits::min(); + int32_t imaxvalue = std::numeric_limits::max(); + int32_t iminvalue = std::numeric_limits::min(); + uint32_t uimaxvalue = std::numeric_limits::max(); + uint32_t uiminvalue = std::numeric_limits::min(); + printSize += + printf("%zu, %zu, %lli, %lli, %llu, %llu, %li, %li, %lu, %lu\n", + size_tmaxvalue, size_tminvalue, llmaxvalue, llminvalue, + ullmaxvalue, ullminvalue, lmaxvalue, lminvalue, + ulmaxvalue, ulminvalue); + printSize += + printf("%zx, %zx, %llx, %llx, %llx, %llx, %lx, %lx, %lx, %lx\n", + size_tmaxvalue, size_tminvalue, llmaxvalue, llminvalue, + ullmaxvalue, ullminvalue, lmaxvalue, lminvalue, + ulmaxvalue, ulminvalue); + printSize += + printf("%zX, %zX, %llX, %llX, %llX, %llX, %lX, %lX, %lX, %lX\n", + size_tmaxvalue, size_tminvalue, llmaxvalue, llminvalue, + ullmaxvalue, ullminvalue, lmaxvalue, lminvalue, + ulmaxvalue, ulminvalue); + printSize += + printf("%zo, %zo, %llo, %llo, %llo, %llo, %lo, %lo, %lo, %lo\n", + size_tmaxvalue, size_tminvalue, llmaxvalue, llminvalue, + ullmaxvalue, ullminvalue, lmaxvalue, lminvalue, + ulmaxvalue, ulminvalue); + printSize += + printf("%hd, %hd, %hu, %hu, %hhd, %hhd, %hhu, %hhu, %d, %d, %u, %u\n", + smaxvalue, sminvalue, usmaxvalue, usminvalue, + cmaxvalue, cminvalue, ucmaxvalue, ucminvalue, + imaxvalue, iminvalue, uimaxvalue, uiminvalue); + printSize += + printf("%hx, %hx, %hx, %hx, %hhx, %hhx, %hhx, %hhx, %x, %x, %x, %x\n", + smaxvalue, sminvalue, usmaxvalue, usminvalue, + cmaxvalue, cminvalue, ucmaxvalue, ucminvalue, + imaxvalue, iminvalue, uimaxvalue, uiminvalue); + printSize += + printf("%hX, %hX, %hX, %hX, %hhX, %hhX, %hhX, %hhX, %X, %X, %X, %X\n", + smaxvalue, sminvalue, usmaxvalue, usminvalue, + cmaxvalue, cminvalue, ucmaxvalue, ucminvalue, + imaxvalue, iminvalue, uimaxvalue, uiminvalue); + printSize += + printf("%ho, %ho, %ho, %ho, %hho, %hho, %hho, %hho, %o, %o, %o, %o\n", + smaxvalue, sminvalue, usmaxvalue, usminvalue, + cmaxvalue, cminvalue, ucmaxvalue, ucminvalue, + imaxvalue, iminvalue, uimaxvalue, uiminvalue); + printSize += + printf("%c, %c, %c, %c\n", cmaxvalue, cminvalue, ucmaxvalue, ucminvalue); + lineCount+=9; + } + struct printInfo pInfo = {printSize, lineCount}; + return pInfo; +} +// This kernel is launched only in X dimension +__global__ void kernel_complex_opX(uint32_t *a, uint32_t *b, + uint32_t iterCount) { + uint32_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + startPrint(tid, iterCount, a, b); +} +// This kernel is launched only in Y dimension +__global__ void kernel_complex_opY(uint32_t *a, uint32_t *b, + uint32_t iterCount) { + uint32_t tid = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y; + startPrint(tid, iterCount, a, b); +} +// This kernel is launched only in Z dimension +__global__ void kernel_complex_opZ(uint32_t *a, uint32_t *b, + uint32_t iterCount) { + uint32_t tid = hipThreadIdx_z + hipBlockIdx_z * hipBlockDim_z; + startPrint(tid, iterCount, a, b); +} +#ifdef __linux__ +// Performs printf stress test on a single GPU using multiple streams. +bool test_printf_multistream(uint32_t num_blocks, + uint32_t threads_per_block, + uint32_t iterCount) { + uint32_t buffsize = num_blocks*threads_per_block; + size_t actualFileSize = 0; + uint32_t totalActualLinecount = 0; + uint32_t *Ah, *Bh; + uint32_t *Ad, *Bd; + Ah = new uint32_t[buffsize]; + Bh = new uint32_t[buffsize]; + for (uint32_t i = 0; i < buffsize; i++) { + Ah[i] = i + 1; + Bh[i] = buffsize - i; + } + HIP_CHECK(hipMalloc(&Ad, buffsize*sizeof(uint32_t))); + HIP_CHECK(hipMalloc(&Bd, buffsize*sizeof(uint32_t))); + HIP_CHECK(hipMemcpy(Ad, Ah, buffsize*sizeof(uint32_t), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(Bd, Bh, buffsize*sizeof(uint32_t), + hipMemcpyHostToDevice)); + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + { + CaptureStream captured(stdout); + hipStream_t stream[NUM_STREAM]; + for (int i = 0; i < NUM_STREAM; i++) { + HIP_CHECK(hipStreamCreate(&stream[i])); + hipLaunchKernelGGL(kernel_complex_opX, dim3(num_blocks, 1, 1), + dim3(threads_per_block, 1, 1), + 0, stream[i], Ad, Bd, iterCount); + hipLaunchKernelGGL(kernel_complex_opY, dim3(1, num_blocks, 1), + dim3(1, threads_per_block, 1), + 0, stream[i], Ad, Bd, iterCount); + hipLaunchKernelGGL(kernel_complex_opZ, dim3(1, 1, num_blocks), + dim3(1, 1, threads_per_block), + 0, stream[i], Ad, Bd, iterCount); + } + HIP_CHECK(hipDeviceSynchronize()); + for (int i = 0; i < NUM_STREAM; i++) { + HIP_CHECK(hipStreamDestroy(stream[i])); + } + std::ifstream CapturedData = captured.getCapturedData(); + char *buffer = new char[CHUNK_SIZE]; + while (CapturedData.good()) { + CapturedData.getline(buffer, CHUNK_SIZE); + totalActualLinecount++; + } + delete[] buffer; + struct stat st; + if (stat(captured.getTempFilename(), &st)) { + printf("Temp File not found \n"); + return false; + } + actualFileSize = st.st_size; + } + struct printInfo pInfo; + size_t estimatedPrintSize = 0; + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + uint32_t lop = 0; + { + CaptureStream captured(stdout); + for (int j = 0; j < NUM_STREAM; j++) { + for (uint32_t tid = 0; tid < (buffsize); tid++) { + pInfo = startPrint(tid, iterCount, Ah, Bh); + lop += pInfo.lineCount; + estimatedPrintSize += pInfo.printSizeinBytes; + } + for (uint32_t tid = 0; tid < (buffsize); tid++) { + pInfo = startPrint(tid, iterCount, Ah, Bh); + lop += pInfo.lineCount; + estimatedPrintSize += pInfo.printSizeinBytes; + } + for (uint32_t tid = 0; tid < (buffsize); tid++) { + pInfo = startPrint(tid, iterCount, Ah, Bh); + lop += pInfo.lineCount; + estimatedPrintSize += pInfo.printSizeinBytes; + } + } + } + printf("estimatedPrintSize = %zu, actualFileSize = %zu\n", + estimatedPrintSize, actualFileSize); + printf("estimatedLinesPrinted = %u, actualLinesPrinted = %u\n", + lop, totalActualLinecount-1); + HIP_CHECK(hipFree(Bd)); + HIP_CHECK(hipFree(Ad)); + delete[] Bh; + delete[] Ah; + if ((estimatedPrintSize != actualFileSize)|| + (lop != (totalActualLinecount-1))) { + return false; + } + return true; +} + +bool test_printf_multigpu(int gpu, + uint32_t num_blocks, + uint32_t threads_per_block, + uint32_t iterCount, + size_t *actualFileSize, + uint32_t *totalActualLinecount) { + uint32_t buffsize = num_blocks*threads_per_block; + uint32_t *Ah, *Bh; + uint32_t *Ad, *Bd; + HIP_CHECK(hipSetDevice(gpu)); + Ah = new uint32_t[buffsize]; + Bh = new uint32_t[buffsize]; + for (uint32_t i = 0; i < buffsize; i++) { + Ah[i] = i + 1; + Bh[i] = buffsize - i; + } + HIP_CHECK(hipMalloc(&Ad, buffsize*sizeof(uint32_t))); + HIP_CHECK(hipMalloc(&Bd, buffsize*sizeof(uint32_t))); + HIP_CHECK(hipMemcpy(Ad, Ah, buffsize*sizeof(uint32_t), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(Bd, Bh, buffsize*sizeof(uint32_t), + hipMemcpyHostToDevice)); + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + { + CaptureStream captured(stdout); + hipLaunchKernelGGL(kernel_complex_opX, dim3(num_blocks, 1, 1), + dim3(threads_per_block, 1, 1), + 0, 0, Ad, Bd, iterCount); + hipLaunchKernelGGL(kernel_complex_opY, dim3(1, num_blocks, 1), + dim3(1, threads_per_block, 1), + 0, 0, Ad, Bd, iterCount); + hipLaunchKernelGGL(kernel_complex_opZ, dim3(1, 1, num_blocks), + dim3(1, 1, threads_per_block), + 0, 0, Ad, Bd, iterCount); + HIP_CHECK(hipDeviceSynchronize()); + std::ifstream CapturedData = captured.getCapturedData(); + char *buffer = new char[CHUNK_SIZE]; + while (CapturedData.good()) { + CapturedData.getline(buffer, CHUNK_SIZE); + *totalActualLinecount += 1; + } + delete[] buffer; + struct stat st; + if (stat(captured.getTempFilename(), &st)) { + printf("Temp File not found \n"); + return false; + } + *actualFileSize += st.st_size; + } + HIP_CHECK(hipFree(Bd)); + HIP_CHECK(hipFree(Ad)); + delete[] Bh; + delete[] Ah; + *totalActualLinecount -= 1; // Removing Empty Line + HIP_CHECK(hipSetDevice(0)); + return true; +} + +// Performs printf stress test on all GPUs present in the system. +bool testPrintfMultGPU(int numOfGPUs, + uint32_t num_blocks, + uint32_t threads_per_block, + uint32_t iterCount) { + uint32_t buffsize = num_blocks*threads_per_block; + size_t actualFileSize = 0; + uint32_t totalActualLinecount = 0; + for (int gpu = 0; gpu < numOfGPUs; gpu++) { + test_printf_multigpu(gpu, num_blocks, threads_per_block, + iterCount, &actualFileSize, &totalActualLinecount); + } + struct printInfo pInfo; + size_t estimatedPrintSize = 0; + uint32_t *Ah, *Bh; + Ah = new uint32_t[buffsize]; + Bh = new uint32_t[buffsize]; + for (uint32_t i = 0; i < buffsize; i++) { + Ah[i] = i + 1; + Bh[i] = buffsize - i; + } + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + uint32_t lop = 0; + { + CaptureStream captured(stdout); + for (int gpu = 0; gpu < numOfGPUs; gpu++) { + for (uint32_t tid = 0; tid < (buffsize); tid++) { + pInfo = startPrint(tid, iterCount, Ah, Bh); + lop += pInfo.lineCount; + estimatedPrintSize += pInfo.printSizeinBytes; + } + for (uint32_t tid = 0; tid < (buffsize); tid++) { + pInfo = startPrint(tid, iterCount, Ah, Bh); + lop += pInfo.lineCount; + estimatedPrintSize += pInfo.printSizeinBytes; + } + for (uint32_t tid = 0; tid < (buffsize); tid++) { + pInfo = startPrint(tid, iterCount, Ah, Bh); + lop += pInfo.lineCount; + estimatedPrintSize += pInfo.printSizeinBytes; + } + } + } + delete[] Bh; + delete[] Ah; + printf("estimatedPrintSize = %zu, actualFileSize = %zu\n", + estimatedPrintSize, actualFileSize); + printf("estimatedLinesPrinted = %u, actualLinesPrinted = %u\n", + lop, totalActualLinecount); + if ((estimatedPrintSize != actualFileSize)|| + (lop != totalActualLinecount)) { + return false; + } + return true; +} +#endif +} // namespace hipPrintfStressTest + +TEST_CASE("Stress_printf_ComplexKernelMultStream") { +#ifdef __linux__ + printf("Test - Stress_printf_ComplexKernelMultStream start\n"); + bool TestPassed = true; + uint threads_per_block = MAX_BLOCK_SIZE; + // N provide the print limit + unsigned int print_limit = 4; // = 4 GB + uint32_t iterCount = 1; + // num_blocks is calculated using an approximate formula to arrive at + // the required print data quantity. CONST_WEIGHTING_FACT1 and + // CONST_WEIGHTING_FACT2 are empirically determined. + uint32_t num_blocks = (MAX_GRID_SIZE*print_limit)/CONST_WEIGHTING_FACT1 + - (CONST_WEIGHTING_FACT2*print_limit); + TestPassed = + hipPrintfStressTest::test_printf_multistream(num_blocks, threads_per_block, + iterCount); + REQUIRE(TestPassed); + printf("Test - Stress_printf_ComplexKernelMultStream completed \n"); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} + +TEST_CASE("Stress_printf_ComplexKernelMultStreamMultGpu") { +#ifdef __linux__ + printf("Test - Stress_printf_ComplexKernelMultStreamMultGpu start \n"); + bool TestPassed = true; + uint threads_per_block = MAX_BLOCK_SIZE; + // N provide the print limit + unsigned int print_limit = 4; // = 4 GB + uint32_t iterCount = 1; + int numOfGPUs = 0; + hipGetDeviceCount(&numOfGPUs); + if (numOfGPUs < 2) { + printf("Skipping test because numOfGPUs < 2\n"); + return; + } + // num_blocks is calculated using an approximate formula to arrive at + // the required print data quantity. CONST_WEIGHTING_FACT1 and + // CONST_WEIGHTING_FACT2 are empirically determined. + uint32_t num_blocks = + (((MAX_GRID_SIZE*print_limit)/CONST_WEIGHTING_FACT1 - + (CONST_WEIGHTING_FACT2*print_limit))*4)/numOfGPUs; + TestPassed = + hipPrintfStressTest::testPrintfMultGPU(numOfGPUs, num_blocks, + threads_per_block, + iterCount); + REQUIRE(TestPassed); + printf("Test - Stress_printf_ComplexKernelMultStreamMultGpu end \n"); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} diff --git a/projects/hip-tests/catch/stress/printf/Stress_printf_SimpleKernels.cc b/projects/hip-tests/catch/stress/printf/Stress_printf_SimpleKernels.cc new file mode 100644 index 0000000000..843709c476 --- /dev/null +++ b/projects/hip-tests/catch/stress/printf/Stress_printf_SimpleKernels.cc @@ -0,0 +1,790 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#ifdef __linux__ +#include "printf_common.h" +#endif +#include + +#define BLOCK_SIZE 512 +#define GRID_SIZE 512 +#define CHUNK_SIZE 256 +#define CONST_STR "Hello World from Device.Iam printing 55 bytes of data.\n" +#define CONST_STR1 "Hello World from Device.Iam printing from even thread.\n" +#define CONST_STR2 "Hello World from Device.This is odd thread.\n" +#define CONST_STR3 "Hello World from Device. The sum of all threadID = " + +namespace hipPrintfStressTest { +struct SizeStruct { + unsigned int block_size; + unsigned int grid_size; + unsigned int iteration; +}; +// These values are empirically determined for kernel_divergent_str3 +// Any modification to the function or CONST_STR3 will change these values +const struct SizeStruct EmpiricalValues1[12] = { + {512, 512, 16}, + {512, 512, 32}, + {512, 512, 48}, + {512, 512, 64}, + {512, 512, 80}, + {512, 512, 96}, + {512, 512, 110}, + {512, 512, 126}, + {512, 512, 140}, + {512, 512, 156}, + {512, 512, 172}, + {512, 512, 186} +}; +// These values are empirically determined for kernel_dependent_calc +// and kernel_dependent_calc_atomic. +// Any modification to the functions will change these values. +const struct SizeStruct EmpiricalValues2[12] = { + {512, 512, 20}, + {512, 512, 40}, + {512, 512, 60}, + {512, 512, 80}, + {512, 512, 100}, + {512, 512, 120}, + {512, 512, 140}, + {512, 512, 160}, + {512, 512, 180}, + {512, 512, 200}, + {512, 512, 220}, + {512, 512, 240} +}; +// Print a constant string in a kernel for 'n' iterations per thread +// using 'b' block size and 'g' grid size such that +// (total bytes per iteration)*n*b*g ≈ N GB where N is user input. +__global__ void kernel_printf_conststr(uint iterCount) { + for (uint count = 0; count < iterCount; count++) { + printf("%s", CONST_STR); + } +} +// Print 2 different constant strings (using if and else conditionals) +// in a kernel for 'n' iterations per thread using 'b' block size and +// 'g' grid size such that (total bytes per iteration)*n*b*g ≈ N GB, +// where N is user input. +__global__ void kernel_printf_two_conditionalstr(uint iterCount) { + uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + uint mod_tid = (tid % 2); + if (0 == mod_tid) { + for (uint count = 0; count < iterCount; count++) { + printf("%s", CONST_STR1); + } + } else { + for (uint count = 0; count < iterCount; count++) { + printf("%s", CONST_STR2); + } + } +} +// Print a constant string (using only if condition) in a kernel for 'n' +// iterations per thread using 'b' block size and 'g' grid size such that +// (total bytes per iteration)*n*b*g ≈ N GB, where N is user input. +__global__ void kernel_printf_single_conditionalstr(uint iterCount) { + uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + uint mod_tid = (tid % 2); + if (0 == mod_tid) { + for (uint count = 0; count < iterCount; count++) { + printf("%s", CONST_STR1); + } + } +} +// Please do not nodify this function. +// Any modification to this function will fail the test case. +// Print variable size string using integer data in a kernel for 'n' +// iterations per thread using 'b' block size and 'g' grid size such +// that (total bytes per iteration)*n*b*g ≈ N GB, where N is user input. +__global__ void kernel_printf_variablestr(uint iterCount, int *ret) { + uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int retlocal = 0; + const char *const_str = + "Hello World from Device.Iam printing (threadID,number)="; + for (int count = 0; count < (const int)iterCount; count++) { + retlocal += printf("%s%u,%d\n", const_str, tid, count); + retlocal += printf("%s%u,%d\n", const_str, tid, 10*count); + retlocal += printf("%s%u,%d\n", const_str, tid, 100*count); + retlocal += printf("%s%u,%d\n", const_str, tid, 1000*count); + } + ret[tid] = retlocal; +} +// Please do not nodify this function. +// Any modification to this function will fail the test case. +// Perform dependent calculations and print the result after each +// calculation in a kernel for 'n' iterations per thread using 'b' block +// size and 'g' grid size such that +// (total bytes per iteration)*n*b*g ≈ N GB, where N is user input. +__global__ void kernel_dependent_calc(uint32_t iterCount, int *ret) { + uint32_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int retlocal = 0; + const char *const_str = + "Hello World from Device.Iam printing number="; + for (int count = 0; count < (const int)iterCount; count++) { + uint32_t x = tid + count; + retlocal += printf("%s%u\n", const_str, x); + uint32_t y = x + tid; + retlocal += printf("%s%u\n", const_str, y); + uint32_t z = x*y; + retlocal += printf("%s%u\n", const_str, z); + uint32_t a = z/(tid + 1); + retlocal += printf("%s%u\n", const_str, a); + } + ret[tid] = retlocal; +} +// Please do not nodify this function. +// Any modification to this function will fail the test case. +// Perform atomic calculations and print the result after each +// calculation in a kernel for 'n' iterations per thread using 'b' block +// size and 'g' grid size such that +// (total bytes per iteration)*n*b*g ≈ N GB, where N is user input. +__global__ void kernel_dependent_calc_atomic(uint32_t iterCount, + int *ret) { + uint32_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int retlocal = 0; + const char *const_str = + "Hello World from Device.Iam printing number="; + for (uint32_t count = 0; count < iterCount; count++) { + uint32_t x = tid; + atomicAdd(&x, count); + retlocal += printf("%s%u\n", const_str, x); + uint32_t y = x; + atomicAdd(&y, tid); + retlocal += printf("%s%u\n", const_str, y); + uint32_t z = y; + atomicSub(&z, count); + retlocal += printf("%s%u\n", const_str, z); + uint32_t a = z; + atomicAnd(&a, 0x0000ffff); + retlocal += printf("%s%u\n", const_str, a); + } + ret[tid] = retlocal; +} +// Print variable size string using floating point data of varying +// precision in a kernel for 'n' iterations per thread using 'b' block +// size and 'g' grid size such that +// (total bytes per iteration)*n*b*g ≈ N GB, where N is user input. +__device__ __host__ int printPi(int maxPrecision) { + int printSize = 0; + size_t expo = 1000000000000; + double pi = 3.1415926535; + double piScaled = pi*expo; + const char *const_str = + "Hello World from Device.Iam printing decimal number="; + for (int prec = 0; prec <= maxPrecision ; prec++) { + printSize += printf("%s%.*f %.*e\n", const_str, prec, pi, + prec, piScaled); + } + return printSize; +} + +__global__ void kernel_decimal_calculation(uint iterCount, + int maxPrecision) { + for (int count = 0; count < (const int)iterCount; count++) { + printPi(maxPrecision); + } +} +// Print the value of shared memory variable using a stream of size 'n', +// 'b' block size and 'g' grid size such that +// (total bytes per thread)*n*b*g ≈ N GB, where N is user input. +__global__ void kernel_shared_mem() { + __shared__ uint32_t sharedMem; + sharedMem = 0; + __syncthreads(); + atomicAdd(&sharedMem, hipThreadIdx_x); + __syncthreads(); + printf("%s%u\n", CONST_STR3, sharedMem); +} +// Synchronize the prints in a block using __syncthreads. Only 1 block +// is launched in a stream of size 'n'. The size of the block is 'b'. +// (total bytes per thread)*n*b ≈ N GB. where N is user input. +__global__ void kernel_synchronized_printf() { + printf("%s%u\n", CONST_STR3, 0); + __syncthreads(); + printf("%s%u\n", CONST_STR3, 1); + __syncthreads(); + printf("%s%u\n", CONST_STR3, 2); +} +#ifdef __linux__ +// Launches kernel_printf_conststr to generate the printf log file +// and validates the generated file size and number of printed lines +// with the calculated file size and lines. +bool test_printf_conststr(uint32_t num_blocks, uint32_t threads_per_block, + uint32_t print_limit) { + uint32_t iterCount = 0; + uint32_t sizePrintString = (sizeof(CONST_STR)-1); // Excluding NULL character + // Calculate the number of iterations from print_limit. + size_t stress_limit_bytes = ((size_t)print_limit*1024*1024*1024); + iterCount = static_cast(1 + + stress_limit_bytes/(num_blocks*threads_per_block*sizePrintString)); + // Calculate expected lines of print and file size. + uint32_t totalExpectedLines = num_blocks*threads_per_block*iterCount; + size_t expectedFileSize = ((size_t)totalExpectedLines*sizePrintString); + size_t actualFileSize = 0; + uint32_t totalActualLinecount = 0; + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + { + CaptureStream captured(stdout); + hipLaunchKernelGGL(kernel_printf_conststr, dim3(num_blocks, 1, 1), + dim3(threads_per_block, 1, 1), + 0, 0, iterCount); + HIP_CHECK(hipStreamSynchronize(0)); + std::ifstream CapturedData = captured.getCapturedData(); + char *buffer = new char[CHUNK_SIZE]; + while (CapturedData.good()) { + CapturedData.getline(buffer, CHUNK_SIZE); + totalActualLinecount++; + } + delete[] buffer; + struct stat st; + if (stat(captured.getTempFilename(), &st)) { + printf("Temp File not found \n"); + return false; + } + actualFileSize = st.st_size; + } + printf("totalExpectedLines = %u \n", totalExpectedLines); + // Excluding the trailing newline + printf("totalActualLinecount = %u \n", totalActualLinecount-1); + printf("expectedFileSize = %zu \n", expectedFileSize); + printf("actualFileSize = %zu \n", actualFileSize); + if ((totalExpectedLines != (totalActualLinecount - 1))|| + (expectedFileSize != actualFileSize)) { + return false; + } + return true; +} +// Launches kernel_printf_two_conditionalstr to generate the printf log file +// and validates the generated file size and number of printed lines +// with the calculated file size and lines. +bool test_printf_two_conditionalstr(uint32_t num_blocks, + uint32_t threads_per_block, + uint32_t print_limit) { + uint32_t iterCount = 0; + uint32_t sizePrintStringEven, sizePrintStringOdd, avgsizePrintString; + sizePrintStringEven = (sizeof(CONST_STR1)-1); // Excluding NULL character + sizePrintStringOdd = (sizeof(CONST_STR2)-1); // Excluding NULL character + avgsizePrintString = (sizePrintStringEven + sizePrintStringOdd)/2; + // Calculate the number of iterations from print_limit + size_t stress_limit_bytes = ((size_t)print_limit*1024*1024*1024); + iterCount = static_cast(1 + + stress_limit_bytes/(num_blocks*threads_per_block*avgsizePrintString)); + // Calculate expected lines of print and file size. + uint32_t totalExpectedEvenLines, totalExpectedOddLines; + // 0, 1, 2, 3 + // 0, 1, 2 + totalExpectedEvenLines = ((num_blocks*threads_per_block)%2 == 0)? + (num_blocks*threads_per_block*iterCount)/2 : + (((num_blocks*threads_per_block)/2)+ 1)*iterCount; + totalExpectedOddLines = (num_blocks*threads_per_block*iterCount + - totalExpectedEvenLines); + size_t expectedFileSize = + ((size_t)totalExpectedEvenLines*sizePrintStringEven + + (size_t)totalExpectedOddLines*sizePrintStringOdd); + size_t actualFileSize = 0; + uint32_t totalActualEvenLines = 0, totalActualOddLines = 0; + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + { + CaptureStream captured(stdout); + hipLaunchKernelGGL(kernel_printf_two_conditionalstr, + dim3(num_blocks, 1, 1), + dim3(threads_per_block, 1, 1), + 0, 0, iterCount); + HIP_CHECK(hipStreamSynchronize(0)); + std::ifstream CapturedData = captured.getCapturedData(); + char *buffer = new char[CHUNK_SIZE]; + while (CapturedData.good()) { + CapturedData.getline(buffer, CHUNK_SIZE); + uint32_t bufferlen = strlen(buffer); + if ((sizePrintStringEven - 1) == bufferlen) { + totalActualEvenLines++; + } else if ((sizePrintStringOdd - 1) == bufferlen) { + totalActualOddLines++; + } + } + delete[] buffer; + struct stat st; + if (stat(captured.getTempFilename(), &st)) { + printf("Temp File not found \n"); + return false; + } + actualFileSize = st.st_size; + } + printf("totalExpectedEvenLines = %u \n", totalExpectedEvenLines); + printf("totalActualEvenLines = %u \n", totalActualEvenLines); + printf("totalExpectedOddLines = %u \n", totalExpectedOddLines); + printf("totalActualOddLines = %u \n", totalActualOddLines); + printf("expectedFileSize = %zu \n", expectedFileSize); + printf("actualFileSize = %zu \n", actualFileSize); + if ((totalExpectedEvenLines != totalActualEvenLines)|| + (totalExpectedOddLines != totalActualOddLines)|| + (expectedFileSize != actualFileSize)) { + return false; + } + return true; +} +// Launches kernel_printf_single_conditionalstr to generate the printf log +// and validates the generated file size and number of printed lines +// with the calculated file size and lines. +bool test_printf_single_conditionalstr(uint32_t num_blocks, + uint32_t threads_per_block, + uint32_t print_limit) { + uint32_t iterCount = 0; + uint32_t sizePrintStringEven = (sizeof(CONST_STR1)-1); + // Excluding NULL character + // Calculate the number of iterations from print_limit + size_t stress_limit_bytes = ((size_t)print_limit*1024*1024*1024); + iterCount = static_cast((2*stress_limit_bytes)/ + (num_blocks*threads_per_block*sizePrintStringEven)); + // Calculate expected lines of print and file size. + uint32_t totalExpectedLines; + totalExpectedLines = ((num_blocks*threads_per_block)%2 == 0)? + (num_blocks*threads_per_block*iterCount)/2 : + (((num_blocks*threads_per_block)/2)+ 1)*iterCount; + size_t expectedFileSize = + (size_t)totalExpectedLines*sizePrintStringEven; + size_t actualFileSize = 0; + uint32_t totalActualLines = 0; + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + { + CaptureStream captured(stdout); + hipLaunchKernelGGL(kernel_printf_single_conditionalstr, + dim3(num_blocks, 1, 1), + dim3(threads_per_block, 1, 1), + 0, 0, iterCount); + HIP_CHECK(hipStreamSynchronize(0)); + std::ifstream CapturedData = captured.getCapturedData(); + char *buffer = new char[CHUNK_SIZE]; + while (CapturedData.good()) { + CapturedData.getline(buffer, CHUNK_SIZE); + totalActualLines++; + } + delete[] buffer; + struct stat st; + if (stat(captured.getTempFilename(), &st)) { + printf("Temp File not found \n"); + return false; + } + actualFileSize = st.st_size; + } + printf("totalExpectedLines = %u \n", totalExpectedLines); + printf("totalActualLines = %u \n", totalActualLines-1); + printf("expectedFileSize = %zu \n", expectedFileSize); + printf("actualFileSize = %zu \n", actualFileSize); + if ((totalExpectedLines != (totalActualLines - 1))|| + (expectedFileSize != actualFileSize)) { + return false; + } + return true; +} +// Launches kernel_printf_variablestr Or kernel_dependent_calc Or +// kernel_dependent_calc_atomic to generate the printf log +// and validates the generated file size and number of printed lines +// with the calculated file size and lines. +bool test_variable_str(uint32_t print_limit, + void(*func)(uint32_t, int *), + const struct SizeStruct* table) { + uint32_t iterCount = table[print_limit - 1].iteration; + uint32_t num_blocks = table[print_limit - 1].grid_size; + uint32_t threads_per_block = table[print_limit - 1].block_size; + // Calculate expected lines of print and file size. + size_t actualFileSize = 0; + uint32_t totalActualLines = 0; + uint32_t totalExpectedLines = 4*iterCount*num_blocks*threads_per_block; + size_t expectedFileSize = 0; + + uint32_t buffsize = threads_per_block*num_blocks; + int32_t *Ah; + int32_t *Ad; + Ah = new int32_t[buffsize]; + for (uint32_t i = 0; i < buffsize; i++) { + Ah[i] = 0; + } + HIP_CHECK(hipMalloc(&Ad, buffsize*sizeof(int32_t))); + HIP_CHECK(hipMemcpy(Ad, Ah, buffsize*sizeof(int32_t), + hipMemcpyHostToDevice)); + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + { + CaptureStream captured(stdout); + hipLaunchKernelGGL(func, dim3(num_blocks, 1, 1), + dim3(threads_per_block, 1, 1), + 0, 0, iterCount, Ad); + HIP_CHECK(hipStreamSynchronize(0)); + HIP_CHECK(hipMemcpy(Ah, Ad, buffsize*sizeof(int32_t), + hipMemcpyDeviceToHost)); + std::ifstream CapturedData = captured.getCapturedData(); + char *buffer = new char[CHUNK_SIZE]; + while (CapturedData.good()) { + CapturedData.getline(buffer, CHUNK_SIZE); + totalActualLines++; + } + delete[] buffer; + struct stat st; + if (stat(captured.getTempFilename(), &st)) { + printf("Temp File not found \n"); + return false; + } + actualFileSize = st.st_size; + } + for (uint32_t i = 0; i < buffsize; i++) { + expectedFileSize += Ah[i]; + } + HIP_CHECK(hipFree(Ad)); + delete[] Ah; + printf("totalExpectedLines = %u \n", totalExpectedLines); + printf("totalActualLines = %u \n", totalActualLines-1); + printf("expectedFileSize = %zu \n", expectedFileSize); + printf("actualFileSize = %zu \n", actualFileSize); + if ((totalExpectedLines != (totalActualLines - 1))|| + (expectedFileSize != actualFileSize)) { + return false; + } + return true; +} +// Launches kernel_decimal_calculation to generate the printf log file +// and validates the generated file size and number of printed lines +// with the calculated file size and lines. +bool test_decimal_str(uint32_t num_blocks, uint32_t threads_per_block, + uint32_t print_limit) { + // Calculate the number of iterations from print_limit + size_t stress_limit_bytes = ((size_t)print_limit*1024*1024*1024); + int maxPrecision = 10; + int totalPrintSizePerIter = printPi(maxPrecision); + uint32_t iterCount = static_cast(1+ stress_limit_bytes/ + (num_blocks*threads_per_block*totalPrintSizePerIter)); + // Calculate expected lines of print and file size. + size_t actualFileSize = 0; + size_t expectedFileSize = + (size_t)num_blocks*threads_per_block*iterCount*totalPrintSizePerIter; + uint32_t totalActualLines = 0; + uint32_t totalExpectedLines = + (maxPrecision + 1)*iterCount*num_blocks*threads_per_block; + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + { + CaptureStream captured(stdout); + hipLaunchKernelGGL(kernel_decimal_calculation, dim3(num_blocks, 1, 1), + dim3(threads_per_block, 1, 1), + 0, 0, iterCount, maxPrecision); + HIP_CHECK(hipStreamSynchronize(0)); + std::ifstream CapturedData = captured.getCapturedData(); + char *buffer = new char[CHUNK_SIZE]; + while (CapturedData.good()) { + CapturedData.getline(buffer, CHUNK_SIZE); + totalActualLines++; + } + delete[] buffer; + struct stat st; + if (stat(captured.getTempFilename(), &st)) { + printf("Temp File not found \n"); + return false; + } + actualFileSize = st.st_size; + } + printf("totalExpectedLines = %u \n", totalExpectedLines); + printf("totalActualLines = %u \n", totalActualLines-1); + printf("expectedFileSize = %zu \n", expectedFileSize); + printf("actualFileSize = %zu \n", actualFileSize); + if ((totalExpectedLines != (totalActualLines - 1))|| + (expectedFileSize != actualFileSize)) { + return false; + } + return true; +} +// Launches kernel_shared_mem to generate the printf log file +// and validates the generated file size and number of printed lines +// with the calculated file size and lines. +bool test_shared_mem(uint32_t num_blocks, uint32_t threads_per_block, + uint32_t print_limit) { + // Calculate the number of iterations from print_limit + size_t stress_limit_bytes = ((size_t)print_limit*1024*1024*1024); + unsigned total_0_to_blksize = (BLOCK_SIZE - 1)*BLOCK_SIZE / 2; + char buffer[CHUNK_SIZE]; + int totalPrintSizePerThread = snprintf(buffer, CHUNK_SIZE, + "%s%u\n", CONST_STR3, total_0_to_blksize); + uint32_t iterCount = static_cast(1+ stress_limit_bytes/ + (num_blocks*threads_per_block*totalPrintSizePerThread)); + // Calculate expected lines of print and file size. + size_t actualFileSize = 0; + size_t expectedFileSize = + (size_t)num_blocks*threads_per_block*iterCount*totalPrintSizePerThread; + uint32_t totalActualLines = 0; + uint32_t totalExpectedLines = iterCount*num_blocks*threads_per_block; + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + { + CaptureStream captured(stdout); + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + for (int count = 0; count < (const int)iterCount; count++) { + HIP_CHECK(hipLaunchKernel((const void*)kernel_shared_mem, + dim3(num_blocks, 1, 1), dim3(threads_per_block, 1, 1), + NULL, 0, stream)); + } + HIP_CHECK(hipStreamSynchronize(stream)); + HIP_CHECK(hipStreamDestroy(stream)); + std::ifstream CapturedData = captured.getCapturedData(); + char *buffer = new char[CHUNK_SIZE]; + while (CapturedData.good()) { + CapturedData.getline(buffer, CHUNK_SIZE); + totalActualLines++; + } + delete[] buffer; + struct stat st; + if (stat(captured.getTempFilename(), &st)) { + printf("Temp File not found \n"); + return false; + } + actualFileSize = st.st_size; + } + printf("totalExpectedLines = %u \n", totalExpectedLines); + printf("totalActualLines = %u \n", totalActualLines-1); + printf("expectedFileSize = %zu \n", expectedFileSize); + printf("actualFileSize = %zu \n", actualFileSize); + if ((totalExpectedLines != (totalActualLines - 1))|| + (expectedFileSize != actualFileSize)) { + return false; + } + return true; +} +// Launches kernel_synchronized_printf to generate the printf log file +// and validates the generated file size and number of printed lines +// with the calculated file size and lines. +bool test_synchronized_printf(uint32_t num_blocks, + uint32_t threads_per_block, + uint32_t print_limit) { + // Calculate the number of iterations from print_limit + size_t stress_limit_bytes = ((size_t)print_limit*1024*1024*1024); + char buffer0[CHUNK_SIZE], buffer1[CHUNK_SIZE], buffer2[CHUNK_SIZE]; + int totalPrintSizePerThread = snprintf(buffer0, CHUNK_SIZE, + "%s%u\n", CONST_STR3, 0); + totalPrintSizePerThread += snprintf(buffer1, CHUNK_SIZE, + "%s%u\n", CONST_STR3, 1); + totalPrintSizePerThread += snprintf(buffer2, CHUNK_SIZE, + "%s%u\n", CONST_STR3, 2); + uint32_t iterCount = static_cast(1+ stress_limit_bytes/ + (num_blocks*threads_per_block*totalPrintSizePerThread)); + // Calculate expected lines of print and file size. + size_t actualFileSize = 0; + size_t expectedFileSize = + (size_t)num_blocks*threads_per_block*iterCount*totalPrintSizePerThread; + uint32_t totalActualLines = 0; + uint32_t totalExpectedLines = 3*iterCount*num_blocks*threads_per_block; + bool TestPassed = true; + size_t len = strlen(buffer0) - 1; + // DO NOT PUT ANY PRINTF WITHIN THIS BLOCK OF CODE + { + CaptureStream captured(stdout); + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + for (int count = 0; count < (const int)iterCount; count++) { + HIP_CHECK(hipLaunchKernel((const void*)kernel_synchronized_printf, + dim3(num_blocks, 1, 1), dim3(threads_per_block, 1, 1), + NULL, 0, stream)); + } + HIP_CHECK(hipStreamSynchronize(stream)); + HIP_CHECK(hipStreamDestroy(stream)); + std::ifstream CapturedData = captured.getCapturedData(); + char *buffer = new char[CHUNK_SIZE]; + while (CapturedData.good()) { + CapturedData.getline(buffer, CHUNK_SIZE); + if (!strcmp(buffer, "")) { + break; + } + if (0 == ((totalActualLines / threads_per_block) % 3)) { + if (strncmp(buffer, buffer0, len)) { + TestPassed = false; + break; + } + } else if (1 == ((totalActualLines / threads_per_block) % 3)) { + if (strncmp(buffer, buffer1, len)) { + TestPassed = false; + break; + } + } else if (2 == ((totalActualLines / threads_per_block) % 3)) { + if (strncmp(buffer, buffer2, len)) { + TestPassed = false; + break; + } + } + totalActualLines++; + } + delete[] buffer; + struct stat st; + if (stat(captured.getTempFilename(), &st)) { + printf("Temp File not found"); + return false; + } + actualFileSize = st.st_size; + } + printf("totalExpectedLines = %u \n", totalExpectedLines); + printf("totalActualLines = %u \n", totalActualLines); + printf("expectedFileSize = %zu \n", expectedFileSize); + printf("actualFileSize = %zu \n", actualFileSize); + if ((TestPassed == false)|| + (expectedFileSize != actualFileSize)) { + return false; + } + return true; +} +#endif +} // namespace hipPrintfStressTest + +TEST_CASE("Stress_printf_ConstStr") { +#ifdef __linux__ + printf("Test: Stress_printf_ConstStr\n"); + bool TestPassed = true; + uint threads_per_block = BLOCK_SIZE; + uint num_blocks = GRID_SIZE; + // N provide the print limit + unsigned int print_limit = 1; // = 1 GB + TestPassed = + hipPrintfStressTest::test_printf_conststr(num_blocks, threads_per_block, + print_limit); + REQUIRE(TestPassed); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} + +TEST_CASE("Stress_printf_IfElseConditionalStr") { +#ifdef __linux__ + printf("Test: Stress_printf_IfElseConditionalStr\n"); + bool TestPassed = true; + uint threads_per_block = BLOCK_SIZE; + uint num_blocks = GRID_SIZE; + // N provide the print limit + unsigned int print_limit = 1; // = 1 GB + TestPassed = + hipPrintfStressTest::test_printf_two_conditionalstr(num_blocks, + threads_per_block, print_limit); + REQUIRE(TestPassed); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} + +TEST_CASE("Stress_printf_IfConditionalStr") { +#ifdef __linux__ + printf("Test: Stress_printf_IfConditionalStr\n"); + bool TestPassed = true; + uint threads_per_block = BLOCK_SIZE; + uint num_blocks = GRID_SIZE; + // N provide the print limit + unsigned int print_limit = 1; // = 1 GB + TestPassed = + hipPrintfStressTest::test_printf_single_conditionalstr(num_blocks, + threads_per_block, print_limit); + REQUIRE(TestPassed); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} + +TEST_CASE("Stress_printf_VariableStr") { +#ifdef __linux__ + printf("Test: Stress_printf_VariableStr\n"); + bool TestPassed = true; + // N provide the print limit + unsigned int print_limit = 1; // = 1 GB + TestPassed = hipPrintfStressTest::test_variable_str(print_limit, + hipPrintfStressTest::kernel_printf_variablestr, + hipPrintfStressTest::EmpiricalValues1); + REQUIRE(TestPassed); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} + +TEST_CASE("Stress_printf_DependentCalc") { +#ifdef __linux__ + printf("Test: Stress_printf_DependentCalc\n"); + bool TestPassed = true; + // N provide the print limit + unsigned int print_limit = 1; // = 1 GB + TestPassed = hipPrintfStressTest::test_variable_str(print_limit, + hipPrintfStressTest::kernel_dependent_calc, + hipPrintfStressTest::EmpiricalValues2); + REQUIRE(TestPassed); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} + +TEST_CASE("Stress_printf_DecimalStr") { +#ifdef __linux__ + printf("Test: Stress_printf_DecimalStr\n"); + bool TestPassed = true; + uint threads_per_block = BLOCK_SIZE; + uint num_blocks = GRID_SIZE; + // N provide the print limit + unsigned int print_limit = 1; // = 1 GB + TestPassed = hipPrintfStressTest::test_decimal_str(num_blocks, + threads_per_block, print_limit); + REQUIRE(TestPassed); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} + +TEST_CASE("Stress_printf_SharedMem") { +#ifdef __linux__ + printf("Test: Stress_printf_SharedMem\n"); + bool TestPassed = true; + uint threads_per_block = BLOCK_SIZE; + uint num_blocks = GRID_SIZE; + // N provide the print limit + unsigned int print_limit = 1; // = 1 GB + TestPassed = hipPrintfStressTest::test_shared_mem(num_blocks, + threads_per_block, print_limit); + REQUIRE(TestPassed); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} + +TEST_CASE("Stress_printf_SynchronizedPrintf") { +#ifdef __linux__ + printf("Test: Stress_printf_SynchronizedPrintf\n"); + bool TestPassed = true; + uint threads_per_block = BLOCK_SIZE; + // N provide the print limit + unsigned int print_limit = 1; // = 1 GB + TestPassed = hipPrintfStressTest::test_synchronized_printf(1, + threads_per_block, print_limit); + REQUIRE(TestPassed); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} + +TEST_CASE("Stress_printf_AtomicCalc") { +#ifdef __linux__ + printf("Test: Stress_printf_AtomicCalc\n"); + bool TestPassed = true; + // N provide the print limit + unsigned int print_limit = 1; // = 1 GB + TestPassed = hipPrintfStressTest::test_variable_str(print_limit, + hipPrintfStressTest::kernel_dependent_calc_atomic, + hipPrintfStressTest::EmpiricalValues2); + REQUIRE(TestPassed); +#else + printf("This test is skipped due to non linux environment.\n"); +#endif +} diff --git a/projects/hip-tests/catch/stress/printf/printf_common.h b/projects/hip-tests/catch/stress/printf/printf_common.h new file mode 100644 index 0000000000..38684f0dfe --- /dev/null +++ b/projects/hip-tests/catch/stress/printf/printf_common.h @@ -0,0 +1,99 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef _STRESSTEST_PRINTF_COMMON_H_ +#define _STRESSTEST_PRINTF_COMMON_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct CaptureStream { + int saved_fd; + int orig_fd; + int temp_fd; + + char tempname[13] = "mytestXXXXXX"; + + explicit CaptureStream(FILE *original) { + orig_fd = fileno(original); + saved_fd = dup(orig_fd); + + if ((temp_fd = mkstemp(tempname)) == -1) { + error(0, errno, "Error"); + assert(false); + } + + fflush(nullptr); + if (dup2(temp_fd, orig_fd) == -1) { + error(0, errno, "Error"); + assert(false); + } + if (close(temp_fd) != 0) { + error(0, errno, "Error"); + assert(false); + } + } + + void restoreStream() { + if (saved_fd == -1) + return; + fflush(nullptr); + if (dup2(saved_fd, orig_fd) == -1) { + error(0, errno, "Error"); + assert(false); + } + if (close(saved_fd) != 0) { + error(0, errno, "Error"); + assert(false); + } + saved_fd = -1; + } + + const char *getTempFilename() { + return (const char*)tempname; + } + + std::ifstream getCapturedData() { + restoreStream(); + std::ifstream temp(tempname); + return temp; + } + + ~CaptureStream() { + restoreStream(); + if (remove(tempname) != 0) { + error(0, errno, "Error"); + assert(false); + } + } +}; + +#endif // _STRESSTEST_PRINTF_COMMON_H_ diff --git a/projects/hip-tests/catch/stress/stream/CMakeLists.txt b/projects/hip-tests/catch/stress/stream/CMakeLists.txt new file mode 100644 index 0000000000..e2b5ea0c5d --- /dev/null +++ b/projects/hip-tests/catch/stress/stream/CMakeLists.txt @@ -0,0 +1,10 @@ +# Common Tests - Test independent of all platforms +set(TEST_SRC + Stress_hipStreamCreate.cc +) + +# Create shared lib of all tests +add_library(stream SHARED EXCLUDE_FROM_ALL ${TEST_SRC}) + +# Add dependency on build_tests to build it on this custom target +add_dependencies(build_stress_test stream) diff --git a/projects/hip-tests/catch/stress/stream/Stress_hipStreamCreate.cc b/projects/hip-tests/catch/stress/stream/Stress_hipStreamCreate.cc new file mode 100644 index 0000000000..e16a45d998 --- /dev/null +++ b/projects/hip-tests/catch/stress/stream/Stress_hipStreamCreate.cc @@ -0,0 +1,203 @@ +/* +Copyright (c) 2021-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include + +#define NUM_ITER 100000 +#define TOTALSEQ 18 + +namespace hipStreamCreateStressTest { +__global__ void kernel_do_nothing() { + // do nothing +} + +int stream_seq[TOTALSEQ][4] = { + {0, 1, 2, 0} , // Launch0->Launch1->Launch2->Sync0 + {0, 2, 1, 0} , // Launch0->Launch2->Launch1->Sync0 + {1, 0, 2, 0} , // Launch1->Launch0->Launch2->Sync0 + {1, 2, 0, 0} , // Launch1->Launch2->Launch0->Sync0 + {2, 0, 1, 0} , // Launch2->Launch0->Launch1->Sync0 + {2, 1, 0, 0} , // Launch2->Launch1->Launch0->Sync0 + {0, 1, 2, 1} , // Launch0->Launch1->Launch2->Sync1 + {0, 2, 1, 1} , // Launch0->Launch2->Launch1->Sync1 + {1, 0, 2, 1} , // Launch1->Launch0->Launch2->Sync1 + {1, 2, 0, 1} , // Launch1->Launch2->Launch0->Sync1 + {2, 0, 1, 1} , // Launch2->Launch0->Launch1->Sync1 + {2, 1, 0, 1} , // Launch2->Launch1->Launch0->Sync1 + {0, 1, 2, 2} , // Launch0->Launch1->Launch2->Sync2 + {0, 2, 1, 2} , // Launch0->Launch2->Launch1->Sync2 + {1, 0, 2, 2} , // Launch1->Launch0->Launch2->Sync2 + {1, 2, 0, 2} , // Launch1->Launch2->Launch0->Sync2 + {2, 0, 1, 2} , // Launch2->Launch0->Launch1->Sync2 + {2, 1, 0, 2} // Launch2->Launch1->Launch0->Sync2 + }; + +/** + * Scenario: This test extends the DTEST introduced for SWDEV-238360 to test + * all the possible scenarios mentioned under comments section + * in SWDEV-237846. +*/ + +void testhipStreamCreate(int *stream_sequence) { + printf("%s: Testing sequence %d->%d->%d->sync(%d) \n", __func__, + stream_sequence[0], stream_sequence[1], stream_sequence[2], + stream_sequence[3]); + // Streams + hipStream_t stream[3]; + stream[0] = 0; + HIP_CHECK(hipStreamCreate(&stream[1])); + HIP_CHECK(hipStreamCreate(&stream[2])); + // Run test loop + for (int k = 0; k < NUM_ITER; ++k) { + // Sync + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipLaunchKernel((const void*)kernel_do_nothing, + dim3(1, 1, 1), dim3(1, 1, 1), NULL, 0, + stream[stream_sequence[0]])); + HIP_CHECK(hipLaunchKernel((const void*)kernel_do_nothing, + dim3(1, 1, 1), dim3(1, 1, 1), NULL, 0, + stream[stream_sequence[1]])); + HIP_CHECK(hipLaunchKernel((const void*)kernel_do_nothing, + dim3(1, 1, 1), dim3(1, 1, 1), NULL, 0, + stream[stream_sequence[2]])); + // Sync stream 1 + HIP_CHECK(hipStreamSynchronize(stream[stream_sequence[3]])); + } + HIP_CHECK(hipDeviceSynchronize()); + // Clean up + HIP_CHECK(hipStreamDestroy(stream[1])); + HIP_CHECK(hipStreamDestroy(stream[2])); +} +/** + * Scenario: This test extends the above test by using 2 streams + * (of highest and lowest priority) created using hipStreamCreateWithPriority + * along with the default stream. +*/ +void testhipStreamCreatePriority(int *stream_sequence, + unsigned int flag) { + printf("%s: Testing sequence %d->%d->%d->sync(%d) \n", __func__, + stream_sequence[0], stream_sequence[1], stream_sequence[2], + stream_sequence[3]); + // Streams + hipStream_t stream[3]; + stream[0] = 0; + int priority_low = 0; + int priority_high = 0; + HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_low, &priority_high)); + if (priority_low == priority_high) { + printf("Exiting test since priorities are not supported \n"); + return; + } + HIP_CHECK(hipStreamCreateWithPriority(&stream[1], + flag, priority_high)); + HIP_CHECK(hipStreamCreateWithPriority(&stream[2], + flag, priority_low)); + // Run test loop + for (int k = 0; k < NUM_ITER; ++k) { + // Sync + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipLaunchKernel((const void*)kernel_do_nothing, + dim3(1, 1, 1), dim3(1, 1, 1), NULL, 0, + stream[stream_sequence[0]])); + HIP_CHECK(hipLaunchKernel((const void*)kernel_do_nothing, + dim3(1, 1, 1), dim3(1, 1, 1), NULL, 0, + stream[stream_sequence[1]])); + HIP_CHECK(hipLaunchKernel((const void*)kernel_do_nothing, + dim3(1, 1, 1), dim3(1, 1, 1), NULL, 0, + stream[stream_sequence[2]])); + // Sync stream 1 + HIP_CHECK(hipStreamSynchronize(stream[stream_sequence[3]])); + } + HIP_CHECK(hipDeviceSynchronize()); + // Clean up + HIP_CHECK(hipStreamDestroy(stream[1])); + HIP_CHECK(hipStreamDestroy(stream[2])); +} +/** + * Scenario: This test extends the above test by using 2 streams + * created using hipStreamCreateWithFlags along with the default stream. +*/ +void testhipStreamCreateFlags(int *stream_sequence, + unsigned int flag) { + printf("%s: Testing sequence %d->%d->%d->sync(%d) \n", __func__, + stream_sequence[0], stream_sequence[1], stream_sequence[2], + stream_sequence[3]); + // Streams + hipStream_t stream[3]; + stream[0] = 0; + HIP_CHECK(hipStreamCreateWithFlags(&stream[1], flag)); + HIP_CHECK(hipStreamCreateWithFlags(&stream[2], flag)); + // Run test loop + for (int k = 0; k < NUM_ITER; ++k) { + // Sync + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipLaunchKernel((const void*)kernel_do_nothing, + dim3(1, 1, 1), dim3(1, 1, 1), NULL, 0, + stream[stream_sequence[0]])); + HIP_CHECK(hipLaunchKernel((const void*)kernel_do_nothing, + dim3(1, 1, 1), dim3(1, 1, 1), NULL, 0, + stream[stream_sequence[1]])); + HIP_CHECK(hipLaunchKernel((const void*)kernel_do_nothing, + dim3(1, 1, 1), dim3(1, 1, 1), NULL, 0, + stream[stream_sequence[2]])); + // Sync stream 1 + HIP_CHECK(hipStreamSynchronize(stream[stream_sequence[3]])); + } + HIP_CHECK(hipDeviceSynchronize()); + // Clean up + HIP_CHECK(hipStreamDestroy(stream[1])); + HIP_CHECK(hipStreamDestroy(stream[2])); +} +} // namespace hipStreamCreateStressTest + +TEST_CASE("Stress_hipStreamCreate_SyncTest") { + printf("hipStreamCreate stress test:\n"); + for (int i = 0; i < TOTALSEQ; i++) { + hipStreamCreateStressTest::testhipStreamCreate( + hipStreamCreateStressTest::stream_seq[i]); + } +} + +TEST_CASE("Stress_hipStreamCreatePriority_SyncTest") { + printf("hipStreamCreateWithPriority(hipStreamDefault) stress test:\n"); + for (int i = 0; i < TOTALSEQ; i++) { + hipStreamCreateStressTest::testhipStreamCreatePriority( + hipStreamCreateStressTest::stream_seq[i], hipStreamDefault); + } + printf("hipStreamCreateWithPriority(hipStreamNonBlocking) stress test:\n"); + for (int i = 0; i < TOTALSEQ; i++) { + hipStreamCreateStressTest::testhipStreamCreatePriority( + hipStreamCreateStressTest::stream_seq[i], hipStreamNonBlocking); + } +} + +TEST_CASE("Stress_hipStreamCreateWithFlags_SyncTest") { + printf("hipStreamCreateWithFlags(hipStreamDefault) stress test:\n"); + for (int i = 0; i < TOTALSEQ; i++) { + hipStreamCreateStressTest::testhipStreamCreateFlags( + hipStreamCreateStressTest::stream_seq[i], hipStreamDefault); + } + printf("hipStreamCreateWithFlags(hipStreamNonBlocking) stress test:\n"); + for (int i = 0; i < TOTALSEQ; i++) { + hipStreamCreateStressTest::testhipStreamCreateFlags( + hipStreamCreateStressTest::stream_seq[i], hipStreamNonBlocking); + } +}