diff --git a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md index b9c9708ccb..3aa0b0c08a 100644 --- a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md +++ b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md @@ -851,7 +851,7 @@ | |*`cudaGraphNodeTypeCount`* | 10.0 | | | enum |***`cudaLimit`*** | |***`hipLimit_t`*** | | 0x00 |*`cudaLimitStackSize`* | | | -| 0x01 |*`cudaLimitPrintfFifoSize`* | | | +| 0x01 |*`cudaLimitPrintfFifoSize`* | |*`hipLimitPrintfFifoSize`* | | 0x02 |*`cudaLimitMallocHeapSize`* | |*`hipLimitMallocHeapSize`* | | 0x03 |*`cudaLimitDevRuntimeSyncDepth`* | | | | 0x04 |*`cudaLimitDevRuntimePendingLaunchCount`* | | | diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index 719248689e..a59e7e3e3a 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -529,6 +529,7 @@ typedef struct hipFuncAttributes { } hipFuncAttributes; typedef struct ihipEvent_t* hipEvent_t; enum hipLimit_t { + hipLimitPrintfFifoSize = 0x01, hipLimitMallocHeapSize = 0x02, }; /** diff --git a/samples/2_Cookbook/18_cmake_hip_device/square.cpp b/samples/2_Cookbook/18_cmake_hip_device/square.cpp index 971488d994..a13f58b41e 100644 --- a/samples/2_Cookbook/18_cmake_hip_device/square.cpp +++ b/samples/2_Cookbook/18_cmake_hip_device/square.cpp @@ -1,4 +1,3 @@ -#include "hip/hip_runtime.h" /* Copyright (c) 2015-2021 Advanced Micro Devices, Inc. All rights reserved. diff --git a/tests/src/printf/hipPrintfAltForms.cpp b/tests/src/printf/hipPrintfAltForms.cpp index a396ac293b..5185b76785 100644 --- a/tests/src/printf/hipPrintfAltForms.cpp +++ b/tests/src/printf/hipPrintfAltForms.cpp @@ -21,8 +21,8 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia - * TEST: %t EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s + * TEST: %t * HIT_END */ @@ -35,7 +35,11 @@ __global__ void test_kernel() { printf("%#X\n", 0x42); printf("%#08x\n", 0x42); printf("%#f\n", -123.456); +#ifdef __HIP_PLATFORM_AMD__ printf("%#F\n", 123.456); +#else + printf("%#f\n", 123.456); // In Cuda, printf only supports "%cdiouxXpeEfgGaAs" +#endif printf("%#e\n", 123.456); printf("%#E\n", -123.456); printf("%#g\n", -123.456); diff --git a/tests/src/printf/hipPrintfBasic.cpp b/tests/src/printf/hipPrintfBasic.cpp index e663bc127c..8ad578cecc 100644 --- a/tests/src/printf/hipPrintfBasic.cpp +++ b/tests/src/printf/hipPrintfBasic.cpp @@ -21,14 +21,15 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia - * TEST: %t EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s + * TEST: %t * HIT_END */ #include "test_common.h" #include "printf_common.h" #include +#include // Global string constants don't work inside device functions, so we // use a macro to repeat the declaration in host and device contexts. @@ -36,7 +37,8 @@ DECLARE_DATA(); __global__ void kernel_uniform0(int *retval) { uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - retval[tid] = printf("Hello World\n"); + retval[tid] = printf("Hello World\n"); // In Hip-Rocclr, printf returns number of characters printed. + // In Cuda, printf returns the number of arguments parsed. } static void test_uniform0(int *retval, uint num_blocks, @@ -55,7 +57,11 @@ static void test_uniform0(int *retval, uint num_blocks, capture.End(); for (uint ii = 0; ii != num_threads; ++ii) { +#ifdef __HIP_PLATFORM_AMD__ HIPASSERT(retval[ii] == strlen("Hello World\n")); +#else + HIPASSERT(retval[ii] == 0); +#endif } std::string data = capture.getData(); @@ -92,7 +98,11 @@ static void test_uniform1(int *retval, uint num_blocks, capture.End(); for (uint ii = 0; ii != num_threads; ++ii) { +#ifdef __HIP_PLATFORM_AMD__ HIPASSERT(retval[ii] == strlen("Six times Eight is 42") + 1); +#else + HIPASSERT(retval[ii] == 1); +#endif } std::string data = capture.getData(); @@ -129,11 +139,19 @@ static void test_divergent0(int *retval, uint num_blocks, capture.End(); for (uint ii = 0; ii != 10; ++ii) { +#ifdef __HIP_PLATFORM_AMD__ HIPASSERT(retval[ii] == 13); +#else + HIPASSERT(retval[ii] == 1); +#endif } for (uint ii = 10; ii != num_threads; ++ii) { +#ifdef __HIP_PLATFORM_AMD__ HIPASSERT(retval[ii] == 14); +#else + HIPASSERT(retval[ii] == 1); +#endif } std::string data = capture.getData(); @@ -178,7 +196,11 @@ static void test_divergent1(int *retval, uint num_blocks, for (uint ii = 0; ii != num_threads; ++ii) { if (ii % 2) { +#ifdef __HIP_PLATFORM_AMD__ HIPASSERT(retval[ii] == strlen("Hello World\n")); +#else + HIPASSERT(retval[ii] == 0); +#endif } else { HIPASSERT(retval[ii] == -1); } @@ -202,11 +224,9 @@ __global__ void kernel_series(int *retval) { const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; int result = 0; - result += printf("%s\n", msg_long1); result += printf("%s\n", msg_short); result += printf("%s\n", msg_long2); - retval[tid] = result; } @@ -225,8 +245,12 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) { capture.End(); for (uint ii = 0; ii != num_threads; ++ii) { +#ifdef __HIP_PLATFORM_AMD__ HIPASSERT(retval[ii] == strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3); +#else + HIPASSERT(retval[ii] == 3); +#endif } std::string data = capture.getData(); @@ -245,8 +269,6 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) { } __global__ void kernel_divergent_loop() { - DECLARE_DATA(); - const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; int result = 0; diff --git a/tests/src/printf/hipPrintfManyDevices.cpp b/tests/src/printf/hipPrintfManyDevices.cpp old mode 100644 new mode 100755 index 8e1451a2b0..2e7d99461f --- a/tests/src/printf/hipPrintfManyDevices.cpp +++ b/tests/src/printf/hipPrintfManyDevices.cpp @@ -21,8 +21,8 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia - * TEST: %t EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s + * TEST: %t * HIT_END */ @@ -44,6 +44,22 @@ __global__ void print_things() { printf("%s\n", msg[(tid + 2) % 3]); } +size_t get_things_size(uint threads_per_device, uint num_devices) { + DECLARE_DATA(); + const char *msg[] = {msg_short, msg_long1, msg_long2}; + uint num_threads = threads_per_device * num_devices; + size_t size = 0; + + for(auto str: msg) { + size += strlen(str) + 1; + } + + size *= num_threads; + size += ((threads_per_device + 2) / 3) * num_devices * (strlen(msg_short) + 1); + + return size; +} + int main() { uint num_blocks = 14; uint threads_per_block = 250; @@ -53,6 +69,20 @@ int main() { int num_devices = 0; hipGetDeviceCount(&num_devices); +#ifdef __HIP_PLATFORM_NVIDIA__ + // By default, Cuda has different printf ring buffer size in different GPUs(or ENVs). + // For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests. + // We have to detect, compare and set it + size_t size = get_things_size(threads_per_device, num_devices); + size_t size_expected = size * 4; // Cuda printf buffer format is unknown, but test shows 4 times can work here. + size_t size_current = 0; + HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize)); + printf("things size = %zu, expected %zu, current %zu\n", size, size_expected, size_current); + + if(size_current < size_expected) { + HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected)); + } +#endif capture.Begin(); for (int i = 0; i != num_devices; ++i) { hipSetDevice(i); @@ -77,6 +107,5 @@ int main() { HIPASSERT(linecount[msg_long2] == num_threads); HIPASSERT(linecount[msg_short] == num_threads + ((threads_per_device + 2) / 3) * num_devices); - passed(); } diff --git a/tests/src/printf/hipPrintfManyWaves.cpp b/tests/src/printf/hipPrintfManyWaves.cpp index b812950cef..0cb2b16c24 100644 --- a/tests/src/printf/hipPrintfManyWaves.cpp +++ b/tests/src/printf/hipPrintfManyWaves.cpp @@ -21,14 +21,15 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia - * TEST: %t EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s + * TEST: %t * HIT_END */ #include "test_common.h" #include "printf_common.h" #include +#include // Global string constants don't work inside device functions, so we // use a macro to repeat the declaration in host and device contexts. @@ -38,7 +39,6 @@ __global__ void kernel_mixed0(int *retval) { DECLARE_DATA(); uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - ulong result = 0; // Three strings passed as divergent values to the same hostcall. const char *msg; @@ -72,6 +72,7 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) { capture.End(); for (uint ii = 0; ii != num_threads; ++ii) { +#ifdef __HIP_PLATFORM_AMD__ switch (ii % 3) { case 0: HIPASSERT(retval[ii] == strlen(msg_short) + 1); @@ -83,6 +84,9 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) { HIPASSERT(retval[ii] == strlen(msg_long2) + 1); break; } +#else + HIPASSERT(retval[ii] == 1); +#endif } std::string data = capture.getData(); @@ -134,6 +138,7 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) { capture.End(); for (uint ii = 0; ii != num_threads; ++ii) { +#ifdef __HIP_PLATFORM_AMD__ switch (ii % 3) { case 0: HIPASSERT(retval[ii] == strlen(msg_short) + 1); @@ -145,6 +150,9 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) { HIPASSERT(retval[ii] == strlen(msg_long2) + 1); break; } +#else + HIPASSERT(retval[ii] == 1); +#endif } std::string data = capture.getData(); @@ -189,8 +197,12 @@ static void test_mixed2(int *retval, uint num_blocks, uint threads_per_block) { capture.End(); for (uint ii = 0; ii != num_threads; ++ii) { +#ifdef __HIP_PLATFORM_AMD__ HIPASSERT(retval[ii] == strlen(msg_short) + strlen(msg_long1) + strlen(msg_long2) + 1); +#else + HIPASSERT(retval[ii] == 3); +#endif } std::string data = capture.getData(); @@ -230,6 +242,21 @@ __global__ void kernel_mixed3(int *retval) { retval[tid] = result; } +size_t get_mixed3_size(uint num_threads) { + DECLARE_DATA(); + const char *msg[] = {msg_long1, msg_long2}; + size_t size = 0; + + for(auto str: msg) { + size += strlen(str) + 1; + } + + size *= num_threads; + size += ((num_threads + 2) / 3) * (strlen(msg_short) + 1); + + return size; +} + static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) { CaptureStream capture(stdout); @@ -245,12 +272,16 @@ static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) { capture.End(); for (uint ii = 0; ii != num_threads; ++ii) { +#ifdef __HIP_PLATFORM_AMD__ if (ii % 3 == 0) { HIPASSERT(retval[ii] == strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3); } else { HIPASSERT(retval[ii] == strlen(msg_long1) + strlen(msg_long2) + 2); } +#else + HIPASSERT(retval[ii] == (ii % 3 ? 2 : 3)); +#endif } std::string data = capture.getData(); @@ -276,6 +307,18 @@ __global__ void kernel_numbers() { } } +size_t get_numbers_size(uint num_threads) { + char buf[100] = { 0 }; + size_t size = 0; + for (uint tid = 0; tid < num_threads; tid++) { + for (uint i = 0; i != 7; ++i) { + uint base = tid * 21 + i * 3; + size += snprintf(buf, 100, "%d %d %d\n", base, base + 1, base + 2); + } + } + return size; +} + static void test_numbers(uint num_blocks, uint threads_per_block) { CaptureStream capture(stdout); uint num_threads = num_blocks * threads_per_block; @@ -309,7 +352,23 @@ int main(int argc, char **argv) { uint num_blocks = 150; uint threads_per_block = 250; uint num_threads = num_blocks * threads_per_block; +#ifdef __HIP_PLATFORM_NVIDIA__ + // By default, Cuda has different printf ring buffer size in different GPUs(or ENVs). + // For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests. + // We have to detect, compare and set it + size_t size_mixed3 = get_mixed3_size(num_threads); + size_t size_numbers = get_numbers_size(num_threads); + size_t size_max = size_mixed3 >= size_numbers ? size_mixed3 : size_numbers; // Max size + size_t size_expected = size_max * 10; // Cuda printf buffer format is unknown, but test shows 10 times can work here. + size_t size_current = 0; + HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize)); + printf("size_mixed3 = %zu, size_numbers = %zu\n", size_mixed3, size_numbers); + printf("max size = %zu, expected %zu, current %zu\n", size_max, size_expected, size_current); + if(size_current < size_expected) { + HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected)); + } +#endif void *retval_void; HIPCHECK(hipHostMalloc(&retval_void, 4 * num_threads)); auto retval = reinterpret_cast(retval_void); @@ -319,6 +378,5 @@ int main(int argc, char **argv) { test_mixed2(retval, num_blocks, threads_per_block); test_mixed3(retval, num_blocks, threads_per_block); test_numbers(num_blocks, threads_per_block); - passed(); } diff --git a/tests/src/printf/hipPrintfSpecifiers.cpp b/tests/src/printf/hipPrintfSpecifiers.cpp index 74d41d1f14..b3f62f4c86 100644 --- a/tests/src/printf/hipPrintfSpecifiers.cpp +++ b/tests/src/printf/hipPrintfSpecifiers.cpp @@ -21,8 +21,8 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia - * TEST: %t EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s + * TEST: %t * HIT_END */ @@ -32,7 +32,6 @@ THE SOFTWARE. __global__ void test_kernel() { const char *N = nullptr; const char *s = "hello world"; - printf("xyzzy\n"); printf("%%\n"); printf("hello %% world\n"); @@ -46,7 +45,11 @@ __global__ void test_kernel() { printf("%d\n", -42); printf("%u\n", 42); printf("%f\n", 123.456); +#ifdef __HIP_PLATFORM_AMD__ printf("%F\n", -123.456); +#else + printf("%f\n", -123.456); +#endif printf("%e\n", -123.456); printf("%E\n", 123.456); printf("%g\n", 123.456); @@ -54,11 +57,37 @@ __global__ void test_kernel() { printf("%c\n", 'x'); printf("%s\n", N); printf("%p\n", N); +#ifdef __HIP_PLATFORM_AMD__ printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 5, s, (void *)0xf01dab1eca55e77e); +#else + // In Cuda, printf doesn't support %.*, %*.* + printf("%.8f %8.5s %p\n", 3.14159, s, (void *)0xf01dab1eca55e77e); +#endif } int main(int argc, char **argv) { -#if !defined(_WIN32) +#ifdef __HIP_PLATFORM_NVIDIA__ + std::string reference(R"here(xyzzy +% +hello % world +%s +%s0xf01dab1eca55e77e +%cxyzzy +sep +-42 +42 +123.456000 +-123.456000 +-1.234560e+02 +1.234560E+02 +123.456 +-123.456 +x +(null) +(nil) +3.14159000 hello 0xf01dab1eca55e77e +)here"); +#elif !defined(_WIN32) std::string reference(R"here(xyzzy % hello % world diff --git a/tests/src/printf/hipPrintfStar.cpp b/tests/src/printf/hipPrintfStar.cpp index c915d43edb..990c6af173 100644 --- a/tests/src/printf/hipPrintfStar.cpp +++ b/tests/src/printf/hipPrintfStar.cpp @@ -21,8 +21,8 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia - * TEST: %t EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s + * TEST: %t * HIT_END */ @@ -31,9 +31,16 @@ THE SOFTWARE. __global__ void test_kernel() { printf("%*d\n", 16, 42); +#ifdef __HIP_PLATFORM_AMD__ printf("%.*d\n", 8, 42); printf("%*.*d\n", -16, 8, 42); printf("%*.*f %s * %.*s\n", 16, 8, 123.456, "hello", 5, "worldxyz"); +#else + // In Cuda, printf doesn't support %.*, %*.* + printf("%.8d\n", 42); + printf("%-16.8d\n", 42); + printf("%16.8f %s * %.5s\n", 123.456, "hello", "worldxyz"); +#endif } int main(int argc, char **argv) {