SWDEV-294588 - Enable NV printf DTests

Enable NV printf DTests as many as possible. Fix the bugs due to behavour difference between Hip-Rocclr and Cuda. Add hipLimitPrintfFifoSize. Change-Id: I3fe6dbc35a7a140a9919df197b7885df83d28049
2021-07-12 22:51:48 -04:00
parent b7c0b21814
commit 586165ebc2
@@ -851,7 +851,7 @@
 |              |*`cudaGraphNodeTypeCount`*                           | 10.0             |                                                            |
 | enum         |***`cudaLimit`***                                    |                  |***`hipLimit_t`***                                          |
 |         0x00 |*`cudaLimitStackSize`*                               |                  |                                                            |
-|         0x01 |*`cudaLimitPrintfFifoSize`*                          |                  |                                                            |
+|         0x01 |*`cudaLimitPrintfFifoSize`*                          |                  |*`hipLimitPrintfFifoSize`*                                  |
 |         0x02 |*`cudaLimitMallocHeapSize`*                          |                  |*`hipLimitMallocHeapSize`*                                  |
 |         0x03 |*`cudaLimitDevRuntimeSyncDepth`*                     |                  |                                                            |
 |         0x04 |*`cudaLimitDevRuntimePendingLaunchCount`*            |                  |                                                            |
@@ -529,6 +529,7 @@ typedef struct hipFuncAttributes {
 } hipFuncAttributes;
 typedef struct ihipEvent_t* hipEvent_t;
 enum hipLimit_t {
+    hipLimitPrintfFifoSize = 0x01,
    hipLimitMallocHeapSize = 0x02,
 };
 /**
@@ -1,4 +1,3 @@
-#include "hip/hip_runtime.h"
 /*
 Copyright (c) 2015-2021 Advanced Micro Devices, Inc. All rights reserved.

@@ -21,8 +21,8 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
 * HIT_END
 */

@@ -35,7 +35,11 @@ __global__ void test_kernel() {
  printf("%#X\n", 0x42);
  printf("%#08x\n", 0x42);
  printf("%#f\n", -123.456);
+#ifdef __HIP_PLATFORM_AMD__
  printf("%#F\n", 123.456);
+#else
+  printf("%#f\n", 123.456); // In Cuda, printf only supports "%cdiouxXpeEfgGaAs"
+#endif
  printf("%#e\n", 123.456);
  printf("%#E\n", -123.456);
  printf("%#g\n", -123.456);
@@ -21,14 +21,15 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
 * HIT_END
 */

 #include "test_common.h"
 #include "printf_common.h"
 #include <vector>
+#include <algorithm>

 // Global string constants don't work inside device functions, so we
 // use a macro to repeat the declaration in host and device contexts.
@@ -36,7 +37,8 @@ DECLARE_DATA();

 __global__ void kernel_uniform0(int *retval) {
  uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-  retval[tid] = printf("Hello World\n");
+  retval[tid] = printf("Hello World\n"); // In Hip-Rocclr, printf returns number of characters printed.
+                                         // In Cuda, printf returns the number of arguments parsed.
 }

 static void test_uniform0(int *retval, uint num_blocks,
@@ -55,7 +57,11 @@ static void test_uniform0(int *retval, uint num_blocks,
  capture.End();

  for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
    HIPASSERT(retval[ii] == strlen("Hello World\n"));
+#else
+    HIPASSERT(retval[ii] == 0);
+#endif
  }

  std::string data = capture.getData();
@@ -92,7 +98,11 @@ static void test_uniform1(int *retval, uint num_blocks,
  capture.End();

  for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
    HIPASSERT(retval[ii] == strlen("Six times Eight is 42") + 1);
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
  }

  std::string data = capture.getData();
@@ -129,11 +139,19 @@ static void test_divergent0(int *retval, uint num_blocks,
  capture.End();

  for (uint ii = 0; ii != 10; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
    HIPASSERT(retval[ii] == 13);
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
  }

  for (uint ii = 10; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
    HIPASSERT(retval[ii] == 14);
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
  }

  std::string data = capture.getData();
@@ -178,7 +196,11 @@ static void test_divergent1(int *retval, uint num_blocks,

  for (uint ii = 0; ii != num_threads; ++ii) {
    if (ii % 2) {
+#ifdef __HIP_PLATFORM_AMD__
      HIPASSERT(retval[ii] == strlen("Hello World\n"));
+#else
+      HIPASSERT(retval[ii] == 0);
+#endif
    } else {
      HIPASSERT(retval[ii] == -1);
    }
@@ -202,11 +224,9 @@ __global__ void kernel_series(int *retval) {

  const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  int result = 0;
-
  result += printf("%s\n", msg_long1);
  result += printf("%s\n", msg_short);
  result += printf("%s\n", msg_long2);
-
  retval[tid] = result;
 }

@@ -225,8 +245,12 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) {
  capture.End();

  for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
    HIPASSERT(retval[ii] ==
              strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3);
+#else
+    HIPASSERT(retval[ii] == 3);
+#endif
  }

  std::string data = capture.getData();
@@ -245,8 +269,6 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) {
 }

 __global__ void kernel_divergent_loop() {
-  DECLARE_DATA();
-
  const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  int result = 0;

@@ -21,8 +21,8 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
 * HIT_END
 */

@@ -44,6 +44,22 @@ __global__ void print_things() {
  printf("%s\n", msg[(tid + 2) % 3]);
 }

+size_t get_things_size(uint threads_per_device, uint num_devices) {
+  DECLARE_DATA();
+  const char *msg[] = {msg_short, msg_long1, msg_long2};
+  uint num_threads = threads_per_device * num_devices;
+  size_t size = 0;
+
+  for(auto str: msg) {
+    size += strlen(str) + 1;
+  }
+
+  size *= num_threads;
+  size += ((threads_per_device + 2) / 3) * num_devices * (strlen(msg_short) + 1);
+
+  return size;
+}
+
 int main() {
  uint num_blocks = 14;
  uint threads_per_block = 250;
@@ -53,6 +69,20 @@ int main() {

  int num_devices = 0;
  hipGetDeviceCount(&num_devices);
+#ifdef __HIP_PLATFORM_NVIDIA__
+  // By default, Cuda has different printf ring buffer size in different GPUs(or ENVs).
+  // For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests.
+  // We have to detect, compare and set it
+  size_t size = get_things_size(threads_per_device, num_devices);
+  size_t size_expected = size * 4;  // Cuda printf buffer format is unknown, but test shows 4 times can work here.
+  size_t size_current = 0;
+  HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize));
+  printf("things size = %zu, expected %zu, current %zu\n", size, size_expected, size_current);
+
+  if(size_current < size_expected) {
+    HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected));
+  }
+#endif
  capture.Begin();
  for (int i = 0; i != num_devices; ++i) {
    hipSetDevice(i);
@@ -77,6 +107,5 @@ int main() {
  HIPASSERT(linecount[msg_long2] == num_threads);
  HIPASSERT(linecount[msg_short] ==
            num_threads + ((threads_per_device + 2) / 3) * num_devices);
-
  passed();
 }
@@ -21,14 +21,15 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
 * HIT_END
 */

 #include "test_common.h"
 #include "printf_common.h"
 #include <vector>
+#include <algorithm>

 // Global string constants don't work inside device functions, so we
 // use a macro to repeat the declaration in host and device contexts.
@@ -38,7 +39,6 @@ __global__ void kernel_mixed0(int *retval) {
  DECLARE_DATA();

  uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-  ulong result = 0;

  // Three strings passed as divergent values to the same hostcall.
  const char *msg;
@@ -72,6 +72,7 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) {
  capture.End();

  for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
    switch (ii % 3) {
    case 0:
      HIPASSERT(retval[ii] == strlen(msg_short) + 1);
@@ -83,6 +84,9 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) {
      HIPASSERT(retval[ii] == strlen(msg_long2) + 1);
      break;
    }
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
  }

  std::string data = capture.getData();
@@ -134,6 +138,7 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) {
  capture.End();

  for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
    switch (ii % 3) {
    case 0:
      HIPASSERT(retval[ii] == strlen(msg_short) + 1);
@@ -145,6 +150,9 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) {
      HIPASSERT(retval[ii] == strlen(msg_long2) + 1);
      break;
    }
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
  }

  std::string data = capture.getData();
@@ -189,8 +197,12 @@ static void test_mixed2(int *retval, uint num_blocks, uint threads_per_block) {
  capture.End();

  for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
    HIPASSERT(retval[ii] ==
              strlen(msg_short) + strlen(msg_long1) + strlen(msg_long2) + 1);
+#else
+    HIPASSERT(retval[ii] == 3);
+#endif
  }

  std::string data = capture.getData();
@@ -230,6 +242,21 @@ __global__ void kernel_mixed3(int *retval) {
  retval[tid] = result;
 }

+size_t get_mixed3_size(uint num_threads) {
+  DECLARE_DATA();
+  const char *msg[] = {msg_long1, msg_long2};
+  size_t size = 0;
+
+  for(auto str: msg) {
+    size += strlen(str) + 1;
+  }
+
+  size *= num_threads;
+  size += ((num_threads + 2) / 3) * (strlen(msg_short) + 1);
+
+  return size;
+}
+
 static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) {
  CaptureStream capture(stdout);

@@ -245,12 +272,16 @@ static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) {
  capture.End();

  for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
    if (ii % 3 == 0) {
      HIPASSERT(retval[ii] ==
                strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3);
    } else {
      HIPASSERT(retval[ii] == strlen(msg_long1) + strlen(msg_long2) + 2);
    }
+#else
+    HIPASSERT(retval[ii] == (ii % 3 ? 2 : 3));
+#endif
  }

  std::string data = capture.getData();
@@ -276,6 +307,18 @@ __global__ void kernel_numbers() {
  }
 }

+size_t get_numbers_size(uint num_threads) {
+  char buf[100] = { 0 };
+  size_t size = 0;
+  for (uint tid = 0; tid < num_threads; tid++) {
+    for (uint i = 0; i != 7; ++i) {
+      uint base = tid * 21 + i * 3;
+      size += snprintf(buf, 100, "%d %d %d\n", base, base + 1, base + 2);
+    }
+  }
+  return size;
+}
+
 static void test_numbers(uint num_blocks, uint threads_per_block) {
  CaptureStream capture(stdout);
  uint num_threads = num_blocks * threads_per_block;
@@ -309,7 +352,23 @@ int main(int argc, char **argv) {
  uint num_blocks = 150;
  uint threads_per_block = 250;
  uint num_threads = num_blocks * threads_per_block;
+#ifdef __HIP_PLATFORM_NVIDIA__
+  // By default, Cuda has different printf ring buffer size in different GPUs(or ENVs).
+  // For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests.
+  // We have to detect, compare and set it
+  size_t size_mixed3 = get_mixed3_size(num_threads);
+  size_t size_numbers = get_numbers_size(num_threads);
+  size_t size_max = size_mixed3 >= size_numbers ? size_mixed3 : size_numbers;  // Max size
+  size_t size_expected = size_max * 10;  // Cuda printf buffer format is unknown, but test shows 10 times can work here.
+  size_t size_current = 0;
+  HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize));
+  printf("size_mixed3 = %zu, size_numbers = %zu\n", size_mixed3, size_numbers);
+  printf("max size = %zu, expected %zu, current %zu\n", size_max, size_expected, size_current);

+  if(size_current < size_expected) {
+    HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected));
+  }
+#endif
  void *retval_void;
  HIPCHECK(hipHostMalloc(&retval_void, 4 * num_threads));
  auto retval = reinterpret_cast<int *>(retval_void);
@@ -319,6 +378,5 @@ int main(int argc, char **argv) {
  test_mixed2(retval, num_blocks, threads_per_block);
  test_mixed3(retval, num_blocks, threads_per_block);
  test_numbers(num_blocks, threads_per_block);
-
  passed();
 }
@@ -21,8 +21,8 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
 * HIT_END
 */

@@ -32,7 +32,6 @@ THE SOFTWARE.
 __global__ void test_kernel() {
  const char *N = nullptr;
  const char *s = "hello world";
-
  printf("xyzzy\n");
  printf("%%\n");
  printf("hello %% world\n");
@@ -46,7 +45,11 @@ __global__ void test_kernel() {
  printf("%d\n", -42);
  printf("%u\n", 42);
  printf("%f\n", 123.456);
+#ifdef __HIP_PLATFORM_AMD__
  printf("%F\n", -123.456);
+#else
+  printf("%f\n", -123.456);
+#endif
  printf("%e\n", -123.456);
  printf("%E\n", 123.456);
  printf("%g\n", 123.456);
@@ -54,11 +57,37 @@ __global__ void test_kernel() {
  printf("%c\n", 'x');
  printf("%s\n", N);
  printf("%p\n", N);
+#ifdef __HIP_PLATFORM_AMD__
  printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 5, s, (void *)0xf01dab1eca55e77e);
+#else
+  // In Cuda, printf doesn't support %.*, %*.*
+  printf("%.8f %8.5s %p\n", 3.14159, s, (void *)0xf01dab1eca55e77e);
+#endif
 }

 int main(int argc, char **argv) {
-#if !defined(_WIN32)
+#ifdef __HIP_PLATFORM_NVIDIA__
+  std::string reference(R"here(xyzzy
+%
+hello % world
+%s
+%s0xf01dab1eca55e77e
+%cxyzzy
+sep
+-42
+42
+123.456000
+-123.456000
+-1.234560e+02
+1.234560E+02
+123.456
+-123.456
+x
+(null)
+(nil)
+3.14159000    hello 0xf01dab1eca55e77e
+)here");
+#elif !defined(_WIN32)
  std::string reference(R"here(xyzzy
 %
 hello % world
@@ -21,8 +21,8 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
 * HIT_END
 */

@@ -31,9 +31,16 @@ THE SOFTWARE.

 __global__ void test_kernel() {
  printf("%*d\n", 16, 42);
+#ifdef __HIP_PLATFORM_AMD__
  printf("%.*d\n", 8, 42);
  printf("%*.*d\n", -16, 8, 42);
  printf("%*.*f %s * %.*s\n", 16, 8, 123.456, "hello", 5, "worldxyz");
+#else
+  // In Cuda, printf doesn't support %.*, %*.*
+  printf("%.8d\n", 42);
+  printf("%-16.8d\n", 42);
+  printf("%16.8f %s * %.5s\n", 123.456, "hello", "worldxyz");
+#endif
 }

 int main(int argc, char **argv) {