SWDEV-294588 - Enable NV printf DTests

Enable NV printf DTests as many as possible.
Fix the bugs due to behavour difference between
Hip-Rocclr and Cuda.
Add hipLimitPrintfFifoSize.

Change-Id: I3fe6dbc35a7a140a9919df197b7885df83d28049
This commit is contained in:
Tao Sang
2021-07-12 22:51:48 -04:00
committato da Tao Sang
parent b7c0b21814
commit 586165ebc2
9 ha cambiato i file con 173 aggiunte e 24 eliminazioni
@@ -851,7 +851,7 @@
| |*`cudaGraphNodeTypeCount`* | 10.0 | |
| enum |***`cudaLimit`*** | |***`hipLimit_t`*** |
| 0x00 |*`cudaLimitStackSize`* | | |
| 0x01 |*`cudaLimitPrintfFifoSize`* | | |
| 0x01 |*`cudaLimitPrintfFifoSize`* | |*`hipLimitPrintfFifoSize`* |
| 0x02 |*`cudaLimitMallocHeapSize`* | |*`hipLimitMallocHeapSize`* |
| 0x03 |*`cudaLimitDevRuntimeSyncDepth`* | | |
| 0x04 |*`cudaLimitDevRuntimePendingLaunchCount`* | | |
+1
Vedi File
@@ -529,6 +529,7 @@ typedef struct hipFuncAttributes {
} hipFuncAttributes;
typedef struct ihipEvent_t* hipEvent_t;
enum hipLimit_t {
hipLimitPrintfFifoSize = 0x01,
hipLimitMallocHeapSize = 0x02,
};
/**
@@ -1,4 +1,3 @@
#include "hip/hip_runtime.h"
/*
Copyright (c) 2015-2021 Advanced Micro Devices, Inc. All rights reserved.
+6 -2
Vedi File
@@ -21,8 +21,8 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s
* TEST: %t
* HIT_END
*/
@@ -35,7 +35,11 @@ __global__ void test_kernel() {
printf("%#X\n", 0x42);
printf("%#08x\n", 0x42);
printf("%#f\n", -123.456);
#ifdef __HIP_PLATFORM_AMD__
printf("%#F\n", 123.456);
#else
printf("%#f\n", 123.456); // In Cuda, printf only supports "%cdiouxXpeEfgGaAs"
#endif
printf("%#e\n", 123.456);
printf("%#E\n", -123.456);
printf("%#g\n", -123.456);
+29 -7
Vedi File
@@ -21,14 +21,15 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s
* TEST: %t
* HIT_END
*/
#include "test_common.h"
#include "printf_common.h"
#include <vector>
#include <algorithm>
// Global string constants don't work inside device functions, so we
// use a macro to repeat the declaration in host and device contexts.
@@ -36,7 +37,8 @@ DECLARE_DATA();
__global__ void kernel_uniform0(int *retval) {
uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
retval[tid] = printf("Hello World\n");
retval[tid] = printf("Hello World\n"); // In Hip-Rocclr, printf returns number of characters printed.
// In Cuda, printf returns the number of arguments parsed.
}
static void test_uniform0(int *retval, uint num_blocks,
@@ -55,7 +57,11 @@ static void test_uniform0(int *retval, uint num_blocks,
capture.End();
for (uint ii = 0; ii != num_threads; ++ii) {
#ifdef __HIP_PLATFORM_AMD__
HIPASSERT(retval[ii] == strlen("Hello World\n"));
#else
HIPASSERT(retval[ii] == 0);
#endif
}
std::string data = capture.getData();
@@ -92,7 +98,11 @@ static void test_uniform1(int *retval, uint num_blocks,
capture.End();
for (uint ii = 0; ii != num_threads; ++ii) {
#ifdef __HIP_PLATFORM_AMD__
HIPASSERT(retval[ii] == strlen("Six times Eight is 42") + 1);
#else
HIPASSERT(retval[ii] == 1);
#endif
}
std::string data = capture.getData();
@@ -129,11 +139,19 @@ static void test_divergent0(int *retval, uint num_blocks,
capture.End();
for (uint ii = 0; ii != 10; ++ii) {
#ifdef __HIP_PLATFORM_AMD__
HIPASSERT(retval[ii] == 13);
#else
HIPASSERT(retval[ii] == 1);
#endif
}
for (uint ii = 10; ii != num_threads; ++ii) {
#ifdef __HIP_PLATFORM_AMD__
HIPASSERT(retval[ii] == 14);
#else
HIPASSERT(retval[ii] == 1);
#endif
}
std::string data = capture.getData();
@@ -178,7 +196,11 @@ static void test_divergent1(int *retval, uint num_blocks,
for (uint ii = 0; ii != num_threads; ++ii) {
if (ii % 2) {
#ifdef __HIP_PLATFORM_AMD__
HIPASSERT(retval[ii] == strlen("Hello World\n"));
#else
HIPASSERT(retval[ii] == 0);
#endif
} else {
HIPASSERT(retval[ii] == -1);
}
@@ -202,11 +224,9 @@ __global__ void kernel_series(int *retval) {
const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int result = 0;
result += printf("%s\n", msg_long1);
result += printf("%s\n", msg_short);
result += printf("%s\n", msg_long2);
retval[tid] = result;
}
@@ -225,8 +245,12 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) {
capture.End();
for (uint ii = 0; ii != num_threads; ++ii) {
#ifdef __HIP_PLATFORM_AMD__
HIPASSERT(retval[ii] ==
strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3);
#else
HIPASSERT(retval[ii] == 3);
#endif
}
std::string data = capture.getData();
@@ -245,8 +269,6 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) {
}
__global__ void kernel_divergent_loop() {
DECLARE_DATA();
const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int result = 0;
+32 -3
Vedi File
@@ -21,8 +21,8 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s
* TEST: %t
* HIT_END
*/
@@ -44,6 +44,22 @@ __global__ void print_things() {
printf("%s\n", msg[(tid + 2) % 3]);
}
size_t get_things_size(uint threads_per_device, uint num_devices) {
DECLARE_DATA();
const char *msg[] = {msg_short, msg_long1, msg_long2};
uint num_threads = threads_per_device * num_devices;
size_t size = 0;
for(auto str: msg) {
size += strlen(str) + 1;
}
size *= num_threads;
size += ((threads_per_device + 2) / 3) * num_devices * (strlen(msg_short) + 1);
return size;
}
int main() {
uint num_blocks = 14;
uint threads_per_block = 250;
@@ -53,6 +69,20 @@ int main() {
int num_devices = 0;
hipGetDeviceCount(&num_devices);
#ifdef __HIP_PLATFORM_NVIDIA__
// By default, Cuda has different printf ring buffer size in different GPUs(or ENVs).
// For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests.
// We have to detect, compare and set it
size_t size = get_things_size(threads_per_device, num_devices);
size_t size_expected = size * 4; // Cuda printf buffer format is unknown, but test shows 4 times can work here.
size_t size_current = 0;
HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize));
printf("things size = %zu, expected %zu, current %zu\n", size, size_expected, size_current);
if(size_current < size_expected) {
HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected));
}
#endif
capture.Begin();
for (int i = 0; i != num_devices; ++i) {
hipSetDevice(i);
@@ -77,6 +107,5 @@ int main() {
HIPASSERT(linecount[msg_long2] == num_threads);
HIPASSERT(linecount[msg_short] ==
num_threads + ((threads_per_device + 2) / 3) * num_devices);
passed();
}
+62 -4
Vedi File
@@ -21,14 +21,15 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s
* TEST: %t
* HIT_END
*/
#include "test_common.h"
#include "printf_common.h"
#include <vector>
#include <algorithm>
// Global string constants don't work inside device functions, so we
// use a macro to repeat the declaration in host and device contexts.
@@ -38,7 +39,6 @@ __global__ void kernel_mixed0(int *retval) {
DECLARE_DATA();
uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
ulong result = 0;
// Three strings passed as divergent values to the same hostcall.
const char *msg;
@@ -72,6 +72,7 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) {
capture.End();
for (uint ii = 0; ii != num_threads; ++ii) {
#ifdef __HIP_PLATFORM_AMD__
switch (ii % 3) {
case 0:
HIPASSERT(retval[ii] == strlen(msg_short) + 1);
@@ -83,6 +84,9 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) {
HIPASSERT(retval[ii] == strlen(msg_long2) + 1);
break;
}
#else
HIPASSERT(retval[ii] == 1);
#endif
}
std::string data = capture.getData();
@@ -134,6 +138,7 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) {
capture.End();
for (uint ii = 0; ii != num_threads; ++ii) {
#ifdef __HIP_PLATFORM_AMD__
switch (ii % 3) {
case 0:
HIPASSERT(retval[ii] == strlen(msg_short) + 1);
@@ -145,6 +150,9 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) {
HIPASSERT(retval[ii] == strlen(msg_long2) + 1);
break;
}
#else
HIPASSERT(retval[ii] == 1);
#endif
}
std::string data = capture.getData();
@@ -189,8 +197,12 @@ static void test_mixed2(int *retval, uint num_blocks, uint threads_per_block) {
capture.End();
for (uint ii = 0; ii != num_threads; ++ii) {
#ifdef __HIP_PLATFORM_AMD__
HIPASSERT(retval[ii] ==
strlen(msg_short) + strlen(msg_long1) + strlen(msg_long2) + 1);
#else
HIPASSERT(retval[ii] == 3);
#endif
}
std::string data = capture.getData();
@@ -230,6 +242,21 @@ __global__ void kernel_mixed3(int *retval) {
retval[tid] = result;
}
size_t get_mixed3_size(uint num_threads) {
DECLARE_DATA();
const char *msg[] = {msg_long1, msg_long2};
size_t size = 0;
for(auto str: msg) {
size += strlen(str) + 1;
}
size *= num_threads;
size += ((num_threads + 2) / 3) * (strlen(msg_short) + 1);
return size;
}
static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) {
CaptureStream capture(stdout);
@@ -245,12 +272,16 @@ static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) {
capture.End();
for (uint ii = 0; ii != num_threads; ++ii) {
#ifdef __HIP_PLATFORM_AMD__
if (ii % 3 == 0) {
HIPASSERT(retval[ii] ==
strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3);
} else {
HIPASSERT(retval[ii] == strlen(msg_long1) + strlen(msg_long2) + 2);
}
#else
HIPASSERT(retval[ii] == (ii % 3 ? 2 : 3));
#endif
}
std::string data = capture.getData();
@@ -276,6 +307,18 @@ __global__ void kernel_numbers() {
}
}
size_t get_numbers_size(uint num_threads) {
char buf[100] = { 0 };
size_t size = 0;
for (uint tid = 0; tid < num_threads; tid++) {
for (uint i = 0; i != 7; ++i) {
uint base = tid * 21 + i * 3;
size += snprintf(buf, 100, "%d %d %d\n", base, base + 1, base + 2);
}
}
return size;
}
static void test_numbers(uint num_blocks, uint threads_per_block) {
CaptureStream capture(stdout);
uint num_threads = num_blocks * threads_per_block;
@@ -309,7 +352,23 @@ int main(int argc, char **argv) {
uint num_blocks = 150;
uint threads_per_block = 250;
uint num_threads = num_blocks * threads_per_block;
#ifdef __HIP_PLATFORM_NVIDIA__
// By default, Cuda has different printf ring buffer size in different GPUs(or ENVs).
// For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests.
// We have to detect, compare and set it
size_t size_mixed3 = get_mixed3_size(num_threads);
size_t size_numbers = get_numbers_size(num_threads);
size_t size_max = size_mixed3 >= size_numbers ? size_mixed3 : size_numbers; // Max size
size_t size_expected = size_max * 10; // Cuda printf buffer format is unknown, but test shows 10 times can work here.
size_t size_current = 0;
HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize));
printf("size_mixed3 = %zu, size_numbers = %zu\n", size_mixed3, size_numbers);
printf("max size = %zu, expected %zu, current %zu\n", size_max, size_expected, size_current);
if(size_current < size_expected) {
HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected));
}
#endif
void *retval_void;
HIPCHECK(hipHostMalloc(&retval_void, 4 * num_threads));
auto retval = reinterpret_cast<int *>(retval_void);
@@ -319,6 +378,5 @@ int main(int argc, char **argv) {
test_mixed2(retval, num_blocks, threads_per_block);
test_mixed3(retval, num_blocks, threads_per_block);
test_numbers(num_blocks, threads_per_block);
passed();
}
+33 -4
Vedi File
@@ -21,8 +21,8 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s
* TEST: %t
* HIT_END
*/
@@ -32,7 +32,6 @@ THE SOFTWARE.
__global__ void test_kernel() {
const char *N = nullptr;
const char *s = "hello world";
printf("xyzzy\n");
printf("%%\n");
printf("hello %% world\n");
@@ -46,7 +45,11 @@ __global__ void test_kernel() {
printf("%d\n", -42);
printf("%u\n", 42);
printf("%f\n", 123.456);
#ifdef __HIP_PLATFORM_AMD__
printf("%F\n", -123.456);
#else
printf("%f\n", -123.456);
#endif
printf("%e\n", -123.456);
printf("%E\n", 123.456);
printf("%g\n", 123.456);
@@ -54,11 +57,37 @@ __global__ void test_kernel() {
printf("%c\n", 'x');
printf("%s\n", N);
printf("%p\n", N);
#ifdef __HIP_PLATFORM_AMD__
printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 5, s, (void *)0xf01dab1eca55e77e);
#else
// In Cuda, printf doesn't support %.*, %*.*
printf("%.8f %8.5s %p\n", 3.14159, s, (void *)0xf01dab1eca55e77e);
#endif
}
int main(int argc, char **argv) {
#if !defined(_WIN32)
#ifdef __HIP_PLATFORM_NVIDIA__
std::string reference(R"here(xyzzy
%
hello % world
%s
%s0xf01dab1eca55e77e
%cxyzzy
sep
-42
42
123.456000
-123.456000
-1.234560e+02
1.234560E+02
123.456
-123.456
x
(null)
(nil)
3.14159000 hello 0xf01dab1eca55e77e
)here");
#elif !defined(_WIN32)
std::string reference(R"here(xyzzy
%
hello % world
+9 -2
Vedi File
@@ -21,8 +21,8 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s
* TEST: %t
* HIT_END
*/
@@ -31,9 +31,16 @@ THE SOFTWARE.
__global__ void test_kernel() {
printf("%*d\n", 16, 42);
#ifdef __HIP_PLATFORM_AMD__
printf("%.*d\n", 8, 42);
printf("%*.*d\n", -16, 8, 42);
printf("%*.*f %s * %.*s\n", 16, 8, 123.456, "hello", 5, "worldxyz");
#else
// In Cuda, printf doesn't support %.*, %*.*
printf("%.8d\n", 42);
printf("%-16.8d\n", 42);
printf("%16.8f %s * %.5s\n", 123.456, "hello", "worldxyz");
#endif
}
int main(int argc, char **argv) {