SWDEV-294588 - Enable NV printf DTests
Enable NV printf DTests as many as possible. Fix the bugs due to behavour difference between Hip-Rocclr and Cuda. Add hipLimitPrintfFifoSize. Change-Id: I3fe6dbc35a7a140a9919df197b7885df83d28049
This commit is contained in:
@@ -851,7 +851,7 @@
|
||||
| |*`cudaGraphNodeTypeCount`* | 10.0 | |
|
||||
| enum |***`cudaLimit`*** | |***`hipLimit_t`*** |
|
||||
| 0x00 |*`cudaLimitStackSize`* | | |
|
||||
| 0x01 |*`cudaLimitPrintfFifoSize`* | | |
|
||||
| 0x01 |*`cudaLimitPrintfFifoSize`* | |*`hipLimitPrintfFifoSize`* |
|
||||
| 0x02 |*`cudaLimitMallocHeapSize`* | |*`hipLimitMallocHeapSize`* |
|
||||
| 0x03 |*`cudaLimitDevRuntimeSyncDepth`* | | |
|
||||
| 0x04 |*`cudaLimitDevRuntimePendingLaunchCount`* | | |
|
||||
|
||||
@@ -529,6 +529,7 @@ typedef struct hipFuncAttributes {
|
||||
} hipFuncAttributes;
|
||||
typedef struct ihipEvent_t* hipEvent_t;
|
||||
enum hipLimit_t {
|
||||
hipLimitPrintfFifoSize = 0x01,
|
||||
hipLimitMallocHeapSize = 0x02,
|
||||
};
|
||||
/**
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
/*
|
||||
Copyright (c) 2015-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
|
||||
@@ -21,8 +21,8 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
|
||||
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -35,7 +35,11 @@ __global__ void test_kernel() {
|
||||
printf("%#X\n", 0x42);
|
||||
printf("%#08x\n", 0x42);
|
||||
printf("%#f\n", -123.456);
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
printf("%#F\n", 123.456);
|
||||
#else
|
||||
printf("%#f\n", 123.456); // In Cuda, printf only supports "%cdiouxXpeEfgGaAs"
|
||||
#endif
|
||||
printf("%#e\n", 123.456);
|
||||
printf("%#E\n", -123.456);
|
||||
printf("%#g\n", -123.456);
|
||||
|
||||
@@ -21,14 +21,15 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
|
||||
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#include "printf_common.h"
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
// Global string constants don't work inside device functions, so we
|
||||
// use a macro to repeat the declaration in host and device contexts.
|
||||
@@ -36,7 +37,8 @@ DECLARE_DATA();
|
||||
|
||||
__global__ void kernel_uniform0(int *retval) {
|
||||
uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
retval[tid] = printf("Hello World\n");
|
||||
retval[tid] = printf("Hello World\n"); // In Hip-Rocclr, printf returns number of characters printed.
|
||||
// In Cuda, printf returns the number of arguments parsed.
|
||||
}
|
||||
|
||||
static void test_uniform0(int *retval, uint num_blocks,
|
||||
@@ -55,7 +57,11 @@ static void test_uniform0(int *retval, uint num_blocks,
|
||||
capture.End();
|
||||
|
||||
for (uint ii = 0; ii != num_threads; ++ii) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
HIPASSERT(retval[ii] == strlen("Hello World\n"));
|
||||
#else
|
||||
HIPASSERT(retval[ii] == 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string data = capture.getData();
|
||||
@@ -92,7 +98,11 @@ static void test_uniform1(int *retval, uint num_blocks,
|
||||
capture.End();
|
||||
|
||||
for (uint ii = 0; ii != num_threads; ++ii) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
HIPASSERT(retval[ii] == strlen("Six times Eight is 42") + 1);
|
||||
#else
|
||||
HIPASSERT(retval[ii] == 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string data = capture.getData();
|
||||
@@ -129,11 +139,19 @@ static void test_divergent0(int *retval, uint num_blocks,
|
||||
capture.End();
|
||||
|
||||
for (uint ii = 0; ii != 10; ++ii) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
HIPASSERT(retval[ii] == 13);
|
||||
#else
|
||||
HIPASSERT(retval[ii] == 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
for (uint ii = 10; ii != num_threads; ++ii) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
HIPASSERT(retval[ii] == 14);
|
||||
#else
|
||||
HIPASSERT(retval[ii] == 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string data = capture.getData();
|
||||
@@ -178,7 +196,11 @@ static void test_divergent1(int *retval, uint num_blocks,
|
||||
|
||||
for (uint ii = 0; ii != num_threads; ++ii) {
|
||||
if (ii % 2) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
HIPASSERT(retval[ii] == strlen("Hello World\n"));
|
||||
#else
|
||||
HIPASSERT(retval[ii] == 0);
|
||||
#endif
|
||||
} else {
|
||||
HIPASSERT(retval[ii] == -1);
|
||||
}
|
||||
@@ -202,11 +224,9 @@ __global__ void kernel_series(int *retval) {
|
||||
|
||||
const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int result = 0;
|
||||
|
||||
result += printf("%s\n", msg_long1);
|
||||
result += printf("%s\n", msg_short);
|
||||
result += printf("%s\n", msg_long2);
|
||||
|
||||
retval[tid] = result;
|
||||
}
|
||||
|
||||
@@ -225,8 +245,12 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) {
|
||||
capture.End();
|
||||
|
||||
for (uint ii = 0; ii != num_threads; ++ii) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
HIPASSERT(retval[ii] ==
|
||||
strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3);
|
||||
#else
|
||||
HIPASSERT(retval[ii] == 3);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string data = capture.getData();
|
||||
@@ -245,8 +269,6 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) {
|
||||
}
|
||||
|
||||
__global__ void kernel_divergent_loop() {
|
||||
DECLARE_DATA();
|
||||
|
||||
const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int result = 0;
|
||||
|
||||
|
||||
Regular → Executable
+32
-3
@@ -21,8 +21,8 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
|
||||
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -44,6 +44,22 @@ __global__ void print_things() {
|
||||
printf("%s\n", msg[(tid + 2) % 3]);
|
||||
}
|
||||
|
||||
size_t get_things_size(uint threads_per_device, uint num_devices) {
|
||||
DECLARE_DATA();
|
||||
const char *msg[] = {msg_short, msg_long1, msg_long2};
|
||||
uint num_threads = threads_per_device * num_devices;
|
||||
size_t size = 0;
|
||||
|
||||
for(auto str: msg) {
|
||||
size += strlen(str) + 1;
|
||||
}
|
||||
|
||||
size *= num_threads;
|
||||
size += ((threads_per_device + 2) / 3) * num_devices * (strlen(msg_short) + 1);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
int main() {
|
||||
uint num_blocks = 14;
|
||||
uint threads_per_block = 250;
|
||||
@@ -53,6 +69,20 @@ int main() {
|
||||
|
||||
int num_devices = 0;
|
||||
hipGetDeviceCount(&num_devices);
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
// By default, Cuda has different printf ring buffer size in different GPUs(or ENVs).
|
||||
// For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests.
|
||||
// We have to detect, compare and set it
|
||||
size_t size = get_things_size(threads_per_device, num_devices);
|
||||
size_t size_expected = size * 4; // Cuda printf buffer format is unknown, but test shows 4 times can work here.
|
||||
size_t size_current = 0;
|
||||
HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize));
|
||||
printf("things size = %zu, expected %zu, current %zu\n", size, size_expected, size_current);
|
||||
|
||||
if(size_current < size_expected) {
|
||||
HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected));
|
||||
}
|
||||
#endif
|
||||
capture.Begin();
|
||||
for (int i = 0; i != num_devices; ++i) {
|
||||
hipSetDevice(i);
|
||||
@@ -77,6 +107,5 @@ int main() {
|
||||
HIPASSERT(linecount[msg_long2] == num_threads);
|
||||
HIPASSERT(linecount[msg_short] ==
|
||||
num_threads + ((threads_per_device + 2) / 3) * num_devices);
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -21,14 +21,15 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
|
||||
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#include "printf_common.h"
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
// Global string constants don't work inside device functions, so we
|
||||
// use a macro to repeat the declaration in host and device contexts.
|
||||
@@ -38,7 +39,6 @@ __global__ void kernel_mixed0(int *retval) {
|
||||
DECLARE_DATA();
|
||||
|
||||
uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
ulong result = 0;
|
||||
|
||||
// Three strings passed as divergent values to the same hostcall.
|
||||
const char *msg;
|
||||
@@ -72,6 +72,7 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) {
|
||||
capture.End();
|
||||
|
||||
for (uint ii = 0; ii != num_threads; ++ii) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
switch (ii % 3) {
|
||||
case 0:
|
||||
HIPASSERT(retval[ii] == strlen(msg_short) + 1);
|
||||
@@ -83,6 +84,9 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) {
|
||||
HIPASSERT(retval[ii] == strlen(msg_long2) + 1);
|
||||
break;
|
||||
}
|
||||
#else
|
||||
HIPASSERT(retval[ii] == 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string data = capture.getData();
|
||||
@@ -134,6 +138,7 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) {
|
||||
capture.End();
|
||||
|
||||
for (uint ii = 0; ii != num_threads; ++ii) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
switch (ii % 3) {
|
||||
case 0:
|
||||
HIPASSERT(retval[ii] == strlen(msg_short) + 1);
|
||||
@@ -145,6 +150,9 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) {
|
||||
HIPASSERT(retval[ii] == strlen(msg_long2) + 1);
|
||||
break;
|
||||
}
|
||||
#else
|
||||
HIPASSERT(retval[ii] == 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string data = capture.getData();
|
||||
@@ -189,8 +197,12 @@ static void test_mixed2(int *retval, uint num_blocks, uint threads_per_block) {
|
||||
capture.End();
|
||||
|
||||
for (uint ii = 0; ii != num_threads; ++ii) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
HIPASSERT(retval[ii] ==
|
||||
strlen(msg_short) + strlen(msg_long1) + strlen(msg_long2) + 1);
|
||||
#else
|
||||
HIPASSERT(retval[ii] == 3);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string data = capture.getData();
|
||||
@@ -230,6 +242,21 @@ __global__ void kernel_mixed3(int *retval) {
|
||||
retval[tid] = result;
|
||||
}
|
||||
|
||||
size_t get_mixed3_size(uint num_threads) {
|
||||
DECLARE_DATA();
|
||||
const char *msg[] = {msg_long1, msg_long2};
|
||||
size_t size = 0;
|
||||
|
||||
for(auto str: msg) {
|
||||
size += strlen(str) + 1;
|
||||
}
|
||||
|
||||
size *= num_threads;
|
||||
size += ((num_threads + 2) / 3) * (strlen(msg_short) + 1);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) {
|
||||
CaptureStream capture(stdout);
|
||||
|
||||
@@ -245,12 +272,16 @@ static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) {
|
||||
capture.End();
|
||||
|
||||
for (uint ii = 0; ii != num_threads; ++ii) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
if (ii % 3 == 0) {
|
||||
HIPASSERT(retval[ii] ==
|
||||
strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3);
|
||||
} else {
|
||||
HIPASSERT(retval[ii] == strlen(msg_long1) + strlen(msg_long2) + 2);
|
||||
}
|
||||
#else
|
||||
HIPASSERT(retval[ii] == (ii % 3 ? 2 : 3));
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string data = capture.getData();
|
||||
@@ -276,6 +307,18 @@ __global__ void kernel_numbers() {
|
||||
}
|
||||
}
|
||||
|
||||
size_t get_numbers_size(uint num_threads) {
|
||||
char buf[100] = { 0 };
|
||||
size_t size = 0;
|
||||
for (uint tid = 0; tid < num_threads; tid++) {
|
||||
for (uint i = 0; i != 7; ++i) {
|
||||
uint base = tid * 21 + i * 3;
|
||||
size += snprintf(buf, 100, "%d %d %d\n", base, base + 1, base + 2);
|
||||
}
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static void test_numbers(uint num_blocks, uint threads_per_block) {
|
||||
CaptureStream capture(stdout);
|
||||
uint num_threads = num_blocks * threads_per_block;
|
||||
@@ -309,7 +352,23 @@ int main(int argc, char **argv) {
|
||||
uint num_blocks = 150;
|
||||
uint threads_per_block = 250;
|
||||
uint num_threads = num_blocks * threads_per_block;
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
// By default, Cuda has different printf ring buffer size in different GPUs(or ENVs).
|
||||
// For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests.
|
||||
// We have to detect, compare and set it
|
||||
size_t size_mixed3 = get_mixed3_size(num_threads);
|
||||
size_t size_numbers = get_numbers_size(num_threads);
|
||||
size_t size_max = size_mixed3 >= size_numbers ? size_mixed3 : size_numbers; // Max size
|
||||
size_t size_expected = size_max * 10; // Cuda printf buffer format is unknown, but test shows 10 times can work here.
|
||||
size_t size_current = 0;
|
||||
HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize));
|
||||
printf("size_mixed3 = %zu, size_numbers = %zu\n", size_mixed3, size_numbers);
|
||||
printf("max size = %zu, expected %zu, current %zu\n", size_max, size_expected, size_current);
|
||||
|
||||
if(size_current < size_expected) {
|
||||
HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected));
|
||||
}
|
||||
#endif
|
||||
void *retval_void;
|
||||
HIPCHECK(hipHostMalloc(&retval_void, 4 * num_threads));
|
||||
auto retval = reinterpret_cast<int *>(retval_void);
|
||||
@@ -319,6 +378,5 @@ int main(int argc, char **argv) {
|
||||
test_mixed2(retval, num_blocks, threads_per_block);
|
||||
test_mixed3(retval, num_blocks, threads_per_block);
|
||||
test_numbers(num_blocks, threads_per_block);
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -21,8 +21,8 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
|
||||
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -32,7 +32,6 @@ THE SOFTWARE.
|
||||
__global__ void test_kernel() {
|
||||
const char *N = nullptr;
|
||||
const char *s = "hello world";
|
||||
|
||||
printf("xyzzy\n");
|
||||
printf("%%\n");
|
||||
printf("hello %% world\n");
|
||||
@@ -46,7 +45,11 @@ __global__ void test_kernel() {
|
||||
printf("%d\n", -42);
|
||||
printf("%u\n", 42);
|
||||
printf("%f\n", 123.456);
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
printf("%F\n", -123.456);
|
||||
#else
|
||||
printf("%f\n", -123.456);
|
||||
#endif
|
||||
printf("%e\n", -123.456);
|
||||
printf("%E\n", 123.456);
|
||||
printf("%g\n", 123.456);
|
||||
@@ -54,11 +57,37 @@ __global__ void test_kernel() {
|
||||
printf("%c\n", 'x');
|
||||
printf("%s\n", N);
|
||||
printf("%p\n", N);
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 5, s, (void *)0xf01dab1eca55e77e);
|
||||
#else
|
||||
// In Cuda, printf doesn't support %.*, %*.*
|
||||
printf("%.8f %8.5s %p\n", 3.14159, s, (void *)0xf01dab1eca55e77e);
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
#if !defined(_WIN32)
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
std::string reference(R"here(xyzzy
|
||||
%
|
||||
hello % world
|
||||
%s
|
||||
%s0xf01dab1eca55e77e
|
||||
%cxyzzy
|
||||
sep
|
||||
-42
|
||||
42
|
||||
123.456000
|
||||
-123.456000
|
||||
-1.234560e+02
|
||||
1.234560E+02
|
||||
123.456
|
||||
-123.456
|
||||
x
|
||||
(null)
|
||||
(nil)
|
||||
3.14159000 hello 0xf01dab1eca55e77e
|
||||
)here");
|
||||
#elif !defined(_WIN32)
|
||||
std::string reference(R"here(xyzzy
|
||||
%
|
||||
hello % world
|
||||
|
||||
@@ -21,8 +21,8 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
|
||||
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -31,9 +31,16 @@ THE SOFTWARE.
|
||||
|
||||
__global__ void test_kernel() {
|
||||
printf("%*d\n", 16, 42);
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
printf("%.*d\n", 8, 42);
|
||||
printf("%*.*d\n", -16, 8, 42);
|
||||
printf("%*.*f %s * %.*s\n", 16, 8, 123.456, "hello", 5, "worldxyz");
|
||||
#else
|
||||
// In Cuda, printf doesn't support %.*, %*.*
|
||||
printf("%.8d\n", 42);
|
||||
printf("%-16.8d\n", 42);
|
||||
printf("%16.8f %s * %.5s\n", 123.456, "hello", "worldxyz");
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user