diff --git a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
index b9c9708ccb..3aa0b0c08a 100644
--- a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
+++ b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
@@ -851,7 +851,7 @@
 |              |*`cudaGraphNodeTypeCount`*                           | 10.0             |                                                            |
 | enum         |***`cudaLimit`***                                    |                  |***`hipLimit_t`***                                          |
 |         0x00 |*`cudaLimitStackSize`*                               |                  |                                                            |
-|         0x01 |*`cudaLimitPrintfFifoSize`*                          |                  |                                                            |
+|         0x01 |*`cudaLimitPrintfFifoSize`*                          |                  |*`hipLimitPrintfFifoSize`*                                  |
 |         0x02 |*`cudaLimitMallocHeapSize`*                          |                  |*`hipLimitMallocHeapSize`*                                  |
 |         0x03 |*`cudaLimitDevRuntimeSyncDepth`*                     |                  |                                                            |
 |         0x04 |*`cudaLimitDevRuntimePendingLaunchCount`*            |                  |                                                            |
diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index 719248689e..a59e7e3e3a 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -529,6 +529,7 @@ typedef struct hipFuncAttributes {
 } hipFuncAttributes;
 typedef struct ihipEvent_t* hipEvent_t;
 enum hipLimit_t {
+    hipLimitPrintfFifoSize = 0x01,
     hipLimitMallocHeapSize = 0x02,
 };
 /**
diff --git a/samples/2_Cookbook/18_cmake_hip_device/square.cpp b/samples/2_Cookbook/18_cmake_hip_device/square.cpp
index 971488d994..a13f58b41e 100644
--- a/samples/2_Cookbook/18_cmake_hip_device/square.cpp
+++ b/samples/2_Cookbook/18_cmake_hip_device/square.cpp
@@ -1,4 +1,3 @@
-#include "hip/hip_runtime.h"
 /*
 Copyright (c) 2015-2021 Advanced Micro Devices, Inc. All rights reserved.
 
diff --git a/tests/src/printf/hipPrintfAltForms.cpp b/tests/src/printf/hipPrintfAltForms.cpp
index a396ac293b..5185b76785 100644
--- a/tests/src/printf/hipPrintfAltForms.cpp
+++ b/tests/src/printf/hipPrintfAltForms.cpp
@@ -21,8 +21,8 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
  * HIT_END
  */
 
@@ -35,7 +35,11 @@ __global__ void test_kernel() {
   printf("%#X\n", 0x42);
   printf("%#08x\n", 0x42);
   printf("%#f\n", -123.456);
+#ifdef __HIP_PLATFORM_AMD__
   printf("%#F\n", 123.456);
+#else
+  printf("%#f\n", 123.456); // In Cuda, printf only supports "%cdiouxXpeEfgGaAs"
+#endif
   printf("%#e\n", 123.456);
   printf("%#E\n", -123.456);
   printf("%#g\n", -123.456);
diff --git a/tests/src/printf/hipPrintfBasic.cpp b/tests/src/printf/hipPrintfBasic.cpp
index e663bc127c..8ad578cecc 100644
--- a/tests/src/printf/hipPrintfBasic.cpp
+++ b/tests/src/printf/hipPrintfBasic.cpp
@@ -21,14 +21,15 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
  * HIT_END
  */
 
 #include "test_common.h"
 #include "printf_common.h"
 #include <vector>
+#include <algorithm>
 
 // Global string constants don't work inside device functions, so we
 // use a macro to repeat the declaration in host and device contexts.
@@ -36,7 +37,8 @@ DECLARE_DATA();
 
 __global__ void kernel_uniform0(int *retval) {
   uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-  retval[tid] = printf("Hello World\n");
+  retval[tid] = printf("Hello World\n"); // In Hip-Rocclr, printf returns number of characters printed.
+                                         // In Cuda, printf returns the number of arguments parsed.
 }
 
 static void test_uniform0(int *retval, uint num_blocks,
@@ -55,7 +57,11 @@ static void test_uniform0(int *retval, uint num_blocks,
   capture.End();
 
   for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
     HIPASSERT(retval[ii] == strlen("Hello World\n"));
+#else
+    HIPASSERT(retval[ii] == 0);
+#endif
   }
 
   std::string data = capture.getData();
@@ -92,7 +98,11 @@ static void test_uniform1(int *retval, uint num_blocks,
   capture.End();
 
   for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
     HIPASSERT(retval[ii] == strlen("Six times Eight is 42") + 1);
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
   }
 
   std::string data = capture.getData();
@@ -129,11 +139,19 @@ static void test_divergent0(int *retval, uint num_blocks,
   capture.End();
 
   for (uint ii = 0; ii != 10; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
     HIPASSERT(retval[ii] == 13);
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
   }
 
   for (uint ii = 10; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
     HIPASSERT(retval[ii] == 14);
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
   }
 
   std::string data = capture.getData();
@@ -178,7 +196,11 @@ static void test_divergent1(int *retval, uint num_blocks,
 
   for (uint ii = 0; ii != num_threads; ++ii) {
     if (ii % 2) {
+#ifdef __HIP_PLATFORM_AMD__
       HIPASSERT(retval[ii] == strlen("Hello World\n"));
+#else
+      HIPASSERT(retval[ii] == 0);
+#endif
     } else {
       HIPASSERT(retval[ii] == -1);
     }
@@ -202,11 +224,9 @@ __global__ void kernel_series(int *retval) {
 
   const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   int result = 0;
-
   result += printf("%s\n", msg_long1);
   result += printf("%s\n", msg_short);
   result += printf("%s\n", msg_long2);
-
   retval[tid] = result;
 }
 
@@ -225,8 +245,12 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) {
   capture.End();
 
   for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
     HIPASSERT(retval[ii] ==
               strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3);
+#else
+    HIPASSERT(retval[ii] == 3);
+#endif
   }
 
   std::string data = capture.getData();
@@ -245,8 +269,6 @@ static void test_series(int *retval, uint num_blocks, uint threads_per_block) {
 }
 
 __global__ void kernel_divergent_loop() {
-  DECLARE_DATA();
-
   const uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   int result = 0;
 
diff --git a/tests/src/printf/hipPrintfManyDevices.cpp b/tests/src/printf/hipPrintfManyDevices.cpp
old mode 100644
new mode 100755
index 8e1451a2b0..2e7d99461f
--- a/tests/src/printf/hipPrintfManyDevices.cpp
+++ b/tests/src/printf/hipPrintfManyDevices.cpp
@@ -21,8 +21,8 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
  * HIT_END
  */
 
@@ -44,6 +44,22 @@ __global__ void print_things() {
   printf("%s\n", msg[(tid + 2) % 3]);
 }
 
+size_t get_things_size(uint threads_per_device, uint num_devices) {
+  DECLARE_DATA();
+  const char *msg[] = {msg_short, msg_long1, msg_long2};
+  uint num_threads = threads_per_device * num_devices;
+  size_t size = 0;
+
+  for(auto str: msg) {
+    size += strlen(str) + 1;
+  }
+
+  size *= num_threads;
+  size += ((threads_per_device + 2) / 3) * num_devices * (strlen(msg_short) + 1);
+
+  return size;
+}
+
 int main() {
   uint num_blocks = 14;
   uint threads_per_block = 250;
@@ -53,6 +69,20 @@ int main() {
 
   int num_devices = 0;
   hipGetDeviceCount(&num_devices);
+#ifdef __HIP_PLATFORM_NVIDIA__
+  // By default, Cuda has different printf ring buffer size in different GPUs(or ENVs).
+  // For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests.
+  // We have to detect, compare and set it
+  size_t size = get_things_size(threads_per_device, num_devices);
+  size_t size_expected = size * 4;  // Cuda printf buffer format is unknown, but test shows 4 times can work here.
+  size_t size_current = 0;
+  HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize));
+  printf("things size = %zu, expected %zu, current %zu\n", size, size_expected, size_current);
+
+  if(size_current < size_expected) {
+    HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected));
+  }
+#endif
   capture.Begin();
   for (int i = 0; i != num_devices; ++i) {
     hipSetDevice(i);
@@ -77,6 +107,5 @@ int main() {
   HIPASSERT(linecount[msg_long2] == num_threads);
   HIPASSERT(linecount[msg_short] ==
             num_threads + ((threads_per_device + 2) / 3) * num_devices);
-
   passed();
 }
diff --git a/tests/src/printf/hipPrintfManyWaves.cpp b/tests/src/printf/hipPrintfManyWaves.cpp
index b812950cef..0cb2b16c24 100644
--- a/tests/src/printf/hipPrintfManyWaves.cpp
+++ b/tests/src/printf/hipPrintfManyWaves.cpp
@@ -21,14 +21,15 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
  * HIT_END
  */
 
 #include "test_common.h"
 #include "printf_common.h"
 #include <vector>
+#include <algorithm>
 
 // Global string constants don't work inside device functions, so we
 // use a macro to repeat the declaration in host and device contexts.
@@ -38,7 +39,6 @@ __global__ void kernel_mixed0(int *retval) {
   DECLARE_DATA();
 
   uint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-  ulong result = 0;
 
   // Three strings passed as divergent values to the same hostcall.
   const char *msg;
@@ -72,6 +72,7 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) {
   capture.End();
 
   for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
     switch (ii % 3) {
     case 0:
       HIPASSERT(retval[ii] == strlen(msg_short) + 1);
@@ -83,6 +84,9 @@ static void test_mixed0(int *retval, uint num_blocks, uint threads_per_block) {
       HIPASSERT(retval[ii] == strlen(msg_long2) + 1);
       break;
     }
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
   }
 
   std::string data = capture.getData();
@@ -134,6 +138,7 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) {
   capture.End();
 
   for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
     switch (ii % 3) {
     case 0:
       HIPASSERT(retval[ii] == strlen(msg_short) + 1);
@@ -145,6 +150,9 @@ static void test_mixed1(int *retval, uint num_blocks, uint threads_per_block) {
       HIPASSERT(retval[ii] == strlen(msg_long2) + 1);
       break;
     }
+#else
+    HIPASSERT(retval[ii] == 1);
+#endif
   }
 
   std::string data = capture.getData();
@@ -189,8 +197,12 @@ static void test_mixed2(int *retval, uint num_blocks, uint threads_per_block) {
   capture.End();
 
   for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
     HIPASSERT(retval[ii] ==
               strlen(msg_short) + strlen(msg_long1) + strlen(msg_long2) + 1);
+#else
+    HIPASSERT(retval[ii] == 3);
+#endif
   }
 
   std::string data = capture.getData();
@@ -230,6 +242,21 @@ __global__ void kernel_mixed3(int *retval) {
   retval[tid] = result;
 }
 
+size_t get_mixed3_size(uint num_threads) {
+  DECLARE_DATA();
+  const char *msg[] = {msg_long1, msg_long2};
+  size_t size = 0;
+
+  for(auto str: msg) {
+    size += strlen(str) + 1;
+  }
+
+  size *= num_threads;
+  size += ((num_threads + 2) / 3) * (strlen(msg_short) + 1);
+
+  return size;
+}
+
 static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) {
   CaptureStream capture(stdout);
 
@@ -245,12 +272,16 @@ static void test_mixed3(int *retval, uint num_blocks, uint threads_per_block) {
   capture.End();
 
   for (uint ii = 0; ii != num_threads; ++ii) {
+#ifdef __HIP_PLATFORM_AMD__
     if (ii % 3 == 0) {
       HIPASSERT(retval[ii] ==
                 strlen(msg_long1) + strlen(msg_short) + strlen(msg_long2) + 3);
     } else {
       HIPASSERT(retval[ii] == strlen(msg_long1) + strlen(msg_long2) + 2);
     }
+#else
+    HIPASSERT(retval[ii] == (ii % 3 ? 2 : 3));
+#endif
   }
 
   std::string data = capture.getData();
@@ -276,6 +307,18 @@ __global__ void kernel_numbers() {
   }
 }
 
+size_t get_numbers_size(uint num_threads) {
+  char buf[100] = { 0 };
+  size_t size = 0;
+  for (uint tid = 0; tid < num_threads; tid++) {
+    for (uint i = 0; i != 7; ++i) {
+      uint base = tid * 21 + i * 3;
+      size += snprintf(buf, 100, "%d %d %d\n", base, base + 1, base + 2);
+    }
+  }
+  return size;
+}
+
 static void test_numbers(uint num_blocks, uint threads_per_block) {
   CaptureStream capture(stdout);
   uint num_threads = num_blocks * threads_per_block;
@@ -309,7 +352,23 @@ int main(int argc, char **argv) {
   uint num_blocks = 150;
   uint threads_per_block = 250;
   uint num_threads = num_blocks * threads_per_block;
+#ifdef __HIP_PLATFORM_NVIDIA__
+  // By default, Cuda has different printf ring buffer size in different GPUs(or ENVs).
+  // For example, A100 has 7M, Quadro RTX 5000 has 1.5M, GeForce RTX 2070 Supper has 1.3M in tests.
+  // We have to detect, compare and set it
+  size_t size_mixed3 = get_mixed3_size(num_threads);
+  size_t size_numbers = get_numbers_size(num_threads);
+  size_t size_max = size_mixed3 >= size_numbers ? size_mixed3 : size_numbers;  // Max size
+  size_t size_expected = size_max * 10;  // Cuda printf buffer format is unknown, but test shows 10 times can work here.
+  size_t size_current = 0;
+  HIPCHECK(hipDeviceGetLimit(&size_current, hipLimitPrintfFifoSize));
+  printf("size_mixed3 = %zu, size_numbers = %zu\n", size_mixed3, size_numbers);
+  printf("max size = %zu, expected %zu, current %zu\n", size_max, size_expected, size_current);
 
+  if(size_current < size_expected) {
+    HIPCHECK(hipDeviceSetLimit(hipLimitPrintfFifoSize, size_expected));
+  }
+#endif
   void *retval_void;
   HIPCHECK(hipHostMalloc(&retval_void, 4 * num_threads));
   auto retval = reinterpret_cast<int *>(retval_void);
@@ -319,6 +378,5 @@ int main(int argc, char **argv) {
   test_mixed2(retval, num_blocks, threads_per_block);
   test_mixed3(retval, num_blocks, threads_per_block);
   test_numbers(num_blocks, threads_per_block);
-
   passed();
 }
diff --git a/tests/src/printf/hipPrintfSpecifiers.cpp b/tests/src/printf/hipPrintfSpecifiers.cpp
index 74d41d1f14..b3f62f4c86 100644
--- a/tests/src/printf/hipPrintfSpecifiers.cpp
+++ b/tests/src/printf/hipPrintfSpecifiers.cpp
@@ -21,8 +21,8 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
  * HIT_END
  */
 
@@ -32,7 +32,6 @@ THE SOFTWARE.
 __global__ void test_kernel() {
   const char *N = nullptr;
   const char *s = "hello world";
-
   printf("xyzzy\n");
   printf("%%\n");
   printf("hello %% world\n");
@@ -46,7 +45,11 @@ __global__ void test_kernel() {
   printf("%d\n", -42);
   printf("%u\n", 42);
   printf("%f\n", 123.456);
+#ifdef __HIP_PLATFORM_AMD__
   printf("%F\n", -123.456);
+#else
+  printf("%f\n", -123.456);
+#endif
   printf("%e\n", -123.456);
   printf("%E\n", 123.456);
   printf("%g\n", 123.456);
@@ -54,11 +57,37 @@ __global__ void test_kernel() {
   printf("%c\n", 'x');
   printf("%s\n", N);
   printf("%p\n", N);
+#ifdef __HIP_PLATFORM_AMD__
   printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 5, s, (void *)0xf01dab1eca55e77e);
+#else
+  // In Cuda, printf doesn't support %.*, %*.*
+  printf("%.8f %8.5s %p\n", 3.14159, s, (void *)0xf01dab1eca55e77e);
+#endif
 }
 
 int main(int argc, char **argv) {
-#if !defined(_WIN32)
+#ifdef __HIP_PLATFORM_NVIDIA__
+  std::string reference(R"here(xyzzy
+%
+hello % world
+%s
+%s0xf01dab1eca55e77e
+%cxyzzy
+sep
+-42
+42
+123.456000
+-123.456000
+-1.234560e+02
+1.234560E+02
+123.456
+-123.456
+x
+(null)
+(nil)
+3.14159000    hello 0xf01dab1eca55e77e
+)here");
+#elif !defined(_WIN32)
   std::string reference(R"here(xyzzy
 %
 hello % world
diff --git a/tests/src/printf/hipPrintfStar.cpp b/tests/src/printf/hipPrintfStar.cpp
index c915d43edb..990c6af173 100644
--- a/tests/src/printf/hipPrintfStar.cpp
+++ b/tests/src/printf/hipPrintfStar.cpp
@@ -21,8 +21,8 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s
+ * TEST: %t
  * HIT_END
  */
 
@@ -31,9 +31,16 @@ THE SOFTWARE.
 
 __global__ void test_kernel() {
   printf("%*d\n", 16, 42);
+#ifdef __HIP_PLATFORM_AMD__
   printf("%.*d\n", 8, 42);
   printf("%*.*d\n", -16, 8, 42);
   printf("%*.*f %s * %.*s\n", 16, 8, 123.456, "hello", 5, "worldxyz");
+#else
+  // In Cuda, printf doesn't support %.*, %*.*
+  printf("%.8d\n", 42);
+  printf("%-16.8d\n", 42);
+  printf("%16.8f %s * %.5s\n", 123.456, "hello", "worldxyz");
+#endif
 }
 
 int main(int argc, char **argv) {