SWDEV-361383 - Fixing compilation issue for stress test files under catch2 framework (#2998)

Change-Id: I4253a6cdb2c10bcb0021f869b7b3c4f1acd28787 [ROCm/hip-tests commit: 339a51dcaf]
2022-10-19 16:52:24 +05:30
parent 4ea956e684
commit ee95d39a40
@@ -349,7 +349,7 @@ static bool TestMemoryAllocationInLoop(int test_type,
    }
    if (!bPassed) break;
  }
-  hipFree(outputVec_d);
+  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
 }
@@ -158,7 +158,7 @@ TEST_CASE("Stress_hipMallocManaged_MultiSize") {
    hipStream_t strm;
    HIP_CHECK(hipStreamCreate(&strm));
    dim3 dimBlock(blockSize, 1, 1);
-    for (int i = 1; i < (1024*1024); ++i) {
+    for (int i = 1; i < (1024*100); ++i) {
      HIP_CHECK(hipMallocManaged(&Hmm1, i));
      HIP_CHECK(hipMallocManaged(&Hmm2, i));
      for (int j = 0; j < i; ++j) {
@@ -57,9 +57,8 @@ static int HmmAttrPrint() {
  return managed;
 }

-static void ReleaseResource(int *Hmm, int *Hmm1, hipStream_t *strm) {
+static void ReleaseResource(int *Hmm, hipStream_t *strm) {
  HIP_CHECK(hipFree(Hmm));
-  HIP_CHECK(hipFree(Hmm1));
  HIP_CHECK(hipStreamDestroy(*strm));
 }

@@ -70,11 +69,10 @@ static void ReleaseResource(int *Hmm, int *Hmm1, hipStream_t *strm) {
 TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") {
  int MangdMem = HmmAttrPrint();
  if (MangdMem == 1) {
-    int *Hmm = nullptr, *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4);
+    int *Hmm1 = nullptr, NumDevs, MemSz = (4096 * 4);
    int InitVal = 123, NumElms = MemSz/4;
    bool IfTestPassed = true;
    HIP_CHECK(hipGetDeviceCount(&NumDevs));
-    HIP_CHECK(hipMallocManaged(&Hmm, MemSz));
    HIP_CHECK(hipMallocManaged(&Hmm1, MemSz));
    for (int i = 0; i < NumElms; ++i) {
      Hmm1[i] = InitVal;
@@ -93,44 +91,40 @@ TEST_CASE("Unit_hipMemPrefetchAsyncOneToAll") {
        // Prefetching memory from i to j
        HIP_CHECK(hipMemPrefetchAsync(Hmm1, MemSz, j, strm));
        HIP_CHECK(hipStreamSynchronize(strm));
-        MemPrftchAsyncKernel<<<(NumElms/32), 32, 0, strm>>>(Hmm, Hmm1, NumElms);
+        MemPrftchAsyncKernel1<<<(NumElms/32), 32, 0, strm>>>(Hmm1, NumElms);
        HIP_CHECK(hipStreamSynchronize(strm));
        // Verifying the result
        for (int m = 0; m < NumElms; ++m) {
-          if (Hmm[m] != (InitVal * InitVal)) {
+          if (Hmm1[m] != (InitVal * InitVal)) {
            IfTestPassed = false;
          }
        }
        if (!IfTestPassed) {
-          ReleaseResource(Hmm, Hmm1, &strm);
+          ReleaseResource(Hmm1, &strm);
          INFO("Did not find expected value!");
          REQUIRE(false);
        }
-        // Resetting the values in Hmm
-        HIP_CHECK(hipMemset(Hmm, 0, MemSz));
        // Prefetching memory from j to i
        HIP_CHECK(hipMemPrefetchAsync(Hmm1, MemSz, i, strm));
        HIP_CHECK(hipStreamSynchronize(strm));
-        MemPrftchAsyncKernel<<<(NumElms/32), 32, 0, strm>>>(Hmm, Hmm1, NumElms);
+        MemPrftchAsyncKernel1<<<(NumElms/32), 32, 0, strm>>>(Hmm1, NumElms);
        HIP_CHECK(hipStreamSynchronize(strm));
        // Verifying the result
        for (int m = 0; m < NumElms; ++m) {
-          if (Hmm[m] != (InitVal * InitVal)) {
+          if (Hmm1[m] != (InitVal * InitVal)) {
            IfTestPassed = false;
          }
        }
        if (!IfTestPassed) {
-          ReleaseResource(Hmm, Hmm1, &strm);
+          ReleaseResource(Hmm1, &strm);
          INFO("Did not find expected value!");
          REQUIRE(false);
        }
-        // Resetting the values in Hmm
-        HIP_CHECK(hipMemset(Hmm, 0, MemSz));
+
        HIP_CHECK(hipStreamDestroy(strm));
      }
    }
    // Releasing the resources in case all the scenarios passed
-    HIP_CHECK(hipFree(Hmm));
    HIP_CHECK(hipFree(Hmm1));
  } else {
    SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
@@ -72,7 +72,7 @@ void Memcpy_And_verify(int NUM_ELM) {
          for (int i = 0; i < Available_Gpus; ++i) {
            for (int j = i+1; j < Available_Gpus; ++j) {
              canAccessPeer = 0;
-              hipDeviceCanAccessPeer(&canAccessPeer, i, j);
+              HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
              if (canAccessPeer) {
                HIP_CHECK(hipMemcpy(A_d[j], A_d[i], NUM_ELM * sizeof(TestType),
                      hipMemcpyDefault));
@@ -122,7 +122,7 @@ void Memcpy_And_verify(int NUM_ELM) {
            int canAccessPeer = 0;
            for (int i = 0; i < Available_Gpus; ++i) {
              for (int j = i+1; j < Available_Gpus; ++j) {
-                hipDeviceCanAccessPeer(&canAccessPeer, i, j);
+                HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
                if (canAccessPeer) {
                  HIP_CHECK(hipMemcpyHtoD(hipDeviceptr_t(A_d[i]),
                        A_h, NUM_ELM * sizeof(TestType)));
@@ -165,7 +165,7 @@ void Memcpy_And_verify(int NUM_ELM) {
          for (int i = 0; i < Available_Gpus; ++i) {
            for (int j = i+1; j < Available_Gpus; ++j) {
              canAccessPeer = 0;
-              hipDeviceCanAccessPeer(&canAccessPeer, i, j);
+              HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
              if (canAccessPeer) {
                HIP_CHECK(hipMemcpyAsync(A_d[j], A_d[i],
                      NUM_ELM * sizeof(TestType),
@@ -219,7 +219,7 @@ void Memcpy_And_verify(int NUM_ELM) {
            for (int i = 0; i < Available_Gpus; ++i) {
              for (int j = i+1; j < Available_Gpus; ++j) {
                canAccessPeer = 0;
-                hipDeviceCanAccessPeer(&canAccessPeer, i, j);
+                HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, i, j));
                if (canAccessPeer) {
                  HIP_CHECK(hipSetDevice(j));
                  HIP_CHECK(hipMemcpyDtoDAsync(hipDeviceptr_t(A_d[j]),
@@ -5,25 +5,25 @@ TEST_CASE("Stress_hipMalloc", "DifferentSizes") {
  SECTION("Size 10") {
    auto res = hipMalloc(&d_a, sizeof(10));
    REQUIRE(res == hipSuccess);
-    hipFree(d_a);
+    HIP_CHECK(hipFree(d_a));
    d_a = nullptr;
  }
  SECTION("Size 100") {
    auto res = hipMalloc(&d_a, sizeof(100));
    REQUIRE(res == hipSuccess);
-    hipFree(d_a);
+    HIP_CHECK(hipFree(d_a));
    d_a = nullptr;
  }
  SECTION("Size 1000") {
    auto res = hipMalloc(&d_a, sizeof(1000));
    REQUIRE(res == hipSuccess);
-    hipFree(d_a);
+    HIP_CHECK(hipFree(d_a));
    d_a = nullptr;
  }
  SECTION("Size 10000") {
    auto res = hipMalloc(&d_a, sizeof(10000));
    REQUIRE(res == hipSuccess);
-    hipFree(d_a);
+    HIP_CHECK(hipFree(d_a));
    d_a = nullptr;
  }
  SECTION("Size MAX") {
@@ -31,4 +31,4 @@ TEST_CASE("Stress_hipMalloc", "DifferentSizes") {
    REQUIRE(res == hipErrorOutOfMemory);
    d_a = nullptr;
  }
-}
+}
@@ -494,7 +494,7 @@ TEST_CASE("Stress_printf_ComplexKernelMultStreamMultGpu") {
  unsigned int print_limit = 4;  // = 4 GB
  uint32_t iterCount = 1;
  int numOfGPUs = 0;
-  hipGetDeviceCount(&numOfGPUs);
+  HIP_CHECK(hipGetDeviceCount(&numOfGPUs));
  if (numOfGPUs < 2) {
    printf("Skipping test because numOfGPUs < 2\n");
    return;
@@ -26,11 +26,13 @@ THE SOFTWARE.
 #include <random>
 #include <thread>

-__global__ void addVal(unsigned long long* ptr, size_t index, unsigned long long val) {
+__global__ void addVal(unsigned long long* ptr, size_t index,
+                       unsigned long long val) {
  atomicAdd(ptr + index, val);
 }

-// Create a copy constructible AtomicWrap around std::atomic so that we can put it in a vector
+// Create a copy constructible AtomicWrap around std::atomic so that
+// we can put it in a vector
 template <typename T> struct AtomicWrap {
  std::atomic<T> data;

@@ -68,18 +70,19 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") {
  constexpr size_t maxWork = 10000;
  constexpr size_t maxVal = 10;

-  std::uniform_int_distribution<std::mt19937::result_type> genIndex(0, hwThreads - 1);
+  std::uniform_int_distribution<std::mt19937::result_type> genIndex(0,
+                                                           hwThreads - 1);
  std::uniform_int_distribution<std::mt19937::result_type> genWork(0, maxWork);
  std::uniform_int_distribution<std::mt19937::result_type> genVal(0, maxVal);

  auto enqueueKernelThread = [&](hipStream_t stream) {
    auto iter = genWork(engine);  // Generate work to be done via thread
-    for (auto i = 0; i < iter; i++) {
+    for (unsigned long i = 0; i < iter; i++) {
      auto index = genIndex(engine);  // Generate Index to add to
-      auto val = genVal(engine);      // Generate value to add to the destination
+      auto val = genVal(engine);  // Generate value to add to the destination
      hostData[index].data += val;    // Replicate it on host
      addVal<<<1, 1, 0, stream>>>(dPtr, static_cast<size_t>(index),
-                                  static_cast<unsigned long long>(val));  // And on device
+            static_cast<unsigned long long>(val));  // And on device
    }
  };

@@ -101,8 +104,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") {
  HIP_CHECK(hipStreamDestroy(stream));

  auto hPtr = std::make_unique<unsigned long long[]>(hwThreads);
-  HIP_CHECK(
-      hipMemcpy(hPtr.get(), dPtr, sizeof(unsigned long long) * hwThreads, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(hPtr.get(), dPtr, sizeof(unsigned long long) * hwThreads,
+            hipMemcpyDeviceToHost));

  HIP_CHECK(hipFree(dPtr));

@@ -113,7 +116,7 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads") {
  }
 }

-__global__ void doOperation(int* dPtr, size_t size, int val) {
+__global__ void doOperation(int* dPtr, int val) {
  auto i = threadIdx.x;
  atomicAdd(dPtr + i, val);
 }
@@ -135,14 +138,15 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {

  std::vector<hipStream_t> streamPool{};
  streamPool.reserve(deviceCount * streamPerGPU);
-
-  std::map<hipStream_t, int*> streamToDeviceMemory;           // Map of stream and device memory
-  std::map<hipStream_t, AtomicWrap<int>> streamToHostMemory;  // Map of stream and host result
-  std::map<hipStream_t, size_t> streamToDeviceIndex;  // Map of stream and device it was created on
-
+  // Map of stream and device memory
+  std::map<hipStream_t, int*> streamToDeviceMemory;
+  // Map of stream and host result
+  std::map<hipStream_t, AtomicWrap<int>> streamToHostMemory;
+  // Map of stream and device it was created on
+  std::map<hipStream_t, size_t> streamToDeviceIndex;
  constexpr size_t size = 1024;

-  for (size_t i = 0; i < deviceCount; i++) {
+  for (int i = 0; i < deviceCount; i++) {
    HIP_CHECK(hipSetDevice(i));

    for (size_t j = 0; j < streamPerGPU; j++) {
@@ -155,8 +159,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
      HIP_CHECK(hipMalloc(&dPtr, sizeof(int) * size));
      REQUIRE(dPtr != nullptr);
      HIP_CHECK(hipMemset(dPtr, 0, sizeof(int) * size));
-
-      streamToDeviceMemory[stream] = dPtr;  // All streams work on exclusive memory
+      // All streams work on exclusive memory
+      streamToDeviceMemory[stream] = dPtr;

      streamToHostMemory[stream] = AtomicWrap<int>(0);  // CPU result

@@ -171,8 +175,10 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
  std::random_device device;
  std::mt19937 engine(device());

-  std::uniform_int_distribution<std::mt19937::result_type> genVal(-maxVal, maxVal);
-  std::uniform_int_distribution<std::mt19937::result_type> genStream(0, streamPool.size() - 1);
+  std::uniform_int_distribution<std::mt19937::result_type> genVal(-maxVal,
+                                                           maxVal);
+  std::uniform_int_distribution<std::mt19937::result_type> genStream(0,
+                                              streamPool.size() - 1);

 #if HT_NVIDIA
  std::mutex ness;  // On nvidia, current device needs to match stream's device
@@ -183,7 +189,8 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
 #if HT_NVIDIA
      std::unique_lock<std::mutex> lock(ness);  // Lock on creation
 #endif
-      hipStream_t stream = streamPool[genStream(engine)];  // Get a random stream
+      // Get a random stream
+      hipStream_t stream = streamPool[genStream(engine)];

      // TODO use HIP_CHECK_THREAD when PR#2664 is merged
      if (hipSuccess != hipSetDevice(streamToDeviceIndex[stream])) {
@@ -191,11 +198,10 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
      }

      int val = genVal(engine);  // Generate Value to add/sub to
-
-      streamToHostMemory[stream].data.fetch_add(val);  // Replicate result on CPU
+      // Replicate result on CPU
+      streamToHostMemory[stream].data.fetch_add(val);
      auto dPtr = streamToDeviceMemory[stream];
-      doOperation<<<1, 1024, 0, stream>>>(dPtr, size,
-                                          val);  // On GPU
+      doOperation<<<1, 1024, 0, stream>>>(dPtr, val);  // On GPU
    }
  };

@@ -219,13 +225,14 @@ TEST_CASE("Stress_StreamEnqueue_DifferentThreads_MultiGPU") {
  for (auto& i : streamPool) {
    HIP_CHECK(hipStreamSynchronize(i));
    auto dResult = std::make_unique<int[]>(size);
-    HIP_CHECK(hipMemcpy(dResult.get(), streamToDeviceMemory[i], sizeof(int) * size,
-                        hipMemcpyDeviceToHost));
+    HIP_CHECK(hipMemcpy(dResult.get(), streamToDeviceMemory[i],
+              sizeof(int) * size, hipMemcpyDeviceToHost));
    HIP_CHECK(hipFree(streamToDeviceMemory[i]));
    HIP_CHECK(hipStreamDestroy(i));
    auto res = streamToHostMemory[i].data.load();
    INFO("Matching CPU: " << res << " GPU: " << dResult[0] << " Dev Ptr: "
-                          << streamToDeviceMemory[i] << " on Device: " << streamToDeviceIndex[i]);
-    REQUIRE(std::all_of(dResult.get(), dResult.get() + size, [=](int r) { return r == res; }));
+    << streamToDeviceMemory[i] << " on Device: " << streamToDeviceIndex[i]);
+    REQUIRE(std::all_of(dResult.get(), dResult.get() + size,
+            [=](int r) { return r == res; }));
  }
 }