SWDEV-549309 - Ensure that the kernel is issued to a stream that is ass… (#552)

* SWDEV-549309 - Ensure that the kernel is issued to a stream that is associated to the current device * SWDEV-549309 - Ensure that the kernel is issued to a stream that is associated to the current device in atomic tests * SWDEV-549309 - Added fix for Unit_hipMallocFromPoolAsync_Multidevice_Concurrent test * SWDEV-549309 - Added fix for Unit_hipMemcpyPeerAsync_Positive_Default * SWDEV-549309 - Added fix for Unit_hipStreamAttachMemAsync_Positive_AttachGlobal and Unit_hipGetProcAddress_GraphAPIs_AddMemsetMemcpyNodes [ROCm/hip-tests commit: d21a95cea8]
2025-08-14 07:20:56 +01:00
@@ -446,6 +446,7 @@ void TestCore(const TestParams& p) {

 // Launch Kernel
  for (auto i = 0u; i < p.num_devices; ++i) {
+    HIP_CHECK(hipSetDevice(i));
    for (auto j = 0u; j < p.kernel_count; ++j) {
      const auto& stream = streams[i * p.kernel_count + j].stream();
      const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
@@ -272,6 +272,7 @@ void TestCore(const TestParams& p) {
  }
  // Launch Kernel and get back old vals
  for (auto i = 0u; i < p.num_devices; ++i) {
+    HIP_CHECK(hipSetDevice(i));
    for (auto j = 0u; j < p.kernel_count; ++j) {
      const auto& stream = streams[i * p.kernel_count + j].stream();
      const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
@@ -302,6 +302,7 @@ void TestCore(const TestParams& p) {

  // Launch kernel
  for (auto i = 0u; i < p.num_devices; ++i) {
+    HIP_CHECK(hipSetDevice(i));
    for (auto j = 0u; j < p.kernel_count; ++j) {
      const auto& stream = streams[i * p.kernel_count + j].stream();
      const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
@@ -375,9 +375,7 @@ TEST_CASE("Unit_hipGetProcAddress_GraphAPIs_AddMemsetMemcpyNodes") {
  hipGraphExec_t graphExec;
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, 0));
-  #ifdef _WIN32
  HIP_CHECK(hipStreamSynchronize(0));
-  #endif

  REQUIRE(validateArrayT<char>(hostMemDst, N, value) == true);

@@ -319,8 +319,8 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice") {
 *    - HIP_VERSION >= 6.2
 */
 #if HT_AMD
-static void threadQAsyncCommands(streamMemAllocTest* testObj,
-                                hipStream_t strm) {
+static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) {
+  HIP_CHECK(hipSetDevice(idx));
  // Create host buffer with test data.
  testObj->createHostBufferWithData();
  // Allocate device memory and transfer data to it asyncronously on stream.
@@ -350,7 +350,7 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice_Concurrent") {
  // Queue commands in each device
  for (int idx = 0; idx < num_devices; idx++) {
    HIP_CHECK(hipSetDevice(idx));
-    std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx]);
+    std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx);
    test.join();
  }
  // Wait for the streams
@@ -405,10 +405,10 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice_MultiStream") {
  // Queue commands in each device
  for (int idx = 0; idx < num_devices; idx++) {
    HIP_CHECK(hipSetDevice(idx));
-    std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx],
-                    stream_buf[streamPerAsic*idx]);
-    std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx + 1],
-                    stream_buf[streamPerAsic*idx + 1]);
+    std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx],
+                      stream_buf[streamPerAsic * idx], idx);
+    std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1],
+                      stream_buf[streamPerAsic * idx + 1], idx);
    test1.join();
    test2.join();
  }
@@ -371,8 +371,8 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_ReleaseThreshold_Mgpu") {
 /**
 * Local Thread Functions
 */
-static void threadQAsyncCommands(streamMemAllocTest* testObj,
-                                hipStream_t strm) {
+static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) {
+  HIP_CHECK(hipSetDevice(idx));
  // Create host buffer with test data.
  testObj->createHostBufferWithData();
  // Allocate device memory and transfer data to it asyncronously on stream.
@@ -616,7 +616,7 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_Multidevice_Concurrent") {
  // Queue commands in each device
  for (int idx = 0; idx < num_devices; idx++) {
    HIP_CHECK(hipSetDevice(idx));
-    std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx]);
+    std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx);
    test.join();
  }
  // Wait for the streams
@@ -675,10 +675,10 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_Multidevice_MultiStream") {
  // Queue commands in each device
  for (int idx = 0; idx < num_devices; idx++) {
    HIP_CHECK(hipSetDevice(idx));
-    std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx],
-                    stream_buf[streamPerAsic*idx]);
-    std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx + 1],
-                    stream_buf[streamPerAsic*idx + 1]);
+    std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx],
+                      stream_buf[streamPerAsic * idx], idx);
+    std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1],
+                      stream_buf[streamPerAsic * idx + 1], idx);
    test1.join();
    test2.join();
  }
@@ -836,7 +836,6 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
    int *Hmm = NULL, NumElms = (1024 * 1024), InitVal = 123, blockSize = 64;
    int *Hmm1 = NULL, DataMismatch = 0;
    hipStream_t strm;
-    HIP_CHECK(hipStreamCreate(&strm));
    HIP_CHECK(hipMallocManaged(&Hmm, (NumElms * sizeof(int))));
    // Initializing memory
    for (int i = 0; i < NumElms; ++i) {
@@ -852,6 +851,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
      for (int i = 1; i < Ngpus; ++i) {
        DataMismatch = 0;
        HIP_CHECK(hipSetDevice(i));
+        HIP_CHECK(hipStreamCreate(&strm));
        HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
        MemAdvise3<<<dimGrid, dimBlock, 0, strm>>>(Hmm, Hmm1, NumElms);
        HIP_CHECK(hipStreamSynchronize(strm));
@@ -865,6 +865,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
          WARN("DataMismatch is observed with the gpu: " << i);
          REQUIRE(false);
        }
+        HIP_CHECK(hipStreamDestroy(strm));
        HIP_CHECK(hipFree(Hmm1));
      }
    }
@@ -873,10 +874,12 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
      for (int i = 0; i < Ngpus; ++i) {
        DataMismatch = 0;
        HIP_CHECK(hipSetDevice(i));
+        HIP_CHECK(hipStreamCreate(&strm));
        HIP_CHECK(hipMemAdvise(Hmm, (NumElms * sizeof(int)),
                               hipMemAdviseSetReadMostly, i));
        MemAdvise2<<<dimGrid, dimBlock, 0, strm>>>(Hmm, NumElms);
        HIP_CHECK(hipStreamSynchronize(strm));
+        HIP_CHECK(hipStreamDestroy(strm));
      }
      // verifying the final result
      for (int i = 0; i < NumElms; ++i) {
@@ -892,7 +895,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
    }
 #endif
    HIP_CHECK(hipFree(Hmm));
-    HIP_CHECK(hipStreamDestroy(strm));
+
  } else {
    SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
           "attribute. Hence skipping the testing with Pass result.\n");
@@ -51,9 +51,6 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Positive_Default") {
    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
    return;
  }
-  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
-  const StreamGuard stream_guard(stream_type);
-  const hipStream_t stream = stream_guard.stream();

  const auto allocation_size = GENERATE(kPageSize / 2, kPageSize, kPageSize * 2);

@@ -64,6 +61,11 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Positive_Default") {
  INFO("Src device: " << src_device << ", Dst device: " << dst_device);

  HIP_CHECK(hipSetDevice(src_device));
+
+  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
+  const StreamGuard stream_guard(stream_type);
+  const hipStream_t stream = stream_guard.stream();
+
  HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, dst_device));
  if (can_access_peer) {
    HIP_CHECK(hipDeviceEnablePeerAccess(dst_device, 0));
@@ -510,6 +510,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(bool& val_res)
  }

  for (int i = 0; i < numDevices; ++i) {
+    HIP_CHECK_THREAD(hipSetDevice(i));
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
                       static_cast<const int*>(A_d[i]), static_cast<const int*>(B_d[i]), C_d[i], N);
    HIP_CHECK_THREAD(hipGetLastError());
@@ -475,6 +475,7 @@ void TestkindDefaultForDtoD(void) {
    }

    for (int i=0; i < NumDevices; ++i) {
+      HIP_CHECK(hipSetDevice(i));
      hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
                         dim3(threadsPerBlock),
                         0, stream[i], static_cast<const int*>(A_d[i]),
@@ -109,6 +109,7 @@ TEST_CASE("test_svm_byte_granularity") {
  // get all the devices going simultaneously
  for(unsigned int d = 0; d < num_devices; d++)  // device ids starting at 1.
  {
+    HIP_CHECK(hipSetDevice(d));
    write_owned_locations<<<num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host, d);
    HIP_CHECK(hipGetLastError());
  }
@@ -125,6 +126,7 @@ TEST_CASE("test_svm_byte_granularity") {
  size_t adjusted_num_elements = num_elements - num_devices;
  for(unsigned int d = 0; d < num_devices; d++)
  {
+    HIP_CHECK(hipSetDevice(d));
    sum_neighbor_locations<<<adjusted_num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host,
                                                                     error_counts[d]);
    HIP_CHECK(hipGetLastError());
@@ -129,6 +129,7 @@ void launch_kernels_and_verify(std::vector<hipStream_t> &streams, unsigned int n
  // all the pixels.
  for(unsigned int d=0; d < num_devices; d++)
  {
+    HIP_CHECK(hipSetDevice(d));
    build_hash_table_on_device<<<(num_pixels + 255) / 256, 256, 0, streams[d]>>>(
        pInputImage, num_pixels, pNodes, pNumNodes, numBins, d);
    HIP_CHECK(hipGetLastError());
@@ -208,6 +208,7 @@ TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") {
      }
      else
      {
+        HIP_CHECK(hipSetDevice(ci));
        create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists,
                                             ListLength);
      }
@@ -218,6 +219,7 @@ TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") {
      }
      else
      {
+        HIP_CHECK(hipSetDevice(vi));
        verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists,
                                             ListLength);
      }
@@ -87,6 +87,9 @@ TEST_CASE("Unit_hipStreamAttachMemAsync_Positive_AttachGlobal") {
  HIP_CHECK(hipStreamSynchronize(nullptr));

  for (int i = 0; i < stream_count; ++i) {
+    if (device_count > 1) {
+      HIP_CHECK(hipSetDevice(i));
+    }
    HipTest::launchKernel(Set, 1, 1, 0, streams.at(i)->stream(), managed_global.ptr() + i, i);
  }

@@ -407,6 +407,7 @@ class streamMemAllocTest {
                        dim3(THREADS_PER_BLOCK), 0, stream,
                        static_cast<const int*>(A_d),
                        static_cast<const int*>(B_d), C_d, size);
+    HIP_CHECK(hipGetLastError());
  }
  // Transfer data from device to host asynchronously.
  void transferFromMempool(hipStream_t stream) {