SWDEV-347670 - StreamWait and StreamWrite on Windows

__amd_streamOpsWrite blitkernel in device-libs has only 3 args. so getting rid of the 4th unused arg (sizeBytes) Change-Id: I81cc1107f8b424bf58558c93a2495a1b878aef91 [ROCm/clr commit: e643406caa]
2024-01-15 17:12:45 -08:00
commit e8a077dc68
@@ -39,7 +39,7 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(

  extern void __amd_copyBufferRectAligned(__global uint*, __global uint*, ulong4, ulong4, ulong4);

-  extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong);
+  extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong);

  extern void __amd_streamOpsWait(__global uint*, __global ulong*, ulong, ulong, ulong);

@@ -165,8 +165,8 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(

 const char* HipExtraSourceCode = BLIT_KERNELS(
  __kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong,
-                                            ulong value, ulong sizeBytes) {
-    __amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes);
+                                            ulong value) {
+    __amd_streamOpsWrite(ptrInt, ptrUlong, value);
  }

  __kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong,
@@ -186,8 +186,8 @@ const char* HipExtraSourceCode = BLIT_KERNELS(

 const char* HipExtraSourceCodeNoGWS = BLIT_KERNELS(
  __kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong,
-                                            ulong value, ulong sizeBytes) {
-    __amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes);
+                                            ulong value) {
+    __amd_streamOpsWrite(ptrInt, ptrUlong, value);
  }

  __kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong,
@@ -2520,13 +2520,11 @@ bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value,
  if (is32BitWrite) {
    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
    setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
-    setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
  } else {
    setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
-    setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
  }
-  setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes);
+  setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
  // Create ND range object for the kernel's execution
  amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
  // Execute the blit
@@ -2555,16 +2553,13 @@ bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, si
  if (is32BitWait) {
    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
    setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
-    setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
-    setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags);
-    setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask);
  } else {
    setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
-    setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
-    setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags);
-    setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask);
  }
+  setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
+  setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags);
+  setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask);

  // Create ND range object for the kernel's execution
  amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
@@ -660,6 +660,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,

    info_.cooperativeGroups_ = settings().enableCoopGroups_;
    info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_;
+    // Enable StreamWrite and StreamWait for all devices
+    info_.aqlBarrierValue_ = true;

    if (amd::IS_HIP) {
      info_.largeBar_ = false;
@@ -1061,14 +1061,17 @@ VirtualGPU::~VirtualGPU() {
  amd::ScopedLock k(dev().lockAsyncOps());
  amd::ScopedLock lock(dev().vgpusAccess());

-  // Clear all timestamps, associated with this virtual GPU
-  auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
-  for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
-    if (mgmt.aql_vgpus_[i] == this) {
-      mgmt.aql_vgpus_[i] = nullptr;
-      mgmt.aql_events_[i].invalidate();
+  if (queues_[MainEngine] != nullptr) {
+    // Clear all timestamps, associated with this virtual GPU
+    auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
+    for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
+      if (mgmt.aql_vgpus_[i] == this) {
+        mgmt.aql_vgpus_[i] = nullptr;
+        mgmt.aql_events_[i].invalidate();
+      }
    }
  }
+
  // Destroy RGP trace
  if (rgpCaptureEna()) {
    dev().rgpCaptureMgr()->FinishRGPTrace(this, true);
@@ -2567,7 +2567,6 @@ bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value,
    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
    setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
  }
-  setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes);
  // Create ND range object for the kernel's execution
  amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
  // Execute the blit