SWDEV-443760 - Enable device kern args

- Implement workaround to ensure HDP writes are done by writing and reading the HDP MMIO register. - Implement the same workaround for graphs, we no longer need sentinel write/readback Change-Id: I0d3027b46a1f61131ec62e3c8c669ff5184fa6b2 [ROCm/clr commit: f138e0d113]
2024-02-10 00:51:14 +00:00
parent 6383be514b
commit ec59b1bc3e
5 changed files with 21 additions and 40 deletions
@@ -359,8 +359,6 @@ hipError_t GraphExec::CaptureAQLPackets() {
    auto device = g_devices[ihipGetDevice()]->devices()[0];
    if (kernArgSizeForGraph != 0) {
      if (device->info().largeBar_) {
-        // Pad kernel argument buffer with sentinal size bytes to do a readback later
-        kernArgSizeForGraph += sizeof(int);
        kernarg_pool_graph_ =
            reinterpret_cast<address>(device->deviceLocalAlloc(kernArgSizeForGraph));
        device_kernarg_pool_ = true;
@@ -391,27 +389,13 @@ hipError_t GraphExec::CaptureAQLPackets() {
      }
    }

-    if (device_kernarg_pool_) {
-      // Write HDP_MEM_COHERENCY_FLUSH_CNTL reg to initiate flush read to HDP mem. Verify mem
-      // by readback of sentinal value at the tail end of the kernarg surface (allocated above)
-      // This needs to be done for PCIE connected devices only. HDP path is disabled for XGMI
-      // between CPU<->GPU
-      if (!device->isXgmi()) {
-        int host_val = 1;
-        address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_ - sizeof(int);
-        *(reinterpret_cast<int*>(dev_ptr)) = host_val;
-        if (device->info().hdpMemFlushCntl == nullptr) {
-          amd::Command* command = new amd::Marker(*capture_stream_, true);
-          if (command != nullptr) {
-            command->enqueue();
-            command->release();
-          }
-        } else {
-          *device->info().hdpMemFlushCntl = 1;
-        }
-        if (*(reinterpret_cast<volatile int*>(dev_ptr)) != host_val);
+    if (device_kernarg_pool_ && !device->isXgmi()) {
+      *device->info().hdpMemFlushCntl = 1u;
+      if (*device->info().hdpMemFlushCntl != UINT32_MAX) {
+        LogError("Unexpected HDP Register readback value!");
      }
    }
+
    ResetQueueIndex();
  }
  return status;
@@ -95,7 +95,7 @@ Settings::Settings() {
  fgs_kernel_arg_ = false;
  barrier_value_packet_ = false;

-  host_hdp_flush_ = true;
+  device_kernel_args_ = false;
  gwsInitSupported_ = true;
  limit_blit_wg_ = 16;
 }
@@ -163,8 +163,10 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
      (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)))) {
    // Enable Barrier Value packet is only for MI2XX/300
    barrier_value_packet_ = true;
-    // On MI200 and MI300, the HDP will not cache RO=0 writes, so no flush is needed
-    host_hdp_flush_ = false;
+  }
+
+  if (gfxipMajor >= 9) {
+    device_kernel_args_ = HIP_FORCE_DEV_KERNARG;
  }

  if (gfxipMajor >= 10) {
@@ -52,7 +52,7 @@ class Settings : public device::Settings {
      uint system_scope_signal_ : 1;    //!< HSA signal is visibile to the entire system
      uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
      uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
-      uint host_hdp_flush_ : 1;         //!< Host HDP flush needed
+      uint device_kernel_args_ : 1;     //!< Allocate kernel args in device memory
      uint reserved_ : 20;
    };
    uint value_;
@@ -1376,8 +1376,9 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) {
  kernarg_pool_size_ = kernarg_pool_size;
  kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
  active_chunk_ = 0;
-  if (HIP_FORCE_DEV_KERNARG && roc_device_.info().largeBar_) {
-    kernarg_pool_base_ = reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
+  if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) {
+    kernarg_pool_base_ =
+      reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
  } else {
    kernarg_pool_base_ = reinterpret_cast<address>(roc_device_.hostAlloc(kernarg_pool_size_, 0,
                                                   Device::MemorySegment::kKernArg));
@@ -3208,8 +3209,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
      }
    }

-    constexpr uint64_t kSentinel = 0xdeadbeefdeadbeefull;
-    const auto pcieKernargs = !dev().isXgmi() && HIP_FORCE_DEV_KERNARG;
+    const auto pcieKernargs = !dev().isXgmi() && dev().settings().device_kernel_args_;

    address argBuffer = hidden_arguments;
    // Find all parameters for the current kernel
@@ -3218,8 +3218,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
      if(vcmd != nullptr && vcmd->getCapturingState()) {
        argBuffer = vcmd->getKernArgOffset();
      } else {
-        const auto kernargSize = gpuKernel.KernargSegmentByteSize() +
-                                  sizeof(kSentinel) * pcieKernargs;
+        const auto kernargSize = gpuKernel.KernargSegmentByteSize();
        argBuffer = reinterpret_cast<address>(allocKernArg(kernargSize,
                                              gpuKernel.KernargSegmentAlignment()));
      }
@@ -3228,11 +3227,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
                        std::min(gpuKernel.KernargSegmentByteSize(),
                                 signature.paramsSize()));
      if (pcieKernargs) {
-        nontemporalMemcpy(argBuffer + gpuKernel.KernargSegmentByteSize(),
-                          &kSentinel, sizeof(kSentinel));
-        if (dev().settings().host_hdp_flush_) {
-          *dev().info().hdpMemFlushCntl = 1u;
-        }
+        *dev().info().hdpMemFlushCntl = 1u;
      }
    }

@@ -3295,9 +3290,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
      aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    }
    if (pcieKernargs) {
-      __builtin_ia32_mfence();
-      while (*reinterpret_cast<volatile decltype(kSentinel)*>(
-        argBuffer + gpuKernel.KernargSegmentByteSize()) != kSentinel);
+      if (*dev().info().hdpMemFlushCntl != UINT32_MAX) {
+        LogError("Unexpected HDP Register readback value!");
+      }
    }
    if (vcmd == nullptr) {
      // Dispatch the packet
@@ -233,7 +233,7 @@ release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false,                            \
        "Set this to true to force runtime unbundler in hiprtc.")             \
 release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi,                                  \
        "Set initial heap size for device malloc.")                           \
-release(bool, HIP_FORCE_DEV_KERNARG, 0,                                       \
+release(bool, HIP_FORCE_DEV_KERNARG, 1,                                       \
         "Force device mem for kernel args.")                                 \
 release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true,                           \
         "Enable/Disable graph packet capturing")                             \