SWDEV-467102 - Hidden heap init for graph capture

If the graph has kernels that do device-side allocation, the heap is allocated during packet capture, because the heap pointer has to be added to the AQL packet, and it is initialized during graph launch. Handle the race with wait when two kernels that use the device heap are enqueued on multiple streams. Change-Id: I45933b77fcaf7bc8fdf1bc906462e32b5d8d3688 [ROCm/clr commit: 57156c524d]
2024-06-11 11:59:05 +00:00
@@ -358,8 +358,7 @@ void GetKernelArgSizeForGraph(std::vector<std::vector<Node>>& parallelLists,
  // arg size required for all graph kernel nodes to allocate
  for (const auto& list : parallelLists) {
    for (auto& node : list) {
-      if (node->GetType() == hipGraphNodeTypeKernel &&
-          !reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
+      if (node->GetType() == hipGraphNodeTypeKernel) {
        kernArgSizeForGraph += reinterpret_cast<hip::GraphKernelNode*>(node)->GetKerArgSize();
      } else if (node->GetType() == hipGraphNodeTypeGraph) {
        auto& childParallelLists = reinterpret_cast<hip::ChildGraphNode*>(node)->GetParallelLists();
@@ -375,8 +374,13 @@ hipError_t AllocKernelArgForGraph(std::vector<hip::Node>& topoOrder, hip::Stream
                                  hip::GraphExec* graphExec) {
  hipError_t status = hipSuccess;
  for (auto& node : topoOrder) {
-    if (node->GetType() == hipGraphNodeTypeKernel &&
-        !reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
+    if (node->GetType() == hipGraphNodeTypeKernel) {
+      // Check if graph requires hidden heap and set as part of graphExec param.
+      static bool initialized = false;
+      if (!initialized && reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
+        graphExec->SetHiddenHeap();
+        initialized = true;
+      }
      auto kernelNode = reinterpret_cast<hip::GraphKernelNode*>(node);
      // From the kernel pool allocate the kern arg size required for the current kernel node.
      address kernArgOffset = nullptr;
@@ -591,8 +595,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::St
    accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
  }
  for (int i = 0; i < topoOrder.size(); i++) {
-    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel &&
-        !reinterpret_cast<hip::GraphKernelNode*>(topoOrder[i])->HasHiddenHeap()) {
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) {
      if (topoOrder[i]->GetEnabled()) {
        hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(),
                                              topoOrder[i]->GetKernelName(),
@@ -640,6 +643,16 @@ hipError_t GraphExec::Run(hipStream_t stream) {

  if (parallelLists_.size() == 1 &&
      instantiateDeviceId_ == hip_stream->DeviceId()) {
+    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
+      // If the graph has kernels that do device-side allocation, the heap is allocated during
+      // packet capture, because the heap pointer has to be added to the AQL packet, and it is
+      // initialized during graph launch.
+      static bool initialized = false;
+      if (!initialized && HasHiddenHeap()) {
+        hip_stream->vdev()->HiddenHeapInit();
+        initialized = true;
+      }
+    }
    status = EnqueueGraphWithSingleList(topoOrder_, hip_stream, this);
  } else if (parallelLists_.size() == 1 &&
             instantiateDeviceId_ != hip_stream->DeviceId()) {
@@ -569,6 +569,7 @@ struct GraphExec : public amd::ReferenceCountedObject {
  uint32_t kernarg_graph_cur_offset_ = 0;
  uint32_t kernarg_graph_size_ = 128 * Ki;
  int instantiateDeviceId_ = -1;
+  bool hasHiddenHeap_ = false;                 //!< Kernel has hidden heap(device side allocation)

 public:
  GraphExec(std::vector<Node>& topoOrder, std::vector<std::vector<Node>>& lists,
@@ -619,6 +620,10 @@ struct GraphExec : public amd::ReferenceCountedObject {
    }
    return clonedNode;
  }
+  // Returns whether the graph has nodes that require the hidden heap.
+  bool HasHiddenHeap() const { return hasHiddenHeap_; }
+  // Marks the graph as having nodes that require the hidden heap.
+  void SetHiddenHeap() { hasHiddenHeap_ = true; }

  address allocKernArg(size_t size, size_t alignment) {
    assert(alignment != 0);
@@ -1308,7 +1308,8 @@ class VirtualDevice : public amd::HeapObject {

  //! Returns fence state of the VirtualGPU
  virtual bool isFenceDirty() const = 0;
-
+  //! Init hidden heap for device memory allocations
+  virtual void HiddenHeapInit() = 0;
  //! Dispatch captured AQL packet
  virtual bool dispatchAqlPacket(uint8_t* aqlpacket,
                                 const std::string& kernelName,
@@ -2102,7 +2103,9 @@ class Device : public RuntimeObject {
  static Memory* p2p_stage_;          //!< Staging resources
  std::vector<Device*> enabled_p2p_devices_;  //!< List of user enabled P2P devices for this device

-  std::once_flag heap_initialized_; //!< Heap buffer initialization flag
+  std::once_flag heap_initialized_;  //!< Heap buffer initialization flag
+  std::once_flag heap_allocated_;    //!< Heap buffer allocation flag
+
  device::Memory* heap_buffer_;     //!< Preallocated heap buffer for memory allocations on device

  amd::Memory* arena_mem_obj_;      //!< Arena memory object
@@ -357,6 +357,8 @@ class VirtualGPU : public device::VirtualDevice {

  bool isFenceDirty() const { return false; }

+  void HiddenHeapInit() {}
+
  inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
                                amd::AccumulateCommand* vcmd = nullptr) {
    vcmd->addKernelName(kernelName);
@@ -2621,8 +2621,7 @@ bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Mem
  address parameters = captureArguments(kernels_[blitType]);
  result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
  releaseArguments(parameters);
-  synchronize();
-
+  gpu().releaseGpuMemoryFence();
  return result;
 }

@@ -3511,7 +3511,7 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer

 // ================================================================================================
 void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
-  auto HeapAllocZeroOut = [this, &gpu]() -> bool {
+  auto HeapAllocOnly = [this, &gpu]() -> bool {
    // Allocate initial heap for device memory allocator
    static constexpr size_t HeapBufferSize = 128 * Ki;
    heap_buffer_ = createMemory(HeapBufferSize);
@@ -3523,12 +3523,22 @@ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
      LogError("Heap buffer allocation failed!");
      return false;
    }
-    bool result = static_cast<const KernelBlitManager&>(gpu.blitMgr()).initHeap(
-        heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi));
+    return true;
+  };
+  std::call_once(heap_allocated_, HeapAllocOnly);
+}
+
+// ================================================================================================
+void Device::HiddenHeapInit(const VirtualGPU& gpu) {
+  auto HeapZeroOut = [this, &gpu]() -> bool {
+    static constexpr size_t HeapBufferSize = 128 * Ki;
+    bool result = static_cast<const KernelBlitManager&>(gpu.blitMgr())
+                      .initHeap(heap_buffer_, initial_heap_buffer_, HeapBufferSize,
+                                initial_heap_size_ / (2 * Mi));

    return result;
  };
-  std::call_once(heap_initialized_, HeapAllocZeroOut);
+  std::call_once(heap_initialized_, HeapZeroOut);
 }

 // ================================================================================================
@@ -594,7 +594,8 @@ class Device : public NullDevice {

  //! Allocates hidden heap for device memory allocations
  void HiddenHeapAlloc(const VirtualGPU& gpu);
-
+  //! Init hidden heap for device memory allocations
+  void HiddenHeapInit(const VirtualGPU& gpu);
  uint32_t fetchSDMAMask(const device::BlitManager* handle, bool readEngine = true) const;
  void resetSDMAMask(const device::BlitManager* handle) const;
  void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
@@ -2962,6 +2962,8 @@ static inline void nontemporalMemcpy(
 #endif
 }

+void VirtualGPU::HiddenHeapInit() { const_cast<Device&>(dev()).HiddenHeapInit(*this); }
+
 // ================================================================================================
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    const amd::Kernel& kernel, const_address parameters, void* eventHandle,
@@ -3016,7 +3018,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,

  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
-
+  bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
  for (int j = 0; j < iteration; j++) {
    // Reset global size for dimension dim if split is needed
    if (dim != -1) {
@@ -3143,6 +3145,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
            const_cast<Device&>(dev()).HiddenHeapAlloc(*this);
          }
          if (dev().HeapBuffer() != nullptr) {
+            // Initialize hidden heap buffer
+            if (!isGraphCapture) {
+              const_cast<Device&>(dev()).HiddenHeapInit(*this);
+            }
            // Add heap pointer to the code
            size_t heap_ptr = static_cast<size_t>(dev().HeapBuffer()->virtualAddress());
            WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
@@ -3225,7 +3231,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    }

    address argBuffer = hidden_arguments;
-    bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
    size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize());

    // Find all parameters for the current kernel
@@ -419,6 +419,8 @@ class VirtualGPU : public device::VirtualDevice {

  void* allocKernArg(size_t size, size_t alignment);
  bool isFenceDirty() const { return fence_dirty_; }
+  void HiddenHeapInit();
+
  void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
  uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
  // } roc OpenCL integration