SWDEV-444988 - Fix __amd_rocclr_initHeap sync with DEBUG_CLR_GRAPH_PACKET_CAPTURE

When kernel does device side malloc, initial heap is allocated with __amd_rocclr_initHeap.
During graph launch kernel __amd_rocclr_initHeap is enqueued followed by actual kernel . So kernel will execute after initHeap kernel.

But with graph optimizations during capture initHeap gets enqueued on device null stream and actual kernel on graph launch stream.
So no proper synchronization. Switch to command creation and enqueue during launch for kernel node with hidden heap.

Change-Id: Iaf600251faef9a448853f19429023c118aa760b9


[ROCm/clr commit: 2dc6ec68a5]
Cette révision appartient à :
Anusha GodavarthySurya
2024-02-12 16:34:47 +00:00
révisé par Saleel Kudchadker
Parent e8a077dc68
révision 49cbeaea21
2 fichiers modifiés avec 28 ajouts et 15 suppressions
+19 -15
Voir le fichier
@@ -351,7 +351,8 @@ hipError_t GraphExec::CaptureAQLPackets() {
// arg size required for all graph kernel nodes to allocate
for (const auto& list : parallelLists_) {
for (auto& node : list) {
if (node->GetType() == hipGraphNodeTypeKernel) {
if (node->GetType() == hipGraphNodeTypeKernel &&
!reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
kernArgSizeForGraph += reinterpret_cast<hip::GraphKernelNode*>(node)->GetKerArgSize();
}
}
@@ -373,7 +374,8 @@ hipError_t GraphExec::CaptureAQLPackets() {
kernarg_pool_size_graph_ = kernArgSizeForGraph;
}
for (auto& node : topoOrder_) {
if (node->GetType() == hipGraphNodeTypeKernel) {
if (node->GetType() == hipGraphNodeTypeKernel &&
!reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
auto kernelNode = reinterpret_cast<hip::GraphKernelNode*>(node);
// From the kernel pool allocate the kern arg size required for the current kernel node.
address kernArgOffset = nullptr;
@@ -560,16 +562,20 @@ hipError_t GraphExec::Run(hipStream_t stream) {
if (parallelLists_.size() == 1 &&
instantiateDeviceId_ == hip_stream->DeviceId()) {
amd::AccumulateCommand* accumulate = nullptr;
bool isLastPacketKernel = false;
bool isLastKernelWithoutHiddenHeap =
((topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) &&
!reinterpret_cast<hip::GraphKernelNode*>(topoOrder_.back())->HasHiddenHeap());
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
uint8_t* lastCapturedPacket = (topoOrder_.back()->GetType() == hipGraphNodeTypeKernel)
? topoOrder_.back()->GetAqlPacket()
: nullptr;
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr, lastCapturedPacket);
uint8_t* lastCapturedPacket =
isLastKernelWithoutHiddenHeap ? topoOrder_.back()->GetAqlPacket() : nullptr;
if (topoOrder_.back()->GetEnabled()) {
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr, lastCapturedPacket);
}
}
for (int i = 0; i < topoOrder_.size() - 1; i++) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel &&
!reinterpret_cast<hip::GraphKernelNode*>(topoOrder_[i])->HasHiddenHeap()) {
if (topoOrder_[i]->GetEnabled()) {
hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
accumulate->addKernelName(topoOrder_[i]->GetKernelName());
@@ -583,20 +589,18 @@ hipError_t GraphExec::Run(hipStream_t stream) {
// If last captured packet is kernel, optimize to detect completion of last kernel
// This saves on extra packet submitted to determine end of graph
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && isLastKernelWithoutHiddenHeap) {
// Add the last kernel node name to the accumulate command
accumulate->addKernelName(topoOrder_.back()->GetKernelName());
accumulate->enqueue();
accumulate->release();
isLastPacketKernel = true;
if (topoOrder_.back()->GetEnabled()) {
accumulate->addKernelName(topoOrder_.back()->GetKernelName());
}
} else {
topoOrder_.back()->SetStream(hip_stream, this);
status = topoOrder_.back()->CreateCommand(topoOrder_.back()->GetQueue());
topoOrder_.back()->EnqueueCommands(stream);
}
// If last packet is not kernel, submit a marker to detect end of graph
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && !isLastPacketKernel) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
accumulate->enqueue();
accumulate->release();
}
+9
Voir le fichier
@@ -785,11 +785,14 @@ class GraphKernelNode : public GraphNode {
size_t alignedKernArgSize_; //!< Aligned size required for kernel args
size_t kernargSegmentByteSize_; //!< Kernel arg segment byte size
size_t kernargSegmentAlignment_; //!< Kernel arg segment alignment
bool hasHiddenHeap_; //!< Kernel has hidden heap(device side allocation)
public:
size_t GetKerArgSize() const { return alignedKernArgSize_; }
size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
bool HasHiddenHeap() const { return hasHiddenHeap_; }
void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) override {
out << "[";
out << "style";
@@ -926,6 +929,11 @@ class GraphKernelNode : public GraphNode {
}
::memcpy(kernelParams_.kernelParams[i], (pNodeParams->kernelParams[i]), desc.size_);
}
for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
if (signature.at(i).info_.oclObject_ == amd::KernelParameterDescriptor::HiddenHeap) {
hasHiddenHeap_ = true;
}
}
}
// Allocate/assign memory if params are passed as part of 'extra'
@@ -969,6 +977,7 @@ class GraphKernelNode : public GraphNode {
}
memset(&kernelAttr_, 0, sizeof(kernelAttr_));
kernelAttrInUse_ = 0;
hasHiddenHeap_ = false;
}
~GraphKernelNode() { freeParams(); }