SWDEV-468424 - hipgraph capture memset node

Capture AQL packets during GraphInstantiation and enqueue AQL packets during graph launch.

Added support to capture single graph memset node.
Capture support for memset node is currently disabled.
Memset capture will be enabled once capture of multiple packets is supported.

Change-Id: I14dfbc41731025cc3a548a730558915def3fa384


[ROCm/clr commit: 346da4bb40]
Этот коммит содержится в:
Anusha GodavarthySurya
2024-06-18 11:12:37 +00:00
коммит произвёл Anusha Godavarthy Surya
родитель 7363b984c1
Коммит 7985a72073
5 изменённых файлов: 83 добавлений и 59 удалений
+10 -9
Просмотреть файл
@@ -362,9 +362,9 @@ void GetKernelArgSizeForGraph(std::vector<std::vector<Node>>& parallelLists,
// GPU packet capture is enabled for kernel nodes. Calculate the kernel
// arg size required for all graph kernel nodes to allocate
for (const auto& list : parallelLists) {
for (auto& node : list) {
if (node->GetType() == hipGraphNodeTypeKernel) {
kernArgSizeForGraph += reinterpret_cast<hip::GraphKernelNode*>(node)->GetKerArgSize();
for (hip::GraphNode* node : list) {
if (node->GraphCaptureEnabled()) {
kernArgSizeForGraph += node->GetKerArgSize();
} else if (node->GetType() == hipGraphNodeTypeGraph) {
auto& childParallelLists =
reinterpret_cast<hip::ChildGraphNode*>(node)->GetParallelLists();
@@ -388,18 +388,19 @@ hipError_t AllocKernelArgForGraphNode(std::vector<hip::Node>& topoOrder,
graphExec->SetHiddenHeap();
initialized = true;
}
auto kernelNode = reinterpret_cast<hip::GraphKernelNode*>(node);
// From the kernel pool allocate the kern arg size required for the current kernel node.
}
if (node->GraphCaptureEnabled()) {
// From the kernel pool allocate the kern arg size required for the current node.
address kernArgOffset = nullptr;
if (kernelNode->GetKernargSegmentByteSize()) {
if (node->GetKernargSegmentByteSize()) {
kernArgOffset = graphExec->kernArgManager_->AllocKernArg(
kernelNode->GetKernargSegmentByteSize(), kernelNode->GetKernargSegmentAlignment());
node->GetKernargSegmentByteSize(), node->GetKernargSegmentAlignment());
if (kernArgOffset == nullptr) {
return hipErrorMemoryAllocation;
}
}
// Form GPU packet capture for the kernel node.
kernelNode->CaptureAndFormPacket(capture_stream, kernArgOffset);
node->CaptureAndFormPacket(capture_stream, kernArgOffset);
} else if (node->GetType() == hipGraphNodeTypeGraph) {
auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
auto& childParallelLists = childNode->GetParallelLists();
@@ -551,7 +552,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::St
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
}
for (int i = 0; i < topoOrder.size(); i++) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) {
if (topoOrder[i]->GraphCaptureEnabled()) {
if (topoOrder[i]->GetEnabled()) {
hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(),
topoOrder[i]->GetKernelName(),
+30 -23
Просмотреть файл
@@ -186,6 +186,9 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
unsigned int isEnabled_;
uint8_t gpuPacket_[64]; //!< GPU Packet to enqueue during graph launch
std::string capturedKernelName_;
size_t alignedKernArgSize_ = 256; //!< Aligned size required for kernel args
size_t kernargSegmentByteSize_ = 256; //!< Kernel arg segment byte size
size_t kernargSegmentAlignment_ = 256; //!< Kernel arg segment alignment
public:
GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "",
@@ -237,7 +240,19 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
uint8_t* GetAqlPacket() { return gpuPacket_; }
void SetKernelName(const std::string& kernelName) { capturedKernelName_ = kernelName; }
const std::string& GetKernelName() const { return capturedKernelName_; }
// Returns the aligned size required for this node's kernel args.
size_t GetKerArgSize() const { return alignedKernArgSize_; }
// Returns the kernel arg segment size, in bytes, recorded for this node.
size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
// Returns the kernel arg segment alignment recorded for this node.
size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
// Builds this node's commands on capture_stream and captures their GPU packet
// instead of dispatching it.
//   capture_stream: stream handed to CreateCommand() to build the commands.
//   kernArgOffset:  kernel arg buffer handed to each command's capturing state.
void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) {
  // The creation status used to be stored and never examined, so the loop below ran
  // even when command creation had failed. Nothing is captured in that case now.
  const hipError_t status = CreateCommand(capture_stream);
  if (status != hipSuccess) {
    return;
  }
  for (auto& command : commands_) {
    command->setCapturingState(true, GetAqlPacket(), kernArgOffset, &capturedKernelName_);
    // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
    // The packet is stored in gpuPacket_ and submitted during graph launch.
    command->submit(*(command->queue())->vdev());
    command->release();
  }
}
hip::Stream* GetQueue() const { return stream_; }
virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) {
@@ -380,6 +395,20 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
}
unsigned int GetEnabled() const { return isEnabled_; }
void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; }
// Reports whether GPU packet capture applies to this node. The
// DEBUG_CLR_GRAPH_PACKET_CAPTURE flag must be set and the node type must be one of
// the types listed in the switch (currently only kernel nodes).
bool GraphCaptureEnabled() {
  if (!DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
    return false;
  }
  switch (GetType()) {
    case hipGraphNodeTypeKernel:
      return true;
    default:
      return false;
  }
}
};
struct Graph {
@@ -835,16 +864,9 @@ class GraphKernelNode : public GraphNode {
hipKernelNodeAttrValue kernelAttr_; //!< Kernel node attributes
unsigned int kernelAttrInUse_; //!< Kernel attributes in use
ihipExtKernelEvents kernelEvents_; //!< Events for Ext launch kernel
size_t alignedKernArgSize_; //!< Aligned size required for kernel args
size_t kernargSegmentByteSize_; //!< Kernel arg segment byte size
size_t kernargSegmentAlignment_; //!< Kernel arg segment alignment
bool hasHiddenHeap_; //!< Kernel has hidden heap(device side allocation)
public:
size_t GetKerArgSize() const { return alignedKernArgSize_; }
size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
bool HasHiddenHeap() const { return hasHiddenHeap_; }
void EnqueueCommands(hipStream_t stream) override {
// If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
@@ -888,21 +910,6 @@ class GraphKernelNode : public GraphNode {
out << "];";
}
void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) {
hipError_t status = CreateCommand(capture_stream);
for (auto& command : commands_) {
reinterpret_cast<amd::NDRangeKernelCommand*>(command)->setCapturingState(
true, GetAqlPacket(), kernArgOffset);
// Enqueue command to capture GPU Packet. The packet is not submitted to the device.
// The packet is stored in gpuPacket_ and submitted during graph launch.
command->submit(*(command->queue())->vdev());
// Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes
SetKernelName(reinterpret_cast<amd::NDRangeKernelCommand*>(command)->kernel().name());
command->release();
}
}
virtual std::string GetLabel(hipGraphDebugDotFlags flag) override {
hipFunction_t func = getFunc(kernelParams_, ihipGetDevice());
hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
+12 -7
Просмотреть файл
@@ -1497,6 +1497,9 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
}
}
}
if (command.getCapturingState()) {
currCmd_ = &command;
}
}
// ================================================================================================
@@ -1514,6 +1517,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
if (AMD_DIRECT_DISPATCH) {
assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty());
}
currCmd_ = nullptr;
}
// ================================================================================================
@@ -3018,7 +3022,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
bool isGraphCapture = currCmd_ != nullptr && currCmd_->getCapturingState();
for (int j = 0; j < iteration; j++) {
// Reset global size for dimension dim if split is needed
if (dim != -1) {
@@ -3238,7 +3242,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
// Allocate buffer to hold kernel arguments
if (isGraphCapture) {
argBuffer = vcmd->getKernArgOffset();
argBuffer = currCmd_->getKernArgOffset();
currCmd_->SetKernelName(gpuKernel.name());
} else {
argBuffer = reinterpret_cast<address>(
@@ -3324,18 +3329,18 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
}
if (vcmd == nullptr) {
if (isGraphCapture) {
// Dispatch the packet
if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
(sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
GPU_FLUSH_ON_EXECUTION)) {
(sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
GPU_FLUSH_ON_EXECUTION, currCmd_->getCapturingState(),
currCmd_->getAqlPacket())) {
return false;
}
} else {
if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
(sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
GPU_FLUSH_ON_EXECUTION, vcmd->getCapturingState(),
vcmd->getAqlPacket())) {
GPU_FLUSH_ON_EXECUTION)) {
return false;
}
}
+2
Просмотреть файл
@@ -575,5 +575,7 @@ class VirtualGPU : public device::VirtualDevice {
std::atomic<uint> lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask
using KernelArgImpl = device::Settings::KernelArgImpl;
amd::Command* currCmd_ = nullptr; //!< Current command under capture
};
}
+29 -20
Просмотреть файл
@@ -256,6 +256,10 @@ class Command : public Event {
std::vector<void*> data_;
const Event* waitingEvent_; //!< Waiting event associated with the marker
bool capturing_ = false; //!< Flag to enable/disable graph GPU packet capture
uint8_t* gpuPacket_ = nullptr; //!< GPU packet to capture, when graph capturing is enabled
address kernArgOffset_ = nullptr; //!< KernelArg buffer to use when graph capturing is enabled
std::string* capturedKernelName_ = nullptr; //!< Destination for the name of the kernel under capture
protected:
bool cpu_wait_ = false; //!< If true, then the command was issued for CPU/GPU sync
@@ -292,6 +296,31 @@ class Command : public Event {
}
public:
//! Returns true when graph GPU packet capture is enabled for this command
bool getCapturingState() const { return capturing_; }
//! Configures graph capture for this command: the enable flag, the buffer holding the
//! AQL packet to capture, the kernArg buffer to use, and the string that receives
//! the captured kernel name
void setCapturingState(bool state, uint8_t* packet, address kernArgOffset,
                       std::string* capturedKernelName) {
  capturedKernelName_ = capturedKernelName;
  kernArgOffset_ = kernArgOffset;
  gpuPacket_ = packet;
  capturing_ = state;
}
//! Stores kernelName into the captured-kernel-name string, if one was registered
void SetKernelName(const std::string& kernelName) {
  if (capturedKernelName_ == nullptr) {
    return;
  }
  *capturedKernelName_ = kernelName;
}
//! Returns the GPU packet buffer used when graph capturing is enabled
const uint8_t* getAqlPacket() const { return gpuPacket_; }
//! Returns the KernelArg buffer to use when graph capturing is enabled
const address getKernArgOffset() const { return kernArgOffset_; }
//! Overload new/delete for fast commands allocation/destruction
void* operator new(size_t size);
void operator delete(void* ptr);
@@ -1075,10 +1104,6 @@ class NDRangeKernelCommand : public Command {
uint32_t firstDevice_; //!< Device index of the first device in the gridc
uint32_t numWorkgroups_; //!< Total number of workgroups in the current launch
bool capturing_ = false; //!< Flag to enable/disable graph gpu packet capture
uint8_t* gpuPacket_ = nullptr; //!< GPU packet to capture, when graph capturing is enabled
address kernArgOffset_ = nullptr; //!< KernelArg buffer to used when graph capturing is enabled
public:
enum {
CooperativeGroups = 0x01,
@@ -1086,22 +1111,6 @@ class NDRangeKernelCommand : public Command {
AnyOrderLaunch = 0x04,
};
//! Returns AQL buffer state
bool getCapturingState() const { return capturing_; }
//! Sets AQL capture state, aql packet to capture and where to copy kernArgs
void setCapturingState(bool state, uint8_t* packet, address kernArgOffset) {
capturing_ = state;
gpuPacket_ = packet;
kernArgOffset_ = kernArgOffset;
}
//! returns the graph executable object command belongs to.
const uint8_t* getAqlPacket() const { return gpuPacket_; }
//! returns the graph executable object command belongs to.
const address getKernArgOffset() const { return kernArgOffset_; }
//! Construct an ExecuteKernel command
NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,