SWDEV-467102 - Hidden heap init for graph capture

If the graph has kernels that do device-side allocation, the heap is allocated during
packet capture, because the heap pointer has to be added to the AQL packet, and is
initialized during graph launch.

Handle the race with wait when two kernels that use the device heap are enqueued on multiple streams.

Change-Id: I45933b77fcaf7bc8fdf1bc906462e32b5d8d3688


[ROCm/clr commit: 57156c524d]
Этот коммит содержится в:
Anusha GodavarthySurya
2024-06-11 11:59:05 +00:00
коммит произвёл Anusha Godavarthy Surya
родитель c8bc9e3f2e
Коммит 291f079669
9 изменённых файлов: 57 добавлений и 17 удалений
+19 -6
Просмотреть файл
@@ -358,8 +358,7 @@ void GetKernelArgSizeForGraph(std::vector<std::vector<Node>>& parallelLists,
// arg size required for all graph kernel nodes to allocate
for (const auto& list : parallelLists) {
for (auto& node : list) {
if (node->GetType() == hipGraphNodeTypeKernel &&
!reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
if (node->GetType() == hipGraphNodeTypeKernel) {
kernArgSizeForGraph += reinterpret_cast<hip::GraphKernelNode*>(node)->GetKerArgSize();
} else if (node->GetType() == hipGraphNodeTypeGraph) {
auto& childParallelLists = reinterpret_cast<hip::ChildGraphNode*>(node)->GetParallelLists();
@@ -375,8 +374,13 @@ hipError_t AllocKernelArgForGraph(std::vector<hip::Node>& topoOrder, hip::Stream
hip::GraphExec* graphExec) {
hipError_t status = hipSuccess;
for (auto& node : topoOrder) {
if (node->GetType() == hipGraphNodeTypeKernel &&
!reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
if (node->GetType() == hipGraphNodeTypeKernel) {
// Check if graph requires hidden heap and set as part of graphExec param.
static bool initialized = false;
if (!initialized && reinterpret_cast<hip::GraphKernelNode*>(node)->HasHiddenHeap()) {
graphExec->SetHiddenHeap();
initialized = true;
}
auto kernelNode = reinterpret_cast<hip::GraphKernelNode*>(node);
// From the kernel pool allocate the kern arg size required for the current kernel node.
address kernArgOffset = nullptr;
@@ -591,8 +595,7 @@ hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::St
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
}
for (int i = 0; i < topoOrder.size(); i++) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel &&
!reinterpret_cast<hip::GraphKernelNode*>(topoOrder[i])->HasHiddenHeap()) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder[i]->GetType() == hipGraphNodeTypeKernel) {
if (topoOrder[i]->GetEnabled()) {
hip_stream->vdev()->dispatchAqlPacket(topoOrder[i]->GetAqlPacket(),
topoOrder[i]->GetKernelName(),
@@ -640,6 +643,16 @@ hipError_t GraphExec::Run(hipStream_t stream) {
if (parallelLists_.size() == 1 &&
instantiateDeviceId_ == hip_stream->DeviceId()) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
// If the graph has kernels that do device-side allocation, the heap is allocated during
// packet capture, because the heap pointer has to be added to the AQL packet, and is
// initialized during graph launch.
static bool initialized = false;
if (!initialized && HasHiddenHeap()) {
hip_stream->vdev()->HiddenHeapInit();
initialized = true;
}
}
status = EnqueueGraphWithSingleList(topoOrder_, hip_stream, this);
} else if (parallelLists_.size() == 1 &&
instantiateDeviceId_ != hip_stream->DeviceId()) {
+5
Просмотреть файл
@@ -569,6 +569,7 @@ struct GraphExec : public amd::ReferenceCountedObject {
uint32_t kernarg_graph_cur_offset_ = 0;
uint32_t kernarg_graph_size_ = 128 * Ki;
int instantiateDeviceId_ = -1;
bool hasHiddenHeap_ = false; //!< Kernel has hidden heap(device side allocation)
public:
GraphExec(std::vector<Node>& topoOrder, std::vector<std::vector<Node>>& lists,
@@ -619,6 +620,10 @@ struct GraphExec : public amd::ReferenceCountedObject {
}
return clonedNode;
}
// Reports whether this graph exec has been flagged as containing nodes that need the hidden heap.
bool HasHiddenHeap() const {
  return hasHiddenHeap_;
}
// Flags this graph exec as containing nodes that need the hidden heap.
void SetHiddenHeap() {
  hasHiddenHeap_ = true;
}
address allocKernArg(size_t size, size_t alignment) {
assert(alignment != 0);
+5 -2
Просмотреть файл
@@ -1308,7 +1308,8 @@ class VirtualDevice : public amd::HeapObject {
//! Returns fence state of the VirtualGPU
virtual bool isFenceDirty() const = 0;
//! Init hidden heap for device memory allocations
virtual void HiddenHeapInit() = 0;
//! Dispatch captured AQL packet
virtual bool dispatchAqlPacket(uint8_t* aqlpacket,
const std::string& kernelName,
@@ -2102,7 +2103,9 @@ class Device : public RuntimeObject {
static Memory* p2p_stage_; //!< Staging resources
std::vector<Device*> enabled_p2p_devices_; //!< List of user enabled P2P devices for this device
std::once_flag heap_initialized_; //!< Heap buffer initialization flag
std::once_flag heap_initialized_; //!< Heap buffer initialization flag
std::once_flag heap_allocated_; //!< Heap buffer allocation flag
device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device
amd::Memory* arena_mem_obj_; //!< Arena memory object
+2
Просмотреть файл
@@ -357,6 +357,8 @@ class VirtualGPU : public device::VirtualDevice {
bool isFenceDirty() const { return false; }
// Intentionally empty: this VirtualGPU performs no work for hidden heap initialization.
void HiddenHeapInit() {}
inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
amd::AccumulateCommand* vcmd = nullptr) {
vcmd->addKernelName(kernelName);
+1 -2
Просмотреть файл
@@ -2621,8 +2621,7 @@ bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Mem
address parameters = captureArguments(kernels_[blitType]);
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
releaseArguments(parameters);
synchronize();
gpu().releaseGpuMemoryFence();
return result;
}
+14 -4
Просмотреть файл
@@ -3511,7 +3511,7 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer
// ================================================================================================
void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
auto HeapAllocZeroOut = [this, &gpu]() -> bool {
auto HeapAllocOnly = [this, &gpu]() -> bool {
// Allocate initial heap for device memory allocator
static constexpr size_t HeapBufferSize = 128 * Ki;
heap_buffer_ = createMemory(HeapBufferSize);
@@ -3523,12 +3523,22 @@ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
LogError("Heap buffer allocation failed!");
return false;
}
bool result = static_cast<const KernelBlitManager&>(gpu.blitMgr()).initHeap(
heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi));
return true;
};
std::call_once(heap_allocated_, HeapAllocOnly);
}
// ================================================================================================
void Device::HiddenHeapInit(const VirtualGPU& gpu) {
auto HeapZeroOut = [this, &gpu]() -> bool {
static constexpr size_t HeapBufferSize = 128 * Ki;
bool result = static_cast<const KernelBlitManager&>(gpu.blitMgr())
.initHeap(heap_buffer_, initial_heap_buffer_, HeapBufferSize,
initial_heap_size_ / (2 * Mi));
return result;
};
std::call_once(heap_initialized_, HeapAllocZeroOut);
std::call_once(heap_initialized_, HeapZeroOut);
}
// ================================================================================================
+2 -1
Просмотреть файл
@@ -594,7 +594,8 @@ class Device : public NullDevice {
//! Allocates hidden heap for device memory allocations
void HiddenHeapAlloc(const VirtualGPU& gpu);
//! Init hidden heap for device memory allocations
void HiddenHeapInit(const VirtualGPU& gpu);
uint32_t fetchSDMAMask(const device::BlitManager* handle, bool readEngine = true) const;
void resetSDMAMask(const device::BlitManager* handle) const;
void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
+7 -2
Просмотреть файл
@@ -2962,6 +2962,8 @@ static inline void nontemporalMemcpy(
#endif
}
void VirtualGPU::HiddenHeapInit() { const_cast<Device&>(dev()).HiddenHeapInit(*this); }
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
const amd::Kernel& kernel, const_address parameters, void* eventHandle,
@@ -3016,7 +3018,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
for (int j = 0; j < iteration; j++) {
// Reset global size for dimension dim if split is needed
if (dim != -1) {
@@ -3143,6 +3145,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
const_cast<Device&>(dev()).HiddenHeapAlloc(*this);
}
if (dev().HeapBuffer() != nullptr) {
// Initialize hidden heap buffer
if (!isGraphCapture) {
const_cast<Device&>(dev()).HiddenHeapInit(*this);
}
// Add heap pointer to the code
size_t heap_ptr = static_cast<size_t>(dev().HeapBuffer()->virtualAddress());
WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
@@ -3225,7 +3231,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
}
address argBuffer = hidden_arguments;
bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize());
// Find all parameters for the current kernel
+2
Просмотреть файл
@@ -419,6 +419,8 @@ class VirtualGPU : public device::VirtualDevice {
void* allocKernArg(size_t size, size_t alignment);
bool isFenceDirty() const { return fence_dirty_; }
void HiddenHeapInit();
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
// } roc OpenCL integration