From dec95e58e348987ef461da3fd870d5a7c04655b0 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 22 Jul 2020 11:35:15 -0700 Subject: [PATCH] Enable queue profile only if we attach a profiler Submit explicit profile marker for hipEventRecord to record timestamps. Enable explicit signal profiling if the API specifies start and stop events. Toggle this with env var HIP_FORCE_QUEUE_PROFILING=0 Change-Id: Iae449a63ec3ebf6c2880e65d7b1dd1031a29018f --- hipamd/rocclr/hip_event.cpp | 22 ++++++++++++++-------- hipamd/rocclr/hip_event.hpp | 7 ++++--- hipamd/rocclr/hip_module.cpp | 13 ++++++++----- hipamd/rocclr/hip_stream.cpp | 15 ++++++++++++--- 4 files changed, 38 insertions(+), 19 deletions(-) diff --git a/hipamd/rocclr/hip_event.cpp b/hipamd/rocclr/hip_event.cpp index a9ea30e15c..858309d638 100644 --- a/hipamd/rocclr/hip_event.cpp +++ b/hipamd/rocclr/hip_event.cpp @@ -140,6 +140,19 @@ hipError_t Event::streamWait(amd::HostQueue* hostQueue, uint flags) { void Event::addMarker(amd::HostQueue* queue, amd::Command* command, bool record) { amd::ScopedLock lock(lock_); + if (queue->properties().test(CL_QUEUE_PROFILING_ENABLE)) { + if (command == nullptr) { + command = queue->getLastQueuedCommand(true); + if (command == nullptr) { + command = new amd::Marker(*queue, kMarkerDisableFlush); + command->enqueue(); + } + } + } else if (command == nullptr) { + command = new hip::ProfileMarker(*queue, false); + command->enqueue(); + } + if (event_ == &command->event()) return; if (event_ != nullptr) { @@ -239,16 +252,9 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { } hip::Event* e = reinterpret_cast<hip::Event*>(event); - amd::ScopedLock lock(e->lock()); - amd::HostQueue* queue = hip::getQueue(stream); - amd::Command* command = queue->getLastQueuedCommand(true); - if (command == nullptr) { - command = new amd::Marker(*queue, kMarkerDisableFlush); - command->enqueue(); - } - e->addMarker(queue, command, true); + e->addMarker(queue, nullptr, true); 
HIP_RETURN(hipSuccess); } diff --git a/hipamd/rocclr/hip_event.hpp b/hipamd/rocclr/hip_event.hpp index dbd43b1a5c..dccd4e884a 100644 --- a/hipamd/rocclr/hip_event.hpp +++ b/hipamd/rocclr/hip_event.hpp @@ -26,12 +26,13 @@ namespace hip { -class TimerMarker: public amd::Marker { +class ProfileMarker: public amd::Marker { public: - TimerMarker(amd::HostQueue& queue) : amd::Marker(queue, false) { + ProfileMarker(amd::HostQueue& queue, bool disableFlush) + : amd::Marker(queue, disableFlush) { profilingInfo_.enabled_ = true; profilingInfo_.callback_ = nullptr; - profilingInfo_.start_ = profilingInfo_.end_ = 0; + profilingInfo_.clear(); } }; diff --git a/hipamd/rocclr/hip_module.cpp b/hipamd/rocclr/hip_module.cpp index 07944c68ad..07eeb55a84 100755 --- a/hipamd/rocclr/hip_module.cpp +++ b/hipamd/rocclr/hip_module.cpp @@ -249,7 +249,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, size_t localWorkSize[3] = { blockDimX, blockDimY, blockDimZ }; amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize); amd::Command::EventWaitList waitList; - + bool profileNDRange = false; address kernargs = nullptr; // 'extra' is a struct that contains the following info: { @@ -273,13 +273,16 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, desc.type_ == T_POINTER/*svmBound*/); } else { assert(extra == nullptr); - kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/); + kernel->parameters().set(i, desc.size_, kernelParams[i], + desc.type_ == T_POINTER/*svmBound*/); } } + profileNDRange = (startEvent != nullptr && stopEvent != nullptr); + amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand( *queue, waitList, *kernel, ndrange, sharedMemBytes, - params, gridId, numGrids, prevGridSum, allGridSum, firstDevice); + params, gridId, numGrids, prevGridSum, allGridSum, firstDevice, profileNDRange); if (!command) { return hipErrorOutOfMemory; } @@ -472,7 +475,7 
@@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL uint64_t prevGridSize = 0; uint32_t firstDevice = 0; - // Sync the execution streams on all devices + // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) { for (int i = 0; i < numDevices; ++i) { amd::HostQueue* queue = @@ -520,7 +523,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z; } - // Sync the execution streams on all devices + // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) { for (int i = 0; i < numDevices; ++i) { amd::HostQueue* queue = diff --git a/hipamd/rocclr/hip_stream.cpp b/hipamd/rocclr/hip_stream.cpp index 9da673f023..379954ef5c 100755 --- a/hipamd/rocclr/hip_stream.cpp +++ b/hipamd/rocclr/hip_stream.cpp @@ -22,6 +22,9 @@ #include "hip_internal.hpp" #include "hip_event.hpp" #include "thread/monitor.hpp" +#include "hip_prof_api.h" + +extern api_callbacks_table_t callbacks_table; static amd::Monitor streamSetLock{"Guards global stream set"}; static std::unordered_set streamSet; @@ -50,7 +53,12 @@ Stream::Stream(hip::Device* dev, Priority p, // ================================================================================================ bool Stream::Create() { - cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; + // Enable queue profiling if a profiler is attached which sets the callback_table flag + // or if we force it with env var. This would enable time stamp collection for every + // command submitted to the stream(queue). + cl_command_queue_properties properties = (callbacks_table.is_enabled() || + HIP_FORCE_QUEUE_PROFILING) ? 
+ CL_QUEUE_PROFILING_ENABLE : 0; amd::CommandQueue::Priority p; switch (priority_) { case Priority::High: @@ -64,8 +72,9 @@ bool Stream::Create() { p = amd::CommandQueue::Priority::Normal; break; } - amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], properties, - amd::CommandQueue::RealTimeDisabled, p, cuMask_); + amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], + properties, amd::CommandQueue::RealTimeDisabled, + p, cuMask_); // Create a host queue bool result = (queue != nullptr) ? queue->create() : false;