Enable queue profiling only if a profiler is attached

Submit an explicit profile marker for hipEventRecord so that timestamps
are still recorded when queue profiling is disabled. Enable explicit
signal profiling for a kernel launch if the API specifies both start
and stop events.

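Queue profiling can be forced on by setting the env var
HIP_FORCE_QUEUE_PROFILING to a nonzero value; with
HIP_FORCE_QUEUE_PROFILING=0 it is enabled only when a profiler is attached.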

Change-Id: Iae449a63ec3ebf6c2880e65d7b1dd1031a29018f
This commit is contained in:
Saleel Kudchadker
2020-07-22 11:35:15 -07:00
szülő c24f884c76
commit dec95e58e3
4 fájl változott, egészen pontosan 38 új sor hozzáadva és 19 régi sor törölve
+14 -8
Fájl megtekintése
@@ -140,6 +140,19 @@ hipError_t Event::streamWait(amd::HostQueue* hostQueue, uint flags) {
void Event::addMarker(amd::HostQueue* queue, amd::Command* command, bool record) {
amd::ScopedLock lock(lock_);
if (queue->properties().test(CL_QUEUE_PROFILING_ENABLE)) {
if (command == nullptr) {
command = queue->getLastQueuedCommand(true);
if (command == nullptr) {
command = new amd::Marker(*queue, kMarkerDisableFlush);
command->enqueue();
}
}
} else if (command == nullptr) {
command = new hip::ProfileMarker(*queue, false);
command->enqueue();
}
if (event_ == &command->event()) return;
if (event_ != nullptr) {
@@ -239,16 +252,9 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) {
}
hip::Event* e = reinterpret_cast<hip::Event*>(event);
amd::ScopedLock lock(e->lock());
amd::HostQueue* queue = hip::getQueue(stream);
amd::Command* command = queue->getLastQueuedCommand(true);
if (command == nullptr) {
command = new amd::Marker(*queue, kMarkerDisableFlush);
command->enqueue();
}
e->addMarker(queue, command, true);
e->addMarker(queue, nullptr, true);
HIP_RETURN(hipSuccess);
}
+4 -3
Fájl megtekintése
@@ -26,12 +26,13 @@
namespace hip {
class TimerMarker: public amd::Marker {
class ProfileMarker: public amd::Marker {
public:
TimerMarker(amd::HostQueue& queue) : amd::Marker(queue, false) {
ProfileMarker(amd::HostQueue& queue, bool disableFlush)
: amd::Marker(queue, disableFlush) {
profilingInfo_.enabled_ = true;
profilingInfo_.callback_ = nullptr;
profilingInfo_.start_ = profilingInfo_.end_ = 0;
profilingInfo_.clear();
}
};
+8 -5
Fájl megtekintése
@@ -249,7 +249,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
size_t localWorkSize[3] = { blockDimX, blockDimY, blockDimZ };
amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
amd::Command::EventWaitList waitList;
bool profileNDRange = false;
address kernargs = nullptr;
// 'extra' is a struct that contains the following info: {
@@ -273,13 +273,16 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
desc.type_ == T_POINTER/*svmBound*/);
} else {
assert(extra == nullptr);
kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/);
kernel->parameters().set(i, desc.size_, kernelParams[i],
desc.type_ == T_POINTER/*svmBound*/);
}
}
profileNDRange = (startEvent != nullptr && stopEvent != nullptr);
amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(
*queue, waitList, *kernel, ndrange, sharedMemBytes,
params, gridId, numGrids, prevGridSum, allGridSum, firstDevice);
params, gridId, numGrids, prevGridSum, allGridSum, firstDevice, profileNDRange);
if (!command) {
return hipErrorOutOfMemory;
}
@@ -472,7 +475,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
uint64_t prevGridSize = 0;
uint32_t firstDevice = 0;
// Sync the execution streams on all devices
// Sync the execution streams on all devices
if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) {
for (int i = 0; i < numDevices; ++i) {
amd::HostQueue* queue =
@@ -520,7 +523,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
}
// Sync the execution streams on all devices
// Sync the execution streams on all devices
if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) {
for (int i = 0; i < numDevices; ++i) {
amd::HostQueue* queue =
+12 -3
Fájl megtekintése
@@ -22,6 +22,9 @@
#include "hip_internal.hpp"
#include "hip_event.hpp"
#include "thread/monitor.hpp"
#include "hip_prof_api.h"
extern api_callbacks_table_t callbacks_table;
static amd::Monitor streamSetLock{"Guards global stream set"};
static std::unordered_set<hip::Stream*> streamSet;
@@ -50,7 +53,12 @@ Stream::Stream(hip::Device* dev, Priority p,
// ================================================================================================
bool Stream::Create() {
cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
// Enable queue profiling if a profiler is attached which sets the callback_table flag
// or if we force it with env var. This would enable time stamp collection for every
// command submitted to the stream(queue).
cl_command_queue_properties properties = (callbacks_table.is_enabled() ||
HIP_FORCE_QUEUE_PROFILING) ?
CL_QUEUE_PROFILING_ENABLE : 0;
amd::CommandQueue::Priority p;
switch (priority_) {
case Priority::High:
@@ -64,8 +72,9 @@ bool Stream::Create() {
p = amd::CommandQueue::Priority::Normal;
break;
}
amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], properties,
amd::CommandQueue::RealTimeDisabled, p, cuMask_);
amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0],
properties, amd::CommandQueue::RealTimeDisabled,
p, cuMask_);
// Create a host queue
bool result = (queue != nullptr) ? queue->create() : false;