Correct reported info in ROC profiler

OCL can't distinguish between different copy types, but the ROC profiler
expects visibility into SDMA transfers. Add extra code to detect
a transfer involving host memory and substitute the OCL command type
accordingly.

Change-Id: I5290acd0e10bc082e00c1d4ae1474a075de7f165
Этот коммит содержится в:
German Andryeyev
2020-10-16 14:20:58 -04:00
родитель 17d2e5cf56
Коммит bd340d8cbf
6 изменённых файлов: 47 добавлений и 11 удалений
+23 -3
Просмотреть файл
@@ -1630,6 +1630,17 @@ bool KernelBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory
address parameters = captureArguments(kernels_[blitType]);
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
releaseArguments(parameters);
if (amd::IS_HIP) {
// Update the command type for ROC profiler
if (srcMemory.isHostMemDirectAccess()) {
gpu().SetCopyCommandType(CL_COMMAND_WRITE_BUFFER_RECT);
}
if (dstMemory.isHostMemDirectAccess()) {
gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER_RECT);
}
}
synchronize();
return result;
@@ -1857,6 +1868,7 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
return result;
}
// ================================================================================================
bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
@@ -1919,6 +1931,7 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
return result;
}
// ================================================================================================
bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& sizeIn, bool entire) const {
@@ -1975,12 +1988,10 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
// Program source origin
uint64_t srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
;
setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
// Program destination origin
uint64_t dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
;
setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
uint64_t copySize = size[0];
@@ -2001,7 +2012,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
releaseArguments(parameters);
} else {
//printf("rocm!\n");
if (amd::IS_HIP) {
// Update the command type for ROC profiler
if (srcMemory.isHostMemDirectAccess()) {
gpu().SetCopyCommandType(CL_COMMAND_WRITE_BUFFER);
}
if (dstMemory.isHostMemDirectAccess()) {
gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER);
}
}
result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
}
@@ -2010,6 +2029,7 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
return result;
}
// ================================================================================================
bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
+9 -1
Просмотреть файл
@@ -694,7 +694,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
schedulerQueue_(nullptr),
schedulerSignal_({0}),
cuMask_(cuMask),
priority_(priority)
priority_(priority),
copy_command_type_(0)
{
index_ = device.numOfVgpus_++;
gpu_device_ = device.getBackendDevice();
@@ -1360,6 +1361,7 @@ bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memo
return true;
}
// ================================================================================================
void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
@@ -1374,9 +1376,15 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
cmd.setStatus(CL_INVALID_OPERATION);
}
// Runtime may change the command type to report a more accurate info in ROC profiler
if (copy_command_type_ != 0) {
cmd.OverrrideCommandType(copy_command_type_);
copy_command_type_ = 0;
}
profilingEnd(cmd);
}
// ================================================================================================
void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
+5
Просмотреть файл
@@ -295,6 +295,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Raises the hasPendingDispatch_ flag on this virtual GPU.
//! NOTE(review): despite the query-like name, this is a setter; it never clears the flag.
void hasPendingDispatch() {
  hasPendingDispatch_ = true;
}
//! Raises the addSystemScope_ flag on this virtual GPU.
//! NOTE(review): where the flag is consumed and cleared is not visible here - confirm.
void addSystemScope() {
  addSystemScope_ = true;
}
//! Remembers the OpenCL command type that should be reported for a copy.
//! The stored value is kept in copy_command_type_; a value of 0 means
//! "no override requested".
//! \param type  Command type to report (e.g. a read or write buffer command)
void SetCopyCommandType(cl_command_type type) {
  copy_command_type_ = type;
}
// } roc OpenCL integration
private:
@@ -403,6 +404,10 @@ class VirtualGPU : public device::VirtualDevice {
//!< bit-vector representing the CU mask. Each active bit represents using one CU
const std::vector<uint32_t> cuMask_;
amd::CommandQueue::Priority priority_; //!< The priority for the hsa queue
cl_command_type copy_command_type_; //!< Type of the copy command, used for ROC profiler
//!< OCL doesn't distinguish different copy types,
//!< but ROC profiler expects D2H or H2D detection
};
template <typename T>
+5 -4
Просмотреть файл
@@ -115,7 +115,7 @@ class ActivityProf {
if (IsEnabled()) {
uint64_t start = obj.profilingInfo().start_;
uint64_t end = obj.profilingInfo().end_;
callback(start, end, bytes);
callback(obj.type(), start, end, bytes);
}
}
@@ -123,11 +123,12 @@ class ActivityProf {
private:
// Activity callback routine
void callback(const uint64_t begin_ts, const uint64_t end_ts, const size_t bytes) {
activity_op_t op_id = (command_id_ == CL_COMMAND_NDRANGE_KERNEL) ? OP_ID_DISPATCH : OP_ID_COPY;
void callback(const command_id_t command_id,
const uint64_t begin_ts, const uint64_t end_ts, const size_t bytes) {
activity_op_t op_id = (command_id == CL_COMMAND_NDRANGE_KERNEL) ? OP_ID_DISPATCH : OP_ID_COPY;
activity_record_t record {
ACTIVITY_DOMAIN_ID, // domain id
(activity_kind_t)command_id_, // activity kind
(activity_kind_t)command_id, // activity kind
op_id, // operation id
record_id_, // activity correlation id
begin_ts, // begin timestamp, ns
+1 -1
Просмотреть файл
@@ -123,7 +123,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
// status, we release all the resources associated with this instance.
releaseResources();
activity_.ReportEventTimestamps(*this);
activity_.ReportEventTimestamps(command());
// Broadcast all the waiters.
if (referenceCount() > 1) {
signal();
+4 -2
Просмотреть файл
@@ -213,8 +213,8 @@ class Command : public Event {
//! Next GPU command in the queue list
Command* next_;
const cl_command_type type_; //!< This command's OpenCL type.
volatile int32_t exception_; //!< The first raised exception.
cl_command_type type_; //!< This command's OpenCL type.
volatile int32_t exception_; //!< The first raised exception.
void* data_;
protected:
@@ -298,6 +298,8 @@ class Command : public Event {
//! Returns the wait bits currently stored for this command
uint32_t getWaitBits() const {
  return commandWaitBits_;
}
//! Replaces the OpenCL command type recorded for this command.
//! The runtime uses it to report a more accurate copy type to the ROC profiler.
//! NOTE(review): the name is misspelled ("Overrride"); it is kept as-is
//! because existing callers reference this exact spelling.
//! \param type  New command type to store in type_
void OverrrideCommandType(cl_command_type type) {
  type_ = type;
}
};
class UserEvent : public Command {