Correct reported info in ROC profiler
OCL can't distinguish between different copy types, but the ROC profiler expects SDMA transfer visibility. Add extra code to detect a transfer involving host memory and substitute the OCL command type accordingly. Change-Id: I5290acd0e10bc082e00c1d4ae1474a075de7f165
Этот коммит содержится в:
@@ -1630,6 +1630,17 @@ bool KernelBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory
|
||||
address parameters = captureArguments(kernels_[blitType]);
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
|
||||
releaseArguments(parameters);
|
||||
|
||||
if (amd::IS_HIP) {
|
||||
// Update the command type for ROC profiler
|
||||
if (srcMemory.isHostMemDirectAccess()) {
|
||||
gpu().SetCopyCommandType(CL_COMMAND_WRITE_BUFFER_RECT);
|
||||
}
|
||||
if (dstMemory.isHostMemDirectAccess()) {
|
||||
gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER_RECT);
|
||||
}
|
||||
}
|
||||
|
||||
synchronize();
|
||||
|
||||
return result;
|
||||
@@ -1857,6 +1868,7 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
@@ -1919,6 +1931,7 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& sizeIn, bool entire) const {
|
||||
@@ -1975,12 +1988,10 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
|
||||
// Program source origin
|
||||
uint64_t srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
|
||||
;
|
||||
setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
|
||||
|
||||
// Program destination origin
|
||||
uint64_t dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
|
||||
;
|
||||
setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
|
||||
|
||||
uint64_t copySize = size[0];
|
||||
@@ -2001,7 +2012,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
|
||||
releaseArguments(parameters);
|
||||
} else {
|
||||
//printf("rocm!\n");
|
||||
if (amd::IS_HIP) {
|
||||
// Update the command type for ROC profiler
|
||||
if (srcMemory.isHostMemDirectAccess()) {
|
||||
gpu().SetCopyCommandType(CL_COMMAND_WRITE_BUFFER);
|
||||
}
|
||||
if (dstMemory.isHostMemDirectAccess()) {
|
||||
gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER);
|
||||
}
|
||||
}
|
||||
result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
|
||||
}
|
||||
|
||||
@@ -2010,6 +2029,7 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
|
||||
@@ -694,7 +694,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
schedulerQueue_(nullptr),
|
||||
schedulerSignal_({0}),
|
||||
cuMask_(cuMask),
|
||||
priority_(priority)
|
||||
priority_(priority),
|
||||
copy_command_type_(0)
|
||||
{
|
||||
index_ = device.numOfVgpus_++;
|
||||
gpu_device_ = device.getBackendDevice();
|
||||
@@ -1360,6 +1361,7 @@ bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memo
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
@@ -1374,9 +1376,15 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
|
||||
cmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
|
||||
// Runtime may change the command type to report more accurate info in the ROC profiler
|
||||
if (copy_command_type_ != 0) {
|
||||
cmd.OverrrideCommandType(copy_command_type_);
|
||||
copy_command_type_ = 0;
|
||||
}
|
||||
profilingEnd(cmd);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
@@ -295,6 +295,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
//! Sets the hasPendingDispatch_ flag to true
void hasPendingDispatch() { hasPendingDispatch_ = true; }
|
||||
//! Sets the addSystemScope_ flag to true
void addSystemScope() { addSystemScope_ = true; }
|
||||
//! Stores the command type to report for a copy in copy_command_type_;
//! submitCopyMemory() applies a non-zero value to the command and resets it to 0
void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }
|
||||
|
||||
// } roc OpenCL integration
|
||||
private:
|
||||
@@ -403,6 +404,10 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//!< bit-vector representing the CU mask. Each active bit represents using one CU
|
||||
const std::vector<uint32_t> cuMask_;
|
||||
amd::CommandQueue::Priority priority_; //!< The priority for the hsa queue
|
||||
|
||||
cl_command_type copy_command_type_; //!< Type of the copy command, used for ROC profiler
|
||||
//!< OCL doesn't distinguish different copy types,
|
||||
//!< but ROC profiler expects D2H or H2D detection
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -115,7 +115,7 @@ class ActivityProf {
|
||||
if (IsEnabled()) {
|
||||
uint64_t start = obj.profilingInfo().start_;
|
||||
uint64_t end = obj.profilingInfo().end_;
|
||||
callback(start, end, bytes);
|
||||
callback(obj.type(), start, end, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,11 +123,12 @@ class ActivityProf {
|
||||
|
||||
private:
|
||||
// Activity callback routine
|
||||
void callback(const uint64_t begin_ts, const uint64_t end_ts, const size_t bytes) {
|
||||
activity_op_t op_id = (command_id_ == CL_COMMAND_NDRANGE_KERNEL) ? OP_ID_DISPATCH : OP_ID_COPY;
|
||||
void callback(const command_id_t command_id,
|
||||
const uint64_t begin_ts, const uint64_t end_ts, const size_t bytes) {
|
||||
activity_op_t op_id = (command_id == CL_COMMAND_NDRANGE_KERNEL) ? OP_ID_DISPATCH : OP_ID_COPY;
|
||||
activity_record_t record {
|
||||
ACTIVITY_DOMAIN_ID, // domain id
|
||||
(activity_kind_t)command_id_, // activity kind
|
||||
(activity_kind_t)command_id, // activity kind
|
||||
op_id, // operation id
|
||||
record_id_, // activity correlation id
|
||||
begin_ts, // begin timestamp, ns
|
||||
|
||||
@@ -123,7 +123,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
|
||||
// status, we release all the resources associated with this instance.
|
||||
releaseResources();
|
||||
|
||||
activity_.ReportEventTimestamps(*this);
|
||||
activity_.ReportEventTimestamps(command());
|
||||
// Broadcast all the waiters.
|
||||
if (referenceCount() > 1) {
|
||||
signal();
|
||||
|
||||
@@ -213,8 +213,8 @@ class Command : public Event {
|
||||
//! Next GPU command in the queue list
|
||||
Command* next_;
|
||||
|
||||
const cl_command_type type_; //!< This command's OpenCL type.
|
||||
volatile int32_t exception_; //!< The first raised exception.
|
||||
cl_command_type type_; //!< This command's OpenCL type.
|
||||
volatile int32_t exception_; //!< The first raised exception.
|
||||
void* data_;
|
||||
|
||||
protected:
|
||||
@@ -298,6 +298,8 @@ class Command : public Event {
|
||||
|
||||
//! Get command wait bits
|
||||
uint32_t getWaitBits() const { return commandWaitBits_; }
|
||||
|
||||
//! Replaces this command's stored OpenCL type (type_) with the given one
void OverrrideCommandType(cl_command_type type) { type_ = type; }
|
||||
};
|
||||
|
||||
class UserEvent : public Command {
|
||||
|
||||
Ссылка в новой задаче
Block a user