diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index 716790c837..12a502c7ea 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -1630,6 +1630,17 @@ bool KernelBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory address parameters = captureArguments(kernels_[blitType]); result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); releaseArguments(parameters); + + if (amd::IS_HIP) { + // Update the command type for ROC profiler + if (srcMemory.isHostMemDirectAccess()) { + gpu().SetCopyCommandType(CL_COMMAND_WRITE_BUFFER_RECT); + } + if (dstMemory.isHostMemDirectAccess()) { + gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER_RECT); + } + } + synchronize(); return result; @@ -1857,6 +1868,7 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst return result; } +// ================================================================================================ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { @@ -1919,6 +1931,7 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, return result; } +// ================================================================================================ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& sizeIn, bool entire) const { @@ -1975,12 +1988,10 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); // Program source origin uint64_t srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; - ; setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); // Program destinaiton origin uint64_t dstOffset = dstOrigin[0] / 
CopyBuffAlignment[i]; - ; setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); uint64_t copySize = size[0]; @@ -2001,7 +2012,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); releaseArguments(parameters); } else { - //printf("rocm!\n"); + if (amd::IS_HIP) { + // Update the command type for ROC profiler + if (srcMemory.isHostMemDirectAccess()) { + gpu().SetCopyCommandType(CL_COMMAND_WRITE_BUFFER); + } + if (dstMemory.isHostMemDirectAccess()) { + gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER); + } + } result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); } @@ -2010,6 +2029,7 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds return result; } +// ================================================================================================ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index b8885e1dca..1e026c587e 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -694,7 +694,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, schedulerQueue_(nullptr), schedulerSignal_({0}), cuMask_(cuMask), - priority_(priority) + priority_(priority), + copy_command_type_(0) { index_ = device.numOfVgpus_++; gpu_device_ = device.getBackendDevice(); @@ -1360,6 +1361,7 @@ bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memo return true; } +// ================================================================================================ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock 
lock(execution()); @@ -1374,9 +1376,15 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) { cmd.setStatus(CL_INVALID_OPERATION); } + // Runtime may change the command type to report a more accurate info in ROC profiler + if (copy_command_type_ != 0) { + cmd.OverrrideCommandType(copy_command_type_); + copy_command_type_ = 0; + } profilingEnd(cmd); } +// ================================================================================================ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 0941dd3db0..b1c79fd3f1 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -295,6 +295,7 @@ class VirtualGPU : public device::VirtualDevice { void hasPendingDispatch() { hasPendingDispatch_ = true; } void addSystemScope() { addSystemScope_ = true; } + void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; } // } roc OpenCL integration private: @@ -403,6 +404,10 @@ class VirtualGPU : public device::VirtualDevice { //!< bit-vector representing the CU mask. 
Each active bit represents using one CU const std::vector cuMask_; amd::CommandQueue::Priority priority_; //!< The priority for the hsa queue + + cl_command_type copy_command_type_; //!< Type of the copy command, used for ROC profiler + //!< OCL doesn't distinguish different copy types, + //!< but ROC profiler expects D2H or H2D detection }; template diff --git a/rocclr/platform/activity.hpp b/rocclr/platform/activity.hpp index 65575cb403..5c28fb3968 100644 --- a/rocclr/platform/activity.hpp +++ b/rocclr/platform/activity.hpp @@ -115,7 +115,7 @@ class ActivityProf { if (IsEnabled()) { uint64_t start = obj.profilingInfo().start_; uint64_t end = obj.profilingInfo().end_; - callback(start, end, bytes); + callback(obj.type(), start, end, bytes); } } @@ -123,11 +123,12 @@ class ActivityProf { private: // Activity callback routine - void callback(const uint64_t begin_ts, const uint64_t end_ts, const size_t bytes) { - activity_op_t op_id = (command_id_ == CL_COMMAND_NDRANGE_KERNEL) ? OP_ID_DISPATCH : OP_ID_COPY; + void callback(const command_id_t command_id, + const uint64_t begin_ts, const uint64_t end_ts, const size_t bytes) { + activity_op_t op_id = (command_id == CL_COMMAND_NDRANGE_KERNEL) ? OP_ID_DISPATCH : OP_ID_COPY; activity_record_t record { ACTIVITY_DOMAIN_ID, // domain id - (activity_kind_t)command_id_, // activity kind + (activity_kind_t)command_id, // activity kind op_id, // operation id record_id_, // activity correlation id begin_ts, // begin timestamp, ns diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index 3930b9694a..e0e5ecd864 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -123,7 +123,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) { // status, we release all the resources associated with this instance. releaseResources(); - activity_.ReportEventTimestamps(*this); + activity_.ReportEventTimestamps(command()); // Broadcast all the waiters. 
if (referenceCount() > 1) { signal(); diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index aa692eceef..8b5d1d8bdb 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -213,8 +213,8 @@ class Command : public Event { //! Next GPU command in the queue list Command* next_; - const cl_command_type type_; //!< This command's OpenCL type. - volatile int32_t exception_; //!< The first raised exception. + cl_command_type type_; //!< This command's OpenCL type. + volatile int32_t exception_; //!< The first raised exception. void* data_; protected: @@ -298,6 +298,8 @@ class Command : public Event { //! Get command wait bits uint32_t getWaitBits() const { return commandWaitBits_; } + + void OverrrideCommandType(cl_command_type type) { type_ = type; } }; class UserEvent : public Command {