diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index 522f95cd03..fb98c982a8 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -85,6 +85,9 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) {
   return timeStamp;
 }
 
+// Queue time of the first profiled command; base for the "Wall" value in the completion log
+uint64_t epoch = 0;  // NOTE(review): plain non-atomic global; confirm setStatus() writers cannot race
+
 bool Event::setStatus(int32_t status, uint64_t timeStamp) {
   assert(status <= CL_QUEUED && "invalid status");
 
@@ -96,6 +99,9 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
 
   if (profilingInfo().enabled_) {
     timeStamp = recordProfilingInfo(status, timeStamp);
+    if (epoch == 0) {
+      epoch = profilingInfo().queued_;
+    }
   }
 
   if (!make_atomic(status_).compareAndSet(currentStatus, status)) {
@@ -112,8 +118,6 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
   }
 
   if (status <= CL_COMPLETE) {
-    ClPrint(LOG_DEBUG, LOG_CMD, "command %p complete", &command());
-
     // Before we notify the waiters that this event reached the CL_COMPLETE
     // status, we release all the resources associated with this instance.
     releaseResources();
@@ -123,6 +127,13 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
     if (referenceCount() > 1) {
       signal();
     }
+
+    // Deltas are divided by 1000 and printed as us; the casts make each argument match %lld.
+    ClPrint(LOG_DEBUG, LOG_CMD, "command %p complete (Wall: %lld, CPU: %lld, GPU: %lld us)",
+      &command(), static_cast<long long>((profilingInfo().end_ - epoch) / 1000),
+      static_cast<long long>((profilingInfo().submitted_ - profilingInfo().queued_) / 1000),
+      static_cast<long long>((profilingInfo().end_ - profilingInfo().start_) / 1000));
+
     release();
   }
 
diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp
index 6a43cc7cd4..63252fb455 100644
--- a/rocclr/platform/commandqueue.cpp
+++ b/rocclr/platform/commandqueue.cpp
@@ -86,15 +86,24 @@ bool HostQueue::terminate() {
 }
 
 void HostQueue::finish() {
-  // Send a finish to make sure we finished all commands
-  Command* command = new Marker(*this, false);
-  if (command == NULL) {
-    return;
+  Command* command = nullptr;  // Command whose completion this call blocks on
+  if (IS_HIP) {  // HIP: wait on the last queued command instead of enqueueing a marker
+    command = getLastQueuedCommand(false);
+    if (nullptr != command) {
+      command->awaitCompletion();  // NOTE(review): no release() on this path; confirm lifetime
+    }
+  }
+  if (nullptr == command) {  // Not HIP, or no last queued command: fall back to a marker
+    // Enqueue a marker and wait for it, to make sure all commands have finished
+    command = new Marker(*this, false);
+    if (nullptr == command) {
+      return;
+    }
+    ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
+    command->enqueue();
+    command->awaitCompletion();
+    command->release();
   }
-  ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
-  command->enqueue();
-  command->awaitCompletion();
-  command->release();
   ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished");
 }
 
diff --git a/rocclr/utils/debug.hpp b/rocclr/utils/debug.hpp
index 6672c2e6df..eacb0bbc68 100644
--- a/rocclr/utils/debug.hpp
+++ b/rocclr/utils/debug.hpp
@@ -47,6 +47,7 @@ enum LogMask {
   LOG_AQL2      = 0x00002000, //!< Show raw bytes of AQL packet
   LOG_CODE      = 0x00004000, //!< Show code creation debug
   LOG_CMD2      = 0x00008000, //!< More detailed command info, including barrier commands
+  LOG_LOCATION  = 0x00010000, //!< Log message location
   LOG_ALWAYS    = 0xFFFFFFFF, //!< Log always even mask flag is zero
 };
 
@@ -178,7 +179,11 @@ inline void warning(const char* msg) { amd::report_warning(msg); }
   do {                                                                                             \
     if (LOG_LEVEL >= level) {                                                                      \
       if (GPU_LOG_MASK & mask || mask == amd::LOG_ALWAYS) {                                        \
-        amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__);                         \
+        if (GPU_LOG_MASK & amd::LOG_LOCATION) {                                                         \
+          amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__);                       \
+        } else {                                                                                   \
+          amd::log_printf(level, "", 0, format, ##__VA_ARGS__);                                   \
+        }                                                                                          \
       }                                                                                            \
     }                                                                                              \
   } while (false)