P4 to Git Change 2024454 by axie@axie-hip-rocm on 2019/11/04 14:38:31

SWDEV-198863 - Options for hip-clang-vdi path to provide the chicken bits, or functional equivalents to HCC_DB (phase 1) 1. The log macros are turned off for release builds, so the log functions have zero impact on release builds. 2. The log macros have level, mask, and condition control, so we have more control to avoid log flooding. I also adjusted some existing logs to use the new log functions: 1. To exercise and test the new log functions. 2. To improve performance slightly. 3. The change is mainly for HIP-ROCM; we can move more in the next phases for PAL or ORCA. 4. I made these log features unavailable for release builds. We can revert to the old log functions for release builds on a case-by-case basis. Tests: 1. http://ocltc.amd.com:8111/viewModification.html?modId=128289&personal=true&tab=vcsModificationBuilds http://ocltc.amd.com:8111/viewModification.html?modId=128358&personal=true&tab=vcsModificationBuilds 2. release build, run hip program, there is no log 3. fastdebug build, run hip program, export LOG_LEVEL=3 export GPU_LOG_MASK=4294967295 There were a lot of logs. 4. fastdebug build, run hip program, export LOG_LEVEL=2 export GPU_LOG_MASK=4294967295 There were no logs. 5. fastdebug build, run hip program, export LOG_LEVEL=3 export GPU_LOG_MASK=4294967294 There were far fewer logs. 6. fastdebug build, run hip program, export LOG_LEVEL=3 export GPU_LOG_MASK=47102 There were even fewer logs. The logs were as expected according to the mask. 7. Tested steps 2 to 6 similarly on Windows and Linux ReviewBoard: http://ocltc.amd.com/reviews/r/18215 Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/hip_internal.hpp#46 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_memory.cpp#82 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hip_stream.cpp#26 edit ... //depot/stg/opencl/drivers/opencl/api/hip/hiprtc_internal.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_svm.cpp#29 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/comgrctx.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#29 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#68 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#137 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#91 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#100 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/commandqueue.cpp#32 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/runtime.cpp#40 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/debug.hpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#323 edit [ROCm/clr commit: 3f6e18bf6b]
2019-11-04 14:44:59 -05:00
@@ -13,7 +13,7 @@ bool Comgr::is_ready_ = false;

 bool Comgr::LoadLib() {
 #if defined(COMGR_DYN_DLL)
-  LogInfo("Loading COMGR library.");
+  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Loading COMGR library.");
  static const char* ComgrLibName =
    LP64_SWITCH(WINDOWS_SWITCH("amd_comgr32.dll", "libamd_comgr32.so"),
                WINDOWS_SWITCH("amd_comgr.dll", "libamd_comgr.so"));
@@ -1731,7 +1731,7 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
    } while (end != std::string::npos);

    if (tokens.size() < 2) {
-      LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
+      ClPrint(amd::LOG_WARNING, amd::LOG_KERN, "Invalid PrintInfo string: \"%s\"", str.c_str());
      continue;
    }

@@ -1747,7 +1747,7 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {

    // ensure that we have the correct number of tokens
    if (tokens.size() < end + 1 /*last token is the fmtString*/) {
-      LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
+      ClPrint(amd::LOG_WARNING, amd::LOG_KERN, "Invalid PrintInfo string: \"%s\"", str.c_str());
      continue;
    }

@@ -1398,7 +1398,7 @@ static void dumpCodeObject(const std::string& image) {
    char fname[30];
    static std::atomic<int> index;
    sprintf(fname, "_code_object%04d.o", index++);
-    LogPrintfInfo("Code object saved in %s\n", fname);
+    ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Code object saved in %s\n", fname);
    std::ofstream ofs;
    ofs.open(fname, std::ios::binary);
    ofs << image;
@@ -2905,7 +2905,7 @@ bool Program::createKernelMetadataMap() {

  status = amd::Comgr::metadata_lookup(metadata_, "Kernels", &kernelsMD);
  if (status == AMD_COMGR_STATUS_SUCCESS) {
-    LogInfo("Using Code Object V2.");
+    ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V2.");
    hasKernelMD = true;
    codeObjectVer_ = 2;
  }
@@ -2913,7 +2913,7 @@ bool Program::createKernelMetadataMap() {
    status = amd::Comgr::metadata_lookup(metadata_, "amdhsa.kernels", &kernelsMD);

    if (status == AMD_COMGR_STATUS_SUCCESS) {
-      LogInfo("Using Code Object V3.");
+      ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V3.");
      hasKernelMD = true;
      codeObjectVer_ = 3;
    }
@@ -396,7 +396,7 @@ void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) {
 }

 bool Device::init() {
-  LogInfo("Initializing HSA stack.");
+  ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack.");

  // Initialize the compiler
  if (!initCompiler(offlineDevice_)) {
@@ -469,7 +469,7 @@ bool Device::init() {
    }
    // If the AmdDeviceInfo for the HsaDevice Id could not be found return false
    if (id == HSA_INVALID_DEVICE_ID) {
-      LogPrintfWarning("Could not find a DeviceInfo entry for %d", deviceId);
+      ClPrint(amd::LOG_WARNING, amd::LOG_INIT, "Could not find a DeviceInfo entry for %d", deviceId);
      continue;
    }
    roc_device->deviceInfo_ = DeviceInfo[id];
@@ -1835,8 +1835,8 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI

 hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
  assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
-  LogPrintfInfo("number of allocated hardware queues: %d, maximum: %d",
-                queuePool_.size(), GPU_MAX_HW_QUEUES);
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
+          queuePool_.size(), GPU_MAX_HW_QUEUES);

  // If we have reached the max number of queues, reuse an existing queue,
  // choosing the one with the least number of users.
@@ -1846,7 +1846,7 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
                                   [] (PoolRef A, PoolRef B) {
                                     return A.second.refCount < B.second.refCount;
                                   });
-    LogPrintfInfo("selected queue with least refCount: %p (%d)",
+    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "selected queue with least refCount: %p (%d)",
                  lowest->first, lowest->second.refCount);
    lowest->second.refCount++;
    return lowest->first;
@@ -1870,7 +1870,7 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
      return nullptr;
    }
  }
-  LogPrintfInfo("created hardware queue %p with size %d",
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d",
                queue, queue_size);
  hsa_amd_profiling_set_profiler_enabled(queue, 1);
  auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
@@ -1890,7 +1890,7 @@ void Device::releaseQueue(hsa_queue_t* queue) {
  if (qInfo.refCount != 0) {
      return;
  }
-  LogPrintfInfo("deleting hardware queue %p with refCount 0", queue);
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);

  hsa_queue_destroy(queue);
  queuePool_.erase(qIter);
@@ -300,7 +300,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
            gpuMem->syncCacheFromHost(*this);
          }
          const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
-          LogPrintfInfo("!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
+          ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
            desc.typeName_.c_str(), desc.name_.c_str(),
            globalAddress, gpuMem->getDeviceMemory(),
            reinterpret_cast<address>(gpuMem->getDeviceMemory()) + mem->getSize(),
@@ -2034,7 +2034,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
      return false;
    }

-    LogPrintfInfo("[%zx]!\tShaderName : %s\n", std::this_thread::get_id(), gpuKernel.name().c_str());
+    ClPrint(amd::LOG_INFO, amd::LOG_KERN, "[%zx]!\tShaderName : %s\n", std::this_thread::get_id(), gpuKernel.name().c_str());

    // Check if runtime has to setup hidden arguments
    for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
@@ -96,6 +96,8 @@ bool Event::setStatus(cl_int status, uint64_t timeStamp) {
  }

  if (status <= CL_COMPLETE) {
+    ClPrint(LOG_DEBUG, LOG_CMD, "command %p complete", &command());
+
    // Before we notify the waiters that this event reached the CL_COMPLETE
    // status, we release all the resources associated with this instance.
    releaseResources();
@@ -160,12 +162,16 @@ bool Event::awaitCompletion() {
      return false;
    }

+    ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d", this, status_);
+
    ScopedLock lock(lock_);

    // Wait until the status becomes CL_COMPLETE or negative.
    while (status_ > CL_COMPLETE) {
      lock_.wait();
    }
+
+    ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this);
  }

  return status_ == CL_COMPLETE;
@@ -180,6 +186,7 @@ bool Event::notifyCmdQueue() {
      notified_.clear();
      return false;
    }
+    ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
    command->enqueue();
    command->release();
  }
@@ -220,6 +227,7 @@ void Command::enqueue() {
  if (IS_HIP) {
    queue_->setLastQueuedCommand(this);
  }
+  ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this);
  queue_->append(*this);
  queue_->flush();
  if ((queue_->device().settings().waitCommand_ && (type_ != 0)) ||
@@ -75,10 +75,11 @@ void HostQueue::finish() {
  if (command == NULL) {
    return;
  }
-
+  ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
  command->enqueue();
  command->awaitCompletion();
  command->release();
+  ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished");
 }

 void HostQueue::loop(device::VirtualDevice* virtualDevice) {
@@ -132,6 +133,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
      continue;
    }

+    ClPrint(LOG_DEBUG, LOG_CMD, "command is submitted: %p", command);
    command->setStatus(CL_SUBMITTED);
    // Submit to the device queue.
    command->submit(*virtualDevice);
@@ -58,10 +58,12 @@ bool Runtime::init() {
  if (!Flag::init() || !option::init() || !Device::init()
      // Agent initializes last
      || !Agent::init()) {
+    ClPrint(LOG_ERROR, LOG_INIT, "Runtime initilization failed");
    return false;
  }

  initialized_ = true;
+  ClTrace(LOG_DEBUG, LOG_INIT);
  return true;
 }

@@ -69,6 +71,7 @@ void Runtime::tearDown() {
  if (!initialized_) {
    return;
  }
+  ClTrace(LOG_DEBUG, LOG_INIT);

  Agent::tearDown();
  Device::tearDown();
@@ -14,6 +14,26 @@ namespace amd { /*@{*/

 enum LogLevel { LOG_NONE = 0, LOG_ERROR = 1, LOG_WARNING = 2, LOG_INFO = 3, LOG_DEBUG = 4 };

+enum LogMask {  // Log category bit flags; ClPrint/ClCondPrint test them against GPU_LOG_MASK
+  LOG_API       = 0x00000001, //!< API call
+  LOG_CMD       = 0x00000002, //!< Kernel and Copy Commands and Barriers
+  LOG_WAIT      = 0x00000004, //!< Synchronization and waiting for commands to finish
+  LOG_AQL       = 0x00000008, //!< Decode and display AQL packets
+  LOG_QUEUE     = 0x00000010, //!< Queue commands and queue contents
+  LOG_SIG       = 0x00000020, //!< Signal creation, allocation, pool
+  LOG_LOCK      = 0x00000040, //!< Locks and thread-safety code.
+  LOG_KERN      = 0x00000080, //!< kernel creations and arguments, etc.
+  LOG_COPY      = 0x00000100, //!< Copy debug
+  LOG_COPY2     = 0x00000200, //!< Detailed copy debug
+  LOG_RESOURCE  = 0x00000400, //!< Resource allocation, performance-impacting events.
+  LOG_INIT      = 0x00000800, //!< Initialization and shutdown
+  LOG_MISC      = 0x00001000, //!< misc debug, not yet classified
+  LOG_AQL2      = 0x00002000, //!< Show raw bytes of AQL packet
+  LOG_CODE      = 0x00004000, //!< Show code creation debug
+  LOG_CMD2      = 0x00008000, //!< More detailed command info, including barrier commands
+  LOG_ALWAYS    = 0xFFFFFFFF, //!< Always log, even when GPU_LOG_MASK is zero (level check still applies)
+};
+
 //! \cond ignore
 extern "C" void breakpoint();
 //! \endcond
@@ -137,4 +157,35 @@ inline void warning(const char* msg) { amd::report_warning(msg); }

 #define DebugInfoGuarantee(cond) LogGuarantee(cond, amd::LOG_INFO, "Warning")

+#ifndef NDEBUG
+#define CL_LOG
+#endif
+// Define CL_LOG yourself to enable the following log macros even in a release (NDEBUG) build.
+#ifdef CL_LOG
+// Logs when LOG_LEVEL >= level and GPU_LOG_MASK shares a bit with mask (or mask is LOG_ALWAYS).
+#define ClPrint(level, mask, format, ...)                                                          \
+  do {                                                                                             \
+    if (LOG_LEVEL >= (level) &&                                                                    \
+        ((GPU_LOG_MASK & (mask)) || (mask) == amd::LOG_ALWAYS)) {                                  \
+      amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__);                           \
+    }                                                                                              \
+  } while (false)
+
+// As ClPrint, but also requires (condition), which is evaluated only after the level check passes.
+#define ClCondPrint(level, mask, condition, format, ...)                                           \
+  do {                                                                                             \
+    if (LOG_LEVEL >= (level) && (condition) &&                                                     \
+        ((GPU_LOG_MASK & (mask)) || (mask) == amd::LOG_ALWAYS)) {                                  \
+      amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__);                           \
+    }                                                                                              \
+  } while (false)
+
+#define ClTrace(level, mask) ClPrint(level, mask, "%s", __func__)  // logs the enclosing function name
+
+#else /* !CL_LOG: the macros expand to no-ops and their arguments are not evaluated */
+#define ClPrint(level, mask, format, ...) (void)(0)
+#define ClCondPrint(level, mask, condition, format, ...) (void)(0)
+#define ClTrace(level, mask) (void)(0)
+#endif /*CL_LOG*/
+
 #endif /*DEBUG_HPP_*/
@@ -10,6 +10,8 @@
                                                                              \
 release(int, LOG_LEVEL, 0,                                                    \
        "The default log level")                                              \
+release(uint, GPU_LOG_MASK, 0,                                                \
+        "The mask to enable specific kinds of logs")                          \
 debug(uint, DEBUG_GPU_FLAGS, 0,                                               \
        "The debug options for GPU device")                                   \
 release(uint, GPU_MAX_COMMAND_QUEUES, 300,                                    \