SWDEV-290384 - Add Linker API support in hiprtc

Change-Id: I4621a033a22e4da0201c3804e2b357470a681ab0
2022-03-14 12:36:16 -04:00
@@ -27,12 +27,12 @@

 #include <unordered_map>

-constexpr unsigned __hipFatMAGIC2 = 0x48495046; // "HIPF"
+constexpr unsigned __hipFatMAGIC2 = 0x48495046;  // "HIPF"

 thread_local std::stack<ihipExec_t> execStack_;
-PlatformState* PlatformState::platform_; // Initiaized as nullptr by default
+PlatformState* PlatformState::platform_;  // Initiaized as nullptr by default

-//forward declaration of methods required for __hipRegisrterManagedVar
+// forward declaration of methods required for __hipRegisrterManagedVar
 hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0);
 hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
                      amd::HostQueue& queue, bool isAsync = false);
@@ -40,26 +40,23 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
 struct __CudaFatBinaryWrapper {
  unsigned int magic;
  unsigned int version;
-  void*        binary;
-  void*        dummy1;
+  void* binary;
+  void* dummy1;
 };

-hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
-    hipModule_t hmod, const char* name);
+hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
+                              const char* name);

 hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
                                  hipDeviceptr_t* dptr, size_t* bytes);

-extern hipError_t ihipModuleLaunchKernel(hipFunction_t f,
-                                 uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
-                                 uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
-                                 uint32_t sharedMemBytes, hipStream_t hStream,
-                                 void **kernelParams, void **extra,
-                                 hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
-                                 uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
-                                 uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
-static bool isCompatibleCodeObject(const std::string& codeobj_target_id,
-                                   const char* device_name) {
+extern hipError_t ihipModuleLaunchKernel(
+    hipFunction_t f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t blockDimX,
+    uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream,
+    void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent,
+    uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
+    uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
+static bool isCompatibleCodeObject(const std::string& codeobj_target_id, const char* device_name) {
  // Workaround for device name mismatch.
  // Device name may contain feature strings delimited by '+', e.g.
  // gfx900+xnack. Currently HIP-Clang does not include feature strings
@@ -73,8 +70,7 @@ static bool isCompatibleCodeObject(const std::string& codeobj_target_id,
  return codeobj_target_id == short_name;
 }

-extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data)
-{
+extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data) {
  const __CudaFatBinaryWrapper* fbwrapper = reinterpret_cast<const __CudaFatBinaryWrapper*>(data);
  if (fbwrapper->magic != __hipFatMAGIC2 || fbwrapper->version != 1) {
    LogPrintfError("Cannot Register fat binary. FatMagic: %u version: %u ", fbwrapper->magic,
@@ -84,21 +80,14 @@ extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data)
  return PlatformState::instance().addFatBinary(fbwrapper->binary);
 }

-extern "C" void __hipRegisterFunction(
-  hip::FatBinaryInfo** modules,
-  const void*  hostFunction,
-  char*        deviceFunction,
-  const char*  deviceName,
-  unsigned int threadLimit,
-  uint3*       tid,
-  uint3*       bid,
-  dim3*        blockDim,
-  dim3*        gridDim,
-  int*         wSize) {
-  static int enable_deferred_loading { []() {
-    char *var = getenv("HIP_ENABLE_DEFERRED_LOADING");
+extern "C" void __hipRegisterFunction(hip::FatBinaryInfo** modules, const void* hostFunction,
+                                      char* deviceFunction, const char* deviceName,
+                                      unsigned int threadLimit, uint3* tid, uint3* bid,
+                                      dim3* blockDim, dim3* gridDim, int* wSize) {
+  static int enable_deferred_loading{[]() {
+    char* var = getenv("HIP_ENABLE_DEFERRED_LOADING");
    return var ? atoi(var) : 1;
-  }() };
+  }()};
  hipError_t hip_error = hipSuccess;
  hip::Function* func = new hip::Function(std::string(deviceName), modules);
  hip_error = PlatformState::instance().registerStatFunction(hostFunction, func);
@@ -121,42 +110,45 @@ extern "C" void __hipRegisterFunction(
 // track of the value of the device side global variable between kernel
 // executions.
 extern "C" void __hipRegisterVar(
-  hip::FatBinaryInfo** modules,   // The device modules containing code object
-  void*       var,       // The shadow variable in host code
-  char*       hostVar,   // Variable name in host code
-  char*       deviceVar, // Variable name in device code
-  int         ext,       // Whether this variable is external
-  size_t      size,      // Size of the variable
-  int         constant,  // Whether this variable is constant
-  int         global)    // Unknown, always 0
+    hip::FatBinaryInfo** modules,  // The device modules containing code object
+    void* var,                     // The shadow variable in host code
+    char* hostVar,                 // Variable name in host code
+    char* deviceVar,               // Variable name in device code
+    int ext,                       // Whether this variable is external
+    size_t size,                   // Size of the variable
+    int constant,                  // Whether this variable is constant
+    int global)                    // Unknown, always 0
 {
-  hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Variable, size, 0, 0, modules);
+  hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Variable,
+                                   size, 0, 0, modules);
  hipError_t err = PlatformState::instance().registerStatGlobalVar(var, var_ptr);
  guarantee((err == hipSuccess), "Cannot register Static Global Var");
 }

-extern "C" void __hipRegisterSurface(hip::FatBinaryInfo** modules,      // The device modules containing code object
-                                     void* var,        // The shadow variable in host code
-                                     char* hostVar,    // Variable name in host code
-                                     char* deviceVar,  // Variable name in device code
-                                     int type, int ext) {
-  hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Surface, sizeof(surfaceReference), 0, 0, modules);
+extern "C" void __hipRegisterSurface(
+    hip::FatBinaryInfo** modules,  // The device modules containing code object
+    void* var,                     // The shadow variable in host code
+    char* hostVar,                 // Variable name in host code
+    char* deviceVar,               // Variable name in device code
+    int type, int ext) {
+  hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Surface,
+                                   sizeof(surfaceReference), 0, 0, modules);
  hipError_t err = PlatformState::instance().registerStatGlobalVar(var, var_ptr);
  guarantee((err == hipSuccess), "Cannot register Static Glbal Var");
 }

-extern "C" void __hipRegisterManagedVar(void *hipModule,   // Pointer to hip module returned from __hipRegisterFatbinary
-                                        void **pointer,    // Pointer to a chunk of managed memory with size \p size and alignment \p align
-                                                           // HIP runtime allocates such managed memory and assign it to \p pointer
-                                        void *init_value,  // Initial value to be copied into \p pointer
-                                        const char *name,  // Name of the variable in code object
-                                        size_t size,
-                                        unsigned align) {
+extern "C" void __hipRegisterManagedVar(
+    void* hipModule,  // Pointer to hip module returned from __hipRegisterFatbinary
+    void** pointer,   // Pointer to a chunk of managed memory with size \p size and alignment \p
+                     // align HIP runtime allocates such managed memory and assign it to \p pointer
+    void* init_value,  // Initial value to be copied into \p pointer
+    const char* name,  // Name of the variable in code object
+    size_t size, unsigned align) {
  HIP_INIT_VOID();
  hipError_t status = ihipMallocManaged(pointer, size, align);
-  if( status == hipSuccess) {
+  if (status == hipSuccess) {
    amd::HostQueue* queue = hip::getNullStream();
-    if(queue != nullptr) {
+    if (queue != nullptr) {
      status = ihipMemcpy(*pointer, init_value, size, hipMemcpyHostToDevice, *queue);
      guarantee((status == hipSuccess), "Error during memcpy to managed memory!");
    } else {
@@ -171,28 +163,25 @@ extern "C" void __hipRegisterManagedVar(void *hipModule,   // Pointer to hip mod
  guarantee((status == hipSuccess), "Cannot register Static Managed Var");
 }

-extern "C" void __hipRegisterTexture(hip::FatBinaryInfo** modules,      // The device modules containing code object
-                                     void* var,        // The shadow variable in host code
-                                     char* hostVar,    // Variable name in host code
-                                     char* deviceVar,  // Variable name in device code
-                                     int type, int norm, int ext) {
-  hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Texture, sizeof(textureReference), 0, 0, modules);
+extern "C" void __hipRegisterTexture(
+    hip::FatBinaryInfo** modules,  // The device modules containing code object
+    void* var,                     // The shadow variable in host code
+    char* hostVar,                 // Variable name in host code
+    char* deviceVar,               // Variable name in device code
+    int type, int norm, int ext) {
+  hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Texture,
+                                   sizeof(textureReference), 0, 0, modules);
  hipError_t err = PlatformState::instance().registerStatGlobalVar(var, var_ptr);
  guarantee((err == hipSuccess), "Cannot register Static Global Var");
 }

-extern "C" void __hipUnregisterFatBinary(hip::FatBinaryInfo** modules)
-{
+extern "C" void __hipUnregisterFatBinary(hip::FatBinaryInfo** modules) {
  hipError_t err = PlatformState::instance().removeFatBinary(modules);
  guarantee((err == hipSuccess), "Cannot Unregister Fat Binary");
 }

-extern "C" hipError_t hipConfigureCall(
-  dim3 gridDim,
-  dim3 blockDim,
-  size_t sharedMem,
-  hipStream_t stream)
-{
+extern "C" hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem,
+                                       hipStream_t stream) {
  HIP_INIT_API(hipConfigureCall, gridDim, blockDim, sharedMem, stream);

  PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream);
@@ -200,12 +189,8 @@ extern "C" hipError_t hipConfigureCall(
  HIP_RETURN(hipSuccess);
 }

-extern "C" hipError_t __hipPushCallConfiguration(
-  dim3 gridDim,
-  dim3 blockDim,
-  size_t sharedMem,
-  hipStream_t stream)
-{
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridDim, dim3 blockDim, size_t sharedMem,
+                                                 hipStream_t stream) {
  HIP_INIT_API(__hipPushCallConfiguration, gridDim, blockDim, sharedMem, stream);

  PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream);
@@ -213,10 +198,8 @@ extern "C" hipError_t __hipPushCallConfiguration(
  HIP_RETURN(hipSuccess);
 }

-extern "C" hipError_t __hipPopCallConfiguration(dim3 *gridDim,
-                                                dim3 *blockDim,
-                                                size_t *sharedMem,
-                                                hipStream_t *stream) {
+extern "C" hipError_t __hipPopCallConfiguration(dim3* gridDim, dim3* blockDim, size_t* sharedMem,
+                                                hipStream_t* stream) {
  HIP_INIT_API(__hipPopCallConfiguration, gridDim, blockDim, sharedMem, stream);

  ihipExec_t exec;
@@ -229,11 +212,7 @@ extern "C" hipError_t __hipPopCallConfiguration(dim3 *gridDim,
  HIP_RETURN(hipSuccess);
 }

-extern "C" hipError_t hipSetupArgument(
-  const void *arg,
-  size_t size,
-  size_t offset)
-{
+extern "C" hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset) {
  HIP_INIT_API(hipSetupArgument, arg, size, offset);

  PlatformState::instance().setupArgument(arg, size, offset);
@@ -241,15 +220,14 @@ extern "C" hipError_t hipSetupArgument(
  HIP_RETURN(hipSuccess);
 }

-extern "C" hipError_t hipLaunchByPtr(const void *hostFunction)
-{
+extern "C" hipError_t hipLaunchByPtr(const void* hostFunction) {
  HIP_INIT_API(hipLaunchByPtr, hostFunction);

  ihipExec_t exec;
  PlatformState::instance().popExec(exec);

  hip::Stream* stream = reinterpret_cast<hip::Stream*>(exec.hStream_);
-  int deviceId = (stream != nullptr)? stream->DeviceId() : ihipGetDevice();
+  int deviceId = (stream != nullptr) ? stream->DeviceId() : ihipGetDevice();
  if (deviceId == -1) {
    LogPrintfError("Wrong DeviceId: %d \n", deviceId);
    HIP_RETURN(hipErrorNoDevice);
@@ -262,16 +240,12 @@ extern "C" hipError_t hipLaunchByPtr(const void *hostFunction)
  }

  size_t size = exec.arguments_.size();
-  void *extra[] = {
-      HIP_LAUNCH_PARAM_BUFFER_POINTER, &exec.arguments_[0],
-      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
-      HIP_LAUNCH_PARAM_END
-    };
+  void* extra[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &exec.arguments_[0],
+                   HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};

-  HIP_RETURN(hipModuleLaunchKernel(func,
-    exec.gridDim_.x, exec.gridDim_.y, exec.gridDim_.z,
-    exec.blockDim_.x, exec.blockDim_.y, exec.blockDim_.z,
-    exec.sharedMem_, exec.hStream_, nullptr, extra));
+  HIP_RETURN(hipModuleLaunchKernel(func, exec.gridDim_.x, exec.gridDim_.y, exec.gridDim_.z,
+                                   exec.blockDim_.x, exec.blockDim_.y, exec.blockDim_.z,
+                                   exec.sharedMem_, exec.hStream_, nullptr, extra));
 }

 hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol) {
@@ -283,7 +257,8 @@ hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol) {
  }
  size_t sym_size = 0;

-  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), devPtr, &sym_size));
+  HIP_RETURN_ONFAIL(
+      PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), devPtr, &sym_size));

  HIP_RETURN(hipSuccess, *devPtr);
 }
@@ -295,14 +270,14 @@ hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  hipDeviceptr_t device_ptr = nullptr;
-  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), &device_ptr, sizePtr));
+  HIP_RETURN_ONFAIL(
+      PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), &device_ptr, sizePtr));

  HIP_RETURN(hipSuccess, *sizePtr);
 }

 hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
-                                  hipDeviceptr_t* dptr, size_t* bytes)
-{
+                                  hipDeviceptr_t* dptr, size_t* bytes) {
  HIP_INIT();

  /* Get Device Program pointer*/
@@ -325,10 +300,8 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor

 namespace hip_impl {
 hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    int* maxBlocksPerCU, int* numBlocksPerGrid, int* bestBlockSize,
-    const amd::Device& device, hipFunction_t func, int inputBlockSize,
-    size_t dynamicSMemSize, bool bCalcPotentialBlkSz)
-{
+    int* maxBlocksPerCU, int* numBlocksPerGrid, int* bestBlockSize, const amd::Device& device,
+    hipFunction_t func, int inputBlockSize, size_t dynamicSMemSize, bool bCalcPotentialBlkSz) {
  hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
  const amd::Kernel& kernel = *function->kernel();

@@ -340,14 +313,12 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
    *bestBlockSize = 0;
    // Make sure the requested block size is smaller than max supported
    if (inputBlockSize > int(device.info().maxWorkGroupSize_)) {
-        *maxBlocksPerCU = 0;
-        *numBlocksPerGrid = 0;
-        return hipSuccess;
+      *maxBlocksPerCU = 0;
+      *numBlocksPerGrid = 0;
+      return hipSuccess;
    }
-  }
-  else {
-    if (inputBlockSize > int(device.info().maxWorkGroupSize_) ||
-            inputBlockSize <= 0) {
+  } else {
+    if (inputBlockSize > int(device.info().maxWorkGroupSize_) || inputBlockSize <= 0) {
      // The user wrote the kernel to work with a workgroup size
      // bigger than this hardware can support. Or they do not care
      // about the size So just assume its maximum size is
@@ -367,18 +338,15 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
  size_t maxVGPRs;
  uint32_t VgprGranularity;
  if (device.isa().versionMajor() <= 9) {
-    if (device.isa().versionMajor() == 9 &&
-        device.isa().versionMinor() == 0 &&
+    if (device.isa().versionMajor() == 9 && device.isa().versionMinor() == 0 &&
        device.isa().versionStepping() == 10) {
      maxVGPRs = 512;
      VgprGranularity = 8;
-    }
-    else {
+    } else {
      maxVGPRs = 256;
      VgprGranularity = 4;
    }
-  }
-  else {
+  } else {
    maxVGPRs = 1024;
    VgprGranularity = 8;
  }
@@ -391,12 +359,10 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
    size_t maxSGPRs;
    if (device.isa().versionMajor() < 8) {
      maxSGPRs = 512;
-    }
-    else if (device.isa().versionMajor() < 10) {
+    } else if (device.isa().versionMajor() < 10) {
      maxSGPRs = 800;
-    }
-    else {
-      maxSGPRs = SIZE_MAX; // gfx10+ does not share SGPRs between waves
+    } else {
+      maxSGPRs = SIZE_MAX;  // gfx10+ does not share SGPRs between waves
    }
    const size_t SgprWaves = maxSGPRs / amd::alignUp(wrkGrpInfo->usedSGPRs_, 16);
    GprWaves = std::min(VgprWaves, SgprWaves);
@@ -425,7 +391,8 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
  // the maximum available block size for this kernel, which could have come from the
  // user. e.g., if the user indicates the maximum block size is 64 threads, but we
  // calculate that 128 threads can fit in each CU, we have to give up and return 64.
-  *bestBlockSize = std::min(alu_limited_threads, amd::alignUp(inputBlockSize, wrkGrpInfo->wavefrontSize_));
+  *bestBlockSize =
+      std::min(alu_limited_threads, amd::alignUp(inputBlockSize, wrkGrpInfo->wavefrontSize_));
  // If the best block size is smaller than the block size used to fit the maximum,
  // then we need to make the grid bigger for full occupancy.
  const int bestBlocksPerCU = alu_limited_threads / (*bestBlockSize);
@@ -434,13 +401,11 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(

  return hipSuccess;
 }
-}
+}  // namespace hip_impl

 extern "C" {
-hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
-                                             const void* f, size_t dynSharedMemPerBlk,
-                                             int blockSizeLimit)
-{
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, const void* f,
+                                             size_t dynSharedMemPerBlk, int blockSizeLimit) {
  HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, f, dynSharedMemPerBlk, blockSizeLimit);
  if ((gridSize == nullptr) || (blockSize == nullptr)) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -455,7 +420,8 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
  int num_blocks = 0;
  int best_block_size = 0;
  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSizeLimit, dynSharedMemPerBlk,true);
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSizeLimit,
+      dynSharedMemPerBlk, true);
  if (ret == hipSuccess) {
    *blockSize = best_block_size;
    *gridSize = max_blocks_per_grid;
@@ -463,10 +429,8 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
  HIP_RETURN(ret);
 }

-hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
-                                             hipFunction_t f, size_t dynSharedMemPerBlk,
-                                             int blockSizeLimit)
-{
+hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, hipFunction_t f,
+                                                   size_t dynSharedMemPerBlk, int blockSizeLimit) {
  HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSize, f, dynSharedMemPerBlk, blockSizeLimit);
  if ((gridSize == nullptr) || (blockSize == nullptr)) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -476,7 +440,8 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize
  int num_blocks = 0;
  int best_block_size = 0;
  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, dynSharedMemPerBlk,true);
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit,
+      dynSharedMemPerBlk, true);
  if (ret == hipSuccess) {
    *blockSize = best_block_size;
    *gridSize = max_blocks_per_grid;
@@ -485,10 +450,12 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize
 }

 hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
-                                             hipFunction_t f, size_t dynSharedMemPerBlk,
-                                             int blockSizeLimit, unsigned int flags)
-{
-  HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSizeWithFlags, f, dynSharedMemPerBlk, blockSizeLimit, flags);
+                                                            hipFunction_t f,
+                                                            size_t dynSharedMemPerBlk,
+                                                            int blockSizeLimit,
+                                                            unsigned int flags) {
+  HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSizeWithFlags, f, dynSharedMemPerBlk,
+               blockSizeLimit, flags);
  if ((gridSize == nullptr) || (blockSize == nullptr)) {
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -497,7 +464,8 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int*
  int num_blocks = 0;
  int best_block_size = 0;
  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, dynSharedMemPerBlk,true);
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit,
+      dynSharedMemPerBlk, true);
  if (ret == hipSuccess) {
    *blockSize = best_block_size;
    *gridSize = max_blocks_per_grid;
@@ -505,10 +473,11 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int*
  HIP_RETURN(ret);
 }

-hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
-                                             hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
-{
-  HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, f, blockSize, dynSharedMemPerBlk);
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hipFunction_t f,
+                                                              int blockSize,
+                                                              size_t dynSharedMemPerBlk) {
+  HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, f, blockSize,
+               dynSharedMemPerBlk);
  if (numBlocks == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -518,16 +487,16 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
  int max_blocks_per_grid = 0;
  int best_block_size = 0;
  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, false);
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk,
+      false);
  *numBlocks = num_blocks;
  HIP_RETURN(ret);
 }

-hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
-                                                              hipFunction_t f, int blockSize,
-                                                              size_t dynSharedMemPerBlk, unsigned int flags)
-{
-  HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, f, blockSize, dynSharedMemPerBlk, flags);
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
+  HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, f, blockSize,
+               dynSharedMemPerBlk, flags);
  if (numBlocks == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -537,14 +506,14 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numB
  int max_blocks_per_grid = 0;
  int best_block_size = 0;
  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, false);
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk,
+      false);
  *numBlocks = num_blocks;
  HIP_RETURN(ret);
 }

-hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
-                                                        const void* f, int blockSize, size_t dynamicSMemSize)
-{
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, const void* f,
+                                                        int blockSize, size_t dynamicSMemSize) {
  HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, f, blockSize, dynamicSMemSize);
  if (numBlocks == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -562,16 +531,18 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
  int max_blocks_per_grid = 0;
  int best_block_size = 0;
  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, false);
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize,
+      false);
  *numBlocks = num_blocks;
  HIP_RETURN(ret);
 }

-hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
-                                                                 const void* f,
-                                                                 int  blockSize, size_t dynamicSMemSize, unsigned int flags)
-{
-  HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, f, blockSize, dynamicSMemSize, flags);
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, const void* f,
+                                                                 int blockSize,
+                                                                 size_t dynamicSMemSize,
+                                                                 unsigned int flags) {
+  HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, f, blockSize, dynamicSMemSize,
+               flags);
  if (numBlocks == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -588,7 +559,8 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
  int max_blocks_per_grid = 0;
  int best_block_size = 0;
  hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, false);
+      &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize,
+      false);
  *numBlocks = num_blocks;
  HIP_RETURN(ret);
 }
@@ -599,68 +571,48 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,

 namespace hip_impl {

-void hipLaunchKernelGGLImpl(
-  uintptr_t function_address,
-  const dim3& numBlocks,
-  const dim3& dimBlocks,
-  uint32_t sharedMemBytes,
-  hipStream_t stream,
-  void** kernarg)
-{
+void hipLaunchKernelGGLImpl(uintptr_t function_address, const dim3& numBlocks,
+                            const dim3& dimBlocks, uint32_t sharedMemBytes, hipStream_t stream,
+                            void** kernarg) {
  HIP_INIT_VOID();

  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
-  int deviceId = (s != nullptr)? s->DeviceId() : ihipGetDevice();
+  int deviceId = (s != nullptr) ? s->DeviceId() : ihipGetDevice();
  if (deviceId == -1) {
    LogPrintfError("Wrong Device Id: %d \n", deviceId);
  }

  hipFunction_t func = nullptr;
-  hipError_t hip_error =
-             PlatformState::instance().getStatFunc(&func,
-                                                  reinterpret_cast<void*>(function_address),
-                                                  deviceId);
+  hipError_t hip_error = PlatformState::instance().getStatFunc(
+      &func, reinterpret_cast<void*>(function_address), deviceId);
  if ((hip_error != hipSuccess) || (func == nullptr)) {
    LogPrintfError("Cannot find the static function: 0x%x", function_address);
  }

-  hip_error = hipModuleLaunchKernel(func,
-                                    numBlocks.x, numBlocks.y, numBlocks.z,
-                                    dimBlocks.x, dimBlocks.y, dimBlocks.z,
-                                    sharedMemBytes, stream, nullptr, kernarg);
+  hip_error =
+      hipModuleLaunchKernel(func, numBlocks.x, numBlocks.y, numBlocks.z, dimBlocks.x, dimBlocks.y,
+                            dimBlocks.z, sharedMemBytes, stream, nullptr, kernarg);
  assert(hip_error == hipSuccess);
 }

-void hipLaunchCooperativeKernelGGLImpl(
-  uintptr_t function_address,
-  const dim3& numBlocks,
-  const dim3& dimBlocks,
-  uint32_t sharedMemBytes,
-  hipStream_t stream,
-  void** kernarg)
-{
+void hipLaunchCooperativeKernelGGLImpl(uintptr_t function_address, const dim3& numBlocks,
+                                       const dim3& dimBlocks, uint32_t sharedMemBytes,
+                                       hipStream_t stream, void** kernarg) {
  HIP_INIT_VOID();

-  hipError_t err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(function_address),
-    numBlocks, dimBlocks, kernarg, sharedMemBytes, stream);
+  hipError_t err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(function_address), numBlocks,
+                                              dimBlocks, kernarg, sharedMemBytes, stream);
  assert(err == hipSuccess);
 }

-}
+}  // namespace hip_impl

-#endif // defined(ATI_OS_LINUX)
+#endif  // defined(ATI_OS_LINUX)

-hipError_t ihipLaunchKernel(const void* hostFunction,
-                                         dim3 gridDim,
-                                         dim3 blockDim,
-                                         void** args,
-                                         size_t sharedMemBytes,
-                                         hipStream_t stream,
-                                         hipEvent_t startEvent,
-                                         hipEvent_t stopEvent,
-                                         int flags)
-{
-  hipFunction_t func =  nullptr;
+hipError_t ihipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim, void** args,
+                            size_t sharedMemBytes, hipStream_t stream, hipEvent_t startEvent,
+                            hipEvent_t stopEvent, int flags) {
+  hipFunction_t func = nullptr;
  int deviceId = hip::Stream::DeviceId(stream);
  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, hostFunction, deviceId);
  if ((hip_error != hipSuccess) || (func == nullptr)) {
@@ -674,19 +626,31 @@ hipError_t ihipLaunchKernel(const void* hostFunction,
      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
    HIP_RETURN(hipErrorInvalidConfiguration);
  }
-  HIP_RETURN(ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
-                                    static_cast<uint32_t>(globalWorkSizeY),
-                                    static_cast<uint32_t>(globalWorkSizeZ),
-                                    blockDim.x, blockDim.y, blockDim.z,
-                                    sharedMemBytes, stream, args, nullptr, startEvent, stopEvent,
-                                    flags));
+  HIP_RETURN(ihipModuleLaunchKernel(
+      func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
+      static_cast<uint32_t>(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes,
+      stream, args, nullptr, startEvent, stopEvent, flags));
 }

 // conversion routines between float and half precision

-static inline std::uint32_t f32_as_u32(float f) { union { float f; std::uint32_t u; } v; v.f = f; return v.u; }
+static inline std::uint32_t f32_as_u32(float f) {
+  union {
+    float f;
+    std::uint32_t u;
+  } v;
+  v.f = f;
+  return v.u;
+}

-static inline float u32_as_f32(std::uint32_t u) { union { float f; std::uint32_t u; } v; v.u = u; return v.f; }
+static inline float u32_as_f32(std::uint32_t u) {
+  union {
+    float f;
+    std::uint32_t u;
+  } v;
+  v.u = u;
+  return v.f;
+}

 static inline int clamp_int(int i, int l, int h) { return std::min(std::max(i, l), h); }

@@ -694,15 +658,14 @@ static inline int clamp_int(int i, int l, int h) { return std::min(std::max(i, l
 // half float, the f16 is in the low 16 bits of the input argument

 static inline float __convert_half_to_float(std::uint32_t a) noexcept {
-
  std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;

-  std::uint32_t v = f32_as_u32(u32_as_f32(u) * u32_as_f32(0x77800000U)/*0x1.0p+112f*/) + 0x38000000U;
+  std::uint32_t v =
+      f32_as_u32(u32_as_f32(u) * u32_as_f32(0x77800000U) /*0x1.0p+112f*/) + 0x38000000U;

  u = (a & 0x7fff) != 0 ? v : u;

-  return u32_as_f32(u) * u32_as_f32(0x07800000U)/*0x1.0p-112f*/;
-
+  return u32_as_f32(u) * u32_as_f32(0x07800000U) /*0x1.0p-112f*/;
 }

 // float half with nearest even rounding
@@ -714,7 +677,7 @@ static inline std::uint32_t __convert_float_to_half(float a) noexcept {
  std::uint32_t i = 0x7c00 | (m != 0 ? 0x0200 : 0);
  std::uint32_t n = ((std::uint32_t)e << 12) | m;
  std::uint32_t s = (u >> 16) & 0x8000;
-  int b = clamp_int(1-e, 0, 13);
+  int b = clamp_int(1 - e, 0, 13);
  std::uint32_t d = (0x1000 | m) >> b;
  d |= (d << b) != (0x1000 | m);
  std::uint32_t v = e < 1 ? d : n;
@@ -726,24 +689,25 @@ static inline std::uint32_t __convert_float_to_half(float a) noexcept {

 extern "C"
 #if !defined(_MSC_VER)
-__attribute__((weak))
+    __attribute__((weak))
 #endif
-float  __gnu_h2f_ieee(unsigned short h){
-  return __convert_half_to_float((std::uint32_t) h);
+    float
+    __gnu_h2f_ieee(unsigned short h) {
+  return __convert_half_to_float((std::uint32_t)h);
 }

 extern "C"
 #if !defined(_MSC_VER)
-__attribute__((weak))
+    __attribute__((weak))
 #endif
-unsigned short  __gnu_f2h_ieee(float f){
+    unsigned short
+    __gnu_f2h_ieee(float f) {
  return (unsigned short)__convert_float_to_half(f);
 }

-void PlatformState::init()
-{
+void PlatformState::init() {
  amd::ScopedLock lock(lock_);
-  if(initialized_ || g_devices.empty()) {
+  if (initialized_ || g_devices.empty()) {
    return;
  }
  initialized_ = true;
@@ -751,18 +715,18 @@ void PlatformState::init()
    hipError_t err = digestFatBinary(it.first, it.second);
    assert(err == hipSuccess);
  }
-  for (auto &it : statCO_.vars_) {
+  for (auto& it : statCO_.vars_) {
    it.second->resize_dVar(g_devices.size());
  }
-  for (auto &it : statCO_.functions_) {
+  for (auto& it : statCO_.functions_) {
    it.second->resize_dFunc(g_devices.size());
  }
 }

-hipError_t PlatformState::loadModule(hipModule_t *module, const char* fname, const void* image) {
+hipError_t PlatformState::loadModule(hipModule_t* module, const char* fname, const void* image) {
  amd::ScopedLock lock(lock_);

-  if(module == nullptr) {
+  if (module == nullptr) {
    return hipErrorInvalidValue;
  }

@@ -809,7 +773,7 @@ hipError_t PlatformState::unloadModule(hipModule_t hmod) {
 }

 hipError_t PlatformState::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod,
-                                         const char* func_name) {
+                                     const char* func_name) {
  amd::ScopedLock lock(lock_);

  auto it = dynCO_map_.find(hmod);
@@ -828,7 +792,7 @@ hipError_t PlatformState::getDynGlobalVar(const char* hostVar, hipModule_t hmod,
                                          hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
  amd::ScopedLock lock(lock_);

-  if(hostVar == nullptr || dev_ptr == nullptr || size_ptr == nullptr) {
+  if (hostVar == nullptr || dev_ptr == nullptr || size_ptr == nullptr) {
    return hipErrorInvalidValue;
  }

@@ -880,7 +844,8 @@ hipError_t PlatformState::getDynTexGlobalVar(textureReference* texRef, hipDevice
  return hipSuccess;
 }

-hipError_t PlatformState::getDynTexRef(const char* hostVar, hipModule_t hmod, textureReference** texRef) {
+hipError_t PlatformState::getDynTexRef(const char* hostVar, hipModule_t hmod,
+                                       textureReference** texRef) {
  amd::ScopedLock lock(lock_);

  auto it = dynCO_map_.find(hmod);
@@ -897,12 +862,12 @@ hipError_t PlatformState::getDynTexRef(const char* hostVar, hipModule_t hmod, te
  }

  dvar->shadowVptr = new texture<char>();
-  *texRef =  reinterpret_cast<textureReference*>(dvar->shadowVptr);
+  *texRef = reinterpret_cast<textureReference*>(dvar->shadowVptr);
  return hipSuccess;
 }

 hipError_t PlatformState::digestFatBinary(const void* data, hip::FatBinaryInfo*& programs) {
- return statCO_.digestFatBinary(data, programs);
+  return statCO_.digestFatBinary(data, programs);
 }

 hip::FatBinaryInfo** PlatformState::addFatBinary(const void* data) {
@@ -925,19 +890,21 @@ hipError_t PlatformState::registerStatManagedVar(hip::Var* var) {
  return statCO_.registerStatManagedVar(var);
 }

-hipError_t PlatformState::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId) {
+hipError_t PlatformState::getStatFunc(hipFunction_t* hfunc, const void* hostFunction,
+                                      int deviceId) {
  return statCO_.getStatFunc(hfunc, hostFunction, deviceId);
 }

-hipError_t PlatformState::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId) {
-  if(func_attr == nullptr || hostFunction == nullptr) {
+hipError_t PlatformState::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction,
+                                          int deviceId) {
+  if (func_attr == nullptr || hostFunction == nullptr) {
    return hipErrorInvalidValue;
  }
  return statCO_.getStatFuncAttr(func_attr, hostFunction, deviceId);
 }

-hipError_t PlatformState::getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
-                                           size_t* size_ptr) {
+hipError_t PlatformState::getStatGlobalVar(const void* hostVar, int deviceId,
+                                           hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
  return statCO_.getStatGlobalVar(hostVar, deviceId, dev_ptr, size_ptr);
 }

@@ -945,7 +912,7 @@ hipError_t PlatformState::initStatManagedVarDevicePtr(int deviceId) {
  return statCO_.initStatManagedVarDevicePtr(deviceId);
 }

-void PlatformState::setupArgument(const void *arg, size_t size, size_t offset) {
+void PlatformState::setupArgument(const void* arg, size_t size, size_t offset) {
  auto& arguments = execStack_.top().arguments_;

  if (arguments.size() < offset + size) {