From fb7da41d27abe316580862e3752dea80ed29dbde Mon Sep 17 00:00:00 2001
From: foreman <dl.swbuild@amd.com>
Date: Wed, 16 Oct 2019 11:24:09 -0400
Subject: [PATCH] P4 to Git Change 2014404 by gandryey@gera-win10 on 2019/10/16
 11:13:37

	SWDEV-184710 - Support hipLaunchCooperativeKernelMultiDevice()
	- Add support for multi grid launch in hip
	- Detect the new hidden argument and pass the required information for the kernel launch
	- Memory for synchronization is allocated as a single object and then the offset for each GPU is found

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#44 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#343 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#25 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#82 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#136 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#90 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#99 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#97 edit


[ROCm/clr commit: 6e7e97987fc9df4ebc03cca491a16c1bd12870f5]
---
 projects/clr/rocclr/runtime/device/device.hpp | 23 +++++++-
 .../clr/rocclr/runtime/device/devkernel.cpp   |  3 +
 .../clr/rocclr/runtime/device/devkernel.hpp   | 17 +++---
 .../rocclr/runtime/device/pal/palkernel.cpp   |  2 +
 .../rocclr/runtime/device/rocm/rocdevice.cpp  | 37 +++++++-----
 .../rocclr/runtime/device/rocm/rocdevice.hpp  | 14 ++++-
 .../rocclr/runtime/device/rocm/rocvirtual.cpp | 56 ++++++++++++++-----
 .../rocclr/runtime/device/rocm/rocvirtual.hpp |  2 +-
 .../clr/rocclr/runtime/platform/command.cpp   | 19 +++++--
 .../clr/rocclr/runtime/platform/command.hpp   | 30 ++++++++--
 10 files changed, 151 insertions(+), 52 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp
index 932d543074..f273ecf37f 100644
--- a/projects/clr/rocclr/runtime/device/device.hpp
+++ b/projects/clr/rocclr/runtime/device/device.hpp
@@ -1153,7 +1153,24 @@ class Device : public RuntimeObject {
   typedef aclCompiler Compiler;
 
  public:
+  // The structures below for MGPU launch match the device library format
+  struct MGSyncData {  // Sync words of one device; kMGSyncDataSize is sizeof of this struct
+    uint32_t w0;
+    uint32_t w1;
+  };
+
+  struct MGSyncInfo {        // Launch info of one device, placed right after its MGSyncData
+    struct MGSyncData* mgs;  //!< Sync data slot of the first device in the launch
+    uint32_t grid_id;        //!< Grid ID in the multi GPU kernel launch
+    uint32_t num_grids;      //!< Total number of grids in multi GPU launch
+    uint64_t prev_sum;       //!< A sum of the grids preceding the current launch
+    uint64_t all_sum;        //!< A sum of all grids in multi GPU launch
+  };
+
   static constexpr size_t kP2PStagingSize = 4 * Mi;
+  static constexpr size_t kMGSyncDataSize = sizeof(MGSyncData);
+  static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo);
+
   typedef std::list<CommandQueue*> CommandQueues;
 
   struct BlitProgram : public amd::HeapObject {
@@ -1409,9 +1426,9 @@ class Device : public RuntimeObject {
   std::unique_ptr<amd::CacheCompilation> cacheCompilation_;
 #endif
 
-  static amd::Context* glb_ctx_;       //!< Global context with all devices
-  static amd::Monitor p2p_stage_ops_;  //!< Lock to serialise cache for the P2P resources
-  static Memory* p2p_stage_;           //!< Staging resources
+  static amd::Context* glb_ctx_;      //!< Global context with all devices
+  static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
+  static Memory* p2p_stage_;          //!< Staging resources
 
  private:
   bool IsTypeMatching(cl_device_type type, bool offlineDevices);
diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp
index 6fcab423aa..9fb0f69144 100644
--- a/projects/clr/rocclr/runtime/device/devkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/devkernel.cpp
@@ -752,6 +752,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH
   case ValueKind::HiddenCompletionAction:
     *isHidden = true;
     return amd::KernelParameterDescriptor::HiddenCompletionAction;
+  case ValueKind::HiddenMultiGridSyncArg:
+    *isHidden = true;
+    return amd::KernelParameterDescriptor::HiddenMultiGridSync;
   case ValueKind::HiddenNone:
   default:
     *isHidden = true;
diff --git a/projects/clr/rocclr/runtime/device/devkernel.hpp b/projects/clr/rocclr/runtime/device/devkernel.hpp
index 561dfc087d..f944584252 100644
--- a/projects/clr/rocclr/runtime/device/devkernel.hpp
+++ b/projects/clr/rocclr/runtime/device/devkernel.hpp
@@ -110,7 +110,8 @@ static const std::map<std::string,ValueKind> ArgValueKind =
   {"HiddenNone",              ValueKind::HiddenNone},
   {"HiddenPrintfBuffer",      ValueKind::HiddenPrintfBuffer},
   {"HiddenDefaultQueue",      ValueKind::HiddenDefaultQueue},
-  {"HiddenCompletionAction",  ValueKind::HiddenCompletionAction}
+  {"HiddenCompletionAction",  ValueKind::HiddenCompletionAction},
+  {"HiddenMultigridSyncArg",  ValueKind::HiddenMultiGridSyncArg}
 };
 
 static const std::map<std::string,ValueType> ArgValueType =
@@ -223,7 +224,8 @@ static const std::map<std::string,ValueKind> ArgValueKindV3 =
   {"hidden_none",               ValueKind::HiddenNone},
   {"hidden_printf_buffer",      ValueKind::HiddenPrintfBuffer},
   {"hidden_default_queue",      ValueKind::HiddenDefaultQueue},
-  {"hidden_completion_action",  ValueKind::HiddenCompletionAction}
+  {"hidden_completion_action",  ValueKind::HiddenCompletionAction},
+  {"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg}
 };
 
 static const std::map<std::string,ValueType> ArgValueTypeV3 =
@@ -317,19 +319,20 @@ struct KernelParameterDescriptor {
     ValueObject = 10,
     ImageObject = 11,
     SamplerObject = 12,
-    QueueObject = 13
+    QueueObject = 13,
+    HiddenMultiGridSync = 14
   };
   clk_value_type_t type_;  //!< The parameter's type
   size_t offset_;          //!< Its offset in the parameter's stack
   size_t size_;            //!< Its size in bytes
   union InfoData {
     struct {
-      uint32_t oclObject_ : 4;   //!< OCL object type
+      uint32_t oclObject_ : 4;  //!< OCL object type
       uint32_t readOnly_ : 1;   //!< OCL object is read only, applied to memory only
-      uint32_t rawPointer_ : 1;   //!< Arguments have a raw GPU VA
-      uint32_t defined_ : 1;   //!< The argument was defined by the app
+      uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
+      uint32_t defined_ : 1;    //!< The argument was defined by the app
       uint32_t reserved_ : 1;   //!< reserved
-      uint32_t arrayIndex_ : 24;  //!< Index in the objects array or LDS alignment
+      uint32_t arrayIndex_ : 24;//!< Index in the objects array or LDS alignment
     };
     uint32_t allValues_;
     InfoData() : allValues_(0) {}
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
index 985a907095..86b73f3c70 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -336,6 +336,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
           WriteAqlArgAt(const_cast<address>(parameters), vmParentWrap, it.size_, it.offset_);
         }
         break;
+      case amd::KernelParameterDescriptor::HiddenMultiGridSync:
+        break;
     }
   }
 
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index ced55b2c01..af4d6eec95 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -51,7 +51,7 @@ hsa_agent_t roc::Device::cpu_agent_ = {0};
 std::vector<hsa_agent_t> roc::Device::gpu_agents_;
 const bool roc::Device::offlineDevice_ = false;
 const bool roc::NullDevice::offlineDevice_ = true;
-
+address Device::mg_sync_ = nullptr;
 
 static HsaDeviceId getHsaDeviceId(hsa_agent_t device, uint32_t& pci_id) {
   if (HSA_STATUS_SUCCESS !=
@@ -175,6 +175,10 @@ Device::~Device() {
     p2p_stage_->release();
     p2p_stage_ = nullptr;
   }
+  if (nullptr != mg_sync_) {
+    amd::SvmBuffer::free(GlbCtx(), mg_sync_);
+    mg_sync_ = nullptr;
+  }
   if (glb_ctx_ != nullptr) {
       glb_ctx_->release();
       glb_ctx_ = nullptr;
@@ -715,28 +719,25 @@ bool Device::create(bool sramEccEnabled) {
   // Use just 1 entry by default for the map cache
   mapCache_->push_back(nullptr);
 
-  if ((p2p_agents_.size() == 0) &&
-      (glb_ctx_ == nullptr) && (gpu_agents_.size() > 1) &&
+  if ((glb_ctx_ == nullptr) && (gpu_agents_.size() >= 1) &&
       // Allow creation for the last device in the list.
       (gpu_agents_[gpu_agents_.size() - 1].handle == _bkendDevice.handle)) {
-
     std::vector<amd::Device*> devices;
     uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false);
     // Add all PAL devices
     for (uint32_t i = 0; i < numDevices; ++i) {
-        devices.push_back(amd::Device::devices()[i]);
+      devices.push_back(amd::Device::devices()[i]);
     }
     // Add current
     devices.push_back(this);
+    // Create a dummy context
+    glb_ctx_ = new amd::Context(devices, info);
+    if (glb_ctx_ == nullptr) {
+      return false;
+    }
 
-    if (devices.size() > 1) {
-      // Create a dummy context
-      glb_ctx_ = new amd::Context(devices, info);
-      if (glb_ctx_ == nullptr) {
-        return false;
-      }
-      amd::Buffer* buf =
-        new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
+    if ((p2p_agents_.size() == 0) && (devices.size() > 1)) {
+      amd::Buffer* buf = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
       if ((buf != nullptr) && buf->create()) {
         p2p_stage_ = buf;
       }
@@ -745,6 +746,15 @@ bool Device::create(bool sramEccEnabled) {
         return false;
       }
     }
+    // Check if sync buffer wasn't allocated yet
+    if (amd::IS_HIP && mg_sync_ == nullptr) {
+      mg_sync_ = reinterpret_cast<address>(amd::SvmBuffer::malloc(
+          GlbCtx(), (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS),
+          kMGInfoSizePerDevice * GlbCtx().devices().size(), kMGInfoSizePerDevice));
+      if (mg_sync_ == nullptr) {
+        return false;
+      }
+    }
   }
 
   if (settings().stagedXferSize_ != 0) {
@@ -1817,6 +1827,7 @@ VirtualGPU* Device::xferQueue() const {
   xferQueue_->enableSyncBlit();
   return xferQueue_;
 }
+
 bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
   bool result = true;
   return result;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
index 378952501e..09ffb8e825 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -410,6 +410,16 @@ class Device : public NullDevice {
 
   hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }
 
+  //! Acquire HSA queue. This method can create a new HSA queue or
+  //! share a previously created one
+  hsa_queue_t* acquireQueue(uint32_t queue_size_hint);
+
+  //! Release HSA queue
+  void releaseQueue(hsa_queue_t*);
+
+  //! Return multi GPU grid launch sync buffer
+  address MGSync() const { return mg_sync_; }
+
  private:
   static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
 
@@ -440,6 +450,7 @@ class Device : public NullDevice {
   std::atomic<size_t> freeMem_;   //!< Total of free memory available
   mutable amd::Monitor vgpusAccess_;     //!< Lock to serialise virtual gpu list access
   bool hsa_exclusive_gpu_access_;  //!< TRUE if current device was moved into exclusive GPU access mode
+  static address mg_sync_;  //!< MGPU grid launch sync memory (SVM location)
 
   struct QueueInfo {
     int refCount;
@@ -448,9 +459,6 @@ class Device : public NullDevice {
 
  public:
   amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
-
-  hsa_queue_t *acquireQueue(uint32_t queue_size_hint);
-  void releaseQueue(hsa_queue_t*);
 };                                // class roc::Device
 }  // namespace roc
 
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 408a973fc3..fe18049bbe 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1961,13 +1961,14 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
 }
 
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
-  const_address parameters, void* eventHandle, uint32_t sharedMemBytes, bool cooperativeGroups) {
+  const_address parameters, void* eventHandle, uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
   device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
   Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
   size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
 
   // Check memory dependency and SVM objects
-  if (!processMemObjects(kernel, parameters, ldsUsage, cooperativeGroups)) {
+  bool coopGroups = (vcmd != nullptr) ? vcmd->cooperativeGroups() : false;
+  if (!processMemObjects(kernel, parameters, ldsUsage, coopGroups)) {
     LogError("Wrong memory objects!");
     return false;
   }
@@ -2099,6 +2100,27 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
           WriteAqlArgAt(const_cast<address>(parameters), &spVA, it.size_, it.offset_);
           break;
         }
+        case amd::KernelParameterDescriptor::HiddenMultiGridSync: {
+          uint64_t gridSync = coopGroups ? 1 : 0;
+          bool multiGrid = (vcmd != nullptr) ? vcmd->cooperativeMultiDeviceGroups() : false;
+          if (multiGrid) {
+            // Find CPU pointer to the right sync info structure. It should be after MGSyncData
+            Device::MGSyncInfo* syncInfo = reinterpret_cast<Device::MGSyncInfo*>(
+                dev().MGSync() + Device::kMGInfoSizePerDevice * dev().index() + Device::kMGSyncDataSize);
+            // Update sync data address. It points to the MGSyncData slot of the first device
+            syncInfo->mgs = reinterpret_cast<Device::MGSyncData*>(dev().MGSync() +
+              Device::kMGInfoSizePerDevice * vcmd->firstDevice());
+            // Fill all sync info fields
+            syncInfo->grid_id = vcmd->gridId();
+            syncInfo->num_grids = vcmd->numGrids();
+            syncInfo->prev_sum = vcmd->prevGridSum();
+            syncInfo->all_sum = vcmd->allGridSum();
+            // Pass the address of this device's sync info as the hidden argument; the offset was applied above
+            gridSync = reinterpret_cast<uint64_t>(syncInfo);
+          }
+          WriteAqlArgAt(const_cast<address>(parameters), &gridSync, it.size_, it.offset_);
+          break;
+        }
       }
     }
 
@@ -2177,32 +2199,36 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
  * the list of kernel parameters.
  */
 void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
-  if (vcmd.cooperativeGroups()) {
-    uint32_t workgroups = 0;
-    for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
-      if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
-        workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
-      }
-    }
-
+  if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
     // Get device queue for exclusive GPU access
     VirtualGPU* queue = dev().xferQueue();
 
+    // Lock the queue, using the blit manager lock
+    amd::ScopedLock lock(queue->blitMgr().lockXfer());
+
     // Wait for the execution on the current queue, since the coop groups will use the device queue
     releaseGpuMemoryFence();
 
-    // Lock the queue, using the blit manager lock
-    amd::ScopedLock lock(queue->blitMgr().lockXfer());
     queue->profilingBegin(vcmd);
 
-    static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups);
+    if (vcmd.cooperativeGroups()) {
+      // Initialize GWS if it's a cooperative groups launch
+      uint32_t workgroups = 0;
+      for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
+        if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
+          workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
+        }
+      }
+
+      static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups - 1);
+    }
 
     // Sync AQL packets
     queue->setAqlHeader(dispatchPacketHeader_);
 
     // Submit kernel to HW
     if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
-      static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
+      static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), &vcmd)) {
       LogError("AQL dispatch failed!");
       vcmd.setStatus(CL_INVALID_OPERATION);
     }
@@ -2218,7 +2244,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
 
     // Submit kernel to HW
     if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
-      static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
+      static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) {
       LogError("AQL dispatch failed!");
       vcmd.setStatus(CL_INVALID_OPERATION);
     }
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
index f45060322d..2e31dc4b4f 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -171,7 +171,7 @@ class VirtualGPU : public device::VirtualDevice {
                             const_address parameters,            //!< Parameters for the kernel
                             void* event_handle,  //!< Handle to OCL event for debugging
                             uint32_t sharedMemBytes = 0, //!< Shared memory size
-                            bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required
+                            amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command
                             );
   void submitNativeFn(amd::NativeFnCommand& cmd);
   void submitMarker(amd::Marker& cmd);
diff --git a/projects/clr/rocclr/runtime/platform/command.cpp b/projects/clr/rocclr/runtime/platform/command.cpp
index e67c9067dc..fb87d99588 100644
--- a/projects/clr/rocclr/runtime/platform/command.cpp
+++ b/projects/clr/rocclr/runtime/platform/command.cpp
@@ -232,12 +232,19 @@ const Context& Command::context() const { return queue_->context(); }
 
 NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
                                            Kernel& kernel, const NDRangeContainer& sizes,
-                                           uint32_t sharedMemBytes, uint32_t extraParam)
-    : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL)
-    , kernel_(kernel)
-    , sizes_(sizes)
-    , sharedMemBytes_(sharedMemBytes)
-    , extraParam_(extraParam) {
+                                           uint32_t sharedMemBytes, uint32_t extraParam,
+                                           uint32_t gridId, uint32_t numGrids,
+                                           uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice) :
+    Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL),
+    kernel_(kernel),
+    sizes_(sizes),
+    sharedMemBytes_(sharedMemBytes),
+    extraParam_(extraParam),
+    gridId_(gridId),
+    numGrids_(numGrids),
+    prevGridSum_(prevGridSum),
+    allGridSum_(allGridSum),
+    firstDevice_(firstDevice) {
   auto& device = queue.device();
   auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
   profilingInfo_.setCallback(devKernel->getProfilingCallback(
diff --git a/projects/clr/rocclr/runtime/platform/command.hpp b/projects/clr/rocclr/runtime/platform/command.hpp
index e55a33fe2b..6610381869 100644
--- a/projects/clr/rocclr/runtime/platform/command.hpp
+++ b/projects/clr/rocclr/runtime/platform/command.hpp
@@ -764,9 +764,15 @@ class NDRangeKernelCommand : public Command {
  private:
   Kernel& kernel_;
   NDRangeContainer sizes_;
-  address parameters_;
-  uint32_t sharedMemBytes_;
-  uint32_t extraParam_;
+  address parameters_;      //!< Pointer to the kernel arguments
+  // The fields below are specific to the HIP functionality
+  uint32_t sharedMemBytes_; //!< Size of reserved shared memory
+  uint32_t extraParam_;     //!< Extra flags for the kernel launch
+  uint32_t gridId_;         //!< Grid ID in the multi GPU kernel launch
+  uint32_t numGrids_;       //!< Total number of grids in multi GPU launch
+  uint64_t prevGridSum_;    //!< A sum of the grids preceding the current launch
+  uint64_t allGridSum_;     //!< A sum of all grids in multi GPU launch
+  uint32_t firstDevice_;    //!< Device index of the first device in the grid
 
  public:
   enum {
@@ -777,7 +783,8 @@ class NDRangeKernelCommand : public Command {
   //! Construct an ExecuteKernel command
   NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
                        const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,
-                       uint32_t extraParam = 0);
+                       uint32_t extraParam = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
+                       uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
 
   virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); }
 
@@ -804,6 +811,21 @@ class NDRangeKernelCommand : public Command {
     return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false;
   }
 
+  //! Return the current grid ID for multidevice launch
+  uint32_t gridId() const { return gridId_; }
+
+  //! Return the number of launched grids
+  uint32_t numGrids() const { return numGrids_; }
+
+  //! Return the total workload size of the grids preceding the current one
+  uint64_t prevGridSum() const { return prevGridSum_; }
+
+  //! Return the total workload size for all GPUs
+  uint64_t allGridSum() const { return allGridSum_; }
+
+  //! Return the index of the first device in multi GPU launch
+  uint32_t firstDevice() const { return firstDevice_; }
+
   //! Set the local work size.
   void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }