P4 to Git Change 2014404 by gandryey@gera-win10 on 2019/10/16 11:13:37

SWDEV-184710 - Support hipLaunchCooperativeKernelMultiDevice() - Add support for multi grid launch in hip - Detect the new hidden argument and pass the required information for the kernel launch - Memory for synchronization is allocated as a single object and then the offset for each GPU is found Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/hip_module.cpp#44 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#343 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#25 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#17 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#82 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#136 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#90 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#30 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#99 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#97 edit [ROCm/clr commit: 6e7e97987f]
2019-10-16 11:24:09 -04:00
@@ -1153,7 +1153,24 @@ class Device : public RuntimeObject {
  typedef aclCompiler Compiler;

 public:
+  // The structures below for MGPU launch match the device library format
+  struct MGSyncData {  // One instance per device slot in the MGPU sync buffer
+    uint32_t w0;       // NOTE(review): meaning is defined by the device library - confirm there
+    uint32_t w1;       // NOTE(review): meaning is defined by the device library - confirm there
+  };
+
+  struct MGSyncInfo {        // Per-device launch info, placed right after MGSyncData in a slot
+    struct MGSyncData* mgs;  //!< Sync data of the first device participating in the launch
+    uint32_t grid_id;        //!< Grid ID in the multi GPU kernel launch
+    uint32_t num_grids;      //!< Total number of grids in the multi GPU launch
+    uint64_t prev_sum;       //!< Total workload size of the grids preceding this one
+    uint64_t all_sum;        //!< Total workload size of all grids in the launch
+  };
+
  static constexpr size_t kP2PStagingSize = 4 * Mi;
+  static constexpr size_t kMGSyncDataSize = sizeof(MGSyncData);  //!< Offset of MGSyncInfo in a slot
+  static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo);  //!< Slot size
+
  typedef std::list<CommandQueue*> CommandQueues;

  struct BlitProgram : public amd::HeapObject {
@@ -1409,9 +1426,9 @@ class Device : public RuntimeObject {
  std::unique_ptr<amd::CacheCompilation> cacheCompilation_;
 #endif

-  static amd::Context* glb_ctx_;       //!< Global context with all devices
-  static amd::Monitor p2p_stage_ops_;  //!< Lock to serialise cache for the P2P resources
-  static Memory* p2p_stage_;           //!< Staging resources
+  static amd::Context* glb_ctx_;      //!< Global context with all devices
+  static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
+  static Memory* p2p_stage_;          //!< Staging resources

 private:
  bool IsTypeMatching(cl_device_type type, bool offlineDevices);
@@ -752,6 +752,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH
  case ValueKind::HiddenCompletionAction:
    *isHidden = true;
    return amd::KernelParameterDescriptor::HiddenCompletionAction;
+  case ValueKind::HiddenMultiGridSyncArg:
+    *isHidden = true;
+    return amd::KernelParameterDescriptor::HiddenMultiGridSync;
  case ValueKind::HiddenNone:
  default:
    *isHidden = true;
@@ -110,7 +110,8 @@ static const std::map<std::string,ValueKind> ArgValueKind =
  {"HiddenNone",              ValueKind::HiddenNone},
  {"HiddenPrintfBuffer",      ValueKind::HiddenPrintfBuffer},
  {"HiddenDefaultQueue",      ValueKind::HiddenDefaultQueue},
-  {"HiddenCompletionAction",  ValueKind::HiddenCompletionAction}
+  {"HiddenCompletionAction",  ValueKind::HiddenCompletionAction},
+  {"HiddenMultigridSyncArg",  ValueKind::HiddenMultiGridSyncArg}
 };

 static const std::map<std::string,ValueType> ArgValueType =
@@ -223,7 +224,8 @@ static const std::map<std::string,ValueKind> ArgValueKindV3 =
  {"hidden_none",               ValueKind::HiddenNone},
  {"hidden_printf_buffer",      ValueKind::HiddenPrintfBuffer},
  {"hidden_default_queue",      ValueKind::HiddenDefaultQueue},
-  {"hidden_completion_action",  ValueKind::HiddenCompletionAction}
+  {"hidden_completion_action",  ValueKind::HiddenCompletionAction},
+  {"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg}
 };

 static const std::map<std::string,ValueType> ArgValueTypeV3 =
@@ -317,19 +319,20 @@ struct KernelParameterDescriptor {
    ValueObject = 10,
    ImageObject = 11,
    SamplerObject = 12,
-    QueueObject = 13
+    QueueObject = 13,
+    HiddenMultiGridSync = 14
  };
  clk_value_type_t type_;  //!< The parameter's type
  size_t offset_;          //!< Its offset in the parameter's stack
  size_t size_;            //!< Its size in bytes
  union InfoData {
    struct {
-      uint32_t oclObject_ : 4;   //!< OCL object type
+      uint32_t oclObject_ : 4;  //!< OCL object type
      uint32_t readOnly_ : 1;   //!< OCL object is read only, applied to memory only
-      uint32_t rawPointer_ : 1;   //!< Arguments have a raw GPU VA
-      uint32_t defined_ : 1;   //!< The argument was defined by the app
+      uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
+      uint32_t defined_ : 1;    //!< The argument was defined by the app
      uint32_t reserved_ : 1;   //!< reserved
-      uint32_t arrayIndex_ : 24;  //!< Index in the objects array or LDS alignment
+      uint32_t arrayIndex_ : 24;//!< Index in the objects array or LDS alignment
    };
    uint32_t allValues_;
    InfoData() : allValues_(0) {}
@@ -336,6 +336,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
          WriteAqlArgAt(const_cast<address>(parameters), vmParentWrap, it.size_, it.offset_);
        }
        break;
+      case amd::KernelParameterDescriptor::HiddenMultiGridSync:
+        break;
    }
  }

@@ -51,7 +51,7 @@ hsa_agent_t roc::Device::cpu_agent_ = {0};
 std::vector<hsa_agent_t> roc::Device::gpu_agents_;
 const bool roc::Device::offlineDevice_ = false;
 const bool roc::NullDevice::offlineDevice_ = true;
-
+address Device::mg_sync_ = nullptr;  //!< Set in Device::create(), released in Device::~Device()

 static HsaDeviceId getHsaDeviceId(hsa_agent_t device, uint32_t& pci_id) {
  if (HSA_STATUS_SUCCESS !=
@@ -175,6 +175,10 @@ Device::~Device() {
    p2p_stage_->release();
    p2p_stage_ = nullptr;
  }
+  if (nullptr != mg_sync_) {
+    amd::SvmBuffer::free(GlbCtx(), mg_sync_);
+    mg_sync_ = nullptr;
+  }
  if (glb_ctx_ != nullptr) {
      glb_ctx_->release();
      glb_ctx_ = nullptr;
@@ -715,28 +719,25 @@ bool Device::create(bool sramEccEnabled) {
  // Use just 1 entry by default for the map cache
  mapCache_->push_back(nullptr);

-  if ((p2p_agents_.size() == 0) &&
-      (glb_ctx_ == nullptr) && (gpu_agents_.size() > 1) &&
+  if ((glb_ctx_ == nullptr) && (gpu_agents_.size() >= 1) &&
      // Allow creation for the last device in the list.
      (gpu_agents_[gpu_agents_.size() - 1].handle == _bkendDevice.handle)) {
-
    std::vector<amd::Device*> devices;
    uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false);
    // Add all PAL devices
    for (uint32_t i = 0; i < numDevices; ++i) {
-        devices.push_back(amd::Device::devices()[i]);
+      devices.push_back(amd::Device::devices()[i]);
    }
    // Add current
    devices.push_back(this);
+    // Create a dummy context
+    glb_ctx_ = new amd::Context(devices, info);
+    if (glb_ctx_ == nullptr) {
+      return false;
+    }

-    if (devices.size() > 1) {
-      // Create a dummy context
-      glb_ctx_ = new amd::Context(devices, info);
-      if (glb_ctx_ == nullptr) {
-        return false;
-      }
-      amd::Buffer* buf =
-        new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
+    if ((p2p_agents_.size() == 0) && (devices.size() > 1)) {
+      amd::Buffer* buf = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
      if ((buf != nullptr) && buf->create()) {
        p2p_stage_ = buf;
      }
@@ -745,6 +746,15 @@ bool Device::create(bool sramEccEnabled) {
        return false;
      }
    }
+    // Check if sync buffer wasn't allocated yet
+    if (amd::IS_HIP && mg_sync_ == nullptr) {
+      mg_sync_ = reinterpret_cast<address>(amd::SvmBuffer::malloc(
+          GlbCtx(), (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS),
+          kMGInfoSizePerDevice * GlbCtx().devices().size(), kMGInfoSizePerDevice));
+      if (mg_sync_ == nullptr) {
+        return false;
+      }
+    }
  }

  if (settings().stagedXferSize_ != 0) {
@@ -1817,6 +1827,7 @@ VirtualGPU* Device::xferQueue() const {
  xferQueue_->enableSyncBlit();
  return xferQueue_;
 }
+
 bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
  bool result = true;
  return result;
@@ -410,6 +410,16 @@ class Device : public NullDevice {

  hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }

+  //! Acquire an HSA queue. This method can create a new HSA queue or
+  //! share a previously created one
+  hsa_queue_t* acquireQueue(uint32_t queue_size_hint);
+
+  //! Release an HSA queue
+  void releaseQueue(hsa_queue_t*);
+
+  //! Return the base address of the multi GPU grid launch sync buffer (nullptr if not allocated)
+  address MGSync() const { return mg_sync_; }
+
 private:
  static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;

@@ -440,6 +450,7 @@ class Device : public NullDevice {
  std::atomic<size_t> freeMem_;   //!< Total of free memory available
  mutable amd::Monitor vgpusAccess_;     //!< Lock to serialise virtual gpu list access
  bool hsa_exclusive_gpu_access_;  //!< TRUE if current device was moved into exclusive GPU access mode
+  static address mg_sync_;  //!< MGPU grid launch sync memory (SVM location)

  struct QueueInfo {
    int refCount;
@@ -448,9 +459,6 @@ class Device : public NullDevice {

 public:
  amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
-
-  hsa_queue_t *acquireQueue(uint32_t queue_size_hint);
-  void releaseQueue(hsa_queue_t*);
 };                                // class roc::Device
 }  // namespace roc

@@ -1961,13 +1961,14 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
 }

 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
-  const_address parameters, void* eventHandle, uint32_t sharedMemBytes, bool cooperativeGroups) {
+  const_address parameters, void* eventHandle, uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
  device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
  Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
  size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();

  // Check memory dependency and SVM objects
-  if (!processMemObjects(kernel, parameters, ldsUsage, cooperativeGroups)) {
+  bool coopGroups = (vcmd != nullptr) ? vcmd->cooperativeGroups() : false;
+  if (!processMemObjects(kernel, parameters, ldsUsage, coopGroups)) {
    LogError("Wrong memory objects!");
    return false;
  }
@@ -2099,6 +2100,27 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
          WriteAqlArgAt(const_cast<address>(parameters), &spVA, it.size_, it.offset_);
          break;
        }
+        case amd::KernelParameterDescriptor::HiddenMultiGridSync: {
+          uint64_t gridSync = coopGroups ? 1 : 0;
+          bool multiGrid = (vcmd != nullptr) ? vcmd->cooperativeMultiDeviceGroups() : false;
+          if (multiGrid) {
+            // Find CPU pointer to the right sync info structure. It should be after MGSyncData
+            Device::MGSyncInfo* syncInfo = reinterpret_cast<Device::MGSyncInfo*>(
+                dev().MGSync() + Device::kMGInfoSizePerDevice * dev().index() + Device::kMGSyncDataSize);
+            // Update sync data address. Use the offset adjustment to the right location
+            syncInfo->mgs = reinterpret_cast<Device::MGSyncData*>(dev().MGSync() +
+              Device::kMGInfoSizePerDevice * vcmd->firstDevice());
+            // Fill all sync info fields
+            syncInfo->grid_id = vcmd->gridId();
+            syncInfo->num_grids = vcmd->numGrids();
+            syncInfo->prev_sum = vcmd->prevGridSum();
+            syncInfo->all_sum = vcmd->allGridSum();
+            // Update GPU address for grid sync info. Use the offset adjustment for the right location
+            gridSync = reinterpret_cast<uint64_t>(syncInfo);
+          }
+          WriteAqlArgAt(const_cast<address>(parameters), &gridSync, it.size_, it.offset_);
+          break;
+        }
      }
    }

@@ -2177,32 +2199,36 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
 * the list of kernel parameters.
 */
 void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
-  if (vcmd.cooperativeGroups()) {
-    uint32_t workgroups = 0;
-    for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
-      if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
-        workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
-      }
-    }
-
+  if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
    // Get device queue for exclusive GPU access
    VirtualGPU* queue = dev().xferQueue();

+    // Lock the queue, using the blit manager lock
+    amd::ScopedLock lock(queue->blitMgr().lockXfer());
+
    // Wait for the execution on the current queue, since the coop groups will use the device queue
    releaseGpuMemoryFence();

-    // Lock the queue, using the blit manager lock
-    amd::ScopedLock lock(queue->blitMgr().lockXfer());
    queue->profilingBegin(vcmd);

-    static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups);
+    if (vcmd.cooperativeGroups()) {
+      // Initialize GWS if it's cooperative groups launch
+      uint32_t workgroups = 0;
+      for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
+        if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
+          workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
+        }
+      }
+
+      static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups - 1);
+    }

    // Sync AQL packets
    queue->setAqlHeader(dispatchPacketHeader_);

    // Submit kernel to HW
    if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
-      static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
+      static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), &vcmd)) {
      LogError("AQL dispatch failed!");
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
@@ -2218,7 +2244,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {

    // Submit kernel to HW
    if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
-      static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
+      static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) {
      LogError("AQL dispatch failed!");
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
@@ -171,7 +171,7 @@ class VirtualGPU : public device::VirtualDevice {
                            const_address parameters,            //!< Parameters for the kernel
                            void* event_handle,  //!< Handle to OCL event for debugging
                            uint32_t sharedMemBytes = 0, //!< Shared memory size
-                            bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required
+                            amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command
                            );
  void submitNativeFn(amd::NativeFnCommand& cmd);
  void submitMarker(amd::Marker& cmd);
@@ -232,12 +232,19 @@ const Context& Command::context() const { return queue_->context(); }

 NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
                                           Kernel& kernel, const NDRangeContainer& sizes,
-                                           uint32_t sharedMemBytes, uint32_t extraParam)
-    : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL)
-    , kernel_(kernel)
-    , sizes_(sizes)
-    , sharedMemBytes_(sharedMemBytes)
-    , extraParam_(extraParam) {
+                                           uint32_t sharedMemBytes, uint32_t extraParam,
+                                           uint32_t gridId, uint32_t numGrids,
+                                           uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice) :
+    Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL),
+    kernel_(kernel),
+    sizes_(sizes),
+    sharedMemBytes_(sharedMemBytes),
+    extraParam_(extraParam),
+    gridId_(gridId),
+    numGrids_(numGrids),
+    prevGridSum_(prevGridSum),
+    allGridSum_(allGridSum),
+    firstDevice_(firstDevice) {
  auto& device = queue.device();
  auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
  profilingInfo_.setCallback(devKernel->getProfilingCallback(
@@ -764,9 +764,15 @@ class NDRangeKernelCommand : public Command {
 private:
  Kernel& kernel_;
  NDRangeContainer sizes_;
-  address parameters_;
-  uint32_t sharedMemBytes_;
-  uint32_t extraParam_;
+  address parameters_;      //!< Pointer to the kernel arguments
+  // The fields below are specific to the HIP functionality
+  uint32_t sharedMemBytes_; //!< Size of reserved shared memory
+  uint32_t extraParam_;     //!< Extra flags for the kernel launch
+  uint32_t gridId_;         //!< Grid ID in the multi GPU kernel launch
+  uint32_t numGrids_;       //!< Total number of grids in the multi GPU launch
+  uint64_t prevGridSum_;    //!< Sum of the grids preceding the current one in the launch
+  uint64_t allGridSum_;     //!< Sum of all grids in the multi GPU launch
+  uint32_t firstDevice_;    //!< Device index of the first device in the grid

 public:
  enum {
@@ -777,7 +783,8 @@ class NDRangeKernelCommand : public Command {
  //! Construct an ExecuteKernel command
  NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
                       const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0,
-                       uint32_t extraParam = 0);
+                       uint32_t extraParam = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
+                       uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);

  virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); }

@@ -804,6 +811,21 @@ class NDRangeKernelCommand : public Command {
    return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false;
  }

+  //! Return the ID of the current grid in a multi-device launch
+  uint32_t gridId() const { return gridId_; }
+
+  //! Return the total number of grids in a multi-device launch
+  uint32_t numGrids() const { return numGrids_; }
+
+  //! Return the total workload size of the grids preceding the current one
+  uint64_t prevGridSum() const { return prevGridSum_; }
+
+  //! Return the total workload size of all grids in a multi-device launch
+  uint64_t allGridSum() const { return allGridSum_; }
+
+  //! Return the index of the first device in multi GPU launch (uint32_t member, widened on return)
+  uint64_t firstDevice() const { return firstDevice_; }
+
  //! Set the local work size.
  void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }