SWDEV-429529 - Allocate glb_ctx_ even for one device

Move context allocation into the Device::init() method to simplify the logic and handle HIP_VISIBLE_DEVICES properly. Change-Id: I0fc6f37c7ae39bedbdad0290295d6794c66d6c54 [ROCm/clr commit: a49d633883]
2023-10-27 15:00:15 -04:00
@@ -872,6 +872,7 @@ extern const char* SchedulerSourceCode;
 extern const char* SchedulerSourceCode20;
 extern const char* TrapHandlerCode;

+// ================================================================================================
 bool Device::create(Pal::IDevice* device) {
  resourceList_ = new std::unordered_set<Resource*>();
  if (nullptr == resourceList_) {
@@ -1066,33 +1067,6 @@ bool Device::create(Pal::IDevice* device) {
    return false;
  }

-  if ((glb_ctx_ == nullptr) && (gNumDevices > 1) && (device == gDeviceList[gNumDevices - 1])) {
-    std::vector<amd::Device*> devices;
-    uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
-    // Add all PAL devices
-    for (uint32_t i = gStartDevice; i < numDevices; ++i) {
-      devices.push_back(amd::Device::devices()[i]);
-    }
-    // Add current
-    devices.push_back(this);
-
-    if (devices.size() > 1) {
-      // Create a dummy context
-      glb_ctx_ = new amd::Context(devices, info);
-      if (glb_ctx_ == nullptr) {
-        return false;
-      }
-      amd::Buffer* buf =
-          new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
-      if ((buf != nullptr) && buf->create()) {
-        p2p_stage_ = buf;
-      } else {
-        delete buf;
-        return false;
-      }
-    }
-  }
-
  return true;
 }

@@ -1333,6 +1307,7 @@ static void parseRequestedDeviceList(const char* requestedDeviceList,
  }
 }

+// ================================================================================================
 bool Device::init() {
  gStartDevice = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
  bool useDeviceList = false;
@@ -1438,10 +1413,34 @@ bool Device::init() {
        }
      }
    }
+
+    // Query active devices only
+    constexpr bool kNoOfflineDevices = false;
+    std::vector<amd::Device*> devices = getDevices(CL_DEVICE_TYPE_GPU, kNoOfflineDevices);
+    if (devices.size() > 0) {
+      // Create a dummy context for internal memory allocations on all reported devices
+      glb_ctx_ = new amd::Context(devices, amd::Context::Info());
+      if (glb_ctx_ == nullptr) {
+        return false;
+      }
+      // Allocate a staging buffer for P2P emulation path
+      if (devices.size() > 1) {
+        amd::Buffer* buf =
+            new (*glb_ctx_) amd::Buffer(*glb_ctx_, CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
+        if ((buf != nullptr) && buf->create()) {
+          p2p_stage_ = buf;
+        } else {
+          delete buf;
+          return false;
+        }
+      }
+    }
  }
+
  return true;
 }

+// ================================================================================================
 void Device::tearDown() {
  if (platform_ != nullptr) {
    platform_->Destroy();
@@ -455,6 +455,7 @@ void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) {
  --acquiredCnt_;
 }

+// ================================================================================================
 bool Device::init() {
  ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack.");

@@ -543,21 +544,54 @@ bool Device::init() {
    roc_device.release()->registerDevice();
  }

-  if (0 != Device::numDevices(CL_DEVICE_TYPE_GPU, false)) {
+  // Query active devices only
+  constexpr bool kNoOfflineDevices = false;
+  std::vector<amd::Device*> devices = getDevices(CL_DEVICE_TYPE_GPU, kNoOfflineDevices);
+  if (devices.size() > 0) {
+    bool p2p_available = false;
    // Loop through all available devices
-    for (auto device1: Device::devices()) {
+    for (auto device1: devices) {
      // Find all agents that can have access to the current device
      for (auto agent: static_cast<Device*>(device1)->p2pAgents()) {
        // Find cl_device_id associated with the current agent
-        for (auto device2: Device::devices()) {
+        for (auto device2: devices) {
          if (agent.handle == static_cast<Device*>(device2)->getBackendDevice().handle) {
            // Device2 can have access to device1
            device2->p2pDevices_.push_back(as_cl(device1));
            device1->p2p_access_devices_.push_back(device2);
+            p2p_available = true;
          }
        }
      }
    }
+
+    // Create a dummy context for internal memory allocations on all reported devices
+    glb_ctx_ = new amd::Context(devices, amd::Context::Info());
+    if (glb_ctx_ == nullptr) {
+      return false;
+    }
+
+    // Allocate a staging buffer for P2P emulation path
+    if ((devices.size() >= 1) && !p2p_available) {
+      amd::Buffer* buf =
+          new (*glb_ctx_) amd::Buffer(*glb_ctx_, CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
+      if ((buf != nullptr) && buf->create()) {
+        p2p_stage_ = buf;
+      } else {
+        delete buf;
+        return false;
+      }
+    }
+
+    // Allocate mgpu sync buffer for cooperative launches
+    if (amd::IS_HIP) {
+      mg_sync_ = reinterpret_cast<address>(amd::SvmBuffer::malloc(
+          *glb_ctx_, (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS),
+          kMGInfoSizePerDevice * devices.size(), kMGInfoSizePerDevice));
+      if (mg_sync_ == nullptr) {
+        return false;
+      }
+    }
  }

  return true;
@@ -741,44 +775,6 @@ bool Device::create() {
  // Use just 1 entry by default for the map cache
  mapCache_->push_back(nullptr);

-  if ((glb_ctx_ == nullptr) && (gpu_agents_.size() >= 1) &&
-      // Allow creation for the last device in the list.
-      (gpu_agents_[gpu_agents_.size() - 1].handle == bkendDevice_.handle)) {
-    std::vector<amd::Device*> devices;
-    uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false);
-    // Add all PAL devices
-    for (uint32_t i = 0; i < numDevices; ++i) {
-      devices.push_back(amd::Device::devices()[i]);
-    }
-    // Add current
-    devices.push_back(this);
-    // Create a dummy context
-    glb_ctx_ = new amd::Context(devices, info);
-    if (glb_ctx_ == nullptr) {
-      return false;
-    }
-
-    if ((p2p_agents_.size() < (devices.size()-1)) && (devices.size() > 1)) {
-      amd::Buffer* buf = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
-      if ((buf != nullptr) && buf->create()) {
-        p2p_stage_ = buf;
-      }
-      else {
-        delete buf;
-        return false;
-      }
-    }
-    // Check if sync buffer wasn't allocated yet
-    if (amd::IS_HIP && mg_sync_ == nullptr) {
-      mg_sync_ = reinterpret_cast<address>(amd::SvmBuffer::malloc(
-          GlbCtx(), (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS),
-          kMGInfoSizePerDevice * GlbCtx().devices().size(), kMGInfoSizePerDevice));
-      if (mg_sync_ == nullptr) {
-        return false;
-      }
-    }
-  }
-
  if (settings().stagedXferSize_ != 0) {
    // Initialize staged write buffers
    if (settings().stagedXferWrite_) {