Rename device access functions

Rename the functions that access devices so that each name reflects the derived device class it returns. This covers the base device::Device and the derived gpu/pal/roc device classes, in both their NullDevice and Device forms. Call sites are changed to use the least derived accessor that suffices, which makes clear which operations are available at each call site.

Change-Id: I1abb6bfed7efa24852bc8d0d49acaea357d8b5d0
2021-01-10 01:12:54 +00:00
parent f679b05df7
commit 001fd66cac
13 changed files with 147 additions and 127 deletions
@@ -674,7 +674,7 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
  if (workGroupInfo()->compileSize_[0] == 0) {
    // Find the default local workgroup size, if it wasn't specified
    if (lclWorkSize[0] == 0) {
-      if ((dev().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
+      if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
        // Find threads per group
        size_t thrPerGrp = workGroupInfo()->size_;

@@ -685,7 +685,7 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
          // and it's 2 or 3-dimensional workload
          (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
          // Use 8x8 workgroup size if kernel has image writes
-          if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) {
+          if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
            lclWorkSize[0] = 8;
            lclWorkSize[1] = 8;
          }
@@ -709,7 +709,7 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
          }

          // Assuming DWORD access
-          const uint cacheLineMatch = dev().info().globalMemCacheLineSize_ >> 2;
+          const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;

          // Check if we couldn't find optimal workload
          if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
@@ -1095,7 +1095,7 @@ bool Kernel::GetAttrCodePropMetadata() {
  InitParameters(kernelMetaNode);

  // Set the workgroup information for the kernel
-  workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
+  workGroupInfo_.availableLDSSize_ = device().info().localMemSizePerCU_;
  workGroupInfo_.availableSGPRs_ = 104;
  workGroupInfo_.availableVGPRs_ = 256;

@@ -407,7 +407,7 @@ class Kernel : public amd::HeapObject {
  };

  //! Returns GPU device object, associated with this kernel
-  const amd::Device& dev() const { return dev_; }
+  const amd::Device& device() const { return dev_; }

  void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; }

@@ -42,7 +42,7 @@ bool NullProgram::compileImpl(const std::string& src,
                              const char** headerIncludeNames, amd::option::Options* options) {
  std::string sourceCode = src;

-  if (dev().settings().debugFlags_ & Settings::CheckForILSource) {
+  if (gpuNullDevice().settings().debugFlags_ & Settings::CheckForILSource) {
    size_t inc = sourceCode.find("il_cs_", 0);
    if (inc != std::string::npos) {
      // CL program is an IL program
@@ -55,7 +55,7 @@ bool NullProgram::compileImpl(const std::string& src,
  std::string tempFolder = amd::Os::getTempPath();
  std::string tempFileName = amd::Os::getTempFileName();

-  if (dev().settings().debugFlags_ & Settings::StubCLPrograms) {
+  if (gpuNullDevice().settings().debugFlags_ & Settings::StubCLPrograms) {
    std::stringstream fileName;
    std::fstream stubRead;
    // Dump the IL function
@@ -130,7 +130,7 @@ bool NullProgram::compileImpl(const std::string& src,
  }

  if (ACL_SUCCESS !=
-      aclInsertSection(dev().amdilCompiler(), bin, sourceCode.c_str(), sourceCode.size(), aclSOURCE)) {
+      aclInsertSection(gpuNullDevice().amdilCompiler(), bin, sourceCode.c_str(), sourceCode.size(), aclSOURCE)) {
    LogWarning("aclInsertSection failed");
    aclBinaryFini(bin);
    return false;
@@ -145,7 +145,7 @@ bool NullProgram::compileImpl(const std::string& src,
  opts << options->origOptionStr.c_str();

  if (options->origOptionStr.find("-cl-std=CL") == std::string::npos) {
-    switch (dev().settings().oclVersion_) {
+    switch (gpuNullDevice().settings().oclVersion_) {
      case OpenCL10:
        opts << " -cl-std=CL1.0";
        break;
@@ -179,15 +179,15 @@ bool NullProgram::compileImpl(const std::string& src,
    opts << " -I" << tempFolder;
  }

-  if (!dev().settings().imageSupport_) {
+  if (!gpuNullDevice().settings().imageSupport_) {
    opts << " -fno-image-support";
  }

-  if (dev().settings().reportFMAF_) {
+  if (gpuNullDevice().settings().reportFMAF_) {
    opts << " -mfast-fmaf";
  }

-  if (dev().settings().reportFMA_) {
+  if (gpuNullDevice().settings().reportFMA_) {
    opts << " -mfast-fma";
  }

@@ -206,10 +206,10 @@ bool NullProgram::compileImpl(const std::string& src,
    pos = newOpt.find("-fno-bin-llvmir");
  }

-  err = aclCompile(dev().amdilCompiler(), bin, newOpt.c_str(), ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY,
+  err = aclCompile(gpuNullDevice().amdilCompiler(), bin, newOpt.c_str(), ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY,
                   NULL);

-  buildLog_ += aclGetCompilerLog(dev().amdilCompiler());
+  buildLog_ += aclGetCompilerLog(gpuNullDevice().amdilCompiler());

  if (err != ACL_SUCCESS) {
    LogWarning("aclCompile failed");
@@ -218,7 +218,7 @@ bool NullProgram::compileImpl(const std::string& src,
  }

  size_t len = 0;
-  const void* ir = aclExtractSection(dev().amdilCompiler(), bin, &len, aclLLVMIR, &err);
+  const void* ir = aclExtractSection(gpuNullDevice().amdilCompiler(), bin, &len, aclLLVMIR, &err);
  if (err != ACL_SUCCESS) {
    LogWarning("aclExtractSection failed");
    aclBinaryFini(bin);
@@ -284,7 +284,7 @@ int NullProgram::compileBinaryToIL(amd::option::Options* options) {
  }

  if (ACL_SUCCESS !=
-      aclInsertSection(dev().amdilCompiler(), bin, llvmBinary_.data(), llvmBinary_.size(), spirFlag)) {
+      aclInsertSection(gpuNullDevice().amdilCompiler(), bin, llvmBinary_.data(), llvmBinary_.size(), spirFlag)) {
    LogWarning("aclInsertSection failed");
    aclBinaryFini(bin);
    return CL_BUILD_PROGRAM_FAILURE;
@@ -294,7 +294,7 @@ int NullProgram::compileBinaryToIL(amd::option::Options* options) {
  std::string optionStr = options->origOptionStr;
  if (options->origOptionStr.find("kernel-arg-alignment") == std::string::npos) {
    char s[256];
-    sprintf(s, " -Wb,-kernel-arg-alignment=%d", dev().info().memBaseAddrAlign_ / 8);
+    sprintf(s, " -Wb,-kernel-arg-alignment=%d", gpuNullDevice().info().memBaseAddrAlign_ / 8);
    optionStr += s;
  }

@@ -308,8 +308,8 @@ int NullProgram::compileBinaryToIL(amd::option::Options* options) {
    type = ACL_TYPE_ISA;
  }

-  err = aclCompile(dev().amdilCompiler(), bin, optionStr.c_str(), aclTypeBinaryUsed, type, NULL);
-  buildLog_ += aclGetCompilerLog(dev().amdilCompiler());
+  err = aclCompile(gpuNullDevice().amdilCompiler(), bin, optionStr.c_str(), aclTypeBinaryUsed, type, NULL);
+  buildLog_ += aclGetCompilerLog(gpuNullDevice().amdilCompiler());

  if (err != ACL_SUCCESS) {
    LogWarning("aclCompile failed");
@@ -332,7 +332,7 @@ int NullProgram::compileBinaryToIL(amd::option::Options* options) {
  }

  size_t len = 0;
-  const void* amdil = aclExtractSection(dev().amdilCompiler(), bin, &len, aclCODEGEN, &err);
+  const void* amdil = aclExtractSection(gpuNullDevice().amdilCompiler(), bin, &len, aclCODEGEN, &err);
  if (err != ACL_SUCCESS) {
    LogWarning("aclExtractSection failed");
    aclBinaryFini(bin);
@@ -3045,7 +3045,7 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {

 HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
                         uint extraArgsNum)
-    : device::Kernel(prog->dev(), name, *prog),
+    : device::Kernel(prog->device(), name, *prog),
      compileOptions_(compileOptions),
      index_(0),
      code_(NULL),
@@ -115,7 +115,7 @@ NullKernel* Program::createKernel(const std::string& name, const Kernel::InitDat

  *created = false;
  // Create a GPU kernel
-  Kernel* gpuKernel = new Kernel(name, static_cast<const gpu::Device&>(device()), *this, initData);
+  Kernel* gpuKernel = new Kernel(name, gpuDevice(), *this, initData);

  if (gpuKernel == NULL) {
    buildLog_ += "new Kernel() failed";
@@ -512,7 +512,7 @@ bool NullProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
    } else {
      aclTypeUsed = aclLLVMIR;
    }
-    err = aclInsertSection(dev().amdilCompiler(), libs[i], llvmBinaries[i]->data(),
+    err = aclInsertSection(gpuNullDevice().amdilCompiler(), libs[i], llvmBinaries[i]->data(),
                           llvmBinaries[i]->size(), aclTypeUsed);
    if (err != ACL_SUCCESS) {
      LogWarning("aclInsertSection failed");
@@ -529,10 +529,10 @@ bool NullProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
      unsigned int numLibs = libs.size() - 1;

      if (numLibs > 0) {
-        err = aclLink(dev().amdilCompiler(), libs[0], numLibs, &libs[1], ACL_TYPE_LLVMIR_BINARY,
+        err = aclLink(gpuNullDevice().amdilCompiler(), libs[0], numLibs, &libs[1], ACL_TYPE_LLVMIR_BINARY,
                      "-create-library", NULL);

-        buildLog_ += aclGetCompilerLog(dev().amdilCompiler());
+        buildLog_ += aclGetCompilerLog(gpuNullDevice().amdilCompiler());

        if (err != ACL_SUCCESS) {
          LogWarning("aclLink failed");
@@ -549,7 +549,7 @@ bool NullProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
      } else {
        aclTypeUsed = aclLLVMIR;
      }
-      const void* llvmir = aclExtractSection(dev().amdilCompiler(), libs[0], &size, aclTypeUsed, &err);
+      const void* llvmir = aclExtractSection(gpuNullDevice().amdilCompiler(), libs[0], &size, aclTypeUsed, &err);
      if (err != ACL_SUCCESS) {
        LogWarning("aclExtractSection failed");
        break;
@@ -1456,13 +1456,13 @@ bool Program::allocGlobalData(const void* globalData, size_t dataSize, uint inde
    // so possible reallocation won't occur twice or
    // another thread could destroy a heap block,
    // while we didn't finish allocation
-    amd::ScopedLock k(dev().lockAsyncOps());
+    amd::ScopedLock k(gpuDevice().lockAsyncOps());

    // Allocate memory for the global data store
-    glbData_ = dev().createScratchBuffer(amd::alignUp(dataSize, 0x1000));
+    glbData_ = gpuDevice().createScratchBuffer(amd::alignUp(dataSize, 0x1000));
    dataStore = glbData_;
  } else {
-    dataStore = new Memory(dev(), amd::alignUp(dataSize, ConstBuffer::VectorSize));
+    dataStore = new Memory(gpuDevice(), amd::alignUp(dataSize, ConstBuffer::VectorSize));

    // Initialize constant buffer
    if ((dataStore == NULL) || !dataStore->create(Resource::RemoteUSWC)) {
@@ -1478,7 +1478,7 @@ bool Program::allocGlobalData(const void* globalData, size_t dataSize, uint inde
    static const bool Entire = true;
    amd::Coord3D origin(0, 0, 0);
    amd::Coord3D region(dataSize);
-    result = dev().xferMgr().writeBuffer(globalData, *dataStore, origin, region, Entire);
+    result = gpuDevice().xferMgr().writeBuffer(globalData, *dataStore, origin, region, Entire);
  }

  return result;
@@ -1505,7 +1505,7 @@ HSAILProgram::HSAILProgram(Device& device, amd::Program& owner)
      maxScratchRegs_(0),
      executable_(NULL),
      loaderContext_(this) {
-  machineTarget_ = dev().hwInfo()->targetName_;
+  machineTarget_ = gpuNullDevice().hwInfo()->targetName_;
  loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
 }

@@ -1517,7 +1517,7 @@ HSAILProgram::HSAILProgram(NullDevice& device, amd::Program& owner)
      executable_(NULL),
      loaderContext_(this) {
  isNull_ = true;
-  machineTarget_ = dev().hwInfo()->targetName_;
+  machineTarget_ = gpuNullDevice().hwInfo()->targetName_;
  loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
 }

@@ -1578,9 +1578,9 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) {
    // 1. if the program is created with binary and contains only hsail text
    case ACL_TYPE_HSAIL_TEXT: {
      std::string curOptions = options->origOptionStr + hsailOptions();
-      errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, curOptions.c_str(),
+      errorCode = aclCompile(gpuNullDevice().hsaCompiler(), binaryElf_, curOptions.c_str(),
                             continueCompileFrom, ACL_TYPE_CG, NULL);
-      buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
+      buildLog_ += aclGetCompilerLog(gpuNullDevice().hsaCompiler());
      if (errorCode != ACL_SUCCESS) {
        buildLog_ += "Error: BRIG code generation failed.\n";
        return false;
@@ -1602,12 +1602,12 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) {
    std::string fin_options(options->origOptionStr + hsailOptions());
    // Append an option so that we can selectively enable a SCOption on CZ
    // whenever IOMMUv2 is enabled.
-    if (dev().settings().svmFineGrainSystem_) {
+    if (gpuNullDevice().settings().svmFineGrainSystem_) {
      fin_options.append(" -sc-xnack-iommu");
    }
-    errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG,
+    errorCode = aclCompile(gpuNullDevice().hsaCompiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG,
                           ACL_TYPE_ISA, NULL);
-    buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
+    buildLog_ += aclGetCompilerLog(gpuNullDevice().hsaCompiler());
    if (errorCode != ACL_SUCCESS) {
      buildLog_ += "Error: BRIG finalization to ISA failed.\n";
      return false;
@@ -1625,7 +1625,7 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) {
    size_t size = 0;
    hsa_code_object_t code_object;
    code_object.handle = reinterpret_cast<uint64_t>(
-        aclExtractSection(dev().hsaCompiler(), binaryElf_, &size, aclTEXT, &errorCode));
+        aclExtractSection(gpuNullDevice().hsaCompiler(), binaryElf_, &size, aclTEXT, &errorCode));
    if (errorCode != ACL_SUCCESS) {
      buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n";
      return false;
@@ -1638,14 +1638,14 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) {
  }
  size_t kernelNamesSize = 0;
  errorCode =
-      aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize);
+      aclQueryInfo(gpuNullDevice().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize);
  if (errorCode != ACL_SUCCESS) {
    buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
    return false;
  }
  if (kernelNamesSize > 0) {
    char* kernelNames = new char[kernelNamesSize];
-    errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames,
+    errorCode = aclQueryInfo(gpuNullDevice().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames,
                             &kernelNamesSize);
    if (errorCode != ACL_SUCCESS) {
      buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
@@ -1661,7 +1661,7 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) {
    for (const auto& it : vKernels) {
      std::string kernelName(it);
      std::string openclKernelName = Kernel::openclMangledName(kernelName);
-      errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
+      errorCode = aclQueryInfo(gpuNullDevice().hsaCompiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
                               openclKernelName.c_str(), &md.numHiddenKernelArgs,
                               &sizeOfnumHiddenKernelArgs);
      if (errorCode != ACL_SUCCESS) {
@@ -1698,7 +1698,7 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) {
  }
  // Save the binary in the interface class
  saveBinaryAndSetType(TYPE_EXECUTABLE);
-  buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
+  buildLog_ += aclGetCompilerLog(gpuNullDevice().hsaCompiler());
  return true;
 }

@@ -1708,13 +1708,13 @@ std::string HSAILProgram::hsailOptions() {
  std::string hsailOptions;
  // Set options for the standard device specific options
  // All our devices support these options now
-  if (dev().settings().reportFMAF_) {
+  if (gpuNullDevice().settings().reportFMAF_) {
    hsailOptions.append(" -DFP_FAST_FMAF=1");
  }
-  if (dev().settings().reportFMA_) {
+  if (gpuNullDevice().settings().reportFMA_) {
    hsailOptions.append(" -DFP_FAST_FMA=1");
  }
-  if (!dev().settings().singleFpDenorm_) {
+  if (!gpuNullDevice().settings().singleFpDenorm_) {
    hsailOptions.append(" -cl-denorms-are-zero");
  }

@@ -1738,7 +1738,7 @@ std::string HSAILProgram::hsailOptions() {
 bool HSAILProgram::allocKernelTable() {
  uint size = kernels().size() * sizeof(size_t);

-  kernels_ = new gpu::Memory(dev(), size);
+  kernels_ = new gpu::Memory(gpuDevice(), size);
  // Initialize kernel table
  if ((kernels_ == NULL) || !kernels_->create(Resource::RemoteUSWC)) {
    delete kernels_;
@@ -1767,7 +1767,7 @@ const aclTargetInfo& HSAILProgram::info(const char* str) {
    arch = "hsail64";
  }
  info_ = aclGetTargetInfo(arch.c_str(),
-                           (str && str[0] == '\0' ? dev().hwInfo()->targetName_ : str), &err);
+                           (str && str[0] == '\0' ? gpuNullDevice().hwInfo()->targetName_ : str), &err);
  if (err != ACL_SUCCESS) {
    LogWarning("aclGetTargetInfo failed");
  }
@@ -1867,7 +1867,7 @@ hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char* name) {
 }

 bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
-  uint dev_gfxip = program_->dev().hwInfo()->gfxipVersion_;
+ uint dev_gfxip = program_->gpuNullDevice().hwInfo()->gfxipVersion_;
  uint isa_gfxip = isa.handle;
  switch (dev_gfxip) {
    case gfx700:
@@ -1892,7 +1892,7 @@ bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)
    case gfx602:
    default:
      LogPrintfError("Unsupported gfxip version gfx%d", dev_gfxip);
-      return false;
+    return false;
  }
 }

@@ -2024,7 +2024,7 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
      assert(false);
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
-  gpu::Sampler* sampler = new gpu::Sampler(program_->dev());
+  gpu::Sampler* sampler = new gpu::Sampler(program_->gpuDevice());
  if (!sampler || !sampler->create(state)) {
    delete sampler;
    return HSA_STATUS_ERROR;
@@ -2075,15 +2075,15 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
    return new char[size];
  }

-  gpu::Memory* mem = new gpu::Memory(program_->dev(), amd::alignUp(size, align));
+  gpu::Memory* mem = new gpu::Memory(program_->gpuDevice(), amd::alignUp(size, align));
  if (!mem || !mem->create(gpu::Resource::Local)) {
    delete mem;
    return NULL;
  }
-  assert(program_->dev().xferQueue());
+  assert(program_->gpuDevice().xferQueue());
  if (zero) {
    char pattern = 0;
-    program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0),
+    program_->gpuDevice().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0),
                                         amd::Coord3D(size));
  }
  program_->addGlobalStore(mem);
@@ -2102,9 +2102,9 @@ bool ORCAHSALoaderContext::GpuMemCopy(void* dst, size_t offset, const void* src,
    memcpy(reinterpret_cast<address>(dst) + offset, src, size);
    return true;
  }
-  assert(program_->dev().xferQueue());
+  assert(program_->gpuDevice().xferQueue());
  gpu::Memory* mem = reinterpret_cast<gpu::Memory*>(dst);
-  return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size),
+  return program_->gpuDevice().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size),
                                               true);
  return true;
 }
@@ -289,7 +289,7 @@ class NullProgram : public device::Program {
                      );

  //! Return a typecasted GPU device
-  gpu::NullDevice& dev() {
+  gpu::NullDevice& gpuNullDevice() {
    return const_cast<gpu::NullDevice&>(static_cast<const gpu::NullDevice&>(device()));
  }

@@ -339,10 +339,7 @@ class Program : public NullProgram {
  //! Returns pritnf info array
  const std::vector<device::PrintfInfo>& printfInfo() const { return printf_; }

-   //! Return a typecasted GPU device
-  gpu::Device& dev() { return const_cast<gpu::Device&>(static_cast<const gpu::Device&>(device())); }
-
-protected:
+ protected:
 private:
  //! Disable copy constructor
  Program(const Program&);
@@ -350,6 +347,11 @@ protected:
  //! Disable operator=
  Program& operator=(const Program&);

+  //! Return a typecasted GPU device
+  gpu::Device& gpuDevice() {
+    return const_cast<gpu::Device&>(static_cast<const gpu::Device&>(device()));
+  }
+
  HwConstBuffers constBufs_;  //!< Constant buffers for the global store
  gpu::Memory* glbData_;      //!< Global data store
 };
@@ -467,8 +469,15 @@ class HSAILProgram : public device::Program {

  const std::vector<Memory*>& globalStores() const { return globalStores_; }

-  //! Return a typecasted GPU device
-  gpu::Device& dev() { return const_cast<gpu::Device&>(static_cast<const gpu::Device&>(device())); }
+  //! Return a typecasted GPU null device.
+  gpu::NullDevice& gpuNullDevice() {
+    return const_cast<gpu::NullDevice&>(static_cast<const gpu::NullDevice&>(device()));
+  }
+
+  //! Return a typecasted GPU device. The device must not be the NullDevice.
+  gpu::Device& gpuDevice() {
+    return const_cast<gpu::Device&>(static_cast<const gpu::Device&>(device()));
+  }

  //! Returns GPU kernel table
  const Memory* kernelTable() const { return kernels_; }
@@ -43,8 +43,8 @@ void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
  // Make sure runtime matches HW alignment, which is 256 scratch regs (DWORDs) per wave
  constexpr uint32_t ScratchRegAlignment = 256;
  workGroupInfo_.scratchRegs_ =
-      amd::alignUp((workGroupInfo_.scratchRegs_ * dev().info().wavefrontWidth_),
-                   ScratchRegAlignment) / dev().info().wavefrontWidth_;
+      amd::alignUp((workGroupInfo_.scratchRegs_ * device().info().wavefrontWidth_),
+                   ScratchRegAlignment) / device().info().wavefrontWidth_;
  workGroupInfo_.privateMemSize_ = workGroupInfo_.scratchRegs_ * sizeof(uint32_t);
  workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
  workGroupInfo_.usedSGPRs_ = numSGPRs;
@@ -52,13 +52,13 @@ void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
  workGroupInfo_.usedVGPRs_ = numVGPRs;

  if (!prog().isNull()) {
-    workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
+    workGroupInfo_.availableLDSSize_ = palDevice().properties().gfxipProperties.shaderCore.ldsSizePerCu;
    workGroupInfo_.availableSGPRs_ =
-        dev().properties().gfxipProperties.shaderCore.numAvailableSgprs;
+        palDevice().properties().gfxipProperties.shaderCore.numAvailableSgprs;
    workGroupInfo_.availableVGPRs_ =
-        dev().properties().gfxipProperties.shaderCore.numAvailableVgprs;
+        palDevice().properties().gfxipProperties.shaderCore.numAvailableVgprs;
    workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ =
-        dev().info().wavefrontWidth_;
+        device().info().wavefrontWidth_;
  } else {
    workGroupInfo_.availableLDSSize_ = 64 * Ki;
    workGroupInfo_.availableSGPRs_ = 104;
@@ -105,7 +105,7 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
 }

 HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
-    : device::Kernel(prog->dev(), name, *prog),
+    : device::Kernel(prog->device(), name, *prog),
      compileOptions_(compileOptions),
      index_(0),
      code_(0),
@@ -128,12 +128,12 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
    options.append(openClKernelName.c_str());
    // Append an option so that we can selectively enable a SCOption on CZ
    // whenever IOMMUv2 is enabled.
-    if (dev().settings().svmFineGrainSystem_) {
+    if (palNullDevice().settings().svmFineGrainSystem_) {
      options.append(" -sc-xnack-iommu");
    }
-    error = aclCompile(dev().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
+    error = aclCompile(palNullDevice().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
                       ACL_TYPE_ISA, nullptr);
-    buildLog_ += aclGetCompilerLog(dev().compiler());
+    buildLog_ += aclGetCompilerLog(palNullDevice().compiler());
    if (error != ACL_SUCCESS) {
      LogError("Failed to finalize kernel");
      return false;
@@ -144,7 +144,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {

  // Pull out metadata from the ELF
  size_t sizeOfArgList;
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
                       openClKernelName.c_str(), nullptr, &sizeOfArgList);
  if (error != ACL_SUCCESS) {
    return false;
@@ -154,7 +154,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
  if (nullptr == aclArgList) {
    return false;
  }
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
                       openClKernelName.c_str(), aclArgList, &sizeOfArgList);
  if (error != ACL_SUCCESS) {
    return false;
@@ -164,30 +164,30 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
  delete[] aclArgList;

  size_t sizeOfWorkGroupSize;
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
                       openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
  if (error != ACL_SUCCESS) {
    return false;
  }
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
                       openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
  if (error != ACL_SUCCESS) {
    return false;
  }

  // Copy wavefront size
-  workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
+  workGroupInfo_.wavefrontSize_ = device().info().wavefrontWidth_;
  // Find total workgroup size
  if (workGroupInfo_.compileSize_[0] != 0) {
    workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
        workGroupInfo_.compileSize_[2];
  } else {
-    workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
+    workGroupInfo_.size_ = device().info().preferredWorkGroupSize_;
  }

  // Pull out printf metadata from the ELF
  size_t sizeOfPrintfList;
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
                       openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
  if (error != ACL_SUCCESS) {
    return false;
@@ -199,7 +199,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
    if (nullptr == aclPrintfList) {
      return false;
    }
-    error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
+    error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
                         openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList);
    if (error != ACL_SUCCESS) {
      return false;
@@ -213,7 +213,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
  aclMetadata md;
  md.enqueue_kernel = false;
  size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
                       openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
  if (error != ACL_SUCCESS) {
    return false;
@@ -222,7 +222,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {

  md.kernel_index = -1;
  size_t sizeOfIndex = sizeof(md.kernel_index);
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_KERNEL_INDEX,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_KERNEL_INDEX,
                       openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
  if (error != ACL_SUCCESS) {
    return false;
@@ -230,7 +230,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
  index_ = md.kernel_index;

  size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
                       openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
                       &sizeOfWavesPerSimdHint);
  if (error != ACL_SUCCESS) {
@@ -240,7 +240,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
  waveLimiter_.enable();

  size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
                       openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
                       &sizeOfWorkGroupSizeHint);
  if (error != ACL_SUCCESS) {
@@ -248,7 +248,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
  }

  size_t sizeOfVecTypeHint;
-  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
+  error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
                       openClKernelName.c_str(), NULL, &sizeOfVecTypeHint);
  if (error != ACL_SUCCESS) {
    return false;
@@ -259,7 +259,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
    if (NULL == VecTypeHint) {
      return false;
    }
-    error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
+    error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
                         openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint);
    if (error != ACL_SUCCESS) {
      return false;
@@ -273,8 +273,6 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
  return true;
 }

-const Device& HSAILKernel::dev() const { return reinterpret_cast<const Device&>(dev_); }
-
 const HSAILProgram& HSAILKernel::prog() const {
  return reinterpret_cast<const HSAILProgram&>(prog_);
 }
@@ -476,7 +474,7 @@ bool LightningKernel::init() {
                   workGroupInfo()->usedSGPRs_, workGroupInfo()->usedVGPRs_);

  // Copy wavefront size
-  workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
+  workGroupInfo_.wavefrontSize_ = device().info().wavefrontWidth_;

  if (workGroupInfo_.size_ == 0) {
    return false;
@@ -68,8 +68,11 @@ class HSAILKernel : public device::Kernel {
  //! finalizes the kernel if needed
  bool init(amd::hsa::loader::Symbol* sym, bool finalize = false);

-  //! Returns PAL device object, associated with this kernel
-  const Device& dev() const;
+  //! Returns PAL, possibly null, device object, associated with this kernel.
+  const NullDevice& palNullDevice() const { return reinterpret_cast<const NullDevice&>(dev_); }
+
+  //! Returns PAL device object, associated with this kernel which must not be the null device.
+  const Device& palDevice() const { return reinterpret_cast<const Device&>(dev_); }

  //! Returns HSA program associated with this kernel
  const HSAILProgram& prog() const;
@@ -70,8 +70,8 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
                    bool zero) {
  align = amd::alignUp(align, sizeof(uint32_t));

-  amd::Memory* amd_mem_obj = new (prog.dev().context())
-      amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align),
+  amd::Memory* amd_mem_obj = new (prog.palDevice().context())
+      amd::Buffer(prog.palDevice().context(), 0, amd::alignUp(size, align),
                  // HIP requires SVM allocation for segment code due to possible global variable
                  // access and global variables are a part of code segment with the latest loader
                  amd::IS_HIP ? reinterpret_cast<void*>(1) : nullptr);
@@ -87,11 +87,11 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
    return false;
  }

-  gpuAccess_ = static_cast<pal::Memory*>(amd_mem_obj->getDeviceMemory(prog.dev(), false));
+  gpuAccess_ = static_cast<pal::Memory*>(amd_mem_obj->getDeviceMemory(prog.palDevice(), false));

  if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
    void* ptr = nullptr;
-    cpuAccess_ = new pal::Memory(prog.dev(), amd::alignUp(size, align));
+    cpuAccess_ = new pal::Memory(prog.palDevice(), amd::alignUp(size, align));
    if ((cpuAccess_ == nullptr) || !cpuAccess_->create(pal::Resource::Remote)) {
      delete cpuAccess_;
      cpuAccess_ = nullptr;
@@ -111,8 +111,8 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
  if ((cpuAccess_ == nullptr) && zero && !prog.isInternal()) {
    uint64_t pattern = 0;
    size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
-    prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0),
-                                    amd::Coord3D(size));
+    prog.palDevice().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0),
+                                          amd::Coord3D(size));
  }

  switch (segment) {
@@ -266,7 +266,7 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
  }

  size_t kernelNamesSize = 0;
-  acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr,
+  acl_error errorCode = aclQueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr,
                                     nullptr, &kernelNamesSize);
  if (errorCode != ACL_SUCCESS) {
    buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
@@ -274,7 +274,7 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
  }
  if (kernelNamesSize > 0) {
    char* kernelNames = new char[kernelNamesSize];
-    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames,
+    errorCode = aclQueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames,
                             &kernelNamesSize);
    if (errorCode != ACL_SUCCESS) {
      buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
@@ -326,7 +326,7 @@ bool HSAILProgram::createBinary(amd::option::Options* options) { return true; }
 bool HSAILProgram::allocKernelTable() {
  uint size = kernels().size() * sizeof(size_t);

-  kernels_ = new pal::Memory(dev(), size);
+  kernels_ = new pal::Memory(palDevice(), size);
  // Initialize kernel table
  if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) {
    delete kernels_;
@@ -352,7 +352,7 @@ const aclTargetInfo& HSAILProgram::info(const char* str) {
    arch = "hsail64";
  }
  info_ = aclGetTargetInfo(arch.c_str(),
-                           (str && str[0] == '\0' ? dev().hwInfo()->machineTarget_ : str), &err);
+                           (str && str[0] == '\0' ? palNullDevice().hwInfo()->machineTarget_ : str), &err);
  if (err != ACL_SUCCESS) {
    LogWarning("aclGetTargetInfo failed");
  }
@@ -523,9 +523,9 @@ hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
 }

 bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
-  uint32_t gfxipVersion = program_->dev().settings().useLightning_
-      ? program_->dev().hwInfo()->gfxipVersionLC_
-      : program_->dev().hwInfo()->gfxipVersion_;
+  uint32_t gfxipVersion = program_->palNullDevice().settings().useLightning_
+      ? program_->palNullDevice().hwInfo()->gfxipVersionLC_
+      : program_->palNullDevice().hwInfo()->gfxipVersion_;
  uint32_t majorSrc = gfxipVersion / 10;
  uint32_t minorSrc = gfxipVersion % 10;

@@ -671,7 +671,7 @@ hsa_status_t PALHSALoaderContext::SamplerCreate(
      assert(false);
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
-  pal::Sampler* sampler = new pal::Sampler(program_->dev());
+  pal::Sampler* sampler = new pal::Sampler(program_->palDevice());
  if (!sampler || !sampler->create(state)) {
    delete sampler;
    return HSA_STATUS_ERROR;
@@ -157,8 +157,15 @@ class HSAILProgram : public device::Program {

  const std::vector<Memory*>& globalStores() const { return globalStores_; }

-  //! Return a typecasted PAL device
-  pal::Device& dev() { return const_cast<pal::Device&>(static_cast<const pal::Device&>(device())); }
+  //! Return a typecasted, non-const PAL null device.
+  pal::NullDevice& palNullDevice() {
+    return const_cast<pal::NullDevice&>(static_cast<const pal::NullDevice&>(device()));
+  }
+
+  //! Return a typecasted, non-const PAL device. The device must not be the NullDevice.
+  pal::Device& palDevice() {
+    return const_cast<pal::Device&>(static_cast<const pal::Device&>(device()));
+  }

  //! Returns GPU kernel table
  const Memory* kernelTable() const { return kernels_; }
@@ -241,7 +248,7 @@ class LightningProgram : public HSAILProgram {
  LightningProgram(NullDevice& device, amd::Program& owner) : HSAILProgram(device, owner) {
    isLC_ = true;
    isHIP_ = (owner.language() == amd::Program::HIP);
-    machineTarget_ = dev().hwInfo()->machineTargetLC_;
+    machineTarget_ = palNullDevice().hwInfo()->machineTargetLC_;
  }

  LightningProgram(Device& device, amd::Program& owner) : HSAILProgram(device, owner) {
@@ -31,7 +31,7 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle
               const uint32_t workgroupGroupSegmentByteSize,
               const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
               const uint32_t kernargSegmentAlignment)
-    : device::Kernel(prog->dev(), name, *prog) {
+    : device::Kernel(prog->device(), name, *prog) {
  kernelCodeHandle_ = kernelCodeHandle;
  workgroupGroupSegmentByteSize_ = workgroupGroupSegmentByteSize;
  workitemPrivateSegmentByteSize_ = workitemPrivateSegmentByteSize;
@@ -40,7 +40,7 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle
 }

 Kernel::Kernel(std::string name, Program* prog)
-    : device::Kernel(prog->dev(), name, *prog) {
+    : device::Kernel(prog->device(), name, *prog) {
 }

 #if defined(USE_COMGR_LIBRARY)
@@ -57,10 +57,10 @@ bool LightningKernel::init() {
    symbolName_ = name();
  }
  kernargSegmentAlignment_ =
-      amd::alignUp(std::max(kernargSegmentAlignment_, 128u), dev().info().globalMemCacheLineSize_);
+      amd::alignUp(std::max(kernargSegmentAlignment_, 128u), device().info().globalMemCacheLineSize_);

  // Set the workgroup information for the kernel
-  workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
+  workGroupInfo_.availableLDSSize_ = device().info().localMemSizePerCU_;
  assert(workGroupInfo_.availableLDSSize_ > 0);

  if (!SetAvailableSgprVgpr()) {
@@ -155,7 +155,7 @@ bool LightningKernel::init() {
  workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_;
  workGroupInfo_.preferredSizeMultiple_ = wavefront_size;
  workGroupInfo_.usedStackSize_ = 0;
-  workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+  workGroupInfo_.wavefrontPerSIMD_ = program()->rocDevice().info().maxWorkItemSizes_[0] / wavefront_size;
  workGroupInfo_.wavefrontSize_ = wavefront_size;
  if (workGroupInfo_.size_ == 0) {
    return false;
@@ -181,7 +181,7 @@ bool HSAILKernel::init() {
  hsa_agent_t hsaDevice = program()->hsaDevice();
  // Pull out metadata from the ELF
  size_t sizeOfArgList;
-  aclCompiler* compileHandle = program()->dev().compiler();
+  aclCompiler* compileHandle = program()->rocDevice().compiler();
  std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
  errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY,
                                         openClKernelName.c_str(), nullptr, &sizeOfArgList);
@@ -202,7 +202,7 @@ bool HSAILKernel::init() {

  // Set the workgroup information for the kernel
  memset(&workGroupInfo_, 0, sizeof(workGroupInfo_));
-  workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_;
+  workGroupInfo_.availableLDSSize_ = program()->rocDevice().info().localMemSizePerCU_;
  assert(workGroupInfo_.availableLDSSize_ > 0);
  workGroupInfo_.availableSGPRs_ = 104;
  workGroupInfo_.availableVGPRs_ = 256;
@@ -250,13 +250,13 @@ bool HSAILKernel::init() {
  }

  workGroupInfo_.usedStackSize_ = 0;
-  workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+  workGroupInfo_.wavefrontPerSIMD_ = program()->rocDevice().info().maxWorkItemSizes_[0] / wavefront_size;
  workGroupInfo_.wavefrontSize_ = wavefront_size;
  if (workGroupInfo_.compileSize_[0] != 0) {
    workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
        workGroupInfo_.compileSize_[2];
  } else {
-    workGroupInfo_.size_ = program()->dev().info().preferredWorkGroupSize_;
+    workGroupInfo_.size_ = program()->rocDevice().info().preferredWorkGroupSize_;
  }

  // Pull out printf metadata from the ELF
@@ -127,7 +127,7 @@ bool Program::initClBinary(char* binaryIn, size_t size) {

 bool Program::defineGlobalVar(const char* name, void* dptr) {
  hsa_status_t status = HSA_STATUS_SUCCESS;
-  hsa_agent_t hsa_device = dev().getBackendDevice();
+  hsa_agent_t hsa_device = rocDevice().getBackendDevice();

  status = hsa_executable_agent_global_variable_define(hsaExecutable_, hsa_device, name, dptr);
  if (status != HSA_STATUS_SUCCESS) {
@@ -153,7 +153,7 @@ bool Program::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr,
    return false;
  }

-  hsa_device= dev().getBackendDevice();
+  hsa_device = rocDevice().getBackendDevice();

  /* Find HSA Symbol by name */
  status = hsa_executable_get_symbol_by_name(hsaExecutable_, global_name, &hsa_device,
@@ -206,7 +206,7 @@ bool Program::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr,
      return false;
    }

-    roc_device = static_cast<const roc::Device*>(&dev());
+    roc_device = &(rocDevice());
    *amd_mem_obj = new(roc_device->context()) amd::Buffer(roc_device->context(), 0, *bytes,
                                                          *device_pptr);

@@ -227,7 +227,7 @@ bool Program::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr,
 }

 HSAILProgram::HSAILProgram(roc::NullDevice& device, amd::Program& owner) : roc::Program(device, owner) {
-  machineTarget_ = dev().deviceInfo().machineTarget_;
+  machineTarget_ = rocNullDevice().deviceInfo().machineTarget_;
 }

 HSAILProgram::~HSAILProgram() {
@@ -268,7 +268,7 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
 #if defined(WITH_COMPILER_LIB)
  // Stop compilation if it is an offline device - HSA runtime does not
  // support ISA compiled offline
-  if (!dev().isOnline()) {
+  if (!device().isOnline()) {
    return true;
  }

@@ -294,7 +294,7 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
    return false;
  }

-  hsa_agent_t hsaDevice = dev().getBackendDevice();
+  hsa_agent_t hsaDevice = rocDevice().getBackendDevice();
  status = hsa_executable_load_agent_code_object(hsaExecutable_, hsaDevice, hsaCodeObjectReader_,
                                                 nullptr, nullptr);
  if (status != HSA_STATUS_SUCCESS) {
@@ -432,7 +432,7 @@ LightningProgram::LightningProgram(roc::NullDevice& device, amd::Program& owner)
  : roc::Program(device, owner) {
  isLC_ = true;
  isHIP_ = (owner.language() == amd::Program::HIP);
-  machineTarget_ = dev().deviceInfo().machineTargetLC_;
+  machineTarget_ = rocNullDevice().deviceInfo().machineTargetLC_;
 }

 bool LightningProgram::createBinary(amd::option::Options* options) {
@@ -477,7 +477,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
    return false;
  }

-  hsa_agent_t agent = dev().getBackendDevice();
+  hsa_agent_t agent = rocDevice().getBackendDevice();
  hsa_status_t status;

  status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
@@ -49,10 +49,13 @@ class Program : public device::Program {
  virtual bool initClBinary(char* binaryIn, size_t size);

  //! Return a typecasted GPU device
-  const NullDevice& dev() const { return static_cast<const NullDevice&>(device()); }
+  const NullDevice& rocNullDevice() const { return static_cast<const NullDevice&>(device()); }
+
+  //! Return a typecasted ROC device. The device must not be the NullDevice.
+  const Device& rocDevice() const { return static_cast<const Device&>(device()); }

  //! Returns the hsaBinary associated with the program
-  hsa_agent_t hsaDevice() const { return dev().getBackendDevice(); }
+  hsa_agent_t hsaDevice() const { return rocNullDevice().getBackendDevice(); }

  hsa_executable_t hsaExecutable() const { return hsaExecutable_; }