diff --git a/rocclr/device/devkernel.cpp b/rocclr/device/devkernel.cpp index 6f62239a61..7cb64a6179 100644 --- a/rocclr/device/devkernel.cpp +++ b/rocclr/device/devkernel.cpp @@ -674,7 +674,7 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { - if ((dev().settings().overrideLclSet & (1 << (workDim - 1))) == 0) { + if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) { // Find threads per group size_t thrPerGrp = workGroupInfo()->size_; @@ -685,7 +685,7 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, // and it's 2 or 3-dimensional workload (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().preferredWorkGroupSize_)) { + if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { lclWorkSize[0] = 8; lclWorkSize[1] = 8; } @@ -709,7 +709,7 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, } // Assuming DWORD access - const uint cacheLineMatch = dev().info().globalMemCacheLineSize_ >> 2; + const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; // Check if we couldn't find optimal workload if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || @@ -1095,7 +1095,7 @@ bool Kernel::GetAttrCodePropMetadata() { InitParameters(kernelMetaNode); // Set the workgroup information for the kernel - workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_; + workGroupInfo_.availableLDSSize_ = device().info().localMemSizePerCU_; workGroupInfo_.availableSGPRs_ = 104; workGroupInfo_.availableVGPRs_ = 256; diff --git a/rocclr/device/devkernel.hpp b/rocclr/device/devkernel.hpp index d164f76a76..ad97d9a2f7 100644 --- 
a/rocclr/device/devkernel.hpp +++ b/rocclr/device/devkernel.hpp @@ -407,7 +407,7 @@ class Kernel : public amd::HeapObject { }; //! Returns GPU device object, associated with this kernel - const amd::Device& dev() const { return dev_; } + const amd::Device& device() const { return dev_; } void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; } diff --git a/rocclr/device/gpu/gpucompiler.cpp b/rocclr/device/gpu/gpucompiler.cpp index 596095a9bd..d399fd4821 100644 --- a/rocclr/device/gpu/gpucompiler.cpp +++ b/rocclr/device/gpu/gpucompiler.cpp @@ -42,7 +42,7 @@ bool NullProgram::compileImpl(const std::string& src, const char** headerIncludeNames, amd::option::Options* options) { std::string sourceCode = src; - if (dev().settings().debugFlags_ & Settings::CheckForILSource) { + if (gpuNullDevice().settings().debugFlags_ & Settings::CheckForILSource) { size_t inc = sourceCode.find("il_cs_", 0); if (inc != std::string::npos) { // CL program is an IL program @@ -55,7 +55,7 @@ bool NullProgram::compileImpl(const std::string& src, std::string tempFolder = amd::Os::getTempPath(); std::string tempFileName = amd::Os::getTempFileName(); - if (dev().settings().debugFlags_ & Settings::StubCLPrograms) { + if (gpuNullDevice().settings().debugFlags_ & Settings::StubCLPrograms) { std::stringstream fileName; std::fstream stubRead; // Dump the IL function @@ -130,7 +130,7 @@ bool NullProgram::compileImpl(const std::string& src, } if (ACL_SUCCESS != - aclInsertSection(dev().amdilCompiler(), bin, sourceCode.c_str(), sourceCode.size(), aclSOURCE)) { + aclInsertSection(gpuNullDevice().amdilCompiler(), bin, sourceCode.c_str(), sourceCode.size(), aclSOURCE)) { LogWarning("aclInsertSection failed"); aclBinaryFini(bin); return false; @@ -145,7 +145,7 @@ bool NullProgram::compileImpl(const std::string& src, opts << options->origOptionStr.c_str(); if (options->origOptionStr.find("-cl-std=CL") == std::string::npos) { - switch (dev().settings().oclVersion_) { + 
switch (gpuNullDevice().settings().oclVersion_) { case OpenCL10: opts << " -cl-std=CL1.0"; break; @@ -179,15 +179,15 @@ bool NullProgram::compileImpl(const std::string& src, opts << " -I" << tempFolder; } - if (!dev().settings().imageSupport_) { + if (!gpuNullDevice().settings().imageSupport_) { opts << " -fno-image-support"; } - if (dev().settings().reportFMAF_) { + if (gpuNullDevice().settings().reportFMAF_) { opts << " -mfast-fmaf"; } - if (dev().settings().reportFMA_) { + if (gpuNullDevice().settings().reportFMA_) { opts << " -mfast-fma"; } @@ -206,10 +206,10 @@ bool NullProgram::compileImpl(const std::string& src, pos = newOpt.find("-fno-bin-llvmir"); } - err = aclCompile(dev().amdilCompiler(), bin, newOpt.c_str(), ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, + err = aclCompile(gpuNullDevice().amdilCompiler(), bin, newOpt.c_str(), ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, NULL); - buildLog_ += aclGetCompilerLog(dev().amdilCompiler()); + buildLog_ += aclGetCompilerLog(gpuNullDevice().amdilCompiler()); if (err != ACL_SUCCESS) { LogWarning("aclCompile failed"); @@ -218,7 +218,7 @@ bool NullProgram::compileImpl(const std::string& src, } size_t len = 0; - const void* ir = aclExtractSection(dev().amdilCompiler(), bin, &len, aclLLVMIR, &err); + const void* ir = aclExtractSection(gpuNullDevice().amdilCompiler(), bin, &len, aclLLVMIR, &err); if (err != ACL_SUCCESS) { LogWarning("aclExtractSection failed"); aclBinaryFini(bin); @@ -284,7 +284,7 @@ int NullProgram::compileBinaryToIL(amd::option::Options* options) { } if (ACL_SUCCESS != - aclInsertSection(dev().amdilCompiler(), bin, llvmBinary_.data(), llvmBinary_.size(), spirFlag)) { + aclInsertSection(gpuNullDevice().amdilCompiler(), bin, llvmBinary_.data(), llvmBinary_.size(), spirFlag)) { LogWarning("aclInsertSection failed"); aclBinaryFini(bin); return CL_BUILD_PROGRAM_FAILURE; @@ -294,7 +294,7 @@ int NullProgram::compileBinaryToIL(amd::option::Options* options) { std::string optionStr = options->origOptionStr; if 
(options->origOptionStr.find("kernel-arg-alignment") == std::string::npos) { char s[256]; - sprintf(s, " -Wb,-kernel-arg-alignment=%d", dev().info().memBaseAddrAlign_ / 8); + sprintf(s, " -Wb,-kernel-arg-alignment=%d", gpuNullDevice().info().memBaseAddrAlign_ / 8); optionStr += s; } @@ -308,8 +308,8 @@ int NullProgram::compileBinaryToIL(amd::option::Options* options) { type = ACL_TYPE_ISA; } - err = aclCompile(dev().amdilCompiler(), bin, optionStr.c_str(), aclTypeBinaryUsed, type, NULL); - buildLog_ += aclGetCompilerLog(dev().amdilCompiler()); + err = aclCompile(gpuNullDevice().amdilCompiler(), bin, optionStr.c_str(), aclTypeBinaryUsed, type, NULL); + buildLog_ += aclGetCompilerLog(gpuNullDevice().amdilCompiler()); if (err != ACL_SUCCESS) { LogWarning("aclCompile failed"); @@ -332,7 +332,7 @@ int NullProgram::compileBinaryToIL(amd::option::Options* options) { } size_t len = 0; - const void* amdil = aclExtractSection(dev().amdilCompiler(), bin, &len, aclCODEGEN, &err); + const void* amdil = aclExtractSection(gpuNullDevice().amdilCompiler(), bin, &len, aclCODEGEN, &err); if (err != ACL_SUCCESS) { LogWarning("aclExtractSection failed"); aclBinaryFini(bin); diff --git a/rocclr/device/gpu/gpukernel.cpp b/rocclr/device/gpu/gpukernel.cpp index 005e5a7b7b..6e26afc454 100644 --- a/rocclr/device/gpu/gpukernel.cpp +++ b/rocclr/device/gpu/gpukernel.cpp @@ -3045,7 +3045,7 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) { HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions, uint extraArgsNum) - : device::Kernel(prog->dev(), name, *prog), + : device::Kernel(prog->device(), name, *prog), compileOptions_(compileOptions), index_(0), code_(NULL), diff --git a/rocclr/device/gpu/gpuprogram.cpp b/rocclr/device/gpu/gpuprogram.cpp index af3616eddf..0e1cd6bd94 100644 --- a/rocclr/device/gpu/gpuprogram.cpp +++ b/rocclr/device/gpu/gpuprogram.cpp @@ -115,7 +115,7 @@ NullKernel* Program::createKernel(const std::string& name, const 
Kernel::InitDat *created = false; // Create a GPU kernel - Kernel* gpuKernel = new Kernel(name, static_cast(device()), *this, initData); + Kernel* gpuKernel = new Kernel(name, gpuDevice(), *this, initData); if (gpuKernel == NULL) { buildLog_ += "new Kernel() failed"; @@ -512,7 +512,7 @@ bool NullProgram::linkImpl(const std::vector& inputPrograms, } else { aclTypeUsed = aclLLVMIR; } - err = aclInsertSection(dev().amdilCompiler(), libs[i], llvmBinaries[i]->data(), + err = aclInsertSection(gpuNullDevice().amdilCompiler(), libs[i], llvmBinaries[i]->data(), llvmBinaries[i]->size(), aclTypeUsed); if (err != ACL_SUCCESS) { LogWarning("aclInsertSection failed"); @@ -529,10 +529,10 @@ bool NullProgram::linkImpl(const std::vector& inputPrograms, unsigned int numLibs = libs.size() - 1; if (numLibs > 0) { - err = aclLink(dev().amdilCompiler(), libs[0], numLibs, &libs[1], ACL_TYPE_LLVMIR_BINARY, + err = aclLink(gpuNullDevice().amdilCompiler(), libs[0], numLibs, &libs[1], ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL); - buildLog_ += aclGetCompilerLog(dev().amdilCompiler()); + buildLog_ += aclGetCompilerLog(gpuNullDevice().amdilCompiler()); if (err != ACL_SUCCESS) { LogWarning("aclLink failed"); @@ -549,7 +549,7 @@ bool NullProgram::linkImpl(const std::vector& inputPrograms, } else { aclTypeUsed = aclLLVMIR; } - const void* llvmir = aclExtractSection(dev().amdilCompiler(), libs[0], &size, aclTypeUsed, &err); + const void* llvmir = aclExtractSection(gpuNullDevice().amdilCompiler(), libs[0], &size, aclTypeUsed, &err); if (err != ACL_SUCCESS) { LogWarning("aclExtractSection failed"); break; @@ -1456,13 +1456,13 @@ bool Program::allocGlobalData(const void* globalData, size_t dataSize, uint inde // so possible reallocation won't occur twice or // another thread could destroy a heap block, // while we didn't finish allocation - amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock k(gpuDevice().lockAsyncOps()); // Allocate memory for the global data store - glbData_ = 
dev().createScratchBuffer(amd::alignUp(dataSize, 0x1000)); + glbData_ = gpuDevice().createScratchBuffer(amd::alignUp(dataSize, 0x1000)); dataStore = glbData_; } else { - dataStore = new Memory(dev(), amd::alignUp(dataSize, ConstBuffer::VectorSize)); + dataStore = new Memory(gpuDevice(), amd::alignUp(dataSize, ConstBuffer::VectorSize)); // Initialize constant buffer if ((dataStore == NULL) || !dataStore->create(Resource::RemoteUSWC)) { @@ -1478,7 +1478,7 @@ bool Program::allocGlobalData(const void* globalData, size_t dataSize, uint inde static const bool Entire = true; amd::Coord3D origin(0, 0, 0); amd::Coord3D region(dataSize); - result = dev().xferMgr().writeBuffer(globalData, *dataStore, origin, region, Entire); + result = gpuDevice().xferMgr().writeBuffer(globalData, *dataStore, origin, region, Entire); } return result; @@ -1505,7 +1505,7 @@ HSAILProgram::HSAILProgram(Device& device, amd::Program& owner) maxScratchRegs_(0), executable_(NULL), loaderContext_(this) { - machineTarget_ = dev().hwInfo()->targetName_; + machineTarget_ = gpuNullDevice().hwInfo()->targetName_; loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); } @@ -1517,7 +1517,7 @@ HSAILProgram::HSAILProgram(NullDevice& device, amd::Program& owner) executable_(NULL), loaderContext_(this) { isNull_ = true; - machineTarget_ = dev().hwInfo()->targetName_; + machineTarget_ = gpuNullDevice().hwInfo()->targetName_; loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); } @@ -1578,9 +1578,9 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) { // 1. 
if the program is created with binary and contains only hsail text case ACL_TYPE_HSAIL_TEXT: { std::string curOptions = options->origOptionStr + hsailOptions(); - errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, curOptions.c_str(), + errorCode = aclCompile(gpuNullDevice().hsaCompiler(), binaryElf_, curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, NULL); - buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); + buildLog_ += aclGetCompilerLog(gpuNullDevice().hsaCompiler()); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: BRIG code generation failed.\n"; return false; @@ -1602,12 +1602,12 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) { std::string fin_options(options->origOptionStr + hsailOptions()); // Append an option so that we can selectively enable a SCOption on CZ // whenever IOMMUv2 is enabled. - if (dev().settings().svmFineGrainSystem_) { + if (gpuNullDevice().settings().svmFineGrainSystem_) { fin_options.append(" -sc-xnack-iommu"); } - errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG, + errorCode = aclCompile(gpuNullDevice().hsaCompiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL); - buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); + buildLog_ += aclGetCompilerLog(gpuNullDevice().hsaCompiler()); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: BRIG finalization to ISA failed.\n"; return false; @@ -1625,7 +1625,7 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) { size_t size = 0; hsa_code_object_t code_object; code_object.handle = reinterpret_cast( - aclExtractSection(dev().hsaCompiler(), binaryElf_, &size, aclTEXT, &errorCode)); + aclExtractSection(gpuNullDevice().hsaCompiler(), binaryElf_, &size, aclTEXT, &errorCode)); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n"; return false; @@ -1638,14 +1638,14 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) { } size_t 
kernelNamesSize = 0; errorCode = - aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize); + aclQueryInfo(gpuNullDevice().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; return false; } if (kernelNamesSize > 0) { char* kernelNames = new char[kernelNamesSize]; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, + errorCode = aclQueryInfo(gpuNullDevice().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; @@ -1661,7 +1661,7 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) { for (const auto& it : vKernels) { std::string kernelName(it); std::string openclKernelName = Kernel::openclMangledName(kernelName); - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS, + errorCode = aclQueryInfo(gpuNullDevice().hsaCompiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS, openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs); if (errorCode != ACL_SUCCESS) { @@ -1698,7 +1698,7 @@ bool HSAILProgram::linkImpl(amd::option::Options* options) { } // Save the binary in the interface class saveBinaryAndSetType(TYPE_EXECUTABLE); - buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); + buildLog_ += aclGetCompilerLog(gpuNullDevice().hsaCompiler()); return true; } @@ -1708,13 +1708,13 @@ std::string HSAILProgram::hsailOptions() { std::string hsailOptions; // Set options for the standard device specific options // All our devices support these options now - if (dev().settings().reportFMAF_) { + if (gpuNullDevice().settings().reportFMAF_) { hsailOptions.append(" -DFP_FAST_FMAF=1"); } - if (dev().settings().reportFMA_) { + if (gpuNullDevice().settings().reportFMA_) { 
hsailOptions.append(" -DFP_FAST_FMA=1"); } - if (!dev().settings().singleFpDenorm_) { + if (!gpuNullDevice().settings().singleFpDenorm_) { hsailOptions.append(" -cl-denorms-are-zero"); } @@ -1738,7 +1738,7 @@ std::string HSAILProgram::hsailOptions() { bool HSAILProgram::allocKernelTable() { uint size = kernels().size() * sizeof(size_t); - kernels_ = new gpu::Memory(dev(), size); + kernels_ = new gpu::Memory(gpuDevice(), size); // Initialize kernel table if ((kernels_ == NULL) || !kernels_->create(Resource::RemoteUSWC)) { delete kernels_; @@ -1767,7 +1767,7 @@ const aclTargetInfo& HSAILProgram::info(const char* str) { arch = "hsail64"; } info_ = aclGetTargetInfo(arch.c_str(), - (str && str[0] == '\0' ? dev().hwInfo()->targetName_ : str), &err); + (str && str[0] == '\0' ? gpuNullDevice().hwInfo()->targetName_ : str), &err); if (err != ACL_SUCCESS) { LogWarning("aclGetTargetInfo failed"); } @@ -1867,7 +1867,7 @@ hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char* name) { } bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { - uint dev_gfxip = program_->dev().hwInfo()->gfxipVersion_; + uint dev_gfxip = program_->gpuNullDevice().hwInfo()->gfxipVersion_; uint isa_gfxip = isa.handle; switch (dev_gfxip) { case gfx700: @@ -1892,7 +1892,7 @@ bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) case gfx602: default: LogPrintfError("Unsupported gfxip version gfx%d", dev_gfxip); - return false; + return false; } } @@ -2024,7 +2024,7 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate( assert(false); return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - gpu::Sampler* sampler = new gpu::Sampler(program_->dev()); + gpu::Sampler* sampler = new gpu::Sampler(program_->gpuDevice()); if (!sampler || !sampler->create(state)) { delete sampler; return HSA_STATUS_ERROR; @@ -2075,15 +2075,15 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { return new char[size]; } - gpu::Memory* mem = new 
gpu::Memory(program_->dev(), amd::alignUp(size, align)); + gpu::Memory* mem = new gpu::Memory(program_->gpuDevice(), amd::alignUp(size, align)); if (!mem || !mem->create(gpu::Resource::Local)) { delete mem; return NULL; } - assert(program_->dev().xferQueue()); + assert(program_->gpuDevice().xferQueue()); if (zero) { char pattern = 0; - program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), + program_->gpuDevice().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size)); } program_->addGlobalStore(mem); @@ -2102,9 +2102,9 @@ bool ORCAHSALoaderContext::GpuMemCopy(void* dst, size_t offset, const void* src, memcpy(reinterpret_cast
(dst) + offset, src, size); return true; } - assert(program_->dev().xferQueue()); + assert(program_->gpuDevice().xferQueue()); gpu::Memory* mem = reinterpret_cast(dst); - return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), + return program_->gpuDevice().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true); return true; } diff --git a/rocclr/device/gpu/gpuprogram.hpp b/rocclr/device/gpu/gpuprogram.hpp index 17796d89af..05f4569a5c 100644 --- a/rocclr/device/gpu/gpuprogram.hpp +++ b/rocclr/device/gpu/gpuprogram.hpp @@ -289,7 +289,7 @@ class NullProgram : public device::Program { ); //! Return a typecasted GPU device - gpu::NullDevice& dev() { + gpu::NullDevice& gpuNullDevice() { return const_cast(static_cast(device())); } @@ -339,10 +339,7 @@ class Program : public NullProgram { //! Returns pritnf info array const std::vector& printfInfo() const { return printf_; } - //! Return a typecasted GPU device - gpu::Device& dev() { return const_cast(static_cast(device())); } - -protected: + protected: private: //! Disable copy constructor Program(const Program&); @@ -350,6 +347,11 @@ protected: //! Disable operator= Program& operator=(const Program&); + //! Return a typecasted GPU device + gpu::Device& gpuDevice() { + return const_cast(static_cast(device())); + } + HwConstBuffers constBufs_; //!< Constant buffers for the global store gpu::Memory* glbData_; //!< Global data store }; @@ -467,8 +469,15 @@ class HSAILProgram : public device::Program { const std::vector& globalStores() const { return globalStores_; } - //! Return a typecasted GPU device - gpu::Device& dev() { return const_cast(static_cast(device())); } + //! Return a typecasted GPU null device. + gpu::NullDevice& gpuNullDevice() { + return const_cast(static_cast(device())); + } + + //! Return a typecasted GPU device. The device must not be the NullDevice. + gpu::Device& gpuDevice() { + return const_cast(static_cast(device())); + } //! 
Returns GPU kernel table const Memory* kernelTable() const { return kernels_; } diff --git a/rocclr/device/pal/palkernel.cpp b/rocclr/device/pal/palkernel.cpp index b0fdea8eff..7fcdb6b91f 100644 --- a/rocclr/device/pal/palkernel.cpp +++ b/rocclr/device/pal/palkernel.cpp @@ -43,8 +43,8 @@ void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize, // Make sure runtime matches HW alignment, which is 256 scratch regs (DWORDs) per wave constexpr uint32_t ScratchRegAlignment = 256; workGroupInfo_.scratchRegs_ = - amd::alignUp((workGroupInfo_.scratchRegs_ * dev().info().wavefrontWidth_), - ScratchRegAlignment) / dev().info().wavefrontWidth_; + amd::alignUp((workGroupInfo_.scratchRegs_ * device().info().wavefrontWidth_), + ScratchRegAlignment) / device().info().wavefrontWidth_; workGroupInfo_.privateMemSize_ = workGroupInfo_.scratchRegs_ * sizeof(uint32_t); workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize; workGroupInfo_.usedSGPRs_ = numSGPRs; @@ -52,13 +52,13 @@ void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize, workGroupInfo_.usedVGPRs_ = numVGPRs; if (!prog().isNull()) { - workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu; + workGroupInfo_.availableLDSSize_ = palDevice().properties().gfxipProperties.shaderCore.ldsSizePerCu; workGroupInfo_.availableSGPRs_ = - dev().properties().gfxipProperties.shaderCore.numAvailableSgprs; + palDevice().properties().gfxipProperties.shaderCore.numAvailableSgprs; workGroupInfo_.availableVGPRs_ = - dev().properties().gfxipProperties.shaderCore.numAvailableVgprs; + palDevice().properties().gfxipProperties.shaderCore.numAvailableVgprs; workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = - dev().info().wavefrontWidth_; + device().info().wavefrontWidth_; } else { workGroupInfo_.availableLDSSize_ = 64 * Ki; workGroupInfo_.availableSGPRs_ = 104; @@ -105,7 +105,7 @@ bool 
HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { } HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions) - : device::Kernel(prog->dev(), name, *prog), + : device::Kernel(prog->device(), name, *prog), compileOptions_(compileOptions), index_(0), code_(0), @@ -128,12 +128,12 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { options.append(openClKernelName.c_str()); // Append an option so that we can selectively enable a SCOption on CZ // whenever IOMMUv2 is enabled. - if (dev().settings().svmFineGrainSystem_) { + if (palNullDevice().settings().svmFineGrainSystem_) { options.append(" -sc-xnack-iommu"); } - error = aclCompile(dev().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG, + error = aclCompile(palNullDevice().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr); - buildLog_ += aclGetCompilerLog(dev().compiler()); + buildLog_ += aclGetCompilerLog(palNullDevice().compiler()); if (error != ACL_SUCCESS) { LogError("Failed to finalize kernel"); return false; @@ -144,7 +144,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { // Pull out metadata from the ELF size_t sizeOfArgList; - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfArgList); if (error != ACL_SUCCESS) { return false; @@ -154,7 +154,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { if (nullptr == aclArgList) { return false; } - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList); if (error != ACL_SUCCESS) { return false; @@ -164,30 +164,30 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { 
delete[] aclArgList; size_t sizeOfWorkGroupSize; - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize); if (error != ACL_SUCCESS) { return false; } - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize); if (error != ACL_SUCCESS) { return false; } // Copy wavefront size - workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_; + workGroupInfo_.wavefrontSize_ = device().info().wavefrontWidth_; // Find total workgroup size if (workGroupInfo_.compileSize_[0] != 0) { workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * workGroupInfo_.compileSize_[2]; } else { - workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_; + workGroupInfo_.size_ = device().info().preferredWorkGroupSize_; } // Pull out printf metadata from the ELF size_t sizeOfPrintfList; - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfPrintfList); if (error != ACL_SUCCESS) { return false; @@ -199,7 +199,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { if (nullptr == aclPrintfList) { return false; } - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList); if (error != ACL_SUCCESS) { return false; @@ -213,7 +213,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { aclMetadata md; md.enqueue_kernel = false; size_t 
sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel); - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE, openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue); if (error != ACL_SUCCESS) { return false; @@ -222,7 +222,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { md.kernel_index = -1; size_t sizeOfIndex = sizeof(md.kernel_index); - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_KERNEL_INDEX, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_KERNEL_INDEX, openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex); if (error != ACL_SUCCESS) { return false; @@ -230,7 +230,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { index_ = md.kernel_index; size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint); if (error != ACL_SUCCESS) { @@ -240,7 +240,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { waveLimiter_.enable(); size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_); - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT, openClKernelName.c_str(), workGroupInfo_.compileSizeHint_, &sizeOfWorkGroupSizeHint); if (error != ACL_SUCCESS) { @@ -248,7 +248,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { } size_t sizeOfVecTypeHint; - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), 
RT_VEC_TYPE_HINT, openClKernelName.c_str(), NULL, &sizeOfVecTypeHint); if (error != ACL_SUCCESS) { return false; @@ -259,7 +259,7 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { if (NULL == VecTypeHint) { return false; } - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, + error = aclQueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint); if (error != ACL_SUCCESS) { return false; @@ -273,8 +273,6 @@ bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { return true; } -const Device& HSAILKernel::dev() const { return reinterpret_cast(dev_); } - const HSAILProgram& HSAILKernel::prog() const { return reinterpret_cast(prog_); } @@ -476,7 +474,7 @@ bool LightningKernel::init() { workGroupInfo()->usedSGPRs_, workGroupInfo()->usedVGPRs_); // Copy wavefront size - workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_; + workGroupInfo_.wavefrontSize_ = device().info().wavefrontWidth_; if (workGroupInfo_.size_ == 0) { return false; diff --git a/rocclr/device/pal/palkernel.hpp b/rocclr/device/pal/palkernel.hpp index c5d067a94d..71b5373c03 100644 --- a/rocclr/device/pal/palkernel.hpp +++ b/rocclr/device/pal/palkernel.hpp @@ -68,8 +68,11 @@ class HSAILKernel : public device::Kernel { //! finalizes the kernel if needed bool init(amd::hsa::loader::Symbol* sym, bool finalize = false); - //! Returns PAL device object, associated with this kernel - const Device& dev() const; + //! Returns PAL, possibly null, device object, associated with this kernel. + const NullDevice& palNullDevice() const { return reinterpret_cast(dev_); } + + //! Returns PAL device object, associated with this kernel which must not be the null device. + const Device& palDevice() const { return reinterpret_cast(dev_); } //! 
Returns HSA program associated with this kernel const HSAILProgram& prog() const; diff --git a/rocclr/device/pal/palprogram.cpp b/rocclr/device/pal/palprogram.cpp index 16e2f678e1..76eecb8690 100644 --- a/rocclr/device/pal/palprogram.cpp +++ b/rocclr/device/pal/palprogram.cpp @@ -70,8 +70,8 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t bool zero) { align = amd::alignUp(align, sizeof(uint32_t)); - amd::Memory* amd_mem_obj = new (prog.dev().context()) - amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align), + amd::Memory* amd_mem_obj = new (prog.palDevice().context()) + amd::Buffer(prog.palDevice().context(), 0, amd::alignUp(size, align), // HIP requires SVM allocation for segment code due to possible global variable // access and global variables are a part of code segment with the latest loader amd::IS_HIP ? reinterpret_cast(1) : nullptr); @@ -87,11 +87,11 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t return false; } - gpuAccess_ = static_cast(amd_mem_obj->getDeviceMemory(prog.dev(), false)); + gpuAccess_ = static_cast(amd_mem_obj->getDeviceMemory(prog.palDevice(), false)); if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) { void* ptr = nullptr; - cpuAccess_ = new pal::Memory(prog.dev(), amd::alignUp(size, align)); + cpuAccess_ = new pal::Memory(prog.palDevice(), amd::alignUp(size, align)); if ((cpuAccess_ == nullptr) || !cpuAccess_->create(pal::Resource::Remote)) { delete cpuAccess_; cpuAccess_ = nullptr; @@ -111,8 +111,8 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t if ((cpuAccess_ == nullptr) && zero && !prog.isInternal()) { uint64_t pattern = 0; size_t patternSize = ((size % sizeof(pattern)) == 0) ? 
sizeof(pattern) : 1; - prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0), - amd::Coord3D(size)); + prog.palDevice().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0), + amd::Coord3D(size)); } switch (segment) { @@ -266,7 +266,7 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_ } size_t kernelNamesSize = 0; - acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, + acl_error errorCode = aclQueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; @@ -274,7 +274,7 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_ } if (kernelNamesSize > 0) { char* kernelNames = new char[kernelNamesSize]; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, + errorCode = aclQueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; @@ -326,7 +326,7 @@ bool HSAILProgram::createBinary(amd::option::Options* options) { return true; } bool HSAILProgram::allocKernelTable() { uint size = kernels().size() * sizeof(size_t); - kernels_ = new pal::Memory(dev(), size); + kernels_ = new pal::Memory(palDevice(), size); // Initialize kernel table if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) { delete kernels_; @@ -352,7 +352,7 @@ const aclTargetInfo& HSAILProgram::info(const char* str) { arch = "hsail64"; } info_ = aclGetTargetInfo(arch.c_str(), - (str && str[0] == '\0' ? dev().hwInfo()->machineTarget_ : str), &err); + (str && str[0] == '\0' ? 
palNullDevice().hwInfo()->machineTarget_ : str), &err); if (err != ACL_SUCCESS) { LogWarning("aclGetTargetInfo failed"); } @@ -523,9 +523,9 @@ hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) { } bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { - uint32_t gfxipVersion = program_->dev().settings().useLightning_ - ? program_->dev().hwInfo()->gfxipVersionLC_ - : program_->dev().hwInfo()->gfxipVersion_; + uint32_t gfxipVersion = program_->palNullDevice().settings().useLightning_ + ? program_->palNullDevice().hwInfo()->gfxipVersionLC_ + : program_->palNullDevice().hwInfo()->gfxipVersion_; uint32_t majorSrc = gfxipVersion / 10; uint32_t minorSrc = gfxipVersion % 10; @@ -671,7 +671,7 @@ hsa_status_t PALHSALoaderContext::SamplerCreate( assert(false); return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - pal::Sampler* sampler = new pal::Sampler(program_->dev()); + pal::Sampler* sampler = new pal::Sampler(program_->palDevice()); if (!sampler || !sampler->create(state)) { delete sampler; return HSA_STATUS_ERROR; diff --git a/rocclr/device/pal/palprogram.hpp b/rocclr/device/pal/palprogram.hpp index 3027107fc8..57aa9cc40e 100644 --- a/rocclr/device/pal/palprogram.hpp +++ b/rocclr/device/pal/palprogram.hpp @@ -157,8 +157,15 @@ class HSAILProgram : public device::Program { const std::vector& globalStores() const { return globalStores_; } - //! Return a typecasted PAL device - pal::Device& dev() { return const_cast(static_cast(device())); } + //! Return a typecasted PAL null device. + pal::NullDevice& palNullDevice() { + return const_cast(static_cast(device())); + } + + //! Return a typecasted PAL device. The device must not be the NullDevice. + pal::Device& palDevice() { + return const_cast(static_cast(device())); + } //! 
Returns GPU kernel table const Memory* kernelTable() const { return kernels_; } @@ -241,7 +248,7 @@ class LightningProgram : public HSAILProgram { LightningProgram(NullDevice& device, amd::Program& owner) : HSAILProgram(device, owner) { isLC_ = true; isHIP_ = (owner.language() == amd::Program::HIP); - machineTarget_ = dev().hwInfo()->machineTargetLC_; + machineTarget_ = palNullDevice().hwInfo()->machineTargetLC_; } LightningProgram(Device& device, amd::Program& owner) : HSAILProgram(device, owner) { diff --git a/rocclr/device/rocm/rockernel.cpp b/rocclr/device/rocm/rockernel.cpp index 5c5eaca48f..25498a4e12 100644 --- a/rocclr/device/rocm/rockernel.cpp +++ b/rocclr/device/rocm/rockernel.cpp @@ -31,7 +31,7 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle const uint32_t workgroupGroupSegmentByteSize, const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, const uint32_t kernargSegmentAlignment) - : device::Kernel(prog->dev(), name, *prog) { + : device::Kernel(prog->device(), name, *prog) { kernelCodeHandle_ = kernelCodeHandle; workgroupGroupSegmentByteSize_ = workgroupGroupSegmentByteSize; workitemPrivateSegmentByteSize_ = workitemPrivateSegmentByteSize; @@ -40,7 +40,7 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle } Kernel::Kernel(std::string name, Program* prog) - : device::Kernel(prog->dev(), name, *prog) { + : device::Kernel(prog->device(), name, *prog) { } #if defined(USE_COMGR_LIBRARY) @@ -57,10 +57,10 @@ bool LightningKernel::init() { symbolName_ = name(); } kernargSegmentAlignment_ = - amd::alignUp(std::max(kernargSegmentAlignment_, 128u), dev().info().globalMemCacheLineSize_); + amd::alignUp(std::max(kernargSegmentAlignment_, 128u), device().info().globalMemCacheLineSize_); // Set the workgroup information for the kernel - workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_; + workGroupInfo_.availableLDSSize_ = 
device().info().localMemSizePerCU_; assert(workGroupInfo_.availableLDSSize_ > 0); if (!SetAvailableSgprVgpr()) { @@ -155,7 +155,7 @@ bool LightningKernel::init() { workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_; workGroupInfo_.preferredSizeMultiple_ = wavefront_size; workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size; + workGroupInfo_.wavefrontPerSIMD_ = program()->rocDevice().info().maxWorkItemSizes_[0] / wavefront_size; workGroupInfo_.wavefrontSize_ = wavefront_size; if (workGroupInfo_.size_ == 0) { return false; @@ -181,7 +181,7 @@ bool HSAILKernel::init() { hsa_agent_t hsaDevice = program()->hsaDevice(); // Pull out metadata from the ELF size_t sizeOfArgList; - aclCompiler* compileHandle = program()->dev().compiler(); + aclCompiler* compileHandle = program()->rocDevice().compiler(); std::string openClKernelName("&__OpenCL_" + name() + "_kernel"); errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfArgList); @@ -202,7 +202,7 @@ bool HSAILKernel::init() { // Set the workgroup information for the kernel memset(&workGroupInfo_, 0, sizeof(workGroupInfo_)); - workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_; + workGroupInfo_.availableLDSSize_ = program()->rocDevice().info().localMemSizePerCU_; assert(workGroupInfo_.availableLDSSize_ > 0); workGroupInfo_.availableSGPRs_ = 104; workGroupInfo_.availableVGPRs_ = 256; @@ -250,13 +250,13 @@ bool HSAILKernel::init() { } workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size; + workGroupInfo_.wavefrontPerSIMD_ = program()->rocDevice().info().maxWorkItemSizes_[0] / wavefront_size; workGroupInfo_.wavefrontSize_ = wavefront_size; if (workGroupInfo_.compileSize_[0] != 0) { workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * 
workGroupInfo_.compileSize_[1] * workGroupInfo_.compileSize_[2]; } else { - workGroupInfo_.size_ = program()->dev().info().preferredWorkGroupSize_; + workGroupInfo_.size_ = program()->rocDevice().info().preferredWorkGroupSize_; } // Pull out printf metadata from the ELF diff --git a/rocclr/device/rocm/rocprogram.cpp b/rocclr/device/rocm/rocprogram.cpp index 512bf4ee2c..cb766f9aba 100644 --- a/rocclr/device/rocm/rocprogram.cpp +++ b/rocclr/device/rocm/rocprogram.cpp @@ -127,7 +127,7 @@ bool Program::initClBinary(char* binaryIn, size_t size) { bool Program::defineGlobalVar(const char* name, void* dptr) { hsa_status_t status = HSA_STATUS_SUCCESS; - hsa_agent_t hsa_device = dev().getBackendDevice(); + hsa_agent_t hsa_device = rocDevice().getBackendDevice(); status = hsa_executable_agent_global_variable_define(hsaExecutable_, hsa_device, name, dptr); if (status != HSA_STATUS_SUCCESS) { @@ -153,7 +153,7 @@ bool Program::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, return false; } - hsa_device= dev().getBackendDevice(); + hsa_device = rocDevice().getBackendDevice(); /* Find HSA Symbol by name */ status = hsa_executable_get_symbol_by_name(hsaExecutable_, global_name, &hsa_device, @@ -206,7 +206,7 @@ bool Program::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, return false; } - roc_device = static_cast(&dev()); + roc_device = &(rocDevice()); *amd_mem_obj = new(roc_device->context()) amd::Buffer(roc_device->context(), 0, *bytes, *device_pptr); @@ -227,7 +227,7 @@ bool Program::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, } HSAILProgram::HSAILProgram(roc::NullDevice& device, amd::Program& owner) : roc::Program(device, owner) { - machineTarget_ = dev().deviceInfo().machineTarget_; + machineTarget_ = rocNullDevice().deviceInfo().machineTarget_; } HSAILProgram::~HSAILProgram() { @@ -268,7 +268,7 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_ #if defined(WITH_COMPILER_LIB) // Stop 
compilation if it is an offline device - HSA runtime does not // support ISA compiled offline - if (!dev().isOnline()) { + if (!device().isOnline()) { return true; } @@ -294,7 +294,7 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_ return false; } - hsa_agent_t hsaDevice = dev().getBackendDevice(); + hsa_agent_t hsaDevice = rocDevice().getBackendDevice(); status = hsa_executable_load_agent_code_object(hsaExecutable_, hsaDevice, hsaCodeObjectReader_, nullptr, nullptr); if (status != HSA_STATUS_SUCCESS) { @@ -432,7 +432,7 @@ LightningProgram::LightningProgram(roc::NullDevice& device, amd::Program& owner) : roc::Program(device, owner) { isLC_ = true; isHIP_ = (owner.language() == amd::Program::HIP); - machineTarget_ = dev().deviceInfo().machineTargetLC_; + machineTarget_ = rocNullDevice().deviceInfo().machineTargetLC_; } bool LightningProgram::createBinary(amd::option::Options* options) { @@ -477,7 +477,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s return false; } - hsa_agent_t agent = dev().getBackendDevice(); + hsa_agent_t agent = rocDevice().getBackendDevice(); hsa_status_t status; status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, diff --git a/rocclr/device/rocm/rocprogram.hpp b/rocclr/device/rocm/rocprogram.hpp index b33448c881..f8f9691419 100644 --- a/rocclr/device/rocm/rocprogram.hpp +++ b/rocclr/device/rocm/rocprogram.hpp @@ -49,10 +49,13 @@ class Program : public device::Program { virtual bool initClBinary(char* binaryIn, size_t size); //! Return a typecasted GPU device - const NullDevice& dev() const { return static_cast(device()); } + const NullDevice& rocNullDevice() const { return static_cast(device()); } + + //! Return a typecasted GPU (non-null) device. The device must not be the NullDevice. + const Device& rocDevice() const { return static_cast(device()); } //!
Returns the hsaBinary associated with the program - hsa_agent_t hsaDevice() const { return dev().getBackendDevice(); } + hsa_agent_t hsaDevice() const { return rocNullDevice().getBackendDevice(); } hsa_executable_t hsaExecutable() const { return hsaExecutable_; }