diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp index 0e67f8117c..56c5f63dbb 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp @@ -801,6 +801,12 @@ KernelBlitManager::create(amd::Device& device) bool KernelBlitManager::createProgram(Device& device) { + if (device.blitProgram() == nullptr) { + if (!device.createBlitProgram()) { + return false; + } + } + std::vector devices; devices.push_back(&device); diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp index 16b845e29d..d20b1a3513 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp @@ -478,7 +478,7 @@ void NullDevice::fillDeviceInfo( info_.numAsyncQueues_ = numComputeRings; info_.numRTQueues_ = palProp.engineProperties[Pal::EngineTypeExclusiveCompute].engineCount - 1; - info_.numRTCUs_ = 0x8; + info_.numRTCUs_ = 8; //palProp.engineProperties[Pal::EngineTypeExclusiveCompute].maxNumDedicatedCu; info_.threadTraceEnable_ = settings().threadTraceEnable_; } @@ -718,6 +718,8 @@ Device::create(Pal::IDevice* device) palSettings->forceHighClocks = appProfile_.enableHighPerformanceState(); palSettings->longRunningSubmissions = true; palSettings->cmdBufBatchedSubmitChainLimit = 0; + //palSettings->disableResourceProcessingManager = true; + //palSettings->disableScManager = true; // Commit the new settings for the device result = iDev()->CommitSettingsAndInit(); @@ -885,28 +887,6 @@ Device::initializeHeapResources() } } - // Delay compilation due to brig_loader memory allocation - const char* scheduler = nullptr; - const char* ocl20 = nullptr; -#if !defined(WITH_LIGHTNING_COMPILER) - std::string sch = SchedulerSourceCode; - if (settings().oclVersion_ == OpenCL20) { - size_t loc = sch.find("%s"); - sch.replace(loc, 2, iDev()->GetDispatchKernelSource()); - scheduler = sch.c_str(); - ocl20 = "-cl-std=CL2.0"; - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - blitProgram_ = new BlitProgram(context_); - // Create blit programs - if (blitProgram_ == nullptr || - !blitProgram_->create(this, scheduler, ocl20)) { - delete blitProgram_; - blitProgram_ = nullptr; - LogError("Couldn't create blit kernels!"); - return false; - } - // Create a synchronized transfer queue xferQueue_ = new VirtualGPU(*this); if (!(xferQueue_ && xferQueue_->create( @@ -2176,6 +2156,35 @@ Device::updateFreeMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free) } } +bool +Device::createBlitProgram() +{ + bool result = true; + + // Delayed compilation due to brig_loader memory allocation + const char* scheduler = nullptr; + const char* ocl20 = nullptr; +#if !defined(WITH_LIGHTNING_COMPILER) + std::string sch = SchedulerSourceCode; + if (settings().oclVersion_ == OpenCL20) { + size_t loc = sch.find("%s"); + sch.replace(loc, 2, iDev()->GetDispatchKernelSource()); + scheduler = sch.c_str(); + ocl20 = "-cl-std=CL2.0"; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + blitProgram_ = new BlitProgram(context_); + // Create blit programs + if (blitProgram_ == nullptr || + !blitProgram_->create(this, scheduler, ocl20)) { + delete blitProgram_; + blitProgram_ = nullptr; + LogError("Couldn't create blit kernels!"); + result = false; + } + return result; +} + void Device::SrdManager::fillResourceList(std::vector& memList) { diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp index 96ae0bbd85..0ba0a18298 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp @@ -482,6 +482,9 @@ public: bool free //!< TRUE if runtime frees memory ); + //! Create internal blit program + bool createBlitProgram(); + //! Interop for GL device bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const; bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const; diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp index 8848d76c69..43e0cf277c 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -394,22 +394,12 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) if (!prog().isNull()) { code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align)); Resource::MemoryType type = Resource::Local; - if (flags_.internalKernel_) { - type = Resource::RemoteUSWC; - } // Initialize kernel ISA code if (code_ && code_->create(type)) { - if (flags_.internalKernel_) { - address cpuCodePtr = static_cast
(code_->map(nullptr, Resource::WriteOnly)); - // Copy only amd_kernel_code_t - memcpy(cpuCodePtr, reinterpret_cast
(akc), codeSize_); - code_->unmap(nullptr); - } - else { - static_cast(dev().xferMgr()).writeRawData( - *code_, codeSize_, reinterpret_cast(akc)); - } + constexpr bool WaitForUpload = true; + code_->writeRawData(*code_->dev().xferQueue(), 0, codeSize_, + reinterpret_cast(akc), WaitForUpload); } else { LogError("Failed to allocate ISA code!"); diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp index f43b4d96a3..f549db1516 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp @@ -469,8 +469,6 @@ HSAILProgram::linkImpl(amd::option::Options* options) aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; bool finalize = true; bool hsaLoad = true; - internal_ = (compileOptions_.find("-cl-internal-kernel") != - std::string::npos) ? true : false; // If !binaryElf_ then program must have been created using clCreateProgramWithBinary if (!binaryElf_) { @@ -936,7 +934,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { assert(size); assert(align); assert(sizeof(void*) == 8 || sizeof(void*) == 4); - if (program_->isNull() || program_->isInternal()) { + if (program_->isNull()) { return new char[size]; } @@ -962,19 +960,20 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, if (0 == size) { return true; } - if (program_->isNull() || program_->isInternal()) { + if (program_->isNull()) { memcpy(reinterpret_cast
(dst) + offset, src, size); return true; } assert(program_->dev().xferQueue()); pal::Memory* mem = reinterpret_cast(dst); - return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true); + constexpr bool WaitForCopy = true; + mem->writeRawData(*mem->dev().xferQueue(), offset, size, src, WaitForCopy); return true; } void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size) { - if (program_->isNull() || program_->isInternal()) { + if (program_->isNull()) { delete[] reinterpret_cast(ptr); } else { @@ -1020,9 +1019,6 @@ LightningProgram::linkImpl(amd::option::Options *options) { using namespace amd::opencl_driver; - internal_ = (compileOptions_.find("-cl-internal-kernel") != - std::string::npos) ? true : false; - aclType continueCompileFrom = llvmBinary_.empty() ? getNextCompilationStageFromBinary(options) : ACL_TYPE_LLVMIR_BINARY; diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp index 971f4f3848..f1b58cec56 100644 --- a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp @@ -182,9 +182,6 @@ public: //! Returns TRUE if the program just compiled bool isNull() const { return isNull_; } - //! Returns TRUE if the program used internally by runtime - bool isInternal() const { return internal_; } - //! Returns TRUE if the program contains static samplers bool isStaticSampler() const { return (staticSamplers_.size() != 0); } @@ -278,7 +275,6 @@ protected: union { struct { uint32_t isNull_ : 1; //!< Null program no memory allocations - uint32_t internal_ : 1; //!< Internal blit program }; uint32_t flags_; //!< Program flags }; diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp index a49976c29d..e466cfe584 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp @@ -1118,14 +1118,18 @@ Resource::writeRawData( gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast(data)); gpu.eventEnd(MainEngine, event); - setBusy(gpu, event); - // Update the global GPU event - gpu.setGpuEvent(event, false); - if (waitForEvent) { + //! @note: We don't really have to mark the allocations as busy + //! if we are waiting for a transfer + // Wait for event to complete gpu.waitForEvent(&event); } + else { + setBusy(gpu, event); + // Update the global GPU event + gpu.setGpuEvent(event, false); + } } static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) { diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index bfa1a5b032..615970b0e7 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -63,7 +63,7 @@ VirtualGPU::Queue::Create( qCreateInfo.numReservedCu = rtCU; qCreateInfo.engineIndex = 0x0; cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute; - cmdCreateInfo.flags.rtCu = true; + cmdCreateInfo.flags.realtimeComputeUnits = true; } */ // Find queue object size @@ -801,28 +801,6 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, return false; } - // Choose the appropriate class for blit engine - switch (dev().settings().blitEngine_) { - default: - // Fall through ... - case Settings::BlitEngineHost: - blitSetup.disableAll(); - // Fall through ... - case Settings::BlitEngineCAL: - case Settings::BlitEngineKernel: - // use host blit for HW debug - if (dev().settings().enableHwDebug_) { - blitSetup.disableCopyImageToBuffer_ = true; - blitSetup.disableCopyBufferToImage_ = true; - } - blitMgr_ = new KernelBlitManager(*this, blitSetup); - break; - } - if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { - LogError("Could not create BlitManager!"); - return false; - } - tsCache_ = new TimeStampCache(*this); if (nullptr == tsCache_) { LogError("Could not create TimeStamp cache!"); @@ -846,6 +824,28 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, return false; } + // Choose the appropriate class for blit engine + switch (dev().settings().blitEngine_) { + default: + // Fall through ... + case Settings::BlitEngineHost: + blitSetup.disableAll(); + // Fall through ... + case Settings::BlitEngineCAL: + case Settings::BlitEngineKernel: + // use host blit for HW debug + if (dev().settings().enableHwDebug_) { + blitSetup.disableCopyImageToBuffer_ = true; + blitSetup.disableCopyBufferToImage_ = true; + } + blitMgr_ = new KernelBlitManager(*this, blitSetup); + break; + } + if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { + LogError("Could not create BlitManager!"); + return false; + } + return true; }