diff --git a/rocclr/runtime/device/pal/palblit.cpp b/rocclr/runtime/device/pal/palblit.cpp index ab5f713c8b..c9c1ac53d2 100644 --- a/rocclr/runtime/device/pal/palblit.cpp +++ b/rocclr/runtime/device/pal/palblit.cpp @@ -23,8 +23,8 @@ inline void DmaBlitManager::synchronize() const { if (syncOperation_) { - gpu().waitAllEngines(); gpu().releaseMemObjects(); + gpu().waitAllEngines(); } } @@ -2685,6 +2685,18 @@ KernelBlitManager::runScheduler( return result; } +void +KernelBlitManager::writeRawData( + device::Memory& memory, + size_t size, + const void* data + ) const +{ + static_cast(memory).writeRawData(gpu(), size, data, false); + + synchronize(); +} + amd::Memory* DmaBlitManager::pinHostMemory( const void* hostMem, diff --git a/rocclr/runtime/device/pal/palblit.hpp b/rocclr/runtime/device/pal/palblit.hpp index 462842dd60..112d1fb5af 100644 --- a/rocclr/runtime/device/pal/palblit.hpp +++ b/rocclr/runtime/device/pal/palblit.hpp @@ -371,14 +371,21 @@ public: bool entire = false //!< Entire buffer will be updated ) const; - //! Fills an image memory with a pattern data - virtual bool runScheduler( + //! Runs a GPU scheduler for device enqueue + bool runScheduler( device::Memory& vqueue, //!< Memory object for virtual queue device::Memory& params, //!< Extra arguments for the scheduler uint paramIdx, //!< Parameter index uint threads //!< Number of scheduling threads ) const; + //! 
Writes CPU raw data into GPU memory + void writeRawData( + device::Memory& memory, //!< Memory object for data update + size_t size, //!< Size of raw data + const void* data //!< Raw data pointer + ) const; + private: static const size_t MaxXferBuffers = 2; diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index 8c3e1d3e26..90c81e2a6a 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -987,8 +987,8 @@ Device::init() acl_error error; compiler_ = aclCompilerInit(&opts, &error); if (error != ACL_SUCCESS) { - LogError("Error initializing the compiler"); - return false; + LogError("Error initializing the compiler"); + return false; } size_t size = Pal::GetPlatformSize(); @@ -1210,7 +1210,8 @@ Device::createBuffer( // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure. // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()}; - gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true); + static_cast(xferMgr()).writeRawData( + *gpuMemory, sizeof(pipeInit), pipeInit); } // If memory has direct access from host, then get CPU address if (gpuMemory->isHostMemDirectAccess() && diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp index 7b0e47baad..ea82fd075a 100644 --- a/rocclr/runtime/device/pal/palkernel.cpp +++ b/rocclr/runtime/device/pal/palkernel.cpp @@ -677,9 +677,8 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) return false; } - //! 
@todo get the right value; // Copy wavefront size - workGroupInfo_.wavefrontSize_ = 64;//dev().getAttribs().wavefrontSize; + workGroupInfo_.wavefrontSize_ = dev().properties().gfxipProperties.shaderCore.wavefrontSize; // Find total workgroup size if (workGroupInfo_.compileSize_[0] != 0) { workGroupInfo_.size_ = @@ -999,7 +998,7 @@ HSAILKernel::loadArguments( break; } - //! @todo 64 bit isn't supported with 32 bit binary + //! 64 bit isn't supported with 32 bit binary uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset(); WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*)); diff --git a/rocclr/runtime/device/pal/palmemory.cpp b/rocclr/runtime/device/pal/palmemory.cpp index 01b9124fd6..6f03d78cdb 100644 --- a/rocclr/runtime/device/pal/palmemory.cpp +++ b/rocclr/runtime/device/pal/palmemory.cpp @@ -151,14 +151,8 @@ Memory::create( reinterpret_cast(params); // Check if parent was allocated in system memory if ((view->resource_->memoryType() == Resource::Pinned) || - (((view->resource_->memoryType() == Resource::Remote) || - (view->resource_->memoryType() == Resource::RemoteUSWC)) && - // @todo Enable unconditional optimization for remote memory - // Check for external allocation, to avoid the optimization - // for non-VM (double copy) mode - (owner() != nullptr) && - ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) || - dev().settings().remoteAlloc_))) { + (view->resource_->memoryType() == Resource::Remote) || + (view->resource_->memoryType() == Resource::RemoteUSWC)) { // Marks memory object for direct GPU access to the host memory flags_ |= HostMemoryDirectAccess; } @@ -578,10 +572,6 @@ Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) } } - //!@todo A wait isn't really necessary. However - //! Linux no-VM may have extra random failures. 
- wait(gpu); - // Should never fail assert(result && "Memory synchronization failed!"); } diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp index 383b170552..d677959ff9 100644 --- a/rocclr/runtime/device/pal/palprogram.cpp +++ b/rocclr/runtime/device/pal/palprogram.cpp @@ -237,7 +237,6 @@ HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStage aclType from = ACL_TYPE_DEFAULT; needOptionsCheck = true; size_t boolSize = sizeof(bool); - //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? // Checking llvmir in .llvmir section bool containsSpirv = true; errorCode = aclQueryInfo(dev().compiler(), binaryElf_, @@ -375,7 +374,6 @@ HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { return continueCompileFrom; } bool recompile = false; - //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? switch (continueCompileFrom) { case ACL_TYPE_HSAIL_BINARY: case ACL_TYPE_CG: diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index 98e3764cfe..e43b9c1320 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -149,8 +149,9 @@ VirtualGPU::Queue::addCmdMemRef(Pal::IGpuMemory* iMem) void VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem) { - memReferences_.erase(iMem); - iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_); + if (0 != memReferences_.erase(iMem)) { + iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_); + } } uint @@ -2264,39 +2265,32 @@ VirtualGPU::submitMarker(amd::Marker& vcmd) GpuEvent* VirtualGPU::getGpuEvent(Pal::IGpuMemory* iMem) { - GpuEvents::iterator it = gpuEvents_.find(iMem); - if (it == gpuEvents_.end()) { -// queue(MainEngine).addMemRef(iMem); -// queue(SdmaEngine).addMemRef(iMem); - } return &gpuEvents_[iMem]; } void VirtualGPU::assignGpuEvent(Pal::IGpuMemory* iMem, GpuEvent gpuEvent) { - GpuEvents::iterator it = 
gpuEvents_.find(iMem); + auto it = gpuEvents_.find(iMem); + if (it != gpuEvents_.end()) { it->second = gpuEvent; } else { -// queue(gpuEvent.engineId_).addMemRef(iMem); gpuEvents_[iMem] = gpuEvent; } -// queues_[gpuEvent.engineId_]->addCmdMemRef(iMem); } void VirtualGPU::releaseMemory(Pal::IGpuMemory* iMem, bool wait) { + auto it = gpuEvents_.find(iMem); //! @note if there is no wait, then it's a view release - if (wait) { - waitForEvent(&gpuEvents_[iMem]); - //queue(MainEngine).removeMemRef(iMem); - //queue(SdmaEngine).removeMemRef(iMem); + if (wait && (it != gpuEvents_.end())) { + waitForEvent(&it->second); queues_[MainEngine]->removeCmdMemRef(iMem); queues_[SdmaEngine]->removeCmdMemRef(iMem); - gpuEvents_.erase(iMem); + gpuEvents_.erase(it); } }