diff --git a/rocclr/runtime/platform/command.cpp b/rocclr/runtime/platform/command.cpp index 6e53636526..eb6dd0580b 100644 --- a/rocclr/runtime/platform/command.cpp +++ b/rocclr/runtime/platform/command.cpp @@ -279,28 +279,24 @@ cl_int NativeFnCommand::invoke() { } bool OneMemoryArgCommand::validateMemory() { - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); - return false; - } + device::Memory* mem = memory_->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); + return false; } return true; } bool TwoMemoryArgsCommand::validateMemory() { - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - device::Memory* mem = memory1_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory1_->getSize()); - return false; - } - mem = memory2_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory2_->getSize()); - return false; - } + device::Memory* mem = memory1_->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory1_->getSize()); + return false; + } + mem = memory2_->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory2_->getSize()); + return false; } return true; } @@ -356,74 +352,68 @@ bool MapMemoryCommand::isEntireMemory() const { } void UnmapMemoryCommand::releaseResources() { - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - //! @todo This is a workaround to a deadlock on indirect map release. - //! Remove this code when CAL will have a refcounter on memory. - //! decIndMapCount() has to go back to submitUnmapMemory() - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL != mem) { - mem->releaseIndirectMap(); - } + //! @todo This is a workaround to a deadlock on indirect map release. + //! Remove this code when CAL will have a refcounter on memory. + //! decIndMapCount() has to go back to submitUnmapMemory() + device::Memory* mem = memory_->getDeviceMemory(queue()->device()); + if (NULL != mem) { + mem->releaseIndirectMap(); } + OneMemoryArgCommand::releaseResources(); } bool MigrateMemObjectsCommand::validateMemory() { - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - for (const auto& it : memObjects_) { - device::Memory* mem = it->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", it->getSize()); - return false; - } + for (const auto& it : memObjects_) { + device::Memory* mem = it->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", it->getSize()); + return false; } } - return true; } cl_int NDRangeKernelCommand::validateMemory() { const amd::Device& device = queue()->device(); - if (device.info().type_ & CL_DEVICE_TYPE_GPU) { - // Validate the kernel before submission - if (!queue()->device().validateKernel(kernel(), queue()->vdev())) { - return CL_OUT_OF_RESOURCES; - } + // Validate the kernel before submission + if (!queue()->device().validateKernel(kernel(), queue()->vdev())) { + return CL_OUT_OF_RESOURCES; + } - const amd::KernelSignature& signature = kernel().signature(); - for (uint i = 0; i != signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - // Check if it's a memory object - if ((desc.type_ == T_POINTER) && (desc.size_ != 0)) { - amd::Memory* amdMemory; - if (kernel().parameters().boundToSvmPointer(device, parameters_, i)) { - // find the real mem object from svm ptr from the list - amdMemory = amd::SvmManager::FindSvmBuffer( - *reinterpret_cast(parameters() + desc.offset_)); - } else { - amdMemory = *reinterpret_cast(parameters() + desc.offset_); + const amd::KernelSignature& signature = kernel().signature(); + for (uint i = 0; i != signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + // Check if it's a memory object + if ((desc.type_ == T_POINTER) && (desc.size_ != 0)) { + amd::Memory* amdMemory; + if (kernel().parameters().boundToSvmPointer(device, parameters_, i)) { + // find the real mem object from svm ptr from the list + amdMemory = amd::SvmManager::FindSvmBuffer( + *reinterpret_cast(parameters() + desc.offset_)); + } else { + amdMemory = *reinterpret_cast(parameters() + desc.offset_); + } + if (amdMemory != NULL) { + if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) { + // Make sure argument size isn't bigger than the device limit + if (amdMemory->getSize() > device.info().maxConstantBufferSize_) { + LogPrintfError("HW constant buffer is too big (0x%X bytes)!", amdMemory->getSize()); + return CL_OUT_OF_RESOURCES; + } } - if (amdMemory != NULL) { - if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) { - // Make sure argument size isn't bigger than the device limit - if (amdMemory->getSize() > device.info().maxConstantBufferSize_) { - LogPrintfError("HW constant buffer is too big (0x%X bytes)!", amdMemory->getSize()); - return CL_OUT_OF_RESOURCES; - } - } - device::Memory* mem = amdMemory->getDeviceMemory(device); - if (!kernel().getDeviceKernel(device)->validateMemory(i, amdMemory)) { - if (device.reallocMemory(*amdMemory)) { - mem = amdMemory->getDeviceMemory(device); - } else { - mem = NULL; - } - } - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", amdMemory->getSize()); - return CL_MEM_OBJECT_ALLOCATION_FAILURE; + device::Memory* mem = amdMemory->getDeviceMemory(device); + if (!kernel().getDeviceKernel(device)->validateMemory(i, amdMemory)) { + if (device.reallocMemory(*amdMemory)) { + mem = amdMemory->getDeviceMemory(device); + } else { + mem = NULL; } } + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", amdMemory->getSize()); + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } } } } @@ -432,15 +422,13 @@ cl_int NDRangeKernelCommand::validateMemory() { bool ExtObjectsCommand::validateMemory() { bool retVal = true; - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - for (const auto& it : memObjects_) { - device::Memory* mem = it->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", it->getSize()); - return false; - } - retVal = processGLResource(mem); + for (const auto& it : memObjects_) { + device::Memory* mem = it->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", it->getSize()); + return false; } + retVal = processGLResource(mem); } return retVal; } @@ -454,33 +442,28 @@ bool ReleaseExtObjectsCommand::processGLResource(device::Memory* mem) { } bool MakeBuffersResidentCommand::validateMemory() { - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - for (const auto& it : memObjects_) { - device::Memory* mem = it->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", it->getSize()); - return false; - } + for (const auto& it : memObjects_) { + device::Memory* mem = it->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", it->getSize()); + return false; } } - return true; } + bool ThreadTraceMemObjectsCommand::validateMemory() { - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - for (auto it = memObjects_.cbegin(); it != memObjects_.cend(); it++) { - device::Memory* mem = (*it)->getDeviceMemory(queue()->device()); - if (NULL == mem) { - for (auto tmpIt = memObjects_.cbegin(); tmpIt != it; tmpIt++) { - device::Memory* tmpMem = (*tmpIt)->getDeviceMemory(queue()->device()); - delete tmpMem; - } - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*it)->getSize()); - return false; + for (auto it = memObjects_.cbegin(); it != memObjects_.cend(); it++) { + device::Memory* mem = (*it)->getDeviceMemory(queue()->device()); + if (NULL == mem) { + for (auto tmpIt = memObjects_.cbegin(); tmpIt != it; tmpIt++) { + device::Memory* tmpMem = (*tmpIt)->getDeviceMemory(queue()->device()); + delete tmpMem; } + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*it)->getSize()); + return false; } } - return true; } @@ -527,56 +510,52 @@ void TransferBufferFileCommand::submit(device::VirtualDevice& device) { } bool TransferBufferFileCommand::validateMemory() { - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - // Check if the destination buffer has direct host access - if (!(memory_->getMemFlags() & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) { - // Allocate staging buffers - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = new (memory_->getContext()) - Buffer(memory_->getContext(), StagingBufferMemType, StagingBufferSize); - if (NULL == staging_[i] || !staging_[i]->create(nullptr)) { - return false; - } - device::Memory* mem = staging_[i]->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate staging buffer - 0x%08X bytes!", staging_[i]->getSize()); - return false; - } + // Check if the destination buffer has direct host access + if (!(memory_->getMemFlags() & + (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) { + // Allocate staging buffers + for (uint i = 0; i < NumStagingBuffers; ++i) { + staging_[i] = new (memory_->getContext()) + Buffer(memory_->getContext(), StagingBufferMemType, StagingBufferSize); + if (NULL == staging_[i] || !staging_[i]->create(nullptr)) { + return false; + } + device::Memory* mem = staging_[i]->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate staging buffer - 0x%08X bytes!", staging_[i]->getSize()); + return false; } } + } - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); - return false; - } + device::Memory* mem = memory_->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); + return false; } return true; } bool CopyMemoryP2PCommand::validateMemory() { - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - const std::vector& devices = memory1_->getContext().devices(); - if (devices.size() != 1) { - LogError("Can't allocate memory object for P2P extension"); - return false; - } - device::Memory* mem = memory1_->getDeviceMemory(*devices[0]); - if (nullptr == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory1_->getSize()); - return false; - } - const std::vector& devices2 = memory2_->getContext().devices(); - if (devices2.size() != 1) { - LogError("Can't allocate memory object for P2P extension"); - return false; - } - mem = memory2_->getDeviceMemory(*devices2[0]); - if (nullptr == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory2_->getSize()); - return false; - } + const std::vector& devices = memory1_->getContext().devices(); + if (devices.size() != 1) { + LogError("Can't allocate memory object for P2P extension"); + return false; + } + device::Memory* mem = memory1_->getDeviceMemory(*devices[0]); + if (nullptr == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory1_->getSize()); + return false; + } + const std::vector& devices2 = memory2_->getContext().devices(); + if (devices2.size() != 1) { + LogError("Can't allocate memory object for P2P extension"); + return false; + } + mem = memory2_->getDeviceMemory(*devices2[0]); + if (nullptr == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory2_->getSize()); + return false; } return true; } diff --git a/rocclr/runtime/platform/context.cpp b/rocclr/runtime/platform/context.cpp index e05e866c18..17b106e95b 100644 --- a/rocclr/runtime/platform/context.cpp +++ b/rocclr/runtime/platform/context.cpp @@ -40,16 +40,10 @@ Context::Context(const std::vector& devices, const Info& info) } } if (svmAllocDevice_.size() > 1) { - // make sure the CPU is the last device to do allocation. - if ((svmAllocDevice_.front()->type() == CL_DEVICE_TYPE_CPU)) { - std::swap(svmAllocDevice_.front(), svmAllocDevice_.back()); - } - uint isFirstDeviceFGSEnabled = svmAllocDevice_.front()->isFineGrainedSystem(true); for (auto& dev : svmAllocDevice_) { // allocation on fine - grained system incapable device first - if (isFirstDeviceFGSEnabled && (dev->type() == CL_DEVICE_TYPE_GPU) && - (!(dev->isFineGrainedSystem(true)))) { + if (isFirstDeviceFGSEnabled && !dev->isFineGrainedSystem(true)) { std::swap(svmAllocDevice_.front(), dev); break; } @@ -288,42 +282,29 @@ void* Context::svmAlloc(size_t size, size_t alignment, cl_svm_mem_flags flags) { return NULL; } - if (svmAllocDevice_.front()->type() == CL_DEVICE_TYPE_CPU) { - return AlignedMemory::allocate(size, alignment); - } else { - void* svmPtrAlloced = NULL; - void* tempPtr = NULL; - - amd::ScopedLock lock(&ctxLock_); - for (const auto& dev : svmAllocDevice_) { - if (dev->type() == CL_DEVICE_TYPE_GPU) { - // check if the device support svm platform atomics, - // skipped allocation for platform atomics if not supported by this device - if ((flags & CL_MEM_SVM_ATOMICS) && - !(dev->info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS)) { - continue; - } - svmPtrAlloced = dev->svmAlloc(*this, size, alignment, flags, svmPtrAlloced); - if (svmPtrAlloced == NULL) { - return NULL; - } - } - } - return svmPtrAlloced; - } -} - -void Context::svmFree(void* ptr) const { - if (svmAllocDevice_.front()->type() == CL_DEVICE_TYPE_CPU) { - AlignedMemory::deallocate(ptr); - return; - } + void* svmPtrAlloced = NULL; + void* tempPtr = NULL; amd::ScopedLock lock(&ctxLock_); for (const auto& dev : svmAllocDevice_) { - if (dev->type() == CL_DEVICE_TYPE_GPU) { - dev->svmFree(ptr); + // check if the device support svm platform atomics, + // skipped allocation for platform atomics if not supported by this device + if ((flags & CL_MEM_SVM_ATOMICS) && + !(dev->info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS)) { + continue; } + svmPtrAlloced = dev->svmAlloc(*this, size, alignment, flags, svmPtrAlloced); + if (svmPtrAlloced == NULL) { + return NULL; + } + } + return svmPtrAlloced; +} + +void Context::svmFree(void* ptr) const { + amd::ScopedLock lock(&ctxLock_); + for (const auto& dev : svmAllocDevice_) { + dev->svmFree(ptr); } return; } diff --git a/rocclr/runtime/platform/memory.cpp b/rocclr/runtime/platform/memory.cpp index 90a2432212..c3a5b119b4 100644 --- a/rocclr/runtime/platform/memory.cpp +++ b/rocclr/runtime/platform/memory.cpp @@ -156,14 +156,6 @@ bool Memory::allocHostMemory(void* initFrom, bool allocHostMem, bool forceCopy) const std::vector& devices = context_().devices(); - // Find if a non GPU device was created with the context - for (size_t i = 0; i < devices.size(); i++) { - if (!(devices[i]->info().type_ & CL_DEVICE_TYPE_GPU)) { - allocHostMem = true; - break; - } - } - // This allocation is necessary to use coherency mechanism // for the initialization if (getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { @@ -243,11 +235,8 @@ bool Memory::create(void* initFrom, bool sysMemAlloc) { for (size_t i = 0; i < devices.size(); i++) { deviceAlloced_[devices[i]] = AllocInit; - // Only GPU devices have device memory objects - if (devices[i]->info().type_ & CL_DEVICE_TYPE_GPU) { - deviceMemories_[i].ref_ = devices[i]; - deviceMemories_[i].value_ = NULL; - } + deviceMemories_[i].ref_ = devices[i]; + deviceMemories_[i].value_ = NULL; if (DISABLE_DEFERRED_ALLOC) { device::Memory* mem = getDeviceMemory(*devices[i]); diff --git a/rocclr/runtime/platform/program.cpp b/rocclr/runtime/platform/program.cpp index ffcd627b3e..6338f444ad 100644 --- a/rocclr/runtime/platform/program.cpp +++ b/rocclr/runtime/platform/program.cpp @@ -605,7 +605,7 @@ bool Program::ParseAllOptions(const std::string& options, option::Options& parse bool Symbol::setDeviceKernel(const Device& device, const device::Kernel* func, bool noAlias) { // FIXME_lmoriche: check that the signatures are compatible - if (deviceKernels_.size() == 0 || device.type() == CL_DEVICE_TYPE_CPU) { + if (deviceKernels_.size() == 0) { signature_ = func->signature(); }