From a94fa4eabb8a884dc93a359ee279af2ab58cb408 Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 18 May 2016 18:11:40 -0400
Subject: [PATCH] P4 to Git Change 1270658 by gandryey@gera-w8 on 2016/05/18
17:53:45
SWDEV-86035 - Add PAL backend to OpenCL
- Fix a crash in the pipe test. The device layer can't use the device blit queue directly; it must go through a blit manager call, which performs the correct wait-for-idle sequence.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp#2 edit
---
rocclr/runtime/device/pal/palblit.cpp | 14 +++++++++++++-
rocclr/runtime/device/pal/palblit.hpp | 11 +++++++++--
rocclr/runtime/device/pal/paldevice.cpp | 7 ++++---
rocclr/runtime/device/pal/palkernel.cpp | 5 ++---
rocclr/runtime/device/pal/palmemory.cpp | 14 ++------------
rocclr/runtime/device/pal/palprogram.cpp | 2 --
rocclr/runtime/device/pal/palvirtual.cpp | 24 +++++++++---------------
7 files changed, 39 insertions(+), 38 deletions(-)
diff --git a/rocclr/runtime/device/pal/palblit.cpp b/rocclr/runtime/device/pal/palblit.cpp
index ab5f713c8b..c9c1ac53d2 100644
--- a/rocclr/runtime/device/pal/palblit.cpp
+++ b/rocclr/runtime/device/pal/palblit.cpp
@@ -23,8 +23,8 @@ inline void
DmaBlitManager::synchronize() const
{
if (syncOperation_) {
- gpu().waitAllEngines();
gpu().releaseMemObjects();
+ gpu().waitAllEngines();
}
}
@@ -2685,6 +2685,18 @@ KernelBlitManager::runScheduler(
return result;
}
+void
+KernelBlitManager::writeRawData(
+ device::Memory& memory,
+ size_t size,
+ const void* data
+ ) const
+{
+ static_cast(memory).writeRawData(gpu(), size, data, false);
+
+ synchronize();
+}
+
amd::Memory*
DmaBlitManager::pinHostMemory(
const void* hostMem,
diff --git a/rocclr/runtime/device/pal/palblit.hpp b/rocclr/runtime/device/pal/palblit.hpp
index 462842dd60..112d1fb5af 100644
--- a/rocclr/runtime/device/pal/palblit.hpp
+++ b/rocclr/runtime/device/pal/palblit.hpp
@@ -371,14 +371,21 @@ public:
bool entire = false //!< Entire buffer will be updated
) const;
- //! Fills an image memory with a pattern data
- virtual bool runScheduler(
+ //! Runs a GPU scheduler for device enqueue
+ bool runScheduler(
device::Memory& vqueue, //!< Memory object for virtual queue
device::Memory& params, //!< Extra arguments for the scheduler
uint paramIdx, //!< Parameter index
uint threads //!< Number of scheduling threads
) const;
+ //! Writes CPU raw data into GPU memory
+ void writeRawData(
+ device::Memory& memory, //!< Memory object for data update
+ size_t size, //!< Size of raw data
+ const void* data //!< Raw data pointer
+ ) const;
+
private:
static const size_t MaxXferBuffers = 2;
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 8c3e1d3e26..90c81e2a6a 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -987,8 +987,8 @@ Device::init()
acl_error error;
compiler_ = aclCompilerInit(&opts, &error);
if (error != ACL_SUCCESS) {
- LogError("Error initializing the compiler");
- return false;
+ LogError("Error initializing the compiler");
+ return false;
}
size_t size = Pal::GetPlatformSize();
@@ -1210,7 +1210,8 @@ Device::createBuffer(
// Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
// Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()};
- gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true);
+ static_cast(xferMgr()).writeRawData(
+ *gpuMemory, sizeof(pipeInit), pipeInit);
}
// If memory has direct access from host, then get CPU address
if (gpuMemory->isHostMemDirectAccess() &&
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 7b0e47baad..ea82fd075a 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -677,9 +677,8 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
return false;
}
- //! @todo get the right value;
// Copy wavefront size
- workGroupInfo_.wavefrontSize_ = 64;//dev().getAttribs().wavefrontSize;
+ workGroupInfo_.wavefrontSize_ = dev().properties().gfxipProperties.shaderCore.wavefrontSize;
// Find total workgroup size
if (workGroupInfo_.compileSize_[0] != 0) {
workGroupInfo_.size_ =
@@ -999,7 +998,7 @@ HSAILKernel::loadArguments(
break;
}
- //! @todo 64 bit isn't supported with 32 bit binary
+ //! 64 bit isn't supported with 32 bit binary
uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset();
WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*));
diff --git a/rocclr/runtime/device/pal/palmemory.cpp b/rocclr/runtime/device/pal/palmemory.cpp
index 01b9124fd6..6f03d78cdb 100644
--- a/rocclr/runtime/device/pal/palmemory.cpp
+++ b/rocclr/runtime/device/pal/palmemory.cpp
@@ -151,14 +151,8 @@ Memory::create(
reinterpret_cast(params);
// Check if parent was allocated in system memory
if ((view->resource_->memoryType() == Resource::Pinned) ||
- (((view->resource_->memoryType() == Resource::Remote) ||
- (view->resource_->memoryType() == Resource::RemoteUSWC)) &&
- // @todo Enable unconditional optimization for remote memory
- // Check for external allocation, to avoid the optimization
- // for non-VM (double copy) mode
- (owner() != nullptr) &&
- ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) ||
- dev().settings().remoteAlloc_))) {
+ (view->resource_->memoryType() == Resource::Remote) ||
+ (view->resource_->memoryType() == Resource::RemoteUSWC)) {
// Marks memory object for direct GPU access to the host memory
flags_ |= HostMemoryDirectAccess;
}
@@ -578,10 +572,6 @@ Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
}
}
- //!@todo A wait isn't really necessary. However
- //! Linux no-VM may have extra random failures.
- wait(gpu);
-
// Should never fail
assert(result && "Memory synchronization failed!");
}
diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp
index 383b170552..d677959ff9 100644
--- a/rocclr/runtime/device/pal/palprogram.cpp
+++ b/rocclr/runtime/device/pal/palprogram.cpp
@@ -237,7 +237,6 @@ HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStage
aclType from = ACL_TYPE_DEFAULT;
needOptionsCheck = true;
size_t boolSize = sizeof(bool);
- //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
// Checking llvmir in .llvmir section
bool containsSpirv = true;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_,
@@ -375,7 +374,6 @@ HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) {
return continueCompileFrom;
}
bool recompile = false;
- //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
switch (continueCompileFrom) {
case ACL_TYPE_HSAIL_BINARY:
case ACL_TYPE_CG:
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index 98e3764cfe..e43b9c1320 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -149,8 +149,9 @@ VirtualGPU::Queue::addCmdMemRef(Pal::IGpuMemory* iMem)
void
VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
{
- memReferences_.erase(iMem);
- iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
+ if (0 != memReferences_.erase(iMem)) {
+ iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
+ }
}
uint
@@ -2264,39 +2265,32 @@ VirtualGPU::submitMarker(amd::Marker& vcmd)
GpuEvent*
VirtualGPU::getGpuEvent(Pal::IGpuMemory* iMem)
{
- GpuEvents::iterator it = gpuEvents_.find(iMem);
- if (it == gpuEvents_.end()) {
-// queue(MainEngine).addMemRef(iMem);
-// queue(SdmaEngine).addMemRef(iMem);
- }
return &gpuEvents_[iMem];
}
void
VirtualGPU::assignGpuEvent(Pal::IGpuMemory* iMem, GpuEvent gpuEvent)
{
- GpuEvents::iterator it = gpuEvents_.find(iMem);
+ auto it = gpuEvents_.find(iMem);
+
if (it != gpuEvents_.end()) {
it->second = gpuEvent;
}
else {
-// queue(gpuEvent.engineId_).addMemRef(iMem);
gpuEvents_[iMem] = gpuEvent;
}
-// queues_[gpuEvent.engineId_]->addCmdMemRef(iMem);
}
void
VirtualGPU::releaseMemory(Pal::IGpuMemory* iMem, bool wait)
{
+ auto it = gpuEvents_.find(iMem);
//! @note if there is no wait, then it's a view release
- if (wait) {
- waitForEvent(&gpuEvents_[iMem]);
- //queue(MainEngine).removeMemRef(iMem);
- //queue(SdmaEngine).removeMemRef(iMem);
+ if (wait && (it != gpuEvents_.end())) {
+ waitForEvent(&it->second);
queues_[MainEngine]->removeCmdMemRef(iMem);
queues_[SdmaEngine]->removeCmdMemRef(iMem);
- gpuEvents_.erase(iMem);
+ gpuEvents_.erase(it);
}
}