From 25319dac4b6d7ecdfd644c662a233f8ca9e10ee8 Mon Sep 17 00:00:00 2001
From: foreman
Date: Fri, 21 Oct 2016 13:31:50 -0400
Subject: [PATCH] P4 to Git Change 1329979 by gandryey@gera-w8 on 2016/10/21
13:26:53
SWDEV-86035 - Add PAL backend to OpenCL
- Allow device memory usage for blit kernels
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#15 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#32 edit
[ROCm/clr commit: 43f8188d59f3f234b2a10831043e1f8430247d54]
---
.../clr/rocclr/runtime/device/pal/palblit.cpp | 6 ++
.../rocclr/runtime/device/pal/paldevice.cpp | 55 +++++++++++--------
.../rocclr/runtime/device/pal/paldevice.hpp | 3 +
.../rocclr/runtime/device/pal/palkernel.cpp | 16 +-----
.../rocclr/runtime/device/pal/palprogram.cpp | 14 ++---
.../rocclr/runtime/device/pal/palprogram.hpp | 4 --
.../rocclr/runtime/device/pal/palresource.cpp | 12 ++--
.../rocclr/runtime/device/pal/palvirtual.cpp | 46 ++++++++--------
8 files changed, 80 insertions(+), 76 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
index 0e67f8117c..56c5f63dbb 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -801,6 +801,12 @@ KernelBlitManager::create(amd::Device& device)
bool
KernelBlitManager::createProgram(Device& device)
{
+ if (device.blitProgram() == nullptr) {
+ if (!device.createBlitProgram()) {
+ return false;
+ }
+ }
+
std::vector devices;
devices.push_back(&device);
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
index 16b845e29d..d20b1a3513 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -478,7 +478,7 @@ void NullDevice::fillDeviceInfo(
info_.numAsyncQueues_ = numComputeRings;
info_.numRTQueues_ =
palProp.engineProperties[Pal::EngineTypeExclusiveCompute].engineCount - 1;
- info_.numRTCUs_ = 0x8;
+ info_.numRTCUs_ = 8;
//palProp.engineProperties[Pal::EngineTypeExclusiveCompute].maxNumDedicatedCu;
info_.threadTraceEnable_ = settings().threadTraceEnable_;
}
@@ -718,6 +718,8 @@ Device::create(Pal::IDevice* device)
palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
palSettings->longRunningSubmissions = true;
palSettings->cmdBufBatchedSubmitChainLimit = 0;
+ //palSettings->disableResourceProcessingManager = true;
+ //palSettings->disableScManager = true;
// Commit the new settings for the device
result = iDev()->CommitSettingsAndInit();
@@ -885,28 +887,6 @@ Device::initializeHeapResources()
}
}
- // Delay compilation due to brig_loader memory allocation
- const char* scheduler = nullptr;
- const char* ocl20 = nullptr;
-#if !defined(WITH_LIGHTNING_COMPILER)
- std::string sch = SchedulerSourceCode;
- if (settings().oclVersion_ == OpenCL20) {
- size_t loc = sch.find("%s");
- sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
- scheduler = sch.c_str();
- ocl20 = "-cl-std=CL2.0";
- }
-#endif // !defined(WITH_LIGHTNING_COMPILER)
- blitProgram_ = new BlitProgram(context_);
- // Create blit programs
- if (blitProgram_ == nullptr ||
- !blitProgram_->create(this, scheduler, ocl20)) {
- delete blitProgram_;
- blitProgram_ = nullptr;
- LogError("Couldn't create blit kernels!");
- return false;
- }
-
// Create a synchronized transfer queue
xferQueue_ = new VirtualGPU(*this);
if (!(xferQueue_ && xferQueue_->create(
@@ -2176,6 +2156,35 @@ Device::updateFreeMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free)
}
}
+//! Create internal blit program (delayed due to brig_loader memory allocation); returns false on failure
+bool
+Device::createBlitProgram()
+{
+    const char* scheduler = nullptr;
+    const char* ocl20 = nullptr;
+#if !defined(WITH_LIGHTNING_COMPILER)
+    std::string sch = SchedulerSourceCode;  // backs 'scheduler'; must stay alive until create() returns
+    if (settings().oclVersion_ == OpenCL20) {
+        const size_t loc = sch.find("%s");
+        if (loc == std::string::npos) {  // replace(npos, ...) would throw std::out_of_range
+            LogError("Scheduler source has no dispatch kernel placeholder!");
+            return false;
+        }
+        sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
+        scheduler = sch.c_str();
+        ocl20 = "-cl-std=CL2.0";
+    }
+#endif // !defined(WITH_LIGHTNING_COMPILER)
+    blitProgram_ = new BlitProgram(context_);
+    if (blitProgram_ == nullptr || !blitProgram_->create(this, scheduler, ocl20)) {
+        delete blitProgram_;
+        blitProgram_ = nullptr;
+        LogError("Couldn't create blit kernels!");
+        return false;
+    }
+    return true;
+}
+
void
Device::SrdManager::fillResourceList(std::vector& memList)
{
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
index 96ae0bbd85..0ba0a18298 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
@@ -482,6 +482,9 @@ public:
bool free //!< TRUE if runtime frees memory
);
+ //! Create internal blit program
+ bool createBlitProgram();
+
//! Interop for GL device
bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const;
bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const;
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
index 8848d76c69..43e0cf277c 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -394,22 +394,12 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
if (!prog().isNull()) {
code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
Resource::MemoryType type = Resource::Local;
- if (flags_.internalKernel_) {
- type = Resource::RemoteUSWC;
- }
// Initialize kernel ISA code
if (code_ && code_->create(type)) {
- if (flags_.internalKernel_) {
- address cpuCodePtr = static_cast(code_->map(nullptr, Resource::WriteOnly));
- // Copy only amd_kernel_code_t
- memcpy(cpuCodePtr, reinterpret_cast(akc), codeSize_);
- code_->unmap(nullptr);
- }
- else {
- static_cast(dev().xferMgr()).writeRawData(
- *code_, codeSize_, reinterpret_cast(akc));
- }
+ constexpr bool WaitForUpload = true;
+ code_->writeRawData(*code_->dev().xferQueue(), 0, codeSize_,
+ reinterpret_cast(akc), WaitForUpload);
}
else {
LogError("Failed to allocate ISA code!");
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
index f43b4d96a3..f549db1516 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
@@ -469,8 +469,6 @@ HSAILProgram::linkImpl(amd::option::Options* options)
aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
bool finalize = true;
bool hsaLoad = true;
- internal_ = (compileOptions_.find("-cl-internal-kernel") !=
- std::string::npos) ? true : false;
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
if (!binaryElf_) {
@@ -936,7 +934,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
assert(size);
assert(align);
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
- if (program_->isNull() || program_->isInternal()) {
+ if (program_->isNull()) {
return new char[size];
}
@@ -962,19 +960,20 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src,
if (0 == size) {
return true;
}
- if (program_->isNull() || program_->isInternal()) {
+ if (program_->isNull()) {
memcpy(reinterpret_cast(dst) + offset, src, size);
return true;
}
assert(program_->dev().xferQueue());
pal::Memory* mem = reinterpret_cast(dst);
- return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
+ constexpr bool WaitForCopy = true;
+ mem->writeRawData(*mem->dev().xferQueue(), offset, size, src, WaitForCopy);
return true;
}
void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size)
{
- if (program_->isNull() || program_->isInternal()) {
+ if (program_->isNull()) {
delete[] reinterpret_cast(ptr);
}
else {
@@ -1020,9 +1019,6 @@ LightningProgram::linkImpl(amd::option::Options *options)
{
using namespace amd::opencl_driver;
- internal_ = (compileOptions_.find("-cl-internal-kernel") !=
- std::string::npos) ? true : false;
-
aclType continueCompileFrom = llvmBinary_.empty()
? getNextCompilationStageFromBinary(options)
: ACL_TYPE_LLVMIR_BINARY;
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
index 971f4f3848..f1b58cec56 100644
--- a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
@@ -182,9 +182,6 @@ public:
//! Returns TRUE if the program just compiled
bool isNull() const { return isNull_; }
- //! Returns TRUE if the program used internally by runtime
- bool isInternal() const { return internal_; }
-
//! Returns TRUE if the program contains static samplers
bool isStaticSampler() const { return (staticSamplers_.size() != 0); }
@@ -278,7 +275,6 @@ protected:
union {
struct {
uint32_t isNull_ : 1; //!< Null program no memory allocations
- uint32_t internal_ : 1; //!< Internal blit program
};
uint32_t flags_; //!< Program flags
};
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
index a49976c29d..e466cfe584 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
@@ -1118,14 +1118,18 @@ Resource::writeRawData(
gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast(data));
gpu.eventEnd(MainEngine, event);
- setBusy(gpu, event);
- // Update the global GPU event
- gpu.setGpuEvent(event, false);
-
if (waitForEvent) {
+ //! @note: We don't really have to mark the allocations as busy
+ //! if we are waiting for a transfer
+
// Wait for event to complete
gpu.waitForEvent(&event);
}
+ else {
+ setBusy(gpu, event);
+ // Update the global GPU event
+ gpu.setGpuEvent(event, false);
+ }
}
static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement)
{
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index bfa1a5b032..615970b0e7 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -63,7 +63,7 @@ VirtualGPU::Queue::Create(
qCreateInfo.numReservedCu = rtCU;
qCreateInfo.engineIndex = 0x0;
cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute;
- cmdCreateInfo.flags.rtCu = true;
+ cmdCreateInfo.flags.realtimeComputeUnits = true;
}
*/
// Find queue object size
@@ -801,28 +801,6 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
return false;
}
- // Choose the appropriate class for blit engine
- switch (dev().settings().blitEngine_) {
- default:
- // Fall through ...
- case Settings::BlitEngineHost:
- blitSetup.disableAll();
- // Fall through ...
- case Settings::BlitEngineCAL:
- case Settings::BlitEngineKernel:
- // use host blit for HW debug
- if (dev().settings().enableHwDebug_) {
- blitSetup.disableCopyImageToBuffer_ = true;
- blitSetup.disableCopyBufferToImage_ = true;
- }
- blitMgr_ = new KernelBlitManager(*this, blitSetup);
- break;
- }
- if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
- LogError("Could not create BlitManager!");
- return false;
- }
-
tsCache_ = new TimeStampCache(*this);
if (nullptr == tsCache_) {
LogError("Could not create TimeStamp cache!");
@@ -846,6 +824,28 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
return false;
}
+ // Choose the appropriate class for blit engine
+ switch (dev().settings().blitEngine_) {
+ default:
+ // Fall through ...
+ case Settings::BlitEngineHost:
+ blitSetup.disableAll();
+ // Fall through ...
+ case Settings::BlitEngineCAL:
+ case Settings::BlitEngineKernel:
+ // use host blit for HW debug
+ if (dev().settings().enableHwDebug_) {
+ blitSetup.disableCopyImageToBuffer_ = true;
+ blitSetup.disableCopyBufferToImage_ = true;
+ }
+ blitMgr_ = new KernelBlitManager(*this, blitSetup);
+ break;
+ }
+ if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
+ LogError("Could not create BlitManager!");
+ return false;
+ }
+
return true;
}