P4 to Git Change 1329979 by gandryey@gera-w8 on 2016/10/21 13:26:53

SWDEV-86035 - Add PAL backend to OpenCL
	- Allow device memory usage for blit kernels

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#15 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#32 edit


[ROCm/clr commit: 43f8188d59]
Этот коммит содержится в:
foreman
2016-10-21 13:31:50 -04:00
родитель 2576f51460
Коммит 25319dac4b
8 изменённых файлов: 80 добавлений и 76 удалений
+6
Просмотреть файл
@@ -801,6 +801,12 @@ KernelBlitManager::create(amd::Device& device)
bool
KernelBlitManager::createProgram(Device& device)
{
if (device.blitProgram() == nullptr) {
if (!device.createBlitProgram()) {
return false;
}
}
std::vector<amd::Device*> devices;
devices.push_back(&device);
+32 -23
Просмотреть файл
@@ -478,7 +478,7 @@ void NullDevice::fillDeviceInfo(
info_.numAsyncQueues_ = numComputeRings;
info_.numRTQueues_ =
palProp.engineProperties[Pal::EngineTypeExclusiveCompute].engineCount - 1;
info_.numRTCUs_ = 0x8;
info_.numRTCUs_ = 8;
//palProp.engineProperties[Pal::EngineTypeExclusiveCompute].maxNumDedicatedCu;
info_.threadTraceEnable_ = settings().threadTraceEnable_;
}
@@ -718,6 +718,8 @@ Device::create(Pal::IDevice* device)
palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
palSettings->longRunningSubmissions = true;
palSettings->cmdBufBatchedSubmitChainLimit = 0;
//palSettings->disableResourceProcessingManager = true;
//palSettings->disableScManager = true;
// Commit the new settings for the device
result = iDev()->CommitSettingsAndInit();
@@ -885,28 +887,6 @@ Device::initializeHeapResources()
}
}
// Delay compilation due to brig_loader memory allocation
const char* scheduler = nullptr;
const char* ocl20 = nullptr;
#if !defined(WITH_LIGHTNING_COMPILER)
std::string sch = SchedulerSourceCode;
if (settings().oclVersion_ == OpenCL20) {
size_t loc = sch.find("%s");
sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
scheduler = sch.c_str();
ocl20 = "-cl-std=CL2.0";
}
#endif // !defined(WITH_LIGHTNING_COMPILER)
blitProgram_ = new BlitProgram(context_);
// Create blit programs
if (blitProgram_ == nullptr ||
!blitProgram_->create(this, scheduler, ocl20)) {
delete blitProgram_;
blitProgram_ = nullptr;
LogError("Couldn't create blit kernels!");
return false;
}
// Create a synchronized transfer queue
xferQueue_ = new VirtualGPU(*this);
if (!(xferQueue_ && xferQueue_->create(
@@ -2176,6 +2156,35 @@ Device::updateFreeMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free)
}
}
//! Creates the internal blit program and stores it in blitProgram_.
//!
//! Unless WITH_LIGHTNING_COMPILER is defined, and only when the device settings
//! select OpenCL 2.0, the "%s" placeholder in SchedulerSourceCode is replaced
//! with the string returned by iDev()->GetDispatchKernelSource(); the result is
//! handed to BlitProgram::create() together with the "-cl-std=CL2.0" option.
//! In every other case both the scheduler source and the option are nullptr.
//!
//! NOTE(review): a previously created blitProgram_ would be overwritten without
//! being freed; callers are expected to invoke this only while blitProgram_
//! is still nullptr - confirm.
//!
//! \return true if the blit program was created. false if the placeholder is
//!         missing from the scheduler source (blitProgram_ is left untouched)
//!         or if the program could not be created (blitProgram_ is reset to
//!         nullptr).
bool
Device::createBlitProgram()
{
    // Delayed compilation due to brig_loader memory allocation
    const char* scheduler = nullptr;
    const char* ocl20 = nullptr;

#if !defined(WITH_LIGHTNING_COMPILER)
    // 'scheduler' points into this string's buffer, so it has to stay alive
    // until BlitProgram::create() below has returned.
    std::string sch = SchedulerSourceCode;
    if (settings().oclVersion_ == OpenCL20) {
        const size_t loc = sch.find("%s");
        if (loc == std::string::npos) {
            // std::string::replace() throws std::out_of_range for a position
            // past the end; report the problem through the normal error path.
            LogError("Couldn't find the dispatch kernel placeholder in the scheduler source!");
            return false;
        }
        // Substitute the 2-character "%s" placeholder
        sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
        scheduler = sch.c_str();
        ocl20 = "-cl-std=CL2.0";
    }
#endif // !defined(WITH_LIGHTNING_COMPILER)

    blitProgram_ = new BlitProgram(context_);
    // Create blit programs
    if (blitProgram_ == nullptr ||
        !blitProgram_->create(this, scheduler, ocl20)) {
        delete blitProgram_;
        blitProgram_ = nullptr;
        LogError("Couldn't create blit kernels!");
        return false;
    }

    return true;
}
void
Device::SrdManager::fillResourceList(std::vector<const Memory*>& memList)
{
+3
Просмотреть файл
@@ -482,6 +482,9 @@ public:
bool free //!< TRUE if runtime frees memory
);
//! Create internal blit program
bool createBlitProgram();
//! Interop for GL device
bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const;
bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const;
+3 -13
Просмотреть файл
@@ -394,22 +394,12 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
if (!prog().isNull()) {
code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
Resource::MemoryType type = Resource::Local;
if (flags_.internalKernel_) {
type = Resource::RemoteUSWC;
}
// Initialize kernel ISA code
if (code_ && code_->create(type)) {
if (flags_.internalKernel_) {
address cpuCodePtr = static_cast<address>(code_->map(nullptr, Resource::WriteOnly));
// Copy only amd_kernel_code_t
memcpy(cpuCodePtr, reinterpret_cast<address>(akc), codeSize_);
code_->unmap(nullptr);
}
else {
static_cast<const KernelBlitManager&>(dev().xferMgr()).writeRawData(
*code_, codeSize_, reinterpret_cast<void*>(akc));
}
constexpr bool WaitForUpload = true;
code_->writeRawData(*code_->dev().xferQueue(), 0, codeSize_,
reinterpret_cast<void*>(akc), WaitForUpload);
}
else {
LogError("Failed to allocate ISA code!");
+5 -9
Просмотреть файл
@@ -469,8 +469,6 @@ HSAILProgram::linkImpl(amd::option::Options* options)
aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
bool finalize = true;
bool hsaLoad = true;
internal_ = (compileOptions_.find("-cl-internal-kernel") !=
std::string::npos) ? true : false;
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
if (!binaryElf_) {
@@ -936,7 +934,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
assert(size);
assert(align);
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
if (program_->isNull() || program_->isInternal()) {
if (program_->isNull()) {
return new char[size];
}
@@ -962,19 +960,20 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src,
if (0 == size) {
return true;
}
if (program_->isNull() || program_->isInternal()) {
if (program_->isNull()) {
memcpy(reinterpret_cast<address>(dst) + offset, src, size);
return true;
}
assert(program_->dev().xferQueue());
pal::Memory* mem = reinterpret_cast<pal::Memory*>(dst);
return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
constexpr bool WaitForCopy = true;
mem->writeRawData(*mem->dev().xferQueue(), offset, size, src, WaitForCopy);
return true;
}
void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size)
{
if (program_->isNull() || program_->isInternal()) {
if (program_->isNull()) {
delete[] reinterpret_cast<char*>(ptr);
}
else {
@@ -1020,9 +1019,6 @@ LightningProgram::linkImpl(amd::option::Options *options)
{
using namespace amd::opencl_driver;
internal_ = (compileOptions_.find("-cl-internal-kernel") !=
std::string::npos) ? true : false;
aclType continueCompileFrom = llvmBinary_.empty()
? getNextCompilationStageFromBinary(options)
: ACL_TYPE_LLVMIR_BINARY;
-4
Просмотреть файл
@@ -182,9 +182,6 @@ public:
//! Returns TRUE if the program just compiled
bool isNull() const { return isNull_; }
//! Returns TRUE if the program used internally by runtime
bool isInternal() const { return internal_; }
//! Returns TRUE if the program contains static samplers
bool isStaticSampler() const { return (staticSamplers_.size() != 0); }
@@ -278,7 +275,6 @@ protected:
union {
struct {
uint32_t isNull_ : 1; //!< Null program no memory allocations
uint32_t internal_ : 1; //!< Internal blit program
};
uint32_t flags_; //!< Program flags
};
+8 -4
Просмотреть файл
@@ -1118,14 +1118,18 @@ Resource::writeRawData(
gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast<const uint32_t*>(data));
gpu.eventEnd(MainEngine, event);
setBusy(gpu, event);
// Update the global GPU event
gpu.setGpuEvent(event, false);
if (waitForEvent) {
//! @note: We don't really have to mark the allocations as busy
//! if we are waiting for a transfer
// Wait for event to complete
gpu.waitForEvent(&event);
}
else {
setBusy(gpu, event);
// Update the global GPU event
gpu.setGpuEvent(event, false);
}
}
static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement)
{
+23 -23
Просмотреть файл
@@ -63,7 +63,7 @@ VirtualGPU::Queue::Create(
qCreateInfo.numReservedCu = rtCU;
qCreateInfo.engineIndex = 0x0;
cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute;
cmdCreateInfo.flags.rtCu = true;
cmdCreateInfo.flags.realtimeComputeUnits = true;
}
*/
// Find queue object size
@@ -801,28 +801,6 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
return false;
}
// Choose the appropriate class for blit engine
switch (dev().settings().blitEngine_) {
default:
// Fall through ...
case Settings::BlitEngineHost:
blitSetup.disableAll();
// Fall through ...
case Settings::BlitEngineCAL:
case Settings::BlitEngineKernel:
// use host blit for HW debug
if (dev().settings().enableHwDebug_) {
blitSetup.disableCopyImageToBuffer_ = true;
blitSetup.disableCopyBufferToImage_ = true;
}
blitMgr_ = new KernelBlitManager(*this, blitSetup);
break;
}
if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
LogError("Could not create BlitManager!");
return false;
}
tsCache_ = new TimeStampCache(*this);
if (nullptr == tsCache_) {
LogError("Could not create TimeStamp cache!");
@@ -846,6 +824,28 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
return false;
}
// Choose the appropriate class for blit engine
switch (dev().settings().blitEngine_) {
default:
// Fall through ...
case Settings::BlitEngineHost:
blitSetup.disableAll();
// Fall through ...
case Settings::BlitEngineCAL:
case Settings::BlitEngineKernel:
// use host blit for HW debug
if (dev().settings().enableHwDebug_) {
blitSetup.disableCopyImageToBuffer_ = true;
blitSetup.disableCopyBufferToImage_ = true;
}
blitMgr_ = new KernelBlitManager(*this, blitSetup);
break;
}
if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
LogError("Could not create BlitManager!");
return false;
}
return true;
}