P4 to Git Change 1329979 by gandryey@gera-w8 on 2016/10/21 13:26:53
SWDEV-86035 - Add PAL backend to OpenCL
- Allow device memory usage for blit kernels
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#15 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#32 edit
[ROCm/clr commit: 43f8188d59]
Этот коммит содержится в:
@@ -801,6 +801,12 @@ KernelBlitManager::create(amd::Device& device)
|
||||
bool
|
||||
KernelBlitManager::createProgram(Device& device)
|
||||
{
|
||||
if (device.blitProgram() == nullptr) {
|
||||
if (!device.createBlitProgram()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<amd::Device*> devices;
|
||||
devices.push_back(&device);
|
||||
|
||||
|
||||
@@ -478,7 +478,7 @@ void NullDevice::fillDeviceInfo(
|
||||
info_.numAsyncQueues_ = numComputeRings;
|
||||
info_.numRTQueues_ =
|
||||
palProp.engineProperties[Pal::EngineTypeExclusiveCompute].engineCount - 1;
|
||||
info_.numRTCUs_ = 0x8;
|
||||
info_.numRTCUs_ = 8;
|
||||
//palProp.engineProperties[Pal::EngineTypeExclusiveCompute].maxNumDedicatedCu;
|
||||
info_.threadTraceEnable_ = settings().threadTraceEnable_;
|
||||
}
|
||||
@@ -718,6 +718,8 @@ Device::create(Pal::IDevice* device)
|
||||
palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
|
||||
palSettings->longRunningSubmissions = true;
|
||||
palSettings->cmdBufBatchedSubmitChainLimit = 0;
|
||||
//palSettings->disableResourceProcessingManager = true;
|
||||
//palSettings->disableScManager = true;
|
||||
|
||||
// Commit the new settings for the device
|
||||
result = iDev()->CommitSettingsAndInit();
|
||||
@@ -885,28 +887,6 @@ Device::initializeHeapResources()
|
||||
}
|
||||
}
|
||||
|
||||
// Delay compilation due to brig_loader memory allocation
|
||||
const char* scheduler = nullptr;
|
||||
const char* ocl20 = nullptr;
|
||||
#if !defined(WITH_LIGHTNING_COMPILER)
|
||||
std::string sch = SchedulerSourceCode;
|
||||
if (settings().oclVersion_ == OpenCL20) {
|
||||
size_t loc = sch.find("%s");
|
||||
sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
|
||||
scheduler = sch.c_str();
|
||||
ocl20 = "-cl-std=CL2.0";
|
||||
}
|
||||
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
||||
blitProgram_ = new BlitProgram(context_);
|
||||
// Create blit programs
|
||||
if (blitProgram_ == nullptr ||
|
||||
!blitProgram_->create(this, scheduler, ocl20)) {
|
||||
delete blitProgram_;
|
||||
blitProgram_ = nullptr;
|
||||
LogError("Couldn't create blit kernels!");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create a synchronized transfer queue
|
||||
xferQueue_ = new VirtualGPU(*this);
|
||||
if (!(xferQueue_ && xferQueue_->create(
|
||||
@@ -2176,6 +2156,35 @@ Device::updateFreeMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free)
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
Device::createBlitProgram()
|
||||
{
|
||||
bool result = true;
|
||||
|
||||
// Delayed compilation due to brig_loader memory allocation
|
||||
const char* scheduler = nullptr;
|
||||
const char* ocl20 = nullptr;
|
||||
#if !defined(WITH_LIGHTNING_COMPILER)
|
||||
std::string sch = SchedulerSourceCode;
|
||||
if (settings().oclVersion_ == OpenCL20) {
|
||||
size_t loc = sch.find("%s");
|
||||
sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
|
||||
scheduler = sch.c_str();
|
||||
ocl20 = "-cl-std=CL2.0";
|
||||
}
|
||||
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
||||
blitProgram_ = new BlitProgram(context_);
|
||||
// Create blit programs
|
||||
if (blitProgram_ == nullptr ||
|
||||
!blitProgram_->create(this, scheduler, ocl20)) {
|
||||
delete blitProgram_;
|
||||
blitProgram_ = nullptr;
|
||||
LogError("Couldn't create blit kernels!");
|
||||
result = false;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void
|
||||
Device::SrdManager::fillResourceList(std::vector<const Memory*>& memList)
|
||||
{
|
||||
|
||||
@@ -482,6 +482,9 @@ public:
|
||||
bool free //!< TRUE if runtime frees memory
|
||||
);
|
||||
|
||||
//! Create internal blit program
|
||||
bool createBlitProgram();
|
||||
|
||||
//! Interop for GL device
|
||||
bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const;
|
||||
bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const;
|
||||
|
||||
@@ -394,22 +394,12 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
|
||||
if (!prog().isNull()) {
|
||||
code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
|
||||
Resource::MemoryType type = Resource::Local;
|
||||
if (flags_.internalKernel_) {
|
||||
type = Resource::RemoteUSWC;
|
||||
}
|
||||
|
||||
// Initialize kernel ISA code
|
||||
if (code_ && code_->create(type)) {
|
||||
if (flags_.internalKernel_) {
|
||||
address cpuCodePtr = static_cast<address>(code_->map(nullptr, Resource::WriteOnly));
|
||||
// Copy only amd_kernel_code_t
|
||||
memcpy(cpuCodePtr, reinterpret_cast<address>(akc), codeSize_);
|
||||
code_->unmap(nullptr);
|
||||
}
|
||||
else {
|
||||
static_cast<const KernelBlitManager&>(dev().xferMgr()).writeRawData(
|
||||
*code_, codeSize_, reinterpret_cast<void*>(akc));
|
||||
}
|
||||
constexpr bool WaitForUpload = true;
|
||||
code_->writeRawData(*code_->dev().xferQueue(), 0, codeSize_,
|
||||
reinterpret_cast<void*>(akc), WaitForUpload);
|
||||
}
|
||||
else {
|
||||
LogError("Failed to allocate ISA code!");
|
||||
|
||||
@@ -469,8 +469,6 @@ HSAILProgram::linkImpl(amd::option::Options* options)
|
||||
aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
|
||||
bool finalize = true;
|
||||
bool hsaLoad = true;
|
||||
internal_ = (compileOptions_.find("-cl-internal-kernel") !=
|
||||
std::string::npos) ? true : false;
|
||||
|
||||
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
|
||||
if (!binaryElf_) {
|
||||
@@ -936,7 +934,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
|
||||
assert(size);
|
||||
assert(align);
|
||||
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
|
||||
if (program_->isNull() || program_->isInternal()) {
|
||||
if (program_->isNull()) {
|
||||
return new char[size];
|
||||
}
|
||||
|
||||
@@ -962,19 +960,20 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src,
|
||||
if (0 == size) {
|
||||
return true;
|
||||
}
|
||||
if (program_->isNull() || program_->isInternal()) {
|
||||
if (program_->isNull()) {
|
||||
memcpy(reinterpret_cast<address>(dst) + offset, src, size);
|
||||
return true;
|
||||
}
|
||||
assert(program_->dev().xferQueue());
|
||||
pal::Memory* mem = reinterpret_cast<pal::Memory*>(dst);
|
||||
return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
|
||||
constexpr bool WaitForCopy = true;
|
||||
mem->writeRawData(*mem->dev().xferQueue(), offset, size, src, WaitForCopy);
|
||||
return true;
|
||||
}
|
||||
|
||||
void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size)
|
||||
{
|
||||
if (program_->isNull() || program_->isInternal()) {
|
||||
if (program_->isNull()) {
|
||||
delete[] reinterpret_cast<char*>(ptr);
|
||||
}
|
||||
else {
|
||||
@@ -1020,9 +1019,6 @@ LightningProgram::linkImpl(amd::option::Options *options)
|
||||
{
|
||||
using namespace amd::opencl_driver;
|
||||
|
||||
internal_ = (compileOptions_.find("-cl-internal-kernel") !=
|
||||
std::string::npos) ? true : false;
|
||||
|
||||
aclType continueCompileFrom = llvmBinary_.empty()
|
||||
? getNextCompilationStageFromBinary(options)
|
||||
: ACL_TYPE_LLVMIR_BINARY;
|
||||
|
||||
@@ -182,9 +182,6 @@ public:
|
||||
//! Returns TRUE if the program just compiled
|
||||
bool isNull() const { return isNull_; }
|
||||
|
||||
//! Returns TRUE if the program is used internally by the runtime
|
||||
bool isInternal() const { return internal_; }
|
||||
|
||||
//! Returns TRUE if the program contains static samplers
|
||||
bool isStaticSampler() const { return (staticSamplers_.size() != 0); }
|
||||
|
||||
@@ -278,7 +275,6 @@ protected:
|
||||
union {
|
||||
struct {
|
||||
uint32_t isNull_ : 1; //!< Null program no memory allocations
|
||||
uint32_t internal_ : 1; //!< Internal blit program
|
||||
};
|
||||
uint32_t flags_; //!< Program flags
|
||||
};
|
||||
|
||||
@@ -1118,14 +1118,18 @@ Resource::writeRawData(
|
||||
gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast<const uint32_t*>(data));
|
||||
gpu.eventEnd(MainEngine, event);
|
||||
|
||||
setBusy(gpu, event);
|
||||
// Update the global GPU event
|
||||
gpu.setGpuEvent(event, false);
|
||||
|
||||
if (waitForEvent) {
|
||||
//! @note: We don't really have to mark the allocations as busy
|
||||
//! if we are waiting for a transfer
|
||||
|
||||
// Wait for event to complete
|
||||
gpu.waitForEvent(&event);
|
||||
}
|
||||
else {
|
||||
setBusy(gpu, event);
|
||||
// Update the global GPU event
|
||||
gpu.setGpuEvent(event, false);
|
||||
}
|
||||
}
|
||||
static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement)
|
||||
{
|
||||
|
||||
@@ -63,7 +63,7 @@ VirtualGPU::Queue::Create(
|
||||
qCreateInfo.numReservedCu = rtCU;
|
||||
qCreateInfo.engineIndex = 0x0;
|
||||
cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute;
|
||||
cmdCreateInfo.flags.rtCu = true;
|
||||
cmdCreateInfo.flags.realtimeComputeUnits = true;
|
||||
}
|
||||
*/
|
||||
// Find queue object size
|
||||
@@ -801,28 +801,6 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
return false;
|
||||
}
|
||||
|
||||
// Choose the appropriate class for blit engine
|
||||
switch (dev().settings().blitEngine_) {
|
||||
default:
|
||||
// Fall through ...
|
||||
case Settings::BlitEngineHost:
|
||||
blitSetup.disableAll();
|
||||
// Fall through ...
|
||||
case Settings::BlitEngineCAL:
|
||||
case Settings::BlitEngineKernel:
|
||||
// use host blit for HW debug
|
||||
if (dev().settings().enableHwDebug_) {
|
||||
blitSetup.disableCopyImageToBuffer_ = true;
|
||||
blitSetup.disableCopyBufferToImage_ = true;
|
||||
}
|
||||
blitMgr_ = new KernelBlitManager(*this, blitSetup);
|
||||
break;
|
||||
}
|
||||
if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
|
||||
LogError("Could not create BlitManager!");
|
||||
return false;
|
||||
}
|
||||
|
||||
tsCache_ = new TimeStampCache(*this);
|
||||
if (nullptr == tsCache_) {
|
||||
LogError("Could not create TimeStamp cache!");
|
||||
@@ -846,6 +824,28 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
return false;
|
||||
}
|
||||
|
||||
// Choose the appropriate class for blit engine
|
||||
switch (dev().settings().blitEngine_) {
|
||||
default:
|
||||
// Fall through ...
|
||||
case Settings::BlitEngineHost:
|
||||
blitSetup.disableAll();
|
||||
// Fall through ...
|
||||
case Settings::BlitEngineCAL:
|
||||
case Settings::BlitEngineKernel:
|
||||
// use host blit for HW debug
|
||||
if (dev().settings().enableHwDebug_) {
|
||||
blitSetup.disableCopyImageToBuffer_ = true;
|
||||
blitSetup.disableCopyBufferToImage_ = true;
|
||||
}
|
||||
blitMgr_ = new KernelBlitManager(*this, blitSetup);
|
||||
break;
|
||||
}
|
||||
if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
|
||||
LogError("Could not create BlitManager!");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user