P4 to Git Change 1271191 by gandryey@gera-w8 on 2016/05/19 18:42:50

SWDEV-86035 - Add PAL backend to OpenCL
	- Adds SDMA workaround for pagefault
	- Removes directSRD and hsail flags
	- Fixes a BSOD with the latest PAL on Fiji. KMD expects a valid UMD client

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#315 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#225 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#345 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#96 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/os/win/winPlatform.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#254 edit
Этот коммит содержится в:
foreman
2016-05-19 18:59:20 -04:00
родитель 4e2cfbc872
Коммит ffcdad85e5
11 изменённых файлов: 114 добавлений и 162 удалений
+18 -31
Просмотреть файл
@@ -3889,30 +3889,23 @@ HSAILKernel::loadArguments(
//! \note syncCache may call DRM transfer
image->wait(gpu, WaitOnBusyEngine);
if (dev().settings().hsailDirectSRD_) {
// Image arguments are of size 48 bytes and aligned to 16 bytes
WriteAqlArg(&aqlArgBuf, image->hwState(),
HsaImageObjectSize, HsaImageObjectAlignment);
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current structre into CB1
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
memList.push_back(cb);
}
else {
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current structre into CB1
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
memList.push_back(cb);
}
else {
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
//! @todo Compiler has to return read/write attributes
@@ -3929,15 +3922,9 @@ HSAILKernel::loadArguments(
*reinterpret_cast<amd::Sampler* const*>(paramaddr);
const Sampler* gpuSampler = static_cast<Sampler*>
(sampler->getDeviceSampler(dev()));
if (dev().settings().hsailDirectSRD_) {
WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
HsaSamplerObjectSize, HsaSamplerObjectAlignment);
}
else {
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
break;
}
case HSAIL_ARGTYPE_QUEUE: {
-1
Просмотреть файл
@@ -2494,7 +2494,6 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
assert(false);
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
assert(!program_->dev().settings().hsailDirectSRD_);
gpu::Sampler* sampler = new gpu::Sampler(program_->dev());
if (!sampler || !sampler->create(state)) {
delete sampler;
-3
Просмотреть файл
@@ -135,9 +135,6 @@ Settings::Settings()
// Don't support platform atomics by default.
svmAtomics_ = false;
// Use direct SRD by default
hsailDirectSRD_ = GPU_DIRECT_SRD;
// Use host queue for device enqueuing by default
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
+1 -2
Просмотреть файл
@@ -69,10 +69,9 @@ public:
uint svmFineGrainSystem_: 1; //!< SVM fine grain system support
uint apuSystem_: 1; //!< Device is APU system with shared memory
uint asyncMemCopy_: 1; //!< Use async memory transfers
uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
uint useDeviceQueue_: 1; //!< Submit to separate device queue
uint singleFpDenorm_: 1; //!< Support Single FP Denorm
uint reserved_: 3;
uint reserved_: 4;
};
uint value_;
};
+39 -54
Просмотреть файл
@@ -149,19 +149,14 @@ NullDevice::create(Pal::GfxIpLevel ipLevel)
device::Program*
NullDevice::createProgram(amd::option::Options* options)
{
device::Program* nullProgram;
if (settings().hsail_) {
nullProgram = new HSAILProgram(*this);
}
else {
// AMDIL path
ShouldNotReachHere();
}
if (nullProgram == nullptr) {
device::Program* program;
program = new HSAILProgram(*this);
if (program == nullptr) {
LogError("Memory allocation has failed!");
}
return nullProgram;
return program;
}
void NullDevice::fillDeviceInfo(
@@ -643,19 +638,26 @@ Device::create(Pal::IDevice* device)
// Update HW info for the device
hwInfo_ = &DeviceInfo[static_cast<uint>(properties().revision)];
// Find the number of available engines
numComputeEngines_ =
properties().engineProperties[Pal::QueueTypeCompute].engineCount;
numDmaEngines_ =
properties().engineProperties[Pal::QueueTypeDma].engineCount;
Pal::PalPublicSettings*const palSettings = iDev()->GetPublicSettings();
// Modify settings here
// palSettings ...
palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
// Commit the new settings for the device
result = iDev()->CommitSettingsAndInit();
if (result == Pal::Result::Success) {
Pal::DeviceFinalizeInfo finalizeInfo = {};
// Request 2 compute engines
finalizeInfo.engineCounts[Pal::QueueTypeCompute] = 2;
// Request 2 SDMA engines
finalizeInfo.engineCounts[Pal::QueueTypeDma] = 2;
// Request all compute engines
finalizeInfo.engineCounts[Pal::QueueTypeCompute] = numComputeEngines_;
// Request all SDMA engines
finalizeInfo.engineCounts[Pal::QueueTypeDma] = numDmaEngines_;
result = iDev()->Finalize(finalizeInfo);
}
@@ -670,12 +672,6 @@ Device::create(Pal::IDevice* device)
appProfile_.reportAsOCL12Device())) {
return false;
}
// Find the number of available engines
numComputeEngines_ =
properties().engineProperties[Pal::QueueTypeCompute].engineCount;
numDmaEngines_ =
properties().engineProperties[Pal::QueueTypeDma].engineCount;
numComputeEngines_ = std::min(numComputeEngines_, settings().numComputeRings_);
amd::Context::Info info = {0};
@@ -816,22 +812,20 @@ Device::initializeHeapResources()
}
// Delay compilation due to brig_loader memory allocation
if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
const char* scheduler = nullptr;
const char* ocl20 = nullptr;
if (settings().oclVersion_ == OpenCL20) {
scheduler = SchedulerSourceCode;
ocl20 = "-cl-std=CL2.0";
}
blitProgram_ = new BlitProgram(context_);
// Create blit programs
if (blitProgram_ == nullptr ||
!blitProgram_->create(this, scheduler, ocl20)) {
delete blitProgram_;
blitProgram_ = nullptr;
LogError("Couldn't create blit kernels!");
return false;
}
const char* scheduler = nullptr;
const char* ocl20 = nullptr;
if (settings().oclVersion_ == OpenCL20) {
scheduler = SchedulerSourceCode;
ocl20 = "-cl-std=CL2.0";
}
blitProgram_ = new BlitProgram(context_);
// Create blit programs
if (blitProgram_ == nullptr ||
!blitProgram_->create(this, scheduler, ocl20)) {
delete blitProgram_;
blitProgram_ = nullptr;
LogError("Couldn't create blit kernels!");
return false;
}
// Create a synchronized transfer queue
@@ -900,20 +894,13 @@ Device::createVirtualDevice(
device::Program*
Device::createProgram(amd::option::Options* options)
{
device::Program* gpuProgram;
if (settings().hsail_) {
gpuProgram = new HSAILProgram(*this);
}
else {
ShouldNotReachHere();
//AMDIL
//gpuProgram = new Program(*this);
}
if (gpuProgram == nullptr) {
device::Program* program;
program = new HSAILProgram(*this);
if (program == nullptr) {
LogError("We failed memory allocation for program!");
}
return gpuProgram;
return program;
}
//! Requested devices list as configured by the GPU_DEVICE_ORDINAL
@@ -1410,14 +1397,12 @@ bool
Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const
{
*sampler = nullptr;
if (settings().hsail_ || (settings().oclVersion_ >= OpenCL20)) {
Sampler* gpuSampler = new Sampler(*this);
if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) {
delete gpuSampler;
return false;
}
*sampler = gpuSampler;
Sampler* gpuSampler = new Sampler(*this);
if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) {
delete gpuSampler;
return false;
}
*sampler = gpuSampler;
return true;
}
+18 -31
Просмотреть файл
@@ -1063,30 +1063,23 @@ HSAILKernel::loadArguments(
//! \note syncCache may call DRM transfer
image->wait(gpu, WaitOnBusyEngine);
if (dev().settings().hsailDirectSRD_) {
// Image arguments are of size 48 bytes and aligned to 16 bytes
WriteAqlArg(&aqlArgBuf, image->hwState(),
HsaImageObjectSize, HsaImageObjectAlignment);
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current structre into CB1
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
memList.push_back(cb);
}
else {
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current structre into CB1
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
memList.push_back(cb);
}
else {
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
//! @todo Compiler has to return read/write attributes
@@ -1103,15 +1096,9 @@ HSAILKernel::loadArguments(
*reinterpret_cast<amd::Sampler* const*>(paramaddr);
const Sampler* gpuSampler = static_cast<Sampler*>
(sampler->getDeviceSampler(dev()));
if (dev().settings().hsailDirectSRD_) {
WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
HsaSamplerObjectSize, HsaSamplerObjectAlignment);
}
else {
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
break;
}
case HSAIL_ARGTYPE_QUEUE: {
-1
Просмотреть файл
@@ -844,7 +844,6 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
assert(false);
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
assert(!program_->dev().settings().hsailDirectSRD_);
pal::Sampler* sampler = new pal::Sampler(program_->dev());
if (!sampler || !sampler->create(state)) {
delete sampler;
+6 -2
Просмотреть файл
@@ -1178,7 +1178,9 @@ Resource::partialMemCopyTo(
// Make sure linear pitch in bytes is 4 bytes aligned
if (((copyRegion.gpuMemoryRowPitch % 4) != 0) ||
// another DRM restriciton... SI has 4 pixels
(copyRegion.gpuMemoryOffset % 4 != 0)) {
(copyRegion.gpuMemoryOffset % 4 != 0) ||
(dev().settings().sdamPageFaultWar_ &&
(copyRegion.imageOffset.x % dstResource.elementSize() != 0))) {
result = false;
}
else {
@@ -1204,7 +1206,9 @@ Resource::partialMemCopyTo(
// Make sure linear pitch in bytes is 4 bytes aligned
if (((copyRegion.gpuMemoryRowPitch % 4) != 0) ||
// another DRM restriciton... SI has 4 pixels
(copyRegion.gpuMemoryOffset % 4 != 0)) {
(copyRegion.gpuMemoryOffset % 4 != 0) ||
(dev().settings().sdamPageFaultWar_ &&
(copyRegion.imageOffset.x % elementSize() != 0))) {
result = false;
}
else {
+8 -10
Просмотреть файл
@@ -115,20 +115,17 @@ Settings::Settings()
numDeviceEvents_ = 1024;
numWaitEvents_ = 8;
// Disable HSAIL by default
hsail_ = false;
// Don't support platform atomics by default.
svmAtomics_ = false;
// Use direct SRD by default
hsailDirectSRD_ = GPU_DIRECT_SRD;
// Use host queue for device enqueuing by default
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
// Don't support Denormals for single precision by default
singleFpDenorm_ = false;
// Disable SDMA workaround by default
sdamPageFaultWar_ = false;
}
bool
@@ -179,6 +176,9 @@ Settings::create(
// Keep this false even though we have support
// singleFpDenorm_ = true;
viPlus_ = true;
// SDMA may have memory access outside of
// the valid buffer range and cause a page fault
sdamPageFaultWar_ = true;
// Fall through to CI ...
case Pal::AsicRevision::Kalindi:
case Pal::AsicRevision::Spectre:
@@ -193,7 +193,6 @@ Settings::create(
case Pal::AsicRevision::Bonaire:
case Pal::AsicRevision::Hawaii:
ciPlus_ = true;
hsail_ = true;
threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE;
reportFMAF_ = false;
if (palProp.revision == Pal::AsicRevision::Hawaii) {
@@ -228,11 +227,10 @@ Settings::create(
// This needs to be cleaned once 64bit addressing is stable
if (oclVersion_ < OpenCL20) {
use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false,
/*calAttr.isWorkstation ||*/ hsail_) : GPU_FORCE_64BIT_PTR;
/*calAttr.isWorkstation ||*/ true) : GPU_FORCE_64BIT_PTR;
}
else {
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
|| (oclVersion_ >= OpenCL20)))) {
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
use64BitPtr_ = true;
}
}
+24 -25
Просмотреть файл
@@ -50,56 +50,55 @@ public:
uint disablePersistent_: 1; //!< Disables using persistent memory for staging
uint imageSupport_: 1; //!< Report images support
uint doublePrecision_: 1; //!< Enables double precision support
uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program
uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program
uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU
uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program
uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program
uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU
uint force32BitOcl20_: 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
uint imageDMA_: 1; //!< Enable direct image DMA transfers
uint syncObject_: 1; //!< Enable syncobject
uint ciPlus_: 1; //!< CI and post CI features
uint viPlus_: 1; //!< VI and post VI features
uint aiPlus_: 1; //!< AI and post AI features
uint imageDMA_: 1; //!< Enable direct image DMA transfers
uint syncObject_: 1; //!< Enable syncobject
uint ciPlus_: 1; //!< CI and post CI features
uint viPlus_: 1; //!< VI and post VI features
uint aiPlus_: 1; //!< AI and post AI features
uint threadTraceEnable_: 1; //!< Thread trace enable
uint linearPersistentImage_: 1; //!< Allocates linear images in persistent
uint useSingleScratch_: 1; //!< Allocates single scratch per device
uint hsail_: 1; //!< Enables HSAIL compilation
uint stagingWritePersistent_: 1; //!< Enables persistent writes
uint svmAtomics_: 1; //!< SVM device atomics
uint svmFineGrainSystem_: 1; //!< SVM fine grain system support
uint apuSystem_: 1; //!< Device is APU system with shared memory
uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
uint useDeviceQueue_: 1; //!< Submit to separate device queue
uint singleFpDenorm_: 1; //!< Support Single FP Denorm
uint reserved_: 5;
uint svmAtomics_: 1; //!< SVM device atomics
uint svmFineGrainSystem_: 1; //!< SVM fine grain system support
uint apuSystem_: 1; //!< Device is APU system with shared memory
uint useDeviceQueue_: 1; //!< Submit to separate device queue
uint singleFpDenorm_: 1; //!< Support Single FP Denorm
uint sdamPageFaultWar_: 1; //!< SDAM page fault workaround
uint reserved_: 7;
};
uint value_;
};
uint oclVersion_; //!< Reported OpenCL version support
uint debugFlags_; //!< Debug GPU flags
size_t stagedXferSize_; //!< Staged buffer size
uint maxRenames_; //!< Maximum number of possible renames
uint maxRenameSize_; //!< Maximum size for all renames
uint hwLDSSize_; //!< HW local data store size
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
uint workloadSplitSize_; //!< Workload split size
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
uint blitEngine_; //!< Blit engine type
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
size_t resourceCacheSize_; //!< Resource cache size in MB
uint64_t maxAllocSize_; //!< Maximum single allocation size
size_t numMemDependencies_;//!< The array size for memory dependencies tracking
uint cacheLineSize_; //!< Cache line size in bytes
uint cacheSize_; //!< L1 cache size in bytes
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
uint numDeviceEvents_; //!< The number of device events
uint numWaitEvents_; //!< The number of wait events for device enqueue
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
size_t stagedXferSize_; //!< Staged buffer size
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
size_t resourceCacheSize_; //!< Resource cache size in MB
size_t numMemDependencies_;//!< The array size for memory dependencies tracking
uint64_t maxAllocSize_; //!< Maximum single allocation size
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
//! Default constructor
Settings();
-2
Просмотреть файл
@@ -139,8 +139,6 @@ release(bool, GPU_HSAIL_ENABLE, LP64_SWITCH(LINUX_SWITCH(false,true),true), \
"Enable HSAIL on dGPU stack (requires CI+ HW)") \
release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
"Prints the specified number of the child kernels") \
release(bool, GPU_DIRECT_SRD, false, \
"Use indirect SRD access in HSAIL") \
release(bool, GPU_USE_DEVICE_QUEUE, false, \
"Use a dedicated device queue for the actual submissions") \
release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \