P4 to Git Change 1196584 by gandryey@gera-dev-w7 on 2015/10/02 14:01:41

SWDEV-77880 - Enable memory dependency tracking for HSAIL path
	- Use read/write attribute provided by the compiler for async execution optimization

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#299 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#118 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#384 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#136 edit
This commit is contained in:
foreman
2015-10-02 14:19:07 -04:00
vanhempi f5c30d2ca0
commit f8648f412d
4 muutettua tiedostoa jossa 102 lisäystä ja 52 poistoa
@@ -2893,6 +2893,23 @@ GetHSAILArgAlignment(const aclArgData* argInfo)
}
}
//! Maps the access type reported for a kernel argument onto HSAIL_ACCESS_TYPE.
//!
//! \param argInfo  Argument description to inspect
//! \return HSAIL_ACCESS_TYPE_NONE for any non-pointer argument; for pointer
//!         arguments the matching RO/WO value, with every other pointer
//!         access type reported as HSAIL_ACCESS_TYPE_RW
inline static HSAIL_ACCESS_TYPE
GetHSAILArgAccessType(const aclArgData* argInfo)
{
    // Only pointer arguments carry an access type
    if (argInfo->type != ARG_TYPE_POINTER) {
        return HSAIL_ACCESS_TYPE_NONE;
    }

    if (argInfo->arg.pointer.type == ACCESS_TYPE_RO) {
        return HSAIL_ACCESS_TYPE_RO;
    }
    if (argInfo->arg.pointer.type == ACCESS_TYPE_WO) {
        return HSAIL_ACCESS_TYPE_WO;
    }

    // ACCESS_TYPE_RW and any unrecognized value are treated as read-write
    return HSAIL_ACCESS_TYPE_RW;
}
inline static HSAIL_ADDRESS_QUALIFIER
GetHSAILAddrQual(const aclArgData* argInfo)
{
@@ -3268,6 +3285,7 @@ HSAILKernel::initHsailArgs(const aclArgData* aclArg)
(aclArg->arg.value.data != DATATYPE_struct)) ?
aclArg->arg.value.numElements : 1;
arg->alignment_ = GetHSAILArgAlignment(aclArg);
arg->access_ = GetHSAILArgAccessType(aclArg);
offset += GetHSAILArgSize(aclArg);
arguments_.push_back(arg);
}
@@ -807,6 +807,12 @@ enum HSAIL_DATA_TYPE{
HSAIL_DATATYPE_MAX_TYPES
};
//! Access type of a kernel argument
enum HSAIL_ACCESS_TYPE {
    HSAIL_ACCESS_TYPE_NONE = 0,     //!< No access type
    HSAIL_ACCESS_TYPE_RO   = 1,     //!< Read-only access
    HSAIL_ACCESS_TYPE_WO   = 2,     //!< Write-only access
    HSAIL_ACCESS_TYPE_RW   = 3      //!< Read-write access
};
class HSAILKernel : public device::Kernel
{
@@ -822,6 +828,7 @@ public:
HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument
HSAIL_DATA_TYPE dataType_; //!< The type of data
uint numElem_; //!< Number of elements
HSAIL_ACCESS_TYPE access_; //!< Access type for the argument
};
// Max number of possible extra (hidden) kernel arguments
+74 -50
Näytä tiedosto
@@ -1665,54 +1665,17 @@ VirtualGPU::submitKernelInternalHSA(
std::vector<const Memory*> memList;
bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true:false;
if (!printfDbgHSA().init(*this, printfEnabled )){
if (!printfDbgHSA().init(*this, printfEnabled )) {
LogError( "Printf debug buffer initialization failed!");
return false;
}
bool deviceSupportFGS = 0 != (dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
bool supportFineGrainedSystem = deviceSupportFGS;
FGSStatus status = kernel.parameters().getSvmSystemPointersSupport();
switch (status) {
case FGS_YES:
if (!deviceSupportFGS) {
return false;
}
supportFineGrainedSystem = true;
break;
case FGS_NO:
supportFineGrainedSystem = false;
break;
case FGS_DEFAULT:
default:
break;
// Check memory dependency and SVM objects
if (!processMemObjectsHSA(kernel, parameters, nativeMem, &memList)) {
LogError("Wrong memory objects!");
return false;
}
size_t count = kernel.parameters().getNumberOfSvmPtr();
size_t execInfoOffset = kernel.parameters().getExecInfoOffset();
amd::Memory* memory = NULL;
// Get SVM non-argument information
void* const* svmPtrArray = reinterpret_cast<void* const*>(parameters + execInfoOffset);
for (size_t i = 0; i < count; i++) {
memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
if (NULL == memory) {
if (!supportFineGrainedSystem) {
return false;
}
}
else {
Memory* gpuMemory = dev().getGpuMemory(memory);
if (NULL != gpuMemory) {
memList.push_back(gpuMemory);
}
else {
return false;
}
}
}
// Check memory dependency and cache coherency
processMemObjectsHSA(kernel, parameters, nativeMem);
cal_.memCount_ = 0;
if (hsaKernel.dynamicParallelism()) {
@@ -3217,22 +3180,79 @@ VirtualGPU::profileEvent(EngineType engine, bool type) const
}
}
void
bool
VirtualGPU::processMemObjectsHSA(
const amd::Kernel& kernel,
const_address params,
bool nativeMem)
bool nativeMem,
std::vector<const Memory*>* memList)
{
static const bool NoAlias = true;
const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>
(*(kernel.getDeviceKernel(dev(), NoAlias)));
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
// Mark the tracker with a new kernel,
// so we can avoid checks of the aliased objects
memoryDependency().newKernel();
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
bool deviceSupportFGS = 0 != (dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
bool supportFineGrainedSystem = deviceSupportFGS;
FGSStatus status = kernelParams.getSvmSystemPointersSupport();
switch (status) {
case FGS_YES:
if (!deviceSupportFGS) {
return false;
}
supportFineGrainedSystem = true;
break;
case FGS_NO:
supportFineGrainedSystem = false;
break;
case FGS_DEFAULT:
default:
break;
}
size_t count = kernelParams.getNumberOfSvmPtr();
size_t execInfoOffset = kernelParams.getExecInfoOffset();
bool sync = true;
amd::Memory* memory = NULL;
// Get SVM non-argument information
void* const* svmPtrArray =
reinterpret_cast<void* const*>(params + execInfoOffset);
for (size_t i = 0; i < count; i++) {
memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
if (NULL == memory) {
if (!supportFineGrainedSystem) {
return false;
}
else if (sync) {
flushCUCaches();
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
}
}
else {
Memory* gpuMemory = dev().getGpuMemory(memory);
if (NULL != gpuMemory) {
// Synchronize data with other memory instances if necessary
gpuMemory->syncCacheFromHost(*this);
const static bool IsReadOnly = false;
// Validate SVM passed in the non argument list
memoryDependency().validate(*this, gpuMemory, IsReadOnly);
memList->push_back(gpuMemory);
}
else {
return false;
}
}
}
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
@@ -3248,9 +3268,10 @@ VirtualGPU::processMemObjectsHSA(
svmMem = amd::SvmManager::FindSvmBuffer(
*reinterpret_cast<void* const*>(params + desc.offset_));
if (!svmMem) {
//!\todo Do we have to sync cache coherency or wait for SDMA?
flushCUCaches();
break;
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
}
}
@@ -3271,10 +3292,11 @@ VirtualGPU::processMemObjectsHSA(
}
if (memory != NULL) {
//!@todo The code below can handle images only,
//! but the qualifier is broken anyway
// Check image
readOnly = (desc.accessQualifier_ ==
CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
// Check buffer
readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
// Validate memory for a dependency in the queue
memoryDependency().validate(*this, memory, readOnly);
}
@@ -3286,6 +3308,8 @@ VirtualGPU::processMemObjectsHSA(
// Validate global store for a dependency in the queue
memoryDependency().validate(*this, mem, IsReadOnly);
}
return true;
}
amd::Memory*
@@ -463,10 +463,11 @@ private:
);
//! Detects memory dependency for HSAIL kernels and flushes caches
void processMemObjectsHSA(
bool processMemObjectsHSA(
const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params, //!< Pointer to the param's store
bool nativeMem //!< Native memory objects
bool nativeMem, //!< Native memory objects
std::vector<const Memory*>* memList //!< Memory list for KMD tracking
);
//! Common function for fill memory used by both svm Fill and non-svm fill