P4 to Git Change 1196584 by gandryey@gera-dev-w7 on 2015/10/02 14:01:41
SWDEV-77880 - Enable memory dependency tracking for HSAIL path
- Use read/write attribute provided by the compiler for async execution optimization

Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#299 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#118 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#384 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#136 edit
This commit is contained in:
@@ -2893,6 +2893,23 @@ GetHSAILArgAlignment(const aclArgData* argInfo)
|
||||
}
|
||||
}
|
||||
|
||||
inline static HSAIL_ACCESS_TYPE
|
||||
GetHSAILArgAccessType(const aclArgData* argInfo)
|
||||
{
|
||||
if (argInfo->type == ARG_TYPE_POINTER) {
|
||||
switch (argInfo->arg.pointer.type) {
|
||||
case ACCESS_TYPE_RO:
|
||||
return HSAIL_ACCESS_TYPE_RO;
|
||||
case ACCESS_TYPE_WO:
|
||||
return HSAIL_ACCESS_TYPE_WO;
|
||||
case ACCESS_TYPE_RW:
|
||||
default:
|
||||
return HSAIL_ACCESS_TYPE_RW;
|
||||
}
|
||||
}
|
||||
return HSAIL_ACCESS_TYPE_NONE;
|
||||
}
|
||||
|
||||
inline static HSAIL_ADDRESS_QUALIFIER
|
||||
GetHSAILAddrQual(const aclArgData* argInfo)
|
||||
{
|
||||
@@ -3268,6 +3285,7 @@ HSAILKernel::initHsailArgs(const aclArgData* aclArg)
|
||||
(aclArg->arg.value.data != DATATYPE_struct)) ?
|
||||
aclArg->arg.value.numElements : 1;
|
||||
arg->alignment_ = GetHSAILArgAlignment(aclArg);
|
||||
arg->access_ = GetHSAILArgAccessType(aclArg);
|
||||
offset += GetHSAILArgSize(aclArg);
|
||||
arguments_.push_back(arg);
|
||||
}
|
||||
|
||||
@@ -807,6 +807,12 @@ enum HSAIL_DATA_TYPE{
|
||||
HSAIL_DATATYPE_MAX_TYPES
|
||||
};
|
||||
|
||||
enum HSAIL_ACCESS_TYPE {
|
||||
HSAIL_ACCESS_TYPE_NONE = 0,
|
||||
HSAIL_ACCESS_TYPE_RO,
|
||||
HSAIL_ACCESS_TYPE_WO,
|
||||
HSAIL_ACCESS_TYPE_RW
|
||||
};
|
||||
|
||||
class HSAILKernel : public device::Kernel
|
||||
{
|
||||
@@ -822,6 +828,7 @@ public:
|
||||
HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument
|
||||
HSAIL_DATA_TYPE dataType_; //!< The type of data
|
||||
uint numElem_; //!< Number of elements
|
||||
HSAIL_ACCESS_TYPE access_; //!< Access type for the argument
|
||||
};
|
||||
|
||||
// Max number of possible extra (hidden) kernel arguments
|
||||
|
||||
@@ -1665,54 +1665,17 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
std::vector<const Memory*> memList;
|
||||
|
||||
bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true:false;
|
||||
if (!printfDbgHSA().init(*this, printfEnabled )){
|
||||
if (!printfDbgHSA().init(*this, printfEnabled )) {
|
||||
LogError( "Printf debug buffer initialization failed!");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool deviceSupportFGS = 0 != (dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
|
||||
bool supportFineGrainedSystem = deviceSupportFGS;
|
||||
FGSStatus status = kernel.parameters().getSvmSystemPointersSupport();
|
||||
switch (status) {
|
||||
case FGS_YES:
|
||||
if (!deviceSupportFGS) {
|
||||
return false;
|
||||
}
|
||||
supportFineGrainedSystem = true;
|
||||
break;
|
||||
case FGS_NO:
|
||||
supportFineGrainedSystem = false;
|
||||
break;
|
||||
case FGS_DEFAULT:
|
||||
default:
|
||||
break;
|
||||
// Check memory dependency and SVM objects
|
||||
if (!processMemObjectsHSA(kernel, parameters, nativeMem, &memList)) {
|
||||
LogError("Wrong memory objects!");
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t count = kernel.parameters().getNumberOfSvmPtr();
|
||||
size_t execInfoOffset = kernel.parameters().getExecInfoOffset();
|
||||
amd::Memory* memory = NULL;
|
||||
//get svm non arugment information
|
||||
void* const* svmPtrArray = reinterpret_cast<void* const*>(parameters + execInfoOffset);
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
|
||||
if (NULL == memory) {
|
||||
if (!supportFineGrainedSystem) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
Memory* gpuMemory = dev().getGpuMemory(memory);
|
||||
if (NULL != gpuMemory) {
|
||||
memList.push_back(gpuMemory);
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check memory dependency and cache coherency
|
||||
processMemObjectsHSA(kernel, parameters, nativeMem);
|
||||
cal_.memCount_ = 0;
|
||||
|
||||
if (hsaKernel.dynamicParallelism()) {
|
||||
@@ -3217,22 +3180,79 @@ VirtualGPU::profileEvent(EngineType engine, bool type) const
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
bool
|
||||
VirtualGPU::processMemObjectsHSA(
|
||||
const amd::Kernel& kernel,
|
||||
const_address params,
|
||||
bool nativeMem)
|
||||
bool nativeMem,
|
||||
std::vector<const Memory*>* memList)
|
||||
{
|
||||
static const bool NoAlias = true;
|
||||
const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>
|
||||
(*(kernel.getDeviceKernel(dev(), NoAlias)));
|
||||
const amd::KernelSignature& signature = kernel.signature();
|
||||
const amd::KernelParameters& kernelParams = kernel.parameters();
|
||||
|
||||
// Mark the tracker with a new kernel,
|
||||
// so we can avoid checks of the aliased objects
|
||||
memoryDependency().newKernel();
|
||||
|
||||
const amd::KernelSignature& signature = kernel.signature();
|
||||
const amd::KernelParameters& kernelParams = kernel.parameters();
|
||||
bool deviceSupportFGS = 0 != (dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
|
||||
bool supportFineGrainedSystem = deviceSupportFGS;
|
||||
FGSStatus status = kernelParams.getSvmSystemPointersSupport();
|
||||
switch (status) {
|
||||
case FGS_YES:
|
||||
if (!deviceSupportFGS) {
|
||||
return false;
|
||||
}
|
||||
supportFineGrainedSystem = true;
|
||||
break;
|
||||
case FGS_NO:
|
||||
supportFineGrainedSystem = false;
|
||||
break;
|
||||
case FGS_DEFAULT:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
size_t count = kernelParams.getNumberOfSvmPtr();
|
||||
size_t execInfoOffset = kernelParams.getExecInfoOffset();
|
||||
bool sync = true;
|
||||
|
||||
amd::Memory* memory = NULL;
|
||||
//get svm non arugment information
|
||||
void* const* svmPtrArray =
|
||||
reinterpret_cast<void* const*>(params + execInfoOffset);
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
|
||||
if (NULL == memory) {
|
||||
if (!supportFineGrainedSystem) {
|
||||
return false;
|
||||
}
|
||||
else if (sync) {
|
||||
flushCUCaches();
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
memoryDependency().clear(!All);
|
||||
}
|
||||
}
|
||||
else {
|
||||
Memory* gpuMemory = dev().getGpuMemory(memory);
|
||||
if (NULL != gpuMemory) {
|
||||
// Synchronize data with other memory instances if necessary
|
||||
gpuMemory->syncCacheFromHost(*this);
|
||||
|
||||
const static bool IsReadOnly = false;
|
||||
// Validate SVM passed in the non argument list
|
||||
memoryDependency().validate(*this, gpuMemory, IsReadOnly);
|
||||
|
||||
memList->push_back(gpuMemory);
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check all parameters for the current kernel
|
||||
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
||||
@@ -3248,9 +3268,10 @@ VirtualGPU::processMemObjectsHSA(
|
||||
svmMem = amd::SvmManager::FindSvmBuffer(
|
||||
*reinterpret_cast<void* const*>(params + desc.offset_));
|
||||
if (!svmMem) {
|
||||
//!\todo Do we have to sync cache coherency or wait for SDMA?
|
||||
flushCUCaches();
|
||||
break;
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
memoryDependency().clear(!All);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3271,10 +3292,11 @@ VirtualGPU::processMemObjectsHSA(
|
||||
}
|
||||
|
||||
if (memory != NULL) {
|
||||
//!@todo The code below can handle images only,
|
||||
//! but the qualifier is broken anyway
|
||||
// Check image
|
||||
readOnly = (desc.accessQualifier_ ==
|
||||
CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
|
||||
// Check buffer
|
||||
readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
|
||||
// Validate memory for a dependency in the queue
|
||||
memoryDependency().validate(*this, memory, readOnly);
|
||||
}
|
||||
@@ -3286,6 +3308,8 @@ VirtualGPU::processMemObjectsHSA(
|
||||
// Validate global store for a dependency in the queue
|
||||
memoryDependency().validate(*this, mem, IsReadOnly);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
amd::Memory*
|
||||
|
||||
@@ -463,10 +463,11 @@ private:
|
||||
);
|
||||
|
||||
//! Detects memory dependency for HSAIL kernels and flushes caches
|
||||
void processMemObjectsHSA(
|
||||
bool processMemObjectsHSA(
|
||||
const amd::Kernel& kernel, //!< AMD kernel object for execution
|
||||
const_address params, //!< Pointer to the param's store
|
||||
bool nativeMem //!< Native memory objects
|
||||
bool nativeMem, //!< Native memory objects
|
||||
std::vector<const Memory*>* memList //!< Memory list for KMD tracking
|
||||
);
|
||||
|
||||
//! Common function for fill memory used by both svm Fill and non-svm fill
|
||||
|
||||
Viittaa uudessa ongelmassa
Block a user