From 214bb5de7526f983fded7e43be7653ca9b3b69cd Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 26 Oct 2016 13:21:54 -0400 Subject: [PATCH] P4 to Git Change 1332184 by lmoriche@lmoriche_opencl_dev on 2016/10/26 13:13:41 SWDEV-105604 - OpenCL program manager for LC on PAL - Integrate changes from the ROCm device: add generic handling of hidden arguments and flatten vector types. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#16 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#35 edit [ROCm/clr commit: 705822e283768d1ec167daaca8b3c2baf6c72973] --- .../rocclr/runtime/device/pal/palkernel.cpp | 720 ++++++++---------- .../rocclr/runtime/device/pal/palkernel.hpp | 28 +- .../rocclr/runtime/device/pal/palvirtual.cpp | 34 +- 3 files changed, 354 insertions(+), 428 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp index 46d7337ec5..3998b0e4b1 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -28,13 +28,36 @@ namespace pal { inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) { + if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') { + if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) { + return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; + } + else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) { + return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; + } + else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) { + return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; + } + else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) { + return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER; + } + else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) { + return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE; + } + else if (strcmp(&argInfo->argStr[2], 
"aqlwrap_pointer") == 0) { + return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION; + } + return HSAIL_ARGTYPE_HIDDEN_NONE; + } + switch (argInfo->type) { case ARG_TYPE_POINTER: return HSAIL_ARGTYPE_POINTER; case ARG_TYPE_QUEUE: return HSAIL_ARGTYPE_QUEUE; case ARG_TYPE_VALUE: - return HSAIL_ARGTYPE_VALUE; + return (argInfo->arg.value.data == DATATYPE_struct) + ? HSAIL_ARGTYPE_REFERENCE : HSAIL_ARGTYPE_VALUE; case ARG_TYPE_IMAGE: return HSAIL_ARGTYPE_IMAGE; case ARG_TYPE_SAMPLER: @@ -49,28 +72,68 @@ inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) { switch (argInfo->type) { - case ARG_TYPE_POINTER: - return argInfo->arg.pointer.align; - default: + case ARG_TYPE_POINTER: + return sizeof(void*); + case ARG_TYPE_VALUE: + switch (argInfo->arg.value.data) { + case DATATYPE_i8: + case DATATYPE_u8: return 1; + case DATATYPE_u16: + case DATATYPE_i16: + case DATATYPE_f16: + return 2; + case DATATYPE_u32: + case DATATYPE_i32: + case DATATYPE_f32: + return 4; + case DATATYPE_i64: + case DATATYPE_u64: + case DATATYPE_f64: + return 8; + case DATATYPE_struct: + return 128; + case DATATYPE_ERROR: + default: + return -1; + } + case ARG_TYPE_IMAGE: return sizeof(cl_mem); + case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); + default: return -1; } } +inline static size_t +GetHSAILArgPointeeAlignment(const aclArgData* argInfo) +{ + if (argInfo->type == ARG_TYPE_POINTER) { + return argInfo->arg.pointer.align; + } + return 1; +} + inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) { + aclAccessType accessType; + if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.type) { - case ACCESS_TYPE_RO: - return HSAIL_ACCESS_TYPE_RO; - case ACCESS_TYPE_WO: - return HSAIL_ACCESS_TYPE_WO; - case ACCESS_TYPE_RW: - default: - return HSAIL_ACCESS_TYPE_RW; - } + accessType = argInfo->arg.pointer.type; } - return HSAIL_ACCESS_TYPE_NONE; + else if (argInfo->type == ARG_TYPE_IMAGE) { + accessType = argInfo->arg.image.type; + } + else { + return 
HSAIL_ACCESS_TYPE_NONE; + } + if (accessType == ACCESS_TYPE_RO) { + return HSAIL_ACCESS_TYPE_RO; + } + else if (accessType == ACCESS_TYPE_WO) { + return HSAIL_ACCESS_TYPE_WO; + } + + return HSAIL_ACCESS_TYPE_RW; } inline static HSAIL_ADDRESS_QUALIFIER @@ -158,35 +221,28 @@ inline static int GetHSAILArgSize(const aclArgData *argInfo) { switch (argInfo->type) { + case ARG_TYPE_POINTER: return sizeof(void *); case ARG_TYPE_VALUE: - switch (GetHSAILDataType(argInfo)) { - case HSAIL_DATATYPE_B1: - return 1; - case HSAIL_DATATYPE_B8: - case HSAIL_DATATYPE_S8: - case HSAIL_DATATYPE_U8: - return 1; - case HSAIL_DATATYPE_B16: - case HSAIL_DATATYPE_U16: - case HSAIL_DATATYPE_S16: - case HSAIL_DATATYPE_F16: - return 2; - case HSAIL_DATATYPE_B32: - case HSAIL_DATATYPE_U32: - case HSAIL_DATATYPE_S32: - case HSAIL_DATATYPE_F32: - return 4; - case HSAIL_DATATYPE_B64: - case HSAIL_DATATYPE_U64: - case HSAIL_DATATYPE_S64: - case HSAIL_DATATYPE_F64: - return 8; - case HSAIL_DATATYPE_STRUCT: - return argInfo->arg.value.numElements; - default: - return -1; + switch (argInfo->arg.value.data) { + case DATATYPE_i8: + case DATATYPE_u8: + case DATATYPE_struct: + return 1 * argInfo->arg.value.numElements; + case DATATYPE_u16: + case DATATYPE_i16: + case DATATYPE_f16: + return 2 * argInfo->arg.value.numElements; + case DATATYPE_u32: + case DATATYPE_i32: + case DATATYPE_f32: + return 4 * argInfo->arg.value.numElements; + case DATATYPE_i64: + case DATATYPE_u64: + case DATATYPE_f64: + return 8 * argInfo->arg.value.numElements; + case DATATYPE_ERROR: + default: return -1; } - case ARG_TYPE_POINTER: case ARG_TYPE_IMAGE: case ARG_TYPE_SAMPLER: case ARG_TYPE_QUEUE: @@ -197,7 +253,7 @@ GetHSAILArgSize(const aclArgData *argInfo) } inline static clk_value_type_t -GetOclType(const aclArgData* argInfo) +GetOclType(const HSAILKernel::Argument* arg) { static const clk_value_type_t ClkValueMapType[6][6] = { { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, @@ -209,41 +265,53 @@ GetOclType(const 
aclArgData* argInfo) }; uint sizeType; - if (argInfo->type == ARG_TYPE_QUEUE) { + uint numElements; + if (arg->type_ == HSAIL_ARGTYPE_QUEUE) { return T_QUEUE; } - if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) { + else if (arg->type_ == HSAIL_ARGTYPE_POINTER || arg->type_ == HSAIL_ARGTYPE_IMAGE) { return T_POINTER; } - else if (argInfo->type == ARG_TYPE_VALUE) { - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - sizeType = 0; - break; - case DATATYPE_i16: - case DATATYPE_u16: - sizeType = 1; - break; - case DATATYPE_i32: - case DATATYPE_u32: - sizeType = 2; - break; - case DATATYPE_i64: - case DATATYPE_u64: - sizeType = 3; - break; - case DATATYPE_f16: - case DATATYPE_f32: - sizeType = 4; - break; - case DATATYPE_f64: - sizeType = 5; - break; - default: - return T_VOID; + else if (arg->type_ == HSAIL_ARGTYPE_VALUE + || arg->type_ == HSAIL_ARGTYPE_REFERENCE) { + switch (arg->dataType_) { + case HSAIL_DATATYPE_S8: + case HSAIL_DATATYPE_U8: + sizeType = 0; + numElements = arg->size_; + break; + case HSAIL_DATATYPE_S16: + case HSAIL_DATATYPE_U16: + sizeType = 1; + numElements = arg->size_ / 2; + break; + case HSAIL_DATATYPE_S32: + case HSAIL_DATATYPE_U32: + sizeType = 2; + numElements = arg->size_ / 4; + break; + case HSAIL_DATATYPE_S64: + case HSAIL_DATATYPE_U64: + sizeType = 3; + numElements = arg->size_ / 8; + break; + case HSAIL_DATATYPE_F16: + sizeType = 4; + numElements = arg->size_ / 2; + break; + case HSAIL_DATATYPE_F32: + sizeType = 4; + numElements = arg->size_ / 4; + break; + case HSAIL_DATATYPE_F64: + sizeType = 5; + numElements = arg->size_ / 8; + break; + default: + return T_VOID; } - switch (argInfo->arg.value.numElements) { + + switch (numElements) { case 1: return ClkValueMapType[sizeType][0]; case 2: return ClkValueMapType[sizeType][1]; case 3: return ClkValueMapType[sizeType][2]; @@ -253,7 +321,7 @@ GetOclType(const aclArgData* argInfo) default: return T_VOID; } } - else if (argInfo->type == 
ARG_TYPE_SAMPLER) { + else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) { return T_SAMPLER; } else { @@ -262,25 +330,21 @@ GetOclType(const aclArgData* argInfo) } inline static cl_kernel_arg_address_qualifier -GetOclAddrQual(const aclArgData* argInfo) +GetOclAddrQual(const HSAILKernel::Argument* arg) { - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_UAV: - case PTR_MT_GLOBAL: + if (arg->type_ == HSAIL_ARGTYPE_POINTER) { + switch (arg->addrQual_) { + case HSAIL_ADDRESS_GLOBAL: return CL_KERNEL_ARG_ADDRESS_GLOBAL; - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: + case HSAIL_ADDRESS_CONSTANT: return CL_KERNEL_ARG_ADDRESS_CONSTANT; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: + case HSAIL_ADDRESS_LOCAL: return CL_KERNEL_ARG_ADDRESS_LOCAL; default: return CL_KERNEL_ARG_ADDRESS_PRIVATE; } } - else if (argInfo->type == ARG_TYPE_IMAGE) { + else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { return CL_KERNEL_ARG_ADDRESS_GLOBAL; } //default for all other cases @@ -288,15 +352,15 @@ GetOclAddrQual(const aclArgData* argInfo) } inline static cl_kernel_arg_access_qualifier -GetOclAccessQual(const aclArgData* argInfo) +GetOclAccessQual(const HSAILKernel::Argument* arg) { - if (argInfo->type == ARG_TYPE_IMAGE) { - switch (argInfo->arg.image.type) { - case ACCESS_TYPE_RO: + if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { + switch (arg->access_) { + case HSAIL_ACCESS_TYPE_RO: return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case ACCESS_TYPE_WO: + case HSAIL_ACCESS_TYPE_WO: return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case ACCESS_TYPE_RW: + case HSAIL_ACCESS_TYPE_RW: return CL_KERNEL_ARG_ACCESS_READ_WRITE; default: return CL_KERNEL_ARG_ACCESS_NONE; @@ -335,42 +399,6 @@ GetOclTypeQual(const aclArgData* argInfo) return rv; } -static int -GetOclSize(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: return sizeof(void *); - case ARG_TYPE_VALUE: - //! \note OCL 6.1.5. 
For 3-component vector data types, - //! the size of the data type is 4 * sizeof(component). - switch (argInfo->arg.value.data) { - case DATATYPE_struct: - return 1 * argInfo->arg.value.numElements; - case DATATYPE_i8: - case DATATYPE_u8: - return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: - return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); - case DATATYPE_ERROR: - default: return -1; - } - case ARG_TYPE_IMAGE: return sizeof(cl_mem); - case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); - case ARG_TYPE_QUEUE: return sizeof(cl_command_queue); - default: return -1; - } -} - bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) { @@ -428,13 +456,14 @@ HSAILKernel::initArgList(const aclArgData* aclArg) amd::KernelParameterDescriptor desc; size_t offset = 0; - // Reserved arguments for HSAIL launch - aclArg += MaxExtraArgumentsNum; for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { + // skip the hidden arguments + if (arguments_[i]->index_ == uint(-1)) continue; + desc.name_ = arguments_[i]->name_.c_str(); - desc.type_ = GetOclType(aclArg); - desc.addressQualifier_ = GetOclAddrQual(aclArg); - desc.accessQualifier_ = GetOclAccessQual(aclArg); + desc.type_ = GetOclType(arguments_[i]); + desc.addressQualifier_ = GetOclAddrQual(arguments_[i]); + desc.accessQualifier_ = GetOclAccessQual(arguments_[i]); desc.typeQualifier_ = GetOclTypeQual(aclArg); desc.typeName_ = arguments_[i]->typeName_.c_str(); @@ -443,7 +472,7 @@ HSAILKernel::initArgList(const aclArgData* aclArg) desc.size_ = 0; } else { - desc.size_ = GetOclSize(aclArg); + desc.size_ = arguments_[i]->size_; } // Make offset alignment to match CPU metadata, 
since @@ -473,29 +502,21 @@ HSAILKernel::initArgList(const aclArgData* aclArg) void HSAILKernel::initHsailArgs(const aclArgData* aclArg) { - int offset = 0; - - // Reserved arguments for HSAIL launch - aclArg += MaxExtraArgumentsNum; - // Iterate through the each kernel argument for (; aclArg->struct_size != 0; aclArg++) { Argument* arg = new Argument; + // Initialize HSAIL kernel argument arg->name_ = aclArg->argStr; arg->typeName_ = aclArg->typeStr; arg->size_ = GetHSAILArgSize(aclArg); - arg->offset_ = offset; arg->type_ = GetHSAILArgType(aclArg); arg->addrQual_ = GetHSAILAddrQual(aclArg); arg->dataType_ = GetHSAILDataType(aclArg); - // If vector of args we add additional arguments to flatten it out - arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) && - (aclArg->arg.value.data != DATATYPE_struct)) ? - aclArg->arg.value.numElements : 1; arg->alignment_ = GetHSAILArgAlignment(aclArg); arg->access_ = GetHSAILArgAccessType(aclArg); - offset += GetHSAILArgSize(aclArg); + arg->pointeeAlignment_ = GetHSAILArgPointeeAlignment(aclArg); + arguments_.push_back(arg); } } @@ -568,8 +589,7 @@ HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, - std::string compileOptions, - uint extraArgsNum) + std::string compileOptions) : device::Kernel(name) , compileOptions_(compileOptions) , dev_(prog->dev()) @@ -577,7 +597,6 @@ HSAILKernel::HSAILKernel(std::string name, , index_(0) , code_(0) , codeSize_(0) - , extraArgumentsNum_(extraArgsNum) , waveLimiter_(this, (prog->isNull() ? 
1 : dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) * dev().hwInfo()->simdPerCU_) { @@ -944,137 +963,160 @@ HSAILKernel::loadArguments( address aqlStruct = gpu.cb(1)->sysMemCopy(); bool srdResource = false; - if (extraArgumentsNum_ > 0) { - assert(MaxExtraArgumentsNum >= 6 && "MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly"); - size_t extraArgs[MaxExtraArgumentsNum] = { 0, 0, 0, 0, 0, 0 }; - // The HLC generates up to 3 additional arguments for the global offsets - for (uint i = 0; i < sizes.dimensions(); ++i) { - extraArgs[i] = sizes.offset()[i]; - } - // Check if the kernel may have printf output - if ((printfInfo().size() > 0) && - // and printf buffer was allocated - (gpu.printfDbgHSA().dbgBuffer() != nullptr)) { - // and set the fourth argument as the printf_buffer pointer - extraArgs[3] = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); - memList.push_back(gpu.printfDbgHSA().dbgBuffer()); - } - if (dynamicParallelism()) { - // Provide the host parent AQL wrap object to the kernel - AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); - memset(wrap, 0, sizeof(AmdAqlWrap)); - wrap->state = AQL_WRAP_BUSY; - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(sizeof(AmdAqlWrap)); - *vmParentWrap = cb->vmAddress() + cb->wrtOffset(); - // and set 5th & 6th arguments - extraArgs[4] = vmDefQueue; - extraArgs[5] = *vmParentWrap; - memList.push_back(cb); - } - WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t)*extraArgumentsNum_, sizeof(size_t)); + if (dynamicParallelism()) { + // Provide the host parent AQL wrap object to the kernel + AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); + memset(wrap, 0, sizeof(AmdAqlWrap)); + wrap->state = AQL_WRAP_BUSY; + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(sizeof(AmdAqlWrap)); + *vmParentWrap = cb->vmAddress() + cb->wrtOffset(); + memList.push_back(cb); } const amd::KernelSignature& signature = kernel.signature(); const amd::KernelParameters& 
kernelParams = kernel.parameters(); // Find all parameters for the current kernel - for (uint i = 0; i != signature.numParameters(); ++i) { - const HSAILKernel::Argument* arg = argument(i); - const amd::KernelParameterDescriptor& desc = signature.at(i); - const_address paramaddr = parameters + desc.offset_; - - switch (arg->type_) { - case HSAIL_ARGTYPE_POINTER: - // If it is a global pointer - if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL - || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) { - - Memory* gpuMem = nullptr; - amd::Memory* mem = nullptr; - - if (kernelParams.boundToSvmPointer(dev(), parameters, i)) { - WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr)); - mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast(paramaddr)); - if (mem != nullptr) { - gpuMem = dev().getGpuMemory(mem); - gpuMem->wait(gpu, WaitOnBusyEngine); - if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { - mem->signalWrite(&dev()); - } - memList.push_back(gpuMem); - } - // If finegrainsystem is present then the pointer can be malloced by the app and - // passed to kernel directly. If so copy the pointer location to aqlArgBuf - else if ((dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) == 0) { - return nullptr; - } - break; - } - if (nativeMem) { - gpuMem = *reinterpret_cast(paramaddr); - if (nullptr != gpuMem) { - mem = gpuMem->owner(); - } - } - else { - mem = *reinterpret_cast(paramaddr); - if (mem != nullptr) { - gpuMem = dev().getGpuMemory(mem); - } - } - if (gpuMem == nullptr) { - WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*)); - break; - } - - //! 64 bit isn't supported with 32 bit binary - uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset(); - WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*)); - - // Wait for resource if it was used on an inactive engine - //! \note syncCache may call DRM transfer - gpuMem->wait(gpu, WaitOnBusyEngine); - - //! 
@todo Compiler has to return read/write attributes - if ((nullptr != mem) && - ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { - mem->signalWrite(&dev()); - } - memList.push_back(gpuMem); - - // save the memory object pointer to allow global memory access - if (nullptr != dev().hwDebugMgr()) { - dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner()); - } + for (auto arg : arguments_) { + // Handle the hidden arguments first, as they do not have a + // matching parameter in the OCL signature (not a valid arg->index_) + if (arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X) { + size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0; + assert(arg->size_ == sizeof(offset_x) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_); + continue; + } + else if (arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y) { + size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0; + assert(arg->size_ == sizeof(offset_y) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_); + continue; + } + else if (arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z) { + size_t offset_z = sizes.dimensions() == 3 ? 
sizes.offset()[2] : 0; + assert(arg->size_ == sizeof(offset_z) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_); + continue; + } + else if (arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER) { + uint64_t bufferPtr = 0; + if ((printfInfo().size() > 0) && + // and printf buffer was allocated + (gpu.printfDbgHSA().dbgBuffer() != nullptr)) { + // and set the fourth argument as the printf_buffer pointer + bufferPtr = gpu.printfDbgHSA().dbgBuffer()->vmAddress(); + memList.push_back(gpu.printfDbgHSA().dbgBuffer()); } + assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_); + continue; + } + else if (arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE) { + assert(arg->size_ == sizeof(vmDefQueue) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_); + continue; + } + else if (arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION) { + assert(arg->size_ == sizeof(*vmParentWrap) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_); + continue; + } + else if (arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE) { + void* zero = 0; + assert(arg->size_ <= sizeof(zero) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_); + continue; + } + + assert(arg->index_ != uint(-1) && "not a valid signature index"); + const_address paramaddr = parameters + signature.at(arg->index_).offset_; + + if (arg->type_ == HSAIL_ARGTYPE_POINTER) { // If it is a local pointer - else { - assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) && - "Unsupported address type"); - ldsAddress = amd::alignUp(ldsAddress, arg->alignment_); - WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t)); + if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) { + ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_); + WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_); ldsAddress += *reinterpret_cast(paramaddr); 
+ continue; } - break; - case HSAIL_ARGTYPE_VALUE: - // Special case for structrues - if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) { - // Copy the current structre into CB1 - memcpy(aqlStruct, paramaddr, arg->size_); - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(arg->size_); - // Then use a pointer in aqlArgBuffer to CB1 - uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset(); - WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*)); - memList.push_back(cb); + assert((arg->addrQual_ == HSAIL_ADDRESS_GLOBAL + || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) + && "Unsupported address qualifier"); + + // If it is a global pointer + Memory* gpuMem = nullptr; + amd::Memory* mem = nullptr; + + if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) { + WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr)); + mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast(paramaddr)); + if (mem != nullptr) { + gpuMem = dev().getGpuMemory(mem); + gpuMem->wait(gpu, WaitOnBusyEngine); + if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { + mem->signalWrite(&dev()); + } + memList.push_back(gpuMem); + } + // If finegrainsystem is present then the pointer can be malloced by the app and + // passed to kernel directly. If so copy the pointer location to aqlArgBuf + else if ((dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) == 0) { + return nullptr; + } + break; + } + if (nativeMem) { + gpuMem = *reinterpret_cast(paramaddr); + if (nullptr != gpuMem) { + mem = gpuMem->owner(); + } } else { - WriteAqlArg(&aqlArgBuf, paramaddr, - arg->numElem_ * arg->size_, arg->size_); + mem = *reinterpret_cast(paramaddr); + if (mem != nullptr) { + gpuMem = dev().getGpuMemory(mem); + } } - break; - case HSAIL_ARGTYPE_IMAGE: { + if (gpuMem == nullptr) { + WriteAqlArg(&aqlArgBuf, &gpuMem, arg->size_, arg->alignment_); + continue; + } + + //! 
64 bit isn't supported with 32 bit binary + uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset(); + WriteAqlArg(&aqlArgBuf, &globalAddress, arg->size_, arg->alignment_); + + // Wait for resource if it was used on an inactive engine + //! \note syncCache may call DRM transfer + gpuMem->wait(gpu, WaitOnBusyEngine); + + //! @todo Compiler has to return read/write attributes + if ((nullptr != mem) && + ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { + mem->signalWrite(&dev()); + } + memList.push_back(gpuMem); + + // save the memory object pointer to allow global memory access + if (nullptr != dev().hwDebugMgr()) { + dev().hwDebugMgr()->assignKernelParamMem(arg->index_, gpuMem->owner()); + } + } + else if (arg->type_ == HSAIL_ARGTYPE_REFERENCE) { + // Copy the current structure into CB1 + memcpy(aqlStruct, paramaddr, arg->size_); + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(arg->size_); + // Then use a pointer in aqlArgBuffer to CB1 + uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset(); + WriteAqlArg(&aqlArgBuf, &gpuPtr, arg->size_, arg->alignment_); + memList.push_back(cb); + } + else if (arg->type_ == HSAIL_ARGTYPE_VALUE) { + WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_); + } + else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { Image* image = nullptr; amd::Memory* mem = nullptr; if (nativeMem) { @@ -1103,11 +1145,13 @@ HSAILKernel::loadArguments( cb->uploadDataToHw(HsaImageObjectSize); // Then use a pointer in aqlArgBuffer to CB1 uint64_t srd = cb->vmAddress() + cb->wrtOffset(); + assert(arg->size_ == sizeof(srd) && "check the sizes"); WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); memList.push_back(cb); } else { uint64_t srd = image->hwSrd(); + assert(arg->size_ == sizeof(srd) && "check the sizes"); WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); srdResource = true; } @@ -1119,19 +1163,19 @@ HSAILKernel::loadArguments( } memList.push_back(image); - break; } - case HSAIL_ARGTYPE_SAMPLER: { + else if (arg->type_ == 
HSAIL_ARGTYPE_SAMPLER) { const amd::Sampler* sampler = *reinterpret_cast(paramaddr); const Sampler* gpuSampler = static_cast (sampler->getDeviceSampler(dev())); uint64_t srd = gpuSampler->hwSrd(); + assert(arg->size_ == sizeof(srd) && "check the sizes"); WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); srdResource = true; break; } - case HSAIL_ARGTYPE_QUEUE: { + else if (arg->type_ == HSAIL_ARGTYPE_QUEUE) { const amd::DeviceQueue* queue = *reinterpret_cast(paramaddr); VirtualGPU* gpuQueue = static_cast(queue->vDev()); @@ -1146,10 +1190,11 @@ HSAILKernel::loadArguments( } vmQueue = gpu.vQueue()->vmAddress(); } - WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*)); + assert(arg->size_ == sizeof(vmQueue) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue)); break; } - default: + else { LogError(" Unsupported address type "); return nullptr; } @@ -1161,26 +1206,18 @@ HSAILKernel::loadArguments( } #if defined(WITH_LIGHTNING_COMPILER) - //!!!!!FIXME_lmoriche: fix the hidden args - size_t extraArgs[] = { 0, 0, 0, 0 }; - // The HLC generates up to 3 additional arguments for the global offsets - for (uint i = 0; i < sizes.dimensions(); ++i) { - extraArgs[i] = sizes.offset()[i]; - } - WriteAqlArg(&aqlArgBuf, &extraArgs[0], sizeof(size_t)); - WriteAqlArg(&aqlArgBuf, &extraArgs[1], sizeof(size_t)); - WriteAqlArg(&aqlArgBuf, &extraArgs[2], sizeof(size_t)); - WriteAqlArg(&aqlArgBuf, &extraArgs[3], sizeof(size_t)); -#endif // defined(WITH_LIGHTNING_COMPILER) - -#if !defined(WITH_LIGHTNING_COMPILER) + // Check there is no arguments' buffer overflow. We may not use all the + // hidden argument slots. + assert(aqlArgBuf <= (gpu.cb(0)->sysMemCopy() + argsBufferSize())); +#else // !defined(WITH_LIGHTNING_COMPILER) // HSAIL kernarg segment size is rounded up to multiple of 16. 
aqlArgBuf = amd::alignUp(aqlArgBuf, 16); -#endif // !defined(WITH_LIGHTNING_COMPILER) assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) && "Size and the number of arguments don't match!"); +#endif // !defined(WITH_LIGHTNING_COMPILER) hsa_kernel_dispatch_packet_t* hsaDisp = - reinterpret_cast(aqlArgBuf); + reinterpret_cast( + gpu.cb(0)->sysMemCopy() + argsBufferSize()); amd::NDRange local(sizes.local()); const amd::NDRange& global = sizes.global(); @@ -1460,120 +1497,6 @@ GetKernelDataType(const amd::hsa::code::KernelArg::Metadata& lcArg) } } -static inline clk_value_type_t -GetOclType(const HSAILKernel::Argument* arg) -{ - static const clk_value_type_t ClkValueMapType[6][6] = { - { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, - { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, - { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, - { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, - { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, - { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, - }; - - uint sizeType; - uint numElements; - if (arg->type_ == HSAIL_ARGTYPE_POINTER || arg->type_ == HSAIL_ARGTYPE_IMAGE) { - return T_POINTER; - } - else if (arg->type_ == HSAIL_ARGTYPE_VALUE - || arg->type_ == HSAIL_ARGTYPE_REFERENCE) { - switch (arg->dataType_) { - case HSAIL_DATATYPE_S8: - case HSAIL_DATATYPE_U8: - sizeType = 0; - numElements = arg->size_; - break; - case HSAIL_DATATYPE_S16: - case HSAIL_DATATYPE_U16: - sizeType = 1; - numElements = arg->size_ / 2; - break; - case HSAIL_DATATYPE_S32: - case HSAIL_DATATYPE_U32: - sizeType = 2; - numElements = arg->size_ / 4; - break; - case HSAIL_DATATYPE_S64: - case HSAIL_DATATYPE_U64: - sizeType = 3; - numElements = arg->size_ / 8; - break; - case HSAIL_DATATYPE_F16: - sizeType = 4; - numElements = arg->size_ / 2; - break; - case HSAIL_DATATYPE_F32: - sizeType = 4; - numElements = arg->size_ / 4; - break; - case HSAIL_DATATYPE_F64: - sizeType = 5; 
- numElements = arg->size_ / 8; - break; - default: - return T_VOID; - } - - switch (numElements) { - case 1: return ClkValueMapType[sizeType][0]; - case 2: return ClkValueMapType[sizeType][1]; - case 3: return ClkValueMapType[sizeType][2]; - case 4: return ClkValueMapType[sizeType][3]; - case 8: return ClkValueMapType[sizeType][4]; - case 16: return ClkValueMapType[sizeType][5]; - default: return T_VOID; - } - } - else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) { - return T_SAMPLER; - } - else { - return T_VOID; - } -} - -static inline cl_kernel_arg_address_qualifier -GetOclAddrQual(const HSAILKernel::Argument* arg) -{ - if (arg->type_ == HSAIL_ARGTYPE_POINTER) { - switch (arg->addrQual_) { - case HSAIL_ADDRESS_GLOBAL: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - case HSAIL_ADDRESS_CONSTANT: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - case HSAIL_ADDRESS_LOCAL: - return CL_KERNEL_ARG_ADDRESS_LOCAL; - default: - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - } - else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - } - //default for all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; -} - -static inline cl_kernel_arg_access_qualifier -GetOclAccessQual(const HSAILKernel::Argument* arg) -{ - if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { - switch (arg->access_) { - case HSAIL_ACCESS_TYPE_RO: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case HSAIL_ACCESS_TYPE_WO: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case HSAIL_ACCESS_TYPE_RW: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - default: - return CL_KERNEL_ARG_ACCESS_NONE; - } - } - return CL_KERNEL_ARG_ACCESS_NONE; -} - static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const amd::hsa::code::KernelArg::Metadata& lcArg) { @@ -1614,7 +1537,6 @@ LightningKernel::initArgList(const amd::hsa::code::Kernel::Metadata& kernelMD) arg->dataType_ = GetKernelDataType(lcArg); arg->alignment_ = GetKernelArgAlignment(lcArg); arg->access_ = GetKernelArgAccessType(lcArg); - arg->numElem_ = 1; 
arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg); bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp index 581ea3adae..183c9254ee 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp @@ -107,23 +107,17 @@ public: std::string name_; //!< Argument's name std::string typeName_; //!< Argument's type name uint size_; //!< Size in bytes - uint offset_; //!< Argument's offset uint alignment_; //!< Argument's alignment uint pointeeAlignment_; //!< Alignment of the data pointed to HSAIL_ARG_TYPE type_; //!< Type of the argument HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument HSAIL_DATA_TYPE dataType_; //!< The type of data - uint numElem_; //!< Number of elements HSAIL_ACCESS_TYPE access_; //!< Access type for the argument }; - // Max number of possible extra (hidden) kernel arguments - static const uint MaxExtraArgumentsNum = 6; - HSAILKernel(std::string name, HSAILProgram* prog, - std::string compileOptions, - uint extraArgsNum); + std::string compileOptions); virtual ~HSAILKernel(); @@ -134,11 +128,15 @@ public: //! Returns true if memory is valid for execution virtual bool validateMemory(uint idx, amd::Memory* amdMem) const; - //! Returns a pointer to the hsail argument - const Argument* argument(size_t i) const { return arguments_[i]; } + //! Returns the kernel argument list + const std::vector& arguments() const { return arguments_; } - //! Returns the number of hsail arguments - size_t numArguments() const { return arguments_.size(); } + //! Returns a pointer to the hsail argument at the specified index + Argument* argumentAt(size_t index) const { + for (auto arg : arguments_) if (arg->index_ == index) return arg; + assert(!"Should not reach here"); + return NULL; + } //! 
Returns GPU device object, associated with this kernel const Device& dev() const; @@ -195,15 +193,13 @@ public: std::vector& memList //!< Memory list for GSL/VidMM handles ) const; + //! Returns pritnf info array const std::vector& printfInfo() const { return printf_; } //! Returns the kernel index in the program uint index() const { return index_; } - //! Returns kernel's extra argument count - uint extraArgumentsNum() const { return extraArgumentsNum_; } - //! Get profiling callback object virtual amd::ProfilingCallback* getProfilingCallback( const device::VirtualDevice *vdev) { @@ -252,8 +248,6 @@ protected: uint64_t code_; //!< GPU memory pointer to the kernel size_t codeSize_; //!< Size of ISA code - uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments - union Flags { struct { uint imageEna_: 1; //!< Kernel uses images @@ -275,7 +269,7 @@ public: LightningKernel(const std::string& name, HSAILProgram* prog, const std::string& compileOptions - ): HSAILKernel(name, prog, compileOptions, 0) + ): HSAILKernel(name, prog, compileOptions) {} //! 
Returns Lightning program associated with this kernel diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 06bbe56ab7..f92ba673cc 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -2122,21 +2122,31 @@ VirtualGPU::submitKernelInternal( gpuDefQueue->virtualQueue_->vmAddress(); address argum = gpuDefQueue->virtualQueue_->data() + offsArg; print << "Kernel: " << child->name() << "\n"; - static const char* Names[HSAILKernel::MaxExtraArgumentsNum] = { - "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "}; - for (j = 0; j < child->extraArgumentsNum(); ++j) { - print << "\t" << Names[j] << *(size_t*)argum; - print << "\n"; - argum += sizeof(size_t); - } - for (j = 0; j < child->numArguments(); ++j) { - print << "\t" << child->argument(j)->name_ << ": "; - for (int s = child->argument(j)->size_ - 1; s >= 0; --s) { + for (auto arg : child->arguments()) { + const char* extraArgName = nullptr; + switch (arg->type_) { + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: extraArgName = "Offset0: "; break; + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: extraArgName = "Offset1: "; break; + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: extraArgName = "Offset2: "; break; + case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: extraArgName = "PrintfBuf: "; break; + case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE: extraArgName = "VqueuePtr: "; break; + case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION: extraArgName = "AqlWrap: "; break; + case HSAIL_ARGTYPE_HIDDEN_NONE: extraArgName = "Unknown: "; break; + default: break; + } + if (extraArgName) { + print << "\t" << extraArgName << *(size_t*)argum; + print << "\n"; + argum += sizeof(size_t); + continue; + } + print << "\t" << arg->name_ << ": "; + for (int s = arg->size_ - 1; s >= 0; --s) { print.width(2); print.fill('0'); print << (uint32_t)(argum[s]); } - argum += child->argument(j)->size_; 
+ argum += arg->size_; print << "\n"; } printf("%s", print.str().c_str()); @@ -3171,7 +3181,7 @@ VirtualGPU::processMemObjectsHSA( // Check all parameters for the current kernel for (size_t i = 0; i < signature.numParameters(); ++i) { const amd::KernelParameterDescriptor& desc = signature.at(i); - const HSAILKernel::Argument* arg = hsaKernel.argument(i); + const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i); Memory* memory = nullptr; bool readOnly = false; amd::Memory* svmMem = nullptr;