diff --git a/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.cpp b/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.cpp index 095d85338b..1faa04da6b 100644 --- a/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.cpp @@ -102,7 +102,7 @@ namespace code { } template<> - bool Read(std::istream& in, AMDGPU::RuntimeMD::KernelArg::TypeKind& v) { + bool Read(std::istream& in, AMDGPU::RuntimeMD::KernelArg::Kind& v) { return ReadConvert(in, v); } @@ -124,17 +124,26 @@ namespace code { namespace KernelArg { using namespace AMDGPU::RuntimeMD::KernelArg; Metadata::Metadata() - : size(0), align(0), pointeeAlign(0), + : size(0), align(0), pointeeAlign(0), accQual(None), isConst(false), isRestrict(false), isVolatile(false), isPipe(false) {} - static const char* TypeKindToString(TypeKind typeKind) { - switch (typeKind) { - case Value: return "Value"; - case Pointer: return "Pointer"; + static const char* KindToString(Kind kind) { + switch (kind) { + case ByValue: return "ByValue"; + case GlobalBuffer: return "GlobalBuffer"; + case DynamicSharedPointer: return "DynamicSharedPointer"; case Image: return "Image"; case Sampler: return "Sampler"; + case Pipe: return "Pipe"; case Queue: return "Queue"; + case HiddenGlobalOffsetX: return "HiddenGlobalOffsetX"; + case HiddenGlobalOffsetY: return "HiddenGlobalOffsetY"; + case HiddenGlobalOffsetZ: return "HiddenGlobalOffsetZ"; + case HiddenPrintfBuffer: return "HiddenPrintfBuffer"; + case HiddenDefaultQueue: return "HiddenDefaultQueue"; + case HiddenCompletionAction: return "HiddenCompletionAction"; + case HiddenNone: return "HiddenNone"; default: return ""; } } @@ -175,7 +184,7 @@ namespace code { case KeyArgAlign: return Read(in, align); case KeyArgTypeName: return Read(in, typeName); case KeyArgName: return Read(in, name); - case KeyArgTypeKind: return Read(in, typeKind); + case KeyArgKind: return Read(in, kind); case KeyArgValueType: return Read(in, valueType); case KeyArgPointeeAlign: return Read(in, pointeeAlign); case KeyArgAddrQual: return Read(in, addrQual); @@ -191,23 +200,29 @@ namespace code { void Metadata::Print(std::ostream& out) { out - << "Type: " << TypeKindToString(typeKind); - if (typeKind == Value) { + << "Kind: " << KindToString(kind); + if (kind == ByValue) { out << " ValueType:" << ValueTypeToString(valueType); } if (isConst) { out << " Const"; } if (isRestrict) { out << " Restrict"; } if (isVolatile) { out << " Volatile"; } if (isPipe) { out << " Pipe"; } - + if (kind == Image || kind == Pipe) { + out << " Access: " << AccessQualToString(accQual); + } out - << " Access: " << AccessQualToString(accQual) << " Address: " << (unsigned) addrQual << " Size: " << size - << " Align: " << align - << " Type Name: " << typeName; + << " Align: " << align; + if (kind == DynamicSharedPointer) { + out << " Pointee Align: " << pointeeAlign; + } + if (!typeName.empty()) { + out << " Type Name: \"" << typeName << "\""; + } if (!name.empty()) { - out << " Name: " << name; + out << " Name: \"" << name << "\""; } } @@ -221,11 +236,11 @@ namespace code { hasWorkgroupSizeHint(false), hasVectorTypeHint(false), hasKernelIndex(false), - hasSGPRs(false), hasVGPRs(false), hasMinWavesPerSIMD(false), hasMaxWavesPerSIMD(false), hasFlatWorkgroupSizeLimits(false), hasMaxWorkgroupSize(false), - isNoPartialWorkgroups(false) + isNoPartialWorkgroups(false), + hasPrintfInfo(false) {} void Metadata::SetCommon(uint8_t mdVersion, uint8_t mdRevision, @@ -260,7 +275,7 @@ namespace code { case KeyArgAlign: case KeyArgTypeName: case KeyArgName: - case KeyArgTypeKind: + case KeyArgKind: case KeyArgValueType: case KeyArgPointeeAlign: case KeyArgAddrQual: @@ -284,12 +299,6 @@ namespace code { case KeyKernelIndex: hasKernelIndex = true; return Read(in, kernelIndex); - case KeySGPRs: - hasSGPRs = true; - return Read(in, numSgprs); - case KeyVGPRs: - hasVGPRs = true; - return Read(in, numVgprs); case KeyMinWavesPerSIMD: hasMinWavesPerSIMD = true; return Read(in, minWavesPerSimd); @@ -306,6 +315,10 @@ namespace code { return Read3(in, maxWorkgroupSize); case KeyNoPartialWorkGroups: isNoPartialWorkgroups = true; + return true; + case KeyPrintfInfo: + hasPrintfInfo = true; + return Read(in, printfInfo); default: return false; } @@ -345,12 +358,6 @@ namespace code { if (hasKernelIndex) { out << " Kernel iIndex: " << kernelIndex << std::endl; } - if (hasSGPRs) { - out << " SGPRs: " << numSgprs << std::endl; - } - if (hasVGPRs) { - out << " VGPRs: " << numVgprs << std::endl; - } if (hasMinWavesPerSIMD) { out << " Min waves per SIMD: " << minWavesPerSimd << std::endl; } @@ -364,6 +371,9 @@ namespace code { if (isNoPartialWorkgroups) { out << " No partial workgroups" << std::endl; } + if (hasPrintfInfo) { + out << " Printf info: " << printfInfo << std::endl; + } out << " Arguments" << std::endl; for (uint32_t i = 0; i < args.size(); ++i) { out << " " << i << ": "; @@ -424,7 +434,7 @@ namespace code { case KeyArgAlign: case KeyArgTypeName: case KeyArgName: - case KeyArgTypeKind: + case KeyArgKind: case KeyArgValueType: case KeyArgPointeeAlign: case KeyArgAddrQual: @@ -437,13 +447,12 @@ namespace code { case KeyWorkGroupSizeHint: case KeyVecTypeHint: case KeyKernelIndex: - case KeySGPRs: - case KeyVGPRs: case KeyMinWavesPerSIMD: case KeyMaxWavesPerSIMD: case KeyFlatWorkGroupSizeLimits: case KeyMaxWorkGroupSize: case KeyNoPartialWorkGroups: + case KeyPrintfInfo: if (!kernel) { return false; } if (!kernel->ReadValue(in, key)) { return false; } break; diff --git a/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.hpp b/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.hpp index 7162706f71..e4add8c5e5 100644 --- a/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.hpp @@ -64,7 +64,7 @@ namespace code { uint32_t pointeeAlign; std::string typeName; std::string name; - AMDGPU::RuntimeMD::KernelArg::TypeKind typeKind; + AMDGPU::RuntimeMD::KernelArg::Kind kind; AMDGPU::RuntimeMD::KernelArg::ValueType valueType; uint8_t addrQual; AMDGPU::RuntimeMD::KernelArg::AccessQualifer accQual; @@ -77,7 +77,7 @@ namespace code { uint32_t PointeeAlign() const { return pointeeAlign; } const std::string& TypeName() const { return typeName; } const std::string& Name() const { return name; } - AMDGPU::RuntimeMD::KernelArg::TypeKind TypeKind() const { return typeKind; } + AMDGPU::RuntimeMD::KernelArg::Kind Kind() const { return kind; } AMDGPU::RuntimeMD::KernelArg::ValueType ValueType() const { return valueType; } uint8_t AddrQual() const { return addrQual; } AMDGPU::RuntimeMD::KernelArg::AccessQualifer AccQual() const { return accQual; } @@ -104,16 +104,17 @@ namespace code { unsigned hasWorkgroupSizeHint : 1; unsigned hasVectorTypeHint : 1; unsigned hasKernelIndex : 1; - unsigned hasSGPRs : 1, hasVGPRs : 1; unsigned hasMinWavesPerSIMD : 1, hasMaxWavesPerSIMD : 1; unsigned hasFlatWorkgroupSizeLimits : 1; unsigned hasMaxWorkgroupSize : 1; unsigned isNoPartialWorkgroups : 1; + unsigned hasPrintfInfo : 1; std::string name; uint32_t requiredWorkgroupSize[3]; uint32_t workgroupSizeHint[3]; std::string vectorTypeHint; + std::string printfInfo; uint32_t kernelIndex; uint32_t numSgprs, numVgprs; @@ -129,12 +130,11 @@ namespace code { bool HasWorkgroupSizeHint() const { return hasWorkgroupSizeHint; } bool HasVecTypeHint() const { return hasVectorTypeHint; } bool HasKernelIndex() const { return hasKernelIndex; } - bool HasSGPRs() const { return hasSGPRs; } - bool HasVGPRs() const { return hasVGPRs; } bool HasMinWavesPerSIMD() const { return hasMinWavesPerSIMD; } bool HasMaxWavesPerSIMD() const { return hasMaxWavesPerSIMD; } bool HasFlatWorkgroupSizeLimits() const { return hasFlatWorkgroupSizeLimits; } bool HasMaxWorkgroupSize() const { return hasMaxWorkgroupSize; } + bool HasPrintfInfo() const { return hasPrintfInfo; } size_t KernelArgCount() const { return args.size(); } const KernelArg::Metadata& GetKernelArgMetadata(size_t index) const; @@ -143,9 +143,8 @@ namespace code { const uint32_t* RequiredWorkgroupSize() const { return hasRequiredWorkgroupSize ? requiredWorkgroupSize : nullptr; } const uint32_t* WorkgroupSizeHint() const { return hasWorkgroupSizeHint ? workgroupSizeHint : nullptr; } const std::string& VecTypeHint() const { return vectorTypeHint; } + const std::string& PrintfInfo() const { return printfInfo; } uint32_t KernelIndex() const { return hasKernelIndex ? kernelIndex : UINT32_MAX; } - uint32_t SGPRS() const { return hasSGPRs ? numSgprs : UINT32_MAX; } - uint32_t VGPRS() const { return hasVGPRs ? numVgprs : UINT32_MAX; } uint32_t MinWavesPerSIMD() const { return hasMinWavesPerSIMD ? minWavesPerSimd : UINT32_MAX; } uint32_t MaxWavesPerSIMD() const { return hasMaxWavesPerSIMD ? maxWavesPerSimd : UINT32_MAX; } uint32_t MinFlatWorkgroupSize() const { return hasFlatWorkgroupSizeLimits ? minFlatWorkgroupSize : UINT32_MAX; } diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp index b97714e978..d2573a1085 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp @@ -19,15 +19,30 @@ namespace roc { static inline ROC_ARG_TYPE GetKernelArgType(const amd::hsa::code::KernelArg::Metadata& lcArg) { - switch (lcArg.TypeKind()) { - case AMDGPU::RuntimeMD::KernelArg::Pointer: + switch (lcArg.Kind()) { + case AMDGPU::RuntimeMD::KernelArg::GlobalBuffer: + case AMDGPU::RuntimeMD::KernelArg::DynamicSharedPointer: return ROC_ARGTYPE_POINTER; - case AMDGPU::RuntimeMD::KernelArg::Value: + case AMDGPU::RuntimeMD::KernelArg::ByValue: return ROC_ARGTYPE_VALUE; case AMDGPU::RuntimeMD::KernelArg::Image: return ROC_ARGTYPE_IMAGE; case AMDGPU::RuntimeMD::KernelArg::Sampler: return ROC_ARGTYPE_SAMPLER; + case AMDGPU::RuntimeMD::KernelArg::HiddenGlobalOffsetX: + return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; + case AMDGPU::RuntimeMD::KernelArg::HiddenGlobalOffsetY: + return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; + case AMDGPU::RuntimeMD::KernelArg::HiddenGlobalOffsetZ: + return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; + case AMDGPU::RuntimeMD::KernelArg::HiddenPrintfBuffer: + return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER; + case AMDGPU::RuntimeMD::KernelArg::HiddenDefaultQueue: + return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE; + case AMDGPU::RuntimeMD::KernelArg::HiddenCompletionAction: + return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION; + case AMDGPU::RuntimeMD::KernelArg::HiddenNone: + return ROC_ARGTYPE_HIDDEN_NONE; default: return ROC_ARGTYPE_ERROR; } @@ -63,7 +78,8 @@ GetKernelArgType(const aclArgData* argInfo) case ARG_TYPE_POINTER: return ROC_ARGTYPE_POINTER; case ARG_TYPE_VALUE: - return ROC_ARGTYPE_VALUE; + return (argInfo->arg.value.data == DATATYPE_struct) + ? ROC_ARGTYPE_REFERENCE : ROC_ARGTYPE_VALUE; case ARG_TYPE_IMAGE: return ROC_ARGTYPE_IMAGE; case ARG_TYPE_SAMPLER: @@ -121,8 +137,7 @@ GetKernelArgAlignment(const aclArgData* argInfo) static inline size_t GetKernelArgPointeeAlignment(const amd::hsa::code::KernelArg::Metadata& lcArg) { - if (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer - && lcArg.AddrQual() == AMDGPU::RuntimeMD::KernelArg::Local) { + if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::DynamicSharedPointer) { uint32_t align = lcArg.PointeeAlign(); if (align == 0) { LogWarning("Missing DynamicSharedPointer alignment"); @@ -147,8 +162,8 @@ GetKernelArgPointeeAlignment(const aclArgData* argInfo) static inline ROC_ACCESS_TYPE GetKernelArgAccessType(const amd::hsa::code::KernelArg::Metadata& lcArg) { - if (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer - || lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Image) { + if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::GlobalBuffer + || lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::Image) { switch (lcArg.AccQual()) { case AMDGPU::RuntimeMD::KernelArg::ReadOnly: return ROC_ACCESS_TYPE_RO; @@ -191,21 +206,21 @@ GetKernelArgAccessType(const aclArgData* argInfo) static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const amd::hsa::code::KernelArg::Metadata& lcArg) { - if (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer) { - switch (lcArg.AddrQual()) { - case AMDGPU::RuntimeMD::KernelArg::Global: - return ROC_ADDRESS_GLOBAL; - case AMDGPU::RuntimeMD::KernelArg::Constant: - return ROC_ADDRESS_CONSTANT; - case AMDGPU::RuntimeMD::KernelArg::Local: - return ROC_ADDRESS_LOCAL; - default: - LogError("Unsupported address type"); - return ROC_ADDRESS_ERROR; - } + if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::DynamicSharedPointer) { + return ROC_ADDRESS_LOCAL; } - else if ((lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Image) || - (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Sampler)) { + else if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::GlobalBuffer) { + if (lcArg.AddrQual() == AMDGPU::RuntimeMD::KernelArg::Global) { + return ROC_ADDRESS_GLOBAL; + } + else if (lcArg.AddrQual() == AMDGPU::RuntimeMD::KernelArg::Constant) { + return ROC_ADDRESS_CONSTANT; + } + LogError("Unsupported address type"); + return ROC_ADDRESS_ERROR; + } + else if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::Image + || lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::Sampler) { return ROC_ADDRESS_GLOBAL; } return ROC_ADDRESS_ERROR; @@ -246,9 +261,7 @@ GetKernelDataType(const amd::hsa::code::KernelArg::Metadata& lcArg) { aclArgDataType dataType; - if ((lcArg.TypeKind() != AMDGPU::RuntimeMD::KernelArg::Pointer) || - (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Value)) - { + if (lcArg.Kind() != AMDGPU::RuntimeMD::KernelArg::ByValue) { return ROC_DATATYPE_ERROR; } @@ -382,7 +395,8 @@ GetOclType(const Kernel::Argument* arg) if (arg->type_ == ROC_ARGTYPE_POINTER || arg->type_ == ROC_ARGTYPE_IMAGE) { return T_POINTER; } - else if (arg->type_ == ROC_ARGTYPE_VALUE) { + else if (arg->type_ == ROC_ARGTYPE_VALUE + || arg->type_ == ROC_ARGTYPE_REFERENCE) { switch (arg->dataType_) { case ROC_DATATYPE_S8: case ROC_DATATYPE_U8: @@ -483,7 +497,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const amd::hsa::code::KernelArg::Metadata& lcArg) { cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer) { + if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::GlobalBuffer + || lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::DynamicSharedPointer) { if (lcArg.IsVolatile()) { rv |= CL_KERNEL_ARG_TYPE_VOLATILE; } @@ -607,7 +622,6 @@ Kernel::initArguments_LC(const amd::hsa::code::Kernel::Metadata& kernelMD) // Initialize HSAIL kernel argument Kernel::Argument* arg = new Kernel::Argument; - arg->index_ = /* lcArg.IsHidden() ? uint(-1) : */ params.size(); arg->name_ = lcArg.Name(); arg->typeName_ = lcArg.TypeName(); arg->size_ = lcArg.Size(); @@ -618,11 +632,20 @@ Kernel::initArguments_LC(const amd::hsa::code::Kernel::Metadata& kernelMD) arg->access_ = GetKernelArgAccessType(lcArg); arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg); + bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X + || arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y + || arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z + || arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER + || arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE + || arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION + || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE; + + arg->index_ = isHidden ? uint(-1) : params.size(); hsailArgList_.push_back(arg); - /*if (lcArg.IsHidden()) { + if (isHidden) { continue; - }*/ + } // Initialize Device kernel parameters amd::KernelParameterDescriptor desc; @@ -650,35 +673,13 @@ Kernel::initArguments_LC(const amd::hsa::code::Kernel::Metadata& kernelMD) // Local memory for CPU size = sizeof(cl_mem); } - offset = (size_t) amd::alignUp(offset, std::min(size, size_t(16))); + offset = (size_t) amd::alignUp(offset, std::min(size, size_t(16))); desc.offset_ = offset; offset += amd::alignUp(size, sizeof(uint32_t)); params.push_back(desc); } - // Push the hidden arguments. These will be generated by LC at some point - static ROC_ARG_TYPE hiddenArgs[] = { - ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X, - ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y, - ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z, - }; - for (auto type : hiddenArgs) { - Kernel::Argument* arg = new Kernel::Argument; - arg->index_ = uint(-1); - arg->name_ = ""; - arg->typeName_ = "size_t"; - arg->size_ = sizeof(size_t); - arg->type_ = type; - arg->addrQual_ = ROC_ADDRESS_ERROR; - arg->dataType_ = ROC_DATATYPE_U64; - arg->alignment_ = arg->size_; - arg->access_ = ROC_ACCESS_TYPE_NONE; - arg->pointeeAlignment_ = 0; - - hsailArgList_.push_back(arg); - } - createSignature(params); } #endif // defined(WITH_LIGHTNING_COMPILER) diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp index 747187fcc6..996a3ca9e7 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp @@ -20,6 +20,7 @@ enum ROC_ARG_TYPE ROC_ARGTYPE_ERROR = 0, ROC_ARGTYPE_POINTER, ROC_ARGTYPE_VALUE, + ROC_ARGTYPE_REFERENCE, ROC_ARGTYPE_IMAGE, ROC_ARGTYPE_SAMPLER, ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X, diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp index 2ac2b9ee00..d5a88d9513 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp @@ -36,6 +36,8 @@ #endif // !defined(WITH_LIGHTNING_COMPILER) #include "utils/bif_section_labels.hpp" +#include "amd_hsa_kernel_code.h" + #include #include #include @@ -1027,11 +1029,10 @@ HSAILProgram::linkImpl_LC(amd::option::Options *options) return false; } - for ( auto &kernelName : kernelNameList ) - { + for (auto &kernelName : kernelNameList) { hsa_executable_symbol_t kernelSymbol; - hsa_executable_get_symbol ( hsaExecutable_, "", kernelName.c_str(), - hsaDevice, 0, &kernelSymbol ); + hsa_executable_get_symbol( + hsaExecutable_, "", kernelName.c_str(), hsaDevice, 0, &kernelSymbol); uint64_t kernelCodeHandle; status = hsa_executable_symbol_get_info( @@ -1093,6 +1094,22 @@ HSAILProgram::linkImpl_LC(amd::option::Options *options) return false; } +#if 0 + for (auto s = elf.nextSymbol(NULL); s != NULL; s = elf.nextSymbol(s)) { + amd::OclElf::SymbolInfo si; + if (!elf.getSymbolInfo(s, &si) + || strcmp(si.sec_name, ".text") != 0 + || strcmp(si.sym_name, kernelName.c_str()) != 0) { + continue; + } + const amd_kernel_code_t* akc = (amd_kernel_code_t*) + ((address) out_exec->Buf().data() + (si.address - si.sec_addr)); + + // FIXME_lmoriche: this is where we could get the SGPRs and VGPRs + break; + } +#endif + Kernel *aKernel = new roc::Kernel( kernelName, this, diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index b5473a5589..f657e24207 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -1497,29 +1497,34 @@ VirtualGPU::submitKernelInternal( // matching parameter in the OCL signature (not a valid arg->index_) if (arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X) { size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0; - argPtr = addArg(argPtr, &offset_x, sizeof(void*)); + assert(arg->size_ == sizeof(offset_x) && "check the sizes"); + argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_); continue; } else if (arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y) { size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0; - argPtr = addArg(argPtr, &offset_y, sizeof(void*)); + assert(arg->size_ == sizeof(offset_y) && "check the sizes"); + argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_); continue; } else if (arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z) { size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0; - argPtr = addArg(argPtr, &offset_z, sizeof(void*)); + assert(arg->size_ == sizeof(offset_z) && "check the sizes"); + argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_); continue; } else if (arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER) { address bufferPtr = printfDbg()->dbgBuffer(); - argPtr = addArg(argPtr, &bufferPtr, sizeof(void*)); + assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); + argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_); continue; } else if (arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE || arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE) { void* zero = 0; - argPtr = addArg(argPtr, &zero, sizeof(void*)); + assert(arg->size_ <= sizeof(zero) && "check the sizes"); + argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_); continue; } @@ -1558,76 +1563,74 @@ VirtualGPU::submitKernelInternal( mem->signalWrite(&dev()); } } - else if (arg->type_ == ROC_ARGTYPE_VALUE) { - if (arg->dataType_ == ROC_DATATYPE_STRUCT) { - void *mem = allocKernArg(arg->size_, arg->alignment_); - if (mem == NULL) { - LogError("Out of memory"); - return false; - } - memcpy(mem, srcArgPtr, arg->size_); - argPtr = addArg(argPtr, &mem, sizeof(void*)); - continue; + else if (arg->type_ == ROC_ARGTYPE_REFERENCE) { + void *mem = allocKernArg(arg->size_, arg->alignment_); + if (mem == NULL) { + LogError("Out of memory"); + return false; } + memcpy(mem, srcArgPtr, arg->size_); + argPtr = addArg(argPtr, &mem, sizeof(void*)); + } + else if (arg->type_ == ROC_ARGTYPE_VALUE) { argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); - srcArgPtr += arg->size_; } else if (arg->type_ == ROC_ARGTYPE_IMAGE) { - amd::Memory* mem = *reinterpret_cast(srcArgPtr); - Image* image = static_cast(mem->getDeviceMemory(dev())); - if (image == NULL) { - LogError("Kernel image argument is not an image object"); - return false; - } + amd::Memory* mem = *reinterpret_cast(srcArgPtr); + Image* image = static_cast(mem->getDeviceMemory(dev())); + if (image == NULL) { + LogError("Kernel image argument is not an image object"); + return false; + } - if (dev().settings().enableImageHandle_) { - const uint64_t image_srd = image->getHsaImageObject().handle; - assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); - argPtr = addArg(argPtr, &image_srd, sizeof(image_srd)); - } - else { - // Image arguments are of size 48 bytes and are aligned to 16 bytes - argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle, - HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT); - } + if (dev().settings().enableImageHandle_) { + const uint64_t image_srd = image->getHsaImageObject().handle; + assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); + argPtr = addArg(argPtr, &image_srd, sizeof(image_srd)); + } + else { + // Image arguments are of size 48 bytes and are aligned to 16 bytes + argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle, + HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT); + } - //! @todo Compiler has to return read/write attributes - const cl_mem_flags flags = mem->getMemFlags(); - if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { - mem->signalWrite(&dev()); - } + //! @todo Compiler has to return read/write attributes + const cl_mem_flags flags = mem->getMemFlags(); + if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { + mem->signalWrite(&dev()); + } } else if (arg->type_ == ROC_ARGTYPE_SAMPLER) { - amd::Sampler* sampler = *reinterpret_cast(srcArgPtr); - if (sampler == NULL) { - LogError("Kernel sampler argument is not an sampler object"); - return false; - } + amd::Sampler* sampler = *reinterpret_cast(srcArgPtr); + if (sampler == NULL) { + LogError("Kernel sampler argument is not an sampler object"); + return false; + } - hsa_ext_sampler_descriptor_t samplerDescriptor; - fillSampleDescriptor(samplerDescriptor, *sampler); + hsa_ext_sampler_descriptor_t samplerDescriptor; + fillSampleDescriptor(samplerDescriptor, *sampler); - hsa_ext_sampler_t hsa_sampler; - hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(), - &samplerDescriptor, &hsa_sampler); - if (status != HSA_STATUS_SUCCESS) { - LogError("Error creating device sampler object!"); - return false; - } + hsa_ext_sampler_t hsa_sampler; + hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(), + &samplerDescriptor, &hsa_sampler); + if (status != HSA_STATUS_SUCCESS) { + LogError("Error creating device sampler object!"); + return false; + } - if (dev().settings().enableImageHandle_) { - uint64_t sampler_srd = hsa_sampler.handle; - argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd)); - samplerList_.push_back(hsa_sampler); - // TODO: destroy sampler. - } - else { - argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT); + if (dev().settings().enableImageHandle_) { + uint64_t sampler_srd = hsa_sampler.handle; + argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd)); + samplerList_.push_back(hsa_sampler); + // TODO: destroy sampler. + } + else { + argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT); - memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE); - argPtr += HSA_SAMPLER_OBJECT_SIZE; - hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler); - } + memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE); + argPtr += HSA_SAMPLER_OBJECT_SIZE; + hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler); + } } }