namespace amdilUtils {

// Rewrite the `_length(...)` field of every private typeless-UAV declaration
// in an AMDIL kernel.
//
// Every occurrence of
//   dcl_typeless_uav_id(<id>)_stride(<stride>)_length(<old>)_access(private)
// is replaced in place: <id> and <stride> are carried over unchanged and
// <old> becomes `length`. Declarations whose access qualifier is not
// `private`, and all other kernel text, are left untouched.
//
// kernel - AMDIL kernel text, modified in place.
// length - value written into the `_length(...)` field of each match.
void changePrivateUAVLength(std::string& kernel, unsigned length) {
  // The pattern is constant, and compiling a std::regex is far more expensive
  // than running it, so it is compiled once on first use instead of on every
  // call.
  static const std::regex pattern(
      "dcl_typeless_uav_id\\(([[:digit:]]+)\\)_stride"
      "\\(([[:digit:]]+)\\)_length\\([[:digit:]]+\\)_access\\(private\\)");

  // std::to_string is used instead of a stream insertion: a stream picks up
  // the global locale, and a locale with digit grouping would emit something
  // like "1,048,576", which is not a valid IL length.
  const std::string replacement =
      "dcl_typeless_uav_id($1)_stride($2)_length(" + std::to_string(length) +
      ")_access(private)";

  kernel = std::regex_replace(kernel, pattern, replacement);
}

}  // namespace amdilUtils
maxChildUsage) + maxChildUsage = childUsage; + } + totalHwPrivateSize_ = hwPrivateSize_ + maxChildUsage; + return totalHwPrivateSize_; +} + void NullProgram::patchMain(std::string& kernel, uint index) { @@ -540,7 +556,6 @@ NullProgram::linkImpl(amd::option::Options* options) // Accumulate all emulated local and private sizes, // necessary for the kernel execution initData.localSize_ += func->localSize_; - initData.privateSize_ += func->privateSize_; // Accumulate all HW local and private sizes, // necessary for the kernel execution @@ -548,6 +563,9 @@ NullProgram::linkImpl(amd::option::Options* options) initData.hwPrivateSize_ += func->hwPrivateSize_; initData.flags_ |= func->flags_; } + initData.privateSize_ = baseFunc->totalHwPrivateUsage(); + amdilUtils::changePrivateUAVLength(kernel, + initData.privateSize_); // Create a GPU kernel bool created; @@ -913,7 +931,6 @@ NullProgram::linkImpl(const std::vector& inputPrograms, // Accumulate all emulated local and private sizes, // necessary for the kernel execution initData.localSize_ += func->localSize_; - initData.privateSize_ += func->privateSize_; // Accumulate all HW local and private sizes, // necessary for the kernel execution @@ -921,6 +938,9 @@ NullProgram::linkImpl(const std::vector& inputPrograms, initData.hwPrivateSize_ += func->hwPrivateSize_; initData.flags_ |= func->flags_; } + initData.privateSize_ = baseFunc->totalHwPrivateUsage(); + amdilUtils::changePrivateUAVLength(kernel, + initData.privateSize_); // Create a GPU kernel bool created; diff --git a/rocclr/runtime/device/gpu/gpuprogram.hpp b/rocclr/runtime/device/gpu/gpuprogram.hpp index 6cc151b353..17b9850546 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.hpp +++ b/rocclr/runtime/device/gpu/gpuprogram.hpp @@ -50,6 +50,7 @@ public: , hwPrivateSize_(0) , hwLocalSize_(0) , flags_(0) + , totalHwPrivateSize_(-1) { code_.begin_ = code_.end_ = 0; metadata_.begin_ = metadata_.end_ = 0; @@ -74,6 +75,7 @@ public: hwPrivateSize_ = func.hwPrivateSize_; 
hwLocalSize_ = func.hwLocalSize_; flags_ = func.flags_; + totalHwPrivateSize_ = func.totalHwPrivateSize_; // Note: we don't copy calls_ and macros_ return *this; @@ -89,9 +91,11 @@ public: uint hwPrivateSize_; //!< HW private ring allocation by the function uint hwLocalSize_; //!< HW local ring allocation by the function uint flags_; //!< The IL func flags/properties - - std::vector calls_; //! Functions called from the current + long long totalHwPrivateSize_; //!< total HW private usage including called functions + std::vector calls_; //! Functions called from the current std::vector macros_; //! Macros, used in the IL function + + uint totalHwPrivateUsage(); //!< total HW private usage including called functions }; //! \class empty program