From 42f4b2af9745ddf3154b9c19d32ac66f9270446d Mon Sep 17 00:00:00 2001 From: foreman Date: Fri, 12 Sep 2014 15:02:32 -0400 Subject: [PATCH] P4 to Git Change 1076493 by yaxunl@yaxunl_stg_win50 on 2014/09/12 14:45:24 ECR #377625 - AMDIL Function support: Calculate the total private memory usage of a kernel, including memory used by the functions it calls. This cannot be done by IPA: the stack size is known only after register allocation because of potential register spills, yet MachineFunctionAnalysis cannot persist after the CGSCC pass with the current LLVM version. This change adds private memory usage metadata for non-kernel functions. The total private memory usage of a kernel is calculated when the AMDIL is split for the different kernels. The BIF will contain the total private memory size. Affected files ... ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/amdilUtils.cpp#1 add ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/amdilUtils.hpp#1 add ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/AMDIL/AMDILKernelManager.cpp#451 edit ... //depot/stg/opencl/drivers/opencl/compiler/llvm/lib/Target/AMDIL/AMDILKernelManager.h#51 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#175 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.hpp#54 edit --- rocclr/compiler/lib/utils/amdilUtils.cpp | 14 ++++++++++++++ rocclr/compiler/lib/utils/amdilUtils.hpp | 11 +++++++++++ rocclr/runtime/device/gpu/gpuprogram.cpp | 24 ++++++++++++++++++++++-- rocclr/runtime/device/gpu/gpuprogram.hpp | 8 ++++++-- 4 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 rocclr/compiler/lib/utils/amdilUtils.cpp create mode 100644 rocclr/compiler/lib/utils/amdilUtils.hpp diff --git a/rocclr/compiler/lib/utils/amdilUtils.cpp b/rocclr/compiler/lib/utils/amdilUtils.cpp new file mode 100644 index 0000000000..babdd4a5ec --- /dev/null +++ b/rocclr/compiler/lib/utils/amdilUtils.cpp @@ -0,0 +1,14 @@ +#include "amdilUtils.hpp" +#include +#include +#include + +// Change all private uav length in a kernel +void amdilUtils::changePrivateUAVLength(std::string& kernel, unsigned length) { + std::regex pattern("dcl_typeless_uav_id\\(([[:digit:]]+)\\)_stride" + "\\(([[:digit:]]+)\\)_length\\([[:digit:]]+\\)_access\\(private\\)"); + std::stringstream ss; + ss << "dcl_typeless_uav_id($1)_stride($2)_length(" << length << + ")_access(private)"; + kernel = std::regex_replace(kernel, pattern, ss.str()); +} diff --git a/rocclr/compiler/lib/utils/amdilUtils.hpp b/rocclr/compiler/lib/utils/amdilUtils.hpp new file mode 100644 index 0000000000..f28bcc3ee7 --- /dev/null +++ b/rocclr/compiler/lib/utils/amdilUtils.hpp @@ -0,0 +1,11 @@ +#ifndef AMDILUTILS_H_ +#define AMDILUTILS_H_ + +#include + +namespace amdilUtils { +// Change all private uav length in a kernel +void changePrivateUAVLength(std::string& kernel, unsigned length); + +} +#endif /* AMDILUTILS_H_ */ diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp index 896e307d8c..09b3afb24c 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.cpp +++ b/rocclr/runtime/device/gpu/gpuprogram.cpp @@ -5,6 +5,7 @@ #include "os/os.hpp" #include "utils/flags.hpp" #include "include/aclTypes.h" 
+#include "utils/amdilUtils.hpp" #include "utils/bif_section_labels.hpp" #include "device/gpu/gpuprogram.hpp" #include "device/gpu/gpublit.hpp" @@ -232,6 +233,21 @@ NullProgram::isCalled(const ILFunc* base, const ILFunc* func) return false; } +uint +ILFunc::totalHwPrivateUsage() { + if (totalHwPrivateSize_ >= 0) + return totalHwPrivateSize_; + + uint maxChildUsage = 0; + for (size_t i = 0; i < calls_.size(); ++i) { + uint childUsage = calls_[i]->totalHwPrivateUsage(); + if (childUsage > maxChildUsage) + maxChildUsage = childUsage; + } + totalHwPrivateSize_ = hwPrivateSize_ + maxChildUsage; + return totalHwPrivateSize_; +} + void NullProgram::patchMain(std::string& kernel, uint index) { @@ -540,7 +556,6 @@ NullProgram::linkImpl(amd::option::Options* options) // Accumulate all emulated local and private sizes, // necessary for the kernel execution initData.localSize_ += func->localSize_; - initData.privateSize_ += func->privateSize_; // Accumulate all HW local and private sizes, // necessary for the kernel execution @@ -548,6 +563,9 @@ NullProgram::linkImpl(amd::option::Options* options) initData.hwPrivateSize_ += func->hwPrivateSize_; initData.flags_ |= func->flags_; } + initData.privateSize_ = baseFunc->totalHwPrivateUsage(); + amdilUtils::changePrivateUAVLength(kernel, + initData.privateSize_); // Create a GPU kernel bool created; @@ -913,7 +931,6 @@ NullProgram::linkImpl(const std::vector& inputPrograms, // Accumulate all emulated local and private sizes, // necessary for the kernel execution initData.localSize_ += func->localSize_; - initData.privateSize_ += func->privateSize_; // Accumulate all HW local and private sizes, // necessary for the kernel execution @@ -921,6 +938,9 @@ NullProgram::linkImpl(const std::vector& inputPrograms, initData.hwPrivateSize_ += func->hwPrivateSize_; initData.flags_ |= func->flags_; } + initData.privateSize_ = baseFunc->totalHwPrivateUsage(); + amdilUtils::changePrivateUAVLength(kernel, + initData.privateSize_); // Create a GPU 
kernel bool created; diff --git a/rocclr/runtime/device/gpu/gpuprogram.hpp b/rocclr/runtime/device/gpu/gpuprogram.hpp index 6cc151b353..17b9850546 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.hpp +++ b/rocclr/runtime/device/gpu/gpuprogram.hpp @@ -50,6 +50,7 @@ public: , hwPrivateSize_(0) , hwLocalSize_(0) , flags_(0) + , totalHwPrivateSize_(-1) { code_.begin_ = code_.end_ = 0; metadata_.begin_ = metadata_.end_ = 0; @@ -74,6 +75,7 @@ public: hwPrivateSize_ = func.hwPrivateSize_; hwLocalSize_ = func.hwLocalSize_; flags_ = func.flags_; + totalHwPrivateSize_ = func.totalHwPrivateSize_; // Note: we don't copy calls_ and macros_ return *this; @@ -89,9 +91,11 @@ public: uint hwPrivateSize_; //!< HW private ring allocation by the function uint hwLocalSize_; //!< HW local ring allocation by the function uint flags_; //!< The IL func flags/properties - - std::vector calls_; //! Functions called from the current + long long totalHwPrivateSize_; //!< total HW private usage including called functions + std::vector calls_; //! Functions called from the current std::vector macros_; //! Macros, used in the IL function + + uint totalHwPrivateUsage(); //!< total HW private usage including called functions }; //! \class empty program