P4 to Git Change 1200206 by jatang@jatang-opencl-hsa-stg1 on 2015/10/15 10:08:51

SWDEV-56468 - Support wave limiter in HSAIL path. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#302 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#120 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#386 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#6 edit
2015-10-15 10:21:27 -04:00
commit 19ecdf3fd1
@@ -826,7 +826,7 @@ Kernel::create(

    // Wave limiter needs to be initialized after kernel metadata is parsed
    // Since it depends on it.
-    waveLimiter_.enable();
+    waveLimiter_.enable(dev().settings().ciPlus_);

    if (result) {
        buildError_ = CL_SUCCESS;
@@ -844,7 +844,7 @@ Kernel::Kernel(
    const Program&      prog,
    const InitData*     initData)
    : NullKernel(name, gpuDev, prog)
-    , waveLimiter_(this)
+    , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
 {
    hwPrivateSize_ = 0;
    if (NULL != initData) {
@@ -3371,6 +3371,7 @@ HSAILKernel::HSAILKernel(std::string name,
    , codeSize_(0)
    , hwMetaData_(NULL)
    , extraArgumentsNum_(extraArgsNum)
+    , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
 {
    hsa_ = true;
 }
@@ -3517,6 +3518,16 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
    }
    index_ = md.kernel_index;

+    size_t sizeOfWavesPerSimdHint;
+    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
+        RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
+        &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
+    if (error != ACL_SUCCESS) {
+        return false;
+    }
+
+    waveLimiter_.enable(dev().settings().ciPlus_);
+
    return true;
 }

@@ -918,6 +918,17 @@ public:
    //! Returns kernel's extra argument count
    uint extraArgumentsNum() const { return extraArgumentsNum_; }

+    //! Returns the profiling callback that waveLimiter_ provides for the virtual device \a vdev
+    virtual amd::ProfilingCallback* getProfilingCallback(
+        const device::VirtualDevice *vdev){
+        return waveLimiter_.getProfilingCallback(vdev);  // pure delegation to the WaveLimiterManager member
+    }
+
+    //! Returns the waves-per-shader-array value that waveLimiter_ reports for \a vdev.
+    uint getWavesPerSH(const device::VirtualDevice *vdev) const {
+        return waveLimiter_.getWavesPerSH(vdev);  // the HSA submit path forwards this value to AqlDispatch
+    }
+
 private:
    //! Disable copy constructor
    HSAILKernel(const HSAILKernel&);
@@ -968,6 +979,8 @@ private:
        uint    value_;
        Flags(): value_(0) {}
    } flags_;
+
+    WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
 };

 /*@}*/} // namespace gpu
@@ -1791,7 +1791,7 @@ VirtualGPU::submitKernelInternalHSA(
        // Run AQL dispatch in HW
        eventBegin(MainEngine);
        cs()->AqlDispatch(aqlPkt, vmMems(), cal_.memCount_, scratch, scratchOffset,
-            hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
+            hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, hsaKernel.getWavesPerSH(this));
        eventEnd(MainEngine, gpuEvent);

        if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) {
@@ -20,24 +20,21 @@ uint WLAlgorithmSmooth::AbandonThresh;
 uint WLAlgorithmSmooth::DscThresh;

 WaveLimiter::WaveLimiter(
-        Kernel* owner,
+        WaveLimiterManager* manager,
        uint    seqNum,
        bool    enable,
        bool    enableDump):
-        owner_(owner),
-        dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump) {
-    auto gpuDev = static_cast<const Device*>(&owner_->dev());
-    auto attrib = gpuDev->getAttribs();
-    auto hwInfo = gpuDev->hwInfo();
-    setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH,
-            attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
+        manager_(manager),
+        dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
+
+    setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
    MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
    WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
    RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;

    state_ = WARMUP;
    if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
-        traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() +
+        traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() +
            ".txt");
    }

@@ -58,8 +55,8 @@ uint WaveLimiter::getWavesPerSH(){
    return waves_ * SIMDPerSH_;
 }

-WLAlgorithmSmooth::WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump):
-    WaveLimiter(owner, seqNum, enable, enableDump) {
+WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
+    WaveLimiter(manager, seqNum, enable, enableDump) {
    AdaptCount = 2 * MaxWave + 1;
    AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
    DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
@@ -122,7 +119,7 @@ void WLAlgorithmSmooth::outputTrace() {
        return;
    }

-    traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
+    traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
            << " currWaves=" << currWaves_ << " waves=" << waves_
            << " bestWave=" << bestWave_ << '\n';
    output(traceStream_, "\n measure = ", measure_);
@@ -217,8 +214,8 @@ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
    state_.push_back(state);
 }

-WLAlgorithmAvrg::WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump):
-    WaveLimiter(owner, seqNum, enable, enableDump) {
+WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
+    WaveLimiter(manager, seqNum, enable, enableDump) {

    measure_.resize(MaxWave + 1);
    clear(measure_);
@@ -234,7 +231,7 @@ void WLAlgorithmAvrg::outputTrace() {
        return;
    }

-    traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
+    traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
            << " currWaves=" << currWaves_ << " waves=" << waves_
            << " bestWave=" << bestWave_ << '\n';
    output(traceStream_, "\n measure = ", measure_);
@@ -279,17 +276,12 @@ void WLAlgorithmAvrg::callback(ulong duration) {
    }
 }

-WaveLimiterManager::WaveLimiterManager(Kernel* kernel):
+WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH):  // simdPerSH is now computed by the owning kernel's constructor
        owner_(kernel),
        enable_(false),
        enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
-    auto gpuDev = static_cast<const Device*>(&owner_->dev());
-    auto attrib = gpuDev->getAttribs();
-    auto hwInfo = gpuDev->hwInfo();
-    unsigned simdPerSH = 0;
-    setIfNotDefault(simdPerSH, GPU_WAVE_LIMIT_CU_PER_SH,
-            attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
-    fixed_ = GPU_WAVES_PER_SIMD * simdPerSH;
+    setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH);  // NOTE(review): setIfNotDefault is not shown here; presumably the flag wins when overridden, else the simdPerSH argument is used - confirm
+    fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;  // enable() returns early whenever this value is greater than zero
 }

 WaveLimiterManager::~WaveLimiterManager() {
@@ -326,7 +318,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
        return loc->second;
    }

-    auto limiter = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_,
+    auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_,
            enableDump_);
    if (limiter == NULL) {
        enable_ = false;
@@ -336,26 +328,23 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
    return limiter;
 }

-void WaveLimiterManager::enable() {
+void WaveLimiterManager::enable(const bool isCiPlus) {
    if (fixed_ > 0) {
        return;
    }
-    auto gpuDev = static_cast<const Device*>(&owner_->dev());
-    auto hwInfo = gpuDev->hwInfo();
+
    // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
    // Disabled for SI due to bug #10817
-
    if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
        enable_ = GPU_WAVE_LIMIT_ENABLE;
    }
    else {
-        if (gpuDev->settings().ciPlus_) {
+        if (isCiPlus) {
            if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
                enable_ = true;
            }
            else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
-                //Todo:
-                //fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_;
+                fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
            }
        }
    }
@@ -16,12 +16,12 @@
 //! \namespace gpu GPU Device Implementation
 namespace gpu {

-class Kernel;
+class WaveLimiterManager;

 // Adaptively limit the number of waves per SIMD based on kernel execution time
 class WaveLimiter: public amd::ProfilingCallback {
 public:
-    explicit WaveLimiter(Kernel*, uint seqNum, bool enable, bool enableDump);
+    explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
    virtual ~WaveLimiter();

    //! Get waves per shader array to be used for kernel execution.
@@ -57,7 +57,7 @@ protected:
    uint bestWave_;      // Optimal waves per SIMD
    uint countAll_;      // Number of kernel executions
    StateKind state_;
-    Kernel *owner_;
+    WaveLimiterManager* manager_;
    DataDumper dumper_;
    std::ofstream traceStream_;
    uint currWaves_;     // Current waves per SIMD
@@ -88,7 +88,7 @@ protected:

 class WLAlgorithmSmooth: public WaveLimiter {
 public:
-    explicit WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump);
+    explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
    virtual ~WLAlgorithmSmooth();
 private:
    std::vector<uint64_t> reference_;
@@ -117,7 +117,7 @@ private:

 class WLAlgorithmAvrg: public WaveLimiter {
 public:
-    explicit WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump);
+    explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
    virtual ~WLAlgorithmAvrg();
 private:
    //! Call back from Event::recordProfilingInfo to get execution time.
@@ -130,7 +130,7 @@ private:
 // Create wave limiter for each virtual device for a kernel and manages the wave limiters.
 class WaveLimiterManager {
 public:
-    explicit WaveLimiterManager(Kernel* owner);
+    explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH);
    virtual ~WaveLimiterManager();

    //! Get waves per shader array for a specific virtual device.
@@ -140,9 +140,17 @@ public:
    amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *);

    //! Enable wave limiter manager by kernel metadata and flags.
-    void enable();
+    void enable(const bool isCiPlus);
+
+    //! Returns the name of the kernel that owns this manager (forwards to owner_->name())
+    const std::string& name() const { return owner_->name(); }
+
+    //! Returns simdPerSH_, the SIMD-per-shader-array count that the constructor sets up.
+    uint getSimdPerSH() const {return simdPerSH_;}
+
 private:
-    Kernel *owner_;                // The kernel which owns this object
+    device::Kernel *owner_;        // The kernel which owns this object
+    uint simdPerSH_;               // Simd Per SH
    std::unordered_map<const device::VirtualDevice *,
        WaveLimiter*> limiters_;   // Maps virtual device to wave limiter
    bool enable_;                  // Whether the adaptation is enabled