From 93e45cff4efe21efdf628da879265c8a4675d2b5 Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 6 Sep 2016 14:24:40 -0400 Subject: [PATCH] P4 to Git Change 1311278 by jatang@jatang-opencl-hsa-stg2 on 2016/09/06 14:13:56 SWDEV-101315 - Fix PerfCounter not working under CodeXL. 1. Need to map ORCA PerfCounter block to PAL PerfCounter block/instance. 2. CodeXL could try to create PerfCounters that don't exist in HW, so need to handle that and return 0 as the result. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.cpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.hpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#21 edit --- rocclr/runtime/device/pal/palcounters.cpp | 238 +++++++++++++++++++++- rocclr/runtime/device/pal/palcounters.hpp | 27 ++- rocclr/runtime/device/pal/palvirtual.cpp | 7 +- 3 files changed, 253 insertions(+), 19 deletions(-) diff --git a/rocclr/runtime/device/pal/palcounters.cpp b/rocclr/runtime/device/pal/palcounters.cpp index 10e142619f..9a482053ca 100644 --- a/rocclr/runtime/device/pal/palcounters.cpp +++ b/rocclr/runtime/device/pal/palcounters.cpp @@ -4,6 +4,7 @@ #include "device/pal/palcounters.hpp" #include "device/pal/palvirtual.hpp" +#include namespace pal { @@ -60,8 +61,13 @@ PalCounterReference::~PalCounterReference() } } -uint64_t PalCounterReference::result(uint index) +uint64_t PalCounterReference::result(int index) { + if (index < 0) { + // These are counters that have no corresponding PalSample created + return 0; + } + if (layout_ != nullptr) { assert(index <= layout_->sampleCount && "index not in range"); const Pal::GlobalSampleLayout& sample = layout_->samples[index]; @@ -118,6 +124,7 @@ bool PalCounterReference::finalize() Pal::GlobalCounterLayout layout = {}; iPerf()->GetGlobalCounterLayout(&layout); + assert(layout.sampleCount == numExpCounters_); size_t size = sizeof(Pal::GlobalCounterLayout) + (sizeof(Pal::GlobalSampleLayout) * 
(layout.sampleCount - 1)); layout_ = reinterpret_cast(new char[size]); if (layout_ != nullptr) { @@ -131,6 +138,223 @@ bool PalCounterReference::finalize() } } +// Converting from ORCA cmndefs.h to PAL palPerfExperiment.h +static const +std::array, 83> ciBlockIdOrcaToPal = +{{ + {0x0F, 0}, // CB0 + {0x0F, 1}, // CB1 + {0x0F, 2}, // CB2 + {0x0F, 3}, // CB3 + {0x01, 0}, // CPF + {0x0E, 0}, // DB0 + {0x0E, 1}, // DB1 + {0x0E, 2}, // DB2 + {0x0E, 3}, // DB3 + {0x12, 0}, // GRBM + {0x13, 0}, // GRBMSE + {0x04, 0}, // PA_SU + {0x04, 0}, // PA_SC + {0x06, 0}, // SPI + {0x07, 0}, // SQ + {0x07, 0}, // SQ_ES + {0x07, 0}, // SQ_GS + {0x07, 0}, // SQ_VS + {0x07, 0}, // SQ_PS + {0x07, 0}, // SQ_LS + {0x07, 0}, // SQ_HS + {0x07, 0}, // SQ_CS + {0x08, 0}, // SX + {0x09, 0}, // TA0 + {0x09, 1}, // TA1 + {0x09, 2}, // TA2 + {0x09, 3}, // TA3 + {0x09, 4}, // TA4 + {0x09, 5}, // TA5 + {0x09, 6}, // TA6 + {0x09, 7}, // TA7 + {0x09, 8}, // TA8 + {0x09, 9}, // TA9 + {0x09, 0x0a}, // TA10 + {0x0D, 0}, // TCA0 + {0x0D, 1}, // TCA1 + {0x0C, 0}, // TCC0 + {0x0C, 1}, // TCC1 + {0x0C, 2}, // TCC2 + {0x0C, 3}, // TCC3 + {0x0C, 4}, // TCC4 + {0x0C, 5}, // TCC5 + {0x0C, 6}, // TCC6 + {0x0C, 7}, // TCC7 + {0x0C, 8}, // TCC8 + {0x0C, 9}, // TCC9 + {0x0C, 0x0a}, // TCC10 + {0x0C, 0x0b}, // TCC11 + {0x0C, 0x0c}, // TCC12 + {0x0C, 0x0d}, // TCC13 + {0x0C, 0x0e}, // TCC14 + {0x0C, 0x0f}, // TCC15 + {0x0A, 0}, // TD0 + {0x0A, 1}, // TD1 + {0x0A, 2}, // TD2 + {0x0A, 3}, // TD3 + {0x0A, 4}, // TD4 + {0x0A, 5}, // TD5 + {0x0A, 6}, // TD6 + {0x0A, 7}, // TD7 + {0x0A, 8}, // TD8 + {0x0A, 9}, // TD9 + {0x0A, 0x0a}, // TD10 + {0x0B, 0}, // TCP0 + {0x0B, 1}, // TCP1 + {0x0B, 2}, // TCP2 + {0x0B, 3}, // TCP3 + {0x0B, 4}, // TCP4 + {0x0B, 5}, // TCP5 + {0x0B, 6}, // TCP6 + {0x0B, 7}, // TCP7 + {0x0B, 8}, // TCP8 + {0x0B, 9}, // TCP9 + {0x0B, 0x0a}, // TCP10 + {0x10, 0}, // GDS + {0x03, 0}, // VGT + {0x02, 0}, // IA + {0x16, 0}, // MC + {0x11, 0}, // SRBM + {0x1a, 0}, // TCS + {0x19, 0}, // WD + {0x17, 0}, // 
CPG + {0x18, 0}, // CPC +}}; + +static const +std::array, 98> viBlockIdOrcaToPal = +{{ + {0x0F, 0}, // CB0 + {0x0F, 1}, // CB1 + {0x0F, 2}, // CB2 + {0x0F, 3}, // CB3 + {0x01, 0}, // CPF + {0x0E, 0}, // DB0 + {0x0E, 1}, // DB1 + {0x0E, 2}, // DB2 + {0x0E, 3}, // DB3 + {0x12, 0}, // GRBM + {0x13, 0}, // GRBMSE + {0x04, 0}, // PA_SU + {0x04, 0}, // PA_SC + {0x06, 0}, // SPI + {0x07, 0}, // SQ + {0x07, 0}, // SQ_ES + {0x07, 0}, // SQ_GS + {0x07, 0}, // SQ_VS + {0x07, 0}, // SQ_PS + {0x07, 0}, // SQ_LS + {0x07, 0}, // SQ_HS + {0x07, 0}, // SQ_CS + {0x08, 0}, // SX + {0x09, 0}, // TA0 + {0x09, 1}, // TA1 + {0x09, 2}, // TA2 + {0x09, 3}, // TA3 + {0x09, 4}, // TA4 + {0x09, 5}, // TA5 + {0x09, 6}, // TA6 + {0x09, 7}, // TA7 + {0x09, 8}, // TA8 + {0x09, 9}, // TA9 + {0x09, 0x0a}, // TA10 + {0x09, 0x0b}, // TA11 + {0x09, 0x0c}, // TA12 + {0x09, 0x0d}, // TA13 + {0x09, 0x0e}, // TA14 + {0x09, 0x0f}, // TA15 + {0x0D, 0}, // TCA0 + {0x0D, 1}, // TCA1 + {0x0C, 0}, // TCC0 + {0x0C, 1}, // TCC1 + {0x0C, 2}, // TCC2 + {0x0C, 3}, // TCC3 + {0x0C, 4}, // TCC4 + {0x0C, 5}, // TCC5 + {0x0C, 6}, // TCC6 + {0x0C, 7}, // TCC7 + {0x0C, 8}, // TCC8 + {0x0C, 9}, // TCC9 + {0x0C, 0x0a}, // TCC10 + {0x0C, 0x0b}, // TCC11 + {0x0C, 0x0c}, // TCC12 + {0x0C, 0x0d}, // TCC13 + {0x0C, 0x0e}, // TCC14 + {0x0C, 0x0f}, // TCC15 + {0x0A, 0}, // TD0 + {0x0A, 1}, // TD1 + {0x0A, 2}, // TD2 + {0x0A, 3}, // TD3 + {0x0A, 4}, // TD4 + {0x0A, 5}, // TD5 + {0x0A, 6}, // TD6 + {0x0A, 7}, // TD7 + {0x0A, 8}, // TD8 + {0x0A, 9}, // TD9 + {0x0A, 0x0a}, // TD10 + {0x0A, 0x0b}, // TD11 + {0x0A, 0x0c}, // TD12 + {0x0A, 0x0d}, // TD13 + {0x0A, 0x0e}, // TD14 + {0x0A, 0x0f}, // TD15 + {0x0B, 0}, // TCP0 + {0x0B, 1}, // TCP1 + {0x0B, 2}, // TCP2 + {0x0B, 3}, // TCP3 + {0x0B, 4}, // TCP4 + {0x0B, 5}, // TCP5 + {0x0B, 6}, // TCP6 + {0x0B, 7}, // TCP7 + {0x0B, 8}, // TCP8 + {0x0B, 9}, // TCP9 + {0x0B, 0x0a}, // TCP10 + {0x0B, 0x0b}, // TCP11 + {0x0B, 0x0c}, // TCP12 + {0x0B, 0x0d}, // TCP13 + {0x0B, 0x0e}, // TCP14 + 
{0x0B, 0x0f}, // TCP15 + {0x10, 0}, // GDS + {0x03, 0}, // VGT + {0x02, 0}, // IA + {0x16, 0}, // MC + {0x11, 0}, // SRBM + {0x19, 0}, // WD + {0x17, 0}, // CPG + {0x18, 0}, // CPC +}}; + +void PerfCounter::convertInfo() +{ + switch (dev().ipLevel()) { + case Pal::GfxIpLevel::GfxIp7: + if (info_.blockIndex_ < ciBlockIdOrcaToPal.size()) { + auto p = ciBlockIdOrcaToPal[info_.blockIndex_]; + info_.blockIndex_ = std::get<0>(p); + info_.counterIndex_ = std::get<1>(p); + } + break; + case Pal::GfxIpLevel::GfxIp8: + if (info_.blockIndex_ < viBlockIdOrcaToPal.size()) { + auto p = viBlockIdOrcaToPal[info_.blockIndex_]; + info_.blockIndex_ = std::get<0>(p); + info_.counterIndex_ = std::get<1>(p); + } + break; + case Pal::GfxIpLevel::GfxIp9: + Unimplemented(); + break; + default: + Unimplemented(); + break; + } +} + PerfCounter::~PerfCounter() { if (palRef_ == nullptr) { @@ -144,7 +368,7 @@ PerfCounter::~PerfCounter() bool PerfCounter::create() { - index_ = palRef_->retain() - 2; + palRef_->retain(); // Initialize the counter Pal::PerfCounterInfo counterInfo = {}; @@ -153,11 +377,15 @@ PerfCounter::create() counterInfo.instance = info_.counterIndex_; counterInfo.eventId = info_.eventIndex_; Pal::Result result = iPerf()->AddCounter(counterInfo); - if (result != Pal::Result::Success) { + if (result == Pal::Result::Success) { + index_ = palRef_->getPalCounterIndex(); + return true; + } + else { + // Get here when there's no HW PerfCounter matching the counterInfo + index_ = -1; return false; } - - return true; } uint64_t diff --git a/rocclr/runtime/device/pal/palcounters.hpp b/rocclr/runtime/device/pal/palcounters.hpp index 14bae0a9a9..c0566421f8 100644 --- a/rocclr/runtime/device/pal/palcounters.hpp +++ b/rocclr/runtime/device/pal/palcounters.hpp @@ -25,7 +25,8 @@ public: , gpu_(gpu) , memory_(nullptr) , cpuAddr_(nullptr) - , layout_(nullptr) {} + , layout_(nullptr) + , numExpCounters_(0) {} //! 
Get PAL counter Pal::IPerfExperiment* iPerf() const { return perfExp_; } @@ -37,7 +38,10 @@ public: bool finalize(); //! Returns the PAL counter results - uint64_t result(uint index); + uint64_t result(int index); + + //! Get the latest Experiment Counter index + uint getPalCounterIndex() { return numExpCounters_++; }; protected: //! Default destructor @@ -50,11 +54,12 @@ private: //! Disable operator= PalCounterReference& operator=(const PalCounterReference&); - VirtualGPU& gpu_; //!< The virtual GPU device object - Pal::IPerfExperiment* perfExp_; //!< PAL performance experiment object - Pal::GlobalCounterLayout* layout_; //!< Layout of the result - Memory* memory_; - void* cpuAddr_; //!< CPU address of memory_ + VirtualGPU& gpu_; //!< The virtual GPU device object + Pal::IPerfExperiment* perfExp_; //!< PAL performance experiment object + Pal::GlobalCounterLayout* layout_; //!< Layout of the result + Memory* memory_; //!< Memory used by PAL performance experiment + void* cpuAddr_; //!< CPU address of memory_ + uint numExpCounters_; //!< Number of Experiment Counter created }; //! Performance counter implementation on GPU @@ -83,6 +88,7 @@ public: info_.blockIndex_ = blockIndex; info_.counterIndex_ = counterIndex; info_.eventIndex_ = eventIndex; + convertInfo(); } //! Destructor for the GPU PerfCounter object @@ -102,7 +108,7 @@ public: //! Returns the virtual GPU device const VirtualGPU& gpu() const { return palRef_->gpu(); } - //! Returns the CAL performance counter descriptor + //! Returns the PAL performance counter descriptor const Info* info() const { return &info_; } //! Returns the Info structure for performance counter @@ -115,10 +121,13 @@ private: //! Disable default operator= PerfCounter& operator=(const PerfCounter&); + //! 
Convert info from ORCA to PAL + void convertInfo(); + const Device& gpuDevice_; //!< The backend device PalCounterReference* palRef_; //!< Reference counter Info info_; //!< The info structure for perfcounter - uint index_; //!< Counter index in the CAL container + int index_; //!< Counter index in the PAL container }; } // namespace pal diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index dab2648aa2..bacd8e88c7 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -2344,19 +2344,16 @@ VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) return; } else if (gpuCounter->create()) { - amdCounter->setDeviceCounter(gpuCounter); newExperiment = true; } else { - LogPrintfError("We failed to allocate a perfcounter in CAL.\ + LogPrintfError("We failed to allocate a perfcounter in PAL.\ Block: %d, counter: #d, event: %d", gpuCounter->info()->blockIndex_, gpuCounter->info()->counterIndex_, gpuCounter->info()->eventIndex_); - delete gpuCounter; - vcmd.setStatus(CL_INVALID_OPERATION); - return; } + amdCounter->setDeviceCounter(gpuCounter); } }