Files
rocm-systems/rocclr/runtime/device/pal/palthreadtrace.cpp
T
foreman 892bda4554 P4 to Git Change 1293980 by jatang@jatang-opencl-hsa-stg3 on 2016/07/21 10:22:49
SWDEV-95919 - OCL ThreadTrace on PAL.

	The PAL PerfExperiment interface is not compatible with the OCL ThreadTrace extension in terms of Thread Trace buffer handling. PAL PerfExperiment expects one buffer bound to it for all shader engines, while the OCL ThreadTrace extension expects one buffer for each shader engine. To accommodate the difference, we copy from the PAL PerfExperiment buffer to the OCL ThreadTrace extension buffers.

	The next step is to support setting ThreadTrace parameters and getting the actual capture size from the layout.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palthreadtrace.cpp#2 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palthreadtrace.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#18 edit
2016-07-21 10:32:54 -04:00

183 líneas
4.9 KiB
C++

//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/pal/palthreadtrace.hpp"
#include "device/pal/palvirtual.hpp"
namespace pal {
// Builds a PalThreadTraceReference for the given virtual GPU together with
// the PAL performance experiment it wraps.
//
// Returns the new reference object, or nullptr when PAL reports a failure
// for the size query or the experiment creation, or when the object
// allocation itself yields nullptr.
PalThreadTraceReference*
PalThreadTraceReference::Create(VirtualGPU& gpu)
{
    // Describe the experiment: each option is switched on through its flag
    // bit and given its value; the SQ shader mask is set to PerfShaderMaskCs.
    Pal::PerfExperimentCreateInfo expInfo = {};
    expInfo.optionFlags.sampleInternalOperations = 1;
    expInfo.optionValues.sampleInternalOperations = true;
    expInfo.optionFlags.cacheFlushOnCounterCollection = 1;
    expInfo.optionValues.cacheFlushOnCounterCollection = true;
    expInfo.optionFlags.sqShaderMask = 1;
    expInfo.optionValues.sqShaderMask = Pal::PerfShaderMaskCs;

    // Ask PAL how many bytes the experiment object needs
    Pal::Result status;
    size_t expObjectSize = gpu.dev().iDev()->GetPerfExperimentSize(expInfo, &status);
    if (Pal::Result::Success != status) {
        return nullptr;
    }

    // The extra size is forwarded to the class allocation function;
    // presumably it reserves trailing storage for the PAL object, since the
    // address right after the reference object is used as placement memory
    // below -- confirm against the operator new declared in the header.
    PalThreadTraceReference* ref = new (expObjectSize) PalThreadTraceReference(gpu);
    if (nullptr == ref) {
        return nullptr;
    }

    // Construct the PAL experiment in the memory that follows the object
    status = gpu.dev().iDev()->CreatePerfExperiment(expInfo, ref + 1, &ref->perfExp_);
    if (Pal::Result::Success != status) {
        ref->release();
        return nullptr;
    }

    return ref;
}
// Releases the layout storage, the GPU memory object and the PAL
// performance experiment owned by this reference.
PalThreadTraceReference::~PalThreadTraceReference()
{
    // The thread trace object is always associated with a particular queue,
    // so we have to lock just this queue
    amd::ScopedLock lock(gpu_.execution());

    // finalize() obtains the layout storage with `new char[size]` and only
    // reinterprets it as Pal::ThreadTraceLayout. It therefore has to be
    // returned with the array form of delete on the original char type;
    // a scalar `delete` through the layout pointer is undefined behaviour.
    // delete[] on a null pointer is a no-op, so no guard is needed.
    delete[] reinterpret_cast<char*>(layout_);

    delete memory_;

    if (nullptr != iPerf()) {
        iPerf()->Destroy();
    }
}
bool
PalThreadTraceReference::finalize()
{
Pal::Result result;
iPerf()->Finalize();
// Acquire GPU memory for the query from the pool and bind it.
Pal::GpuMemoryRequirements gpuMemReqs = {};
iPerf()->GetGpuMemoryRequirements(&gpuMemReqs);
memory_ = new Memory(gpu().dev(), amd::alignUp(gpuMemReqs.size, gpuMemReqs.alignment));
if (nullptr == memory_) {
return false;
}
if (!memory_->create(Resource::Local)) {
return false;
}
gpu_.queue(gpu_.engineID_).addMemRef(memory_->iMem());
result = iPerf()->BindGpuMemory(memory_->iMem(), 0);
if (result != Pal::Result::Success) {
return false;
}
Pal::ThreadTraceLayout layout = {};
iPerf()->GetThreadTraceLayout(&layout);
size_t size = sizeof(Pal::ThreadTraceLayout) + (sizeof(Pal::ThreadTraceSeLayout) * (layout.traceCount - 1));
layout_ = reinterpret_cast<Pal::ThreadTraceLayout*>(new char[size]);
if (layout_ == nullptr) {
return false;
}
layout_->traceCount = layout.traceCount;
iPerf()->GetThreadTraceLayout(layout_);
return true;
}
// Copies the trace data recorded for one shader engine out of the
// experiment's memory object into a user buffer.
//
// dstMem  - destination buffer; dstMem->size() bytes are copied to offset 0
// seIndex - index into layout_->traces selecting the source data offset
void
PalThreadTraceReference::copyToUserBuffer(Memory* dstMem, uint seIndex)
{
    // Source starts at this trace's data offset inside the experiment memory
    amd::Coord3D traceStart(layout_->traces[seIndex].dataOffset, 0, 0);
    // Destination is written from its beginning
    amd::Coord3D userStart(0, 0, 0);
    // The copy length is the size of the destination buffer
    amd::Coord3D copyExtent(dstMem->size(), 0, 0);

    gpu_.blitMgr().copyBuffer(
        *memory_, *dstMem, traceStart, userStart, copyExtent, true);
}
// Drops this object's reference on the thread trace reference object,
// if one is attached.
ThreadTrace::~ThreadTrace()
{
    if (palRef_ != nullptr) {
        palRef_->release();
    }
}
bool
ThreadTrace::create()
{
palRef_->retain();
size_t se = 0;
for (auto itMemObj = memObj_.begin(); itMemObj != memObj_.end(); ++itMemObj, ++se) {
// Initialize the thread trace
Pal::PerfTraceInfo sqttInfo = {};
sqttInfo.traceType = Pal::PerfTraceType::ThreadTrace;
sqttInfo.instance = se;
sqttInfo.optionFlags.bufferSize = 1;
// PAL requires ThreadTrace buffer aligned to 4KB
sqttInfo.optionValues.bufferSize = amd::alignUp(dev().getGpuMemory(*itMemObj)->size(), (0x1 << 12));
sqttInfo.optionFlags.threadTraceTokenMask = 1;
sqttInfo.optionValues.threadTraceTokenMask = 0x0000ffff;
Pal::Result result = iPerf()->AddTrace(sqttInfo);
if (result != Pal::Result::Success) {
return false;
}
}
return true;
}
void
ThreadTrace::populateUserMemory()
{
uint se = 0;
for (auto itMemObj = memObj_.begin(); itMemObj != memObj_.end(); ++itMemObj, ++se) {
palRef_->copyToUserBuffer(dev().getGpuMemory(*itMemObj), se);
}
}
// Answers a thread trace information query.
//
// infoType - query selector; only CL_THREAD_TRACE_BUFFERS_SIZE is handled
// info     - output array; receives one buffer size per entry in memObj_
// infoSize - compared against numSe_; the query is rejected when smaller
//
// Returns true when the query was answered, false (after logging an error)
// for an unknown infoType or a too-small infoSize.
bool
ThreadTrace::info(uint infoType, uint* info, uint infoSize) const
{
    // Reject anything other than the buffer size query
    if (infoType != CL_THREAD_TRACE_BUFFERS_SIZE) {
        LogError("Wrong ThreadTrace::getInfo parameter");
        return false;
    }

    if (infoSize < numSe_) {
        LogError("The amount of buffers should be equal to the amount of Shader Engines");
        return false;
    }

    // Report the size of each buffer in container order
    uint slot = 0;
    for (const auto& memObj : memObj_) {
        info[slot] = dev().getGpuMemory(memObj)->size();
        ++slot;
    }

    return true;
}
} // namespace pal