P4 to Git Change 1065597 by gandryey@gera-dev-w7 on 2014/08/12 18:38:45
ECR #304775 - Device enqueuing
- Provide scratch buffer offset for generic address space
- Use a single scratch buffer for all available queues. Each queue will have a unique subbuffer in the global buffer.

Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#454 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#129 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#329 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#120 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#37 edit
Этот коммит содержится в:
@@ -373,6 +373,7 @@ Device::Device()
|
||||
, resourceCache_(NULL)
|
||||
, heapInitComplete_(false)
|
||||
, xferQueue_(NULL)
|
||||
, globalScratchBuf_(NULL)
|
||||
, srdManager_(NULL)
|
||||
{
|
||||
}
|
||||
@@ -389,6 +390,8 @@ Device::~Device()
|
||||
scratch_[s] = NULL;
|
||||
}
|
||||
|
||||
delete globalScratchBuf_;
|
||||
|
||||
// Destroy transfer queue
|
||||
delete xferQueue_;
|
||||
|
||||
@@ -2273,7 +2276,6 @@ Device::ScratchBuffer::destroyMemory()
|
||||
delete memObjs_[i];
|
||||
memObjs_[i] = NULL;
|
||||
}
|
||||
regNum_ = 0;
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -2282,30 +2284,63 @@ Device::allocScratch(uint regNum, const VirtualGPU* vgpu)
|
||||
if (regNum > 0) {
|
||||
// Serialize the scratch buffer allocation code
|
||||
amd::ScopedLock lk(*lockAsyncOps_);
|
||||
uint s = vgpu->hwRing();
|
||||
uint sb = vgpu->hwRing();
|
||||
|
||||
// Check if the current buffer isn't big enough
|
||||
if (regNum > scratch_[s]->regNum_) {
|
||||
if (regNum > scratch_[sb]->regNum_) {
|
||||
// Stall all command queues, since runtime will reallocate memory
|
||||
ScopedLockVgpus lock(*this);
|
||||
std::vector<Memory*>& mems = scratch_[s]->memObjs_;
|
||||
|
||||
// Calculate the size of the new buffer
|
||||
size_t size = calcScratchBufferSize(regNum);
|
||||
scratch_[sb]->regNum_ = regNum;
|
||||
size_t size = 0;
|
||||
uint offset = 0;
|
||||
|
||||
scratch_[s]->destroyMemory();
|
||||
|
||||
// Loop through all memory objects and reallocate them
|
||||
for (uint i = 0; i < mems.size(); ++i) {
|
||||
// Allocate new buffer
|
||||
mems[i] = new gpu::Memory(*this, size);
|
||||
if ((mems[i] == NULL) || !mems[i]->create(Resource::Scratch)) {
|
||||
LogError("Couldn't allocate scratch memory");
|
||||
scratch_[s]->regNum_ = 0;
|
||||
return false;
|
||||
// Destroy all views
|
||||
for (uint s = 0; s < scratch_.size(); ++s) {
|
||||
ScratchBuffer* scratchBuf = scratch_[s];
|
||||
if (scratchBuf->regNum_ > 0) {
|
||||
scratchBuf->destroyMemory();
|
||||
// Calculate the size of the scratch buffer for a queue
|
||||
scratchBuf->size_ = calcScratchBufferSize(scratchBuf->regNum_);
|
||||
scratchBuf->offset_ = offset;
|
||||
size += scratchBuf->size_ * scratchBuf->memObjs_.size();
|
||||
offset += scratchBuf->size_;
|
||||
}
|
||||
}
|
||||
|
||||
delete globalScratchBuf_;
|
||||
|
||||
// Allocate new buffer.
|
||||
globalScratchBuf_ = new gpu::Memory(*this, size);
|
||||
if ((globalScratchBuf_ == NULL) ||
|
||||
!globalScratchBuf_->create(Resource::Scratch)) {
|
||||
LogError("Couldn't allocate scratch memory");
|
||||
for (uint s = 0; s < scratch_.size(); ++s) {
|
||||
scratch_[s]->regNum_ = 0;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
for (uint s = 0; s < scratch_.size(); ++s) {
|
||||
std::vector<Memory*>& mems = scratch_[s]->memObjs_;
|
||||
|
||||
// Loop through all memory objects and reallocate them
|
||||
for (uint i = 0; i < mems.size(); ++i) {
|
||||
if (scratch_[s]->regNum_ > 0) {
|
||||
// Allocate new buffer
|
||||
mems[i] = new gpu::Memory(*this, scratch_[s]->size_);
|
||||
Resource::ViewParams view;
|
||||
view.resource_ = globalScratchBuf_;
|
||||
view.offset_ = scratch_[s]->offset_ + i * scratch_[s]->size_;
|
||||
view.size_ = scratch_[s]->size_;
|
||||
if ((mems[i] == NULL) || !mems[i]->create(Resource::View, &view)) {
|
||||
LogError("Couldn't allocate a scratch view");
|
||||
scratch_[s]->regNum_ = 0;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
scratch_[s]->regNum_ = regNum;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
@@ -2341,8 +2376,13 @@ Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* v
|
||||
void
|
||||
Device::destroyScratchBuffers()
|
||||
{
|
||||
for (uint s = 0; s < scratch_.size(); ++s) {
|
||||
scratch_[s]->destroyMemory();
|
||||
if (globalScratchBuf_ != NULL) {
|
||||
for (uint s = 0; s < scratch_.size(); ++s) {
|
||||
scratch_[s]->destroyMemory();
|
||||
scratch_[s]->regNum_ = 0;
|
||||
}
|
||||
delete globalScratchBuf_;
|
||||
globalScratchBuf_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -297,9 +297,11 @@ public:
|
||||
{
|
||||
uint regNum_; //!< The number of used scratch registers
|
||||
std::vector<Memory*> memObjs_; //!< Memory objects for scratch buffers
|
||||
uint offset_; //!< Offset from the global scratch store
|
||||
uint size_; //!< Scratch buffer size on this queue
|
||||
|
||||
//! Default constructor
|
||||
ScratchBuffer(uint numMems): regNum_(0), memObjs_(numMems) {}
|
||||
ScratchBuffer(uint numMems): regNum_(0), memObjs_(numMems), offset_(0) {}
|
||||
|
||||
//! Destructor
|
||||
~ScratchBuffer();
|
||||
@@ -524,6 +526,9 @@ public:
|
||||
|
||||
const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; }
|
||||
|
||||
//! Returns the global scratch buffer
|
||||
Memory* globalScratchBuf() const { return globalScratchBuf_; };
|
||||
|
||||
//! Destroys scratch buffer memory
|
||||
void destroyScratchBuffers();
|
||||
|
||||
@@ -613,9 +618,10 @@ private:
|
||||
std::vector<amd::Memory*>* mapCache_; //!< Map cache info structure
|
||||
ResourceCache* resourceCache_; //!< CAL resource cache
|
||||
Engines engines_; //!< Available engines on device
|
||||
bool heapInitComplete_; //!< Keep track of initialization status of heap resources
|
||||
bool heapInitComplete_; //!< Keep track of initialization status of heap resources
|
||||
VirtualGPU* xferQueue_; //!< Transfer queue
|
||||
std::vector<ScratchBuffer*> scratch_; //!< Scratch buffers for kernels
|
||||
Memory* globalScratchBuf_; //!< Global scratch buffer
|
||||
SrdManager* srdManager_; //!< SRD manager object
|
||||
|
||||
static AppProfile appProfile_; //!< application profile
|
||||
|
||||
@@ -68,7 +68,7 @@ struct SchedulerParam {
|
||||
uint32_t releaseHostCP; //!< Releases CP on the host queue
|
||||
uint64_t parentAQL; //!< Host parent AmdAqlWrap packet
|
||||
uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue
|
||||
uint32_t reserved; //!< Reserved field
|
||||
uint32_t scratchOffset; //!< Scratch buffer offset
|
||||
};
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
@@ -92,7 +92,7 @@ typedef struct _SchedulerParam {
|
||||
uint releaseHostCP; //!< Releases CP on the host queue
|
||||
ulong parentAQL; //!< Host parent AmdAqlWrap packet
|
||||
uint dedicatedQueue; //!< Scheduler uses a dedicated queue
|
||||
uint reserved; //!< Reserved field
|
||||
uint scratchOffset; //!< Scratch buffer offset
|
||||
} SchedulerParam;
|
||||
|
||||
typedef struct _HwDispatch {
|
||||
@@ -152,7 +152,7 @@ typedef struct _HwDispatch {
|
||||
uint shPrivateHi; // 0x00000000 ---- dstAddressHi
|
||||
uint user4; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
|
||||
uint offsUser4; // 0x00000248 ---- OFFSET
|
||||
uint privOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0
|
||||
uint scratchOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0
|
||||
uint privSize; // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30
|
||||
uint packet4; // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE
|
||||
uint glbSizeX; // 0x00000000
|
||||
@@ -170,10 +170,11 @@ static inline void
|
||||
dispatch(
|
||||
volatile __global HwDispatch* dispatch,
|
||||
__global HsaAqlDispatchPacket* aqlPkt,
|
||||
uint scratchSize,
|
||||
uint numMaxWaves,
|
||||
ulong scratch,
|
||||
ulong hsaQueue)
|
||||
ulong scratch,
|
||||
ulong hsaQueue,
|
||||
uint scratchSize,
|
||||
uint scratchOffset,
|
||||
uint numMaxWaves)
|
||||
{
|
||||
const uint UsrRegOffset = 0x240;
|
||||
const uint Pm4Nop = 0xC0001002;
|
||||
@@ -258,8 +259,9 @@ dispatch(
|
||||
// flatScratchEna = (flags & 0x20);
|
||||
if (flags & 0x20) {
|
||||
dispatch->copyData = Pm4CopyReg;
|
||||
dispatch->scratchAddrLo = (uint)(scratch >> 16);
|
||||
dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16);
|
||||
dispatch->offsUser4 = UsrRegOffset + usrRegCnt;
|
||||
dispatch->scratchOffs = scratchOffset;
|
||||
dispatch->privSize = privateSize;
|
||||
}
|
||||
else {
|
||||
@@ -421,8 +423,8 @@ scheduler(
|
||||
(__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
|
||||
}
|
||||
// Launch child kernel ....
|
||||
dispatch(hwDisp, &disp->aql, param->scratchSize, param->numMaxWaves,
|
||||
param->scratch, param->hsa_queue);
|
||||
dispatch(hwDisp, &disp->aql, param->scratch, param->hsa_queue,
|
||||
param->scratchSize, param->scratchOffset, param->numMaxWaves);
|
||||
disp->state = AQL_WRAP_BUSY;
|
||||
releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
|
||||
disp->wait_num, (__global uint*)queue->event_slot_mask,
|
||||
|
||||
@@ -377,7 +377,6 @@ VirtualGPU::VirtualGPU(
|
||||
, numVmMems_(0)
|
||||
, dmaFlushMgmt_(device)
|
||||
, numGrpCb_(NULL)
|
||||
, scratchRegNum_(0)
|
||||
, hwRing_(0)
|
||||
, readjustTimeGPU_(0)
|
||||
, currTs_(NULL)
|
||||
@@ -645,7 +644,7 @@ VirtualGPU::~VirtualGPU()
|
||||
//!@note OCLtst uses single device with multiple tests
|
||||
//! Release memory only if it's the last command queue.
|
||||
//! The first queue is reserved for the transfers on device
|
||||
if ((scratchRegNum_ > 0) && (gpuDevice_.numOfVgpus_ <= 1)) {
|
||||
if (gpuDevice_.numOfVgpus_ <= 1) {
|
||||
gpuDevice_.destroyScratchBuffers();
|
||||
}
|
||||
|
||||
@@ -1736,12 +1735,14 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
}
|
||||
|
||||
gslMemObject scratch = NULL;
|
||||
uint scratchOffset = 0;
|
||||
// Check if the device allocated more registers than the old setup
|
||||
if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
|
||||
const std::vector<Memory*>& mems = dev().scratch(hwRing())->memObjs_;
|
||||
const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing());
|
||||
const std::vector<Memory*>& mems = scratchObj->memObjs_;
|
||||
scratch = mems[0]->gslResource();
|
||||
memList.push_back(mems[0]);
|
||||
scratchRegNum_ = dev().scratch(hwRing())->regNum_;
|
||||
scratchOffset = scratchObj->offset_;
|
||||
}
|
||||
|
||||
// Add GSL handle to the memory list for VidMM
|
||||
@@ -1752,7 +1753,7 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
GpuEvent gpuEvent;
|
||||
// Run AQL dispatch in HW
|
||||
runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
|
||||
scratch, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress());
|
||||
scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress());
|
||||
|
||||
if (hsaKernel.dynamicParallelism()) {
|
||||
// Make sure of exclusive access to the device queue
|
||||
@@ -1884,12 +1885,14 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
param->scratchSize = scratchBuf->size();
|
||||
param->scratch = scratchBuf->vmAddress();
|
||||
param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
|
||||
param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
|
||||
memList.push_back(scratchBuf);
|
||||
}
|
||||
else {
|
||||
param->numMaxWaves = 0;
|
||||
param->scratchSize = 0;
|
||||
param->scratch = 0;
|
||||
param->scratchOffset = 0;
|
||||
}
|
||||
|
||||
// Add all kernels in the program to the mem list.
|
||||
@@ -2180,7 +2183,6 @@ VirtualGPU::releaseMemory(gslMemObject gslResource, bool wait)
|
||||
for (uint i = 0; i < mems.size(); ++i) {
|
||||
if ((mems[i] != NULL) && (mems[i]->gslResource() == gslResource)) {
|
||||
setScratchBuffer(NULL, i);
|
||||
scratchRegNum_ = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2986,14 +2988,13 @@ VirtualGPU::waitEventLock(CommandBatch* cb)
|
||||
void
|
||||
VirtualGPU::validateScratchBuffer(const Kernel* kernel)
|
||||
{
|
||||
// Check if the device allocated more registers than the old setup
|
||||
if (dev().scratch(hwRing())->regNum_ > scratchRegNum_) {
|
||||
// Check if a scratch buffer is required
|
||||
if (dev().scratch(hwRing())->regNum_ > 0) {
|
||||
const std::vector<Memory*>& mems = dev().scratch(hwRing())->memObjs_;
|
||||
for (uint i = 0; i < mems.size(); ++i) {
|
||||
// Setup scratch buffer
|
||||
setScratchBuffer(mems[i]->gslResource(), i);
|
||||
}
|
||||
scratchRegNum_ = dev().scratch(hwRing())->regNum_;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -519,7 +519,6 @@ private:
|
||||
CommandBatchList cbList_; //!< List of command batches
|
||||
|
||||
ConstBuffer* numGrpCb_; //!< Constant buffer for 8xx workaround
|
||||
uint scratchRegNum_; //!< Number of scratch registers used in this queue
|
||||
uint hwRing_; //!< HW ring used on this virtual device
|
||||
|
||||
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
|
||||
|
||||
@@ -1266,10 +1266,11 @@ CALGSLContext::writeTimer(bool sdma, const gslMemObject mem, uint32 offset) cons
|
||||
|
||||
void
|
||||
CALGSLContext::runAqlDispatch(GpuEvent& event, const void* aqlPacket,
|
||||
const gslMemObject* mems, uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA)
|
||||
const gslMemObject* mems, uint32 numMems, gslMemObject scratch, uint32 scratchOffset,
|
||||
const void* cpuKernelCode, uint64 hsaQueueVA)
|
||||
{
|
||||
eventBegin(MainEngine);
|
||||
m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, cpuKernelCode, hsaQueueVA);
|
||||
m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA);
|
||||
eventEnd(MainEngine, event);
|
||||
}
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ public:
|
||||
bool runProgramGrid(GpuEvent& event, const ProgramGrid* pProgramGrid, const gslMemObject* mems, uint32 numMems);
|
||||
bool runProgramVideoDecode(GpuEvent& event, gslMemObject mo, const CALprogramVideoDecode& decode);
|
||||
void runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems,
|
||||
uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA);
|
||||
uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA);
|
||||
mcaddr virtualQueueDispatcherStart();
|
||||
void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart);
|
||||
void virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
|
||||
|
||||
Ссылка в новой задаче
Block a user