P4 to Git Change 1069927 by skudchad@skudchad_test_win_opencl2 on 2014/08/25 14:51:55
ECR #304775 - Optimization for rectangular copies (Part 2). Due to the HW restriction of 14 bits for the src and dst pitch, it's advantageous to choose the optimal bpp: the higher the bpp, the larger the representable byte pitch. This indirectly helps to reduce the number of packets for a buffer copy (line by line vs. a single sub_win raw packet).
ReviewBoardURL = http://ocltc.amd.com/reviews/r/5605/diff/
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#109 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#191 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.hpp#76 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#64 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#38 edit
[ROCm/clr commit: 5efe63df44]
이 커밋은 다음에 포함됨:
@@ -613,24 +613,36 @@ DmaBlitManager::copyBufferRect(
|
||||
size_t srcOffset;
|
||||
size_t dstOffset;
|
||||
|
||||
uint bytesPerElement = 16;
|
||||
bool optimalElementSize = false;
|
||||
bool subWindowRectCopy = dev().settings().rectLinearDMA_;
|
||||
// 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
|
||||
size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF;
|
||||
|
||||
srcOffset = srcRect.offset(0, 0, 0);
|
||||
dstOffset = dstRect.offset(0, 0, 0);
|
||||
|
||||
while (bytesPerElement >= 1) {
|
||||
if (((srcOffset % 4) == 0) &&
|
||||
((dstOffset % 4) == 0) &&
|
||||
((size[0] % bytesPerElement) == 0) &&
|
||||
((srcRect.rowPitch_ % bytesPerElement) == 0) &&
|
||||
((srcRect.slicePitch_ % bytesPerElement) == 0) &&
|
||||
((dstRect.rowPitch_ % bytesPerElement) == 0) &&
|
||||
((dstRect.slicePitch_ % bytesPerElement) == 0)) {
|
||||
optimalElementSize = true;
|
||||
break;
|
||||
}
|
||||
bytesPerElement = bytesPerElement >> 1;
|
||||
}
|
||||
|
||||
// 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
|
||||
size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF;
|
||||
size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF;
|
||||
|
||||
if (subWindowRectCopy &&
|
||||
(((srcOffset % 4) != 0) ||
|
||||
((dstOffset % 4) != 0) ||
|
||||
((size[0] % 4) != 0) ||
|
||||
((srcRect.rowPitch_ % 4) != 0) ||
|
||||
((srcRect.slicePitch_ % 4) != 0) ||
|
||||
((dstRect.rowPitch_ % 4) != 0) ||
|
||||
((dstRect.slicePitch_ % 4) != 0) ||
|
||||
(!optimalElementSize ||
|
||||
(srcRect.rowPitch_ > pitchLimit) ||
|
||||
(dstRect.rowPitch_ > pitchLimit) ||
|
||||
(size[0] > 0x3fff) || // 14 bits limit in HW
|
||||
(size[0] > sizeLimit) || // See above
|
||||
(size[1] > 0x3fff) || // 14 bits limit in HW
|
||||
(size[2] > 0x7ff))) { // 11 bits limit in HW
|
||||
// Restriction with rectLinearDRMDMA packet
|
||||
@@ -642,7 +654,7 @@ DmaBlitManager::copyBufferRect(
|
||||
if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
|
||||
amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
|
||||
amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
|
||||
size, gpuMem(dstMemory), true)) {
|
||||
size, gpuMem(dstMemory), true, false, bytesPerElement)) {
|
||||
LogError("copyBufferRect failed!");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1158,7 +1158,8 @@ Resource::partialMemCopyTo(
|
||||
const amd::Coord3D& size,
|
||||
Resource& dstResource,
|
||||
bool enableCopyRect,
|
||||
bool flushDMA) const
|
||||
bool flushDMA,
|
||||
uint bytesPerElement) const
|
||||
{
|
||||
GpuEvent event;
|
||||
bool result;
|
||||
@@ -1198,7 +1199,7 @@ Resource::partialMemCopyTo(
|
||||
result = gpu.copyPartial(event,
|
||||
gslResource(), calSrcOrigin,
|
||||
dstResource.gslResource(), calDstOrigin,
|
||||
calSize, static_cast<CALmemcopyflags>(syncFlags), enableCopyRect);
|
||||
calSize, static_cast<CALmemcopyflags>(syncFlags), enableCopyRect, bytesPerElement);
|
||||
|
||||
if (result) {
|
||||
// Mark source and destination as busy
|
||||
|
||||
@@ -232,7 +232,8 @@ public:
|
||||
const amd::Coord3D& size, //!< Size of the region to copy
|
||||
Resource& dstResource, //!< Destination resource
|
||||
bool enableRectCopy = false, //!< Rectangular DMA support
|
||||
bool flushDMA = false //!< Flush DMA if requested
|
||||
bool flushDMA = false, //!< Flush DMA if requested
|
||||
uint bytesPerElement = 1 //!< Bytes Per Element
|
||||
) const;
|
||||
|
||||
/*! \brief Copies size/4 DWORD of memory to a surface
|
||||
|
||||
@@ -513,7 +513,8 @@ CALGSLContext::copyPartial(GpuEvent& event,
|
||||
size_t* destOffset,
|
||||
size_t* size,
|
||||
CALmemcopyflags flags,
|
||||
bool enableRectCopy)
|
||||
bool enableRectCopy,
|
||||
uint32 bytesPerElement)
|
||||
{
|
||||
uint64 surfaceSize;
|
||||
uint32 mode = GSL_SYNCUPLOAD_IGNORE_ELEMENTSIZE;
|
||||
@@ -563,7 +564,7 @@ CALGSLContext::copyPartial(GpuEvent& event,
|
||||
}
|
||||
m_cs->syncUploadRawRect(srcMem, srcOffset[0], (uint32)srcOffset[1], (uint32)srcOffset[2],
|
||||
destMem, destOffset[0], (uint32)destOffset[1], (uint32)destOffset[2],
|
||||
size[0], (uint32)size[1], (uint32)size[2], mode);
|
||||
size[0], (uint32)size[1], (uint32)size[2], mode, bytesPerElement);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -71,7 +71,7 @@ public:
|
||||
void destroyProgramObject(gslProgramObject func);
|
||||
|
||||
bool copyPartial(GpuEvent& event, gslMemObject srcMem, size_t* srcOffset,
|
||||
gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect);
|
||||
gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect, uint32 bytesPerElement);
|
||||
|
||||
void setSamplerParameter(uint32 sampler, gslTexParameterPname param, CALvoid* vals);
|
||||
|
||||
|
||||
새 이슈에서 참조
사용자 차단