P4 to Git Change 1069927 by skudchad@skudchad_test_win_opencl2 on 2014/08/25 14:51:55

ECR #304775 - Optimization for rectangular copies (Part 2). Due to the HW restriction of 14 bits for the src and dst pitch, it's advantageous to choose the optimal bpp: the higher the bpp, the larger the byte pitch. This indirectly helps to reduce the number of packets for a buffer copy (line by line vs. a single sub_win raw packet).

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/5605/diff/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#109 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#191 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.hpp#76 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#64 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#38 edit
Этот коммит содержится в:
foreman
2014-08-25 15:09:01 -04:00
родитель 1b3776aabe
Коммит 5efe63df44
5 изменённых файлов: 32 добавлений и 17 удалений
+23 -11
Просмотреть файл
@@ -613,24 +613,36 @@ DmaBlitManager::copyBufferRect(
size_t srcOffset;
size_t dstOffset;
uint bytesPerElement = 16;
bool optimalElementSize = false;
bool subWindowRectCopy = dev().settings().rectLinearDMA_;
// 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF;
srcOffset = srcRect.offset(0, 0, 0);
dstOffset = dstRect.offset(0, 0, 0);
while (bytesPerElement >= 1) {
if (((srcOffset % 4) == 0) &&
((dstOffset % 4) == 0) &&
((size[0] % bytesPerElement) == 0) &&
((srcRect.rowPitch_ % bytesPerElement) == 0) &&
((srcRect.slicePitch_ % bytesPerElement) == 0) &&
((dstRect.rowPitch_ % bytesPerElement) == 0) &&
((dstRect.slicePitch_ % bytesPerElement) == 0)) {
optimalElementSize = true;
break;
}
bytesPerElement = bytesPerElement >> 1;
}
// 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF;
size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF;
if (subWindowRectCopy &&
(((srcOffset % 4) != 0) ||
((dstOffset % 4) != 0) ||
((size[0] % 4) != 0) ||
((srcRect.rowPitch_ % 4) != 0) ||
((srcRect.slicePitch_ % 4) != 0) ||
((dstRect.rowPitch_ % 4) != 0) ||
((dstRect.slicePitch_ % 4) != 0) ||
(!optimalElementSize ||
(srcRect.rowPitch_ > pitchLimit) ||
(dstRect.rowPitch_ > pitchLimit) ||
(size[0] > 0x3fff) || // 14 bits limit in HW
(size[0] > sizeLimit) || // See above
(size[1] > 0x3fff) || // 14 bits limit in HW
(size[2] > 0x7ff))) { // 11 bits limit in HW
// Restriction with rectLinearDRMDMA packet
@@ -642,7 +654,7 @@ DmaBlitManager::copyBufferRect(
if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
size, gpuMem(dstMemory), true)) {
size, gpuMem(dstMemory), true, false, bytesPerElement)) {
LogError("copyBufferRect failed!");
return false;
}
+3 -2
Просмотреть файл
@@ -1158,7 +1158,8 @@ Resource::partialMemCopyTo(
const amd::Coord3D& size,
Resource& dstResource,
bool enableCopyRect,
bool flushDMA) const
bool flushDMA,
uint bytesPerElement) const
{
GpuEvent event;
bool result;
@@ -1198,7 +1199,7 @@ Resource::partialMemCopyTo(
result = gpu.copyPartial(event,
gslResource(), calSrcOrigin,
dstResource.gslResource(), calDstOrigin,
calSize, static_cast<CALmemcopyflags>(syncFlags), enableCopyRect);
calSize, static_cast<CALmemcopyflags>(syncFlags), enableCopyRect, bytesPerElement);
if (result) {
// Mark source and destination as busy
+2 -1
Просмотреть файл
@@ -232,7 +232,8 @@ public:
const amd::Coord3D& size, //!< Size of the region to copy
Resource& dstResource, //!< Destination resource
bool enableRectCopy = false, //!< Rectangular DMA support
bool flushDMA = false //!< Flush DMA if requested
bool flushDMA = false, //!< Flush DMA if requested
uint bytesPerElement = 1 //!< Bytes Per Element
) const;
/*! \brief Copies size/4 DWORD of memory to a surface
+3 -2
Просмотреть файл
@@ -513,7 +513,8 @@ CALGSLContext::copyPartial(GpuEvent& event,
size_t* destOffset,
size_t* size,
CALmemcopyflags flags,
bool enableRectCopy)
bool enableRectCopy,
uint32 bytesPerElement)
{
uint64 surfaceSize;
uint32 mode = GSL_SYNCUPLOAD_IGNORE_ELEMENTSIZE;
@@ -563,7 +564,7 @@ CALGSLContext::copyPartial(GpuEvent& event,
}
m_cs->syncUploadRawRect(srcMem, srcOffset[0], (uint32)srcOffset[1], (uint32)srcOffset[2],
destMem, destOffset[0], (uint32)destOffset[1], (uint32)destOffset[2],
size[0], (uint32)size[1], (uint32)size[2], mode);
size[0], (uint32)size[1], (uint32)size[2], mode, bytesPerElement);
}
else
{
+1 -1
Просмотреть файл
@@ -71,7 +71,7 @@ public:
void destroyProgramObject(gslProgramObject func);
bool copyPartial(GpuEvent& event, gslMemObject srcMem, size_t* srcOffset,
gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect);
gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect, uint32 bytesPerElement);
void setSamplerParameter(uint32 sampler, gslTexParameterPname param, CALvoid* vals);