P4 to Git Change 1069927 by skudchad@skudchad_test_win_opencl2 on 2014/08/25 14:51:55

ECR #304775 - Optimization for rectangular copies (Part 2). Due to the HW restriction of 14 bits for the src and dst pitch, it's advantageous to choose the optimal bpp: the higher the bpp, the larger the byte pitch. This indirectly helps to reduce the number of packets for a buffer copy (line by line vs. a single sub_win raw packet).

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/5605/diff/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#109 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#191 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.hpp#76 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#64 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#38 edit


[ROCm/clr commit: 5efe63df44]
이 커밋은 다음에 포함됨:
foreman
2014-08-25 15:09:01 -04:00
부모 5ce5daa142
커밋 e51d6f0635
5개의 변경된 파일32개의 추가작업 그리고 17개의 파일을 삭제
+23 -11
파일 보기
@@ -613,24 +613,36 @@ DmaBlitManager::copyBufferRect(
size_t srcOffset;
size_t dstOffset;
uint bytesPerElement = 16;
bool optimalElementSize = false;
bool subWindowRectCopy = dev().settings().rectLinearDMA_;
// 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF;
srcOffset = srcRect.offset(0, 0, 0);
dstOffset = dstRect.offset(0, 0, 0);
while (bytesPerElement >= 1) {
if (((srcOffset % 4) == 0) &&
((dstOffset % 4) == 0) &&
((size[0] % bytesPerElement) == 0) &&
((srcRect.rowPitch_ % bytesPerElement) == 0) &&
((srcRect.slicePitch_ % bytesPerElement) == 0) &&
((dstRect.rowPitch_ % bytesPerElement) == 0) &&
((dstRect.slicePitch_ % bytesPerElement) == 0)) {
optimalElementSize = true;
break;
}
bytesPerElement = bytesPerElement >> 1;
}
// 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF;
size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF;
if (subWindowRectCopy &&
(((srcOffset % 4) != 0) ||
((dstOffset % 4) != 0) ||
((size[0] % 4) != 0) ||
((srcRect.rowPitch_ % 4) != 0) ||
((srcRect.slicePitch_ % 4) != 0) ||
((dstRect.rowPitch_ % 4) != 0) ||
((dstRect.slicePitch_ % 4) != 0) ||
(!optimalElementSize ||
(srcRect.rowPitch_ > pitchLimit) ||
(dstRect.rowPitch_ > pitchLimit) ||
(size[0] > 0x3fff) || // 14 bits limit in HW
(size[0] > sizeLimit) || // See above
(size[1] > 0x3fff) || // 14 bits limit in HW
(size[2] > 0x7ff))) { // 11 bits limit in HW
// Restriction with rectLinearDRMDMA packet
@@ -642,7 +654,7 @@ DmaBlitManager::copyBufferRect(
if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
size, gpuMem(dstMemory), true)) {
size, gpuMem(dstMemory), true, false, bytesPerElement)) {
LogError("copyBufferRect failed!");
return false;
}
+3 -2
파일 보기
@@ -1158,7 +1158,8 @@ Resource::partialMemCopyTo(
const amd::Coord3D& size,
Resource& dstResource,
bool enableCopyRect,
bool flushDMA) const
bool flushDMA,
uint bytesPerElement) const
{
GpuEvent event;
bool result;
@@ -1198,7 +1199,7 @@ Resource::partialMemCopyTo(
result = gpu.copyPartial(event,
gslResource(), calSrcOrigin,
dstResource.gslResource(), calDstOrigin,
calSize, static_cast<CALmemcopyflags>(syncFlags), enableCopyRect);
calSize, static_cast<CALmemcopyflags>(syncFlags), enableCopyRect, bytesPerElement);
if (result) {
// Mark source and destination as busy
+2 -1
파일 보기
@@ -232,7 +232,8 @@ public:
const amd::Coord3D& size, //!< Size of the region to copy
Resource& dstResource, //!< Destination resource
bool enableRectCopy = false, //!< Rectangular DMA support
bool flushDMA = false //!< Flush DMA if requested
bool flushDMA = false, //!< Flush DMA if requested
uint bytesPerElement = 1 //!< Bytes Per Element
) const;
/*! \brief Copies size/4 DWORD of memory to a surface
+3 -2
파일 보기
@@ -513,7 +513,8 @@ CALGSLContext::copyPartial(GpuEvent& event,
size_t* destOffset,
size_t* size,
CALmemcopyflags flags,
bool enableRectCopy)
bool enableRectCopy,
uint32 bytesPerElement)
{
uint64 surfaceSize;
uint32 mode = GSL_SYNCUPLOAD_IGNORE_ELEMENTSIZE;
@@ -563,7 +564,7 @@ CALGSLContext::copyPartial(GpuEvent& event,
}
m_cs->syncUploadRawRect(srcMem, srcOffset[0], (uint32)srcOffset[1], (uint32)srcOffset[2],
destMem, destOffset[0], (uint32)destOffset[1], (uint32)destOffset[2],
size[0], (uint32)size[1], (uint32)size[2], mode);
size[0], (uint32)size[1], (uint32)size[2], mode, bytesPerElement);
}
else
{
+1 -1
파일 보기
@@ -71,7 +71,7 @@ public:
void destroyProgramObject(gslProgramObject func);
bool copyPartial(GpuEvent& event, gslMemObject srcMem, size_t* srcOffset,
gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect);
gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect, uint32 bytesPerElement);
void setSamplerParameter(uint32 sampler, gslTexParameterPname param, CALvoid* vals);