P4 to Git Change 1067573 by skudchad@skudchad_opencl_win_2 on 2014/08/18 16:38:03

ECR #304775 - Refactor code to do line by line copies for read\write Rect. This avoids taking the blit copy path which may be even slower.

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/5567/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#108 edit


[ROCm/clr commit: a5e788c9f8]
Este commit está contenido en:
foreman
2014-08-18 16:46:45 -04:00
padre 41985a7688
commit a669c5ab36
+35 -32
Ver fichero
@@ -613,7 +613,41 @@ DmaBlitManager::copyBufferRect(
size_t srcOffset;
size_t dstOffset;
if (!dev().settings().rectLinearDMA_) {
bool subWindowRectCopy = dev().settings().rectLinearDMA_;
// 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF;
srcOffset = srcRect.offset(0, 0, 0);
dstOffset = dstRect.offset(0, 0, 0);
if (subWindowRectCopy &&
(((srcOffset % 4) != 0) ||
((dstOffset % 4) != 0) ||
((size[0] % 4) != 0) ||
((srcRect.rowPitch_ % 4) != 0) ||
((srcRect.slicePitch_ % 4) != 0) ||
((dstRect.rowPitch_ % 4) != 0) ||
((dstRect.slicePitch_ % 4) != 0) ||
(srcRect.rowPitch_ > pitchLimit) ||
(dstRect.rowPitch_ > pitchLimit) ||
(size[0] > 0x3fff) || // 14 bits limit in HW
(size[1] > 0x3fff) || // 14 bits limit in HW
(size[2] > 0x7ff))) { // 11 bits limit in HW
// Restriction with rectLinearDRMDMA packet
subWindowRectCopy = false;
}
if (subWindowRectCopy) {
// Copy data with subwindow copy packet
if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
size, gpuMem(dstMemory), true)) {
LogError("copyBufferRect failed!");
return false;
}
}
else {
for (size_t z = 0; z < size[2]; ++z) {
for (size_t y = 0; y < size[1]; ++y) {
srcOffset = srcRect.offset(0, y, z);
@@ -632,37 +666,6 @@ DmaBlitManager::copyBufferRect(
}
}
}
else {
srcOffset = srcRect.offset(0, 0, 0);
dstOffset = dstRect.offset(0, 0, 0);
// 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF;
if (((srcOffset % 4) != 0) ||
((dstOffset % 4) != 0) ||
((size[0] % 4) != 0) ||
((srcRect.rowPitch_ % 4) != 0) ||
((srcRect.slicePitch_ % 4) != 0) ||
((dstRect.rowPitch_ % 4) != 0) ||
((dstRect.slicePitch_ % 4) != 0) ||
(srcRect.rowPitch_ > pitchLimit) ||
(dstRect.rowPitch_ > pitchLimit) ||
(size[0] > 0x3fff) || // 14 bits limit in HW
(size[1] > 0x3fff) || // 14 bits limit in HW
(size[2] > 0x7ff)) { // 11 bits limit in HW
// Restriction with rectLinearDRMDMA packet
return false;
}
// Copy data
if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
size, gpuMem(dstMemory), true)) {
LogError("copyBufferRect failed!");
return false;
}
}
}
return true;
}