From b672b6c4dabd1ff89f7202b99397fef290ec1011 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 16 Sep 2014 14:43:17 -0400
Subject: [PATCH] P4 to Git Change 1077444 by gandryey@gera-dev-w7 on
2014/09/16 14:31:35
ECR #304775 - Add capability to enable large allocations >4GB
- Update the blit kernels to consider a buffer size >4GB
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/blitcl.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#110 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#280 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsablit.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#214 edit
---
rocclr/runtime/device/blitcl.cpp | 138 +++++++++++-----------
rocclr/runtime/device/gpu/gpublit.cpp | 81 ++++++-------
rocclr/runtime/device/gpu/gpusettings.cpp | 7 +-
rocclr/runtime/device/hsa/hsablit.cpp | 64 +++++-----
rocclr/runtime/utils/flags.hpp | 2 +
5 files changed, 145 insertions(+), 147 deletions(-)
diff --git a/rocclr/runtime/device/blitcl.cpp b/rocclr/runtime/device/blitcl.cpp
index e68fa3d986..4a79a0e597 100644
--- a/rocclr/runtime/device/blitcl.cpp
+++ b/rocclr/runtime/device/blitcl.cpp
@@ -11,13 +11,13 @@ const char* BlitSourceCode = BLIT_KERNELS(
__kernel void copyBufferToImage(
__global uint* src,
__write_only image2d_array_t dst,
- int4 srcOrigin,
+ ulong4 srcOrigin,
int4 dstOrigin,
int4 size,
- int4 format,
- int4 pitch)
+ uint4 format,
+ ulong4 pitch)
{
- uint idxSrc;
+ ulong idxSrc;
int4 coordsDst;
uint4 pixel;
__global uint* srcUInt = src;
@@ -49,61 +49,55 @@ __kernel void copyBufferToImage(
switch (format.x) {
case 1:
// Check size
- switch (format.y) {
- case 1:
+ if (format.y == 1) {
pixel.x = (uint)srcUChar[idxSrc];
- break;
- case 2:
+ }
+ else if (format.y == 2) {
pixel.x = (uint)srcUShort[idxSrc];
- break;
- case 4:
+ }
+ else {
pixel.x = srcUInt[idxSrc];
- break;
}
break;
case 2:
// Check size
- switch (format.y) {
- case 1:
+ if (format.y == 1) {
tmpUShort = srcUShort[idxSrc];
pixel.x = (uint)(tmpUShort & 0xff);
pixel.y = (uint)(tmpUShort >> 8);
- break;
- case 2:
+ }
+ else if (format.y == 2) {
tmpUInt = srcUInt[idxSrc];
pixel.x = (tmpUInt & 0xffff);
pixel.y = (tmpUInt >> 16);
- break;
- case 4:
+ }
+ else {
pixel.x = srcUInt[idxSrc++];
pixel.y = srcUInt[idxSrc];
- break;
}
break;
case 4:
// Check size
- switch (format.y) {
- case 1:
+ if (format.y == 1) {
tmpUInt = srcUInt[idxSrc];
pixel.x = tmpUInt & 0xff;
pixel.y = (tmpUInt >> 8) & 0xff;
pixel.z = (tmpUInt >> 16) & 0xff;
pixel.w = (tmpUInt >> 24) & 0xff;
- break;
- case 2:
+ }
+ else if (format.y == 2) {
tmpUInt = srcUInt[idxSrc++];
pixel.x = tmpUInt & 0xffff;
pixel.y = (tmpUInt >> 16);
tmpUInt = srcUInt[idxSrc];
pixel.z = tmpUInt & 0xffff;
pixel.w = (tmpUInt >> 16);
- break;
- case 4:
+ }
+ else {
pixel.x = srcUInt[idxSrc++];
pixel.y = srcUInt[idxSrc++];
pixel.z = srcUInt[idxSrc++];
pixel.w = srcUInt[idxSrc];
- break;
}
break;
}
@@ -117,12 +111,12 @@ __kernel void copyImageToBuffer(
__global ushort* dstUShort,
__global uchar* dstUChar,
int4 srcOrigin,
- int4 dstOrigin,
+ ulong4 dstOrigin,
int4 size,
- int4 format,
- int4 pitch)
+ uint4 format,
+ ulong4 pitch)
{
- uint idxDst;
+ ulong idxDst;
int4 coordsSrc;
uint4 texel;
@@ -205,12 +199,12 @@ __kernel void copyImageToBuffer(
__kernel void copyImage(
__read_only image2d_array_t src,
__write_only image2d_array_t dst,
- int4 srcOrigin,
- int4 dstOrigin,
- int4 size)
+ int4 srcOrigin,
+ int4 dstOrigin,
+ int4 size)
{
- int4 coordsDst;
- int4 coordsSrc;
+ int4 coordsDst;
+ int4 coordsSrc;
coordsDst.x = get_global_id(0);
coordsDst.y = get_global_id(1);
@@ -234,12 +228,12 @@ __kernel void copyImage(
__kernel void copyImage1DA(
__read_only image2d_array_t src,
__write_only image2d_array_t dst,
- int4 srcOrigin,
- int4 dstOrigin,
- int4 size)
+ int4 srcOrigin,
+ int4 dstOrigin,
+ int4 size)
{
- int4 coordsDst;
- int4 coordsSrc;
+ int4 coordsDst;
+ int4 coordsSrc;
coordsDst.x = get_global_id(0);
coordsDst.y = get_global_id(1);
@@ -271,13 +265,13 @@ __kernel void copyImage1DA(
__kernel void copyBufferRect(
__global uchar* src,
__global uchar* dst,
- uint4 srcRect,
- uint4 dstRect,
- uint4 size)
+ ulong4 srcRect,
+ ulong4 dstRect,
+ ulong4 size)
{
- uint x = (uint)get_global_id(0);
- uint y = (uint)get_global_id(1);
- uint z = (uint)get_global_id(2);
+ ulong x = get_global_id(0);
+ ulong y = get_global_id(1);
+ ulong z = get_global_id(2);
if ((x >= size.x) ||
(y >= size.y) ||
@@ -285,8 +279,8 @@ __kernel void copyBufferRect(
return;
}
- uint offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
- uint offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;
+ ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
+ ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;
dst[offsDst] = src[offsSrc];
}
@@ -294,13 +288,13 @@ __kernel void copyBufferRect(
__kernel void copyBufferRectAligned(
__global uint* src,
__global uint* dst,
- uint4 srcRect,
- uint4 dstRect,
- uint4 size)
+ ulong4 srcRect,
+ ulong4 dstRect,
+ ulong4 size)
{
- uint x = (uint)get_global_id(0);
- uint y = (uint)get_global_id(1);
- uint z = (uint)get_global_id(2);
+ ulong x = get_global_id(0);
+ ulong y = get_global_id(1);
+ ulong z = get_global_id(2);
if ((x >= size.x) ||
(y >= size.y) ||
@@ -308,8 +302,8 @@ __kernel void copyBufferRectAligned(
return;
}
- uint offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
- uint offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;
+ ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
+ ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;
if (size.w == 16) {
__global uint4* src4 = (__global uint4*)src;
@@ -324,18 +318,18 @@ __kernel void copyBufferRectAligned(
__kernel void copyBuffer(
__global uchar* src,
__global uchar* dst,
- int srcOrigin,
- int dstOrigin,
- uint size)
+ ulong srcOrigin,
+ ulong dstOrigin,
+ ulong size)
{
- uint id = (uint)get_global_id(0);
+ ulong id = get_global_id(0);
if (id >= size) {
return;
}
- uint offsSrc = id + srcOrigin;
- uint offsDst = id + dstOrigin;
+ ulong offsSrc = id + srcOrigin;
+ ulong offsDst = id + dstOrigin;
dst[offsDst] = src[offsSrc];
}
@@ -343,19 +337,19 @@ __kernel void copyBuffer(
__kernel void copyBufferAligned(
__global uint* src,
__global uint* dst,
- int srcOrigin,
- int dstOrigin,
- uint size,
- uint alignment)
+ ulong srcOrigin,
+ ulong dstOrigin,
+ ulong size,
+ uint alignment)
{
- uint id = (uint)get_global_id(0);
+ ulong id = get_global_id(0);
if (id >= size) {
return;
}
- uint offsSrc = id + srcOrigin;
- uint offsDst = id + dstOrigin;
+ ulong offsSrc = id + srcOrigin;
+ ulong offsDst = id + dstOrigin;
if (alignment == 16) {
__global uint4* src4 = (__global uint4*)src;
@@ -371,11 +365,11 @@ __kernel void fillBuffer(
__global uchar* bufUChar,
__global uint* bufUInt,
__constant uchar* pattern,
- uint patternSize,
- uint offset,
- uint size)
+ uint patternSize,
+ ulong offset,
+ ulong size)
{
- uint id = (uint)get_global_id(0);
+ ulong id = get_global_id(0);
if (id >= size) {
return;
diff --git a/rocclr/runtime/device/gpu/gpublit.cpp b/rocclr/runtime/device/gpu/gpublit.cpp
index 32e1c9e5a7..a5a8a6b8bb 100644
--- a/rocclr/runtime/device/gpu/gpublit.cpp
+++ b/rocclr/runtime/device/gpu/gpublit.cpp
@@ -1140,7 +1140,7 @@ KernelBlitManager::copyBufferToImage(
void
CalcRowSlicePitches(
- cl_int* pitch, const cl_int* copySize,
+ cl_ulong* pitch, const cl_int* copySize,
size_t rowPitch, size_t slicePitch, const Memory& mem)
{
size_t memFmtSize = memoryFormatSize(mem.cal()->format_).size_;
@@ -1317,17 +1317,17 @@ KernelBlitManager::copyBufferToImageKernel(
granularity = 4;
}
CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
- cl_int srcOrg[4] = { (cl_int)srcOrigin[0] / granularity,
- (cl_int)srcOrigin[1],
- (cl_int)srcOrigin[2], 0 };
+ cl_ulong srcOrg[4] = { srcOrigin[0] / granularity,
+ srcOrigin[1],
+ srcOrigin[2], 0 };
setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
- cl_int dstOrg[4] = { (cl_int)dstOrigin[0],
- (cl_int)dstOrigin[1],
- (cl_int)dstOrigin[2], 0 };
- cl_int copySize[4] = { (cl_int)size[0],
- (cl_int)size[1],
- (cl_int)size[2], 0 };
+ cl_int dstOrg[4] = { (cl_int)dstOrigin[0],
+ (cl_int)dstOrigin[1],
+ (cl_int)dstOrigin[2], 0 };
+ cl_int copySize[4] = { (cl_int)size[0],
+ (cl_int)size[1],
+ (cl_int)size[2], 0 };
if (swapLayer) {
dstOrg[2] = dstOrg[1];
dstOrg[1] = 0;
@@ -1340,13 +1340,13 @@ KernelBlitManager::copyBufferToImageKernel(
// Program memory format
uint multiplier = memFmt.size_ / sizeof(uint32_t);
multiplier = (multiplier == 0) ? 1 : multiplier;
- cl_int format[4] = { (cl_int)memFmt.components_,
- (cl_int)memFmt.size_ / (cl_int)memFmt.components_,
- (cl_int)multiplier, 0 };
+ cl_uint format[4] = { memFmt.components_,
+ memFmt.size_ / memFmt.components_,
+ multiplier, 0 };
setArgument(kernels_[blitType], 5, sizeof(format), format);
// Program row and slice pitches
- cl_int pitch[4] = { 0 };
+ cl_ulong pitch[4] = { 0 };
CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory));
setArgument(kernels_[blitType], 6, sizeof(pitch), pitch);
@@ -1661,12 +1661,12 @@ KernelBlitManager::copyImageToBufferKernel(
setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem);
setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem);
- cl_int srcOrg[4] = { (cl_int)srcOrigin[0],
- (cl_int)srcOrigin[1],
- (cl_int)srcOrigin[2], 0 };
- cl_int copySize[4] = { (cl_int)size[0],
- (cl_int)size[1],
- (cl_int)size[2], 0 };
+ cl_int srcOrg[4] = { (cl_int)srcOrigin[0],
+ (cl_int)srcOrigin[1],
+ (cl_int)srcOrigin[2], 0 };
+ cl_int copySize[4] = { (cl_int)size[0],
+ (cl_int)size[1],
+ (cl_int)size[2], 0 };
if (swapLayer) {
srcOrg[2] = srcOrg[1];
srcOrg[1] = 0;
@@ -1685,22 +1685,22 @@ KernelBlitManager::copyImageToBufferKernel(
granularity = 4;
}
CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
- cl_int dstOrg[4] = { (cl_int)dstOrigin[0] / granularity,
- (cl_int)dstOrigin[1],
- (cl_int)dstOrigin[2], 0 };
+ cl_ulong dstOrg[4] = { dstOrigin[0] / granularity,
+ dstOrigin[1],
+ dstOrigin[2], 0 };
setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg);
setArgument(kernels_[blitType], 6, sizeof(copySize), copySize);
// Program memory format
uint multiplier = memFmt.size_ / sizeof(uint32_t);
multiplier = (multiplier == 0) ? 1 : multiplier;
- cl_int format[4] = { (cl_int)memFmt.components_,
- (cl_int)memFmt.size_ / (cl_int)memFmt.components_,
- (cl_int)multiplier, 0 };
+ cl_uint format[4] = { memFmt.components_,
+ memFmt.size_ / memFmt.components_,
+ multiplier, 0 };
setArgument(kernels_[blitType], 7, sizeof(format), format);
// Program row and slice pitches
- cl_int pitch[4] = { 0 };
+ cl_ulong pitch[4] = { 0 };
CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory));
setArgument(kernels_[blitType], 8, sizeof(pitch), pitch);
@@ -2130,18 +2130,15 @@ KernelBlitManager::copyBufferRect(
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
mem = &gpuMem(dstMemory);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
- cl_uint src[4] = { (cl_uint)srcRect.rowPitch_,
- (cl_uint)srcRect.slicePitch_,
- (cl_uint)srcRect.start_, 0 };
+ cl_ulong src[4] = { srcRect.rowPitch_,
+ srcRect.slicePitch_,
+ srcRect.start_, 0 };
setArgument(kernels_[blitType], 2, sizeof(src), src);
- cl_uint dst[4] = { (cl_uint)dstRect.rowPitch_,
- (cl_uint)dstRect.slicePitch_,
- (cl_uint)dstRect.start_, 0 };
+ cl_ulong dst[4] = { dstRect.rowPitch_,
+ dstRect.slicePitch_,
+ dstRect.start_, 0 };
setArgument(kernels_[blitType], 3, sizeof(dst), dst);
- cl_int copySize[4] = { (cl_int)size[0],
- (cl_int)size[1],
- (cl_int)size[2],
- (cl_int)CopyRectAlignment[i] };
+ cl_ulong copySize[4] = { size[0], size[1], size[2], CopyRectAlignment[i] };
setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
// Create ND range object for the kernel's execution
@@ -2426,7 +2423,7 @@ KernelBlitManager::fillBuffer(
else {
uint fillType = FillBuffer;
size_t globalWorkOffset[3] = { 0, 0, 0 };
- cl_int fillSize = size[0] / patternSize;
+ cl_ulong fillSize = size[0] / patternSize;
size_t globalWorkSize = amd::alignUp(fillSize, 256);
size_t localWorkSize = 256;
bool dwordAligned =
@@ -2450,7 +2447,7 @@ KernelBlitManager::fillBuffer(
memcpy(constBuf, pattern, patternSize);
gpuCB->unmap(&gpu());
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB);
- cl_int offset = origin[0];
+ cl_ulong offset = origin[0];
if (dwordAligned) {
patternSize /= sizeof(uint32_t);
offset /= sizeof(uint32_t);
@@ -2528,14 +2525,14 @@ KernelBlitManager::copyBuffer(
mem = &gpuMem(dstMemory);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
// Program source origin
- cl_int srcOffset = srcOrigin[0] / CopyBuffAlignment[i];;
+ cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];;
setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
// Program destinaiton origin
- cl_int dstOffset = dstOrigin[0] / CopyBuffAlignment[i];;
+ cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];;
setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
- cl_int copySize = size[0];
+ cl_ulong copySize = size[0];
setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size);
if (blitType == BlitCopyBufferAligned) {
diff --git a/rocclr/runtime/device/gpu/gpusettings.cpp b/rocclr/runtime/device/gpu/gpusettings.cpp
index ab74023527..cb095cb0b0 100644
--- a/rocclr/runtime/device/gpu/gpusettings.cpp
+++ b/rocclr/runtime/device/gpu/gpusettings.cpp
@@ -320,7 +320,12 @@ Settings::create(
supportDepthsRGB_ = true;
}
if (use64BitPtr_) {
- maxAllocSize_ = 4048 * Mi;
+ if (GPU_ENABLE_LARGE_ALLOCATION) {
+ maxAllocSize_ = 16ULL * Gi;
+ }
+ else {
+ maxAllocSize_ = 4048 * Mi;
+ }
}
else {
maxAllocSize_ = 3ULL * Gi;
diff --git a/rocclr/runtime/device/hsa/hsablit.cpp b/rocclr/runtime/device/hsa/hsablit.cpp
index ddd0e3df35..f270745fea 100644
--- a/rocclr/runtime/device/hsa/hsablit.cpp
+++ b/rocclr/runtime/device/hsa/hsablit.cpp
@@ -566,7 +566,7 @@ HsaBlitManager::importExportImage(
static void
CalcRowSlicePitches(
- cl_int* pitch, const cl_int* copySize,
+ cl_ulong* pitch, const cl_int* copySize,
size_t rowPitch, size_t slicePitch, const Memory& mem)
{
const oclhsa::Image &hsaImage = static_cast< const oclhsa::Image &>(mem);
@@ -1067,14 +1067,14 @@ KernelBlitManager::copyBuffer(
clmem = ((cl_mem) as_cl(dstMemory.owner()));
kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
// Program source origin
- cl_int srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
+ cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
kernels_[blitType]->parameters().set(2, sizeof(srcOffset), &srcOffset);
// Program destinaiton origin
- cl_int dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
+ cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
kernels_[blitType]->parameters().set(3, sizeof(dstOffset), &dstOffset);
- cl_int copySize = size[0];
+ cl_ulong copySize = size[0];
kernels_[blitType]->parameters().set(4, sizeof(copySize), ©Size);
if (blitType == BlitCopyBufferAligned) {
@@ -1187,18 +1187,18 @@ KernelBlitManager::copyBufferRect(
kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
clmem = ((cl_mem) as_cl(dstMemory.owner()));
kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
- cl_uint src[4] = { (cl_uint)srcRect.rowPitch_,
- (cl_uint)srcRect.slicePitch_,
- (cl_uint)srcRect.start_, 0 };
+ cl_ulong src[4] = { srcRect.rowPitch_,
+ srcRect.slicePitch_,
+ srcRect.start_, 0 };
kernels_[blitType]->parameters().set(2, sizeof(src), src);
- cl_uint dst[4] = { (cl_uint)dstRect.rowPitch_,
- (cl_uint)dstRect.slicePitch_,
- (cl_uint)dstRect.start_, 0 };
+ cl_ulong dst[4] = { dstRect.rowPitch_,
+ dstRect.slicePitch_,
+ dstRect.start_, 0 };
kernels_[blitType]->parameters().set(3, sizeof(dst), dst);
- cl_int copySize[4] = { (cl_int)size[0],
- (cl_int)size[1],
- (cl_int)size[2],
- (cl_int)CopyRectAlignment[i] };
+ cl_ulong copySize[4] = { size[0],
+ size[1],
+ size[2],
+ CopyRectAlignment[i] };
kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize);
// Create ND range object for the kernel's execution
@@ -1311,23 +1311,23 @@ KernelBlitManager::copyImageToBuffer(
granularity = 4;
}
CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
- cl_int dstOrg[4] = { (cl_int)dstOrigin[0] / granularity,
- (cl_int)dstOrigin[1],
- (cl_int)dstOrigin[2],
- 0 };
+ cl_ulong dstOrg[4] = { dstOrigin[0] / granularity,
+ dstOrigin[1],
+ dstOrigin[2],
+ 0 };
kernels_[blitType]->parameters().set(5, sizeof(dstOrg), dstOrg);
kernels_[blitType]->parameters().set(6, sizeof(copySize), copySize);
// Program memory format
uint multiplier = elementSize / sizeof(uint32_t);
multiplier = (multiplier == 0) ? 1 : multiplier;
- cl_int format[4] = { (cl_int)numChannels,
- (cl_int)elementSize / (cl_int)numChannels,
- (cl_int)multiplier, 0 };
+ cl_uint format[4] = { (cl_uint)numChannels,
+ (cl_uint)(elementSize / numChannels),
+ multiplier, 0 };
kernels_[blitType]->parameters().set(7, sizeof(format), format);
// Program row and slice pitches
- cl_int pitch[4] = { 0 };
+ cl_ulong pitch[4] = { 0 };
CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, srcImage);
kernels_[blitType]->parameters().set(8, sizeof(pitch), pitch);
@@ -1429,9 +1429,9 @@ KernelBlitManager::copyBufferToImage(
granularity = 4;
}
CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
- cl_int srcOrg[4] = { (cl_int)srcOrigin[0] / granularity,
- (cl_int)srcOrigin[1],
- (cl_int)srcOrigin[2], 0 };
+ cl_ulong srcOrg[4] = { srcOrigin[0] / granularity,
+ srcOrigin[1],
+ srcOrigin[2], 0 };
kernels_[blitType]->parameters().set(2, sizeof(srcOrg), srcOrg);
cl_int dstOrg[4] = { (cl_int)dstOrigin[0],
@@ -1447,13 +1447,13 @@ KernelBlitManager::copyBufferToImage(
// Program memory format
uint multiplier = elementSize / sizeof(uint32_t);
multiplier = (multiplier == 0) ? 1 : multiplier;
- cl_int format[4] = { (cl_int)numChannels,
- (cl_int)elementSize / (cl_int)numChannels,
- (cl_int)multiplier, 0 };
+ cl_uint format[4] = { (cl_uint)numChannels,
+ (cl_uint)(elementSize / numChannels),
+ multiplier, 0 };
kernels_[blitType]->parameters().set(5, sizeof(format), format);
// Program row and slice pitches
- cl_int pitch[4] = { 0 };
+ cl_ulong pitch[4] = { 0 };
CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, dstImage);
kernels_[blitType]->parameters().set(6, sizeof(pitch), pitch);
@@ -1617,7 +1617,7 @@ KernelBlitManager::fillBuffer(
uint fillType = FillBuffer;
size_t globalWorkOffset[3] = { 0, 0, 0 };
- cl_int fillSize = size[0] / patternSize;
+ cl_ulong fillSize = size[0] / patternSize;
size_t globalWorkSize = amd::alignUp(fillSize, 256);
size_t localWorkSize = 256;
bool dwordAligned =
@@ -1650,14 +1650,14 @@ KernelBlitManager::fillBuffer(
cl_mem clmem = ((cl_mem) as_cl(fillMemory));
kernels_[fillType]->parameters().set(2, sizeof(cl_mem), &clmem);
- cl_int offset = origin[0];
+ cl_ulong offset = origin[0];
if (dwordAligned) {
patternSize /= sizeof(uint32_t);
offset /= sizeof(uint32_t);
}
kernels_[fillType]->parameters().set(3, sizeof(cl_uint), &patternSize);
kernels_[fillType]->parameters().set(4, sizeof(offset), &offset);
- kernels_[fillType]->parameters().set(5, sizeof(cl_mem), &fillSize);
+ kernels_[fillType]->parameters().set(5, sizeof(fillSize), &fillSize);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(1,
diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp
index 669cb79705..be94c00834 100644
--- a/rocclr/runtime/utils/flags.hpp
+++ b/rocclr/runtime/utils/flags.hpp
@@ -152,6 +152,8 @@ release(bool, GPU_DIRECT_SRD, false, \
"Use indirect SRD access in HSAIL") \
release(bool, GPU_USE_DEVICE_QUEUE, false, \
"Use a dedicated device queue for the actual submissions") \
+release(bool, GPU_ENABLE_LARGE_ALLOCATION, false, \
+ "Enable >4GB single allocations") \
release(bool, AMD_DEPTH_MSAA_INTEROP, false, \
"Enable depth stencil and MSAA buffer interop") \
release(bool, AMD_THREAD_TRACE_ENABLE, false, \