From b672b6c4dabd1ff89f7202b99397fef290ec1011 Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 16 Sep 2014 14:43:17 -0400 Subject: [PATCH] P4 to Git Change 1077444 by gandryey@gera-dev-w7 on 2014/09/16 14:31:35 ECR #304775 - Add capability to enable large allocations >4GB - Update the blit kernels to consider a buffer size >4GB Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/blitcl.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#110 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#280 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsablit.cpp#8 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#214 edit --- rocclr/runtime/device/blitcl.cpp | 138 +++++++++++----------- rocclr/runtime/device/gpu/gpublit.cpp | 81 ++++++------- rocclr/runtime/device/gpu/gpusettings.cpp | 7 +- rocclr/runtime/device/hsa/hsablit.cpp | 64 +++++----- rocclr/runtime/utils/flags.hpp | 2 + 5 files changed, 145 insertions(+), 147 deletions(-) diff --git a/rocclr/runtime/device/blitcl.cpp b/rocclr/runtime/device/blitcl.cpp index e68fa3d986..4a79a0e597 100644 --- a/rocclr/runtime/device/blitcl.cpp +++ b/rocclr/runtime/device/blitcl.cpp @@ -11,13 +11,13 @@ const char* BlitSourceCode = BLIT_KERNELS( __kernel void copyBufferToImage( __global uint* src, __write_only image2d_array_t dst, - int4 srcOrigin, + ulong4 srcOrigin, int4 dstOrigin, int4 size, - int4 format, - int4 pitch) + uint4 format, + ulong4 pitch) { - uint idxSrc; + ulong idxSrc; int4 coordsDst; uint4 pixel; __global uint* srcUInt = src; @@ -49,61 +49,55 @@ __kernel void copyBufferToImage( switch (format.x) { case 1: // Check size - switch (format.y) { - case 1: + if (format.y == 1) { pixel.x = (uint)srcUChar[idxSrc]; - break; - case 2: + } + else if (format.y == 2) { pixel.x = (uint)srcUShort[idxSrc]; - break; - case 4: + } + else { pixel.x = srcUInt[idxSrc]; - break; } break; case 2: // Check size - switch (format.y) { - case 1: + if (format.y == 1) { tmpUShort = srcUShort[idxSrc]; pixel.x = (uint)(tmpUShort & 0xff); pixel.y = (uint)(tmpUShort >> 8); - break; - case 2: + } + else if (format.y == 2) { tmpUInt = srcUInt[idxSrc]; pixel.x = (tmpUInt & 0xffff); pixel.y = (tmpUInt >> 16); - break; - case 4: + } + else { pixel.x = srcUInt[idxSrc++]; pixel.y = srcUInt[idxSrc]; - break; } break; case 4: // Check size - switch (format.y) { - case 1: + if (format.y == 1) { tmpUInt = srcUInt[idxSrc]; pixel.x = tmpUInt & 0xff; pixel.y = (tmpUInt >> 8) & 0xff; pixel.z = (tmpUInt >> 16) & 0xff; pixel.w = (tmpUInt >> 24) & 0xff; - break; - case 2: + } + else if (format.y == 2) { tmpUInt = srcUInt[idxSrc++]; pixel.x = tmpUInt & 0xffff; pixel.y = (tmpUInt >> 16); tmpUInt = srcUInt[idxSrc]; pixel.z = tmpUInt & 0xffff; pixel.w = (tmpUInt >> 16); - break; - case 4: + } + else { pixel.x = srcUInt[idxSrc++]; pixel.y = srcUInt[idxSrc++]; pixel.z = srcUInt[idxSrc++]; pixel.w = srcUInt[idxSrc]; - break; } break; } @@ -117,12 +111,12 @@ __kernel void copyImageToBuffer( __global ushort* dstUShort, __global uchar* dstUChar, int4 srcOrigin, - int4 dstOrigin, + ulong4 dstOrigin, int4 size, - int4 format, - int4 pitch) + uint4 format, + ulong4 pitch) { - uint idxDst; + ulong idxDst; int4 coordsSrc; uint4 texel; @@ -205,12 +199,12 @@ __kernel void copyImageToBuffer( __kernel void copyImage( __read_only image2d_array_t src, __write_only image2d_array_t dst, - int4 srcOrigin, - int4 dstOrigin, - int4 size) + int4 srcOrigin, + int4 dstOrigin, + int4 size) { - int4 coordsDst; - int4 coordsSrc; + int4 coordsDst; + int4 coordsSrc; coordsDst.x = get_global_id(0); coordsDst.y = get_global_id(1); @@ -234,12 +228,12 @@ __kernel void copyImage( __kernel void copyImage1DA( __read_only image2d_array_t src, __write_only image2d_array_t dst, - int4 srcOrigin, - int4 dstOrigin, - int4 size) + int4 srcOrigin, + int4 dstOrigin, + int4 size) { - int4 coordsDst; - int4 coordsSrc; + int4 coordsDst; + int4 coordsSrc; coordsDst.x = get_global_id(0); coordsDst.y = get_global_id(1); @@ -271,13 +265,13 @@ __kernel void copyImage1DA( __kernel void copyBufferRect( __global uchar* src, __global uchar* dst, - uint4 srcRect, - uint4 dstRect, - uint4 size) + ulong4 srcRect, + ulong4 dstRect, + ulong4 size) { - uint x = (uint)get_global_id(0); - uint y = (uint)get_global_id(1); - uint z = (uint)get_global_id(2); + ulong x = get_global_id(0); + ulong y = get_global_id(1); + ulong z = get_global_id(2); if ((x >= size.x) || (y >= size.y) || @@ -285,8 +279,8 @@ __kernel void copyBufferRect( return; } - uint offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y; - uint offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y; + ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y; + ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y; dst[offsDst] = src[offsSrc]; } @@ -294,13 +288,13 @@ __kernel void copyBufferRect( __kernel void copyBufferRectAligned( __global uint* src, __global uint* dst, - uint4 srcRect, - uint4 dstRect, - uint4 size) + ulong4 srcRect, + ulong4 dstRect, + ulong4 size) { - uint x = (uint)get_global_id(0); - uint y = (uint)get_global_id(1); - uint z = (uint)get_global_id(2); + ulong x = get_global_id(0); + ulong y = get_global_id(1); + ulong z = get_global_id(2); if ((x >= size.x) || (y >= size.y) || @@ -308,8 +302,8 @@ __kernel void copyBufferRectAligned( return; } - uint offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y; - uint offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y; + ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y; + ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y; if (size.w == 16) { __global uint4* src4 = (__global uint4*)src; @@ -324,18 +318,18 @@ __kernel void copyBufferRectAligned( __kernel void copyBuffer( __global uchar* src, __global uchar* dst, - int srcOrigin, - int dstOrigin, - uint size) + ulong srcOrigin, + ulong dstOrigin, + ulong size) { - uint id = (uint)get_global_id(0); + ulong id = get_global_id(0); if (id >= size) { return; } - uint offsSrc = id + srcOrigin; - uint offsDst = id + dstOrigin; + ulong offsSrc = id + srcOrigin; + ulong offsDst = id + dstOrigin; dst[offsDst] = src[offsSrc]; } @@ -343,19 +337,19 @@ __kernel void copyBuffer( __kernel void copyBufferAligned( __global uint* src, __global uint* dst, - int srcOrigin, - int dstOrigin, - uint size, - uint alignment) + ulong srcOrigin, + ulong dstOrigin, + ulong size, + uint alignment) { - uint id = (uint)get_global_id(0); + ulong id = get_global_id(0); if (id >= size) { return; } - uint offsSrc = id + srcOrigin; - uint offsDst = id + dstOrigin; + ulong offsSrc = id + srcOrigin; + ulong offsDst = id + dstOrigin; if (alignment == 16) { __global uint4* src4 = (__global uint4*)src; @@ -371,11 +365,11 @@ __kernel void fillBuffer( __global uchar* bufUChar, __global uint* bufUInt, __constant uchar* pattern, - uint patternSize, - uint offset, - uint size) + uint patternSize, + ulong offset, + ulong size) { - uint id = (uint)get_global_id(0); + ulong id = get_global_id(0); if (id >= size) { return; diff --git a/rocclr/runtime/device/gpu/gpublit.cpp b/rocclr/runtime/device/gpu/gpublit.cpp index 32e1c9e5a7..a5a8a6b8bb 100644 --- a/rocclr/runtime/device/gpu/gpublit.cpp +++ b/rocclr/runtime/device/gpu/gpublit.cpp @@ -1140,7 +1140,7 @@ KernelBlitManager::copyBufferToImage( void CalcRowSlicePitches( - cl_int* pitch, const cl_int* copySize, + cl_ulong* pitch, const cl_int* copySize, size_t rowPitch, size_t slicePitch, const Memory& mem) { size_t memFmtSize = memoryFormatSize(mem.cal()->format_).size_; @@ -1317,17 +1317,17 @@ KernelBlitManager::copyBufferToImageKernel( granularity = 4; } CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_int srcOrg[4] = { (cl_int)srcOrigin[0] / granularity, - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; + cl_ulong srcOrg[4] = { srcOrigin[0] / granularity, + srcOrigin[1], + srcOrigin[2], 0 }; setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); - cl_int dstOrg[4] = { (cl_int)dstOrigin[0], - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; + cl_int dstOrg[4] = { (cl_int)dstOrigin[0], + (cl_int)dstOrigin[1], + (cl_int)dstOrigin[2], 0 }; + cl_int copySize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; if (swapLayer) { dstOrg[2] = dstOrg[1]; dstOrg[1] = 0; @@ -1340,13 +1340,13 @@ KernelBlitManager::copyBufferToImageKernel( // Program memory format uint multiplier = memFmt.size_ / sizeof(uint32_t); multiplier = (multiplier == 0) ? 1 : multiplier; - cl_int format[4] = { (cl_int)memFmt.components_, - (cl_int)memFmt.size_ / (cl_int)memFmt.components_, - (cl_int)multiplier, 0 }; + cl_uint format[4] = { memFmt.components_, + memFmt.size_ / memFmt.components_, + multiplier, 0 }; setArgument(kernels_[blitType], 5, sizeof(format), format); // Program row and slice pitches - cl_int pitch[4] = { 0 }; + cl_ulong pitch[4] = { 0 }; CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory)); setArgument(kernels_[blitType], 6, sizeof(pitch), pitch); @@ -1661,12 +1661,12 @@ KernelBlitManager::copyImageToBufferKernel( setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem); setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem); - cl_int srcOrg[4] = { (cl_int)srcOrigin[0], - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; + cl_int srcOrg[4] = { (cl_int)srcOrigin[0], + (cl_int)srcOrigin[1], + (cl_int)srcOrigin[2], 0 }; + cl_int copySize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; if (swapLayer) { srcOrg[2] = srcOrg[1]; srcOrg[1] = 0; @@ -1685,22 +1685,22 @@ KernelBlitManager::copyImageToBufferKernel( granularity = 4; } CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_int dstOrg[4] = { (cl_int)dstOrigin[0] / granularity, - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0 }; + cl_ulong dstOrg[4] = { dstOrigin[0] / granularity, + dstOrigin[1], + dstOrigin[2], 0 }; setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg); setArgument(kernels_[blitType], 6, sizeof(copySize), copySize); // Program memory format uint multiplier = memFmt.size_ / sizeof(uint32_t); multiplier = (multiplier == 0) ? 1 : multiplier; - cl_int format[4] = { (cl_int)memFmt.components_, - (cl_int)memFmt.size_ / (cl_int)memFmt.components_, - (cl_int)multiplier, 0 }; + cl_uint format[4] = { memFmt.components_, + memFmt.size_ / memFmt.components_, + multiplier, 0 }; setArgument(kernels_[blitType], 7, sizeof(format), format); // Program row and slice pitches - cl_int pitch[4] = { 0 }; + cl_ulong pitch[4] = { 0 }; CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory)); setArgument(kernels_[blitType], 8, sizeof(pitch), pitch); @@ -2130,18 +2130,15 @@ KernelBlitManager::copyBufferRect( setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); mem = &gpuMem(dstMemory); setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - cl_uint src[4] = { (cl_uint)srcRect.rowPitch_, - (cl_uint)srcRect.slicePitch_, - (cl_uint)srcRect.start_, 0 }; + cl_ulong src[4] = { srcRect.rowPitch_, + srcRect.slicePitch_, + srcRect.start_, 0 }; setArgument(kernels_[blitType], 2, sizeof(src), src); - cl_uint dst[4] = { (cl_uint)dstRect.rowPitch_, - (cl_uint)dstRect.slicePitch_, - (cl_uint)dstRect.start_, 0 }; + cl_ulong dst[4] = { dstRect.rowPitch_, + dstRect.slicePitch_, + dstRect.start_, 0 }; setArgument(kernels_[blitType], 3, sizeof(dst), dst); - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], - (cl_int)CopyRectAlignment[i] }; + cl_ulong copySize[4] = { size[0], size[1], size[2], CopyRectAlignment[i] }; setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); // Create ND range object for the kernel's execution @@ -2426,7 +2423,7 @@ KernelBlitManager::fillBuffer( else { uint fillType = FillBuffer; size_t globalWorkOffset[3] = { 0, 0, 0 }; - cl_int fillSize = size[0] / patternSize; + cl_ulong fillSize = size[0] / patternSize; size_t globalWorkSize = amd::alignUp(fillSize, 256); size_t localWorkSize = 256; bool dwordAligned = @@ -2450,7 +2447,7 @@ KernelBlitManager::fillBuffer( memcpy(constBuf, pattern, patternSize); gpuCB->unmap(&gpu()); setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB); - cl_int offset = origin[0]; + cl_ulong offset = origin[0]; if (dwordAligned) { patternSize /= sizeof(uint32_t); offset /= sizeof(uint32_t); @@ -2528,14 +2525,14 @@ KernelBlitManager::copyBuffer( mem = &gpuMem(dstMemory); setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); // Program source origin - cl_int srcOffset = srcOrigin[0] / CopyBuffAlignment[i];; + cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];; setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); // Program destinaiton origin - cl_int dstOffset = dstOrigin[0] / CopyBuffAlignment[i];; + cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];; setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); - cl_int copySize = size[0]; + cl_ulong copySize = size[0]; setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size); if (blitType == BlitCopyBufferAligned) { diff --git a/rocclr/runtime/device/gpu/gpusettings.cpp b/rocclr/runtime/device/gpu/gpusettings.cpp index ab74023527..cb095cb0b0 100644 --- a/rocclr/runtime/device/gpu/gpusettings.cpp +++ b/rocclr/runtime/device/gpu/gpusettings.cpp @@ -320,7 +320,12 @@ Settings::create( supportDepthsRGB_ = true; } if (use64BitPtr_) { - maxAllocSize_ = 4048 * Mi; + if (GPU_ENABLE_LARGE_ALLOCATION) { + maxAllocSize_ = 16ULL * Gi; + } + else { + maxAllocSize_ = 4048 * Mi; + } } else { maxAllocSize_ = 3ULL * Gi; diff --git a/rocclr/runtime/device/hsa/hsablit.cpp b/rocclr/runtime/device/hsa/hsablit.cpp index ddd0e3df35..f270745fea 100644 --- a/rocclr/runtime/device/hsa/hsablit.cpp +++ b/rocclr/runtime/device/hsa/hsablit.cpp @@ -566,7 +566,7 @@ HsaBlitManager::importExportImage( static void CalcRowSlicePitches( - cl_int* pitch, const cl_int* copySize, + cl_ulong* pitch, const cl_int* copySize, size_t rowPitch, size_t slicePitch, const Memory& mem) { const oclhsa::Image &hsaImage = static_cast< const oclhsa::Image &>(mem); @@ -1067,14 +1067,14 @@ KernelBlitManager::copyBuffer( clmem = ((cl_mem) as_cl(dstMemory.owner())); kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); // Program source origin - cl_int srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; + cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; kernels_[blitType]->parameters().set(2, sizeof(srcOffset), &srcOffset); // Program destinaiton origin - cl_int dstOffset = dstOrigin[0] / CopyBuffAlignment[i]; + cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i]; kernels_[blitType]->parameters().set(3, sizeof(dstOffset), &dstOffset); - cl_int copySize = size[0]; + cl_ulong copySize = size[0]; kernels_[blitType]->parameters().set(4, sizeof(copySize), ©Size); if (blitType == BlitCopyBufferAligned) { @@ -1187,18 +1187,18 @@ KernelBlitManager::copyBufferRect( kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); clmem = ((cl_mem) as_cl(dstMemory.owner())); kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - cl_uint src[4] = { (cl_uint)srcRect.rowPitch_, - (cl_uint)srcRect.slicePitch_, - (cl_uint)srcRect.start_, 0 }; + cl_ulong src[4] = { srcRect.rowPitch_, + srcRect.slicePitch_, + srcRect.start_, 0 }; kernels_[blitType]->parameters().set(2, sizeof(src), src); - cl_uint dst[4] = { (cl_uint)dstRect.rowPitch_, - (cl_uint)dstRect.slicePitch_, - (cl_uint)dstRect.start_, 0 }; + cl_ulong dst[4] = { dstRect.rowPitch_, + dstRect.slicePitch_, + dstRect.start_, 0 }; kernels_[blitType]->parameters().set(3, sizeof(dst), dst); - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], - (cl_int)CopyRectAlignment[i] }; + cl_ulong copySize[4] = { size[0], + size[1], + size[2], + CopyRectAlignment[i] }; kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize); // Create ND range object for the kernel's execution @@ -1311,23 +1311,23 @@ KernelBlitManager::copyImageToBuffer( granularity = 4; } CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_int dstOrg[4] = { (cl_int)dstOrigin[0] / granularity, - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], - 0 }; + cl_ulong dstOrg[4] = { dstOrigin[0] / granularity, + dstOrigin[1], + dstOrigin[2], + 0 }; kernels_[blitType]->parameters().set(5, sizeof(dstOrg), dstOrg); kernels_[blitType]->parameters().set(6, sizeof(copySize), copySize); // Program memory format uint multiplier = elementSize / sizeof(uint32_t); multiplier = (multiplier == 0) ? 1 : multiplier; - cl_int format[4] = { (cl_int)numChannels, - (cl_int)elementSize / (cl_int)numChannels, - (cl_int)multiplier, 0 }; + cl_uint format[4] = { (cl_uint)numChannels, + (cl_uint)(elementSize / numChannels), + multiplier, 0 }; kernels_[blitType]->parameters().set(7, sizeof(format), format); // Program row and slice pitches - cl_int pitch[4] = { 0 }; + cl_ulong pitch[4] = { 0 }; CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, srcImage); kernels_[blitType]->parameters().set(8, sizeof(pitch), pitch); @@ -1429,9 +1429,9 @@ KernelBlitManager::copyBufferToImage( granularity = 4; } CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_int srcOrg[4] = { (cl_int)srcOrigin[0] / granularity, - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; + cl_ulong srcOrg[4] = { srcOrigin[0] / granularity, + srcOrigin[1], + srcOrigin[2], 0 }; kernels_[blitType]->parameters().set(2, sizeof(srcOrg), srcOrg); cl_int dstOrg[4] = { (cl_int)dstOrigin[0], @@ -1447,13 +1447,13 @@ KernelBlitManager::copyBufferToImage( // Program memory format uint multiplier = elementSize / sizeof(uint32_t); multiplier = (multiplier == 0) ? 1 : multiplier; - cl_int format[4] = { (cl_int)numChannels, - (cl_int)elementSize / (cl_int)numChannels, - (cl_int)multiplier, 0 }; + cl_uint format[4] = { (cl_uint)numChannels, + (cl_uint)(elementSize / numChannels), + multiplier, 0 }; kernels_[blitType]->parameters().set(5, sizeof(format), format); // Program row and slice pitches - cl_int pitch[4] = { 0 }; + cl_ulong pitch[4] = { 0 }; CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, dstImage); kernels_[blitType]->parameters().set(6, sizeof(pitch), pitch); @@ -1617,7 +1617,7 @@ KernelBlitManager::fillBuffer( uint fillType = FillBuffer; size_t globalWorkOffset[3] = { 0, 0, 0 }; - cl_int fillSize = size[0] / patternSize; + cl_ulong fillSize = size[0] / patternSize; size_t globalWorkSize = amd::alignUp(fillSize, 256); size_t localWorkSize = 256; bool dwordAligned = @@ -1650,14 +1650,14 @@ KernelBlitManager::fillBuffer( cl_mem clmem = ((cl_mem) as_cl(fillMemory)); kernels_[fillType]->parameters().set(2, sizeof(cl_mem), &clmem); - cl_int offset = origin[0]; + cl_ulong offset = origin[0]; if (dwordAligned) { patternSize /= sizeof(uint32_t); offset /= sizeof(uint32_t); } kernels_[fillType]->parameters().set(3, sizeof(cl_uint), &patternSize); kernels_[fillType]->parameters().set(4, sizeof(offset), &offset); - kernels_[fillType]->parameters().set(5, sizeof(cl_mem), &fillSize); + kernels_[fillType]->parameters().set(5, sizeof(fillSize), &fillSize); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(1, diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp index 669cb79705..be94c00834 100644 --- a/rocclr/runtime/utils/flags.hpp +++ b/rocclr/runtime/utils/flags.hpp @@ -152,6 +152,8 @@ release(bool, GPU_DIRECT_SRD, false, \ "Use indirect SRD access in HSAIL") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ +release(bool, GPU_ENABLE_LARGE_ALLOCATION, false, \ + "Enable >4GB single allocations") \ release(bool, AMD_DEPTH_MSAA_INTEROP, false, \ "Enable depth stencil and MSAA buffer interop") \ release(bool, AMD_THREAD_TRACE_ENABLE, false, \