diff --git a/rocclr/runtime/device/blitcl.cpp b/rocclr/runtime/device/blitcl.cpp index 4a79a0e597..2462e0cf4e 100644 --- a/rocclr/runtime/device/blitcl.cpp +++ b/rocclr/runtime/device/blitcl.cpp @@ -7,430 +7,160 @@ namespace device { #define BLIT_KERNELS(...) #__VA_ARGS__ const char* BlitSourceCode = BLIT_KERNELS( -\n + +extern void __amd_copyBufferToImage( + __global uint*, __write_only image2d_array_t, ulong4, + int4, int4, uint4, ulong4); + +extern void __amd_copyImageToBuffer( + __read_only image2d_array_t, __global uint*, __global ushort*, + __global uchar*, int4, ulong4, int4, uint4, ulong4); + +extern void __amd_copyImage( + __read_only image2d_array_t, __write_only image2d_array_t, + int4, int4, int4); + +extern void __amd_copyImage1DA( + __read_only image2d_array_t, __write_only image2d_array_t, + int4, int4, int4); + +extern void __amd_copyBufferRect( + __global uchar*, __global uchar*, + ulong4, ulong4, ulong4); + +extern void __amd_copyBufferRectAligned( + __global uint*, __global uint*, + ulong4, ulong4, ulong4); + +extern void __amd_copyBuffer( + __global uchar*, __global uchar*, + ulong, ulong, ulong); + +extern void __amd_copyBufferAligned( + __global uint*, __global uint*, + ulong, ulong, ulong, uint); + +extern void __amd_fillBuffer( + __global uchar*, __global uint*, __constant uchar*, + uint, ulong, ulong); + +extern void __amd_fillImage( + __write_only image2d_array_t, + float4, int4, uint4, int4, int4, uint); + + __kernel void copyBufferToImage( - __global uint* src, - __write_only image2d_array_t dst, - ulong4 srcOrigin, - int4 dstOrigin, - int4 size, - uint4 format, - ulong4 pitch) + __global uint* src, + __write_only image2d_array_t dst, + ulong4 srcOrigin, + int4 dstOrigin, + int4 size, + uint4 format, + ulong4 pitch) { - ulong idxSrc; - int4 coordsDst; - uint4 pixel; - __global uint* srcUInt = src; - __global ushort* srcUShort = (__global ushort*)src; - __global uchar* srcUChar = (__global uchar*)src; - ushort tmpUShort; - uint tmpUInt; - - coordsDst.x = get_global_id(0); - coordsDst.y = get_global_id(1); - coordsDst.z = get_global_id(2); - coordsDst.w = 0; - - if ((coordsDst.x >= size.x) || - (coordsDst.y >= size.y) || - (coordsDst.z >= size.z)) { - return; - } - - idxSrc = (coordsDst.z * pitch.y + - coordsDst.y * pitch.x + coordsDst.x) * - format.z + srcOrigin.x; - - coordsDst.x += dstOrigin.x; - coordsDst.y += dstOrigin.y; - coordsDst.z += dstOrigin.z; - - // Check components - switch (format.x) { - case 1: - // Check size - if (format.y == 1) { - pixel.x = (uint)srcUChar[idxSrc]; - } - else if (format.y == 2) { - pixel.x = (uint)srcUShort[idxSrc]; - } - else { - pixel.x = srcUInt[idxSrc]; - } - break; - case 2: - // Check size - if (format.y == 1) { - tmpUShort = srcUShort[idxSrc]; - pixel.x = (uint)(tmpUShort & 0xff); - pixel.y = (uint)(tmpUShort >> 8); - } - else if (format.y == 2) { - tmpUInt = srcUInt[idxSrc]; - pixel.x = (tmpUInt & 0xffff); - pixel.y = (tmpUInt >> 16); - } - else { - pixel.x = srcUInt[idxSrc++]; - pixel.y = srcUInt[idxSrc]; - } - break; - case 4: - // Check size - if (format.y == 1) { - tmpUInt = srcUInt[idxSrc]; - pixel.x = tmpUInt & 0xff; - pixel.y = (tmpUInt >> 8) & 0xff; - pixel.z = (tmpUInt >> 16) & 0xff; - pixel.w = (tmpUInt >> 24) & 0xff; - } - else if (format.y == 2) { - tmpUInt = srcUInt[idxSrc++]; - pixel.x = tmpUInt & 0xffff; - pixel.y = (tmpUInt >> 16); - tmpUInt = srcUInt[idxSrc]; - pixel.z = tmpUInt & 0xffff; - pixel.w = (tmpUInt >> 16); - } - else { - pixel.x = srcUInt[idxSrc++]; - pixel.y = srcUInt[idxSrc++]; - pixel.z = srcUInt[idxSrc++]; - pixel.w = srcUInt[idxSrc]; - } - break; - } - // Write the final pixel - write_imageui(dst, coordsDst, pixel); + __amd_copyBufferToImage(src, dst, srcOrigin, dstOrigin, size, format, pitch); } -\n + __kernel void copyImageToBuffer( - __read_only image2d_array_t src, - __global uint* dstUInt, - __global ushort* dstUShort, - __global uchar* dstUChar, - int4 srcOrigin, - ulong4 dstOrigin, - int4 size, - uint4 format, - ulong4 pitch) + __read_only image2d_array_t src, + __global uint* dstUInt, + __global ushort* dstUShort, + __global uchar* dstUChar, + int4 srcOrigin, + ulong4 dstOrigin, + int4 size, + uint4 format, + ulong4 pitch) { - ulong idxDst; - int4 coordsSrc; - uint4 texel; - - coordsSrc.x = get_global_id(0); - coordsSrc.y = get_global_id(1); - coordsSrc.z = get_global_id(2); - coordsSrc.w = 0; - - if ((coordsSrc.x >= size.x) || - (coordsSrc.y >= size.y) || - (coordsSrc.z >= size.z)) { - return; - } - - idxDst = (coordsSrc.z * pitch.y + coordsSrc.y * pitch.x + - coordsSrc.x) * format.z + dstOrigin.x; - - coordsSrc.x += srcOrigin.x; - coordsSrc.y += srcOrigin.y; - coordsSrc.z += srcOrigin.z; - - texel = read_imageui(src, coordsSrc); - - // Check components - switch (format.x) { - case 1: - // Check size - switch (format.y) { - case 1: - dstUChar[idxDst] = (uchar)texel.x; - break; - case 2: - dstUShort[idxDst] = (ushort)texel.x; - break; - case 4: - dstUInt[idxDst] = texel.x; - break; - } - break; - case 2: - // Check size - switch (format.y) { - case 1: - dstUShort[idxDst] = (ushort)texel.x | - ((ushort)texel.y << 8); - break; - case 2: - dstUInt[idxDst] = texel.x | (texel.y << 16); - break; - case 4: - dstUInt[idxDst++] = texel.x; - dstUInt[idxDst] = texel.y; - break; - } - break; - case 4: - // Check size - switch (format.y) { - case 1: - dstUInt[idxDst] = (uint)texel.x | - (texel.y << 8) | - (texel.z << 16) | - (texel.w << 24); - break; - case 2: - dstUInt[idxDst++] = texel.x | (texel.y << 16); - dstUInt[idxDst] = texel.z | (texel.w << 16); - break; - case 4: - dstUInt[idxDst++] = texel.x; - dstUInt[idxDst++] = texel.y; - dstUInt[idxDst++] = texel.z; - dstUInt[idxDst] = texel.w; - break; - } - break; - } + __amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar, + srcOrigin, dstOrigin, size, format, pitch); } -\n + __kernel void copyImage( __read_only image2d_array_t src, __write_only image2d_array_t dst, - int4 srcOrigin, - int4 dstOrigin, - int4 size) + int4 srcOrigin, + int4 dstOrigin, + int4 size) { - int4 coordsDst; - int4 coordsSrc; - - coordsDst.x = get_global_id(0); - coordsDst.y = get_global_id(1); - coordsDst.z = get_global_id(2); - coordsDst.w = 0; - - if ((coordsDst.x >= size.x) || - (coordsDst.y >= size.y) || - (coordsDst.z >= size.z)) { - return; - } - - coordsSrc = srcOrigin + coordsDst; - coordsDst += dstOrigin; - - uint4 texel; - texel = read_imageui(src, coordsSrc); - write_imageui(dst, coordsDst, texel); + __amd_copyImage(src, dst, srcOrigin, dstOrigin, size); } -\n + __kernel void copyImage1DA( - __read_only image2d_array_t src, - __write_only image2d_array_t dst, - int4 srcOrigin, - int4 dstOrigin, - int4 size) + __read_only image2d_array_t src, + __write_only image2d_array_t dst, + int4 srcOrigin, + int4 dstOrigin, + int4 size) { - int4 coordsDst; - int4 coordsSrc; - - coordsDst.x = get_global_id(0); - coordsDst.y = get_global_id(1); - coordsDst.z = get_global_id(2); - coordsDst.w = 0; - - if ((coordsDst.x >= size.x) || - (coordsDst.y >= size.y) || - (coordsDst.z >= size.z)) { - return; - } - - coordsSrc = srcOrigin + coordsDst; - coordsDst += dstOrigin; - if (srcOrigin.w != 0) { - coordsSrc.z = coordsSrc.y; - coordsSrc.y = 0; - } - if (dstOrigin.w != 0) { - coordsDst.z = coordsDst.y; - coordsDst.y = 0; - } - - uint4 texel; - texel = read_imageui(src, coordsSrc); - write_imageui(dst, coordsDst, texel); + __amd_copyImage1DA(src, dst, srcOrigin, dstOrigin, size); } -\n + __kernel void copyBufferRect( - __global uchar* src, - __global uchar* dst, - ulong4 srcRect, - ulong4 dstRect, - ulong4 size) + __global uchar* src, + __global uchar* dst, + ulong4 srcRect, + ulong4 dstRect, + ulong4 size) { - ulong x = get_global_id(0); - ulong y = get_global_id(1); - ulong z = get_global_id(2); - - if ((x >= size.x) || - (y >= size.y) || - (z >= size.z)) { - return; - } - - ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y; - ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y; - - dst[offsDst] = src[offsSrc]; + __amd_copyBufferRect(src, dst, srcRect, dstRect, size); } -\n + __kernel void copyBufferRectAligned( - __global uint* src, - __global uint* dst, - ulong4 srcRect, - ulong4 dstRect, - ulong4 size) + __global uint* src, + __global uint* dst, + ulong4 srcRect, + ulong4 dstRect, + ulong4 size) { - ulong x = get_global_id(0); - ulong y = get_global_id(1); - ulong z = get_global_id(2); - - if ((x >= size.x) || - (y >= size.y) || - (z >= size.z)) { - return; - } - - ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y; - ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y; - - if (size.w == 16) { - __global uint4* src4 = (__global uint4*)src; - __global uint4* dst4 = (__global uint4*)dst; - dst4[offsDst] = src4[offsSrc]; - } - else { - dst[offsDst] = src[offsSrc]; - } + __amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size); } -\n + __kernel void copyBuffer( - __global uchar* src, - __global uchar* dst, - ulong srcOrigin, - ulong dstOrigin, - ulong size) + __global uchar* src, + __global uchar* dst, + ulong srcOrigin, + ulong dstOrigin, + ulong size) { - ulong id = get_global_id(0); - - if (id >= size) { - return; - } - - ulong offsSrc = id + srcOrigin; - ulong offsDst = id + dstOrigin; - - dst[offsDst] = src[offsSrc]; + __amd_copyBuffer(src, dst, srcOrigin, dstOrigin, size); } -\n + __kernel void copyBufferAligned( - __global uint* src, - __global uint* dst, - ulong srcOrigin, - ulong dstOrigin, - ulong size, - uint alignment) + __global uint* src, + __global uint* dst, + ulong srcOrigin, + ulong dstOrigin, + ulong size, + uint alignment) { - ulong id = get_global_id(0); - - if (id >= size) { - return; - } - - ulong offsSrc = id + srcOrigin; - ulong offsDst = id + dstOrigin; - - if (alignment == 16) { - __global uint4* src4 = (__global uint4*)src; - __global uint4* dst4 = (__global uint4*)dst; - dst4[offsDst] = src4[offsSrc]; - } - else { - dst[offsDst] = src[offsSrc]; - } + __amd_copyBufferAligned(src, dst, srcOrigin, dstOrigin, size, alignment); } -\n + __kernel void fillBuffer( - __global uchar* bufUChar, - __global uint* bufUInt, - __constant uchar* pattern, - uint patternSize, - ulong offset, - ulong size) + __global uchar* bufUChar, + __global uint* bufUInt, + __constant uchar* pattern, + uint patternSize, + ulong offset, + ulong size) { - ulong id = get_global_id(0); - - if (id >= size) { - return; - } - - if (bufUInt) { - __global uint* element = &bufUInt[offset + id * patternSize]; - __constant uint* pt = (__constant uint*)pattern; - - for (uint i = 0; i < patternSize; ++i) { - element[i] = pt[i]; - } - } - else { - __global uchar* element = &bufUChar[offset + id * patternSize]; - - for (uint i = 0; i < patternSize; ++i) { - element[i] = pattern[i]; - } - } + __amd_fillBuffer(bufUChar, bufUInt, pattern, patternSize, offset, size); } -\n + __kernel void fillImage( - __write_only image2d_array_t image, - float4 patternFLOAT4, - int4 patternINT4, - uint4 patternUINT4, - int4 origin, - int4 size, - uint type) + __write_only image2d_array_t image, + float4 patternFLOAT4, + int4 patternINT4, + uint4 patternUINT4, + int4 origin, + int4 size, + uint type) { - int4 coords; - - coords.x = get_global_id(0); - coords.y = get_global_id(1); - coords.z = get_global_id(2); - coords.w = 0; - - if ((coords.x >= size.x) || - (coords.y >= size.y) || - (coords.z >= size.z)) { - return; - } - - coords += origin; - - // Check components - switch (type) { - case 0: - write_imagef(image, coords, patternFLOAT4); - break; - case 1: - write_imagei(image, coords, patternINT4); - break; - case 2: - write_imageui(image, coords, patternUINT4); - break; - } + __amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4, + origin, size, type); } -\n -\n + ); } // namespace device diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp index e0c1a73be6..3a05b1a2d3 100644 --- a/rocclr/runtime/device/gpu/gpuschedcl.cpp +++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp @@ -8,519 +8,17 @@ namespace gpu { const char* SchedulerSourceCode = SCHEDULER_KERNEL( \n -//! AmdAqlWrap slot state -enum AqlWrapState { - AQL_WRAP_FREE = 0, - AQL_WRAP_RESERVED, - AQL_WRAP_READY, - AQL_WRAP_MARKER, - AQL_WRAP_BUSY, - AQL_WRAP_DONE -}; - -//! Profiling states -enum ProfilingState { - PROFILING_COMMAND_START = 0, - PROFILING_COMMAND_END, - PROFILING_COMMAND_COMPLETE -}; - -typedef struct _HsaAqlDispatchPacket { - uint mix; - ushort workgroup_size[3]; - ushort reserved2; - uint grid_size[3]; - uint private_segment_size_bytes; - uint group_segment_size_bytes; - ulong kernel_object_address; - ulong kernel_arg_address; - ulong reserved3; - ulong completion_signal; -} HsaAqlDispatchPacket; - -typedef struct _AmdVQueueHeader { - uint aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64). - uint event_slot_num; //!< [LRO] The number of kernel events in the events buffer - ulong event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events - ulong event_slots; //!< [LRO] Pointer to a buffer for the events. - // Array of event_slot_num entries of AmdEvent - ulong aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots - uint command_counter; //!< [LRW] The global counter for the submitted commands into the queue - uint wait_size; //!< [LRO] The wait list size (in clk_event_t) - uint arg_size; //!< [LRO] The size of argument buffer (in bytes) - uint reserved0; //!< For the future usage - ulong kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry) - uint reserved[2]; //!< For the future usage -} AmdVQueueHeader; - -typedef struct _AmdAqlWrap { - uint state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY, - // MARKER, BUSY and DONE. The block could be returned back to a free state. - uint enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start – - // (kernel_enqueue_flags_t) - // CLK_ENQUEUE_FLAGS_NO_WAIT – we just start processing - // CLK_ENQUEUE_FLAGS_WAIT_KERNEL – check if parent_wrap->state is done and then start processing - // CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP - currently == WAIT_KERNEL - uint command_id; //!< [LWO/SRO] The unique command ID - uint child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels. - // It’s incremented on the - // start and decremented on the finish. The parent kernel can be considered as - // done when the value is 0 and the state is DONE - ulong completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) - ulong parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) - ulong wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) - uint wait_num; //!< [LWO/SRO] The number of cl_event_wait objects - uint reserved[5]; //!< For the future usage - HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet -} AmdAqlWrap; - -typedef struct _AmdEvent { - uint state; //!< [LRO/SRW] Event state: START, END, COMPLETE - uint counter; //!< [LRW] Event retain/release counter. 0 means the event is free - ulong timer[3]; //!< [LRO/SWO] Timer values for profiling for each state - ulong capture_info; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME -} AmdEvent; - -typedef struct _SchedulerParam { - uint signal; //!< Signal to stop the child queue - uint eng_clk; //!< Engine clock in Mhz - ulong hw_queue; //!< Address to HW queue - ulong hsa_queue; //!< Address to HSA dummy queue - uint reserved; //!< reserved - uint scratchSize; //!< Scratch buffer size - ulong scratch; //!< GPU address to the scratch buffer - uint numMaxWaves; //!< Num max waves on the asic - uint releaseHostCP; //!< Releases CP on the host queue - ulong parentAQL; //!< Host parent AmdAqlWrap packet - uint dedicatedQueue; //!< Scheduler uses a dedicated queue - uint scratchOffset; //!< Scratch buffer offset -} SchedulerParam; - -typedef struct _HwDispatchHeader { - uint pad00; // CP WRITE_DATA write to rewind for memory - uint pad01; - uint pad02; - uint pad03; - uint rewind; // REWIND execution - uint startExe; // valid bit - uint condExe0; // 0xC0032200 -- TYPE 3, COND_EXEC - uint condExe1; // 0x00000204 ---- - uint condExe2; // 0x00000000 ---- - uint condExe3; // 0x00000000 ---- - uint condExe4; // 0x00000000 ---- -} HwDispatchHeader; - -typedef struct _HwDispatch { - uint packet0; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values) - uint offset0; // 0x00000204 ---- OFFSET - uint startX; // 0x00000000 ---- COMPUTE_START_X: START = 0x0 - uint startY; // 0x00000000 ---- COMPUTE_START_Y: START = 0x0 - uint startZ; // 0x00000000 ---- COMPUTE_START_Z: START = 0x0 - uint wrkGrpSizeX; // 0x00000000 ---- COMPUTE_NUM_THREAD_X: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0 - uint wrkGrpSizeY; // 0x00000000 ---- COMPUTE_NUM_THREAD_Y: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0 - uint wrkGrpSizeZ; // 0x00000000 ---- COMPUTE_NUM_THREAD_Z: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0 - uint packet1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offset1; // 0x0000020C ---- OFFSET - uint isaLo; // 0x00000000 ---- COMPUTE_PGM_LO: DATA = 0x0 - uint isaHi; // 0x00000000 ---- COMPUTE_PGM_HI: DATA = 0x0, INST_ATC__CI__VI = 0x0 - uint packet2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offset2; // 0x00000212 ---- OFFSET - uint resource1; // 0x00000000 ---- COMPUTE_PGM_RSRC1: VGPRS = 0x0, SGPRS = 0x0, PRIORITY = 0x0, FLOAT_MODE = 0x0, PRIV = 0x0, DX10_CLAMP = 0x0, DEBUG_MODE = 0x0, IEEE_MODE = 0x0, BULKY__CI__VI = 0x0, CDBG_USER__CI__VI = 0x0 - uint resource2; // 0x00000000 ---- COMPUTE_PGM_RSRC2: SCRATCH_EN = 0x0, USER_SGPR = 0x0, TRAP_PRESENT = 0x0, TGID_X_EN = 0x0, TGID_Y_EN = 0x0, TGID_Z_EN = 0x0, TG_SIZE_EN = 0x0, TIDIG_COMP_CNT = 0x0, EXCP_EN_MSB__CI__VI = 0x0, LDS_SIZE = 0x0, EXCP_EN = 0x0 - uint packet3; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values) - uint offset3; // 0x00000215 ---- OFFSET - uint pad31; // 0x00000000 ---- COMPUTE_RESOURCE_LIMITS: WAVES_PER_SH = 0x0, TG_PER_CU = 0x0, LOCK_THRESHOLD = 0x0, SIMD_DEST_CNTL = 0x0, FORCE_SIMD_DIST__CI__VI = 0x0, CU_GROUP_COUNT__CI__VI = 0x0 - uint pad32; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE0: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF - uint pad33; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE1: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF - uint ringSize; // 0x00000000 ---- COMPUTE_TMPRING_SIZE: WAVES = 0x0, WAVESIZE = 0x0 - uint pad34; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE2__CI__VI: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF - uint pad35; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE3__CI__VI: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF - uint user0; // 0xC0047602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (4 values) - uint offsUser0; // 0x00000240 ---- OFFSET - uint scratchLo; // 0x00000000 ---- COMPUTE_USER_DATA_0: DATA = 0x0 - uint scratchHi; // 0x80000000 ---- COMPUTE_USER_DATA_1: DATA = 0x80000000 - uint scratchSize; // 0x00000000 ---- COMPUTE_USER_DATA_2: DATA = 0x0 - uint padUser; // 0x00EA7FAC ---- COMPUTE_USER_DATA_3: DATA = 0xEA7FAC - uint user1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offsUser1; // 0x00000244 ---- OFFSET - uint aqlPtrLo; // 0x00000000 ---- COMPUTE_USER_DATA_4: DATA = 0x0 - uint aqlPtrHi; // 0x00000000 ---- COMPUTE_USER_DATA_5: DATA = 0x0 - uint user2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offsUser2; // 0x00000246 ---- OFFSET - uint hsaQueueLo; // 0x00000000 ---- COMPUTE_USER_DATA_6: DATA = 0x0 - uint hsaQueueHi; // 0x00000000 ---- COMPUTE_USER_DATA_7: DATA = 0x0 - uint user3; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offsUser3; // 0x00000246 ---- OFFSET - uint argsLo; // 0x00000000 ---- COMPUTE_USER_DATA_8: DATA = 0x0 - uint argsHi; // 0x00000000 ---- COMPUTE_USER_DATA_9: DATA = 0x0 - uint copyData; // 0xC0044000 -- TYPE 3, COPY_DATA - uint copyDataFlags; // 0x00000405 ---- srcSel 0x5, destSel 0x4, countSel 0x0, wrConfirm 0x0, engineSel 0x0 - uint scratchAddrLo; // 0x000201C4 ---- srcAddressLo - uint scratchAddrHi; // 0x00000000 ---- srcAddressHi - uint shPrivateLo; // 0x00002580 ---- dstAddressLo - uint shPrivateHi; // 0x00000000 ---- dstAddressHi - uint user4; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offsUser4; // 0x00000248 ---- OFFSET - uint scratchOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0 - uint privSize; // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30 - uint packet4; // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE - uint glbSizeX; // 0x00000000 - uint glbSizeY; // 0x00000000 - uint glbSizeZ; // 0x00000000 - uint padd41; // 0x00000021 -} HwDispatch; - -const uint ResumeExecution = 0x80000000; // 0x81000000 -const uint StallExecution = 0x00000000; // 0x01000000 -const uint WavefrontSize = 64; -const uint MaxWaveSize = 0x400; - -static inline void -dispatch( - volatile __global HwDispatch* dispatch, - __global HsaAqlDispatchPacket* aqlPkt, - ulong scratch, - ulong hsaQueue, - uint scratchSize, - uint scratchOffset, - uint numMaxWaves) -{ - const uint UsrRegOffset = 0x240; - const uint Pm4Nop = 0xC0001002; - const uint Pm4UserRegs = 0xC0007602; - const uint Pm4CopyReg = 0xC0044000; - - uint usrRegCnt = 0; - - dispatch->wrkGrpSizeX = aqlPkt->workgroup_size[0]; - dispatch->wrkGrpSizeY = aqlPkt->workgroup_size[1]; - dispatch->wrkGrpSizeZ = aqlPkt->workgroup_size[2]; - // ISA address - __global uchar* kernelObj = (__global uchar*)aqlPkt->kernel_object_address; - ulong isa = aqlPkt->kernel_object_address + *((__global uint*)(kernelObj + 0x10)); - dispatch->isaLo = (uint)(isa >> 8); - dispatch->isaHi = (uint)(isa >> 40); - - // Program PGM resource registers - dispatch->resource1 = *((__global uint*)(kernelObj + 0x30)); - dispatch->resource2 = *((__global uint*)(kernelObj + 0x34)); - uint flags = *((__global uint*)(kernelObj + 0x38)); - uint privateSize = *((__global uint*)(kernelObj + 0x50)); - - uint ldsSize = aqlPkt->group_segment_size_bytes + - *((__global uint*)(kernelObj + 0x54)); - // Align up the LDS blocks 128 * 4(in DWORDs) - uint ldsBlocks = (ldsSize + 511) >> 9; - dispatch->resource2 |= (ldsBlocks << 15); - - // Workaround for compiler bug - dispatch->scratchLo = (flags & 1); - // privSegEna = (flags & 1); - if (flags & 0x1) { - uint waveSize = privateSize * WavefrontSize; - // 256 DWRODs is the minimum for SQ - waveSize = max(MaxWaveSize, waveSize); - uint numWaves = scratchSize / waveSize; - numWaves = min(numWaves, numMaxWaves); - dispatch->ringSize = numWaves; - dispatch->ringSize |= (waveSize >> 10) << 12; - dispatch->user0 = Pm4UserRegs | (4 << 16); - dispatch->scratchLo = (uint)scratch; - dispatch->scratchHi = ((uint)(scratch >> 32)) | 0x80000000; // Enables swizzle - dispatch->scratchSize = scratchSize; - usrRegCnt += 4; - } - else { - dispatch->ringSize = 0; - dispatch->user0 = Pm4Nop | (4 << 16); - } - - // dispatchEna = (flags & 0x2); - dispatch->user1 = (flags & 0x2) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16)); - dispatch->offsUser1 = UsrRegOffset + usrRegCnt; - usrRegCnt += (flags & 0x2) ? 2 : 0; - ulong gpuAqlPtr = (ulong)aqlPkt; - dispatch->aqlPtrLo = (uint)gpuAqlPtr; - dispatch->aqlPtrHi = (uint)(gpuAqlPtr >> 32); - - // queuePtr = (flags & 0x4); - if (flags & 0x4) { - dispatch->user2 = Pm4UserRegs | (2 << 16); - dispatch->offsUser2 = UsrRegOffset + usrRegCnt; - usrRegCnt += 2; - dispatch->hsaQueueLo = (uint)hsaQueue; - dispatch->hsaQueueHi = (uint)(hsaQueue >> 32); - } - else { - dispatch->user2 = Pm4Nop | (2 << 16); - } - - // kernelArgEna = (flags & 0x8); - dispatch->user3 = (flags & 0x8) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16)); - dispatch->offsUser3 = UsrRegOffset + usrRegCnt; - usrRegCnt += (flags & 0x8) ? 2 : 0; - dispatch->argsLo = (uint)aqlPkt->kernel_arg_address; - dispatch->argsHi = (uint)(aqlPkt->kernel_arg_address >> 32); - - // flatScratchEna = (flags & 0x20); - if (flags & 0x20) { - dispatch->copyData = Pm4CopyReg; - dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16); - dispatch->offsUser4 = UsrRegOffset + usrRegCnt; - dispatch->scratchOffs = scratchOffset; - dispatch->privSize = privateSize; - } - else { - dispatch->copyData = Pm4Nop | (8 << 16); - } - - dispatch->glbSizeX = aqlPkt->grid_size[0]; - dispatch->glbSizeY = aqlPkt->grid_size[1]; - dispatch->glbSizeZ = aqlPkt->grid_size[2]; -} - -static inline bool -checkWaitEvents(__global AmdEvent** events, uint numEvents) -{ - for (uint i = 0; i < numEvents; ++i) { - if (atomic_load_explicit( - (__global atomic_uint*)(&events[i]->state), - memory_order_acquire, memory_scope_device) != CL_COMPLETE) { - return false; - } - } - return true; -} - - -// release slot in a bitmask controlled resource i is the slot number -static inline void -release_slot(__global uint* restrict mask, uint i) -{ - /* uint b = ~(1UL << (i & 0x1f)); */ - uint b = ~amd_bfm(1U, i); - __global atomic_uint *p = (__global atomic_uint *)(mask + (i >> 5)); - uint vv; - uint v = atomic_load_explicit(p, memory_order_acquire, memory_scope_device); - for (;;) { - vv = v & b; - if (atomic_compare_exchange_strong_explicit(p, &v, vv, - memory_order_acq_rel, memory_order_acquire, memory_scope_device)) { - break; - } - } -} - -static inline void -releaseEvent(__global AmdEvent* ev, __global uint* emask, __global AmdEvent* eb) -{ - uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U, - memory_order_acq_rel, memory_scope_device); - if (c == 1U) { - uint i = ev - eb; - release_slot(emask, i); - } -} - -static inline void -releaseWaitEvents(__global AmdEvent** events, uint numEvents, - __global uint* emask, __global AmdEvent* eb) -{ - for (uint i = 0; i < numEvents; ++i) { - releaseEvent(events[i], emask, eb); - } -} - -static inline uint -min_command(uint slot_num, __global AmdAqlWrap* wraps) -{ - uint minCommand = 0xffffffff; - for (uint idx = 0; idx < slot_num; ++idx) { - __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx]; - uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state), - memory_order_acquire, memory_scope_device); - if ((slotState != AQL_WRAP_FREE) && (slotState != AQL_WRAP_RESERVED)) { - minCommand = min(disp->command_id, minCommand); - } - } - return minCommand; -} - -extern ulong __hsail_get_clock(); // Declaration is required - +extern void __amdrt_scheduler(__global void *, __global void *, uint); +\n __kernel void scheduler( - __global AmdVQueueHeader* queue, - __global SchedulerParam* params, - uint paramIdx) + __global void * queue, + __global void * params, + uint paramIdx) { - __global SchedulerParam* param = ¶ms[paramIdx]; - volatile __global HwDispatch* hwDisp = (volatile __global HwDispatch*) - &((__global HwDispatchHeader*)param->hw_queue)[1]; - __global AmdAqlWrap* hostParent = (__global AmdAqlWrap*)(param->parentAQL); - __global uint* counter = (__global uint*)(&hostParent->child_counter); - __global uint* signal = (__global uint*)(¶m->signal); - __global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1]; - __global uint* amask = (__global uint *)queue->aql_slot_mask; - - //! @todo This is an unexplained behavior. - //! The scheduler can be launched one more time after termination. - if (1 == atomic_load_explicit((__global atomic_uint*)¶m->releaseHostCP, - memory_order_acquire, memory_scope_device)) { - return; - } - - uint launch = 0; - uint loop = 1; - - uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]), - memory_order_acquire, memory_scope_device); - - int baseIdx = get_group_id(0) * 32; - while (mask != 0) { - uint sIdx = ctz(mask); - uint idx = baseIdx + sIdx; - mask &= ~(1 << sIdx); - __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx]; - uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state), - memory_order_acquire, memory_scope_device); - __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap); - __global AmdEvent* event = (__global AmdEvent*)(disp->completion); - - // Check if the current slot is ready for processing - if (slotState == AQL_WRAP_READY) { - if (launch == 0) { - // Attempt to find a new disaptch if nothing was launched yet - uint parentState = atomic_load_explicit( - (__global atomic_uint*)(&parent->state), - memory_order_acquire, memory_scope_device); - uint enqueueFlags = atomic_load_explicit( - (__global atomic_uint*)(&disp->enqueue_flags), - memory_order_acquire, memory_scope_device); - - // Check the launch flags - if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) || - (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) && - (parentState != AQL_WRAP_DONE)) { - continue; - } - - // Check if the wait list is COMPLETE - launch = checkWaitEvents( - (__global AmdEvent**)(disp->wait_list), disp->wait_num); - - if (launch) { - if (event != 0) { - event->timer[PROFILING_COMMAND_START] = - (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; - } - // Launch child kernel .... - dispatch(&hwDisp[get_group_id(0)], &disp->aql, param->scratch, param->hsa_queue, - param->scratchSize, param->scratchOffset, param->numMaxWaves); - disp->state = AQL_WRAP_BUSY; - releaseWaitEvents((__global AmdEvent**)(disp->wait_list), - disp->wait_num, (__global uint*)queue->event_slot_mask, - (__global AmdEvent*)queue->event_slots); - break; - } - } - } - else if (slotState == AQL_WRAP_MARKER) { - bool complete = false; - if (disp->wait_num == 0) { - uint minCommand = min_command(queue->aql_slot_num, wraps); - if (disp->command_id == minCommand) { - complete = true; - } - } - else { - // Check if the wait list is COMPLETE - if (checkWaitEvents( - (__global AmdEvent**)(disp->wait_list), disp->wait_num)) { - complete = true; - releaseWaitEvents((__global AmdEvent**)(disp->wait_list), - disp->wait_num, (__global uint*)queue->event_slot_mask, - (__global AmdEvent*)queue->event_slots); - } - } - if (complete) { - // Decrement the child execution counter on the parent - atomic_fetch_sub_explicit( - (__global atomic_uint*)&parent->child_counter, - 1, memory_order_acq_rel, memory_scope_device); - event->state = CL_COMPLETE; - disp->state = AQL_WRAP_FREE; - release_slot(amask, idx); - releaseEvent(event, (__global uint*)queue->event_slot_mask, - (__global AmdEvent*)queue->event_slots); - } - } - else if ((slotState == AQL_WRAP_BUSY) || - (slotState == AQL_WRAP_DONE)) { - if (slotState == AQL_WRAP_BUSY) { - disp->state = AQL_WRAP_DONE; - if (event != 0) { - event->timer[PROFILING_COMMAND_END] = - (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; - } - } - // Was CL_EVENT requested? - if (event != 0) { - // The current dispatch doesn't have any outstanding children - if (disp->child_counter == 0) { - event->timer[PROFILING_COMMAND_COMPLETE] = - (__hsail_get_clock() * (ulong)param->eng_clk) >> 10; - event->state = CL_COMPLETE; - if (event->capture_info != 0) { - __global ulong* values = (__global ulong*)event->capture_info; - values[0] = event->timer[PROFILING_COMMAND_END] - - event->timer[PROFILING_COMMAND_START]; - values[1] = event->timer[PROFILING_COMMAND_COMPLETE] - - event->timer[PROFILING_COMMAND_START]; - } - releaseEvent(event, (__global uint *)queue->event_slot_mask, - (__global AmdEvent *)queue->event_slots); - } - } - // The current dispatch doesn't have any outstanding children - if (disp->child_counter == 0) { - // Decrement the child execution counter on the parent - atomic_fetch_sub_explicit( - (__global atomic_uint*)&parent->child_counter, - 1, memory_order_acq_rel, memory_scope_device); - disp->state = AQL_WRAP_FREE; - release_slot(amask, idx); - } - } - } - - if (launch == 0) { - hwDisp[get_group_id(0)].glbSizeX = 0; - hwDisp[get_group_id(0)].glbSizeY = 0; - hwDisp[get_group_id(0)].glbSizeZ = 0; - } - - if (param->dedicatedQueue) { - loop = atomic_load_explicit((__global atomic_uint*)signal, - memory_order_acquire, memory_scope_device); - } - else { - loop = atomic_load_explicit((__global atomic_uint*)counter, - memory_order_acquire, memory_scope_device); - } - - if (loop == 0) { - //! \todo Write deadcode to the template, but somehow - //! the scheduler will be launched one more time. - hwDisp->packet0 = 0xdeadc0de; - atomic_store_explicit((__global atomic_uint*)signal, - 0, memory_order_release, memory_scope_device); - atomic_store_explicit((__global atomic_uint*)¶m->releaseHostCP, - 1, memory_order_release, memory_scope_device); - } + __amd_scheduler(queue, params, paramIdx); } \n -\n ); } // namespace gpu