From 3fb99880472cf8dc4d2b4cfdeca0d377f6fb236d Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 2 Oct 2014 12:55:08 -0400
Subject: [PATCH] P4 to Git Change 1083488 by bsumner@bsumner-lin-opencl on
2014/10/02 09:01:39
ECR #304775 - back out 1083437
Affected files ...
... //depot/stg/opencl/drivers/opencl/library/common.hsa/src/blitKernels.cl#2 delete
... //depot/stg/opencl/drivers/opencl/library/common/src/blitKernels.cl#2 delete
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/devenq.h#5 edit
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/schedule.cl#2 delete
... //depot/stg/opencl/drivers/opencl/library/x86/gen/build/Makefile.gen#18 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/blitcl.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#31 edit
[ROCm/clr commit: b737b1b1aba66c90821ea21df872db9220d1ceb9]
---
projects/clr/rocclr/runtime/device/blitcl.cpp | 514 +++++++++++++-----
.../rocclr/runtime/device/gpu/gpuschedcl.cpp | 514 +++++++++++++++++-
2 files changed, 900 insertions(+), 128 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/blitcl.cpp b/projects/clr/rocclr/runtime/device/blitcl.cpp
index 2462e0cf4e..4a79a0e597 100644
--- a/projects/clr/rocclr/runtime/device/blitcl.cpp
+++ b/projects/clr/rocclr/runtime/device/blitcl.cpp
@@ -7,160 +7,430 @@ namespace device {
#define BLIT_KERNELS(...) #__VA_ARGS__
const char* BlitSourceCode = BLIT_KERNELS(
-
-extern void __amd_copyBufferToImage(
- __global uint*, __write_only image2d_array_t, ulong4,
- int4, int4, uint4, ulong4);
-
-extern void __amd_copyImageToBuffer(
- __read_only image2d_array_t, __global uint*, __global ushort*,
- __global uchar*, int4, ulong4, int4, uint4, ulong4);
-
-extern void __amd_copyImage(
- __read_only image2d_array_t, __write_only image2d_array_t,
- int4, int4, int4);
-
-extern void __amd_copyImage1DA(
- __read_only image2d_array_t, __write_only image2d_array_t,
- int4, int4, int4);
-
-extern void __amd_copyBufferRect(
- __global uchar*, __global uchar*,
- ulong4, ulong4, ulong4);
-
-extern void __amd_copyBufferRectAligned(
- __global uint*, __global uint*,
- ulong4, ulong4, ulong4);
-
-extern void __amd_copyBuffer(
- __global uchar*, __global uchar*,
- ulong, ulong, ulong);
-
-extern void __amd_copyBufferAligned(
- __global uint*, __global uint*,
- ulong, ulong, ulong, uint);
-
-extern void __amd_fillBuffer(
- __global uchar*, __global uint*, __constant uchar*,
- uint, ulong, ulong);
-
-extern void __amd_fillImage(
- __write_only image2d_array_t,
- float4, int4, uint4, int4, int4, uint);
-
-
+\n
__kernel void copyBufferToImage(
- __global uint* src,
- __write_only image2d_array_t dst,
- ulong4 srcOrigin,
- int4 dstOrigin,
- int4 size,
- uint4 format,
- ulong4 pitch)
+ __global uint* src,
+ __write_only image2d_array_t dst,
+ ulong4 srcOrigin,
+ int4 dstOrigin,
+ int4 size,
+ uint4 format,
+ ulong4 pitch)
{
- __amd_copyBufferToImage(src, dst, srcOrigin, dstOrigin, size, format, pitch);
-}
+ ulong idxSrc;
+ int4 coordsDst;
+ uint4 pixel;
+ __global uint* srcUInt = src;
+ __global ushort* srcUShort = (__global ushort*)src;
+ __global uchar* srcUChar = (__global uchar*)src;
+ ushort tmpUShort;
+ uint tmpUInt;
+ coordsDst.x = get_global_id(0);
+ coordsDst.y = get_global_id(1);
+ coordsDst.z = get_global_id(2);
+ coordsDst.w = 0;
+
+ if ((coordsDst.x >= size.x) ||
+ (coordsDst.y >= size.y) ||
+ (coordsDst.z >= size.z)) {
+ return;
+ }
+
+ idxSrc = (coordsDst.z * pitch.y +
+ coordsDst.y * pitch.x + coordsDst.x) *
+ format.z + srcOrigin.x;
+
+ coordsDst.x += dstOrigin.x;
+ coordsDst.y += dstOrigin.y;
+ coordsDst.z += dstOrigin.z;
+
+ // Check components
+ switch (format.x) {
+ case 1:
+ // Check size
+ if (format.y == 1) {
+ pixel.x = (uint)srcUChar[idxSrc];
+ }
+ else if (format.y == 2) {
+ pixel.x = (uint)srcUShort[idxSrc];
+ }
+ else {
+ pixel.x = srcUInt[idxSrc];
+ }
+ break;
+ case 2:
+ // Check size
+ if (format.y == 1) {
+ tmpUShort = srcUShort[idxSrc];
+ pixel.x = (uint)(tmpUShort & 0xff);
+ pixel.y = (uint)(tmpUShort >> 8);
+ }
+ else if (format.y == 2) {
+ tmpUInt = srcUInt[idxSrc];
+ pixel.x = (tmpUInt & 0xffff);
+ pixel.y = (tmpUInt >> 16);
+ }
+ else {
+ pixel.x = srcUInt[idxSrc++];
+ pixel.y = srcUInt[idxSrc];
+ }
+ break;
+ case 4:
+ // Check size
+ if (format.y == 1) {
+ tmpUInt = srcUInt[idxSrc];
+ pixel.x = tmpUInt & 0xff;
+ pixel.y = (tmpUInt >> 8) & 0xff;
+ pixel.z = (tmpUInt >> 16) & 0xff;
+ pixel.w = (tmpUInt >> 24) & 0xff;
+ }
+ else if (format.y == 2) {
+ tmpUInt = srcUInt[idxSrc++];
+ pixel.x = tmpUInt & 0xffff;
+ pixel.y = (tmpUInt >> 16);
+ tmpUInt = srcUInt[idxSrc];
+ pixel.z = tmpUInt & 0xffff;
+ pixel.w = (tmpUInt >> 16);
+ }
+ else {
+ pixel.x = srcUInt[idxSrc++];
+ pixel.y = srcUInt[idxSrc++];
+ pixel.z = srcUInt[idxSrc++];
+ pixel.w = srcUInt[idxSrc];
+ }
+ break;
+ }
+ // Write the final pixel
+ write_imageui(dst, coordsDst, pixel);
+}
+\n
__kernel void copyImageToBuffer(
- __read_only image2d_array_t src,
- __global uint* dstUInt,
- __global ushort* dstUShort,
- __global uchar* dstUChar,
- int4 srcOrigin,
- ulong4 dstOrigin,
- int4 size,
- uint4 format,
- ulong4 pitch)
+ __read_only image2d_array_t src,
+ __global uint* dstUInt,
+ __global ushort* dstUShort,
+ __global uchar* dstUChar,
+ int4 srcOrigin,
+ ulong4 dstOrigin,
+ int4 size,
+ uint4 format,
+ ulong4 pitch)
{
- __amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar,
- srcOrigin, dstOrigin, size, format, pitch);
-}
+ ulong idxDst;
+ int4 coordsSrc;
+ uint4 texel;
+ coordsSrc.x = get_global_id(0);
+ coordsSrc.y = get_global_id(1);
+ coordsSrc.z = get_global_id(2);
+ coordsSrc.w = 0;
+
+ if ((coordsSrc.x >= size.x) ||
+ (coordsSrc.y >= size.y) ||
+ (coordsSrc.z >= size.z)) {
+ return;
+ }
+
+ idxDst = (coordsSrc.z * pitch.y + coordsSrc.y * pitch.x +
+ coordsSrc.x) * format.z + dstOrigin.x;
+
+ coordsSrc.x += srcOrigin.x;
+ coordsSrc.y += srcOrigin.y;
+ coordsSrc.z += srcOrigin.z;
+
+ texel = read_imageui(src, coordsSrc);
+
+ // Check components
+ switch (format.x) {
+ case 1:
+ // Check size
+ switch (format.y) {
+ case 1:
+ dstUChar[idxDst] = (uchar)texel.x;
+ break;
+ case 2:
+ dstUShort[idxDst] = (ushort)texel.x;
+ break;
+ case 4:
+ dstUInt[idxDst] = texel.x;
+ break;
+ }
+ break;
+ case 2:
+ // Check size
+ switch (format.y) {
+ case 1:
+ dstUShort[idxDst] = (ushort)texel.x |
+ ((ushort)texel.y << 8);
+ break;
+ case 2:
+ dstUInt[idxDst] = texel.x | (texel.y << 16);
+ break;
+ case 4:
+ dstUInt[idxDst++] = texel.x;
+ dstUInt[idxDst] = texel.y;
+ break;
+ }
+ break;
+ case 4:
+ // Check size
+ switch (format.y) {
+ case 1:
+ dstUInt[idxDst] = (uint)texel.x |
+ (texel.y << 8) |
+ (texel.z << 16) |
+ (texel.w << 24);
+ break;
+ case 2:
+ dstUInt[idxDst++] = texel.x | (texel.y << 16);
+ dstUInt[idxDst] = texel.z | (texel.w << 16);
+ break;
+ case 4:
+ dstUInt[idxDst++] = texel.x;
+ dstUInt[idxDst++] = texel.y;
+ dstUInt[idxDst++] = texel.z;
+ dstUInt[idxDst] = texel.w;
+ break;
+ }
+ break;
+ }
+}
+\n
__kernel void copyImage(
__read_only image2d_array_t src,
__write_only image2d_array_t dst,
- int4 srcOrigin,
- int4 dstOrigin,
- int4 size)
+ int4 srcOrigin,
+ int4 dstOrigin,
+ int4 size)
{
- __amd_copyImage(src, dst, srcOrigin, dstOrigin, size);
-}
+ int4 coordsDst;
+ int4 coordsSrc;
+ coordsDst.x = get_global_id(0);
+ coordsDst.y = get_global_id(1);
+ coordsDst.z = get_global_id(2);
+ coordsDst.w = 0;
+
+ if ((coordsDst.x >= size.x) ||
+ (coordsDst.y >= size.y) ||
+ (coordsDst.z >= size.z)) {
+ return;
+ }
+
+ coordsSrc = srcOrigin + coordsDst;
+ coordsDst += dstOrigin;
+
+ uint4 texel;
+ texel = read_imageui(src, coordsSrc);
+ write_imageui(dst, coordsDst, texel);
+}
+\n
__kernel void copyImage1DA(
- __read_only image2d_array_t src,
- __write_only image2d_array_t dst,
- int4 srcOrigin,
- int4 dstOrigin,
- int4 size)
+ __read_only image2d_array_t src,
+ __write_only image2d_array_t dst,
+ int4 srcOrigin,
+ int4 dstOrigin,
+ int4 size)
{
- __amd_copyImage1DA(src, dst, srcOrigin, dstOrigin, size);
-}
+ int4 coordsDst;
+ int4 coordsSrc;
+ coordsDst.x = get_global_id(0);
+ coordsDst.y = get_global_id(1);
+ coordsDst.z = get_global_id(2);
+ coordsDst.w = 0;
+
+ if ((coordsDst.x >= size.x) ||
+ (coordsDst.y >= size.y) ||
+ (coordsDst.z >= size.z)) {
+ return;
+ }
+
+ coordsSrc = srcOrigin + coordsDst;
+ coordsDst += dstOrigin;
+ if (srcOrigin.w != 0) {
+ coordsSrc.z = coordsSrc.y;
+ coordsSrc.y = 0;
+ }
+ if (dstOrigin.w != 0) {
+ coordsDst.z = coordsDst.y;
+ coordsDst.y = 0;
+ }
+
+ uint4 texel;
+ texel = read_imageui(src, coordsSrc);
+ write_imageui(dst, coordsDst, texel);
+}
+\n
__kernel void copyBufferRect(
- __global uchar* src,
- __global uchar* dst,
- ulong4 srcRect,
- ulong4 dstRect,
- ulong4 size)
+ __global uchar* src,
+ __global uchar* dst,
+ ulong4 srcRect,
+ ulong4 dstRect,
+ ulong4 size)
{
- __amd_copyBufferRect(src, dst, srcRect, dstRect, size);
-}
+ ulong x = get_global_id(0);
+ ulong y = get_global_id(1);
+ ulong z = get_global_id(2);
+ if ((x >= size.x) ||
+ (y >= size.y) ||
+ (z >= size.z)) {
+ return;
+ }
+
+ ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
+ ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;
+
+ dst[offsDst] = src[offsSrc];
+}
+\n
__kernel void copyBufferRectAligned(
- __global uint* src,
- __global uint* dst,
- ulong4 srcRect,
- ulong4 dstRect,
- ulong4 size)
+ __global uint* src,
+ __global uint* dst,
+ ulong4 srcRect,
+ ulong4 dstRect,
+ ulong4 size)
{
- __amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size);
-}
+ ulong x = get_global_id(0);
+ ulong y = get_global_id(1);
+ ulong z = get_global_id(2);
+ if ((x >= size.x) ||
+ (y >= size.y) ||
+ (z >= size.z)) {
+ return;
+ }
+
+ ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
+ ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;
+
+ if (size.w == 16) {
+ __global uint4* src4 = (__global uint4*)src;
+ __global uint4* dst4 = (__global uint4*)dst;
+ dst4[offsDst] = src4[offsSrc];
+ }
+ else {
+ dst[offsDst] = src[offsSrc];
+ }
+}
+\n
__kernel void copyBuffer(
- __global uchar* src,
- __global uchar* dst,
- ulong srcOrigin,
- ulong dstOrigin,
- ulong size)
+ __global uchar* src,
+ __global uchar* dst,
+ ulong srcOrigin,
+ ulong dstOrigin,
+ ulong size)
{
- __amd_copyBuffer(src, dst, srcOrigin, dstOrigin, size);
-}
+ ulong id = get_global_id(0);
+ if (id >= size) {
+ return;
+ }
+
+ ulong offsSrc = id + srcOrigin;
+ ulong offsDst = id + dstOrigin;
+
+ dst[offsDst] = src[offsSrc];
+}
+\n
__kernel void copyBufferAligned(
- __global uint* src,
- __global uint* dst,
- ulong srcOrigin,
- ulong dstOrigin,
- ulong size,
- uint alignment)
+ __global uint* src,
+ __global uint* dst,
+ ulong srcOrigin,
+ ulong dstOrigin,
+ ulong size,
+ uint alignment)
{
- __amd_copyBufferAligned(src, dst, srcOrigin, dstOrigin, size, alignment);
-}
+ ulong id = get_global_id(0);
+ if (id >= size) {
+ return;
+ }
+
+ ulong offsSrc = id + srcOrigin;
+ ulong offsDst = id + dstOrigin;
+
+ if (alignment == 16) {
+ __global uint4* src4 = (__global uint4*)src;
+ __global uint4* dst4 = (__global uint4*)dst;
+ dst4[offsDst] = src4[offsSrc];
+ }
+ else {
+ dst[offsDst] = src[offsSrc];
+ }
+}
+\n
__kernel void fillBuffer(
- __global uchar* bufUChar,
- __global uint* bufUInt,
- __constant uchar* pattern,
- uint patternSize,
- ulong offset,
- ulong size)
+ __global uchar* bufUChar,
+ __global uint* bufUInt,
+ __constant uchar* pattern,
+ uint patternSize,
+ ulong offset,
+ ulong size)
{
- __amd_fillBuffer(bufUChar, bufUInt, pattern, patternSize, offset, size);
-}
+ ulong id = get_global_id(0);
+ if (id >= size) {
+ return;
+ }
+
+ if (bufUInt) {
+ __global uint* element = &bufUInt[offset + id * patternSize];
+ __constant uint* pt = (__constant uint*)pattern;
+
+ for (uint i = 0; i < patternSize; ++i) {
+ element[i] = pt[i];
+ }
+ }
+ else {
+ __global uchar* element = &bufUChar[offset + id * patternSize];
+
+ for (uint i = 0; i < patternSize; ++i) {
+ element[i] = pattern[i];
+ }
+ }
+}
+\n
__kernel void fillImage(
- __write_only image2d_array_t image,
- float4 patternFLOAT4,
- int4 patternINT4,
- uint4 patternUINT4,
- int4 origin,
- int4 size,
- uint type)
+ __write_only image2d_array_t image,
+ float4 patternFLOAT4,
+ int4 patternINT4,
+ uint4 patternUINT4,
+ int4 origin,
+ int4 size,
+ uint type)
{
- __amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4,
- origin, size, type);
-}
+ int4 coords;
+ coords.x = get_global_id(0);
+ coords.y = get_global_id(1);
+ coords.z = get_global_id(2);
+ coords.w = 0;
+
+ if ((coords.x >= size.x) ||
+ (coords.y >= size.y) ||
+ (coords.z >= size.z)) {
+ return;
+ }
+
+ coords += origin;
+
+ // Check components
+ switch (type) {
+ case 0:
+ write_imagef(image, coords, patternFLOAT4);
+ break;
+ case 1:
+ write_imagei(image, coords, patternINT4);
+ break;
+ case 2:
+ write_imageui(image, coords, patternUINT4);
+ break;
+ }
+}
+\n
+\n
);
} // namespace device
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
index 3a05b1a2d3..e0c1a73be6 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -8,17 +8,519 @@ namespace gpu {
const char* SchedulerSourceCode = SCHEDULER_KERNEL(
\n
-extern void __amdrt_scheduler(__global void *, __global void *, uint);
-\n
+//! AmdAqlWrap slot state
+enum AqlWrapState {
+ AQL_WRAP_FREE = 0,
+ AQL_WRAP_RESERVED,
+ AQL_WRAP_READY,
+ AQL_WRAP_MARKER,
+ AQL_WRAP_BUSY,
+ AQL_WRAP_DONE
+};
+
+//! Profiling states
+enum ProfilingState {
+ PROFILING_COMMAND_START = 0,
+ PROFILING_COMMAND_END,
+ PROFILING_COMMAND_COMPLETE
+};
+
+typedef struct _HsaAqlDispatchPacket {
+ uint mix;
+ ushort workgroup_size[3];
+ ushort reserved2;
+ uint grid_size[3];
+ uint private_segment_size_bytes;
+ uint group_segment_size_bytes;
+ ulong kernel_object_address;
+ ulong kernel_arg_address;
+ ulong reserved3;
+ ulong completion_signal;
+} HsaAqlDispatchPacket;
+
+typedef struct _AmdVQueueHeader {
+ uint aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64).
+ uint event_slot_num; //!< [LRO] The number of kernel events in the events buffer
+ ulong event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events
+ ulong event_slots; //!< [LRO] Pointer to a buffer for the events.
+ // Array of event_slot_num entries of AmdEvent
+ ulong aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots
+ uint command_counter; //!< [LRW] The global counter for the submitted commands into the queue
+ uint wait_size; //!< [LRO] The wait list size (in clk_event_t)
+ uint arg_size; //!< [LRO] The size of argument buffer (in bytes)
+ uint reserved0; //!< For the future usage
+ ulong kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry)
+ uint reserved[2]; //!< For the future usage
+} AmdVQueueHeader;
+
+typedef struct _AmdAqlWrap {
+ uint state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY,
+ // MARKER, BUSY and DONE. The block could be returned back to a free state.
+ uint enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start
+ // (kernel_enqueue_flags_t)
+ // CLK_ENQUEUE_FLAGS_NO_WAIT we just start processing
+ // CLK_ENQUEUE_FLAGS_WAIT_KERNEL check if parent_wrap->state is done and then start processing
+ // CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP - currently == WAIT_KERNEL
+ uint command_id; //!< [LWO/SRO] The unique command ID
+ uint child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels.
+ // Its incremented on the
+ // start and decremented on the finish. The parent kernel can be considered as
+ // done when the value is 0 and the state is DONE
+ ulong completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t)
+ ulong parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
+ ulong wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
+ uint wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
+ uint reserved[5]; //!< For the future usage
+ HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet 64 bytes AQL packet
+} AmdAqlWrap;
+
+typedef struct _AmdEvent {
+ uint state; //!< [LRO/SRW] Event state: START, END, COMPLETE
+ uint counter; //!< [LRW] Event retain/release counter. 0 means the event is free
+ ulong timer[3]; //!< [LRO/SWO] Timer values for profiling for each state
+ ulong capture_info; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME
+} AmdEvent;
+
+typedef struct _SchedulerParam {
+ uint signal; //!< Signal to stop the child queue
+ uint eng_clk; //!< Engine clock in Mhz
+ ulong hw_queue; //!< Address to HW queue
+ ulong hsa_queue; //!< Address to HSA dummy queue
+ uint reserved; //!< reserved
+ uint scratchSize; //!< Scratch buffer size
+ ulong scratch; //!< GPU address to the scratch buffer
+ uint numMaxWaves; //!< Num max waves on the asic
+ uint releaseHostCP; //!< Releases CP on the host queue
+ ulong parentAQL; //!< Host parent AmdAqlWrap packet
+ uint dedicatedQueue; //!< Scheduler uses a dedicated queue
+ uint scratchOffset; //!< Scratch buffer offset
+} SchedulerParam;
+
+typedef struct _HwDispatchHeader {
+ uint pad00; // CP WRITE_DATA write to rewind for memory
+ uint pad01;
+ uint pad02;
+ uint pad03;
+ uint rewind; // REWIND execution
+ uint startExe; // valid bit
+ uint condExe0; // 0xC0032200 -- TYPE 3, COND_EXEC
+ uint condExe1; // 0x00000204 ----
+ uint condExe2; // 0x00000000 ----
+ uint condExe3; // 0x00000000 ----
+ uint condExe4; // 0x00000000 ----
+} HwDispatchHeader;
+
+typedef struct _HwDispatch {
+ uint packet0; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values)
+ uint offset0; // 0x00000204 ---- OFFSET
+ uint startX; // 0x00000000 ---- COMPUTE_START_X: START = 0x0
+ uint startY; // 0x00000000 ---- COMPUTE_START_Y: START = 0x0
+ uint startZ; // 0x00000000 ---- COMPUTE_START_Z: START = 0x0
+ uint wrkGrpSizeX; // 0x00000000 ---- COMPUTE_NUM_THREAD_X: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
+ uint wrkGrpSizeY; // 0x00000000 ---- COMPUTE_NUM_THREAD_Y: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
+ uint wrkGrpSizeZ; // 0x00000000 ---- COMPUTE_NUM_THREAD_Z: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
+ uint packet1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+ uint offset1; // 0x0000020C ---- OFFSET
+ uint isaLo; // 0x00000000 ---- COMPUTE_PGM_LO: DATA = 0x0
+ uint isaHi; // 0x00000000 ---- COMPUTE_PGM_HI: DATA = 0x0, INST_ATC__CI__VI = 0x0
+ uint packet2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+ uint offset2; // 0x00000212 ---- OFFSET
+ uint resource1; // 0x00000000 ---- COMPUTE_PGM_RSRC1: VGPRS = 0x0, SGPRS = 0x0, PRIORITY = 0x0, FLOAT_MODE = 0x0, PRIV = 0x0, DX10_CLAMP = 0x0, DEBUG_MODE = 0x0, IEEE_MODE = 0x0, BULKY__CI__VI = 0x0, CDBG_USER__CI__VI = 0x0
+ uint resource2; // 0x00000000 ---- COMPUTE_PGM_RSRC2: SCRATCH_EN = 0x0, USER_SGPR = 0x0, TRAP_PRESENT = 0x0, TGID_X_EN = 0x0, TGID_Y_EN = 0x0, TGID_Z_EN = 0x0, TG_SIZE_EN = 0x0, TIDIG_COMP_CNT = 0x0, EXCP_EN_MSB__CI__VI = 0x0, LDS_SIZE = 0x0, EXCP_EN = 0x0
+ uint packet3; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values)
+ uint offset3; // 0x00000215 ---- OFFSET
+ uint pad31; // 0x00000000 ---- COMPUTE_RESOURCE_LIMITS: WAVES_PER_SH = 0x0, TG_PER_CU = 0x0, LOCK_THRESHOLD = 0x0, SIMD_DEST_CNTL = 0x0, FORCE_SIMD_DIST__CI__VI = 0x0, CU_GROUP_COUNT__CI__VI = 0x0
+ uint pad32; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE0: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF
+ uint pad33; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE1: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF
+ uint ringSize; // 0x00000000 ---- COMPUTE_TMPRING_SIZE: WAVES = 0x0, WAVESIZE = 0x0
+ uint pad34; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE2__CI__VI: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF
+ uint pad35; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE3__CI__VI: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF
+ uint user0; // 0xC0047602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (4 values)
+ uint offsUser0; // 0x00000240 ---- OFFSET
+ uint scratchLo; // 0x00000000 ---- COMPUTE_USER_DATA_0: DATA = 0x0
+ uint scratchHi; // 0x80000000 ---- COMPUTE_USER_DATA_1: DATA = 0x80000000
+ uint scratchSize; // 0x00000000 ---- COMPUTE_USER_DATA_2: DATA = 0x0
+ uint padUser; // 0x00EA7FAC ---- COMPUTE_USER_DATA_3: DATA = 0xEA7FAC
+ uint user1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+ uint offsUser1; // 0x00000244 ---- OFFSET
+ uint aqlPtrLo; // 0x00000000 ---- COMPUTE_USER_DATA_4: DATA = 0x0
+ uint aqlPtrHi; // 0x00000000 ---- COMPUTE_USER_DATA_5: DATA = 0x0
+ uint user2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+ uint offsUser2; // 0x00000246 ---- OFFSET
+ uint hsaQueueLo; // 0x00000000 ---- COMPUTE_USER_DATA_6: DATA = 0x0
+ uint hsaQueueHi; // 0x00000000 ---- COMPUTE_USER_DATA_7: DATA = 0x0
+ uint user3; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+ uint offsUser3; // 0x00000246 ---- OFFSET
+ uint argsLo; // 0x00000000 ---- COMPUTE_USER_DATA_8: DATA = 0x0
+ uint argsHi; // 0x00000000 ---- COMPUTE_USER_DATA_9: DATA = 0x0
+ uint copyData; // 0xC0044000 -- TYPE 3, COPY_DATA
+ uint copyDataFlags; // 0x00000405 ---- srcSel 0x5, destSel 0x4, countSel 0x0, wrConfirm 0x0, engineSel 0x0
+ uint scratchAddrLo; // 0x000201C4 ---- srcAddressLo
+ uint scratchAddrHi; // 0x00000000 ---- srcAddressHi
+ uint shPrivateLo; // 0x00002580 ---- dstAddressLo
+ uint shPrivateHi; // 0x00000000 ---- dstAddressHi
+ uint user4; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+ uint offsUser4; // 0x00000248 ---- OFFSET
+ uint scratchOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0
+ uint privSize; // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30
+ uint packet4; // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE
+ uint glbSizeX; // 0x00000000
+ uint glbSizeY; // 0x00000000
+ uint glbSizeZ; // 0x00000000
+ uint padd41; // 0x00000021
+} HwDispatch;
+
+const uint ResumeExecution = 0x80000000; // 0x81000000
+const uint StallExecution = 0x00000000; // 0x01000000
+const uint WavefrontSize = 64;
+const uint MaxWaveSize = 0x400;
+
+static inline void
+dispatch(
+ volatile __global HwDispatch* dispatch,
+ __global HsaAqlDispatchPacket* aqlPkt,
+ ulong scratch,
+ ulong hsaQueue,
+ uint scratchSize,
+ uint scratchOffset,
+ uint numMaxWaves)
+{
+ const uint UsrRegOffset = 0x240;
+ const uint Pm4Nop = 0xC0001002;
+ const uint Pm4UserRegs = 0xC0007602;
+ const uint Pm4CopyReg = 0xC0044000;
+
+ uint usrRegCnt = 0;
+
+ dispatch->wrkGrpSizeX = aqlPkt->workgroup_size[0];
+ dispatch->wrkGrpSizeY = aqlPkt->workgroup_size[1];
+ dispatch->wrkGrpSizeZ = aqlPkt->workgroup_size[2];
+ // ISA address
+ __global uchar* kernelObj = (__global uchar*)aqlPkt->kernel_object_address;
+ ulong isa = aqlPkt->kernel_object_address + *((__global uint*)(kernelObj + 0x10));
+ dispatch->isaLo = (uint)(isa >> 8);
+ dispatch->isaHi = (uint)(isa >> 40);
+
+ // Program PGM resource registers
+ dispatch->resource1 = *((__global uint*)(kernelObj + 0x30));
+ dispatch->resource2 = *((__global uint*)(kernelObj + 0x34));
+ uint flags = *((__global uint*)(kernelObj + 0x38));
+ uint privateSize = *((__global uint*)(kernelObj + 0x50));
+
+ uint ldsSize = aqlPkt->group_segment_size_bytes +
+ *((__global uint*)(kernelObj + 0x54));
+ // Align up the LDS blocks 128 * 4(in DWORDs)
+ uint ldsBlocks = (ldsSize + 511) >> 9;
+ dispatch->resource2 |= (ldsBlocks << 15);
+
+ // Workaround for compiler bug
+ dispatch->scratchLo = (flags & 1);
+ // privSegEna = (flags & 1);
+ if (flags & 0x1) {
+ uint waveSize = privateSize * WavefrontSize;
+ // 256 DWRODs is the minimum for SQ
+ waveSize = max(MaxWaveSize, waveSize);
+ uint numWaves = scratchSize / waveSize;
+ numWaves = min(numWaves, numMaxWaves);
+ dispatch->ringSize = numWaves;
+ dispatch->ringSize |= (waveSize >> 10) << 12;
+ dispatch->user0 = Pm4UserRegs | (4 << 16);
+ dispatch->scratchLo = (uint)scratch;
+ dispatch->scratchHi = ((uint)(scratch >> 32)) | 0x80000000; // Enables swizzle
+ dispatch->scratchSize = scratchSize;
+ usrRegCnt += 4;
+ }
+ else {
+ dispatch->ringSize = 0;
+ dispatch->user0 = Pm4Nop | (4 << 16);
+ }
+
+ // dispatchEna = (flags & 0x2);
+ dispatch->user1 = (flags & 0x2) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16));
+ dispatch->offsUser1 = UsrRegOffset + usrRegCnt;
+ usrRegCnt += (flags & 0x2) ? 2 : 0;
+ ulong gpuAqlPtr = (ulong)aqlPkt;
+ dispatch->aqlPtrLo = (uint)gpuAqlPtr;
+ dispatch->aqlPtrHi = (uint)(gpuAqlPtr >> 32);
+
+ // queuePtr = (flags & 0x4);
+ if (flags & 0x4) {
+ dispatch->user2 = Pm4UserRegs | (2 << 16);
+ dispatch->offsUser2 = UsrRegOffset + usrRegCnt;
+ usrRegCnt += 2;
+ dispatch->hsaQueueLo = (uint)hsaQueue;
+ dispatch->hsaQueueHi = (uint)(hsaQueue >> 32);
+ }
+ else {
+ dispatch->user2 = Pm4Nop | (2 << 16);
+ }
+
+ // kernelArgEna = (flags & 0x8);
+ dispatch->user3 = (flags & 0x8) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16));
+ dispatch->offsUser3 = UsrRegOffset + usrRegCnt;
+ usrRegCnt += (flags & 0x8) ? 2 : 0;
+ dispatch->argsLo = (uint)aqlPkt->kernel_arg_address;
+ dispatch->argsHi = (uint)(aqlPkt->kernel_arg_address >> 32);
+
+ // flatScratchEna = (flags & 0x20);
+ if (flags & 0x20) {
+ dispatch->copyData = Pm4CopyReg;
+ dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16);
+ dispatch->offsUser4 = UsrRegOffset + usrRegCnt;
+ dispatch->scratchOffs = scratchOffset;
+ dispatch->privSize = privateSize;
+ }
+ else {
+ dispatch->copyData = Pm4Nop | (8 << 16);
+ }
+
+ dispatch->glbSizeX = aqlPkt->grid_size[0];
+ dispatch->glbSizeY = aqlPkt->grid_size[1];
+ dispatch->glbSizeZ = aqlPkt->grid_size[2];
+}
+
+static inline bool
+checkWaitEvents(__global AmdEvent** events, uint numEvents)
+{
+ for (uint i = 0; i < numEvents; ++i) {
+ if (atomic_load_explicit(
+ (__global atomic_uint*)(&events[i]->state),
+ memory_order_acquire, memory_scope_device) != CL_COMPLETE) {
+ return false;
+ }
+ }
+ return true;
+}
+
+
+// release slot in a bitmask controlled resource i is the slot number
+static inline void
+release_slot(__global uint* restrict mask, uint i)
+{
+ /* uint b = ~(1UL << (i & 0x1f)); */
+ uint b = ~amd_bfm(1U, i);
+ __global atomic_uint *p = (__global atomic_uint *)(mask + (i >> 5));
+ uint vv;
+ uint v = atomic_load_explicit(p, memory_order_acquire, memory_scope_device);
+ for (;;) {
+ vv = v & b;
+ if (atomic_compare_exchange_strong_explicit(p, &v, vv,
+ memory_order_acq_rel, memory_order_acquire, memory_scope_device)) {
+ break;
+ }
+ }
+}
+
+static inline void
+releaseEvent(__global AmdEvent* ev, __global uint* emask, __global AmdEvent* eb)
+{
+ uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U,
+ memory_order_acq_rel, memory_scope_device);
+ if (c == 1U) {
+ uint i = ev - eb;
+ release_slot(emask, i);
+ }
+}
+
+static inline void
+releaseWaitEvents(__global AmdEvent** events, uint numEvents,
+ __global uint* emask, __global AmdEvent* eb)
+{
+ for (uint i = 0; i < numEvents; ++i) {
+ releaseEvent(events[i], emask, eb);
+ }
+}
+
+static inline uint
+min_command(uint slot_num, __global AmdAqlWrap* wraps)
+{
+ uint minCommand = 0xffffffff;
+ for (uint idx = 0; idx < slot_num; ++idx) {
+ __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
+ uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
+ memory_order_acquire, memory_scope_device);
+ if ((slotState != AQL_WRAP_FREE) && (slotState != AQL_WRAP_RESERVED)) {
+ minCommand = min(disp->command_id, minCommand);
+ }
+ }
+ return minCommand;
+}
+
+extern ulong __hsail_get_clock(); // Declaration is required
+
__kernel void
scheduler(
- __global void * queue,
- __global void * params,
- uint paramIdx)
+ __global AmdVQueueHeader* queue,
+ __global SchedulerParam* params,
+ uint paramIdx)
{
- __amd_scheduler(queue, params, paramIdx);
+ __global SchedulerParam* param = ¶ms[paramIdx];
+ volatile __global HwDispatch* hwDisp = (volatile __global HwDispatch*)
+ &((__global HwDispatchHeader*)param->hw_queue)[1];
+ __global AmdAqlWrap* hostParent = (__global AmdAqlWrap*)(param->parentAQL);
+ __global uint* counter = (__global uint*)(&hostParent->child_counter);
+ __global uint* signal = (__global uint*)(¶m->signal);
+ __global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1];
+ __global uint* amask = (__global uint *)queue->aql_slot_mask;
+
+ //! @todo This is an unexplained behavior.
+ //! The scheduler can be launched one more time after termination.
+ if (1 == atomic_load_explicit((__global atomic_uint*)¶m->releaseHostCP,
+ memory_order_acquire, memory_scope_device)) {
+ return;
+ }
+
+ uint launch = 0;
+ uint loop = 1;
+
+ uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
+ memory_order_acquire, memory_scope_device);
+
+ int baseIdx = get_group_id(0) * 32;
+ while (mask != 0) {
+ uint sIdx = ctz(mask);
+ uint idx = baseIdx + sIdx;
+ mask &= ~(1 << sIdx);
+ __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
+ uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
+ memory_order_acquire, memory_scope_device);
+ __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap);
+ __global AmdEvent* event = (__global AmdEvent*)(disp->completion);
+
+ // Check if the current slot is ready for processing
+ if (slotState == AQL_WRAP_READY) {
+ if (launch == 0) {
+ // Attempt to find a new disaptch if nothing was launched yet
+ uint parentState = atomic_load_explicit(
+ (__global atomic_uint*)(&parent->state),
+ memory_order_acquire, memory_scope_device);
+ uint enqueueFlags = atomic_load_explicit(
+ (__global atomic_uint*)(&disp->enqueue_flags),
+ memory_order_acquire, memory_scope_device);
+
+ // Check the launch flags
+ if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) ||
+ (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) &&
+ (parentState != AQL_WRAP_DONE)) {
+ continue;
+ }
+
+ // Check if the wait list is COMPLETE
+ launch = checkWaitEvents(
+ (__global AmdEvent**)(disp->wait_list), disp->wait_num);
+
+ if (launch) {
+ if (event != 0) {
+ event->timer[PROFILING_COMMAND_START] =
+ (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
+ }
+ // Launch child kernel ....
+ dispatch(&hwDisp[get_group_id(0)], &disp->aql, param->scratch, param->hsa_queue,
+ param->scratchSize, param->scratchOffset, param->numMaxWaves);
+ disp->state = AQL_WRAP_BUSY;
+ releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
+ disp->wait_num, (__global uint*)queue->event_slot_mask,
+ (__global AmdEvent*)queue->event_slots);
+ break;
+ }
+ }
+ }
+ else if (slotState == AQL_WRAP_MARKER) {
+ bool complete = false;
+ if (disp->wait_num == 0) {
+ uint minCommand = min_command(queue->aql_slot_num, wraps);
+ if (disp->command_id == minCommand) {
+ complete = true;
+ }
+ }
+ else {
+ // Check if the wait list is COMPLETE
+ if (checkWaitEvents(
+ (__global AmdEvent**)(disp->wait_list), disp->wait_num)) {
+ complete = true;
+ releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
+ disp->wait_num, (__global uint*)queue->event_slot_mask,
+ (__global AmdEvent*)queue->event_slots);
+ }
+ }
+ if (complete) {
+ // Decrement the child execution counter on the parent
+ atomic_fetch_sub_explicit(
+ (__global atomic_uint*)&parent->child_counter,
+ 1, memory_order_acq_rel, memory_scope_device);
+ event->state = CL_COMPLETE;
+ disp->state = AQL_WRAP_FREE;
+ release_slot(amask, idx);
+ releaseEvent(event, (__global uint*)queue->event_slot_mask,
+ (__global AmdEvent*)queue->event_slots);
+ }
+ }
+ else if ((slotState == AQL_WRAP_BUSY) ||
+ (slotState == AQL_WRAP_DONE)) {
+ if (slotState == AQL_WRAP_BUSY) {
+ disp->state = AQL_WRAP_DONE;
+ if (event != 0) {
+ event->timer[PROFILING_COMMAND_END] =
+ (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
+ }
+ }
+ // Was CL_EVENT requested?
+ if (event != 0) {
+ // The current dispatch doesn't have any outstanding children
+ if (disp->child_counter == 0) {
+ event->timer[PROFILING_COMMAND_COMPLETE] =
+ (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
+ event->state = CL_COMPLETE;
+ if (event->capture_info != 0) {
+ __global ulong* values = (__global ulong*)event->capture_info;
+ values[0] = event->timer[PROFILING_COMMAND_END] -
+ event->timer[PROFILING_COMMAND_START];
+ values[1] = event->timer[PROFILING_COMMAND_COMPLETE] -
+ event->timer[PROFILING_COMMAND_START];
+ }
+ releaseEvent(event, (__global uint *)queue->event_slot_mask,
+ (__global AmdEvent *)queue->event_slots);
+ }
+ }
+ // The current dispatch doesn't have any outstanding children
+ if (disp->child_counter == 0) {
+ // Decrement the child execution counter on the parent
+ atomic_fetch_sub_explicit(
+ (__global atomic_uint*)&parent->child_counter,
+ 1, memory_order_acq_rel, memory_scope_device);
+ disp->state = AQL_WRAP_FREE;
+ release_slot(amask, idx);
+ }
+ }
+ }
+
+ if (launch == 0) {
+ hwDisp[get_group_id(0)].glbSizeX = 0;
+ hwDisp[get_group_id(0)].glbSizeY = 0;
+ hwDisp[get_group_id(0)].glbSizeZ = 0;
+ }
+
+ if (param->dedicatedQueue) {
+ loop = atomic_load_explicit((__global atomic_uint*)signal,
+ memory_order_acquire, memory_scope_device);
+ }
+ else {
+ loop = atomic_load_explicit((__global atomic_uint*)counter,
+ memory_order_acquire, memory_scope_device);
+ }
+
+ if (loop == 0) {
+ //! \todo Write deadcode to the template, but somehow
+ //! the scheduler will be launched one more time.
+ hwDisp->packet0 = 0xdeadc0de;
+ atomic_store_explicit((__global atomic_uint*)signal,
+ 0, memory_order_release, memory_scope_device);
+ atomic_store_explicit((__global atomic_uint*)¶m->releaseHostCP,
+ 1, memory_order_release, memory_scope_device);
+ }
}
\n
+\n
);
} // namespace gpu