From 72fd13e44580c2d70f50d2dcf3ca453b1582ebdc Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 1 Oct 2014 20:12:43 -0400
Subject: [PATCH] P4 to Git Change 1083437 by bsumner@bsumner-lin-opencl on
2014/10/01 20:06:47
ECR #304775 - move runtime kernels mostly into library
Affected files ...
... //depot/stg/opencl/drivers/opencl/library/common.hsa/src/blitKernels.cl#1 add
... //depot/stg/opencl/drivers/opencl/library/common/src/blitKernels.cl#1 add
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/devenq.h#4 edit
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/schedule.cl#1 add
... //depot/stg/opencl/drivers/opencl/library/x86/gen/build/Makefile.gen#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/blitcl.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#30 edit
---
rocclr/runtime/device/blitcl.cpp | 514 ++++++-----------------
rocclr/runtime/device/gpu/gpuschedcl.cpp | 514 +----------------------
2 files changed, 128 insertions(+), 900 deletions(-)
diff --git a/rocclr/runtime/device/blitcl.cpp b/rocclr/runtime/device/blitcl.cpp
index 4a79a0e597..2462e0cf4e 100644
--- a/rocclr/runtime/device/blitcl.cpp
+++ b/rocclr/runtime/device/blitcl.cpp
@@ -7,430 +7,160 @@ namespace device {
#define BLIT_KERNELS(...) #__VA_ARGS__
const char* BlitSourceCode = BLIT_KERNELS(
-\n
+
+extern void __amd_copyBufferToImage(
+ __global uint*, __write_only image2d_array_t, ulong4,
+ int4, int4, uint4, ulong4);
+
+extern void __amd_copyImageToBuffer(
+ __read_only image2d_array_t, __global uint*, __global ushort*,
+ __global uchar*, int4, ulong4, int4, uint4, ulong4);
+
+extern void __amd_copyImage(
+ __read_only image2d_array_t, __write_only image2d_array_t,
+ int4, int4, int4);
+
+extern void __amd_copyImage1DA(
+ __read_only image2d_array_t, __write_only image2d_array_t,
+ int4, int4, int4);
+
+extern void __amd_copyBufferRect(
+ __global uchar*, __global uchar*,
+ ulong4, ulong4, ulong4);
+
+extern void __amd_copyBufferRectAligned(
+ __global uint*, __global uint*,
+ ulong4, ulong4, ulong4);
+
+extern void __amd_copyBuffer(
+ __global uchar*, __global uchar*,
+ ulong, ulong, ulong);
+
+extern void __amd_copyBufferAligned(
+ __global uint*, __global uint*,
+ ulong, ulong, ulong, uint);
+
+extern void __amd_fillBuffer(
+ __global uchar*, __global uint*, __constant uchar*,
+ uint, ulong, ulong);
+
+extern void __amd_fillImage(
+ __write_only image2d_array_t,
+ float4, int4, uint4, int4, int4, uint);
+
+
__kernel void copyBufferToImage(
- __global uint* src,
- __write_only image2d_array_t dst,
- ulong4 srcOrigin,
- int4 dstOrigin,
- int4 size,
- uint4 format,
- ulong4 pitch)
+ __global uint* src,
+ __write_only image2d_array_t dst,
+ ulong4 srcOrigin,
+ int4 dstOrigin,
+ int4 size,
+ uint4 format,
+ ulong4 pitch)
{
- ulong idxSrc;
- int4 coordsDst;
- uint4 pixel;
- __global uint* srcUInt = src;
- __global ushort* srcUShort = (__global ushort*)src;
- __global uchar* srcUChar = (__global uchar*)src;
- ushort tmpUShort;
- uint tmpUInt;
-
- coordsDst.x = get_global_id(0);
- coordsDst.y = get_global_id(1);
- coordsDst.z = get_global_id(2);
- coordsDst.w = 0;
-
- if ((coordsDst.x >= size.x) ||
- (coordsDst.y >= size.y) ||
- (coordsDst.z >= size.z)) {
- return;
- }
-
- idxSrc = (coordsDst.z * pitch.y +
- coordsDst.y * pitch.x + coordsDst.x) *
- format.z + srcOrigin.x;
-
- coordsDst.x += dstOrigin.x;
- coordsDst.y += dstOrigin.y;
- coordsDst.z += dstOrigin.z;
-
- // Check components
- switch (format.x) {
- case 1:
- // Check size
- if (format.y == 1) {
- pixel.x = (uint)srcUChar[idxSrc];
- }
- else if (format.y == 2) {
- pixel.x = (uint)srcUShort[idxSrc];
- }
- else {
- pixel.x = srcUInt[idxSrc];
- }
- break;
- case 2:
- // Check size
- if (format.y == 1) {
- tmpUShort = srcUShort[idxSrc];
- pixel.x = (uint)(tmpUShort & 0xff);
- pixel.y = (uint)(tmpUShort >> 8);
- }
- else if (format.y == 2) {
- tmpUInt = srcUInt[idxSrc];
- pixel.x = (tmpUInt & 0xffff);
- pixel.y = (tmpUInt >> 16);
- }
- else {
- pixel.x = srcUInt[idxSrc++];
- pixel.y = srcUInt[idxSrc];
- }
- break;
- case 4:
- // Check size
- if (format.y == 1) {
- tmpUInt = srcUInt[idxSrc];
- pixel.x = tmpUInt & 0xff;
- pixel.y = (tmpUInt >> 8) & 0xff;
- pixel.z = (tmpUInt >> 16) & 0xff;
- pixel.w = (tmpUInt >> 24) & 0xff;
- }
- else if (format.y == 2) {
- tmpUInt = srcUInt[idxSrc++];
- pixel.x = tmpUInt & 0xffff;
- pixel.y = (tmpUInt >> 16);
- tmpUInt = srcUInt[idxSrc];
- pixel.z = tmpUInt & 0xffff;
- pixel.w = (tmpUInt >> 16);
- }
- else {
- pixel.x = srcUInt[idxSrc++];
- pixel.y = srcUInt[idxSrc++];
- pixel.z = srcUInt[idxSrc++];
- pixel.w = srcUInt[idxSrc];
- }
- break;
- }
- // Write the final pixel
- write_imageui(dst, coordsDst, pixel);
+ __amd_copyBufferToImage(src, dst, srcOrigin, dstOrigin, size, format, pitch);
}
-\n
+
__kernel void copyImageToBuffer(
- __read_only image2d_array_t src,
- __global uint* dstUInt,
- __global ushort* dstUShort,
- __global uchar* dstUChar,
- int4 srcOrigin,
- ulong4 dstOrigin,
- int4 size,
- uint4 format,
- ulong4 pitch)
+ __read_only image2d_array_t src,
+ __global uint* dstUInt,
+ __global ushort* dstUShort,
+ __global uchar* dstUChar,
+ int4 srcOrigin,
+ ulong4 dstOrigin,
+ int4 size,
+ uint4 format,
+ ulong4 pitch)
{
- ulong idxDst;
- int4 coordsSrc;
- uint4 texel;
-
- coordsSrc.x = get_global_id(0);
- coordsSrc.y = get_global_id(1);
- coordsSrc.z = get_global_id(2);
- coordsSrc.w = 0;
-
- if ((coordsSrc.x >= size.x) ||
- (coordsSrc.y >= size.y) ||
- (coordsSrc.z >= size.z)) {
- return;
- }
-
- idxDst = (coordsSrc.z * pitch.y + coordsSrc.y * pitch.x +
- coordsSrc.x) * format.z + dstOrigin.x;
-
- coordsSrc.x += srcOrigin.x;
- coordsSrc.y += srcOrigin.y;
- coordsSrc.z += srcOrigin.z;
-
- texel = read_imageui(src, coordsSrc);
-
- // Check components
- switch (format.x) {
- case 1:
- // Check size
- switch (format.y) {
- case 1:
- dstUChar[idxDst] = (uchar)texel.x;
- break;
- case 2:
- dstUShort[idxDst] = (ushort)texel.x;
- break;
- case 4:
- dstUInt[idxDst] = texel.x;
- break;
- }
- break;
- case 2:
- // Check size
- switch (format.y) {
- case 1:
- dstUShort[idxDst] = (ushort)texel.x |
- ((ushort)texel.y << 8);
- break;
- case 2:
- dstUInt[idxDst] = texel.x | (texel.y << 16);
- break;
- case 4:
- dstUInt[idxDst++] = texel.x;
- dstUInt[idxDst] = texel.y;
- break;
- }
- break;
- case 4:
- // Check size
- switch (format.y) {
- case 1:
- dstUInt[idxDst] = (uint)texel.x |
- (texel.y << 8) |
- (texel.z << 16) |
- (texel.w << 24);
- break;
- case 2:
- dstUInt[idxDst++] = texel.x | (texel.y << 16);
- dstUInt[idxDst] = texel.z | (texel.w << 16);
- break;
- case 4:
- dstUInt[idxDst++] = texel.x;
- dstUInt[idxDst++] = texel.y;
- dstUInt[idxDst++] = texel.z;
- dstUInt[idxDst] = texel.w;
- break;
- }
- break;
- }
+ __amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar,
+ srcOrigin, dstOrigin, size, format, pitch);
}
-\n
+
__kernel void copyImage(
__read_only image2d_array_t src,
__write_only image2d_array_t dst,
- int4 srcOrigin,
- int4 dstOrigin,
- int4 size)
+ int4 srcOrigin,
+ int4 dstOrigin,
+ int4 size)
{
- int4 coordsDst;
- int4 coordsSrc;
-
- coordsDst.x = get_global_id(0);
- coordsDst.y = get_global_id(1);
- coordsDst.z = get_global_id(2);
- coordsDst.w = 0;
-
- if ((coordsDst.x >= size.x) ||
- (coordsDst.y >= size.y) ||
- (coordsDst.z >= size.z)) {
- return;
- }
-
- coordsSrc = srcOrigin + coordsDst;
- coordsDst += dstOrigin;
-
- uint4 texel;
- texel = read_imageui(src, coordsSrc);
- write_imageui(dst, coordsDst, texel);
+ __amd_copyImage(src, dst, srcOrigin, dstOrigin, size);
}
-\n
+
__kernel void copyImage1DA(
- __read_only image2d_array_t src,
- __write_only image2d_array_t dst,
- int4 srcOrigin,
- int4 dstOrigin,
- int4 size)
+ __read_only image2d_array_t src,
+ __write_only image2d_array_t dst,
+ int4 srcOrigin,
+ int4 dstOrigin,
+ int4 size)
{
- int4 coordsDst;
- int4 coordsSrc;
-
- coordsDst.x = get_global_id(0);
- coordsDst.y = get_global_id(1);
- coordsDst.z = get_global_id(2);
- coordsDst.w = 0;
-
- if ((coordsDst.x >= size.x) ||
- (coordsDst.y >= size.y) ||
- (coordsDst.z >= size.z)) {
- return;
- }
-
- coordsSrc = srcOrigin + coordsDst;
- coordsDst += dstOrigin;
- if (srcOrigin.w != 0) {
- coordsSrc.z = coordsSrc.y;
- coordsSrc.y = 0;
- }
- if (dstOrigin.w != 0) {
- coordsDst.z = coordsDst.y;
- coordsDst.y = 0;
- }
-
- uint4 texel;
- texel = read_imageui(src, coordsSrc);
- write_imageui(dst, coordsDst, texel);
+ __amd_copyImage1DA(src, dst, srcOrigin, dstOrigin, size);
}
-\n
+
__kernel void copyBufferRect(
- __global uchar* src,
- __global uchar* dst,
- ulong4 srcRect,
- ulong4 dstRect,
- ulong4 size)
+ __global uchar* src,
+ __global uchar* dst,
+ ulong4 srcRect,
+ ulong4 dstRect,
+ ulong4 size)
{
- ulong x = get_global_id(0);
- ulong y = get_global_id(1);
- ulong z = get_global_id(2);
-
- if ((x >= size.x) ||
- (y >= size.y) ||
- (z >= size.z)) {
- return;
- }
-
- ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
- ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;
-
- dst[offsDst] = src[offsSrc];
+ __amd_copyBufferRect(src, dst, srcRect, dstRect, size);
}
-\n
+
__kernel void copyBufferRectAligned(
- __global uint* src,
- __global uint* dst,
- ulong4 srcRect,
- ulong4 dstRect,
- ulong4 size)
+ __global uint* src,
+ __global uint* dst,
+ ulong4 srcRect,
+ ulong4 dstRect,
+ ulong4 size)
{
- ulong x = get_global_id(0);
- ulong y = get_global_id(1);
- ulong z = get_global_id(2);
-
- if ((x >= size.x) ||
- (y >= size.y) ||
- (z >= size.z)) {
- return;
- }
-
- ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
- ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;
-
- if (size.w == 16) {
- __global uint4* src4 = (__global uint4*)src;
- __global uint4* dst4 = (__global uint4*)dst;
- dst4[offsDst] = src4[offsSrc];
- }
- else {
- dst[offsDst] = src[offsSrc];
- }
+ __amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size);
}
-\n
+
__kernel void copyBuffer(
- __global uchar* src,
- __global uchar* dst,
- ulong srcOrigin,
- ulong dstOrigin,
- ulong size)
+ __global uchar* src,
+ __global uchar* dst,
+ ulong srcOrigin,
+ ulong dstOrigin,
+ ulong size)
{
- ulong id = get_global_id(0);
-
- if (id >= size) {
- return;
- }
-
- ulong offsSrc = id + srcOrigin;
- ulong offsDst = id + dstOrigin;
-
- dst[offsDst] = src[offsSrc];
+ __amd_copyBuffer(src, dst, srcOrigin, dstOrigin, size);
}
-\n
+
__kernel void copyBufferAligned(
- __global uint* src,
- __global uint* dst,
- ulong srcOrigin,
- ulong dstOrigin,
- ulong size,
- uint alignment)
+ __global uint* src,
+ __global uint* dst,
+ ulong srcOrigin,
+ ulong dstOrigin,
+ ulong size,
+ uint alignment)
{
- ulong id = get_global_id(0);
-
- if (id >= size) {
- return;
- }
-
- ulong offsSrc = id + srcOrigin;
- ulong offsDst = id + dstOrigin;
-
- if (alignment == 16) {
- __global uint4* src4 = (__global uint4*)src;
- __global uint4* dst4 = (__global uint4*)dst;
- dst4[offsDst] = src4[offsSrc];
- }
- else {
- dst[offsDst] = src[offsSrc];
- }
+ __amd_copyBufferAligned(src, dst, srcOrigin, dstOrigin, size, alignment);
}
-\n
+
__kernel void fillBuffer(
- __global uchar* bufUChar,
- __global uint* bufUInt,
- __constant uchar* pattern,
- uint patternSize,
- ulong offset,
- ulong size)
+ __global uchar* bufUChar,
+ __global uint* bufUInt,
+ __constant uchar* pattern,
+ uint patternSize,
+ ulong offset,
+ ulong size)
{
- ulong id = get_global_id(0);
-
- if (id >= size) {
- return;
- }
-
- if (bufUInt) {
- __global uint* element = &bufUInt[offset + id * patternSize];
- __constant uint* pt = (__constant uint*)pattern;
-
- for (uint i = 0; i < patternSize; ++i) {
- element[i] = pt[i];
- }
- }
- else {
- __global uchar* element = &bufUChar[offset + id * patternSize];
-
- for (uint i = 0; i < patternSize; ++i) {
- element[i] = pattern[i];
- }
- }
+ __amd_fillBuffer(bufUChar, bufUInt, pattern, patternSize, offset, size);
}
-\n
+
__kernel void fillImage(
- __write_only image2d_array_t image,
- float4 patternFLOAT4,
- int4 patternINT4,
- uint4 patternUINT4,
- int4 origin,
- int4 size,
- uint type)
+ __write_only image2d_array_t image,
+ float4 patternFLOAT4,
+ int4 patternINT4,
+ uint4 patternUINT4,
+ int4 origin,
+ int4 size,
+ uint type)
{
- int4 coords;
-
- coords.x = get_global_id(0);
- coords.y = get_global_id(1);
- coords.z = get_global_id(2);
- coords.w = 0;
-
- if ((coords.x >= size.x) ||
- (coords.y >= size.y) ||
- (coords.z >= size.z)) {
- return;
- }
-
- coords += origin;
-
- // Check components
- switch (type) {
- case 0:
- write_imagef(image, coords, patternFLOAT4);
- break;
- case 1:
- write_imagei(image, coords, patternINT4);
- break;
- case 2:
- write_imageui(image, coords, patternUINT4);
- break;
- }
+ __amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4,
+ origin, size, type);
}
-\n
-\n
+
);
} // namespace device
diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp
index e0c1a73be6..3a05b1a2d3 100644
--- a/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -8,519 +8,17 @@ namespace gpu {
const char* SchedulerSourceCode = SCHEDULER_KERNEL(
\n
-//! AmdAqlWrap slot state
-enum AqlWrapState {
- AQL_WRAP_FREE = 0,
- AQL_WRAP_RESERVED,
- AQL_WRAP_READY,
- AQL_WRAP_MARKER,
- AQL_WRAP_BUSY,
- AQL_WRAP_DONE
-};
-
-//! Profiling states
-enum ProfilingState {
- PROFILING_COMMAND_START = 0,
- PROFILING_COMMAND_END,
- PROFILING_COMMAND_COMPLETE
-};
-
-typedef struct _HsaAqlDispatchPacket {
- uint mix;
- ushort workgroup_size[3];
- ushort reserved2;
- uint grid_size[3];
- uint private_segment_size_bytes;
- uint group_segment_size_bytes;
- ulong kernel_object_address;
- ulong kernel_arg_address;
- ulong reserved3;
- ulong completion_signal;
-} HsaAqlDispatchPacket;
-
-typedef struct _AmdVQueueHeader {
- uint aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64).
- uint event_slot_num; //!< [LRO] The number of kernel events in the events buffer
- ulong event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events
- ulong event_slots; //!< [LRO] Pointer to a buffer for the events.
- // Array of event_slot_num entries of AmdEvent
- ulong aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots
- uint command_counter; //!< [LRW] The global counter for the submitted commands into the queue
- uint wait_size; //!< [LRO] The wait list size (in clk_event_t)
- uint arg_size; //!< [LRO] The size of argument buffer (in bytes)
- uint reserved0; //!< For the future usage
- ulong kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry)
- uint reserved[2]; //!< For the future usage
-} AmdVQueueHeader;
-
-typedef struct _AmdAqlWrap {
- uint state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY,
- // MARKER, BUSY and DONE. The block could be returned back to a free state.
- uint enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start
- // (kernel_enqueue_flags_t)
- // CLK_ENQUEUE_FLAGS_NO_WAIT we just start processing
- // CLK_ENQUEUE_FLAGS_WAIT_KERNEL check if parent_wrap->state is done and then start processing
- // CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP - currently == WAIT_KERNEL
- uint command_id; //!< [LWO/SRO] The unique command ID
- uint child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels.
- // Its incremented on the
- // start and decremented on the finish. The parent kernel can be considered as
- // done when the value is 0 and the state is DONE
- ulong completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t)
- ulong parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
- ulong wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
- uint wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
- uint reserved[5]; //!< For the future usage
- HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet 64 bytes AQL packet
-} AmdAqlWrap;
-
-typedef struct _AmdEvent {
- uint state; //!< [LRO/SRW] Event state: START, END, COMPLETE
- uint counter; //!< [LRW] Event retain/release counter. 0 means the event is free
- ulong timer[3]; //!< [LRO/SWO] Timer values for profiling for each state
- ulong capture_info; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME
-} AmdEvent;
-
-typedef struct _SchedulerParam {
- uint signal; //!< Signal to stop the child queue
- uint eng_clk; //!< Engine clock in Mhz
- ulong hw_queue; //!< Address to HW queue
- ulong hsa_queue; //!< Address to HSA dummy queue
- uint reserved; //!< reserved
- uint scratchSize; //!< Scratch buffer size
- ulong scratch; //!< GPU address to the scratch buffer
- uint numMaxWaves; //!< Num max waves on the asic
- uint releaseHostCP; //!< Releases CP on the host queue
- ulong parentAQL; //!< Host parent AmdAqlWrap packet
- uint dedicatedQueue; //!< Scheduler uses a dedicated queue
- uint scratchOffset; //!< Scratch buffer offset
-} SchedulerParam;
-
-typedef struct _HwDispatchHeader {
- uint pad00; // CP WRITE_DATA write to rewind for memory
- uint pad01;
- uint pad02;
- uint pad03;
- uint rewind; // REWIND execution
- uint startExe; // valid bit
- uint condExe0; // 0xC0032200 -- TYPE 3, COND_EXEC
- uint condExe1; // 0x00000204 ----
- uint condExe2; // 0x00000000 ----
- uint condExe3; // 0x00000000 ----
- uint condExe4; // 0x00000000 ----
-} HwDispatchHeader;
-
-typedef struct _HwDispatch {
- uint packet0; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values)
- uint offset0; // 0x00000204 ---- OFFSET
- uint startX; // 0x00000000 ---- COMPUTE_START_X: START = 0x0
- uint startY; // 0x00000000 ---- COMPUTE_START_Y: START = 0x0
- uint startZ; // 0x00000000 ---- COMPUTE_START_Z: START = 0x0
- uint wrkGrpSizeX; // 0x00000000 ---- COMPUTE_NUM_THREAD_X: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
- uint wrkGrpSizeY; // 0x00000000 ---- COMPUTE_NUM_THREAD_Y: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
- uint wrkGrpSizeZ; // 0x00000000 ---- COMPUTE_NUM_THREAD_Z: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
- uint packet1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
- uint offset1; // 0x0000020C ---- OFFSET
- uint isaLo; // 0x00000000 ---- COMPUTE_PGM_LO: DATA = 0x0
- uint isaHi; // 0x00000000 ---- COMPUTE_PGM_HI: DATA = 0x0, INST_ATC__CI__VI = 0x0
- uint packet2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
- uint offset2; // 0x00000212 ---- OFFSET
- uint resource1; // 0x00000000 ---- COMPUTE_PGM_RSRC1: VGPRS = 0x0, SGPRS = 0x0, PRIORITY = 0x0, FLOAT_MODE = 0x0, PRIV = 0x0, DX10_CLAMP = 0x0, DEBUG_MODE = 0x0, IEEE_MODE = 0x0, BULKY__CI__VI = 0x0, CDBG_USER__CI__VI = 0x0
- uint resource2; // 0x00000000 ---- COMPUTE_PGM_RSRC2: SCRATCH_EN = 0x0, USER_SGPR = 0x0, TRAP_PRESENT = 0x0, TGID_X_EN = 0x0, TGID_Y_EN = 0x0, TGID_Z_EN = 0x0, TG_SIZE_EN = 0x0, TIDIG_COMP_CNT = 0x0, EXCP_EN_MSB__CI__VI = 0x0, LDS_SIZE = 0x0, EXCP_EN = 0x0
- uint packet3; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values)
- uint offset3; // 0x00000215 ---- OFFSET
- uint pad31; // 0x00000000 ---- COMPUTE_RESOURCE_LIMITS: WAVES_PER_SH = 0x0, TG_PER_CU = 0x0, LOCK_THRESHOLD = 0x0, SIMD_DEST_CNTL = 0x0, FORCE_SIMD_DIST__CI__VI = 0x0, CU_GROUP_COUNT__CI__VI = 0x0
- uint pad32; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE0: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF
- uint pad33; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE1: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF
- uint ringSize; // 0x00000000 ---- COMPUTE_TMPRING_SIZE: WAVES = 0x0, WAVESIZE = 0x0
- uint pad34; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE2__CI__VI: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF
- uint pad35; // 0xFFFFFFFF ---- COMPUTE_STATIC_THREAD_MGMT_SE3__CI__VI: SH0_CU_EN = 0xFFFF, SH1_CU_EN = 0xFFFF
- uint user0; // 0xC0047602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (4 values)
- uint offsUser0; // 0x00000240 ---- OFFSET
- uint scratchLo; // 0x00000000 ---- COMPUTE_USER_DATA_0: DATA = 0x0
- uint scratchHi; // 0x80000000 ---- COMPUTE_USER_DATA_1: DATA = 0x80000000
- uint scratchSize; // 0x00000000 ---- COMPUTE_USER_DATA_2: DATA = 0x0
- uint padUser; // 0x00EA7FAC ---- COMPUTE_USER_DATA_3: DATA = 0xEA7FAC
- uint user1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
- uint offsUser1; // 0x00000244 ---- OFFSET
- uint aqlPtrLo; // 0x00000000 ---- COMPUTE_USER_DATA_4: DATA = 0x0
- uint aqlPtrHi; // 0x00000000 ---- COMPUTE_USER_DATA_5: DATA = 0x0
- uint user2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
- uint offsUser2; // 0x00000246 ---- OFFSET
- uint hsaQueueLo; // 0x00000000 ---- COMPUTE_USER_DATA_6: DATA = 0x0
- uint hsaQueueHi; // 0x00000000 ---- COMPUTE_USER_DATA_7: DATA = 0x0
- uint user3; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
- uint offsUser3; // 0x00000246 ---- OFFSET
- uint argsLo; // 0x00000000 ---- COMPUTE_USER_DATA_8: DATA = 0x0
- uint argsHi; // 0x00000000 ---- COMPUTE_USER_DATA_9: DATA = 0x0
- uint copyData; // 0xC0044000 -- TYPE 3, COPY_DATA
- uint copyDataFlags; // 0x00000405 ---- srcSel 0x5, destSel 0x4, countSel 0x0, wrConfirm 0x0, engineSel 0x0
- uint scratchAddrLo; // 0x000201C4 ---- srcAddressLo
- uint scratchAddrHi; // 0x00000000 ---- srcAddressHi
- uint shPrivateLo; // 0x00002580 ---- dstAddressLo
- uint shPrivateHi; // 0x00000000 ---- dstAddressHi
- uint user4; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
- uint offsUser4; // 0x00000248 ---- OFFSET
- uint scratchOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0
- uint privSize; // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30
- uint packet4; // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE
- uint glbSizeX; // 0x00000000
- uint glbSizeY; // 0x00000000
- uint glbSizeZ; // 0x00000000
- uint padd41; // 0x00000021
-} HwDispatch;
-
-const uint ResumeExecution = 0x80000000; // 0x81000000
-const uint StallExecution = 0x00000000; // 0x01000000
-const uint WavefrontSize = 64;
-const uint MaxWaveSize = 0x400;
-
-static inline void
-dispatch(
- volatile __global HwDispatch* dispatch,
- __global HsaAqlDispatchPacket* aqlPkt,
- ulong scratch,
- ulong hsaQueue,
- uint scratchSize,
- uint scratchOffset,
- uint numMaxWaves)
-{
- const uint UsrRegOffset = 0x240;
- const uint Pm4Nop = 0xC0001002;
- const uint Pm4UserRegs = 0xC0007602;
- const uint Pm4CopyReg = 0xC0044000;
-
- uint usrRegCnt = 0;
-
- dispatch->wrkGrpSizeX = aqlPkt->workgroup_size[0];
- dispatch->wrkGrpSizeY = aqlPkt->workgroup_size[1];
- dispatch->wrkGrpSizeZ = aqlPkt->workgroup_size[2];
- // ISA address
- __global uchar* kernelObj = (__global uchar*)aqlPkt->kernel_object_address;
- ulong isa = aqlPkt->kernel_object_address + *((__global uint*)(kernelObj + 0x10));
- dispatch->isaLo = (uint)(isa >> 8);
- dispatch->isaHi = (uint)(isa >> 40);
-
- // Program PGM resource registers
- dispatch->resource1 = *((__global uint*)(kernelObj + 0x30));
- dispatch->resource2 = *((__global uint*)(kernelObj + 0x34));
- uint flags = *((__global uint*)(kernelObj + 0x38));
- uint privateSize = *((__global uint*)(kernelObj + 0x50));
-
- uint ldsSize = aqlPkt->group_segment_size_bytes +
- *((__global uint*)(kernelObj + 0x54));
- // Align up the LDS blocks 128 * 4(in DWORDs)
- uint ldsBlocks = (ldsSize + 511) >> 9;
- dispatch->resource2 |= (ldsBlocks << 15);
-
- // Workaround for compiler bug
- dispatch->scratchLo = (flags & 1);
- // privSegEna = (flags & 1);
- if (flags & 0x1) {
- uint waveSize = privateSize * WavefrontSize;
- // 256 DWRODs is the minimum for SQ
- waveSize = max(MaxWaveSize, waveSize);
- uint numWaves = scratchSize / waveSize;
- numWaves = min(numWaves, numMaxWaves);
- dispatch->ringSize = numWaves;
- dispatch->ringSize |= (waveSize >> 10) << 12;
- dispatch->user0 = Pm4UserRegs | (4 << 16);
- dispatch->scratchLo = (uint)scratch;
- dispatch->scratchHi = ((uint)(scratch >> 32)) | 0x80000000; // Enables swizzle
- dispatch->scratchSize = scratchSize;
- usrRegCnt += 4;
- }
- else {
- dispatch->ringSize = 0;
- dispatch->user0 = Pm4Nop | (4 << 16);
- }
-
- // dispatchEna = (flags & 0x2);
- dispatch->user1 = (flags & 0x2) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16));
- dispatch->offsUser1 = UsrRegOffset + usrRegCnt;
- usrRegCnt += (flags & 0x2) ? 2 : 0;
- ulong gpuAqlPtr = (ulong)aqlPkt;
- dispatch->aqlPtrLo = (uint)gpuAqlPtr;
- dispatch->aqlPtrHi = (uint)(gpuAqlPtr >> 32);
-
- // queuePtr = (flags & 0x4);
- if (flags & 0x4) {
- dispatch->user2 = Pm4UserRegs | (2 << 16);
- dispatch->offsUser2 = UsrRegOffset + usrRegCnt;
- usrRegCnt += 2;
- dispatch->hsaQueueLo = (uint)hsaQueue;
- dispatch->hsaQueueHi = (uint)(hsaQueue >> 32);
- }
- else {
- dispatch->user2 = Pm4Nop | (2 << 16);
- }
-
- // kernelArgEna = (flags & 0x8);
- dispatch->user3 = (flags & 0x8) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16));
- dispatch->offsUser3 = UsrRegOffset + usrRegCnt;
- usrRegCnt += (flags & 0x8) ? 2 : 0;
- dispatch->argsLo = (uint)aqlPkt->kernel_arg_address;
- dispatch->argsHi = (uint)(aqlPkt->kernel_arg_address >> 32);
-
- // flatScratchEna = (flags & 0x20);
- if (flags & 0x20) {
- dispatch->copyData = Pm4CopyReg;
- dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16);
- dispatch->offsUser4 = UsrRegOffset + usrRegCnt;
- dispatch->scratchOffs = scratchOffset;
- dispatch->privSize = privateSize;
- }
- else {
- dispatch->copyData = Pm4Nop | (8 << 16);
- }
-
- dispatch->glbSizeX = aqlPkt->grid_size[0];
- dispatch->glbSizeY = aqlPkt->grid_size[1];
- dispatch->glbSizeZ = aqlPkt->grid_size[2];
-}
-
-static inline bool
-checkWaitEvents(__global AmdEvent** events, uint numEvents)
-{
- for (uint i = 0; i < numEvents; ++i) {
- if (atomic_load_explicit(
- (__global atomic_uint*)(&events[i]->state),
- memory_order_acquire, memory_scope_device) != CL_COMPLETE) {
- return false;
- }
- }
- return true;
-}
-
-
-// release slot in a bitmask controlled resource i is the slot number
-static inline void
-release_slot(__global uint* restrict mask, uint i)
-{
- /* uint b = ~(1UL << (i & 0x1f)); */
- uint b = ~amd_bfm(1U, i);
- __global atomic_uint *p = (__global atomic_uint *)(mask + (i >> 5));
- uint vv;
- uint v = atomic_load_explicit(p, memory_order_acquire, memory_scope_device);
- for (;;) {
- vv = v & b;
- if (atomic_compare_exchange_strong_explicit(p, &v, vv,
- memory_order_acq_rel, memory_order_acquire, memory_scope_device)) {
- break;
- }
- }
-}
-
-static inline void
-releaseEvent(__global AmdEvent* ev, __global uint* emask, __global AmdEvent* eb)
-{
- uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U,
- memory_order_acq_rel, memory_scope_device);
- if (c == 1U) {
- uint i = ev - eb;
- release_slot(emask, i);
- }
-}
-
-static inline void
-releaseWaitEvents(__global AmdEvent** events, uint numEvents,
- __global uint* emask, __global AmdEvent* eb)
-{
- for (uint i = 0; i < numEvents; ++i) {
- releaseEvent(events[i], emask, eb);
- }
-}
-
-static inline uint
-min_command(uint slot_num, __global AmdAqlWrap* wraps)
-{
- uint minCommand = 0xffffffff;
- for (uint idx = 0; idx < slot_num; ++idx) {
- __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
- uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
- memory_order_acquire, memory_scope_device);
- if ((slotState != AQL_WRAP_FREE) && (slotState != AQL_WRAP_RESERVED)) {
- minCommand = min(disp->command_id, minCommand);
- }
- }
- return minCommand;
-}
-
-extern ulong __hsail_get_clock(); // Declaration is required
-
+extern void __amdrt_scheduler(__global void *, __global void *, uint);
+\n
__kernel void
scheduler(
- __global AmdVQueueHeader* queue,
- __global SchedulerParam* params,
- uint paramIdx)
+ __global void * queue,
+ __global void * params,
+ uint paramIdx)
{
- __global SchedulerParam* param = ¶ms[paramIdx];
- volatile __global HwDispatch* hwDisp = (volatile __global HwDispatch*)
- &((__global HwDispatchHeader*)param->hw_queue)[1];
- __global AmdAqlWrap* hostParent = (__global AmdAqlWrap*)(param->parentAQL);
- __global uint* counter = (__global uint*)(&hostParent->child_counter);
- __global uint* signal = (__global uint*)(¶m->signal);
- __global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1];
- __global uint* amask = (__global uint *)queue->aql_slot_mask;
-
- //! @todo This is an unexplained behavior.
- //! The scheduler can be launched one more time after termination.
- if (1 == atomic_load_explicit((__global atomic_uint*)¶m->releaseHostCP,
- memory_order_acquire, memory_scope_device)) {
- return;
- }
-
- uint launch = 0;
- uint loop = 1;
-
- uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
- memory_order_acquire, memory_scope_device);
-
- int baseIdx = get_group_id(0) * 32;
- while (mask != 0) {
- uint sIdx = ctz(mask);
- uint idx = baseIdx + sIdx;
- mask &= ~(1 << sIdx);
- __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx];
- uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state),
- memory_order_acquire, memory_scope_device);
- __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap);
- __global AmdEvent* event = (__global AmdEvent*)(disp->completion);
-
- // Check if the current slot is ready for processing
- if (slotState == AQL_WRAP_READY) {
- if (launch == 0) {
- // Attempt to find a new disaptch if nothing was launched yet
- uint parentState = atomic_load_explicit(
- (__global atomic_uint*)(&parent->state),
- memory_order_acquire, memory_scope_device);
- uint enqueueFlags = atomic_load_explicit(
- (__global atomic_uint*)(&disp->enqueue_flags),
- memory_order_acquire, memory_scope_device);
-
- // Check the launch flags
- if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) ||
- (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) &&
- (parentState != AQL_WRAP_DONE)) {
- continue;
- }
-
- // Check if the wait list is COMPLETE
- launch = checkWaitEvents(
- (__global AmdEvent**)(disp->wait_list), disp->wait_num);
-
- if (launch) {
- if (event != 0) {
- event->timer[PROFILING_COMMAND_START] =
- (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
- }
- // Launch child kernel ....
- dispatch(&hwDisp[get_group_id(0)], &disp->aql, param->scratch, param->hsa_queue,
- param->scratchSize, param->scratchOffset, param->numMaxWaves);
- disp->state = AQL_WRAP_BUSY;
- releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
- disp->wait_num, (__global uint*)queue->event_slot_mask,
- (__global AmdEvent*)queue->event_slots);
- break;
- }
- }
- }
- else if (slotState == AQL_WRAP_MARKER) {
- bool complete = false;
- if (disp->wait_num == 0) {
- uint minCommand = min_command(queue->aql_slot_num, wraps);
- if (disp->command_id == minCommand) {
- complete = true;
- }
- }
- else {
- // Check if the wait list is COMPLETE
- if (checkWaitEvents(
- (__global AmdEvent**)(disp->wait_list), disp->wait_num)) {
- complete = true;
- releaseWaitEvents((__global AmdEvent**)(disp->wait_list),
- disp->wait_num, (__global uint*)queue->event_slot_mask,
- (__global AmdEvent*)queue->event_slots);
- }
- }
- if (complete) {
- // Decrement the child execution counter on the parent
- atomic_fetch_sub_explicit(
- (__global atomic_uint*)&parent->child_counter,
- 1, memory_order_acq_rel, memory_scope_device);
- event->state = CL_COMPLETE;
- disp->state = AQL_WRAP_FREE;
- release_slot(amask, idx);
- releaseEvent(event, (__global uint*)queue->event_slot_mask,
- (__global AmdEvent*)queue->event_slots);
- }
- }
- else if ((slotState == AQL_WRAP_BUSY) ||
- (slotState == AQL_WRAP_DONE)) {
- if (slotState == AQL_WRAP_BUSY) {
- disp->state = AQL_WRAP_DONE;
- if (event != 0) {
- event->timer[PROFILING_COMMAND_END] =
- (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
- }
- }
- // Was CL_EVENT requested?
- if (event != 0) {
- // The current dispatch doesn't have any outstanding children
- if (disp->child_counter == 0) {
- event->timer[PROFILING_COMMAND_COMPLETE] =
- (__hsail_get_clock() * (ulong)param->eng_clk) >> 10;
- event->state = CL_COMPLETE;
- if (event->capture_info != 0) {
- __global ulong* values = (__global ulong*)event->capture_info;
- values[0] = event->timer[PROFILING_COMMAND_END] -
- event->timer[PROFILING_COMMAND_START];
- values[1] = event->timer[PROFILING_COMMAND_COMPLETE] -
- event->timer[PROFILING_COMMAND_START];
- }
- releaseEvent(event, (__global uint *)queue->event_slot_mask,
- (__global AmdEvent *)queue->event_slots);
- }
- }
- // The current dispatch doesn't have any outstanding children
- if (disp->child_counter == 0) {
- // Decrement the child execution counter on the parent
- atomic_fetch_sub_explicit(
- (__global atomic_uint*)&parent->child_counter,
- 1, memory_order_acq_rel, memory_scope_device);
- disp->state = AQL_WRAP_FREE;
- release_slot(amask, idx);
- }
- }
- }
-
- if (launch == 0) {
- hwDisp[get_group_id(0)].glbSizeX = 0;
- hwDisp[get_group_id(0)].glbSizeY = 0;
- hwDisp[get_group_id(0)].glbSizeZ = 0;
- }
-
- if (param->dedicatedQueue) {
- loop = atomic_load_explicit((__global atomic_uint*)signal,
- memory_order_acquire, memory_scope_device);
- }
- else {
- loop = atomic_load_explicit((__global atomic_uint*)counter,
- memory_order_acquire, memory_scope_device);
- }
-
- if (loop == 0) {
- //! \todo Write deadcode to the template, but somehow
- //! the scheduler will be launched one more time.
- hwDisp->packet0 = 0xdeadc0de;
- atomic_store_explicit((__global atomic_uint*)signal,
- 0, memory_order_release, memory_scope_device);
- atomic_store_explicit((__global atomic_uint*)¶m->releaseHostCP,
- 1, memory_order_release, memory_scope_device);
- }
+ __amd_scheduler(queue, params, paramIdx);
}
\n
-\n
);
} // namespace gpu