Files
rocm-systems/rocclr/runtime/device/gpu/gpuschedcl.cpp
T
foreman 53a0c0add6 P4 to Git Change 1306079 by gandryey@gera-w8 on 2016/08/23 11:52:31
SWDEV-95905 - OpenCL on PAL - Device Enqueue
	- Move the dispatch function outside of the compiler lib. The client must provide 4 new functions to the scheduler: GetCmdTemplateHeaderSize(), GetCmdTemplateDispatchSize(), EmptyCmdTemplateDispatch(), RunCmdTemplateDispatch().

	http://ocltc.amd.com/reviews/r/11142/

Affected files ...

... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/schedule.cl#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#34 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palschedcl.cpp#2 edit
2016-08-23 12:05:38 -04:00

296 строки
12 KiB
C++

//
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
//
namespace gpu {
// Stringifies its arguments, so the OpenCL C text between the parentheses of
// the invocation becomes one C string literal. Consequences for that text:
//   - comments are removed by the preprocessor before stringification, so
//     they never reach the OpenCL compiler;
//   - runs of whitespace, including line breaks, collapse to a single space;
//     the bare \n tokens are what put real newlines into the string;
//   - parentheses and quotes must stay balanced, and preprocessor directives
//     must not be used inside the invocation.
#define SCHEDULER_KERNEL(...) #__VA_ARGS__
const char* SchedulerSourceCode = SCHEDULER_KERNEL(
\n
// Scheduler implementation defined outside this source string. The scheduler
// kernel at the end of this source forwards its three arguments to it unchanged.
extern void __amd_scheduler(__global void *, __global void *, uint);
\n
// Device-side layout of the AQL dispatch packet read by RunCmdTemplateDispatch().
// With natural alignment the fields add up to 64 bytes. Presumably this mirrors
// the packet layout used by whoever writes the packets, so field order and
// widths must not change -- TODO confirm against the producer.
typedef struct _HsaAqlDispatchPacket {
uint mix; // not read by RunCmdTemplateDispatch(); presumably packed header/setup words -- TODO confirm
ushort workgroup_size[3]; // copied to wrkGrpSizeX/Y/Z of the dispatch template
ushort reserved2;
uint grid_size[3]; // copied to glbSizeX/Y/Z of the dispatch template
uint private_segment_size_bytes; // not read by RunCmdTemplateDispatch()
uint group_segment_size_bytes; // added to AmdKernelCode.workgroup_group_segment_byte_size when sizing LDS
ulong kernel_object_address; // cast to AmdKernelCode* and used as the base of the ISA address
ulong kernel_arg_address; // split into argsLo/argsHi of the dispatch template
ulong reserved3;
ulong completion_signal; // not read by RunCmdTemplateDispatch()
} HsaAqlDispatchPacket;
\n
// This is an OpenCLized hsa_control_directives_t
// With natural alignment the fields add up to 128 bytes. The structure is
// embedded as the last member of AmdKernelCode; none of its fields is accessed
// by the functions in this source string, so it acts as layout padding here.
typedef struct _AmdControlDirectives {
ulong enabled_control_directives;
ushort enable_break_exceptions;
ushort enable_detect_exceptions;
uint max_dynamic_group_size;
ulong max_flat_grid_size;
uint max_flat_workgroup_size;
uchar required_dim;
uchar reserved1[3]; // keeps required_grid_size on an 8-byte boundary
ulong required_grid_size[3];
uint required_workgroup_size[3];
uchar reserved2[60]; // pads the structure out to 128 bytes
} AmdControlDirectives;
\n
// This is an OpenCLized amd_kernel_code_t
// RunCmdTemplateDispatch() treats HsaAqlDispatchPacket.kernel_object_address
// as a pointer to this structure. With natural alignment the fields add up to
// 256 bytes. Only the fields marked as read below are used in this source
// string; the rest are present to keep the offsets right.
typedef struct _AmdKernelCode {
uint amd_kernel_code_version_major;
uint amd_kernel_code_version_minor;
ushort amd_machine_kind;
ushort amd_machine_version_major;
ushort amd_machine_version_minor;
ushort amd_machine_version_stepping;
long kernel_code_entry_byte_offset; // read: added to kernel_object_address to get the ISA address
long kernel_code_prefetch_byte_offset;
ulong kernel_code_prefetch_byte_size;
ulong max_scratch_backing_memory_byte_size;
uint compute_pgm_rsrc1; // read: copied to HwDispatch.resource1
uint compute_pgm_rsrc2; // read: copied to HwDispatch.resource2 before the LDS block count is OR-ed in
uint kernel_code_properties; // read: tested against PrivateSegEna, DispatchEna, QueuePtrEna, KernelArgEna, FlatScratchEna
uint workitem_private_segment_byte_size; // read: per-work-item scratch size
uint workgroup_group_segment_byte_size; // read: added to the packet group segment size
uint gds_segment_byte_size;
ulong kernarg_segment_byte_size;
uint workgroup_fbarrier_count;
ushort wavefront_sgpr_count;
ushort workitem_vgpr_count;
ushort reserved_vgpr_first;
ushort reserved_vgpr_count;
ushort reserved_sgpr_first;
ushort reserved_sgpr_count;
ushort debug_wavefront_private_segment_offset_sgpr;
ushort debug_private_segment_buffer_sgpr;
uchar kernarg_segment_alignment;
uchar group_segment_alignment;
uchar private_segment_alignment;
uchar wavefront_size;
int call_convention;
uchar reserved1[12];
ulong runtime_loader_kernel_symbol;
AmdControlDirectives control_directives;
} AmdKernelCode;
\n
// Eleven-dword (44-byte) block whose size is reported by
// GetCmdTemplateHeaderSize(). No field is accessed by name in this source
// string. The trailing comments presumably record the PM4 contents that are
// placed here by other code -- TODO confirm against the code that fills it.
typedef struct _HwDispatchHeader {
uint writeData0; // CP WRITE_DATA write to rewind for memory
uint writeData1;
uint writeData2;
uint writeData3;
uint rewind; // REWIND execution
uint startExe; // valid bit
uint condExe0; // 0xC0032200 -- TYPE 3, COND_EXEC
uint condExe1; // 0x00000204 ----
uint condExe2; // 0x00000000 ----
uint condExe3; // 0x00000000 ----
uint condExe4; // 0x00000000 ----
} HwDispatchHeader;
\n
// Dword-by-dword layout of one dispatch command template (55 dwords, 220
// bytes; the size is reported by GetCmdTemplateDispatchSize()).
// RunCmdTemplateDispatch() patches individual fields and
// EmptyCmdTemplateDispatch() zeroes glbSizeX/Y/Z. The value in each trailing
// comment is presumably the initial content written by the code that creates
// the template -- TODO confirm. Fields that RunCmdTemplateDispatch() never
// stores to (offset0, startX/Y/Z, the packetN headers, pad31, offset31,
// offsUser0, padUser, copyDataFlags, scratchAddrHi, shPrivateLo/Hi, user4,
// packet4, padd41) keep whatever that initial content is.
// Field order and widths define the packet stream and must not change.
typedef struct _HwDispatch {
uint packet0; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values)
uint offset0; // 0x00000204 ---- OFFSET
uint startX; // 0x00000000 ---- COMPUTE_START_X: START = 0x0
uint startY; // 0x00000000 ---- COMPUTE_START_Y: START = 0x0
uint startZ; // 0x00000000 ---- COMPUTE_START_Z: START = 0x0
uint wrkGrpSizeX; // 0x00000000 ---- COMPUTE_NUM_THREAD_X: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
uint wrkGrpSizeY; // 0x00000000 ---- COMPUTE_NUM_THREAD_Y: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
uint wrkGrpSizeZ; // 0x00000000 ---- COMPUTE_NUM_THREAD_Z: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
uint packet1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
uint offset1; // 0x0000020C ---- OFFSET
uint isaLo; // 0x00000000 ---- COMPUTE_PGM_LO: DATA = 0x0
uint isaHi; // 0x00000000 ---- COMPUTE_PGM_HI: DATA = 0x0, INST_ATC__CI__VI = 0x0
uint packet2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
uint offset2; // 0x00000212 ---- OFFSET
uint resource1; // 0x00000000 ---- COMPUTE_PGM_RSRC1
uint resource2; // 0x00000000 ---- COMPUTE_PGM_RSRC2
uint packet3; // 0xc0017602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (1 value)
uint offset3; // 0x00000215 ---- OFFSET
uint pad31; // 0x000003ff ---- COMPUTE_RESOURCE_LIMITS
// NOTE(review): the header value below carries the 6-value count while the
// text says 1 value; packet3 above uses 0xc0017602 for a 1-value packet.
// Likely a copy/paste slip in the comment -- confirm against the template setup.
uint packet31; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (1 value)
uint offset31; // 0x00000218 ---- OFFSET
uint ringSize; // 0x00000000 ---- COMPUTE_TMPRING_SIZE: WAVES = 0x0, WAVESIZE = 0x0
uint user0; // 0xC0047602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (4 values)
uint offsUser0; // 0x00000240 ---- OFFSET
uint scratchLo; // 0x00000000 ---- COMPUTE_USER_DATA_0: DATA = 0x0
uint scratchHi; // 0x80000000 ---- COMPUTE_USER_DATA_1: DATA = 0x80000000
uint scratchSize; // 0x00000000 ---- COMPUTE_USER_DATA_2: DATA = 0x0
uint padUser; // 0x00EA7FAC ---- COMPUTE_USER_DATA_3: DATA = 0xEA7FAC
// The offsUser1..offsUser4 fields are recomputed by RunCmdTemplateDispatch()
// as UsrRegOffset plus the number of user registers already consumed, so the
// USER_DATA_n numbers in the comments below only hold when every preceding
// group is enabled.
uint user1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
uint offsUser1; // 0x00000244 ---- OFFSET
uint aqlPtrLo; // 0x00000000 ---- COMPUTE_USER_DATA_4: DATA = 0x0
uint aqlPtrHi; // 0x00000000 ---- COMPUTE_USER_DATA_5: DATA = 0x0
uint user2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
uint offsUser2; // 0x00000246 ---- OFFSET
uint hsaQueueLo; // 0x00000000 ---- COMPUTE_USER_DATA_6: DATA = 0x0
uint hsaQueueHi; // 0x00000000 ---- COMPUTE_USER_DATA_7: DATA = 0x0
uint user3; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
// NOTE(review): 0x246 repeats the offsUser2 value; USER_DATA_8 would be
// 0x248 by the same arithmetic. The field is rewritten at run time -- confirm.
uint offsUser3; // 0x00000246 ---- OFFSET
uint argsLo; // 0x00000000 ---- COMPUTE_USER_DATA_8: DATA = 0x0
uint argsHi; // 0x00000000 ---- COMPUTE_USER_DATA_9: DATA = 0x0
uint copyData; // 0xC0044000 -- TYPE 3, COPY_DATA
uint copyDataFlags; // 0x00000405 ---- srcSel 0x5, destSel 0x4, countSel 0x0, wrConfirm 0x0, engineSel 0x0
uint scratchAddrLo; // 0x000201C4 ---- srcAddressLo
uint scratchAddrHi; // 0x00000000 ---- srcAddressHi
uint shPrivateLo; // 0x00002580 ---- dstAddressLo
uint shPrivateHi; // 0x00000000 ---- dstAddressHi
uint user4; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
// NOTE(review): USER_DATA_10 would be 0x24A by the same arithmetic; the
// field is rewritten at run time when FlatScratchEna is set -- confirm.
uint offsUser4; // 0x00000248 ---- OFFSET
uint scratchOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0
uint privSize; // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30
uint packet4; // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE
uint glbSizeX; // 0x00000000
uint glbSizeY; // 0x00000000
uint glbSizeZ; // 0x00000000
uint padd41; // 0x00000021
} HwDispatch;
\n
// Work-items per wavefront; multiplies the per-work-item private size to get
// the per-wave scratch size.
static const uint WavefrontSize = 64;
// Despite the name this is used as the LOWER bound of the per-wave scratch
// size (0x400 bytes, i.e. 256 dwords).
static const uint MaxWaveSize = 0x400;
// Register offset of the first user data register; later groups are placed at
// UsrRegOffset plus the number of user registers already consumed.
static const uint UsrRegOffset = 0x240;

// PM4 type-3 packet headers with a zero count field. Callers OR in
// (count << 16) where needed.
static const uint Pm4Nop = 0xC0001002;      // NOP: used to skip a disabled packet group
static const uint Pm4UserRegs = 0xC0007602; // SET_SH_REG, TYPE:COMPUTE
static const uint Pm4CopyReg = 0xC0044000;  // COPY_DATA, count already included

// Bit masks tested against AmdKernelCode.kernel_code_properties.
// Bit 4 is not tested by this source.
static const uint PrivateSegEna = 1u << 0;  // private/scratch segment buffer requested
static const uint DispatchEna = 1u << 1;    // pointer to the AQL dispatch packet requested
static const uint QueuePtrEna = 1u << 2;    // pointer to the queue requested
static const uint KernelArgEna = 1u << 3;   // pointer to the kernel arguments requested
static const uint FlatScratchEna = 1u << 5; // flat scratch setup requested
\n
// Reports how many bytes the HwDispatchHeader block occupies.
uint GetCmdTemplateHeaderSize()
{
    return (uint)sizeof(HwDispatchHeader);
}
\n
// Reports how many bytes one HwDispatch command template occupies.
uint GetCmdTemplateDispatchSize()
{
    return (uint)sizeof(HwDispatch);
}
\n
// Zeroes the three global grid dimensions of the HwDispatch template located
// at address cmdBuf. Presumably a zero-sized grid makes the DISPATCH_DIRECT
// packet launch nothing -- TODO confirm against the packet documentation.
//   cmdBuf - address of the HwDispatch template to neutralize
void EmptyCmdTemplateDispatch(ulong cmdBuf)
{
    volatile __global HwDispatch* tmpl = (volatile __global HwDispatch*)cmdBuf;

    // Volatile stores, issued in X, Y, Z order.
    tmpl->glbSizeX = 0;
    tmpl->glbSizeY = 0;
    tmpl->glbSizeZ = 0;
}
\n
// Patches the HwDispatch command template at cmdBuf so that it describes the
// launch requested by aqlPkt. All template accesses go through a volatile
// pointer, so they are performed in program order; the statement order in this
// function should be preserved.
//
//   cmdBuf        - address of the HwDispatch template to patch
//   aqlPkt        - AQL dispatch packet describing the launch
//   scratch       - address of the scratch memory handed to the kernel
//   hsaQueue      - address passed in user registers when QueuePtrEna is set
//   scratchSize   - scratch memory size; divided by the per-wave size to get
//                   the wave count (presumably bytes -- TODO confirm)
//   scratchOffset - subtracted from scratch to form the COPY_DATA source and
//                   also passed to the kernel in a user register
//   numMaxWaves   - upper bound for the wave count stored in ringSize
//   useATC        - when nonzero, bit 8 (0x100) is set in isaHi
void RunCmdTemplateDispatch(
ulong cmdBuf,
__global HsaAqlDispatchPacket* aqlPkt,
ulong scratch,
ulong hsaQueue,
uint scratchSize,
uint scratchOffset,
uint numMaxWaves,
uint useATC)
\n
{
volatile __global HwDispatch* dispatch = (volatile __global HwDispatch*)cmdBuf;
// Number of user data registers consumed so far. Each enabled register group
// is placed at UsrRegOffset + usrRegCnt, so enabled groups end up packed
// back to back regardless of which earlier groups were skipped.
uint usrRegCnt = 0;
// Program workgroup size
dispatch->wrkGrpSizeX = aqlPkt->workgroup_size[0];
dispatch->wrkGrpSizeY = aqlPkt->workgroup_size[1];
dispatch->wrkGrpSizeZ = aqlPkt->workgroup_size[2];
// ISA address
__global AmdKernelCode* kernelObj = (__global AmdKernelCode*)aqlPkt->kernel_object_address;
ulong isa = aqlPkt->kernel_object_address + kernelObj->kernel_code_entry_byte_offset;
// The address is programmed shifted right by 8: bits 8..39 go to isaLo and
// the bits from 40 up go to isaHi.
dispatch->isaLo = (uint)(isa >> 8);
dispatch->isaHi = (uint)(isa >> 40) | (useATC ? 0x100 : 0);
// Program PGM resource registers
dispatch->resource1 = kernelObj->compute_pgm_rsrc1;
dispatch->resource2 = kernelObj->compute_pgm_rsrc2;
uint flags = kernelObj->kernel_code_properties;
uint privateSize = kernelObj->workitem_private_segment_byte_size;
// Total LDS = size requested in the packet + size recorded in the kernel object
uint ldsSize = aqlPkt->group_segment_size_bytes +
kernelObj->workgroup_group_segment_byte_size;
// Align up the LDS blocks 128 * 4(in DWORDs)
uint ldsBlocks = (ldsSize + 511) >> 9;
// Read-modify-write of the template: the block count is OR-ed into the value
// stored just above, starting at bit 15.
dispatch->resource2 |= (ldsBlocks << 15);
// Private/scratch segment was enabled
if (flags & PrivateSegEna) {
uint waveSize = privateSize * WavefrontSize;
// 256 DWORDs is the minimum for SQ
waveSize = max(MaxWaveSize, waveSize);
uint numWaves = scratchSize / waveSize;
numWaves = min(numWaves, numMaxWaves);
dispatch->ringSize = numWaves;
// NOTE(review): waveSize >> 10 truncates, so a per-wave size that is not a
// multiple of 1024 is rounded DOWN here. Presumably the producer keeps
// privateSize * 64 a multiple of 1024 -- confirm.
dispatch->ringSize |= (waveSize >> 10) << 12;
dispatch->user0 = Pm4UserRegs | (4 << 16);
dispatch->scratchLo = (uint)scratch;
dispatch->scratchHi = ((uint)(scratch >> 32)) | 0x80000000; // Enables swizzle
dispatch->scratchSize = scratchSize;
// offsUser0 and padUser are not stored here; they keep the template content.
usrRegCnt += 4;
}
else {
dispatch->ringSize = 0;
// Replace the SET_SH_REG header with a NOP of the same count so the
// following dwords of this group are skipped.
dispatch->user0 = Pm4Nop | (4 << 16);
}
// Pointer to the AQL dispatch packet
dispatch->user1 = (flags & DispatchEna) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16));
dispatch->offsUser1 = UsrRegOffset + usrRegCnt;
usrRegCnt += (flags & DispatchEna) ? 2 : 0;
ulong gpuAqlPtr = (ulong)aqlPkt;
dispatch->aqlPtrLo = (uint)gpuAqlPtr;
dispatch->aqlPtrHi = (uint)(gpuAqlPtr >> 32);
// Pointer to the AQL queue header
if (flags & QueuePtrEna) {
dispatch->user2 = Pm4UserRegs | (2 << 16);
dispatch->offsUser2 = UsrRegOffset + usrRegCnt;
usrRegCnt += 2;
dispatch->hsaQueueLo = (uint)hsaQueue;
dispatch->hsaQueueHi = (uint)(hsaQueue >> 32);
}
else {
dispatch->user2 = Pm4Nop | (2 << 16);
}
// Pointer to the AQL kernel arguments
dispatch->user3 = (flags & KernelArgEna) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16));
dispatch->offsUser3 = UsrRegOffset + usrRegCnt;
usrRegCnt += (flags & KernelArgEna) ? 2 : 0;
dispatch->argsLo = (uint)aqlPkt->kernel_arg_address;
dispatch->argsHi = (uint)(aqlPkt->kernel_arg_address >> 32);
// Provide pointer to the private/scratch buffer for the flat address
if (flags & FlatScratchEna) {
dispatch->copyData = Pm4CopyReg;
dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16);
// NOTE(review): the user4 header, copyDataFlags, scratchAddrHi and
// shPrivateLo/Hi are not stored here and rely on the template content.
dispatch->offsUser4 = UsrRegOffset + usrRegCnt;
dispatch->scratchOffs = scratchOffset;
dispatch->privSize = privateSize;
}
else {
// One NOP with count 8 covers the nine dwords after this header: the
// remaining five COPY_DATA dwords plus the four dwords of the user4 group.
dispatch->copyData = Pm4Nop | (8 << 16);
}
// Update the global launch grid
// These are the fields EmptyCmdTemplateDispatch() zeroes. Presumably a
// nonzero grid is what makes the template launch work, so keep these
// stores last -- TODO confirm.
dispatch->glbSizeX = aqlPkt->grid_size[0];
dispatch->glbSizeY = aqlPkt->grid_size[1];
dispatch->glbSizeZ = aqlPkt->grid_size[2];
}
\n
// Kernel entry point of the scheduler program. It does no work of its own and
// hands its three arguments, unchanged and in the same order, to the
// externally defined __amd_scheduler().
//   queue    - forwarded as the first argument
//   params   - forwarded as the second argument
//   paramIdx - forwarded as the third argument
__kernel void scheduler(__global void* queue, __global void* params, uint paramIdx)
{
    __amd_scheduler(queue, params, paramIdx);
}
\n
);
} // namespace gpu