diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp index 51f51eadad..f1ddd90fd9 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp @@ -874,8 +874,11 @@ Device::initializeHeapResources() // Delay compilation due to brig_loader memory allocation const char* scheduler = nullptr; const char* ocl20 = nullptr; + std::string sch = SchedulerSourceCode; if (settings().oclVersion_ == OpenCL20) { - scheduler = SchedulerSourceCode; + size_t loc = sch.find("%s"); + sch.replace(loc, 2, iDev()->GetDispatchKernelSource()); + scheduler = sch.c_str(); ocl20 = "-cl-std=CL2.0"; } blitProgram_ = new BlitProgram(context_); diff --git a/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp b/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp index b01c1c80b6..047d5b610b 100644 --- a/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp @@ -6,278 +6,7 @@ namespace pal { #define SCHEDULER_KERNEL(...) #__VA_ARGS__ const char* SchedulerSourceCode = SCHEDULER_KERNEL( - extern void __amd_scheduler(__global void *, __global void *, uint); -\n -typedef struct _HsaAqlDispatchPacket { - uint mix; - ushort workgroup_size[3]; - ushort reserved2; - uint grid_size[3]; - uint private_segment_size_bytes; - uint group_segment_size_bytes; - ulong kernel_object_address; - ulong kernel_arg_address; - ulong reserved3; - ulong completion_signal; -} HsaAqlDispatchPacket; -\n -// This is an OpenCLized hsa_control_directives_t -typedef struct _AmdControlDirectives { - ulong enabled_control_directives; - ushort enable_break_exceptions; - ushort enable_detect_exceptions; - uint max_dynamic_group_size; - ulong max_flat_grid_size; - uint max_flat_workgroup_size; - uchar required_dim; - uchar reserved1[3]; - ulong required_grid_size[3]; - uint required_workgroup_size[3]; - uchar reserved2[60]; -} AmdControlDirectives; -\n -// This is an OpenCLized amd_kernel_code_t -typedef struct _AmdKernelCode { - uint amd_kernel_code_version_major; - uint amd_kernel_code_version_minor; - ushort amd_machine_kind; - ushort amd_machine_version_major; - ushort amd_machine_version_minor; - ushort amd_machine_version_stepping; - long kernel_code_entry_byte_offset; - long kernel_code_prefetch_byte_offset; - ulong kernel_code_prefetch_byte_size; - ulong max_scratch_backing_memory_byte_size; - uint compute_pgm_rsrc1; - uint compute_pgm_rsrc2; - uint kernel_code_properties; - uint workitem_private_segment_byte_size; - uint workgroup_group_segment_byte_size; - uint gds_segment_byte_size; - ulong kernarg_segment_byte_size; - uint workgroup_fbarrier_count; - ushort wavefront_sgpr_count; - ushort workitem_vgpr_count; - ushort reserved_vgpr_first; - ushort reserved_vgpr_count; - ushort reserved_sgpr_first; - ushort reserved_sgpr_count; - ushort debug_wavefront_private_segment_offset_sgpr; - ushort debug_private_segment_buffer_sgpr; - uchar kernarg_segment_alignment; - uchar group_segment_alignment; - uchar private_segment_alignment; - uchar wavefront_size; - int call_convention; - uchar reserved1[12]; - ulong runtime_loader_kernel_symbol; - AmdControlDirectives control_directives; -} AmdKernelCode; -\n -typedef struct _HwDispatchHeader { - uint writeData0; // CP WRITE_DATA write to rewind for memory - uint writeData1; - uint writeData2; - uint writeData3; - uint rewind; // REWIND execution - uint startExe; // valid bit - uint condExe0; // 0xC0032200 -- TYPE 3, COND_EXEC - uint condExe1; // 0x00000204 ---- - uint condExe2; // 0x00000000 ---- - uint condExe3; // 0x00000000 ---- - uint condExe4; // 0x00000000 ---- -} HwDispatchHeader; -\n -typedef struct _HwDispatch { - uint packet0; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values) - uint offset0; // 0x00000204 ---- OFFSET - uint startX; // 0x00000000 ---- COMPUTE_START_X: START = 0x0 - uint startY; // 0x00000000 ---- COMPUTE_START_Y: START = 0x0 - uint startZ; // 0x00000000 ---- COMPUTE_START_Z: START = 0x0 - uint wrkGrpSizeX; // 0x00000000 ---- COMPUTE_NUM_THREAD_X: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0 - uint wrkGrpSizeY; // 0x00000000 ---- COMPUTE_NUM_THREAD_Y: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0 - uint wrkGrpSizeZ; // 0x00000000 ---- COMPUTE_NUM_THREAD_Z: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0 - uint packet1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offset1; // 0x0000020C ---- OFFSET - uint isaLo; // 0x00000000 ---- COMPUTE_PGM_LO: DATA = 0x0 - uint isaHi; // 0x00000000 ---- COMPUTE_PGM_HI: DATA = 0x0, INST_ATC__CI__VI = 0x0 - uint packet2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offset2; // 0x00000212 ---- OFFSET - uint resource1; // 0x00000000 ---- COMPUTE_PGM_RSRC1 - uint resource2; // 0x00000000 ---- COMPUTE_PGM_RSRC2 - uint packet3; // 0xc0017602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (1 value) - uint offset3; // 0x00000215 ---- OFFSET - uint pad31; // 0x000003ff ---- COMPUTE_RESOURCE_LIMITS - uint packet31; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (1 value) - uint offset31; // 0x00000218 ---- OFFSET - uint ringSize; // 0x00000000 ---- COMPUTE_TMPRING_SIZE: WAVES = 0x0, WAVESIZE = 0x0 - uint user0; // 0xC0047602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (4 values) - uint offsUser0; // 0x00000240 ---- OFFSET - uint scratchLo; // 0x00000000 ---- COMPUTE_USER_DATA_0: DATA = 0x0 - uint scratchHi; // 0x80000000 ---- COMPUTE_USER_DATA_1: DATA = 0x80000000 - uint scratchSize; // 0x00000000 ---- COMPUTE_USER_DATA_2: DATA = 0x0 - uint padUser; // 0x00EA7FAC ---- COMPUTE_USER_DATA_3: DATA = 0xEA7FAC - uint user1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offsUser1; // 0x00000244 ---- OFFSET - uint aqlPtrLo; // 0x00000000 ---- COMPUTE_USER_DATA_4: DATA = 0x0 - uint aqlPtrHi; // 0x00000000 ---- COMPUTE_USER_DATA_5: DATA = 0x0 - uint user2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offsUser2; // 0x00000246 ---- OFFSET - uint hsaQueueLo; // 0x00000000 ---- COMPUTE_USER_DATA_6: DATA = 0x0 - uint hsaQueueHi; // 0x00000000 ---- COMPUTE_USER_DATA_7: DATA = 0x0 - uint user3; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offsUser3; // 0x00000246 ---- OFFSET - uint argsLo; // 0x00000000 ---- COMPUTE_USER_DATA_8: DATA = 0x0 - uint argsHi; // 0x00000000 ---- COMPUTE_USER_DATA_9: DATA = 0x0 - uint copyData; // 0xC0044000 -- TYPE 3, COPY_DATA - uint copyDataFlags; // 0x00000405 ---- srcSel 0x5, destSel 0x4, countSel 0x0, wrConfirm 0x0, engineSel 0x0 - uint scratchAddrLo; // 0x000201C4 ---- srcAddressLo - uint scratchAddrHi; // 0x00000000 ---- srcAddressHi - uint shPrivateLo; // 0x00002580 ---- dstAddressLo - uint shPrivateHi; // 0x00000000 ---- dstAddressHi - uint user4; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) - uint offsUser4; // 0x00000248 ---- OFFSET - uint scratchOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0 - uint privSize; // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30 - uint packet4; // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE - uint glbSizeX; // 0x00000000 - uint glbSizeY; // 0x00000000 - uint glbSizeZ; // 0x00000000 - uint padd41; // 0x00000021 -} HwDispatch; -\n -static const uint WavefrontSize = 64; -static const uint MaxWaveSize = 0x400; -static const uint UsrRegOffset = 0x240; -static const uint Pm4Nop = 0xC0001002; -static const uint Pm4UserRegs = 0xC0007602; -static const uint Pm4CopyReg = 0xC0044000; -static const uint PrivateSegEna = 0x1; -static const uint DispatchEna = 0x2; -static const uint QueuePtrEna = 0x4; -static const uint KernelArgEna = 0x8; -static const uint FlatScratchEna = 0x20; -\n -uint GetCmdTemplateHeaderSize() { return sizeof(HwDispatchHeader); } -\n -uint GetCmdTemplateDispatchSize() { return sizeof(HwDispatch); } -\n -void EmptyCmdTemplateDispatch(ulong cmdBuf) -{ - volatile __global HwDispatch* dispatch = (volatile __global HwDispatch*)cmdBuf; - dispatch->glbSizeX = 0; - dispatch->glbSizeY = 0; - dispatch->glbSizeZ = 0; -} -\n -void RunCmdTemplateDispatch( - ulong cmdBuf, - __global HsaAqlDispatchPacket* aqlPkt, - ulong scratch, - ulong hsaQueue, - uint scratchSize, - uint scratchOffset, - uint numMaxWaves, - uint useATC) - \n -{ - volatile __global HwDispatch* dispatch = (volatile __global HwDispatch*)cmdBuf; -uint usrRegCnt = 0; - -// Program workgroup size -dispatch->wrkGrpSizeX = aqlPkt->workgroup_size[0]; -dispatch->wrkGrpSizeY = aqlPkt->workgroup_size[1]; -dispatch->wrkGrpSizeZ = aqlPkt->workgroup_size[2]; - -// ISA address -__global AmdKernelCode* kernelObj = (__global AmdKernelCode*)aqlPkt->kernel_object_address; -ulong isa = aqlPkt->kernel_object_address + kernelObj->kernel_code_entry_byte_offset; - -dispatch->isaLo = (uint)(isa >> 8); -dispatch->isaHi = (uint)(isa >> 40) | (useATC ? 0x100 : 0); - -// Program PGM resource registers -dispatch->resource1 = kernelObj->compute_pgm_rsrc1; -dispatch->resource2 = kernelObj->compute_pgm_rsrc2; - -uint flags = kernelObj->kernel_code_properties; -uint privateSize = kernelObj->workitem_private_segment_byte_size; - -uint ldsSize = aqlPkt->group_segment_size_bytes + -kernelObj->workgroup_group_segment_byte_size; - -// Align up the LDS blocks 128 * 4(in DWORDs) -uint ldsBlocks = (ldsSize + 511) >> 9; - -dispatch->resource2 |= (ldsBlocks << 15); - -// Private/scratch segment was enabled -if (flags & PrivateSegEna) { - uint waveSize = privateSize * WavefrontSize; - // 256 DWRODs is the minimum for SQ - waveSize = max(MaxWaveSize, waveSize); - - uint numWaves = scratchSize / waveSize; - - numWaves = min(numWaves, numMaxWaves); - - dispatch->ringSize = numWaves; - dispatch->ringSize |= (waveSize >> 10) << 12; - dispatch->user0 = Pm4UserRegs | (4 << 16); - dispatch->scratchLo = (uint)scratch; - dispatch->scratchHi = ((uint)(scratch >> 32)) | 0x80000000; // Enables swizzle - dispatch->scratchSize = scratchSize; - usrRegCnt += 4; -} -else { - dispatch->ringSize = 0; - dispatch->user0 = Pm4Nop | (4 << 16); -} - -// Pointer to the AQL dispatch packet -dispatch->user1 = (flags & DispatchEna) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16)); -dispatch->offsUser1 = UsrRegOffset + usrRegCnt; -usrRegCnt += (flags & DispatchEna) ? 2 : 0; -ulong gpuAqlPtr = (ulong)aqlPkt; -dispatch->aqlPtrLo = (uint)gpuAqlPtr; -dispatch->aqlPtrHi = (uint)(gpuAqlPtr >> 32); - -// Pointer to the AQL queue header -if (flags & QueuePtrEna) { - dispatch->user2 = Pm4UserRegs | (2 << 16); - dispatch->offsUser2 = UsrRegOffset + usrRegCnt; - usrRegCnt += 2; - dispatch->hsaQueueLo = (uint)hsaQueue; - dispatch->hsaQueueHi = (uint)(hsaQueue >> 32); -} -else { - dispatch->user2 = Pm4Nop | (2 << 16); -} - -// Pointer to the AQL kernel arguments -dispatch->user3 = (flags & KernelArgEna) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16)); -dispatch->offsUser3 = UsrRegOffset + usrRegCnt; -usrRegCnt += (flags & KernelArgEna) ? 2 : 0; -dispatch->argsLo = (uint)aqlPkt->kernel_arg_address; -dispatch->argsHi = (uint)(aqlPkt->kernel_arg_address >> 32); - -// Provide pointer to the private/scratch buffer for the flat address -if (flags & FlatScratchEna) { - dispatch->copyData = Pm4CopyReg; - dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16); - dispatch->offsUser4 = UsrRegOffset + usrRegCnt; - dispatch->scratchOffs = scratchOffset; - dispatch->privSize = privateSize; -} -else { - dispatch->copyData = Pm4Nop | (8 << 16); -} - -// Update the global launch grid -dispatch->glbSizeX = aqlPkt->grid_size[0]; -dispatch->glbSizeY = aqlPkt->grid_size[1]; -dispatch->glbSizeZ = aqlPkt->grid_size[2]; -} +%s \n extern void __amd_scheduler(__global void *, __global void *, uint); \n @@ -289,7 +18,6 @@ scheduler( { __amd_scheduler(queue, params, paramIdx); } -\n -); +\n); } // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index c3154a3bfb..dab2648aa2 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -742,9 +742,17 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize) // Check if device has SDMA engines if (dev().numDMAEngines() != 0) { + uint sdma; + // If only 1 DMA engine is available then use that one + if ((dev().numDMAEngines() < 2) || (idx & 0x1)) { + sdma = 0; + } + else { + sdma = 1; + } + queues_[SdmaEngine] = Queue::Create( - dev().iDev(), Pal::QueueTypeDma, - idx % dev().numDMAEngines(), cmdAllocator_); + dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_); if (nullptr == queues_[SdmaEngine]) { return false; } @@ -2117,26 +2125,25 @@ VirtualGPU::submitKernelInternal( if (!dev().settings().useDeviceQueue_) { // Add the termination handshake to the host queue eventBegin(MainEngine); - //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(), - // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - // vmParentWrap + offsetof(AmdAqlWrap, child_counter), - // 0, dev().settings().useDeviceQueue_); + iCmd()->CmdVirtualQueueHandshake( + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + 0, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); } // Get the global loop start before the scheduler - //Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); - //static_cast(gpuDefQueue->blitMgr()).runScheduler( - // *gpuDefQueue->virtualQueue_, - // *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, - // gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); + static_cast(gpuDefQueue->blitMgr()).runScheduler( + *gpuDefQueue->virtualQueue_, + *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); const static bool FlushL2 = true; gpuDefQueue->flushCUCaches(FlushL2); // Get the address of PM4 template and add write it to params //! @note DMA flush must not occur between patch and the scheduler - Pal::gpusize patchStart = 0; - //Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); + Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); // Program parameters for the scheduler SchedulerParam* param = &reinterpret_cast (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; @@ -2178,9 +2185,9 @@ VirtualGPU::submitKernelInternal( Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() + gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); gpuDefQueue->eventBegin(MainEngine); - //gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd( - // signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / - // (DeviceQueueMaskSize * maskGroups_)); + gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd( + signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / + (DeviceQueueMaskSize * maskGroups_)); // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call. // Thus TS command for profiling has to follow in the next CB. constexpr bool ForceSubmitFirst = true; @@ -2194,10 +2201,10 @@ VirtualGPU::submitKernelInternal( if (dev().settings().useDeviceQueue_) { // Add the termination handshake to the host queue eventBegin(MainEngine); - //iCmd()->CmdVirtualQueueHandshake(*gpuDefQueue->schedParams_->iMem(), - // vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - // vmParentWrap + offsetof(AmdAqlWrap, child_counter), - // signalAddr, dev().settings().useDeviceQueue_); + iCmd()->CmdVirtualQueueHandshake( + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + signalAddr, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); }