Files
rocm-systems/wddm/cmd_util.cpp
T
Flora Cui 79a63cf292 wsl/hsakmt: initial commit
Signed-off-by: lyndonli <Lyndon.Li@amd.com>
Signed-off-by: Horatio Zhang <Hongkun.Zhang@amd.com>
Signed-off-by: Shi.Leslie <Yuliang.Shi@amd.com>
Signed-off-by: LonglongYao <Longlong.Yao@amd.com>
Signed-off-by: tiancyin <tianci.yin@amd.com>
Signed-off-by: Frank Min <Frank.Min@amd.com>
Signed-off-by: Aaron Liu <aaron.liu@amd.com>
Signed-off-by: Shane Xiao <shane.xiao@amd.com>
Signed-off-by: Lang Yu <lang.yu@amd.com>
Signed-off-by: Feifei Xu <Feifei.Xu@amd.com>
Signed-off-by: Ruili Ji <ruiliji2@amd.com>
Signed-off-by: Qiang Yu <qiang.yu@amd.com>
Signed-off-by: Flora Cui <flora.cui@amd.com>
2025-11-05 18:53:15 +08:00

282 lines
10 KiB
C++

/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */
#include "inc/wddm/cmd_util.h"
namespace rocr {
namespace core {
/*
* Builds a COPY_DATA packet that copies data.
*/
size_t CmdUtil::BuildCopyData(
uint64_t *pDstAddr,
void *pBuffer,
uint32_t dstSel,
uint32_t dstCachePolicy,
uint32_t srcSel,
uint32_t srcCachePolicy,
uint32_t countSel,
uint32_t wrConfirm) {
PM4MEC_COPY_DATA copy_data = {0};
GenerateCmdHeader(&copy_data, IT_COPY_DATA);
copy_data.bitfields2.dst_sel = dstSel;
copy_data.bitfields2.src_sel = srcSel;
copy_data.bitfields2.dst_cache_policy = dstCachePolicy;
copy_data.bitfields2.src_cache_policy = srcCachePolicy;
copy_data.bitfields2.count_sel = countSel;
copy_data.bitfields2.wr_confirm = wrConfirm;
copy_data.bitfields5c.dst_64b_addr_lo = (PtrLow32(pDstAddr) >> 3);
copy_data.dst_addr_hi = PtrHigh32(pDstAddr);
memcpy(pBuffer, &copy_data, sizeof(copy_data));
return sizeof(copy_data);
}
/*
* Builds a EVENT_WRITE packet.
* Applications can use Barrier command to ensure their
* command is executed only after all other commands have
* completed their execution.
*/
size_t CmdUtil::BuildBarrier(
void *pBuffer,
uint32_t eventIndex,
uint32_t eventType) {
BarrierTemplate barrier = {0};
GenerateCmdHeader(&barrier.event_write, IT_EVENT_WRITE);
barrier.event_write.bitfields2.event_index = eventIndex;
barrier.event_write.bitfields2.event_type = eventType;
memcpy(pBuffer, &barrier, sizeof(barrier));
return sizeof(barrier);
}
/*
* Builds a ACQUIRE_MEM packet.
* Users can submit this command to
* invalidate Gpu caches - L1 and or L2.
*/
size_t CmdUtil::BuildAcquireMem(
uint8_t major,
void *pBuffer) {
size_t ret;
if (major == 9) {
gfx9::AcquireMemTemplate acq = {0};
GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);
// Specify the size of memory to invalidate. Size is
// specified in terms of 256 byte chunks. A coher_size
// of 0xFFFFFFFF actually specified 0xFFFFFFFF00 (40 bits)
// of memory. The field coher_size_hi specifies memory from
// bits 40-64 for a total of 256 TB.
acq.acquire_mem.coher_size = 0xFFFFFFFF;
acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;
// Specify the address of memory to invalidate. The
// address must be 256 byte aligned.
acq.acquire_mem.coher_base_lo = 0;
acq.acquire_mem.bitfields6.coher_base_hi = 0;
// Specify the poll interval for determing if operation is complete
acq.acquire_mem.bitfields7.poll_interval = 4;
acq.acquire_mem.bitfields2.coher_cntl =
(1 << 29) | // CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK
(1 << 27) | // CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK
(1 << 28); // CP_COHER_CNTL__SH_KCACHE_VOL_ACTION_ENA_MASK
memcpy(pBuffer, &acq, sizeof(acq));
ret = sizeof(acq);
} else if (major >= 10) {
gfx10::AcquireMemTemplate acq = {0};
GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);
acq.acquire_mem.coher_size = 0xFFFFFFFF;
acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;
acq.acquire_mem.coher_base_lo = 0;
acq.acquire_mem.bitfields6.coher_base_hi = 0;
acq.acquire_mem.bitfields7.poll_interval = 4;
acq.acquire_mem.bitfields8.gcr_cntl =
(1 << 16) | // SEQ = FORWARD
(1 << 15) | // GL2_WB
(1 << 14) | // GL2_INV
(1 << 9) | // GL1_INV
(1 << 8) | // GLV_INV
(1 << 7) | // GLK_INV
(1 << 6) | // GLK_WB
(1 << 5) | // GLM_INV
(1 << 4) | // GLM_WB
(1 << 0); // GLI_INV = ALL
memcpy(pBuffer, &acq, sizeof(acq));
ret = sizeof(acq);
}
return ret;
}
/*
* Builds a scratch packet.
*/
size_t CmdUtil::BuildScratch(
void *pScratchBase,
void *pBuffer) {
struct SetScratchTemplate scratch = {0};
GenerateSetShRegHeader(&scratch, mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO);
scratch.scratch_lo = Ptr48Low32(pScratchBase);
scratch.scratch_hi = Ptr48High8(pScratchBase);
memcpy(pBuffer, &scratch, sizeof(scratch));
return sizeof(scratch);
}
/**
* @ Set Compute Shader parameter for gfx11 and above
*/
size_t CmdUtil::BuildComputeShaderParams(void *pBuffer) {
struct DispatchProgramResourceRegs compute_shader_params = {0};
GenerateSetShRegHeader(&compute_shader_params, mmCOMPUTE_PGM_RSRC3);
// IMAGE_OP: Indicates the compute program contains an image op
// instruction and should be stalled by its WAIT_SYNC fence.
compute_shader_params.compute_pgm_rsrc3 = (1 << 31);
memcpy(pBuffer, &compute_shader_params, sizeof(compute_shader_params));
return sizeof(compute_shader_params);
}
/*
* Builds a dispatch packet.
*/
size_t CmdUtil::BuildDispatch(
struct DispatchInfo *pInfo,
void *pBuffer) {
DispatchTemplate dispatch = {0};
GenerateSetShRegHeader(&dispatch.dimension_regs, mmCOMPUTE_NUM_THREAD_X);
dispatch.dimension_regs.compute_num_thread_x = pInfo->pPacket->workgroup_size_x;
dispatch.dimension_regs.compute_num_thread_y = pInfo->pPacket->workgroup_size_y;
dispatch.dimension_regs.compute_num_thread_z = pInfo->pPacket->workgroup_size_z;
// TODO: Add AQL packet index for debugger
// Debugger requires AQL packet index in COMPUTE_DISPATCH_PKT_ADDR_LO
GenerateSetShRegHeader(&dispatch.program_regs, mmCOMPUTE_PGM_LO);
dispatch.program_regs.compute_pgm_lo = Ptr48Low32(pInfo->pEntry);
dispatch.program_regs.compute_pgm_hi = Ptr48High8(pInfo->pEntry);
GenerateSetShRegHeader(&dispatch.program_resource_regs, mmCOMPUTE_PGM_RSRC1);
dispatch.program_resource_regs.compute_pgm_rsrc1 = pInfo->pKernelObject->compute_pgm_rsrc1;
dispatch.program_resource_regs.compute_pgm_rsrc2 =
(pInfo->ldsBlks << 15) | pInfo->pKernelObject->compute_pgm_rsrc2;
GenerateSetShRegHeader(&dispatch.resource_regs, mmCOMPUTE_RESOURCE_LIMITS);
dispatch.resource_regs.compute_resource_limits = 0x3ff;
dispatch.resource_regs.compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
dispatch.resource_regs.compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
dispatch.resource_regs.compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
dispatch.resource_regs.compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
dispatch.resource_regs.compute_tmpring_size = pInfo->pAmdQueue->compute_tmpring_size;
GenerateSetShRegHeader(&dispatch.compute_user_data_regs, mmCOMPUTE_USER_DATA_0);
uint32_t sgpr_no = 0;
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
assert(pInfo->major < 11);
pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) +
sgpr_no * sizeof(uint32_t);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->pAmdQueue->scratch_resource_descriptor[0];
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->pAmdQueue->scratch_resource_descriptor[1];
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->pAmdQueue->scratch_resource_descriptor[2];
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->srd;
}
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR)) {
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pPacket);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pPacket);
}
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pAmdQueue);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pAmdQueue);
}
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
PtrLow32(pInfo->pPacket->kernarg_address);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
PtrHigh32(pInfo->pPacket->kernarg_address);
}
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) {
// This feature may be enabled as a side effect of indirect calls.
// However, the compiler team confirmed that the dispatch id itself is not used,
// so safe to send 0 for each dispatch.
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = NULL;
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = NULL;
}
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) {
assert(pInfo->major < 11);
pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) +
sgpr_no * sizeof(uint32_t);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
PtrLow32(pInfo->pScratchBase);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
PtrHigh32(pInfo->pScratchBase);
}
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)) {
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->scratchSizePerWave / (pInfo->wave32 ? 32 : 64);
}
GenerateCmdHeader(&dispatch.dispatch_direct, IT_DISPATCH_DIRECT);
dispatch.dispatch_direct.dispatch_initiator =
(1 << 0) | // COMPUTE_SHADER_EN
(1 << 2) | // FORCE_START_AT_000
(1 << 5); // USE_THREAD_DIMENSIONS
if (pInfo->wave32) dispatch.dispatch_direct.dispatch_initiator |= (1 << 15); // CS_W32_EN
dispatch.dispatch_direct.dim_x = pInfo->pPacket->grid_size_x;
dispatch.dispatch_direct.dim_y = pInfo->pPacket->grid_size_y;
dispatch.dispatch_direct.dim_z = pInfo->pPacket->grid_size_z;
memcpy(pBuffer, &dispatch, sizeof(dispatch));
return sizeof(dispatch);
}
/*
* Builds a ATOMIC_MEM packet.
* Users can submit this command
* to perform atomic operations.
*/
size_t CmdUtil::BuildAtomicMem(
uint64_t *pAddr,
uint32_t atomic,
void *pBuffer,
uint32_t cachePolicy,
uint64_t srcData) {
AtomicTemplate atom = {0};
GenerateCmdHeader(&atom.atomic, IT_ATOMIC_MEM);
atom.atomic.addr_lo = PtrLow32(pAddr);
atom.atomic.addr_hi = PtrHigh32(pAddr);
atom.atomic.bitfields2.atomic = atomic;
atom.atomic.bitfields2.cache_policy = cachePolicy;
atom.atomic.src_data_lo = LowPart(srcData);
atom.atomic.src_data_hi = HighPart(srcData);
memcpy(pBuffer, &atom, sizeof(atom));
return sizeof(atom);
}
} // namespace core
} // namespace rocr