diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index 223e7a9243..abd260762f 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -389,6 +389,61 @@ void ihipStream_t::lockclose_postKernelCommand(const char * kernelName, hc::acce +#if USE_DISPATCH_HSA_KERNEL==0 +// Precursor: the stream is already locked,specifically so this routine can enqueue work into the specified av. +void ihipStream_t::launchModuleKernel( + hc::accelerator_view av, + hsa_signal_t signal, + uint32_t blockDimX, + uint32_t blockDimY, + uint32_t blockDimZ, + uint32_t gridDimX, + uint32_t gridDimY, + uint32_t gridDimZ, + uint32_t groupSegmentSize, + uint32_t privateSegmentSize, + void *kernarg, + size_t kernSize, + uint64_t kernel){ + hsa_status_t status; + void *kern; + + hsa_amd_memory_pool_t *pool = reinterpret_cast(av.get_hsa_kernarg_region()); + status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern); + status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)av.get_hsa_agent(), 0, kern); + memcpy(kern, kernarg, kernSize); + hsa_queue_t *Queue = (hsa_queue_t*)av.get_hsa_queue(); + const uint32_t queue_mask = Queue->size-1; + uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue); + hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]); + + dispatch_packet->completion_signal = signal; + dispatch_packet->workgroup_size_x = blockDimX; + dispatch_packet->workgroup_size_y = blockDimY; + dispatch_packet->workgroup_size_z = blockDimZ; + dispatch_packet->grid_size_x = blockDimX * gridDimX; + dispatch_packet->grid_size_y = blockDimY * gridDimY; + dispatch_packet->grid_size_z = blockDimZ * gridDimZ; + dispatch_packet->group_segment_size = groupSegmentSize; + dispatch_packet->private_segment_size = privateSegmentSize; + dispatch_packet->kernarg_address = kern; + dispatch_packet->kernel_object = kernel; + uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + + uint16_t setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + uint32_t header32 = header | (setup << 16); + + __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE); + + hsa_queue_store_write_index_relaxed(Queue, packet_index + 1); + hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index); +} +#endif + + //============================================================================= // Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted. // The packed _peerAgents can efficiently be used on each memory allocation. diff --git a/hipamd/src/hip_hcc.h b/hipamd/src/hip_hcc.h index 5ab51b5ea3..b01d41be14 100644 --- a/hipamd/src/hip_hcc.h +++ b/hipamd/src/hip_hcc.h @@ -32,6 +32,8 @@ THE SOFTWARE. #error("This version of HIP requires a newer version of HCC."); #endif +#define USE_DISPATCH_HSA_KERNEL 0 +// //--- diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index 2daa251004..606d99f2fd 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -282,6 +282,8 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, grid_launch_parm lp; hStream = ihipPreLaunchKernel(hStream, 0, 0, &lp, f->_kernelName); +#if USE_DISPATCH_HSA_KERNEL + hsa_kernel_dispatch_packet_t aql; memset(&aql, 0, sizeof(aql)); @@ -305,6 +307,31 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize); +#else + + /* + Create signal + */ + + hsa_signal_t signal; + status = hsa_signal_create(1, 0, NULL, &signal); + + + /* + Launch AQL packet + */ + hStream->launchModuleKernel(*lp.av, signal, blockDimX, blockDimY, blockDimZ, + gridDimX, gridDimY, gridDimZ, groupSegmentSize, privateSegmentSize, config[1], kernArgSize, f->_kernel); + + + /* + Wait for signal + */ + + hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + +#endif // USE_DISPATCH_HSA_KERNEL + ihipPostLaunchKernel(f->_kernelName, hStream, lp);