From 91b247014feb8277e2b9e010456ef3be3181d430 Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 11 Nov 2014 16:27:13 -0500 Subject: [PATCH] P4 to Git Change 1095935 by bwicakso@opencl-hsa-stg-bwicakso on 2014/11/11 16:00:10 ECR #333755 - Part 2- Update to foundation spec 1.0 20141019: - hsa_dispatch_packet_t now becomes hsa_kernel_dispatch_packet_t - all bit mask in a struct are removed and replaced by enums that indicates the bit position and width. Test: TC precheckin Review: Hari, Fan, Shucai, German, Yunjun. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#268 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#103 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#15 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#338 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsavirtual.cpp#25 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsavirtual.hpp#12 edit [ROCm/clr commit: c7988f7209ac33cc1c0505d40afae525e0d4025a] --- .../rocclr/runtime/device/gpu/gpukernel.cpp | 25 +++++++++++-------- .../rocclr/runtime/device/gpu/gpukernel.hpp | 2 +- .../rocclr/runtime/device/gpu/gpusched.hpp | 2 +- .../rocclr/runtime/device/gpu/gpuvirtual.cpp | 10 +++++--- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp index ac166aa1f5..58e4358b15 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp @@ -3791,10 +3791,13 @@ WriteAqlArg( *dst += size; } -const hsa_packet_header_t DISPATCH_PACKET_HEADER = { - HSA_PACKET_TYPE_DISPATCH, 1, HSA_FENCE_SCOPE_SYSTEM, HSA_FENCE_SCOPE_COMPONENT, 0 }; +const uint16_t kDispatchPacketHeader = + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_COMPONENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -hsa_dispatch_packet_t* +hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( VirtualGPU& gpu, const amd::Kernel& kernel, @@ -4028,7 +4031,8 @@ HSAILKernel::loadArguments( "Size and the number of arguments don't match!"); uint argBufSize = amd::alignUp( static_cast(argsBufferSize()), sizeof(uint32_t)); - hsa_dispatch_packet_t* hsaDisp = reinterpret_cast( + hsa_kernel_dispatch_packet_t* hsaDisp = + reinterpret_cast( gpu.cb(0)->sysMemCopy() + argBufSize); amd::NDRange local(sizes.local()); @@ -4037,9 +4041,8 @@ HSAILKernel::loadArguments( // Check if runtime has to find local workgroup size findLocalWorkSize(sizes.dimensions(), sizes.global(), local); - hsaDisp->header = DISPATCH_PACKET_HEADER; - hsaDisp->dimensions = sizes.dimensions(); - hsaDisp->reserved = 0; + hsaDisp->header = kDispatchPacketHeader; + hsaDisp->setup = sizes.dimensions(); hsaDisp->workgroup_size_x = local[0]; hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1; @@ -4053,14 +4056,14 @@ HSAILKernel::loadArguments( // Initialize kernel ISA and execution buffer requirements hsaDisp->private_segment_size = spillSegSize(); hsaDisp->group_segment_size = ldsAddress - ldsSize(); - hsaDisp->kernel_object_address = gpuAqlCode()->vmAddress(); + hsaDisp->kernel_object = gpuAqlCode()->vmAddress(); ConstBuffer* cb = gpu.constBufs_[0]; - cb->uploadDataToHw(argBufSize + sizeof(hsa_dispatch_packet_t)); + cb->uploadDataToHw(argBufSize + sizeof(hsa_kernel_dispatch_packet_t)); uint64_t argList = cb->vmAddress() + cb->wrtOffset(); - hsaDisp->kernarg_address = argList; - hsaDisp->reserved3 = 0; + hsaDisp->kernarg_address = reinterpret_cast(argList); + hsaDisp->reserved2 = 0; hsaDisp->completion_signal = 0; memList.push_back(cb); diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp index 36c1b1c42f..2c2d5a15b6 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp @@ -885,7 +885,7 @@ public: //! Returns AQL packet in CPU memory //! if the kerenl arguments were successfully loaded, otherwise NULL - hsa_dispatch_packet_t* loadArguments( + hsa_kernel_dispatch_packet_t* loadArguments( VirtualGPU& gpu, //!< Running GPU context const amd::Kernel& kernel, //!< AMD kernel object const amd::NDRangeContainer& sizes, //!< NDrange container diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp b/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp index b4eb224633..01985190cd 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp @@ -47,7 +47,7 @@ struct AmdAqlWrap { uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects uint32_t reserved[5]; //!< For the future usage - hsa_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet + hsa_kernel_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet }; struct AmdEvent { diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp index b2426250a9..fb2d38f007 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -347,7 +347,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) AmdAqlWrap* slots = reinterpret_cast(&header[1]); for (uint i = 0; i < numSlots; ++i) { uint64_t argStart = vaBase + argOffs + i * singleArgSize; - slots[i].aql.kernarg_address = argStart; + slots[i].aql.kernarg_address = reinterpret_cast(argStart); slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64; } // Upload data back to local memory @@ -1735,7 +1735,7 @@ VirtualGPU::submitKernelInternalHSA( } // Program the kernel arguments for the GPU execution - hsa_dispatch_packet_t* aqlPkt = + hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem, vmDefQueue, &vmParentWrap, memList); if (NULL == aqlPkt) { @@ -1812,7 +1812,7 @@ VirtualGPU::submitKernelInternalHSA( uint64_t* kernels = (uint64_t*)( const_cast(hsaKernel.prog().kernelTable())->map(this)); for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { - if (kernels[j] == wraps[i].aql.kernel_object_address) { + if (kernels[j] == wraps[i].aql.kernel_object) { break; } } @@ -1828,7 +1828,9 @@ VirtualGPU::submitKernelInternalHSA( printf("Error: couldn't find child kernel!\n"); continue; } - uint offsArg = wraps[i].aql.kernarg_address - + const uint64_t kernarg_address = + static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); + uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress(); address argum = gpuDefQueue->virtualQueue_->data() + offsArg; print << "Kernel: " << child->name() << "\n";