diff --git a/rocclr/runtime/device/gpu/gpudefs.hpp b/rocclr/runtime/device/gpu/gpudefs.hpp index da68bd8600..8a3d271c1a 100644 --- a/rocclr/runtime/device/gpu/gpudefs.hpp +++ b/rocclr/runtime/device/gpu/gpudefs.hpp @@ -57,6 +57,12 @@ const static uint CalOfflineImpl = 0xffffffff; //! Alignment restriciton for the pinned memory const static size_t PinnedMemoryAlignment = 4 * Ki; +//! HSA path specific defines for images +const static uint HsaImageObjectSize = 48; +const static uint HsaImageObjectAlignment = 16; +const static uint HsaSamplerObjectSize = 32; +const static uint HsaSamplerObjectAlignment = 16; + //! Defines all supported ASIC families enum AsicFamilies { Family7xx, diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp index 004fc66b58..159c916109 100644 --- a/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/rocclr/runtime/device/gpu/gpudevice.cpp @@ -909,7 +909,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices) // Allocate SRD manager srdManager_ = new SrdManager(*this, - std::max(HSA_IMAGE_OBJECT_SIZE, HSA_SAMPLER_OBJECT_SIZE), 64 * Ki); + std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki); if (srdManager_ == NULL) { return false; } @@ -2509,7 +2509,7 @@ Sampler::create( if (0 == hwSrd_) { return false; } - dev_.fillHwSampler(oclSamplerState, hwState_, HSA_SAMPLER_OBJECT_SIZE); + dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize); return true; } diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp index 4417275d52..ac166aa1f5 100644 --- a/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/rocclr/runtime/device/gpu/gpukernel.cpp @@ -3791,7 +3791,10 @@ WriteAqlArg( *dst += size; } -HsaAqlDispatchPacket* +const hsa_packet_header_t DISPATCH_PACKET_HEADER = { + HSA_PACKET_TYPE_DISPATCH, 1, HSA_FENCE_SCOPE_SYSTEM, HSA_FENCE_SCOPE_COMPONENT, 0 }; + +hsa_dispatch_packet_t* HSAILKernel::loadArguments( VirtualGPU& gpu, const amd::Kernel& kernel, @@ -3944,7 +3947,7 @@ HSAILKernel::loadArguments( if (dev().settings().hsailDirectSRD_) { // Image arguments are of size 48 bytes and aligned to 16 bytes WriteAqlArg(&aqlArgBuf, image->hwState(), - HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT); + HsaImageObjectSize, HsaImageObjectAlignment); } else { //! \note Special case for the image views. @@ -3952,9 +3955,9 @@ HSAILKernel::loadArguments( //! this view without a wait for SRD resource. if (image->memoryType() == Resource::ImageView) { // Copy the current structre into CB1 - memcpy(aqlStruct, image->hwState(), HSA_IMAGE_OBJECT_SIZE); + memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(HSA_IMAGE_OBJECT_SIZE); + cb->uploadDataToHw(HsaImageObjectSize); // Then use a pointer in aqlArgBuffer to CB1 uint64_t srd = cb->vmAddress() + cb->wrtOffset(); WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); @@ -3983,7 +3986,7 @@ HSAILKernel::loadArguments( (sampler->getDeviceSampler(dev())); if (dev().settings().hsailDirectSRD_) { WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(), - HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT); + HsaSamplerObjectSize, HsaSamplerObjectAlignment); } else { uint64_t srd = gpuSampler->hwSrd(); @@ -4025,46 +4028,41 @@ HSAILKernel::loadArguments( "Size and the number of arguments don't match!"); uint argBufSize = amd::alignUp( static_cast(argsBufferSize()), sizeof(uint32_t)); - HsaAqlDispatchPacket* aqlPkt = reinterpret_cast( + hsa_dispatch_packet_t* hsaDisp = reinterpret_cast( gpu.cb(0)->sysMemCopy() + argBufSize); - amd::NDRange local(sizes.local()); - amd::NDRange global(sizes.global()); - - // Initialize the Global, Local and Offset values - aqlPkt->dimensions = sizes.dimensions(); - // Initialize the work grid parameter - for (uint idx = 0; idx < 3; idx++) { - aqlPkt->grid_size[idx] = 1; - aqlPkt->workgroup_size[idx] = 1; - } + amd::NDRange local(sizes.local()); + const amd::NDRange& global = sizes.global(); // Check if runtime has to find local workgroup size - findLocalWorkSize(aqlPkt->dimensions, global, local); - for (uint idx = 0; idx < aqlPkt->dimensions; idx++) { - aqlPkt->grid_size[idx] = global[idx]; - aqlPkt->workgroup_size[idx] = local[idx]; - } + findLocalWorkSize(sizes.dimensions(), sizes.global(), local); - // Initialize if dispatch should enable profiling - aqlPkt->reserved2 = 0; //config->profile ? 1:0; + hsaDisp->header = DISPATCH_PACKET_HEADER; + hsaDisp->dimensions = sizes.dimensions(); + hsaDisp->reserved = 0; + + hsaDisp->workgroup_size_x = local[0]; + hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1; + hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1; + + hsaDisp->grid_size_x = global[0]; + hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1; + hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1; + hsaDisp->reserved2 = 0; // Initialize kernel ISA and execution buffer requirements - aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress(); - aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize(); - aqlPkt->private_segment_size_bytes = spillSegSize(); - - // Initialize cache flush configuration for the dispatch - //! @todo Currently not used in emulation - aqlPkt->barrier = 1; - aqlPkt->release_fence_scope = 1; - aqlPkt->acquire_fence_scope = 2; - aqlPkt->invalidate_instruction_cache = 1; + hsaDisp->private_segment_size = spillSegSize(); + hsaDisp->group_segment_size = ldsAddress - ldsSize(); + hsaDisp->kernel_object_address = gpuAqlCode()->vmAddress(); ConstBuffer* cb = gpu.constBufs_[0]; - cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket)); + cb->uploadDataToHw(argBufSize + sizeof(hsa_dispatch_packet_t)); uint64_t argList = cb->vmAddress() + cb->wrtOffset(); - aqlPkt->kernel_arg_address = argList; + + hsaDisp->kernarg_address = argList; + hsaDisp->reserved3 = 0; + hsaDisp->completion_signal = 0; + memList.push_back(cb); memList.push_back(gpuAqlCode()); if (NULL != prog().globalStore()) { @@ -4078,7 +4076,7 @@ HSAILKernel::loadArguments( dev().srds().fillResourceList(memList); } - return aqlPkt; + return hsaDisp; } } // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp index 5e5be612a9..36c1b1c42f 100644 --- a/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/rocclr/runtime/device/gpu/gpukernel.hpp @@ -15,7 +15,7 @@ #include "device/gpu/gpuvirtual.hpp" #include "sc-hsa/Interface/SCHSAInterface.h" #include "device/gpu/gpuprintf.hpp" -#include "newcore.h" +#include "hsa.h" //! \namespace gpu GPU Device Implementation namespace gpu { @@ -885,7 +885,7 @@ public: //! Returns AQL packet in CPU memory //! if the kerenl arguments were successfully loaded, otherwise NULL - HsaAqlDispatchPacket* loadArguments( + hsa_dispatch_packet_t* loadArguments( VirtualGPU& gpu, //!< Running GPU context const amd::Kernel& kernel, //!< AMD kernel object const amd::NDRangeContainer& sizes, //!< NDrange container diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp index 9769fafecc..50f1b93391 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.cpp +++ b/rocclr/runtime/device/gpu/gpuprogram.cpp @@ -16,7 +16,8 @@ #include #include "utils/options.hpp" #include "utils/libUtils.h" -#include "newcore.h" +#include "hsa.h" +#include "hsa_ext_image.h" extern "C" bool ACL_API_ENTRY aclHsaLoader( @@ -80,8 +81,8 @@ void GetSamplerObjectParams(uint32_t* size, uint32_t* alignment) { if (GPU_DIRECT_SRD) { - *size = HSA_SAMPLER_OBJECT_SIZE; - *alignment = HSA_SAMPLER_OBJECT_ALIGNMENT; + *size = gpu::HsaSamplerObjectSize; + *alignment = gpu::HsaSamplerObjectAlignment; } else { *size = sizeof(uint64_t); @@ -94,45 +95,46 @@ InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize, uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW) { assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode"); - HsaSamplerFilterType filter = static_cast(fltr); - HsaSamplerAddressMode boundaryU = static_cast(addrU); + hsa_ext_sampler_filter_mode_t filter = + static_cast(fltr); + hsa_ext_sampler_addressing_mode_t boundaryU = + static_cast(addrU); uint32_t state = (unnormalize) ? amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue; - if (filter == HSA_SAMP_FILTER_NEAREST) { + if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) { state |= amd::Sampler::StateFilterNearest; } - else if (filter == HSA_SAMP_FILTER_LINEAR) { + else if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) { state |= amd::Sampler::StateFilterLinear; } switch (boundaryU) { - case HSA_SAMP_ADDRESS_CLAMPEDGE: + case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_EDGE: state |= amd::Sampler::StateAddressClampToEdge; break; - case HSA_SAMP_ADDRESS_CLAMPBORDER: + case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break; - case HSA_SAMP_ADDRESS_WRAP: + case HSA_EXT_SAMPLER_ADDRESSING_REPEAT: state |= amd::Sampler::StateAddressRepeat; break; - case HSA_SAMP_ADDRESS_MIRROR: + case HSA_EXT_SAMPLER_ADDRESSING_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break; - case HSA_SAMP_ADDRESS_MIRRORONCE: - case HSA_SAMP_ADDRESS_NONE: + case HSA_EXT_SAMPLER_ADDRESSING_UNDEFINED: default: break; } gpu::HSAILProgram* prog = reinterpret_cast(userData); if (prog->dev().settings().hsailDirectSRD_) { - char *pCPUbuf = new char[HSA_SAMPLER_OBJECT_SIZE]; + char *pCPUbuf = new char[gpu::HsaSamplerObjectSize]; if (!pCPUbuf) { assert(false); return; } - prog->dev().fillHwSampler(state, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE); - DmaMemoryCopy(userData, offset, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE); + prog->dev().fillHwSampler(state, pCPUbuf, gpu::HsaSamplerObjectSize); + DmaMemoryCopy(userData, offset, pCPUbuf, gpu::HsaSamplerObjectSize); delete pCPUbuf; } else { diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp index 242580dd48..b4eb224633 100644 --- a/rocclr/runtime/device/gpu/gpusched.hpp +++ b/rocclr/runtime/device/gpu/gpusched.hpp @@ -4,7 +4,7 @@ #ifndef GPUSCHED_HPP_ #define GPUSCHED_HPP_ -#include "newcore.h" +#include "hsa.h" namespace gpu { @@ -47,7 +47,7 @@ struct AmdAqlWrap { uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects uint32_t reserved[5]; //!< For the future usage - HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet + hsa_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet }; struct AmdEvent { diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index 272273b476..598fb5c325 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -14,7 +14,7 @@ #include "device/gpu/gputhreadtrace.hpp" #include "device/gpu/gputimestamp.hpp" #include "device/gpu/gpublit.hpp" -#include "newcore.h" +#include "hsa.h" #include "sc-hsa/Interface/SCHSAInterface.h" #include #include @@ -342,7 +342,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize) AmdAqlWrap* slots = reinterpret_cast(&header[1]); for (uint i = 0; i < numSlots; ++i) { uint64_t argStart = vaBase + argOffs + i * singleArgSize; - slots[i].aql.kernel_arg_address = argStart; + slots[i].aql.kernarg_address = argStart; slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64; } // Upload data back to local memory @@ -576,7 +576,8 @@ VirtualGPU::create( bool VirtualGPU::allocHsaQueueMem() { - amd_queue_t queue = {0}; + amd_queue_t queue; + memset(&queue, 0, sizeof(queue)); hsaQueueMem_ = new gpu::Memory(dev(), sizeof(queue)); if (hsaQueueMem_ == NULL) { return false; @@ -1726,7 +1727,7 @@ VirtualGPU::submitKernelInternalHSA( } // Program the kernel arguments for the GPU execution - HsaAqlDispatchPacket* aqlPkt = + hsa_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem, vmDefQueue, &vmParentWrap, memList); if (NULL == aqlPkt) { @@ -1793,12 +1794,12 @@ VirtualGPU::submitKernelInternalHSA( print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n"; } - print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size[0] << ", "; - print << wraps[i].aql.workgroup_size[1] << ", "; - print << wraps[i].aql.workgroup_size[2] << "]\n"; - print << "GridSize[ " << wraps[i].aql.grid_size[0] << ", "; - print << wraps[i].aql.grid_size[1] << ", "; - print << wraps[i].aql.grid_size[2] << "]\n"; + print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; + print << wraps[i].aql.workgroup_size_y << ", "; + print << wraps[i].aql.workgroup_size_z << "]\n"; + print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; + print << wraps[i].aql.grid_size_y << ", "; + print << wraps[i].aql.grid_size_z << "]\n"; uint64_t* kernels = (uint64_t*)( const_cast(hsaKernel.prog().kernelTable())->map(this)); @@ -1819,7 +1820,7 @@ VirtualGPU::submitKernelInternalHSA( printf("Error: couldn't find child kernel!\n"); continue; } - uint offsArg = wraps[i].aql.kernel_arg_address - + uint offsArg = wraps[i].aql.kernarg_address - gpuDefQueue->virtualQueue_->vmAddress(); address argum = gpuDefQueue->virtualQueue_->data() + offsArg; print << "Kernel: " << child->name() << "\n";