From 416f273e94914d6479f0a58b31fe2649de8e673b Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 26 Nov 2019 22:44:29 -0500 Subject: [PATCH] P4 to Git Change 2037301 by ssahasra@ssahasra-hip-vdi on 2019/11/26 22:42:25 SWDEV-204782 - introduce hostcall Hostcall is a service that allows a kernel to submit requests to the host using shared buffers, and block until a response is received. This will eventually replace the shared buffer currently used for printf, and repurposes the same hidden kernel argument. When the runtime launches a kernel that requires the hostcall service it performs the following actions: - Launch a hostcall listener thread if it is not already running. - Locate the hostcall buffer for the corresponding hardware queue, or create a new one. - Register the new hostcall buffer with the listener thread. - Set the hostcall buffer pointer as an implicit argument to the kernel. Affected files ... ... //depot/stg/opencl/drivers/opencl/make/hip.git/tests/Makefile#21 edit ... //depot/stg/opencl/drivers/opencl/make/hip.git/tests/build/Makefile.hip_tests#31 edit ... //depot/stg/opencl/drivers/opencl/make/hip.git/tests/scripts/hip_hostcall_tests.txt#1 add ... //depot/stg/opencl/drivers/opencl/make/hip.git/tests/scripts/run_all_tests.sh#22 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#30 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#19 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#143 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#45 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rochostcall.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rochostcall.hpp#1 add ... 
//depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#92 edit --- rocclr/runtime/device/devkernel.cpp | 6 + rocclr/runtime/device/devkernel.hpp | 9 +- rocclr/runtime/device/rocm/rocdevice.cpp | 44 +++ rocclr/runtime/device/rocm/rocdevice.hpp | 5 + rocclr/runtime/device/rocm/rochostcall.cpp | 404 +++++++++++++++++++++ rocclr/runtime/device/rocm/rochostcall.hpp | 60 +++ rocclr/runtime/device/rocm/rocvirtual.cpp | 14 + 7 files changed, 539 insertions(+), 3 deletions(-) create mode 100644 rocclr/runtime/device/rocm/rochostcall.cpp create mode 100644 rocclr/runtime/device/rocm/rochostcall.hpp diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp index 1664582d96..e55c082a35 100644 --- a/rocclr/runtime/device/devkernel.cpp +++ b/rocclr/runtime/device/devkernel.cpp @@ -746,6 +746,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH case ValueKind::HiddenPrintfBuffer: *isHidden = true; return amd::KernelParameterDescriptor::HiddenPrintfBuffer; + case ValueKind::HiddenHostcallBuffer: + *isHidden = true; + return amd::KernelParameterDescriptor::HiddenHostcallBuffer; case ValueKind::HiddenDefaultQueue: *isHidden = true; return amd::KernelParameterDescriptor::HiddenDefaultQueue; @@ -779,6 +782,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) { return amd::KernelParameterDescriptor::HiddenPrintfBuffer; } + else if (strcmp(&argInfo->argStr[2], "hostcall_buffer") == 0) { + return amd::KernelParameterDescriptor::HiddenHostcallBuffer; + } else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) { return amd::KernelParameterDescriptor::HiddenDefaultQueue; } diff --git a/rocclr/runtime/device/devkernel.hpp b/rocclr/runtime/device/devkernel.hpp index 91b10fb654..0980537333 100644 --- a/rocclr/runtime/device/devkernel.hpp +++ b/rocclr/runtime/device/devkernel.hpp @@ -111,7 +111,8 @@ static const std::map 
ArgValueKind = {"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer}, {"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue}, {"HiddenCompletionAction", ValueKind::HiddenCompletionAction}, - {"HiddenMultigridSyncArg", ValueKind::HiddenMultiGridSyncArg} + {"HiddenMultigridSyncArg", ValueKind::HiddenMultiGridSyncArg}, + {"HiddenHostcallBuffer", ValueKind::HiddenHostcallBuffer}, }; static const std::map ArgValueType = @@ -225,7 +226,8 @@ static const std::map ArgValueKindV3 = {"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer}, {"hidden_default_queue", ValueKind::HiddenDefaultQueue}, {"hidden_completion_action", ValueKind::HiddenCompletionAction}, - {"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg} + {"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg}, + {"hidden_hostcall_buffer", ValueKind::HiddenHostcallBuffer}, }; static const std::map ArgValueTypeV3 = @@ -320,7 +322,8 @@ struct KernelParameterDescriptor { ImageObject = 11, SamplerObject = 12, QueueObject = 13, - HiddenMultiGridSync = 14 + HiddenMultiGridSync = 14, + HiddenHostcallBuffer = 15, }; clk_value_type_t type_; //!< The parameter's type size_t offset_; //!< Its offset in the parameter's stack diff --git a/rocclr/runtime/device/rocm/rocdevice.cpp b/rocclr/runtime/device/rocm/rocdevice.cpp index 7bff422b40..17b205f81c 100644 --- a/rocclr/runtime/device/rocm/rocdevice.cpp +++ b/rocclr/runtime/device/rocm/rocdevice.cpp @@ -28,6 +28,8 @@ #include "pro/prodriver.hpp" #endif #include "platform/sampler.hpp" +#include "rochostcall.hpp" + #include #include #include @@ -1916,10 +1918,52 @@ void Device::releaseQueue(hsa_queue_t* queue) { } ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); + if (qInfo.hostcallBuffer_) { + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p", + qInfo.hostcallBuffer_, queue); + disableHostcalls(qInfo.hostcallBuffer_, queue); + context().svmFree(qInfo.hostcallBuffer_); + } + + 
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); hsa_queue_destroy(queue); queuePool_.erase(qIter); } +void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) { + auto qIter = queuePool_.find(queue); + assert(qIter != queuePool_.end()); + + auto& qInfo = qIter->second; + if (qInfo.hostcallBuffer_) { + return qInfo.hostcallBuffer_; + } + + // The number of packets required in each buffer is at least equal to the + // maximum number of waves supported by the device. + auto wavesPerCu = info().maxThreadsPerCU_ / info().wavefrontWidth_; + auto numPackets = info().maxComputeUnits_ * wavesPerCu; + + auto size = getHostcallBufferSize(numPackets); + auto align = getHostcallBufferAlignment(); + + void* buffer = context().svmAlloc(size, align, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS); + if (!buffer) { + ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, + "Failed to create hostcall buffer for hardware queue %p", queue); + return nullptr; + } + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Created hostcall buffer %p for hardware queue %p", buffer, + queue); + qInfo.hostcallBuffer_ = buffer; + if (!enableHostcalls(buffer, numPackets, queue)) { + ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to register hostcall buffer %p with listener", + buffer); + return nullptr; + } + return buffer; +} + bool Device::findLinkTypeAndHopCount(amd::Device* other_device, uint32_t* link_type, uint32_t* hop_count) { hsa_amd_memory_pool_link_info_t link_info; diff --git a/rocclr/runtime/device/rocm/rocdevice.hpp b/rocclr/runtime/device/rocm/rocdevice.hpp index b177fdfa1c..4fe0eea6d1 100644 --- a/rocclr/runtime/device/rocm/rocdevice.hpp +++ b/rocclr/runtime/device/rocm/rocdevice.hpp @@ -417,6 +417,10 @@ class Device : public NullDevice { //! Release HSA queue void releaseQueue(hsa_queue_t*); + //! For the given HSA queue, return an existing hostcall buffer or create a + //! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer. 
+ void* getOrCreateHostcallBuffer(hsa_queue_t* queue); + //! Return multi GPU grid launch sync buffer address MGSync() const { return mg_sync_; } @@ -458,6 +462,7 @@ class Device : public NullDevice { struct QueueInfo { int refCount; + void* hostcallBuffer_; }; std::map queuePool_; //!< Pool of HSA queues for recycling diff --git a/rocclr/runtime/device/rocm/rochostcall.cpp b/rocclr/runtime/device/rocm/rochostcall.cpp new file mode 100644 index 0000000000..e3f725b794 --- /dev/null +++ b/rocclr/runtime/device/rocm/rochostcall.cpp @@ -0,0 +1,404 @@ +// +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// + +#include "runtime/utils/debug.hpp" +#include "runtime/top.hpp" +#include "runtime/utils/flags.hpp" + +#include "rochostcall.hpp" + +#include "os/os.hpp" +#include "thread/monitor.hpp" +#include "utils/util.hpp" + +#include + +#include +#include + +namespace { // anonymous + +enum ServiceID { + SERVICE_RESERVED = 0, + SERVICE_FUNCTION_CALL, +}; + +enum SignalValue { SIGNAL_DONE = 0, SIGNAL_INIT = 1 }; + +/** \brief Packet payload + * + * Contains 64 slots of 8 ulongs each, one for each workitem in the + * wave. A slot with index \c i contains valid data if the + * corresponding bit in PacketHeader::activemask is set. + */ +struct Payload { + uint64_t slots[64][8]; +}; + +/** Packet header */ +struct PacketHeader { + /** Tagged pointer to the next packet in an intrusive stack */ + uint64_t next_; + /** Bitmask that represents payload slots with valid data */ + uint64_t activemask_; + /** Service ID requested by the wave */ + uint32_t service_; + /** Control bits. + * \li 0: \c READY flag. Indicates packet awaiting a host response. 
+ */ + uint32_t control_; +}; + +static_assert(std::is_standard_layout::value, + "the hostcall packet must be useable from other languages"); + +/** Field offsets in the packet control field */ +enum ControlOffset { + CONTROL_OFFSET_READY_FLAG = 0, + CONTROL_OFFSET_RESERVED0 = 1, +}; + +/** Field widths in the packet control field */ +enum ControlWidth { + CONTROL_WIDTH_READY_FLAG = 1, + CONTROL_WIDTH_RESERVED0 = 31, +}; + +/** \brief Shared buffer submitting hostcall requests. + * + * Holds hostcall packets requested by all kernels executing on the + * same device queue. Each hostcall buffer is associated with at most + * one device queue. + * + * Packets in the buffer are accessed using 64-bit tagged pointers to mitigate + * the ABA problem in lock-free stacks. The index_mask is used to extract the + * lower bits of the pointer, which form the index into the packet array. The + * remaining higher bits define a tag that is incremented on every pop from a + * stack. + */ +class HostcallBuffer { + /** Array of packet headers */ + PacketHeader* headers_; + /** Array of packet payloads */ + Payload* payloads_; + /** Signal used by kernels to indicate new work */ + hsa_signal_t doorbell_; + /** Stack of free packets. Uses tagged pointers. */ + uint64_t free_stack_; + /** Stack of ready packets. Uses tagged pointers */ + uint64_t ready_stack_; + /** Mask for accessing the packet index in the tagged pointer. 
*/ + uint64_t index_mask_; + + PacketHeader* getHeader(uint64_t ptr) const; + Payload* getPayload(uint64_t ptr) const; + + public: + void processPackets(); + void initialize(uint32_t num_packets); + void setDoorbell(hsa_signal_t doorbell) { doorbell_ = doorbell; }; +}; + +static_assert(std::is_standard_layout::value, + "the hostcall buffer must be useable from other languages"); + +}; // namespace + +PacketHeader* HostcallBuffer::getHeader(uint64_t ptr) const { + return headers_ + (ptr & index_mask_); +} + +Payload* HostcallBuffer::getPayload(uint64_t ptr) const { + return payloads_ + (ptr & index_mask_); +} + +static uint32_t setControlField(uint32_t control, uint8_t offset, uint8_t width, uint32_t value) { + uint32_t mask = ~(((1 << width) - 1) << offset); + control &= mask; + return control | (value << offset); +} + +static uint32_t resetReadyFlag(uint32_t control) { + return setControlField(control, CONTROL_OFFSET_READY_FLAG, CONTROL_WIDTH_READY_FLAG, 0); +} + +/** \brief Signature for pointer accepted by the function call service. + * \param output Pointer to output arguments. + * \param input Pointer to input arguments. + * + * The function can accept up to seven 64-bit arguments via the + * #input pointer, and can produce up to two 64-bit arguments via the + * #output pointer. The contents of these arguments are defined by + * the function being invoked. 
+ */ +typedef void (*HostcallFunctionCall)(uint64_t* output, const uint64_t* input); + +static void handleFunctionCall(void* state, uint32_t service, uint64_t* payload) { + uint64_t output[2]; + + auto fptr = reinterpret_cast(payload[0]); + fptr(output, payload + 1); + memcpy(payload, output, sizeof(output)); +} + +static bool handlePayload(uint32_t service, uint64_t* payload) { + switch (service) { + case SERVICE_FUNCTION_CALL: + handleFunctionCall(nullptr, service, payload); + return true; + break; + default: + ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: no handler found for service ID \"%d\".", + service); + amd::report_fatal(__FILE__, __LINE__, "Hostcall service not supported."); + return false; + break; + } +} + +void HostcallBuffer::processPackets() { + // Grab the entire ready stack and set the top to 0. New requests from the + // device will continue pushing on the stack while we process the packets that + // we have grabbed. + uint64_t ready_stack = __atomic_exchange_n(&ready_stack_, 0, std::memory_order_acquire); + if (!ready_stack) { + return; + } + + // Each wave can submit at most one packet at a time. The ready stack cannot + // contain multiple packets from the same wave, so consuming ready packets in + // a latest-first order does not affect ordering of hostcall within a wave. + for (decltype(ready_stack) iter = ready_stack, next = 0; iter; iter = next) { + auto header = getHeader(iter); + // Remember the next packet pointer, because we will no longer own the + // current packet at the end of this loop. 
+ next = header->next_; + + auto service = header->service_; + auto payload = getPayload(iter); + auto activemask = header->activemask_; + while (activemask) { + auto wi = amd::leastBitSet(activemask); + activemask ^= static_cast(1) << wi; + auto slot = payload->slots[wi]; + handlePayload(service, slot); + } + + __atomic_store_n(&header->control_, resetReadyFlag(header->control_), + std::memory_order_release); + } +} + +static uintptr_t getHeaderStart() { + return amd::alignUp(sizeof(HostcallBuffer), alignof(PacketHeader)); +} + +static uintptr_t getPayloadStart(uint32_t num_packets) { + auto header_start = getHeaderStart(); + auto header_end = header_start + sizeof(PacketHeader) * num_packets; + return amd::alignUp(header_end, alignof(Payload)); +} + +size_t getHostcallBufferSize(uint32_t num_packets) { + size_t buffer_size = getPayloadStart(num_packets); + buffer_size += num_packets * sizeof(Payload); + return buffer_size; +} + +uint32_t getHostcallBufferAlignment() { return alignof(Payload); } + +static uint64_t getIndexMask(uint32_t num_packets) { + // The number of packets is at least equal to the maximum number of waves + // supported by the device. That means we do not need to account for the + // border cases where num_packets is zero or one. + assert(num_packets > 1); + if (!amd::isPowerOfTwo(num_packets)) { + num_packets = amd::nextPowerOfTwo(num_packets); + } + return num_packets - 1; +} + +void HostcallBuffer::initialize(uint32_t num_packets) { + auto base = reinterpret_cast(this); + headers_ = reinterpret_cast((base + getHeaderStart())); + payloads_ = reinterpret_cast((base + getPayloadStart(num_packets))); + index_mask_ = getIndexMask(num_packets); + + // The null pointer is identical to (uint64_t)0. When using tagged pointers, + // the tag and the index part of the array must never be zero at the same + // time. In the initialized free stack, headers[1].next points to headers[0], + // which has index 0. We initialize this pointer to have a tag of 1. 
+ uint64_t next = index_mask_ + 1; + + // Initialize the free stack. + headers_[0].next_ = 0; + for (uint32_t ii = 1; ii != num_packets; ++ii) { + headers_[ii].next_ = next; + next = ii; + } + free_stack_ = next; + ready_stack_ = 0; +} + +/** \brief Manage a unique listener thread and its associated buffers. + */ +class HostcallListener { + std::set buffers_; + hsa_signal_t doorbell_; + + class Thread : public amd::Thread { + public: + Thread() : amd::Thread("Hostcall Listener Thread", CQ_THREAD_STACK_SIZE) {} + + //! The hostcall listener thread entry point. + void run(void* data) { + auto listener = reinterpret_cast(data); + listener->consumePackets(); + } + } thread_; //!< The hostcall listener thread. + + void consumePackets(); + + public: + /** \brief Add a buffer to the listener. + * + * Behaviour is undefined if: + * - hostcall_initialize_buffer() was not invoked successfully on + * the buffer prior to registration. + * - The same buffer is registered with multiple listeners. + * - The same buffer is associated with more than one hardware queue. + */ + void addBuffer(HostcallBuffer* buffer); + + /** \brief Remove a buffer that is no longer in use. + * + * The buffer can be reused after removal. Behaviour is undefined if the + * buffer is freed without first removing it. + */ + void removeBuffer(HostcallBuffer* buffer); + + /* \brief Return true if no buffers are registered. 
+ */ + bool idle() const { + return buffers_.empty(); + } + + void terminate(); + bool initialize(); +}; + +HostcallListener* hostcallListener = nullptr; +amd::Monitor listenerLock("Hostcall listener lock"); + +void HostcallListener::consumePackets() { + uint64_t signal_value = SIGNAL_INIT; + uint64_t timeout = 1024 * 1024; + + while (true) { + while (true) { + uint64_t new_value = hsa_signal_wait_acquire(doorbell_, HSA_SIGNAL_CONDITION_NE, signal_value, timeout, + HSA_WAIT_STATE_BLOCKED); + if (new_value != signal_value) { + signal_value = new_value; + break; + } + } + + if (signal_value == SIGNAL_DONE) { + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Hostcall listener received SIGNAL_DONE"); + return; + } + + amd::ScopedLock lock{listenerLock}; + + for (auto ii : buffers_) { + ii->processPackets(); + } + } + + return; +} + +void HostcallListener::terminate() { + if (!amd::Os::isThreadAlive(thread_)) { + return; + } + + hsa_signal_store_release(doorbell_, SIGNAL_DONE); + + // FIXME_lmoriche: fix termination handshake + while (thread_.state() < Thread::FINISHED) { + amd::Os::yield(); + } + + hsa_signal_destroy(doorbell_); +} + +void HostcallListener::addBuffer(HostcallBuffer* buffer) { + assert(buffers_.count(buffer) == 0 && "buffer already present"); + buffer->setDoorbell(doorbell_); + buffers_.insert(buffer); +} + +void HostcallListener::removeBuffer(HostcallBuffer* buffer) { + assert(buffers_.count(buffer) != 0 && "unknown buffer"); + buffers_.erase(buffer); +} + +bool HostcallListener::initialize() { + auto status = hsa_signal_create(SIGNAL_INIT, 0, NULL, &doorbell_); + if (status != HSA_STATUS_SUCCESS) { + return false; + } + + // If the listener thread was not successfully initialized, clean + // everything up and bail out. 
+ if (thread_.state() < Thread::INITIALIZED) { + hsa_signal_destroy(doorbell_); + return false; + } + + thread_.start(this); + return true; +} + +bool enableHostcalls(void* bfr, uint32_t numPackets, const void* queue) { + auto buffer = reinterpret_cast(bfr); + buffer->initialize(numPackets); + + amd::ScopedLock lock(listenerLock); + if (!hostcallListener) { + hostcallListener = new HostcallListener(); + if (!hostcallListener->initialize()) { + ClPrint(amd::LOG_ERROR, (amd::LOG_INIT | amd::LOG_QUEUE | amd::LOG_RESOURCE), + "Failed to launch hostcall listener"); + delete hostcallListener; + hostcallListener = nullptr; + return false; + } + ClPrint(amd::LOG_INFO, (amd::LOG_INIT | amd::LOG_QUEUE | amd::LOG_RESOURCE), + "Launched hostcall listener at %p", hostcallListener); + } + hostcallListener->addBuffer(buffer); + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Registered hostcall buffer %p with listener %p", buffer, + hostcallListener); + return true; +} + +void disableHostcalls(void* bfr, const void* queue) { + amd::ScopedLock lock(listenerLock); + if (!hostcallListener) { + return; + } + assert(bfr && "expected a hostcall buffer"); + auto buffer = reinterpret_cast(bfr); + hostcallListener->removeBuffer(buffer); + + if (hostcallListener->idle()) { + hostcallListener->terminate(); + delete hostcallListener; + hostcallListener = nullptr; + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Terminated hostcall listener"); + } +} diff --git a/rocclr/runtime/device/rocm/rochostcall.hpp b/rocclr/runtime/device/rocm/rochostcall.hpp new file mode 100644 index 0000000000..c4501f039a --- /dev/null +++ b/rocclr/runtime/device/rocm/rochostcall.hpp @@ -0,0 +1,60 @@ +// +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// + +#pragma once + +/** \file Support for invoking host services from the device. + * + * A hostcall is a fixed-size request generated by a kernel running + * on the device, for some predefined service provided by the + * host. 
The life-cycle of a hostcall is as follows: + * + * 1. A workitem in some kernel dispatch submits a request as a + * "packet" in a "hostcall buffer". The workitem blocks until it + * receives a response from the host. + * + * 2. A host thread called the "hostcall listener" notices the packet + * and invokes the desired service on the host. + * + * 3. When the service completes, the listener copies the response + * into the request packet. This unblocks the workitem, and the + * hostcall is said to be completed. + * + * The hostcall listeners and buffers are managed by the VDI + * runtime. The typical flow is as follows: + * + * - Create and launch one or more hostcall listeners. + * + * - Create and initialize a distinct hostcall buffer for each + * command queue in hardware (e.g., an hsa_queue_t on ROCm). + * + * - Register this buffer with the appropriate listener. + * + * - When a buffer is no longer used, deregister and then free + * it. This usually happens when the corresponding hardware queue + * is freed. + * + * - Destroy the listener(s) when they are no longer required. This must be + * done before exiting the application, so that the listener + * threads can join() correctly. + * + * A single listener is sufficient to correctly handle all hostcall + * buffers created in the application. The client may also launch + * multiple listeners, as long as the same hostcall buffer is not + * registered with multiple listeners. + */ + +/** \brief Determine the buffer size to be allocated + * \param num_packets Number of packets to be supported. + * \return Required size, including any internal padding required for + * the packets and their headers. + */ +size_t getHostcallBufferSize(uint32_t num_packets); + +/** \brief Return the required alignment for a hostcall buffer. 
+ */ +uint32_t getHostcallBufferAlignment(void); + +bool enableHostcalls(void* buffer, uint32_t numPackets, const void* queue); +void disableHostcalls(void* buffer, const void* queue); diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp index f23cde6f16..1b1de74087 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -13,6 +13,7 @@ #include "platform/command.hpp" #include "platform/memory.hpp" #include "platform/sampler.hpp" +#include "rochostcall.hpp" #include "utils/debug.hpp" #include "os/os.hpp" #include "amd_hsa_kernel_code.h" @@ -2075,6 +2076,19 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } break; } + case amd::KernelParameterDescriptor::HiddenHostcallBuffer: { + if (amd::IS_HIP) { + auto buffer = roc_device_.getOrCreateHostcallBuffer(gpu_queue_); + if (!buffer) { + ClPrint(amd::LOG_ERROR, amd::LOG_KERN, + "Kernel expects a hostcall buffer, but none found"); + return false; + } + assert(it.size_ == sizeof(buffer) && "check the sizes"); + WriteAqlArgAt(const_cast
(parameters), &buffer, it.size_, it.offset_); + } + break; + } case amd::KernelParameterDescriptor::HiddenDefaultQueue: { uint64_t vqVA = 0; amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());