From 416f273e94914d6479f0a58b31fe2649de8e673b Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 26 Nov 2019 22:44:29 -0500
Subject: [PATCH] P4 to Git Change 2037301 by ssahasra@ssahasra-hip-vdi on
2019/11/26 22:42:25
SWDEV-204782 - introduce hostcall
Hostcall is a service that allows a kernel to submit requests to the
host using shared buffers, and block until a response is received. This
will eventually replace the shared buffer currently used for printf, and
repurposes the same hidden kernel argument.
When the runtime launches a kernel that requires the hostcall service it
performs the following actions:
- Launch a hostcall listener thread if it is not already running.
- Locate the hostcall buffer for the corresponding hardware queue, or
create a new one.
- Register the new hostcall buffer with the listener thread.
- Set the hostcall buffer pointer as an implicit argument to the kernel.
Affected files ...
... //depot/stg/opencl/drivers/opencl/make/hip.git/tests/Makefile#21 edit
... //depot/stg/opencl/drivers/opencl/make/hip.git/tests/build/Makefile.hip_tests#31 edit
... //depot/stg/opencl/drivers/opencl/make/hip.git/tests/scripts/hip_hostcall_tests.txt#1 add
... //depot/stg/opencl/drivers/opencl/make/hip.git/tests/scripts/run_all_tests.sh#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#143 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#45 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rochostcall.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rochostcall.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#92 edit
---
rocclr/runtime/device/devkernel.cpp | 6 +
rocclr/runtime/device/devkernel.hpp | 9 +-
rocclr/runtime/device/rocm/rocdevice.cpp | 44 +++
rocclr/runtime/device/rocm/rocdevice.hpp | 5 +
rocclr/runtime/device/rocm/rochostcall.cpp | 404 +++++++++++++++++++++
rocclr/runtime/device/rocm/rochostcall.hpp | 60 +++
rocclr/runtime/device/rocm/rocvirtual.cpp | 14 +
7 files changed, 539 insertions(+), 3 deletions(-)
create mode 100644 rocclr/runtime/device/rocm/rochostcall.cpp
create mode 100644 rocclr/runtime/device/rocm/rochostcall.hpp
diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp
index 1664582d96..e55c082a35 100644
--- a/rocclr/runtime/device/devkernel.cpp
+++ b/rocclr/runtime/device/devkernel.cpp
@@ -746,6 +746,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const KernelArgMD& lcArg, bool* isH
case ValueKind::HiddenPrintfBuffer:
*isHidden = true;
return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
+ case ValueKind::HiddenHostcallBuffer:
+ *isHidden = true;
+ return amd::KernelParameterDescriptor::HiddenHostcallBuffer;
case ValueKind::HiddenDefaultQueue:
*isHidden = true;
return amd::KernelParameterDescriptor::HiddenDefaultQueue;
@@ -779,6 +782,9 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is
else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) {
return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
}
+ else if (strcmp(&argInfo->argStr[2], "hostcall_buffer") == 0) {
+ return amd::KernelParameterDescriptor::HiddenHostcallBuffer;
+ }
else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) {
return amd::KernelParameterDescriptor::HiddenDefaultQueue;
}
diff --git a/rocclr/runtime/device/devkernel.hpp b/rocclr/runtime/device/devkernel.hpp
index 91b10fb654..0980537333 100644
--- a/rocclr/runtime/device/devkernel.hpp
+++ b/rocclr/runtime/device/devkernel.hpp
@@ -111,7 +111,8 @@ static const std::map ArgValueKind =
{"HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer},
{"HiddenDefaultQueue", ValueKind::HiddenDefaultQueue},
{"HiddenCompletionAction", ValueKind::HiddenCompletionAction},
- {"HiddenMultigridSyncArg", ValueKind::HiddenMultiGridSyncArg}
+ {"HiddenMultigridSyncArg", ValueKind::HiddenMultiGridSyncArg},
+ {"HiddenHostcallBuffer", ValueKind::HiddenHostcallBuffer},
};
static const std::map ArgValueType =
@@ -225,7 +226,8 @@ static const std::map ArgValueKindV3 =
{"hidden_printf_buffer", ValueKind::HiddenPrintfBuffer},
{"hidden_default_queue", ValueKind::HiddenDefaultQueue},
{"hidden_completion_action", ValueKind::HiddenCompletionAction},
- {"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg}
+ {"hidden_multigrid_sync_arg", ValueKind::HiddenMultiGridSyncArg},
+ {"hidden_hostcall_buffer", ValueKind::HiddenHostcallBuffer},
};
static const std::map ArgValueTypeV3 =
@@ -320,7 +322,8 @@ struct KernelParameterDescriptor {
ImageObject = 11,
SamplerObject = 12,
QueueObject = 13,
- HiddenMultiGridSync = 14
+ HiddenMultiGridSync = 14,
+ HiddenHostcallBuffer = 15,
};
clk_value_type_t type_; //!< The parameter's type
size_t offset_; //!< Its offset in the parameter's stack
diff --git a/rocclr/runtime/device/rocm/rocdevice.cpp b/rocclr/runtime/device/rocm/rocdevice.cpp
index 7bff422b40..17b205f81c 100644
--- a/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -28,6 +28,8 @@
#include "pro/prodriver.hpp"
#endif
#include "platform/sampler.hpp"
+#include "rochostcall.hpp"
+
#include
#include
#include
@@ -1916,10 +1918,52 @@ void Device::releaseQueue(hsa_queue_t* queue) {
}
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
+ if (qInfo.hostcallBuffer_) {
+ ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
+ qInfo.hostcallBuffer_, queue);
+ disableHostcalls(qInfo.hostcallBuffer_, queue);
+ context().svmFree(qInfo.hostcallBuffer_);
+ }
+
+ ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
hsa_queue_destroy(queue);
queuePool_.erase(qIter);
}
+void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
+  // Returns the hostcall buffer cached in queuePool_ for this queue, creating and registering
+  // one on first use. Returns nullptr if allocation or listener registration fails.
+  auto qIter = queuePool_.find(queue);
+  assert(qIter != queuePool_.end());
+  auto& qInfo = qIter->second;
+  if (qInfo.hostcallBuffer_) {
+    return qInfo.hostcallBuffer_;
+  }
+
+  // The number of packets required in each buffer is at least equal to the
+  // maximum number of waves supported by the device.
+  auto wavesPerCu = info().maxThreadsPerCU_ / info().wavefrontWidth_;
+  auto numPackets = info().maxComputeUnits_ * wavesPerCu;
+  auto size = getHostcallBufferSize(numPackets);
+  auto align = getHostcallBufferAlignment();
+  void* buffer = context().svmAlloc(size, align, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS);
+  if (!buffer) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE,
+            "Failed to create hostcall buffer for hardware queue %p", queue);
+    return nullptr;
+  }
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Created hostcall buffer %p for hardware queue %p", buffer,
+          queue);
+  if (!enableHostcalls(buffer, numPackets, queue)) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to register hostcall buffer %p with listener",
+            buffer);
+    context().svmFree(buffer);  // never cache an unregistered buffer; the next launch retries
+    return nullptr;
+  }
+  qInfo.hostcallBuffer_ = buffer;
+  return buffer;
+}
+
bool Device::findLinkTypeAndHopCount(amd::Device* other_device,
uint32_t* link_type, uint32_t* hop_count) {
hsa_amd_memory_pool_link_info_t link_info;
diff --git a/rocclr/runtime/device/rocm/rocdevice.hpp b/rocclr/runtime/device/rocm/rocdevice.hpp
index b177fdfa1c..4fe0eea6d1 100644
--- a/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -417,6 +417,10 @@ class Device : public NullDevice {
//! Release HSA queue
void releaseQueue(hsa_queue_t*);
+ //! For the given HSA queue, return an existing hostcall buffer or create a
+ //! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
+ void* getOrCreateHostcallBuffer(hsa_queue_t* queue);
+
//! Return multi GPU grid launch sync buffer
address MGSync() const { return mg_sync_; }
@@ -458,6 +462,7 @@ class Device : public NullDevice {
struct QueueInfo {
int refCount;
+ void* hostcallBuffer_;
};
std::map queuePool_; //!< Pool of HSA queues for recycling
diff --git a/rocclr/runtime/device/rocm/rochostcall.cpp b/rocclr/runtime/device/rocm/rochostcall.cpp
new file mode 100644
index 0000000000..e3f725b794
--- /dev/null
+++ b/rocclr/runtime/device/rocm/rochostcall.cpp
@@ -0,0 +1,404 @@
+//
+// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "runtime/utils/debug.hpp"
+#include "runtime/top.hpp"
+#include "runtime/utils/flags.hpp"
+
+#include "rochostcall.hpp"
+
+#include "os/os.hpp"
+#include "thread/monitor.hpp"
+#include "utils/util.hpp"
+
+#include
+
+#include
+#include
+
+namespace { // anonymous
+
+enum ServiceID {
+ SERVICE_RESERVED = 0,
+ SERVICE_FUNCTION_CALL,
+};
+
+enum SignalValue { SIGNAL_DONE = 0, SIGNAL_INIT = 1 };
+
+/** \brief Packet payload
+ *
+ * Contains 64 slots of 8 ulongs each, one for each workitem in the
+ * wave. A slot with index \c i contains valid data if the
+ * corresponding bit in PacketHeader::activemask is set.
+ */
+struct Payload {
+ uint64_t slots[64][8];
+};
+
+/** Packet header */
+struct PacketHeader {
+ /** Tagged pointer to the next packet in an intrusive stack */
+ uint64_t next_;
+ /** Bitmask that represents payload slots with valid data */
+ uint64_t activemask_;
+ /** Service ID requested by the wave */
+ uint32_t service_;
+ /** Control bits.
+ * \li 0: \c READY flag. Indicates packet awaiting a host response.
+ */
+ uint32_t control_;
+};
+
+static_assert(std::is_standard_layout::value,
+ "the hostcall packet must be useable from other languages");
+
+/** Field offsets in the packet control field */
+enum ControlOffset {
+ CONTROL_OFFSET_READY_FLAG = 0,
+ CONTROL_OFFSET_RESERVED0 = 1,
+};
+
+/** Field widths in the packet control field */
+enum ControlWidth {
+ CONTROL_WIDTH_READY_FLAG = 1,
+ CONTROL_WIDTH_RESERVED0 = 31,
+};
+
+/** \brief Shared buffer for submitting hostcall requests.
+ *
+ * Holds hostcall packets requested by all kernels executing on the
+ * same device queue. Each hostcall buffer is associated with at most
+ * one device queue.
+ *
+ * Packets in the buffer are accessed using 64-bit tagged pointers to mitigate
+ * the ABA problem in lock-free stacks. The index_mask is used to extract the
+ * lower bits of the pointer, which form the index into the packet array. The
+ * remaining higher bits define a tag that is incremented on every pop from a
+ * stack.
+ */
+class HostcallBuffer {
+ /** Array of packet headers */
+ PacketHeader* headers_;
+ /** Array of packet payloads */
+ Payload* payloads_;
+ /** Signal used by kernels to indicate new work */
+ hsa_signal_t doorbell_;
+ /** Stack of free packets. Uses tagged pointers. */
+ uint64_t free_stack_;
+ /** Stack of ready packets. Uses tagged pointers */
+ uint64_t ready_stack_;
+ /** Mask for accessing the packet index in the tagged pointer. */
+ uint64_t index_mask_;
+
+ PacketHeader* getHeader(uint64_t ptr) const;
+ Payload* getPayload(uint64_t ptr) const;
+
+ public:
+ void processPackets();
+ void initialize(uint32_t num_packets);
+ void setDoorbell(hsa_signal_t doorbell) { doorbell_ = doorbell; };
+};
+
+static_assert(std::is_standard_layout::value,
+ "the hostcall buffer must be useable from other languages");
+
+}; // namespace
+
+PacketHeader* HostcallBuffer::getHeader(uint64_t ptr) const {
+ return headers_ + (ptr & index_mask_);
+}
+
+Payload* HostcallBuffer::getPayload(uint64_t ptr) const {
+ return payloads_ + (ptr & index_mask_);
+}
+
+static uint32_t setControlField(uint32_t control, uint8_t offset, uint8_t width, uint32_t value) {
+  // 64-bit math: an int "1 << width" overflows for width >= 31. Assumes offset + width <= 32.
+  const uint32_t field = static_cast<uint32_t>(((uint64_t{1} << width) - 1) << offset);
+  return (control & ~field) | ((value << offset) & field);
+}
+
+static uint32_t resetReadyFlag(uint32_t control) {
+ return setControlField(control, CONTROL_OFFSET_READY_FLAG, CONTROL_WIDTH_READY_FLAG, 0);
+}
+
+/** \brief Signature for pointer accepted by the function call service.
+ * \param output Pointer to output arguments.
+ * \param input Pointer to input arguments.
+ *
+ * The function can accept up to seven 64-bit arguments via the
+ * #input pointer, and can produce up to two 64-bit arguments via the
+ * #output pointer. The contents of these arguments are defined by
+ * the function being invoked.
+ */
+typedef void (*HostcallFunctionCall)(uint64_t* output, const uint64_t* input);
+
+static void handleFunctionCall(void* state, uint32_t service, uint64_t* payload) {
+  // payload[0] is the host function pointer; payload + 1 is its input; results overwrite payload.
+  uint64_t output[2] = {};  // zeroed: a callee may write fewer than two results
+  auto fptr = reinterpret_cast<HostcallFunctionCall>(payload[0]);
+  fptr(output, payload + 1);
+  memcpy(payload, output, sizeof(output));
+}
+
+// Dispatch one payload slot to the handler for the requested service ID.
+// Returns true if a handler ran; an unknown ID is logged and passed to amd::report_fatal.
+static bool handlePayload(uint32_t service, uint64_t* payload) {
+  switch (service) {
+    case SERVICE_FUNCTION_CALL:
+      handleFunctionCall(nullptr, service, payload);
+      return true;
+    default:
+      ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: no handler found for service ID \"%u\".",
+              service);
+      amd::report_fatal(__FILE__, __LINE__, "Hostcall service not supported.");
+      return false;
+  }
+}
+
+void HostcallBuffer::processPackets() {
+  // Grab the entire ready stack and set the top to 0. New requests from the
+  // device will continue pushing on the stack while we process the packets that
+  // we have grabbed.
+  uint64_t ready_stack = __atomic_exchange_n(&ready_stack_, 0, __ATOMIC_ACQUIRE);
+  if (!ready_stack) {
+    return;
+  }
+
+  // Each wave can submit at most one packet at a time. The ready stack cannot
+  // contain multiple packets from the same wave, so consuming ready packets in
+  // a latest-first order does not affect ordering of hostcall within a wave.
+  for (decltype(ready_stack) iter = ready_stack, next = 0; iter; iter = next) {
+    auto header = getHeader(iter);
+    // Remember the next packet pointer, because we will no longer own the
+    // current packet at the end of this loop.
+    next = header->next_;
+
+    auto service = header->service_;
+    auto payload = getPayload(iter);
+    auto activemask = header->activemask_;
+    while (activemask) {
+      auto wi = amd::leastBitSet(activemask);
+      activemask ^= static_cast<uint64_t>(1) << wi;
+      auto slot = payload->slots[wi];
+      handlePayload(service, slot);
+    }
+    // Clear the READY flag with release ordering so the payload writes above are visible first.
+    __atomic_store_n(&header->control_, resetReadyFlag(header->control_),
+                     __ATOMIC_RELEASE);
+  }
+}
+
+static uintptr_t getHeaderStart() {
+ return amd::alignUp(sizeof(HostcallBuffer), alignof(PacketHeader));
+}
+
+static uintptr_t getPayloadStart(uint32_t num_packets) {
+ auto header_start = getHeaderStart();
+ auto header_end = header_start + sizeof(PacketHeader) * num_packets;
+ return amd::alignUp(header_end, alignof(Payload));
+}
+
+size_t getHostcallBufferSize(uint32_t num_packets) {
+ size_t buffer_size = getPayloadStart(num_packets);
+ buffer_size += num_packets * sizeof(Payload);
+ return buffer_size;
+}
+
+uint32_t getHostcallBufferAlignment() { return alignof(Payload); }
+
+static uint64_t getIndexMask(uint32_t num_packets) {
+ // The number of packets is at least equal to the maximum number of waves
+ // supported by the device. That means we do not need to account for the
+ // border cases where num_packets is zero or one.
+ assert(num_packets > 1);
+ if (!amd::isPowerOfTwo(num_packets)) {
+ num_packets = amd::nextPowerOfTwo(num_packets);
+ }
+ return num_packets - 1;
+}
+
+void HostcallBuffer::initialize(uint32_t num_packets) {
+ auto base = reinterpret_cast(this);
+ headers_ = reinterpret_cast((base + getHeaderStart()));
+ payloads_ = reinterpret_cast((base + getPayloadStart(num_packets)));
+ index_mask_ = getIndexMask(num_packets);
+
+ // The null pointer is identical to (uint64_t)0. When using tagged pointers,
+ // the tag and the index part of the array must never be zero at the same
+ // time. In the initialized free stack, headers[1].next points to headers[0],
+ // which has index 0. We initialize this pointer to have a tag of 1.
+ uint64_t next = index_mask_ + 1;
+
+ // Initialize the free stack.
+ headers_[0].next_ = 0;
+ for (uint32_t ii = 1; ii != num_packets; ++ii) {
+ headers_[ii].next_ = next;
+ next = ii;
+ }
+ free_stack_ = next;
+ ready_stack_ = 0;
+}
+
+/** \brief Manage a unique listener thread and its associated buffers.
+ */
+class HostcallListener {
+ std::set buffers_;
+ hsa_signal_t doorbell_;
+
+ class Thread : public amd::Thread {
+ public:
+ Thread() : amd::Thread("Hostcall Listener Thread", CQ_THREAD_STACK_SIZE) {}
+
+ //! The hostcall listener thread entry point.
+ void run(void* data) {
+ auto listener = reinterpret_cast(data);
+ listener->consumePackets();
+ }
+ } thread_; //!< The hostcall listener thread.
+
+ void consumePackets();
+
+ public:
+ /** \brief Add a buffer to the listener.
+ *
+ * Behaviour is undefined if:
+   * - HostcallBuffer::initialize() was not invoked on the buffer
+   *   prior to registration.
+ * - The same buffer is registered with multiple listeners.
+ * - The same buffer is associated with more than one hardware queue.
+ */
+ void addBuffer(HostcallBuffer* buffer);
+
+ /** \brief Remove a buffer that is no longer in use.
+ *
+ * The buffer can be reused after removal. Behaviour is undefined if the
+ * buffer is freed without first removing it.
+ */
+ void removeBuffer(HostcallBuffer* buffer);
+
+  /** \brief Return true if no buffers are registered.
+ */
+ bool idle() const {
+ return buffers_.empty();
+ }
+
+ void terminate();
+ bool initialize();
+};
+
+HostcallListener* hostcallListener = nullptr;
+amd::Monitor listenerLock("Hostcall listener lock");
+
+void HostcallListener::consumePackets() {
+ uint64_t signal_value = SIGNAL_INIT;
+ uint64_t timeout = 1024 * 1024;
+
+ while (true) {
+ while (true) {
+ uint64_t new_value = hsa_signal_wait_acquire(doorbell_, HSA_SIGNAL_CONDITION_NE, signal_value, timeout,
+ HSA_WAIT_STATE_BLOCKED);
+ if (new_value != signal_value) {
+ signal_value = new_value;
+ break;
+ }
+ }
+
+ if (signal_value == SIGNAL_DONE) {
+ ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Hostcall listener received SIGNAL_DONE");
+ return;
+ }
+
+ amd::ScopedLock lock{listenerLock};
+
+ for (auto ii : buffers_) {
+ ii->processPackets();
+ }
+ }
+
+ return;
+}
+
+void HostcallListener::terminate() {
+ if (!amd::Os::isThreadAlive(thread_)) {
+ return;
+ }
+
+ hsa_signal_store_release(doorbell_, SIGNAL_DONE);
+
+ // FIXME_lmoriche: fix termination handshake
+ while (thread_.state() < Thread::FINISHED) {
+ amd::Os::yield();
+ }
+
+ hsa_signal_destroy(doorbell_);
+}
+
+void HostcallListener::addBuffer(HostcallBuffer* buffer) {
+ assert(buffers_.count(buffer) == 0 && "buffer already present");
+ buffer->setDoorbell(doorbell_);
+ buffers_.insert(buffer);
+}
+
+void HostcallListener::removeBuffer(HostcallBuffer* buffer) {
+ assert(buffers_.count(buffer) != 0 && "unknown buffer");
+ buffers_.erase(buffer);
+}
+
+bool HostcallListener::initialize() {
+ auto status = hsa_signal_create(SIGNAL_INIT, 0, NULL, &doorbell_);
+ if (status != HSA_STATUS_SUCCESS) {
+ return false;
+ }
+
+ // If the listener thread was not successfully initialized, clean
+ // everything up and bail out.
+ if (thread_.state() < Thread::INITIALIZED) {
+ hsa_signal_destroy(doorbell_);
+ return false;
+ }
+
+ thread_.start(this);
+ return true;
+}
+
+bool enableHostcalls(void* bfr, uint32_t numPackets, const void* queue) {
+ auto buffer = reinterpret_cast(bfr);
+ buffer->initialize(numPackets);
+
+ amd::ScopedLock lock(listenerLock);
+ if (!hostcallListener) {
+ hostcallListener = new HostcallListener();
+ if (!hostcallListener->initialize()) {
+ ClPrint(amd::LOG_ERROR, (amd::LOG_INIT | amd::LOG_QUEUE | amd::LOG_RESOURCE),
+ "Failed to launch hostcall listener");
+ delete hostcallListener;
+ hostcallListener = nullptr;
+ return false;
+ }
+ ClPrint(amd::LOG_INFO, (amd::LOG_INIT | amd::LOG_QUEUE | amd::LOG_RESOURCE),
+ "Launched hostcall listener at %p", hostcallListener);
+ }
+ hostcallListener->addBuffer(buffer);
+ ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Registered hostcall buffer %p with listener %p", buffer,
+ hostcallListener);
+ return true;
+}
+
+void disableHostcalls(void* bfr, const void* queue) {
+  HostcallListener* retired = nullptr;  // listener to shut down once its last buffer is removed
+  {
+    amd::ScopedLock lock(listenerLock);
+    if (!hostcallListener) return;
+    assert(bfr && "expected a hostcall buffer");
+    hostcallListener->removeBuffer(reinterpret_cast<HostcallBuffer*>(bfr));
+    if (!hostcallListener->idle()) return;
+    retired = hostcallListener;  // detach under the lock; a later enable creates a new listener
+    hostcallListener = nullptr;
+  }
+  // Terminate unlocked: the listener thread takes listenerLock, so waiting under it can deadlock.
+  retired->terminate();
+  delete retired;
+  ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Terminated hostcall listener");
+}
diff --git a/rocclr/runtime/device/rocm/rochostcall.hpp b/rocclr/runtime/device/rocm/rochostcall.hpp
new file mode 100644
index 0000000000..c4501f039a
--- /dev/null
+++ b/rocclr/runtime/device/rocm/rochostcall.hpp
@@ -0,0 +1,60 @@
+//
+// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#pragma once
+
+/** \file Support for invoking host services from the device.
+ *
+ * A hostcall is a fixed-size request generated by a kernel running
+ * on the device, for some predefined service provided by the
+ * host. The life-cycle of a hostcall is as follows:
+ *
+ * 1. A workitem in some kernel dispatch submits a request as a
+ * "packet" in a "hostcall buffer". The workitem blocks until it
+ * receives a response from the host.
+ *
+ * 2. A host thread called the "hostcall listener" notices the packet
+ * and invokes the desired service on the host.
+ *
+ * 3. When the service completes, the listener copies the response
+ * into the request packet. This unblocks the workitem, and the
+ * hostcall is said to be completed.
+ *
+ * The hostcall listeners and buffers are managed by the VDI
+ * runtime. The typical flow is as follows:
+ *
+ * - Create and launch one or more hostcall listeners.
+ *
+ * - Create and initialize a distinct hostcall buffer for each
+ * command queue in hardware (e.g., an hsa_queue_t on ROCm).
+ *
+ * - Register this buffer with the appropriate listener.
+ *
+ * - When a buffer is no longer used, deregister and then free
+ * it. This usually happens when the corresponding hardware queue
+ * is freed.
+ *
+ * - Destroy the listener(s) when they are no longer required. This must be
+ * done before exiting the application, so that the listener
+ * threads can join() correctly.
+ *
+ * A single listener is sufficient to correctly handle all hostcall
+ * buffers created in the application. The client may also launch
+ * multiple listeners, as long as the same hostcall buffer is not
+ * registered with multiple listeners.
+ */
+
+/** \brief Determine the buffer size to be allocated
+ * \param num_packets Number of packets to be supported.
+ * \return Required size, including any internal padding required for
+ * the packets and their headers.
+ */
+size_t getHostcallBufferSize(uint32_t num_packets);
+
+/** \brief Return the required alignment for a hostcall buffer.
+ */
+uint32_t getHostcallBufferAlignment(void);
+
+bool enableHostcalls(void* buffer, uint32_t numPackets, const void* queue);
+void disableHostcalls(void* buffer, const void* queue);
diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp
index f23cde6f16..1b1de74087 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -13,6 +13,7 @@
#include "platform/command.hpp"
#include "platform/memory.hpp"
#include "platform/sampler.hpp"
+#include "rochostcall.hpp"
#include "utils/debug.hpp"
#include "os/os.hpp"
#include "amd_hsa_kernel_code.h"
@@ -2075,6 +2076,19 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
break;
}
+ case amd::KernelParameterDescriptor::HiddenHostcallBuffer: {
+ if (amd::IS_HIP) {
+ auto buffer = roc_device_.getOrCreateHostcallBuffer(gpu_queue_);
+ if (!buffer) {
+ ClPrint(amd::LOG_ERROR, amd::LOG_KERN,
+ "Kernel expects a hostcall buffer, but none found");
+ return false;
+ }
+ assert(it.size_ == sizeof(buffer) && "check the sizes");
+ WriteAqlArgAt(const_cast(parameters), &buffer, it.size_, it.offset_);
+ }
+ break;
+ }
case amd::KernelParameterDescriptor::HiddenDefaultQueue: {
uint64_t vqVA = 0;
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());