diff --git a/rocclr/device/devhostcall.cpp b/rocclr/device/devhostcall.cpp
index 8577abc01e..53cdfaa73b 100644
--- a/rocclr/device/devhostcall.cpp
+++ b/rocclr/device/devhostcall.cpp
@@ -300,7 +300,7 @@ class HostcallListener {
   }
 
   void terminate();
-  bool initialize(amd::Device &dev);
+  bool initialize(const amd::Device &dev);
 };
 
 HostcallListener* hostcallListener = nullptr;
@@ -360,9 +360,16 @@ void HostcallListener::removeBuffer(HostcallBuffer* buffer) {
   buffers_.erase(buffer);
 }
 
-bool HostcallListener::initialize(amd::Device &dev) {
+bool HostcallListener::initialize(const amd::Device &dev) {
   doorbell_ = dev.createSignal();
-  if ((doorbell_ == nullptr) || !doorbell_->Init(SIGNAL_INIT, device::Signal::WaitState::Blocked)) {
+  // The PAL signal only implements active (polling) waits; every other
+  // configuration keeps the blocking wait that was used before this change.
+#if defined(WITH_PAL_DEVICE)
+  auto ws = device::Signal::WaitState::Active;
+#else
+  auto ws = device::Signal::WaitState::Blocked;
+#endif
+  if ((doorbell_ == nullptr) || !doorbell_->Init(dev, SIGNAL_INIT, ws)) {
     return false;
   }
 
@@ -377,7 +384,7 @@ bool HostcallListener::initialize(amd::Device &dev) {
   return true;
 }
 
-bool enableHostcalls(amd::Device &dev, void* bfr, uint32_t numPackets) {
+bool enableHostcalls(const amd::Device &dev, void* bfr, uint32_t numPackets) {
   auto buffer = reinterpret_cast<HostcallBuffer*>(bfr);
   buffer->initialize(numPackets);
 
diff --git a/rocclr/device/devhostcall.hpp b/rocclr/device/devhostcall.hpp
index bfbc107782..1a3f275a9d 100644
--- a/rocclr/device/devhostcall.hpp
+++ b/rocclr/device/devhostcall.hpp
@@ -77,5 +77,5 @@ size_t getHostcallBufferSize(uint32_t num_packets);
  */
 uint32_t getHostcallBufferAlignment(void);
 
-bool enableHostcalls(amd::Device& dev, void* buffer, uint32_t numPackets);
+bool enableHostcalls(const amd::Device& dev, void* buffer, uint32_t numPackets);
 void disableHostcalls(void* buffer);
diff --git a/rocclr/device/devsignal.hpp b/rocclr/device/devsignal.hpp
index 577881a489..3ce7b0766d 100644
--- a/rocclr/device/devsignal.hpp
+++ b/rocclr/device/devsignal.hpp
@@ -22,6 +22,10 @@
 
 #include "top.hpp"
 
+namespace amd {
+  class Device;
+}  // namespace amd
+
 namespace device {
 
 // Light abstraction over HSA/PAL signals
@@ -45,7 +49,7 @@ protected:
 public:
   virtual ~Signal() {}
 
-  virtual bool Init(uint64_t init, WaitState ws)
+  virtual bool Init(const amd::Device& dev, uint64_t init, WaitState ws)
   { return false; }
 
   // Blocks the current thread untill the condition c is satisfied
diff --git a/rocclr/device/pal/CMakeLists.txt b/rocclr/device/pal/CMakeLists.txt
index 5f7c355d9d..121da30707 100644
--- a/rocclr/device/pal/CMakeLists.txt
+++ b/rocclr/device/pal/CMakeLists.txt
@@ -153,6 +153,7 @@ target_sources(rocclrpal PRIVATE
   palresource.cpp
   palschedcl.cpp
   palsettings.cpp
+  palsignal.cpp
   palthreadtrace.cpp
   paltimestamp.cpp
   palvirtual.cpp
diff --git a/rocclr/device/pal/paldevice.hpp b/rocclr/device/pal/paldevice.hpp
index 98c31dff57..778f9f9f86 100644
--- a/rocclr/device/pal/paldevice.hpp
+++ b/rocclr/device/pal/paldevice.hpp
@@ -36,6 +36,7 @@
 #include "device/pal/palsettings.hpp"
 #include "device/pal/palappprofile.hpp"
 #include "device/pal/palgpuopen.hpp"
+#include "device/pal/palsignal.hpp"
 
 #include "acl.h"
 #include "memory"
@@ -361,7 +362,7 @@ class Device : public NullDevice {
 
   //! Signal object allocation
   virtual device::Signal* createSignal() const {
-    return nullptr;
+    return new pal::Signal();
   }
 
   //! Create the device program.
diff --git a/rocclr/device/pal/palkernel.cpp b/rocclr/device/pal/palkernel.cpp
index 8dc74a1982..dcfaa045c9 100644
--- a/rocclr/device/pal/palkernel.cpp
+++ b/rocclr/device/pal/palkernel.cpp
@@ -349,6 +349,19 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
           WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
         }
         break;
+      case amd::KernelParameterDescriptor::HiddenHostcallBuffer:
+        if (amd::IS_HIP) {
+          auto buffer = gpu.getOrCreateHostcallBuffer();
+          if (!buffer) {
+            ClPrint(amd::LOG_ERROR, amd::LOG_KERN,
+              "Kernel expects a hostcall buffer, but none found");
+            // This function returns a packet pointer: report failure as nullptr.
+            return nullptr;
+          }
+          assert(it.size_ == sizeof(buffer) && "check the sizes");
+          WriteAqlArgAt(const_cast<address>(parameters), &buffer, it.size_, it.offset_);
+        }
+        break;
       case amd::KernelParameterDescriptor::HiddenDefaultQueue:
         if (vmDefQueue != 0) {
           WriteAqlArgAt(const_cast<address>(parameters), &vmDefQueue, it.size_, it.offset_);
diff --git a/rocclr/device/pal/palsignal.cpp b/rocclr/device/pal/palsignal.cpp
new file mode 100644
index 0000000000..53a9cefc8f
--- /dev/null
+++ b/rocclr/device/pal/palsignal.cpp
@@ -0,0 +1,112 @@
+/* Copyright (c) 2021-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "device/device.hpp"
+#include "palsignal.hpp"
+#include "paldevice.hpp"
+#include "os/os.hpp"
+
+#include <atomic>
+#include <cstring>
+#include <functional>
+#include <new>
+
+namespace pal {
+
+// Frees the SVM allocation that backs the signal. Init() may have failed or
+// never been called, in which case amdSignal_ is null and nothing is freed.
+Signal::~Signal() {
+  if (amdSignal_ != nullptr) {
+    dev_->context().svmFree(amdSignal_);
+  }
+}
+
+// Allocates an amd_signal_t from the device context as fine-grained SVM with
+// atomics, zeroes it and stores `init` as its value. `ws` selects how Wait()
+// behaves. Returns false when the allocation fails.
+bool Signal::Init(const amd::Device& dev, uint64_t init, device::Signal::WaitState ws) {
+  dev_ = static_cast<const pal::Device*>(&dev);
+  ws_ = ws;
+
+  void* buffer = dev_->context().svmAlloc(sizeof(amd_signal_t), alignof(amd_signal_t),
+    CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS);
+  if (!buffer) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE,
+      "Failed to create amd_signal_t buffer");
+    return false;
+  }
+  std::memset(buffer, 0, sizeof(amd_signal_t));
+
+  amdSignal_ = new (buffer) amd_signal_t();
+  amdSignal_->value = init;
+
+  return true;
+}
+
+// Polls until the signal value compared with `value` satisfies condition `c`
+// and returns the value then observed, or returns uint64_t(-1) once `timeout`
+// microseconds have elapsed. WaitState::Blocked is not implemented.
+uint64_t Signal::Wait(uint64_t value, device::Signal::Condition c, uint64_t timeout) {
+  auto cmp = [](device::Signal::Condition c) -> std::function<bool(uint64_t, uint64_t)> {
+    switch (c) {
+      case device::Signal::Condition::Eq:
+        return [](auto ls, auto rs) { return ls == rs; };
+      case device::Signal::Condition::Ne:
+        return [](auto ls, auto rs) { return ls != rs; };
+      case device::Signal::Condition::Lt:
+        return [](auto ls, auto rs) { return ls < rs; };
+      case device::Signal::Condition::Gte:
+        return [](auto ls, auto rs) { return ls >= rs; };
+    }
+    ShouldNotReachHere();
+    return [](auto ls, auto rs) { return false; };
+  } (c);
+
+  if (ws_ == device::Signal::WaitState::Blocked) {
+    guarantee(false, "Unimplemented");
+  } else if (ws_ == device::Signal::WaitState::Active) {
+    auto start = amd::Os::timeNanos();
+    while (true) {
+      auto end = amd::Os::timeNanos();
+      // Nanoseconds to microseconds: divide (the old code multiplied by 1000).
+      auto duration = (end - start) / 1000;
+      if (duration >= timeout) {
+        return -1;
+      }
+
+      if (!cmp(amdSignal_->value, value)) {
+        amd::Os::yield();
+        continue;
+      }
+
+      std::atomic_thread_fence(std::memory_order_acquire);
+      return amdSignal_->value;
+    }
+  }
+
+  return -1;
+}
+
+// Overwrites the current signal value with `value`.
+void Signal::Reset(uint64_t value) {
+  amdSignal_->value = value;
+}
+
+}  // namespace pal
diff --git a/rocclr/device/pal/palsignal.hpp b/rocclr/device/pal/palsignal.hpp
new file mode 100644
index 0000000000..b02a43b1ff
--- /dev/null
+++ b/rocclr/device/pal/palsignal.hpp
@@ -0,0 +1,51 @@
+/* Copyright (c) 2021-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#pragma once
+
+#include "device/devsignal.hpp"
+
+#include <amd_hsa_signal.h>
+
+namespace pal {
+
+class Device;
+
+//! PAL implementation of device::Signal, stored in an SVM-allocated amd_signal_t
+class Signal: public device::Signal {
+private:
+  const Device* dev_ = nullptr;        //!< Device whose context holds the allocation
+  amd_signal_t* amdSignal_ = nullptr;  //!< Signal storage; null until Init() succeeds
+
+public:
+  ~Signal() override;
+
+  bool Init(const amd::Device& dev, uint64_t init, device::Signal::WaitState ws) override;
+
+  uint64_t Wait(uint64_t value, device::Signal::Condition c, uint64_t timeout) override;
+
+  void Reset(uint64_t value) override;
+
+  void* getHandle() override {
+    return amdSignal_;
+  }
+};
+
+}  // namespace pal
diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp
index 8538f5b96a..9532a3c134 100644
--- a/rocclr/device/pal/palvirtual.cpp
+++ b/rocclr/device/pal/palvirtual.cpp
@@ -32,6 +32,7 @@
 #include "device/pal/palblit.hpp"
 #include "device/pal/paldebugger.hpp"
 #include "device/appprofile.hpp"
+#include "device/devhostcall.hpp"
 #include "hsa.h"
 #include "amd_hsa_kernel_code.h"
 #include "amd_hsa_queue.h"
@@ -1032,6 +1033,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
                                                    &dbg_vmid);
   }
 
+  // The hostcall buffer for this vqueue is initialized on demand.
+  hostcallBuffer_ = nullptr;
+
   return true;
 }
 
@@ -1142,6 +1146,14 @@ VirtualGPU::~VirtualGPU() {
       it->execution().unlock();
     }
   }
+
+  if (hostcallBuffer_ != nullptr) {
+    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
+      "deleting hostcall buffer %p for virtual queue %p",
+      hostcallBuffer_, this);
+    disableHostcalls(hostcallBuffer_);
+    dev().context().svmFree(hostcallBuffer_);
+  }
 }
 
 void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
@@ -3781,4 +3793,48 @@ void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cm
     }
   }
 }
+
+//! Returns the hostcall buffer of this virtual queue, allocating it and
+//! registering it through enableHostcalls() on first use.
+//! Returns nullptr when the allocation or the registration fails.
+void* VirtualGPU::getOrCreateHostcallBuffer() {
+  if (hostcallBuffer_ != nullptr) {
+    return hostcallBuffer_;
+  }
+
+  // The number of packets required in each buffer is at least equal to the
+  // maximum number of waves supported by the device.
+  auto wavesPerCu = dev().info().maxThreadsPerCU_ / dev().info().wavefrontWidth_;
+  auto numPackets = dev().info().maxComputeUnits_ * wavesPerCu;
+
+  auto size = getHostcallBufferSize(numPackets);
+  auto align = getHostcallBufferAlignment();
+
+  hostcallBuffer_ = dev().context().svmAlloc(size, align, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS);
+  if (!hostcallBuffer_) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE,
+      "Failed to create hostcall buffer");
+    return nullptr;
+  }
+
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
+    "Created hostcall buffer %p (numPackets == %d, size == %zu, align == %d) for virtual queue %p\n",
+    hostcallBuffer_,
+    numPackets,
+    size,
+    align,
+    this);
+
+  if (!enableHostcalls(dev(), hostcallBuffer_, numPackets)) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE,
+      "Failed to register hostcall buffer %p with listener",
+      hostcallBuffer_);
+    // Release the unregistered buffer and forget it; otherwise later calls
+    // would hand it out and the destructor would call disableHostcalls on it.
+    dev().context().svmFree(hostcallBuffer_);
+    hostcallBuffer_ = nullptr;
+    return nullptr;
+  }
+  return hostcallBuffer_;
+}
 }  // namespace pal
diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp
index d01efda3d7..0614f1721c 100644
--- a/rocclr/device/pal/palvirtual.hpp
+++ b/rocclr/device/pal/palvirtual.hpp
@@ -552,6 +552,8 @@ class VirtualGPU : public device::VirtualDevice {
     }
   }
 
+  void* getOrCreateHostcallBuffer();
+
  protected:
   void profileEvent(EngineType engine, bool type) const;
 
@@ -673,6 +675,8 @@ class VirtualGPU : public device::VirtualDevice {
   Queue* queues_[AllEngines];  //!< HW queues for all engines
   MemoryRange sdmaRange_;      //!< SDMA memory range for write access
   std::vector<Image*> wrtBackImageBuffer_;  //!< Array of images for write back
+
+  void* hostcallBuffer_ = nullptr;  //!< Hostcall buffer, null until first requested
 };
 
 inline void VirtualGPU::logVmMemory(const std::string name, const Memory* memory) {
diff --git a/rocclr/device/rocm/rocsignal.cpp b/rocclr/device/rocm/rocsignal.cpp
index c252075e43..013f4f3e55 100644
--- a/rocclr/device/rocm/rocsignal.cpp
+++ b/rocclr/device/rocm/rocsignal.cpp
@@ -26,7 +26,7 @@ Signal::~Signal() {
   hsa_signal_destroy(signal_);
 }
 
-bool Signal::Init(uint64_t init, device::Signal::WaitState ws) {
+bool Signal::Init(const amd::Device& dev, uint64_t init, device::Signal::WaitState ws) {
   hsa_status_t status = hsa_signal_create(init, 0, nullptr, &signal_);
   if (status != HSA_STATUS_SUCCESS) {
     return false;
diff --git a/rocclr/device/rocm/rocsignal.hpp b/rocclr/device/rocm/rocsignal.hpp
index 1c54fce40b..2350c14e2f 100644
--- a/rocclr/device/rocm/rocsignal.hpp
+++ b/rocclr/device/rocm/rocsignal.hpp
@@ -33,7 +33,7 @@ private:
 public:
   ~Signal() override;
 
-  bool Init(uint64_t init, device::Signal::WaitState ws) override;
+  bool Init(const amd::Device& dev, uint64_t init, device::Signal::WaitState ws) override;
 
   uint64_t Wait(uint64_t value, device::Signal::Condition c, uint64_t timeout) override;
 