Merge commit '2305f8ae565389cfc348a747c467c004909d7a90' into develop

This commit is contained in:
systems-assistant[bot]
2025-08-13 00:24:58 +00:00
22 ha cambiato i file con 308 aggiunte e 122 eliminazioni
@@ -781,6 +781,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const
return (__bf16)a + (__bf16)b;
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
* \brief Adds two bfloat16 values, will not fuse into fma
*/
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd_rn(const __hip_bfloat16 a,
const __hip_bfloat16 b) {
#pragma clang fp contract(off)
return (__bf16)a + (__bf16)b;
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
* \brief Subtracts two bfloat16 values
@@ -789,6 +799,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const
return (__bf16)a - (__bf16)b;
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
* \brief Subtracts two bfloat16 values, will not fuse into fma
*/
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub_rn(const __hip_bfloat16 a,
const __hip_bfloat16 b) {
#pragma clang fp contract(off)
return (__bf16)a - (__bf16)b;
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
* \brief Divides two bfloat16 values
@@ -815,6 +835,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const
return (__bf16)a * (__bf16)b;
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
* \brief Multiplies two bfloat16 values, will not fuse into fma
*/
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul_rn(const __hip_bfloat16 a,
const __hip_bfloat16 b) {
#pragma clang fp contract(off)
return (__bf16)a * (__bf16)b;
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
* \brief Negate a bfloat16 value
@@ -861,6 +891,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a,
return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
* \brief Adds two bfloat162 values, will not fuse into fma
*/
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a,
const __hip_bfloat162 b) {
#pragma clang fp contract(off)
return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
* \brief Performs FMA of given bfloat162 values
@@ -879,6 +919,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a,
return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)};
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
* \brief Multiplies two bfloat162 values, will not fuse into fma
*/
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2_rn(const __hip_bfloat162 a,
const __hip_bfloat162 b) {
#pragma clang fp contract(off)
return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)};
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
* \brief Converts a bfloat162 into negative
@@ -896,6 +946,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a,
return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)};
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
* \brief Subtracts two bfloat162 values, will not fuse into fma
*/
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2_rn(const __hip_bfloat162 a,
const __hip_bfloat162 b) {
#pragma clang fp contract(off)
return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)};
}
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
* \brief Operator to multiply two __hip_bfloat16 numbers
@@ -1367,6 +1367,15 @@ THE SOFTWARE.
return __half_raw{
static_cast<__half_raw>(x).data +
static_cast<__half_raw>(y).data};
}
inline
__HOST_DEVICE__
__half __hadd_rn(__half x, __half y)
{
#pragma clang fp contract(off)
return __half_raw{
static_cast<__half_raw>(x).data +
static_cast<__half_raw>(y).data};
}
inline
__HOST_DEVICE__
@@ -1390,6 +1399,15 @@ THE SOFTWARE.
}
inline
__HOST_DEVICE__
__half __hsub_rn(__half x, __half y)
{
#pragma clang fp contract(off)
return __half_raw{
static_cast<__half_raw>(x).data -
static_cast<__half_raw>(y).data};
}
inline
__HOST_DEVICE__
__half __hmul(__half x, __half y)
{
return __half_raw{
@@ -1398,6 +1416,15 @@ THE SOFTWARE.
}
inline
__HOST_DEVICE__
__half __hmul_rn(__half x, __half y)
{
#pragma clang fp contract(off)
return __half_raw{
static_cast<__half_raw>(x).data *
static_cast<__half_raw>(y).data};
}
inline
__HOST_DEVICE__
__half __hadd_sat(__half x, __half y)
{
return __clamp_01(__hadd(x, y));
@@ -1446,6 +1473,16 @@ THE SOFTWARE.
static_cast<__half2_raw>(x).data +
static_cast<__half2_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __hadd2_rn(__half2 x, __half2 y)
{
#pragma clang fp contract(off)
return __half2{
static_cast<__half2_raw>(x).data +
static_cast<__half2_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __habs2(__half2 x)
@@ -1462,6 +1499,15 @@ THE SOFTWARE.
}
inline
__HOST_DEVICE__
__half2 __hsub2_rn(__half2 x, __half2 y)
{
#pragma clang fp contract(off)
return __half2{
static_cast<__half2_raw>(x).data -
static_cast<__half2_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __hmul2(__half2 x, __half2 y)
{
return __half2{
@@ -1470,6 +1516,15 @@ THE SOFTWARE.
}
inline
__HOST_DEVICE__
__half2 __hmul2_rn(__half2 x, __half2 y)
{
#pragma clang fp contract(off)
return __half2{
static_cast<__half2_raw>(x).data *
static_cast<__half2_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __hadd2_sat(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hadd2(x, y));
@@ -51,8 +51,8 @@ THE SOFTWARE.
#include <type_traits>
#endif // defined(__HIPCC_RTC__)
template <class T, int N> constexpr __hip_internal::size_t __hip_vec_align_v() {
return (N == 4 && alignof(T) == 8) ? 16 : N * alignof(T);
template <class T, int _hip_N> constexpr __hip_internal::size_t __hip_vec_align_v() {
return (_hip_N == 4 && alignof(T) == 8) ? 16 : _hip_N * alignof(T);
}
template <typename T, unsigned int n> struct HIP_vector_base;
@@ -199,16 +199,16 @@ struct integer_sequence {
template<size_t... Ints>
using index_sequence = integer_sequence<size_t, Ints...>;
template<size_t N, size_t... Ints>
struct make_index_sequence_impl : make_index_sequence_impl<N - 1, N - 1, Ints...> {};
template <size_t _hip_N, size_t... Ints>
struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};
template<size_t... Ints>
struct make_index_sequence_impl<0, Ints...> {
using type = index_sequence<Ints...>;
};
template<size_t N>
using make_index_sequence = typename make_index_sequence_impl<N>::type;
template <size_t _hip_N>
using make_index_sequence = typename make_index_sequence_impl<_hip_N>::type;
template <size_t... Ints>
constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {
+9 -11
Vedi File
@@ -284,9 +284,7 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
for (auto& hostVar : hostVarsIter->second) {
auto varIter = vars_.find(hostVar);
if (varIter == vars_.end()) {
LogPrintfError(
"removeFatBinary: Unable to find module 0x%x hostVar 0x%x",
module, hostVar);
LogPrintfError("removeFatBinary: Unable to find module 0x%x hostVar 0x%x", module, hostVar);
} else {
delete varIter->second;
vars_.erase(varIter);
@@ -325,8 +323,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
for (auto& hostFunc : hostFuncsIter->second) {
auto funcIter = functions_.find(hostFunc);
if (funcIter == functions_.end()) {
LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x",
module, hostFunc);
LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x", module,
hostFunc);
} else {
delete funcIter->second;
functions_.erase(funcIter);
@@ -343,8 +341,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
delete moduleIter->second;
modules_.erase(moduleIter);
} else {
LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x",
module, hostModule);
LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x", module,
hostModule);
}
module_to_hostModule_.erase(hostModuleIter);
}
@@ -383,7 +381,7 @@ hipError_t StatCO::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, i
}
// Lazy load
FatBinaryInfo **module = it->second->moduleInfo();
FatBinaryInfo** module = it->second->moduleInfo();
if (*(module) == nullptr) {
amd::ScopedLock lock(sclock_);
if (*(module) == nullptr) {
@@ -405,7 +403,7 @@ hipError_t StatCO::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hos
}
// Lazy load
FatBinaryInfo **module = it->second->moduleInfo();
FatBinaryInfo** module = it->second->moduleInfo();
if (*(module) == nullptr) {
hipError_t err = digestFatBinary(module_to_hostModule_[module], *module);
assert(err == hipSuccess);
@@ -437,7 +435,7 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice
}
// Lazy load
FatBinaryInfo **module = it->second->moduleInfo();
FatBinaryInfo** module = it->second->moduleInfo();
if (*(module) == nullptr) {
hipError_t err = digestFatBinary(module_to_hostModule_[module], *module);
assert(err == hipSuccess);
@@ -464,7 +462,7 @@ hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) {
for (auto& vecIter : managedVars_) {
for (auto& var : vecIter.second) {
// Lazy load
FatBinaryInfo **module = var->moduleInfo();
FatBinaryInfo** module = var->moduleInfo();
if (*(module) == nullptr) {
err = digestFatBinary(module_to_hostModule_[module], *module);
assert(err == hipSuccess);
+3
Vedi File
@@ -200,6 +200,9 @@ class IPCEvent : public Event {
if (!amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t))) {
// print hipErrorInvalidHandle;
}
if (owners == 0) {
amd::Os::shm_unlink(ipc_evt_.ipc_name_);
}
}
}
IPCEvent() : Event(hipEventInterprocess) {}
+6 -5
Vedi File
@@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc
return (cl_event)0;
}
amd::Event* event = new amd::UserEvent(*as_amd(context));
if (event == NULL) {
auto event = new amd::UserEvent(*as_amd(context));
if (event == nullptr || !event->Create()) {
delete event;
*not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
return (cl_event)0;
}
event->retain();
*not_null(errcode_ret) = CL_SUCCESS;
return as_cl(event);
return as_cl(reinterpret_cast<amd::Event*>(event));
}
RUNTIME_EXIT
@@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st
if (execution_status > CL_COMPLETE) {
return CL_INVALID_VALUE;
}
if (!as_amd(event)->setStatus(execution_status)) {
auto user_event = reinterpret_cast<amd::UserEvent*>(as_amd(event));
if (!user_event->SetExecutionStatus(execution_status)) {
return CL_INVALID_OPERATION;
}
return CL_SUCCESS;
+9
Vedi File
@@ -95,6 +95,7 @@ class StreamOperationCommand;
class BatchMemoryOperationCommand;
class VirtualMapCommand;
class ExternalSemaphoreCmd;
class UserEvent;
class Isa;
class Device;
struct KernelParameterDescriptor;
@@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject {
ShouldNotReachHere();
}
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
virtual void ReleaseAllHwQueues() {}
@@ -2037,6 +2039,13 @@ class Device : public RuntimeObject {
return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
}
/// @brief Creates HW user event for OpenCL implementation
/// @return The pointer to a HW event structure, known to the HW backend
virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; }
/// @brief Sets HW user event to the complete status
virtual void SetUserEvent(amd::UserEvent* event) const {}
//! Returns TRUE if the device is available for computations
bool isOnline() const { return online_; }
+39 -29
Vedi File
@@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire, amd::CopyMetadata copyMetadata) const {
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire, amd::CopyMetadata copyMetadata) const {
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
if (dev().settings().blocking_blit_ &&
(setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size,
buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
}
// ================================================================================================
void DmaBlitManager::releaseBuffer(BufferState &buffer) const {
if (buffer.pinnedMem_) {
gpu().addPinnedMem(buffer.pinnedMem_);
@@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata,
bool enablePin) const {
// Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow
gpu().releaseGpuMemoryFence(hostToDev);
gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_);
// If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer
// For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning
bool status = true;
@@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
copyMetadata.isAsync_);
const_address src = static_cast<const_address>(hostSrc) + copyOffset;
status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata);
if (status ) {
// Wait for current signal of previous rocr copy if its not pinned mem
if (status) {
if (outBuffer.pinnedMem_ == nullptr) {
// Wait for current signal of previous rocr copy if its not pinned mem
gpu().Barriers().WaitCurrent();
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
hostDst + copyOffset, stagingBuffer, copysize);
@@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
break;
}
}
// Release Pinned Memory back to pool if any
releaseBuffer(outBuffer);
// Update Offset and Transfer Size
@@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
firstTx = false;
}
// @note: HIP requires a blocking wait on D2H with the pageable system memory
if (amd::IS_HIP && !hostToDev) {
gpu().Barriers().WaitCurrent();
}
if(!status) {
return false;
}
return true;
}
// ================================================================================================
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
: DmaBlitManager(gpu, setup),
@@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
bool result = false;
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
!srcMemory.isCpuUncached())) {
if (dev().settings().blocking_blit_ &&
(setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
!srcMemory.isCpuUncached()))) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
@@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
bool result = false;
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
if (dev().settings().blocking_blit_ &&
(setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
@@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const {
}
// ================================================================================================
bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
bool KernelBlitManager::runScheduler(uint64_t vqVM,
hsa_queue_t* schedulerQueue,
hsa_signal_t& schedulerSignal,
uint threads) {
uint threads, uint64_t aql_wrap) {
size_t globalWorkOffset[1] = {0};
size_t globalWorkSize[1] = {threads};
size_t localWorkSize[1] = {1};
@@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
device::Kernel* devKernel = const_cast<device::Kernel*>(kernels_[Scheduler]->getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
SchedulerParam* sp = reinterpret_cast<SchedulerParam*>(schedulerParam->getHostMem());
auto* sp = reinterpret_cast<SchedulerParam*>(
gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment));
memset(sp, 0, sizeof(SchedulerParam));
Memory* schedulerMem = dev().getRocMemory(schedulerParam);
sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());
sp->kernarg_address = reinterpret_cast<uint64_t>(sp);
sp->thread_counter = 0;
sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
sp->complete_signal = schedulerSignal;
hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr);
sp->vqueue_header = vqVM;
sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
sp->parentAQL = reinterpret_cast<uint64_t>(aql_wrap);
if (dev().info().maxEngineClockFrequency_ > 0) {
sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
// Use a device side global atomics to workaround the reliance of PCIe 3 atomics
sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue);
cl_mem mem = as_cl<amd::Memory>(schedulerParam);
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem);
constexpr bool kDirectVa = true;
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa);
address parameters = captureArguments(kernels_[Scheduler]);
@@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
return false;
}
releaseArguments(parameters);
if (!WaitForSignal(schedulerSignal)) {
// Wait for the scheduler to finish all operations
gpu().WaitCompleteSignal(sp->complete_signal);
// @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper
// value without the wait for all previous commands (see the PCIE3 atomics workaround above).
// The scheduler can enqueue extra commands, but the real queue write index didn't have any
// progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode
// with device enqueue and user events, because device enqueue is blocking below
if (!WaitForSignal(sp->complete_signal)) {
LogWarning("Failed schedulerSignal wait");
return false;
}
return true;
}
@@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager {
) const;
bool runScheduler(uint64_t vqVM,
amd::Memory* schedulerParam,
hsa_queue_t* schedulerQueue,
hsa_signal_t& schedulerSignal,
uint threads);
uint threads, uint64_t aql_wrap);
//! Runs a blit kernel for GWS init
bool RunGwsInit(uint32_t value //!< Initial value for GWS resource
@@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
}
}
// ================================================================================================
bool Device::CreateUserEvent(amd::UserEvent* event) const {
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
if ((signal == nullptr) ||
(HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) {
return false;
}
hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne);
event->SetHwEvent(signal.release());
return true;
}
// ================================================================================================
void Device::SetUserEvent(amd::UserEvent* event) const {
auto signal = reinterpret_cast<ProfilingSignal*>(event->HwEvent());
assert(signal != nullptr && "Can't have user event without hw event!");
hsa_signal_silent_store_relaxed(signal->signal_, 0);
}
// ================================================================================================
bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
// Query ptr type to see if it's a HMM allocation
@@ -450,6 +450,8 @@ class Device : public NullDevice {
uint32_t hip_event_flags = 0) const;
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
virtual void ReleaseGlobalSignal(void* signal) const;
virtual bool CreateUserEvent(amd::UserEvent* event) const;
virtual void SetUserEvent(amd::UserEvent* event) const;
//! Allocate host memory in terms of numa policy set by user
void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
@@ -97,6 +97,8 @@ Settings::Settings() {
limit_blit_wg_ = 16;
dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
// note: OCL user events don't allow CPU blocking calls in DD mode
blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH;
}
// ================================================================================================
@@ -51,7 +51,8 @@ class Settings : public device::Settings {
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
uint dynamic_queues_ : 1; //!< Dynamic queues management
uint reserved_ : 22;
uint blocking_blit_ : 1; //!< Blit ops can be blocking on CPU
uint reserved_ : 21;
};
uint value_;
};
@@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
return false;
}
// ================================================================================================
void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) {
barrier_packet_.dep_signal[0] = signal;
dispatchBarrierPacket(kBarrierPacketHeader, false);
}
// ================================================================================================
void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
hsa_signal_t signal) {
@@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
deviceQueueSize_(0),
maskGroups_(0),
schedulerThreads_(0),
schedulerParam_(nullptr),
schedulerQueue_(nullptr),
schedulerSignal_({0}),
barriers_(*this),
managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
@@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() {
delete printfdbg_;
if (0 != schedulerSignal_.handle) {
hsa_signal_destroy(schedulerSignal_);
}
if (nullptr != schedulerQueue_) {
hsa_queue_destroy(schedulerQueue_);
}
if (nullptr != schedulerParam_) {
schedulerParam_->release();
}
if (nullptr != virtualQueue_) {
virtualQueue_->release();
}
@@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
// If we have host memory, use it
if ((devMemory->owner()->getHostMem() != nullptr) &&
(devMemory->owner()->getSvmPtr() == nullptr)) {
if (!devMemory->isHostMemDirectAccess()) {
if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) {
// Make sure GPU finished operation before synchronization with the backing store
releaseGpuMemoryFence();
}
@@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
// ================================================================================================
bool VirtualGPU::createSchedulerParam()
{
if (nullptr != schedulerParam_) {
if (nullptr != schedulerQueue_) {
return true;
}
while(true) {
schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(),
CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap));
if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) {
break;
}
while (true) {
// The queue is written by multiple threads of the scheduler kernel
if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI,
callbackQueue, &roc_device_, std::numeric_limits<uint>::max(),
@@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam()
break;
}
hsa_signal_t signal0 = {0};
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) {
break;
}
schedulerSignal_ = signal0;
Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
if (nullptr == schedulerMem) {
break;
}
schedulerParam_->setVirtualDevice(this);
return true;
}
if (0 != schedulerSignal_.handle) {
hsa_signal_destroy(schedulerSignal_);
schedulerSignal_.handle = 0;
}
if (nullptr != schedulerQueue_) {
hsa_queue_destroy(schedulerQueue_);
schedulerQueue_ = nullptr;
}
if (nullptr != schedulerParam_) {
schedulerParam_->release();
schedulerParam_ = nullptr;
}
return false;
}
@@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
global[i] = static_cast<uint32_t>(sizes.global()[i]);
local[i] = static_cast<uint16_t>(local_size[i]);
}
uint64_t spVA = 0;
// Check if runtime has to setup hidden arguments
for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
const auto& it = signature.at(i);
@@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
break;
}
case amd::KernelParameterDescriptor::HiddenCompletionAction: {
uint64_t spVA = 0;
if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) {
Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(
reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
if (devKernel->dynamicParallelism()) {
auto params = allocKernArg(sizeof(AmdAqlWrap), 64);
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(params);
memset(wrap, 0, sizeof(AmdAqlWrap));
wrap->state = AQL_WRAP_DONE;
spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
spVA = reinterpret_cast<uint64_t>(wrap);
}
WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_);
break;
@@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
dispatchBarrierPacket(kBarrierPacketHeader, true);
if (virtualQueue_ != nullptr) {
static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
getVQVirtualAddress(), schedulerParam_, schedulerQueue_,
schedulerSignal_, schedulerThreads_);
getVQVirtualAddress(), schedulerQueue_,
schedulerThreads_, spVA);
}
}
@@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
// ================================================================================================
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
if (!AMD_DIRECT_DISPATCH) {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
if (pinnedMems_.size() > 7) {
pinnedMems_.front()->release();
@@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
pinnedMems_.push_back(mem);
}
} else {
mem->release();
if (command_ != nullptr) {
command_->AddPinnedMemory(mem);
} else {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
mem->release();
}
}
}
@@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice {
void* allocKernArg(size_t size, size_t alignment);
bool isFenceDirty() const { return fence_dirty_; }
void setFenceDirty(bool state) { fence_dirty_ = state; }
void HiddenHeapInit();
void WaitCompleteSignal(hsa_signal_t signal);
void HiddenHeapInit();
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
uint64_t getQueueID();
@@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice {
//!< one thread
uint schedulerThreads_; //!< The number of scheduler threads
amd::Memory* schedulerParam_;
hsa_queue_t* schedulerQueue_;
hsa_signal_t schedulerSignal_;
HwQueueTracker barriers_; //!< Tracks active barriers in ROCr
+3
Vedi File
@@ -285,6 +285,9 @@ class Os : AllStatic {
//! Deletes file
static int unlink(const std::string& path);
//! Removes the shared memory object name
static int shm_unlink(const std::string& path);
// Library routines:
//
typedef bool (*SymbolCallback)(std::string, const void*, void*);
+1
Vedi File
@@ -650,6 +650,7 @@ std::string Os::getTempFileName() {
}
int Os::unlink(const std::string& path) { return ::unlink(path.c_str()); }
int Os::shm_unlink(const std::string& path) { return ::shm_unlink(path.c_str()); }
#if defined(ATI_ARCH_X86)
void Os::cpuid(int regs[4], int info) {
+3
Vedi File
@@ -505,6 +505,9 @@ std::string Os::getTempFileName() {
int Os::unlink(const std::string& path) { return ::_unlink(path.c_str()); }
// shm_unlink is a nop on windows
int Os::shm_unlink(const std::string& path) { return 0; }
void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); }
uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); }
+14 -3
Vedi File
@@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
ScopedLock l(notify_lock_);
if ((status() > CL_COMPLETE) && (nullptr != queue) &&
// If HW event was assigned, then notification can be ignored, since a barrier was issued
(HwEvent() == nullptr) &&
// @note: Force the marker always in OCL for now, since OCL events require precise
// sequence of the status update
((HwEvent() == nullptr) || !amd::IS_HIP) &&
!notified_.test_and_set()) {
// Make sure the queue is draining the enqueued commands.
amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
@@ -364,8 +366,17 @@ void Command::enqueue() {
// Notify all commands about the waiter. Barrier will be sent in order to obtain
// HSA signal for a wait on the current queue
for (const auto& event : eventWaitList()) {
event->notifyCmdQueue(!kCpuWait);
for (const auto &event: eventWaitList()) {
if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) {
if (event->status() >= CL_COMPLETE) {
reinterpret_cast<amd::UserEvent*>(event)->AddDependent(this);
} else {
setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
return;
}
} else {
event->notifyCmdQueue(!kCpuWait);
}
}
// The batch update must be lock protected to avoid a race condition
+42 -3
Vedi File
@@ -397,6 +397,9 @@ class Command : public Event {
//! Release the resources associated with this event.
virtual void releaseResources();
//! Empty function for adding pinned memory
virtual void AddPinnedMemory(Memory* pinned) {}
//! Set the next GPU command
void setNext(Command* next) { next_ = next; }
@@ -424,16 +427,46 @@ class Command : public Event {
};
class UserEvent : public Command {
const Context& context_;
const Context& context_; //!< OCL context associated with the event
std::vector<Command*> dependents_; //!< Commands, which depends on this user event
public:
UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) {
setStatus(CL_SUBMITTED);
}
//! Creates a user event in the backend layer
bool Create() {
if (AMD_DIRECT_DISPATCH) {
return context_.devices()[0]->CreateUserEvent(this);
} else {
return true;
}
}
//! Sets the execution status of the user event
bool SetExecutionStatus(cl_int status) {
if (AMD_DIRECT_DISPATCH) {
// If it's invalid status, then mark dependent commands as invalid
if (status < CL_COMPLETE) {
for (auto it : dependents_) {
it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
}
}
dependents_.clear();
context_.devices()[0]->SetUserEvent(this);
}
return setStatus(status);
}
//! Adds dependent commands for the user event
void AddDependent(Command* command) {
dependents_.push_back(command);
}
virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); }
virtual const Context& context() const { return context_; }
const Context& context() const { return context_; }
};
class ClGlEvent : public Command {
@@ -450,7 +483,7 @@ class ClGlEvent : public Command {
bool awaitCompletion() { return waitForFence(); }
virtual const Context& context() const { return context_; }
const Context& context() const { return context_; }
};
inline Command& Event::command() { return *static_cast<Command*>(this); }
@@ -465,6 +498,7 @@ class NDRangeContainer;
class OneMemoryArgCommand : public Command {
protected:
Memory* memory_;
std::vector<Memory*> pinned_memory_; //!< Pinned memory object
public:
OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
@@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command {
memory_->release();
DEBUG_ONLY(memory_ = NULL);
Command::releaseResources();
for (auto it : pinned_memory_) {
it->release();
}
}
//! Adds pinned memory, used in this command for later release
virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); }
bool validateMemory();
bool validatePeerMemory();
};
@@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) {
(vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) {
cpu_wait = true;
}
} else {
// Force CPU wait for OpenCL, since the tests may check OCL command status after finish
cpu_wait = true;
}
size_t batchSize = GetSubmissionBatchSize();