Merge commit '2305f8ae565389cfc348a747c467c004909d7a90' into develop
This commit is contained in:
@@ -781,6 +781,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const
|
||||
return (__bf16)a + (__bf16)b;
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
* \brief Adds two bfloat16 values, will not fuse into fma
|
||||
*/
|
||||
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd_rn(const __hip_bfloat16 a,
|
||||
const __hip_bfloat16 b) {
|
||||
#pragma clang fp contract(off)
|
||||
return (__bf16)a + (__bf16)b;
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
* \brief Subtracts two bfloat16 values
|
||||
@@ -789,6 +799,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const
|
||||
return (__bf16)a - (__bf16)b;
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
* \brief Subtracts two bfloat16 values, will not fuse into fma
|
||||
*/
|
||||
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub_rn(const __hip_bfloat16 a,
|
||||
const __hip_bfloat16 b) {
|
||||
#pragma clang fp contract(off)
|
||||
return (__bf16)a - (__bf16)b;
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
* \brief Divides two bfloat16 values
|
||||
@@ -815,6 +835,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const
|
||||
return (__bf16)a * (__bf16)b;
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
* \brief Multiplies two bfloat16 values, will not fuse into fma
|
||||
*/
|
||||
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul_rn(const __hip_bfloat16 a,
|
||||
const __hip_bfloat16 b) {
|
||||
#pragma clang fp contract(off)
|
||||
return (__bf16)a * (__bf16)b;
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
* \brief Negate a bfloat16 value
|
||||
@@ -861,6 +891,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a,
|
||||
return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
|
||||
* \brief Adds two bfloat162 values, will not fuse into fma
|
||||
*/
|
||||
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a,
|
||||
const __hip_bfloat162 b) {
|
||||
#pragma clang fp contract(off)
|
||||
return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
|
||||
* \brief Performs FMA of given bfloat162 values
|
||||
@@ -879,6 +919,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a,
|
||||
return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)};
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
|
||||
* \brief Multiplies two bfloat162 values, will not fuse into fma
|
||||
*/
|
||||
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2_rn(const __hip_bfloat162 a,
|
||||
const __hip_bfloat162 b) {
|
||||
#pragma clang fp contract(off)
|
||||
return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)};
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
|
||||
* \brief Converts a bfloat162 into negative
|
||||
@@ -896,6 +946,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a,
|
||||
return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)};
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
|
||||
* \brief Subtracts two bfloat162 values, will not fuse into fma
|
||||
*/
|
||||
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2_rn(const __hip_bfloat162 a,
|
||||
const __hip_bfloat162 b) {
|
||||
#pragma clang fp contract(off)
|
||||
return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)};
|
||||
}
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
* \brief Operator to multiply two __hip_bfloat16 numbers
|
||||
|
||||
@@ -1367,6 +1367,15 @@ THE SOFTWARE.
|
||||
return __half_raw{
|
||||
static_cast<__half_raw>(x).data +
|
||||
static_cast<__half_raw>(y).data};
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half __hadd_rn(__half x, __half y)
|
||||
{
|
||||
#pragma clang fp contract(off)
|
||||
return __half_raw{
|
||||
static_cast<__half_raw>(x).data +
|
||||
static_cast<__half_raw>(y).data};
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
@@ -1390,6 +1399,15 @@ THE SOFTWARE.
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half __hsub_rn(__half x, __half y)
|
||||
{
|
||||
#pragma clang fp contract(off)
|
||||
return __half_raw{
|
||||
static_cast<__half_raw>(x).data -
|
||||
static_cast<__half_raw>(y).data};
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half __hmul(__half x, __half y)
|
||||
{
|
||||
return __half_raw{
|
||||
@@ -1398,6 +1416,15 @@ THE SOFTWARE.
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half __hmul_rn(__half x, __half y)
|
||||
{
|
||||
#pragma clang fp contract(off)
|
||||
return __half_raw{
|
||||
static_cast<__half_raw>(x).data *
|
||||
static_cast<__half_raw>(y).data};
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half __hadd_sat(__half x, __half y)
|
||||
{
|
||||
return __clamp_01(__hadd(x, y));
|
||||
@@ -1446,6 +1473,16 @@ THE SOFTWARE.
|
||||
static_cast<__half2_raw>(x).data +
|
||||
static_cast<__half2_raw>(y).data};
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half2 __hadd2_rn(__half2 x, __half2 y)
|
||||
{
|
||||
#pragma clang fp contract(off)
|
||||
return __half2{
|
||||
static_cast<__half2_raw>(x).data +
|
||||
static_cast<__half2_raw>(y).data};
|
||||
}
|
||||
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half2 __habs2(__half2 x)
|
||||
@@ -1462,6 +1499,15 @@ THE SOFTWARE.
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half2 __hsub2_rn(__half2 x, __half2 y)
|
||||
{
|
||||
#pragma clang fp contract(off)
|
||||
return __half2{
|
||||
static_cast<__half2_raw>(x).data -
|
||||
static_cast<__half2_raw>(y).data};
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half2 __hmul2(__half2 x, __half2 y)
|
||||
{
|
||||
return __half2{
|
||||
@@ -1470,6 +1516,15 @@ THE SOFTWARE.
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half2 __hmul2_rn(__half2 x, __half2 y)
|
||||
{
|
||||
#pragma clang fp contract(off)
|
||||
return __half2{
|
||||
static_cast<__half2_raw>(x).data *
|
||||
static_cast<__half2_raw>(y).data};
|
||||
}
|
||||
inline
|
||||
__HOST_DEVICE__
|
||||
__half2 __hadd2_sat(__half2 x, __half2 y)
|
||||
{
|
||||
auto r = static_cast<__half2_raw>(__hadd2(x, y));
|
||||
|
||||
@@ -51,8 +51,8 @@ THE SOFTWARE.
|
||||
#include <type_traits>
|
||||
#endif // defined(__HIPCC_RTC__)
|
||||
|
||||
template <class T, int N> constexpr __hip_internal::size_t __hip_vec_align_v() {
|
||||
return (N == 4 && alignof(T) == 8) ? 16 : N * alignof(T);
|
||||
template <class T, int _hip_N> constexpr __hip_internal::size_t __hip_vec_align_v() {
|
||||
return (_hip_N == 4 && alignof(T) == 8) ? 16 : _hip_N * alignof(T);
|
||||
}
|
||||
|
||||
template <typename T, unsigned int n> struct HIP_vector_base;
|
||||
|
||||
@@ -199,16 +199,16 @@ struct integer_sequence {
|
||||
template<size_t... Ints>
|
||||
using index_sequence = integer_sequence<size_t, Ints...>;
|
||||
|
||||
template<size_t N, size_t... Ints>
|
||||
struct make_index_sequence_impl : make_index_sequence_impl<N - 1, N - 1, Ints...> {};
|
||||
template <size_t _hip_N, size_t... Ints>
|
||||
struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};
|
||||
|
||||
template<size_t... Ints>
|
||||
struct make_index_sequence_impl<0, Ints...> {
|
||||
using type = index_sequence<Ints...>;
|
||||
};
|
||||
|
||||
template<size_t N>
|
||||
using make_index_sequence = typename make_index_sequence_impl<N>::type;
|
||||
template <size_t _hip_N>
|
||||
using make_index_sequence = typename make_index_sequence_impl<_hip_N>::type;
|
||||
|
||||
template <size_t... Ints>
|
||||
constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {
|
||||
|
||||
@@ -284,9 +284,7 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
|
||||
for (auto& hostVar : hostVarsIter->second) {
|
||||
auto varIter = vars_.find(hostVar);
|
||||
if (varIter == vars_.end()) {
|
||||
LogPrintfError(
|
||||
"removeFatBinary: Unable to find module 0x%x hostVar 0x%x",
|
||||
module, hostVar);
|
||||
LogPrintfError("removeFatBinary: Unable to find module 0x%x hostVar 0x%x", module, hostVar);
|
||||
} else {
|
||||
delete varIter->second;
|
||||
vars_.erase(varIter);
|
||||
@@ -325,8 +323,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
|
||||
for (auto& hostFunc : hostFuncsIter->second) {
|
||||
auto funcIter = functions_.find(hostFunc);
|
||||
if (funcIter == functions_.end()) {
|
||||
LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x",
|
||||
module, hostFunc);
|
||||
LogPrintfError("removeFatBinary: Unable to find module 0x%x hostFunc 0x%x", module,
|
||||
hostFunc);
|
||||
} else {
|
||||
delete funcIter->second;
|
||||
functions_.erase(funcIter);
|
||||
@@ -343,8 +341,8 @@ hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
|
||||
delete moduleIter->second;
|
||||
modules_.erase(moduleIter);
|
||||
} else {
|
||||
LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x",
|
||||
module, hostModule);
|
||||
LogPrintfError("removeFatBinary: Unable to find module 0x%x via hostModule 0x%x", module,
|
||||
hostModule);
|
||||
}
|
||||
module_to_hostModule_.erase(hostModuleIter);
|
||||
}
|
||||
@@ -383,7 +381,7 @@ hipError_t StatCO::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, i
|
||||
}
|
||||
|
||||
// Lazy load
|
||||
FatBinaryInfo **module = it->second->moduleInfo();
|
||||
FatBinaryInfo** module = it->second->moduleInfo();
|
||||
if (*(module) == nullptr) {
|
||||
amd::ScopedLock lock(sclock_);
|
||||
if (*(module) == nullptr) {
|
||||
@@ -405,7 +403,7 @@ hipError_t StatCO::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hos
|
||||
}
|
||||
|
||||
// Lazy load
|
||||
FatBinaryInfo **module = it->second->moduleInfo();
|
||||
FatBinaryInfo** module = it->second->moduleInfo();
|
||||
if (*(module) == nullptr) {
|
||||
hipError_t err = digestFatBinary(module_to_hostModule_[module], *module);
|
||||
assert(err == hipSuccess);
|
||||
@@ -437,7 +435,7 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice
|
||||
}
|
||||
|
||||
// Lazy load
|
||||
FatBinaryInfo **module = it->second->moduleInfo();
|
||||
FatBinaryInfo** module = it->second->moduleInfo();
|
||||
if (*(module) == nullptr) {
|
||||
hipError_t err = digestFatBinary(module_to_hostModule_[module], *module);
|
||||
assert(err == hipSuccess);
|
||||
@@ -464,7 +462,7 @@ hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) {
|
||||
for (auto& vecIter : managedVars_) {
|
||||
for (auto& var : vecIter.second) {
|
||||
// Lazy load
|
||||
FatBinaryInfo **module = var->moduleInfo();
|
||||
FatBinaryInfo** module = var->moduleInfo();
|
||||
if (*(module) == nullptr) {
|
||||
err = digestFatBinary(module_to_hostModule_[module], *module);
|
||||
assert(err == hipSuccess);
|
||||
|
||||
@@ -200,6 +200,9 @@ class IPCEvent : public Event {
|
||||
if (!amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t))) {
|
||||
// print hipErrorInvalidHandle;
|
||||
}
|
||||
if (owners == 0) {
|
||||
amd::Os::shm_unlink(ipc_evt_.ipc_name_);
|
||||
}
|
||||
}
|
||||
}
|
||||
IPCEvent() : Event(hipEventInterprocess) {}
|
||||
|
||||
@@ -251,15 +251,16 @@ RUNTIME_ENTRY_RET(cl_event, clCreateUserEvent, (cl_context context, cl_int* errc
|
||||
return (cl_event)0;
|
||||
}
|
||||
|
||||
amd::Event* event = new amd::UserEvent(*as_amd(context));
|
||||
if (event == NULL) {
|
||||
auto event = new amd::UserEvent(*as_amd(context));
|
||||
if (event == nullptr || !event->Create()) {
|
||||
delete event;
|
||||
*not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
|
||||
return (cl_event)0;
|
||||
}
|
||||
|
||||
event->retain();
|
||||
*not_null(errcode_ret) = CL_SUCCESS;
|
||||
return as_cl(event);
|
||||
return as_cl(reinterpret_cast<amd::Event*>(event));
|
||||
}
|
||||
RUNTIME_EXIT
|
||||
|
||||
@@ -288,8 +289,8 @@ RUNTIME_ENTRY(cl_int, clSetUserEventStatus, (cl_event event, cl_int execution_st
|
||||
if (execution_status > CL_COMPLETE) {
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
|
||||
if (!as_amd(event)->setStatus(execution_status)) {
|
||||
auto user_event = reinterpret_cast<amd::UserEvent*>(as_amd(event));
|
||||
if (!user_event->SetExecutionStatus(execution_status)) {
|
||||
return CL_INVALID_OPERATION;
|
||||
}
|
||||
return CL_SUCCESS;
|
||||
|
||||
@@ -95,6 +95,7 @@ class StreamOperationCommand;
|
||||
class BatchMemoryOperationCommand;
|
||||
class VirtualMapCommand;
|
||||
class ExternalSemaphoreCmd;
|
||||
class UserEvent;
|
||||
class Isa;
|
||||
class Device;
|
||||
struct KernelParameterDescriptor;
|
||||
@@ -1329,6 +1330,7 @@ class VirtualDevice : public amd::HeapObject {
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
|
||||
virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }
|
||||
|
||||
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
|
||||
virtual void ReleaseAllHwQueues() {}
|
||||
@@ -2037,6 +2039,13 @@ class Device : public RuntimeObject {
|
||||
return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
|
||||
}
|
||||
|
||||
/// @brief Creates HW user event for OpenCL implementation
|
||||
/// @return The pointer to a HW event structure, known to the HW backend
|
||||
virtual bool CreateUserEvent(amd::UserEvent* event) const { return false; }
|
||||
|
||||
/// @brief Sets HW user event to the complete status
|
||||
virtual void SetUserEvent(amd::UserEvent* event) const {}
|
||||
|
||||
//! Returns TRUE if the device is available for computations
|
||||
bool isOnline() const { return online_; }
|
||||
|
||||
|
||||
@@ -54,8 +54,8 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire, amd::CopyMetadata copyMetadata) const {
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBuffer_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
|
||||
if (dev().settings().blocking_blit_ && (setup_.disableReadBuffer_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached()))) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
|
||||
@@ -138,8 +138,9 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire, amd::CopyMetadata copyMetadata) const {
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap()) {
|
||||
if (dev().settings().blocking_blit_ &&
|
||||
(setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap())) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
|
||||
@@ -685,6 +686,7 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size,
|
||||
buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void DmaBlitManager::releaseBuffer(BufferState &buffer) const {
|
||||
if (buffer.pinnedMem_) {
|
||||
gpu().addPinnedMem(buffer.pinnedMem_);
|
||||
@@ -696,7 +698,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
|
||||
size_t size, bool hostToDev, amd::CopyMetadata& copyMetadata,
|
||||
bool enablePin) const {
|
||||
// Do not skip wait here for D2H. Resolving dependent signals for SDMA engine is slow
|
||||
gpu().releaseGpuMemoryFence(hostToDev);
|
||||
gpu().releaseGpuMemoryFence(hostToDev || !dev().settings().blocking_blit_);
|
||||
// If Pinning is enabled, Pin host Memory for copy size > MinSizeForPinnedTransfer
|
||||
// For 16KB < size <= MinSizeForPinnedTransfer Use staging buffer without pinning
|
||||
bool status = true;
|
||||
@@ -740,9 +742,9 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
|
||||
copyMetadata.isAsync_);
|
||||
const_address src = static_cast<const_address>(hostSrc) + copyOffset;
|
||||
status = rocrCopyBuffer(stagingBuffer, dstAgent, src , srcAgent, copysize, copyMetadata);
|
||||
if (status ) {
|
||||
// Wait for current signal of previous rocr copy if its not pinned mem
|
||||
if (status) {
|
||||
if (outBuffer.pinnedMem_ == nullptr) {
|
||||
// Wait for current signal of previous rocr copy if its not pinned mem
|
||||
gpu().Barriers().WaitCurrent();
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
|
||||
hostDst + copyOffset, stagingBuffer, copysize);
|
||||
@@ -752,6 +754,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Release Pinned Memory back to pool if any
|
||||
releaseBuffer(outBuffer);
|
||||
// Update Offset and Transfer Size
|
||||
@@ -760,12 +763,18 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
|
||||
firstTx = false;
|
||||
}
|
||||
|
||||
// @note: HIP requires a blocking wait on D2H with the pageable system memory
|
||||
if (amd::IS_HIP && !hostToDev) {
|
||||
gpu().Barriers().WaitCurrent();
|
||||
}
|
||||
|
||||
if(!status) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
|
||||
: DmaBlitManager(gpu, setup),
|
||||
@@ -1718,8 +1727,9 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
bool result = false;
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
|
||||
!srcMemory.isCpuUncached())) {
|
||||
if (dev().settings().blocking_blit_ &&
|
||||
(setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() &&
|
||||
!srcMemory.isCpuUncached()))) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
|
||||
@@ -1854,8 +1864,9 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
bool result = false;
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap()) {
|
||||
if (dev().settings().blocking_blit_ &&
|
||||
(setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap())) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
|
||||
@@ -2718,10 +2729,9 @@ void KernelBlitManager::releaseArguments(address args) const {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
|
||||
bool KernelBlitManager::runScheduler(uint64_t vqVM,
|
||||
hsa_queue_t* schedulerQueue,
|
||||
hsa_signal_t& schedulerSignal,
|
||||
uint threads) {
|
||||
uint threads, uint64_t aql_wrap) {
|
||||
size_t globalWorkOffset[1] = {0};
|
||||
size_t globalWorkSize[1] = {threads};
|
||||
size_t localWorkSize[1] = {1};
|
||||
@@ -2731,21 +2741,16 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
|
||||
device::Kernel* devKernel = const_cast<device::Kernel*>(kernels_[Scheduler]->getDeviceKernel(dev()));
|
||||
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
|
||||
|
||||
SchedulerParam* sp = reinterpret_cast<SchedulerParam*>(schedulerParam->getHostMem());
|
||||
auto* sp = reinterpret_cast<SchedulerParam*>(
|
||||
gpu().allocKernArg(sizeof(SchedulerParam), kCBAlignment));
|
||||
memset(sp, 0, sizeof(SchedulerParam));
|
||||
|
||||
Memory* schedulerMem = dev().getRocMemory(schedulerParam);
|
||||
sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());
|
||||
sp->kernarg_address = reinterpret_cast<uint64_t>(sp);
|
||||
sp->thread_counter = 0;
|
||||
sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
|
||||
sp->complete_signal = schedulerSignal;
|
||||
|
||||
hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
|
||||
|
||||
|
||||
sp->complete_signal = gpu().Barriers().ActiveSignal(kInitSignalValueOne, nullptr);
|
||||
sp->vqueue_header = vqVM;
|
||||
|
||||
sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
|
||||
sp->parentAQL = reinterpret_cast<uint64_t>(aql_wrap);
|
||||
|
||||
if (dev().info().maxEngineClockFrequency_ > 0) {
|
||||
sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
|
||||
@@ -2754,8 +2759,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
|
||||
// Use a device side global atomics to workaround the reliance of PCIe 3 atomics
|
||||
sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue);
|
||||
|
||||
cl_mem mem = as_cl<amd::Memory>(schedulerParam);
|
||||
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem);
|
||||
constexpr bool kDirectVa = true;
|
||||
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), sp, 0, nullptr, kDirectVa);
|
||||
|
||||
address parameters = captureArguments(kernels_[Scheduler]);
|
||||
|
||||
@@ -2764,12 +2769,17 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
|
||||
return false;
|
||||
}
|
||||
releaseArguments(parameters);
|
||||
|
||||
if (!WaitForSignal(schedulerSignal)) {
|
||||
// Wait for the scheduler to finish all operations
|
||||
gpu().WaitCompleteSignal(sp->complete_signal);
|
||||
// @note: A wait shouldn't be really necessary, but the queue write_index may not get a proper
|
||||
// value without the wait for all previous commands (see the PCIE3 atomics workaround above).
|
||||
// The scheduler can enqueue extra commands, but the real queue write index didn't have any
|
||||
// progress. That leads to hangs and requires blocking. Then the wait causes problems in DD mode
|
||||
// with device enqueue and user events, because device enqueue is blocking below
|
||||
if (!WaitForSignal(sp->complete_signal)) {
|
||||
LogWarning("Failed schedulerSignal wait");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -492,10 +492,8 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
) const;
|
||||
|
||||
bool runScheduler(uint64_t vqVM,
|
||||
amd::Memory* schedulerParam,
|
||||
hsa_queue_t* schedulerQueue,
|
||||
hsa_signal_t& schedulerSignal,
|
||||
uint threads);
|
||||
uint threads, uint64_t aql_wrap);
|
||||
|
||||
//! Runs a blit kernel for GWS init
|
||||
bool RunGwsInit(uint32_t value //!< Initial value for GWS resource
|
||||
|
||||
@@ -3464,6 +3464,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Device::CreateUserEvent(amd::UserEvent* event) const {
|
||||
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
|
||||
if ((signal == nullptr) ||
|
||||
(HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal->signal_))) {
|
||||
return false;
|
||||
}
|
||||
hsa_signal_silent_store_relaxed(signal->signal_, kInitSignalValueOne);
|
||||
event->SetHwEvent(signal.release());
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Device::SetUserEvent(amd::UserEvent* event) const {
|
||||
auto signal = reinterpret_cast<ProfilingSignal*>(event->HwEvent());
|
||||
assert(signal != nullptr && "Can't have user event without hw event!");
|
||||
hsa_signal_silent_store_relaxed(signal->signal_, 0);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
|
||||
// Query ptr type to see if it's a HMM allocation
|
||||
|
||||
@@ -450,6 +450,8 @@ class Device : public NullDevice {
|
||||
uint32_t hip_event_flags = 0) const;
|
||||
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
|
||||
virtual void ReleaseGlobalSignal(void* signal) const;
|
||||
virtual bool CreateUserEvent(amd::UserEvent* event) const;
|
||||
virtual void SetUserEvent(amd::UserEvent* event) const;
|
||||
|
||||
//! Allocate host memory in terms of numa policy set by user
|
||||
void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
|
||||
|
||||
@@ -97,6 +97,8 @@ Settings::Settings() {
|
||||
limit_blit_wg_ = 16;
|
||||
|
||||
dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
|
||||
// note: OCL user events don't allow CPU blocking calls in DD mode
|
||||
blocking_blit_ = amd::IS_HIP || !AMD_DIRECT_DISPATCH;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
|
||||
@@ -51,7 +51,8 @@ class Settings : public device::Settings {
|
||||
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
|
||||
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
|
||||
uint dynamic_queues_ : 1; //!< Dynamic queues management
|
||||
uint reserved_ : 22;
|
||||
uint blocking_blit_ : 1; //!< Blit ops can be blocking on CPU
|
||||
uint reserved_ : 21;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -1209,6 +1209,12 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
|
||||
return false;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) {
|
||||
barrier_packet_.dep_signal[0] = signal;
|
||||
dispatchBarrierPacket(kBarrierPacketHeader, false);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
hsa_signal_t signal) {
|
||||
@@ -1413,9 +1419,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
deviceQueueSize_(0),
|
||||
maskGroups_(0),
|
||||
schedulerThreads_(0),
|
||||
schedulerParam_(nullptr),
|
||||
schedulerQueue_(nullptr),
|
||||
schedulerSignal_({0}),
|
||||
barriers_(*this),
|
||||
managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
|
||||
managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
|
||||
@@ -1490,18 +1494,10 @@ VirtualGPU::~VirtualGPU() {
|
||||
|
||||
delete printfdbg_;
|
||||
|
||||
if (0 != schedulerSignal_.handle) {
|
||||
hsa_signal_destroy(schedulerSignal_);
|
||||
}
|
||||
|
||||
if (nullptr != schedulerQueue_) {
|
||||
hsa_queue_destroy(schedulerQueue_);
|
||||
}
|
||||
|
||||
if (nullptr != schedulerParam_) {
|
||||
schedulerParam_->release();
|
||||
}
|
||||
|
||||
if (nullptr != virtualQueue_) {
|
||||
virtualQueue_->release();
|
||||
}
|
||||
@@ -2604,7 +2600,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
|
||||
// If we have host memory, use it
|
||||
if ((devMemory->owner()->getHostMem() != nullptr) &&
|
||||
(devMemory->owner()->getSvmPtr() == nullptr)) {
|
||||
if (!devMemory->isHostMemDirectAccess()) {
|
||||
if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) {
|
||||
// Make sure GPU finished operation before synchronization with the backing store
|
||||
releaseGpuMemoryFence();
|
||||
}
|
||||
@@ -3081,18 +3077,11 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::createSchedulerParam()
|
||||
{
|
||||
if (nullptr != schedulerParam_) {
|
||||
if (nullptr != schedulerQueue_) {
|
||||
return true;
|
||||
}
|
||||
|
||||
while(true) {
|
||||
schedulerParam_ = new (dev().context()) amd::Buffer(dev().context(),
|
||||
CL_MEM_ALLOC_HOST_PTR, sizeof(SchedulerParam) + sizeof(AmdAqlWrap));
|
||||
|
||||
if ((nullptr != schedulerParam_) && !schedulerParam_->create(nullptr)) {
|
||||
break;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
// The queue is written by multiple threads of the scheduler kernel
|
||||
if (HSA_STATUS_SUCCESS != hsa_queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI,
|
||||
callbackQueue, &roc_device_, std::numeric_limits<uint>::max(),
|
||||
@@ -3100,39 +3089,14 @@ bool VirtualGPU::createSchedulerParam()
|
||||
break;
|
||||
}
|
||||
|
||||
hsa_signal_t signal0 = {0};
|
||||
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &signal0)) {
|
||||
break;
|
||||
}
|
||||
|
||||
schedulerSignal_ = signal0;
|
||||
|
||||
Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
|
||||
|
||||
if (nullptr == schedulerMem) {
|
||||
break;
|
||||
}
|
||||
|
||||
schedulerParam_->setVirtualDevice(this);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (0 != schedulerSignal_.handle) {
|
||||
hsa_signal_destroy(schedulerSignal_);
|
||||
schedulerSignal_.handle = 0;
|
||||
}
|
||||
|
||||
if (nullptr != schedulerQueue_) {
|
||||
hsa_queue_destroy(schedulerQueue_);
|
||||
schedulerQueue_ = nullptr;
|
||||
}
|
||||
|
||||
if (nullptr != schedulerParam_) {
|
||||
schedulerParam_->release();
|
||||
schedulerParam_ = nullptr;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3355,7 +3319,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
global[i] = static_cast<uint32_t>(sizes.global()[i]);
|
||||
local[i] = static_cast<uint16_t>(local_size[i]);
|
||||
}
|
||||
|
||||
uint64_t spVA = 0;
|
||||
// Check if runtime has to setup hidden arguments
|
||||
for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
|
||||
const auto& it = signature.at(i);
|
||||
@@ -3415,14 +3379,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
break;
|
||||
}
|
||||
case amd::KernelParameterDescriptor::HiddenCompletionAction: {
|
||||
uint64_t spVA = 0;
|
||||
if (nullptr != schedulerParam_ && devKernel->dynamicParallelism()) {
|
||||
Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
|
||||
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(
|
||||
reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
|
||||
if (devKernel->dynamicParallelism()) {
|
||||
auto params = allocKernArg(sizeof(AmdAqlWrap), 64);
|
||||
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(params);
|
||||
memset(wrap, 0, sizeof(AmdAqlWrap));
|
||||
wrap->state = AQL_WRAP_DONE;
|
||||
spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
|
||||
spVA = reinterpret_cast<uint64_t>(wrap);
|
||||
}
|
||||
WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_);
|
||||
break;
|
||||
@@ -3651,8 +3613,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
dispatchBarrierPacket(kBarrierPacketHeader, true);
|
||||
if (virtualQueue_ != nullptr) {
|
||||
static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
|
||||
getVQVirtualAddress(), schedulerParam_, schedulerQueue_,
|
||||
schedulerSignal_, schedulerThreads_);
|
||||
getVQVirtualAddress(), schedulerQueue_,
|
||||
schedulerThreads_, spVA);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3833,10 +3795,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
|
||||
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
|
||||
//! unconditionally, before it can release pinned memory
|
||||
releaseGpuMemoryFence();
|
||||
if (!AMD_DIRECT_DISPATCH) {
|
||||
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
|
||||
//! unconditionally, before it can release pinned memory
|
||||
releaseGpuMemoryFence();
|
||||
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
|
||||
if (pinnedMems_.size() > 7) {
|
||||
pinnedMems_.front()->release();
|
||||
@@ -3847,7 +3809,14 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
|
||||
pinnedMems_.push_back(mem);
|
||||
}
|
||||
} else {
|
||||
mem->release();
|
||||
if (command_ != nullptr) {
|
||||
command_->AddPinnedMemory(mem);
|
||||
} else {
|
||||
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
|
||||
//! unconditionally, before it can release pinned memory
|
||||
releaseGpuMemoryFence();
|
||||
mem->release();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -450,8 +450,9 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void* allocKernArg(size_t size, size_t alignment);
|
||||
bool isFenceDirty() const { return fence_dirty_; }
|
||||
void setFenceDirty(bool state) { fence_dirty_ = state; }
|
||||
void HiddenHeapInit();
|
||||
void WaitCompleteSignal(hsa_signal_t signal);
|
||||
|
||||
void HiddenHeapInit();
|
||||
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
|
||||
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
|
||||
uint64_t getQueueID();
|
||||
@@ -592,9 +593,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//!< one thread
|
||||
uint schedulerThreads_; //!< The number of scheduler threads
|
||||
|
||||
amd::Memory* schedulerParam_;
|
||||
hsa_queue_t* schedulerQueue_;
|
||||
hsa_signal_t schedulerSignal_;
|
||||
|
||||
HwQueueTracker barriers_; //!< Tracks active barriers in ROCr
|
||||
|
||||
|
||||
@@ -285,6 +285,9 @@ class Os : AllStatic {
|
||||
//! Deletes file
|
||||
static int unlink(const std::string& path);
|
||||
|
||||
//! Removes the shared memory object name
|
||||
static int shm_unlink(const std::string& path);
|
||||
|
||||
// Library routines:
|
||||
//
|
||||
typedef bool (*SymbolCallback)(std::string, const void*, void*);
|
||||
|
||||
@@ -650,6 +650,7 @@ std::string Os::getTempFileName() {
|
||||
}
|
||||
|
||||
int Os::unlink(const std::string& path) { return ::unlink(path.c_str()); }
|
||||
int Os::shm_unlink(const std::string& path) { return ::shm_unlink(path.c_str()); }
|
||||
|
||||
#if defined(ATI_ARCH_X86)
|
||||
void Os::cpuid(int regs[4], int info) {
|
||||
|
||||
@@ -505,6 +505,9 @@ std::string Os::getTempFileName() {
|
||||
|
||||
int Os::unlink(const std::string& path) { return ::_unlink(path.c_str()); }
|
||||
|
||||
// shm_unlink is a nop on windows
|
||||
int Os::shm_unlink(const std::string& path) { return 0; }
|
||||
|
||||
void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); }
|
||||
|
||||
uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); }
|
||||
|
||||
@@ -269,7 +269,9 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
|
||||
ScopedLock l(notify_lock_);
|
||||
if ((status() > CL_COMPLETE) && (nullptr != queue) &&
|
||||
// If HW event was assigned, then notification can be ignored, since a barrier was issued
|
||||
(HwEvent() == nullptr) &&
|
||||
// @note: Force the marker always in OCL for now, since OCL events require precise
|
||||
// sequence of the status update
|
||||
((HwEvent() == nullptr) || !amd::IS_HIP) &&
|
||||
!notified_.test_and_set()) {
|
||||
// Make sure the queue is draining the enqueued commands.
|
||||
amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
|
||||
@@ -364,8 +366,17 @@ void Command::enqueue() {
|
||||
|
||||
// Notify all commands about the waiter. Barrier will be sent in order to obtain
|
||||
// HSA signal for a wait on the current queue
|
||||
for (const auto& event : eventWaitList()) {
|
||||
event->notifyCmdQueue(!kCpuWait);
|
||||
for (const auto &event: eventWaitList()) {
|
||||
if (!amd::IS_HIP && event->command().type() == CL_COMMAND_USER) {
|
||||
if (event->status() >= CL_COMPLETE) {
|
||||
reinterpret_cast<amd::UserEvent*>(event)->AddDependent(this);
|
||||
} else {
|
||||
setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
event->notifyCmdQueue(!kCpuWait);
|
||||
}
|
||||
}
|
||||
|
||||
// The batch update must be lock protected to avoid a race condition
|
||||
|
||||
@@ -397,6 +397,9 @@ class Command : public Event {
|
||||
//! Release the resources associated with this event.
|
||||
virtual void releaseResources();
|
||||
|
||||
//! Empty function for adding pinned memory
|
||||
virtual void AddPinnedMemory(Memory* pinned) {}
|
||||
|
||||
//! Set the next GPU command
|
||||
void setNext(Command* next) { next_ = next; }
|
||||
|
||||
@@ -424,16 +427,46 @@ class Command : public Event {
|
||||
};
|
||||
|
||||
class UserEvent : public Command {
|
||||
const Context& context_;
|
||||
const Context& context_; //!< OCL context associated with the event
|
||||
std::vector<Command*> dependents_; //!< Commands, which depends on this user event
|
||||
|
||||
public:
|
||||
UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) {
|
||||
setStatus(CL_SUBMITTED);
|
||||
}
|
||||
|
||||
//! Creates a user event in the backend layer
|
||||
bool Create() {
|
||||
if (AMD_DIRECT_DISPATCH) {
|
||||
return context_.devices()[0]->CreateUserEvent(this);
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
//! Sets the execution status of the user event
|
||||
bool SetExecutionStatus(cl_int status) {
|
||||
if (AMD_DIRECT_DISPATCH) {
|
||||
// If it's invalid status, then mark dependent commands as invalid
|
||||
if (status < CL_COMPLETE) {
|
||||
for (auto it : dependents_) {
|
||||
it->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
|
||||
}
|
||||
}
|
||||
dependents_.clear();
|
||||
context_.devices()[0]->SetUserEvent(this);
|
||||
}
|
||||
return setStatus(status);
|
||||
}
|
||||
|
||||
//! Adds dependent commands for the user event
|
||||
void AddDependent(Command* command) {
|
||||
dependents_.push_back(command);
|
||||
}
|
||||
|
||||
virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); }
|
||||
|
||||
virtual const Context& context() const { return context_; }
|
||||
const Context& context() const { return context_; }
|
||||
};
|
||||
|
||||
class ClGlEvent : public Command {
|
||||
@@ -450,7 +483,7 @@ class ClGlEvent : public Command {
|
||||
|
||||
bool awaitCompletion() { return waitForFence(); }
|
||||
|
||||
virtual const Context& context() const { return context_; }
|
||||
const Context& context() const { return context_; }
|
||||
};
|
||||
|
||||
inline Command& Event::command() { return *static_cast<Command*>(this); }
|
||||
@@ -465,6 +498,7 @@ class NDRangeContainer;
|
||||
class OneMemoryArgCommand : public Command {
|
||||
protected:
|
||||
Memory* memory_;
|
||||
std::vector<Memory*> pinned_memory_; //!< Pinned memory object
|
||||
|
||||
public:
|
||||
OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
|
||||
@@ -477,8 +511,13 @@ class OneMemoryArgCommand : public Command {
|
||||
memory_->release();
|
||||
DEBUG_ONLY(memory_ = NULL);
|
||||
Command::releaseResources();
|
||||
for (auto it : pinned_memory_) {
|
||||
it->release();
|
||||
}
|
||||
}
|
||||
|
||||
//! Adds pinned memory, used in this command for later release
|
||||
virtual void AddPinnedMemory(Memory* pinned) override { pinned_memory_.push_back(pinned); }
|
||||
bool validateMemory();
|
||||
bool validatePeerMemory();
|
||||
};
|
||||
|
||||
@@ -173,6 +173,9 @@ void HostQueue::finish(bool cpu_wait) {
|
||||
(vdev()->QueuedAsyncHandlers().load() > DEBUG_HIP_BLOCK_SYNC)) {
|
||||
cpu_wait = true;
|
||||
}
|
||||
} else {
|
||||
// Force CPU wait for OpenCL, since the tests may check OCL command status after finish
|
||||
cpu_wait = true;
|
||||
}
|
||||
|
||||
size_t batchSize = GetSubmissionBatchSize();
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user