SWDEV-248499 Fix a crash when printf is used with cooperative kernels
Root cause: the cooperative queue is not inserted into queuePool_ (the HSA queue pool) of the ROC Device class, which causes a crash when creating hostcall buffers for printf. Change-Id: I3f9aceb4e5fe6a7c7a2a549a4bb0a3511fe02799
此提交包含在:
+28
-11
@@ -142,6 +142,7 @@ Device::Device(hsa_agent_t bkendDevice)
|
||||
, vgpusAccess_("Virtual GPU List Ops Lock", true)
|
||||
, hsa_exclusive_gpu_access_(false)
|
||||
, queuePool_(QueuePriority::Total)
|
||||
, coopHostcallBuffer_(nullptr)
|
||||
, numOfVgpus_(0) {
|
||||
group_segment_.handle = 0;
|
||||
system_segment_.handle = 0;
|
||||
@@ -233,6 +234,12 @@ Device::~Device() {
|
||||
}
|
||||
|
||||
delete[] p2p_agents_list_;
|
||||
|
||||
if (coopHostcallBuffer_) {
|
||||
disableHostcalls(coopHostcallBuffer_);
|
||||
context().svmFree(coopHostcallBuffer_);
|
||||
coopHostcallBuffer_ = nullptr;
|
||||
}
|
||||
}
|
||||
bool NullDevice::initCompiler(bool isOffline) {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
@@ -2323,20 +2330,26 @@ void Device::releaseQueue(hsa_queue_t* queue) {
|
||||
hsa_queue_destroy(queue);
|
||||
}
|
||||
|
||||
void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
|
||||
void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue, bool coop_queue) {
|
||||
decltype(queuePool_)::value_type::iterator qIter;
|
||||
for (auto& it : queuePool_) {
|
||||
qIter = it.find(queue);
|
||||
if (qIter != it.end()) {
|
||||
break;
|
||||
|
||||
if (!coop_queue) {
|
||||
for (auto &it : queuePool_) {
|
||||
qIter = it.find(queue);
|
||||
if (qIter != it.end()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert(qIter != queuePool_[QueuePriority::High].end());
|
||||
assert(qIter != queuePool_[QueuePriority::High].end());
|
||||
|
||||
auto& qInfo = qIter->second;
|
||||
if (qInfo.hostcallBuffer_) {
|
||||
return qInfo.hostcallBuffer_;
|
||||
if (qIter->second.hostcallBuffer_) {
|
||||
return qIter->second.hostcallBuffer_;
|
||||
}
|
||||
} else {
|
||||
if (coopHostcallBuffer_) {
|
||||
return coopHostcallBuffer_;
|
||||
}
|
||||
}
|
||||
|
||||
// The number of packets required in each buffer is at least equal to the
|
||||
@@ -2355,7 +2368,11 @@ void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Created hostcall buffer %p for hardware queue %p", buffer,
|
||||
queue);
|
||||
qInfo.hostcallBuffer_ = buffer;
|
||||
if (!coop_queue) {
|
||||
qIter->second.hostcallBuffer_ = buffer;
|
||||
} else {
|
||||
coopHostcallBuffer_ = buffer;
|
||||
}
|
||||
if (!enableHostcalls(buffer, numPackets)) {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to register hostcall buffer %p with listener",
|
||||
buffer);
|
||||
|
||||
@@ -465,7 +465,7 @@ class Device : public NullDevice {
|
||||
|
||||
//! For the given HSA queue, return an existing hostcall buffer or create a
|
||||
//! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
|
||||
void* getOrCreateHostcallBuffer(hsa_queue_t* queue);
|
||||
void* getOrCreateHostcallBuffer(hsa_queue_t* queue, bool coop_queue = false);
|
||||
|
||||
//! Return multi GPU grid launch sync buffer
|
||||
address MGSync() const { return mg_sync_; }
|
||||
@@ -531,6 +531,7 @@ class Device : public NullDevice {
|
||||
//! returns a hsa queue from queuePool with least refCount and updates the refCount as well
|
||||
hsa_queue_t* getQueueFromPool(const uint qIndex);
|
||||
|
||||
void* coopHostcallBuffer_;
|
||||
public:
|
||||
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
|
||||
|
||||
|
||||
@@ -2226,7 +2226,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
}
|
||||
case amd::KernelParameterDescriptor::HiddenHostcallBuffer: {
|
||||
if (amd::IS_HIP) {
|
||||
auto buffer = roc_device_.getOrCreateHostcallBuffer(gpu_queue_);
|
||||
auto buffer = roc_device_.getOrCreateHostcallBuffer(gpu_queue_, coopGroups);
|
||||
if (!buffer) {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_KERN,
|
||||
"Kernel expects a hostcall buffer, but none found");
|
||||
|
||||
新增問題並參考
封鎖使用者