diff --git a/rocclr/runtime/device/rocm/rocblit.cpp b/rocclr/runtime/device/rocm/rocblit.cpp index d369166d51..fa9b3b766c 100644 --- a/rocclr/runtime/device/rocm/rocblit.cpp +++ b/rocclr/runtime/device/rocm/rocblit.cpp @@ -2214,6 +2214,9 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; + // Use device-side global atomics to work around the reliance on PCIe 3 atomics + sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue); + cl_mem mem = as_cl(schedulerParam); setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem); diff --git a/rocclr/runtime/device/rocm/rocsched.hpp b/rocclr/runtime/device/rocm/rocsched.hpp index 7fdd9d7c48..b243cab7f8 100644 --- a/rocclr/runtime/device/rocm/rocsched.hpp +++ b/rocclr/runtime/device/rocm/rocsched.hpp @@ -66,10 +66,8 @@ struct SchedulerParam { uint64_t vqueue_header; uint32_t signal; //!< Signal to stop the child queue uint32_t eng_clk; //!< Engine clock in Mhz - uint32_t releaseHostCP; //!< Releases CP on the host queue uint64_t parentAQL; //!< Host parent AmdAqlWrap packet - uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue - uint32_t reserved[2]; //!< Processed mask groups by one thread + uint64_t write_index; //!< Write Index to the child queue }; }