From aa3989dcd09c66afb580e2b8f734bd4c14bbf421 Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 17 Jan 2019 18:20:24 -0500
Subject: [PATCH] P4 to Git Change 1732168 by jatang@jatang_win_pal_lc on
2019/01/17 17:58:53
SWDEV-172202 - Work around the scheduler for systems that don't support PCIe 3 atomics properly.
The idea is that the scheduler uses a device-side global as the write_index, and only writes the write_index back to the HSA queue when the last thread of the scheduler leaves.
This change, along with the library-side change, has been tested on systems with and without proper PCIe 3 atomics support.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsched.hpp#2 edit
---
rocclr/runtime/device/rocm/rocblit.cpp | 3 +++
rocclr/runtime/device/rocm/rocsched.hpp | 4 +---
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/rocclr/runtime/device/rocm/rocblit.cpp b/rocclr/runtime/device/rocm/rocblit.cpp
index d369166d51..fa9b3b766c 100644
--- a/rocclr/runtime/device/rocm/rocblit.cpp
+++ b/rocclr/runtime/device/rocm/rocblit.cpp
@@ -2214,6 +2214,9 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
+ // Use a device-side global atomic to work around the reliance on PCIe 3 atomics
+ sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue);
+
cl_mem mem = as_cl(schedulerParam);
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem);
diff --git a/rocclr/runtime/device/rocm/rocsched.hpp b/rocclr/runtime/device/rocm/rocsched.hpp
index 7fdd9d7c48..b243cab7f8 100644
--- a/rocclr/runtime/device/rocm/rocsched.hpp
+++ b/rocclr/runtime/device/rocm/rocsched.hpp
@@ -66,10 +66,8 @@ struct SchedulerParam {
uint64_t vqueue_header;
uint32_t signal; //!< Signal to stop the child queue
uint32_t eng_clk; //!< Engine clock in Mhz
- uint32_t releaseHostCP; //!< Releases CP on the host queue
uint64_t parentAQL; //!< Host parent AmdAqlWrap packet
- uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue
- uint32_t reserved[2]; //!< Processed mask groups by one thread
+ uint64_t write_index; //!< Write Index to the child queue
};
}