P4 to Git Change 1732168 by jatang@jatang_win_pal_lc on 2019/01/17 17:58:53
SWDEV-172202 - Work around the scheduler for systems that don't support PCIe 3 atomics properly. The idea is that the scheduler uses a device-side global as the write_index, and only writes the write_index back to the HSA queue when the last thread of the scheduler leaves. This change, along with the library-side change, has been tested on systems with and without proper PCIe 3 atomics support. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#29 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsched.hpp#2 edit
Cette révision appartient à :
@@ -2214,6 +2214,9 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
|
||||
sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
|
||||
sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
|
||||
|
||||
// Use a device-side global atomic to work around the reliance on PCIe 3 atomics
|
||||
sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue);
|
||||
|
||||
cl_mem mem = as_cl<amd::Memory>(schedulerParam);
|
||||
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &mem);
|
||||
|
||||
|
||||
@@ -66,10 +66,8 @@ struct SchedulerParam {
|
||||
uint64_t vqueue_header;
|
||||
uint32_t signal; //!< Signal to stop the child queue
|
||||
uint32_t eng_clk; //!< Engine clock in Mhz
|
||||
uint32_t releaseHostCP; //!< Releases CP on the host queue
|
||||
uint64_t parentAQL; //!< Host parent AmdAqlWrap packet
|
||||
uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue
|
||||
uint32_t reserved[2]; //!< Processed mask groups by one thread
|
||||
uint64_t write_index; //!< Write Index to the child queue
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur