24 #include "hip/hcc_detail/hip_util.h"
25 #include "hip/hcc_detail/staging_buffer.h"
29 #if defined(__HCC__) && (__hcc_workweek__ < 1502)
30 #error("This version of HIP requires a newer version of HCC.");
41 #define USE_PEER_TO_PEER 0
44 #define USE_HCC_LOCK 0
53 extern const int release;
55 extern int HIP_LAUNCH_BLOCKING;
57 extern int HIP_PRINT_ENV;
58 extern int HIP_ATP_MARKER;
62 extern int HIP_STAGING_SIZE;
63 extern int HIP_STAGING_BUFFERS;
64 extern int HIP_PININPLACE;
65 extern int HIP_STREAM_SIGNALS;
66 extern int HIP_VISIBLE_DEVICES;
71 extern int HIP_DISABLE_HW_KERNEL_DEP;
72 extern int HIP_DISABLE_HW_COPY_DEP;
// Per-thread state (thread_local). These are declarations only; the definitions
// are not part of this header.
74 extern thread_local
int tls_defaultDevice;
// Per-thread record of the most recent HIP status; the ihipLogStatus macro in
// this file assigns to it.
75 extern thread_local
hipError_t tls_lastHipError;
// ANSI terminal escape sequences used to colour the diagnostic text this file
// prints to stderr. KNRM resets attributes; the others select the foreground
// colour named by the macro (red, green, yellow, blue, magenta, cyan, white).
81 #define KNRM "\x1B[0m"
82 #define KRED "\x1B[31m"
83 #define KGRN "\x1B[32m"
84 #define KYEL "\x1B[33m"
85 #define KBLU "\x1B[34m"
86 #define KMAG "\x1B[35m"
87 #define KCYN "\x1B[36m"
88 #define KWHT "\x1B[37m"
// Colour used by the API trace output (API_TRACE, and ihipLogStatus when the
// status is 0).
90 #define API_COLOR KGRN
// When non-zero, StreamMutex is a typedef for std::mutex (see the
// #if STREAM_THREAD_SAFE block later in this file).
97 #define STREAM_THREAD_SAFE 1
// When non-zero, DeviceMutex is a typedef for std::mutex (see the
// #if DEVICE_THREAD_SAFE block later in this file).
100 #define DEVICE_THREAD_SAFE 1
// NOTE(review): no use of this flag is visible in this header; presumably it
// forces a dependency between copies in the same direction - confirm at its
// use sites.
109 #define FORCE_SAMEDIR_COPY_DEP 1
// Compile-time gate for debug output: API_TRACE only prints when
// (COMPILE_HIP_DB && HIP_TRACE_API) is true.
114 #define COMPILE_HIP_DB 1
// Bitmask. Bit 0x1 enables the full API_TRACE definition; bit 0x2 is tested by
// ihipLogStatus before it prints the return status.
122 #define COMPILE_HIP_TRACE_API 0x3
127 #ifndef COMPILE_HIP_ATP_MARKER
128 #define COMPILE_HIP_ATP_MARKER 0
133 #define ONE_OBJECT_FILE 0
139 #if COMPILE_HIP_ATP_MARKER
140 #include "AMDTActivityLogger.h"
141 #define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString)
144 #define SCOPED_MARKER(markerName,group,userString)
148 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
149 #define API_TRACE(...)\
151 if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\
152 std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\
153 if (COMPILE_HIP_DB && HIP_TRACE_API) {\
154 fprintf (stderr, API_COLOR "<<hip-api: %s\n" KNRM, s.c_str());\
156 SCOPED_MARKER(s.c_str(), "HIP", NULL);\
161 #define API_TRACE(...)
// Runs ihipInit through std::call_once on the global hip_initialized flag (so
// it executes at most once), then forwards the macro arguments to API_TRACE.
// Expands to two separate statements, not a single do/while-wrapped one, so
// use it as a standalone statement rather than as the unbraced body of an
// if/else.
169 #define HIP_INIT_API(...) \
170 std::call_once(hip_initialized, ihipInit);\
171 API_TRACE(__VA_ARGS__);
173 #define ihipLogStatus(_hip_status) \
175 hipError_t _local_hip_status = _hip_status; \
176 tls_lastHipError = _local_hip_status;\
178 if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API) {\
179 fprintf(stderr, " %ship-api: %-30s ret=%2d (%s)>>\n" KNRM, (_local_hip_status == 0) ? API_COLOR:KRED, __func__, _local_hip_status, ihipErrorString(_local_hip_status));\
197 static const char *dbName [] =
208 #define tprintf(trace_level, ...) {\
209 if (HIP_DB & (1<<(trace_level))) {\
210 fprintf (stderr, " %s:", dbName[trace_level]); \
211 fprintf (stderr, __VA_ARGS__);\
212 fprintf (stderr, "%s", KNRM); \
217 #define tprintf(trace_level, ...)
// Null stream handle (value 0x0).
// NOTE(review): presumably callers pass this to mean "the default stream" -
// confirm against ihipSyncAndResolveStream.
242 const hipStream_t hipStreamNull = 0x0;
253 static const char* ihipCommandName[] = {
254 "CopyH2H",
"CopyH2D",
"CopyD2H",
"CopyD2D",
"Kernel"
// Unsigned 64-bit sequence-number type. In this file it is the type of
// _oldest_live_sig_id, _stream_sig_id and _copy_seq_id, and the return type of
// lastCopySeqId().
259 typedef uint64_t SIGSEQNUM;
267 hsa_signal_t _hsa_signal;
// Always reports success; no locking is performed by this function.
283 bool try_lock() {
return true; }
288 #if STREAM_THREAD_SAFE
289 typedef std::mutex StreamMutex;
291 #warning "Stream thread-safe disabled"
295 #if DEVICE_THREAD_SAFE
296 typedef std::mutex DeviceMutex;
299 #warning "Device thread-safe disabled"
311 _criticalData(&criticalData),
312 _autoUnlock(autoUnlock)
315 _criticalData->_mutex.lock();
321 _criticalData->_mutex.unlock();
327 _criticalData->_mutex.unlock();
// Member access to the wrapped object: returns the _criticalData pointer that
// was captured at construction.
331 T *operator->() {
return _criticalData; };
339 template <
typename MUTEX_TYPE>
// lock()/unlock() forward directly to the _mutex member.
344 void lock() { _mutex.lock(); }
345 void unlock() { _mutex.unlock(); }
351 template <
typename MUTEX_TYPE>
356 _last_command_type(ihipCommandCopyH2H),
357 _last_copy_signal(NULL),
359 _oldest_live_sig_id(1),
362 _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1);
374 ihipCommand_t _last_command_type;
381 hc::completion_future _last_kernel_future;
385 SIGSEQNUM _oldest_live_sig_id;
386 std::deque<ihipSignal_t> _signalPool;
389 SIGSEQNUM _stream_sig_id;
// Unsigned 64-bit type for stream ids; it is the type of _stream_id and the
// return type of incStreamId() elsewhere in this file.
401 typedef uint64_t SeqNum_t ;
403 ihipStream_t(
unsigned device_index, hc::accelerator_view av,
unsigned int flags);
408 void locked_copySync (
void* dst,
const void* src,
size_t sizeBytes,
unsigned kind);
410 void copyAsync(
void* dst,
const void* src,
size_t sizeBytes,
unsigned kind);
414 bool lockopen_preKernelCommand();
415 void lockclose_postKernelCommand(hc::completion_future &kernel_future);
419 void locked_reclaimSignals(SIGSEQNUM sigNum);
420 void locked_wait(
bool assertQueueEmpty=
false);
// Returns the _sig_id of the signal pointed to by crit->_last_copy_signal, or
// 0 when that pointer is null.
// crit: accessor to the stream's critical data; it is only read here.
429 SIGSEQNUM lastCopySeqId (
LockedAccessor_StreamCrit_t &crit) {
return crit->_last_copy_signal ? crit->_last_copy_signal->_sig_id : 0; };
442 hc::accelerator_view _av;
449 void enqueueBarrier(hsa_queue_t* queue,
ihipSignal_t *depSignal);
453 unsigned resolveMemcpyDirection(
bool srcInDeviceMem,
bool dstInDeviceMem);
454 void setCopyAgents(
unsigned kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent);
456 unsigned _device_index;
458 friend std::ostream& operator<<(std::ostream& os,
const ihipStream_t& s);
462 inline std::ostream& operator<<(std::ostream& os,
const ihipStream_t& s)
465 os << s._device_index;
474 enum hipEventStatus_t {
475 hipEventStatusUnitialized = 0,
476 hipEventStatusCreated = 1,
477 hipEventStatusRecording = 2,
478 hipEventStatusRecorded = 3,
484 hipEventStatus_t _state;
489 hc::completion_future _marker;
492 SIGSEQNUM _copy_seq_id;
505 template <
class MUTEX_TYPE>
511 void init(
unsigned deviceCnt) {
512 assert(_peerAgents ==
nullptr);
513 _peerAgents =
new hsa_agent_t[deviceCnt];
517 if (_peerAgents !=
nullptr) {
519 _peerAgents =
nullptr;
// Mutable reference to the _streams list; callers can modify the list through it.
524 std::list<ihipStream_t*> &streams() {
return _streams; };
// Read-only view of the same _streams list, callable on a const object.
525 const std::list<ihipStream_t*> &const_streams()
const {
return _streams; };
// Post-increment: returns the current _stream_id and then advances it by one.
528 ihipStream_t::SeqNum_t incStreamId() {
return _stream_id++; };
// Returns the stored _peerCnt value.
537 uint32_t peerCnt()
const {
return _peerCnt; };
// Returns the raw _peerAgents pointer. init() allocates that array with
// new hsa_agent_t[deviceCnt]; the pointer is nullptr before init() runs.
// The caller does not receive ownership.
538 hsa_agent_t *peerAgents()
const {
return _peerAgents; };
542 std::list<ihipStream_t*> _streams;
543 ihipStream_t::SeqNum_t _stream_id;
546 std::list<ihipDevice_t*> _peers;
548 hsa_agent_t *_peerAgents;
550 void recomputePeerAgents();
570 void init(
unsigned device_index,
unsigned deviceCnt, hc::accelerator &acc,
unsigned flags);
576 void locked_waitAllStreams();
577 void locked_syncDefaultStream(
bool waitOnSelf);
582 unsigned _device_index;
585 hc::accelerator _acc;
586 hsa_agent_t _hsa_agent;
593 unsigned _compute_units;
598 unsigned _device_flags;
613 extern std::once_flag hip_initialized;
615 extern bool g_visible_device;
616 extern unsigned g_deviceCnt;
617 extern std::vector<int> g_hip_visible_devices;
618 extern hsa_agent_t g_cpu_agent ;
627 hc::completion_future ihipMemcpyKernel(hipStream_t, T*,
const T*,
size_t);
630 hc::completion_future ihipMemsetKernel(hipStream_t, T*, T,
size_t);
632 hipStream_t ihipSyncAndResolveStream(hipStream_t);
633 template <
typename T>
635 hc::completion_future
636 ihipMemsetKernel(hipStream_t stream, T * ptr, T val,
size_t sizeBytes)
638 int wg = std::min((
unsigned)8, stream->getDevice()->_compute_units);
639 const int threads_per_wg = 256;
641 int threads = wg * threads_per_wg;
642 if (threads > sizeBytes) {
643 threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
647 hc::extent<1> ext(threads);
648 auto ext_tile = ext.tile(threads_per_wg);
650 hc::completion_future cf =
651 hc::parallel_for_each(
654 [=] (hc::tiled_index<1> idx)
657 int offset = amp_get_global_id(0);
659 int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
661 for (
int i=offset; i<sizeBytes; i+=stride) {
669 template <
typename T>
670 hc::completion_future
671 ihipMemcpyKernel(hipStream_t stream, T * c,
const T * a,
size_t sizeBytes)
673 int wg = std::min((
unsigned)8, stream->getDevice()->_compute_units);
674 const int threads_per_wg = 256;
676 int threads = wg * threads_per_wg;
677 if (threads > sizeBytes) {
678 threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
682 hc::extent<1> ext(threads);
683 auto ext_tile = ext.tile(threads_per_wg);
685 hc::completion_future cf =
686 hc::parallel_for_each(
689 [=] (hc::tiled_index<1> idx)
692 int offset = amp_get_global_id(0);
694 int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
696 for (
int i=offset; i<sizeBytes; i+=stride) {
Definition: hip_hcc.h:566
Definition: hip_hcc.h:340
Definition: hip_hcc.h:279
hipError_t
Definition: hip_runtime_api.h:142
Definition: hip_runtime_api.h:47
Definition: hip_hcc.h:506
Definition: hip_hcc.h:266
Definition: hip_runtime_api.h:74
Definition: staging_buffer.h:40
Definition: hip_hcc.h:483
Definition: hip_hcc.h:220
Definition: hip_hcc.h:399
Definition: hip_hcc.h:352
Definition: hip_hcc.h:307