24 #include "hip/hcc_detail/hip_util.h"
25 #include "hip/hcc_detail/staging_buffer.h"
28 #if defined(__HCC__) && (__hcc_workweek__ < 16155)
29 #error("This version of HIP requires a newer version of HCC.");
40 #define USE_PEER_TO_PEER 2
44 #define USE_HCC_LOCK_API 1
52 extern const int release;
54 extern int HIP_LAUNCH_BLOCKING;
56 extern int HIP_PRINT_ENV;
57 extern int HIP_ATP_MARKER;
61 extern int HIP_STAGING_SIZE;
62 extern int HIP_STAGING_BUFFERS;
63 extern int HIP_PININPLACE;
64 extern int HIP_STREAM_SIGNALS;
65 extern int HIP_VISIBLE_DEVICES;
70 extern int HIP_DISABLE_HW_KERNEL_DEP;
71 extern int HIP_DISABLE_HW_COPY_DEP;
73 extern thread_local
int tls_defaultDevice;
74 extern thread_local
hipError_t tls_lastHipError;
80 #define KNRM "\x1B[0m"
81 #define KRED "\x1B[31m"
82 #define KGRN "\x1B[32m"
83 #define KYEL "\x1B[33m"
84 #define KBLU "\x1B[34m"
85 #define KMAG "\x1B[35m"
86 #define KCYN "\x1B[36m"
87 #define KWHT "\x1B[37m"
89 #define API_COLOR KGRN
94 #define STREAM_THREAD_SAFE 1
97 #define DEVICE_THREAD_SAFE 1
106 #define FORCE_SAMEDIR_COPY_DEP 1
111 #define COMPILE_HIP_DB 1
119 #define COMPILE_HIP_TRACE_API 0x3
124 #ifndef COMPILE_HIP_ATP_MARKER
125 #define COMPILE_HIP_ATP_MARKER 0
130 #define ONE_OBJECT_FILE 0
136 #if COMPILE_HIP_ATP_MARKER
137 #include "AMDTActivityLogger.h"
138 #define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString)
141 #define SCOPED_MARKER(markerName,group,userString)
145 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
146 #define API_TRACE(...)\
148 if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\
149 std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\
150 if (COMPILE_HIP_DB && HIP_TRACE_API) {\
151 fprintf (stderr, API_COLOR "<<hip-api: %s\n" KNRM, s.c_str());\
153 SCOPED_MARKER(s.c_str(), "HIP", NULL);\
158 #define API_TRACE(...)
166 #define HIP_INIT_API(...) \
167 std::call_once(hip_initialized, ihipInit);\
168 API_TRACE(__VA_ARGS__);
170 #define ihipLogStatus(_hip_status) \
172 hipError_t _local_hip_status = _hip_status; \
173 tls_lastHipError = _local_hip_status;\
175 if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API) {\
176 fprintf(stderr, " %ship-api: %-30s ret=%2d (%s)>>\n" KNRM, (_local_hip_status == 0) ? API_COLOR:KRED, __func__, _local_hip_status, ihipErrorString(_local_hip_status));\
194 static const char *dbName [] =
205 #define tprintf(trace_level, ...) {\
206 if (HIP_DB & (1<<(trace_level))) {\
207 fprintf (stderr, " %s:", dbName[trace_level]); \
208 fprintf (stderr, __VA_ARGS__);\
209 fprintf (stderr, "%s", KNRM); \
214 #define tprintf(trace_level, ...)
239 const hipStream_t hipStreamNull = 0x0;
251 static const char* ihipCommandName[] = {
252 "CopyH2H",
"CopyH2D",
"CopyD2H",
"CopyD2D",
"CopyP2P",
"Kernel"
257 typedef uint64_t SIGSEQNUM;
265 hsa_signal_t _hsa_signal;
281 bool try_lock() {
return true; }
286 #if STREAM_THREAD_SAFE
287 typedef std::mutex StreamMutex;
289 #warning "Stream thread-safe disabled"
293 #if DEVICE_THREAD_SAFE
294 typedef std::mutex DeviceMutex;
297 #warning "Device thread-safe disabled"
309 _criticalData(&criticalData),
310 _autoUnlock(autoUnlock)
313 _criticalData->_mutex.lock();
319 _criticalData->_mutex.unlock();
325 _criticalData->_mutex.unlock();
329 T *operator->() {
return _criticalData; };
337 template <
typename MUTEX_TYPE>
342 void lock() { _mutex.lock(); }
343 void unlock() { _mutex.unlock(); }
349 template <
typename MUTEX_TYPE>
354 _last_command_type(ihipCommandCopyH2H),
355 _last_copy_signal(NULL),
357 _oldest_live_sig_id(1),
360 _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1);
372 ihipCommand_t _last_command_type;
379 hc::completion_future _last_kernel_future;
383 SIGSEQNUM _oldest_live_sig_id;
384 std::deque<ihipSignal_t> _signalPool;
387 SIGSEQNUM _stream_sig_id;
399 typedef uint64_t SeqNum_t ;
401 ihipStream_t(
unsigned device_index, hc::accelerator_view av,
unsigned int flags);
406 void locked_copySync (
void* dst,
const void* src,
size_t sizeBytes,
unsigned kind);
408 void copyAsync(
void* dst,
const void* src,
size_t sizeBytes,
unsigned kind);
412 bool lockopen_preKernelCommand();
413 void lockclose_postKernelCommand(hc::completion_future &kernel_future);
417 void locked_reclaimSignals(SIGSEQNUM sigNum);
418 void locked_wait(
bool assertQueueEmpty=
false);
427 SIGSEQNUM lastCopySeqId (
LockedAccessor_StreamCrit_t &crit) {
return crit->_last_copy_signal ? crit->_last_copy_signal->_sig_id : 0; };
440 hc::accelerator_view _av;
448 std::vector<hc::completion_future> _depFutures;
451 void enqueueBarrier(hsa_queue_t* queue,
ihipSignal_t *depSignal);
455 unsigned resolveMemcpyDirection(
bool srcTracked,
bool dstTracked,
bool srcInDeviceMem,
bool dstInDeviceMem);
456 void setAsyncCopyAgents(
unsigned kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent);
458 unsigned _device_index;
460 friend std::ostream& operator<<(std::ostream& os,
const ihipStream_t& s);
464 inline std::ostream& operator<<(std::ostream& os,
const ihipStream_t& s)
467 os << s._device_index;
476 enum hipEventStatus_t {
477 hipEventStatusUnitialized = 0,
478 hipEventStatusCreated = 1,
479 hipEventStatusRecording = 2,
480 hipEventStatusRecorded = 3,
486 hipEventStatus_t _state;
491 hc::completion_future _marker;
494 SIGSEQNUM _copy_seq_id;
507 template <
class MUTEX_TYPE>
513 void init(
unsigned deviceCnt) {
514 assert(_peerAgents ==
nullptr);
515 _peerAgents =
new hsa_agent_t[deviceCnt];
519 if (_peerAgents !=
nullptr) {
521 _peerAgents =
nullptr;
526 std::list<ihipStream_t*> &streams() {
return _streams; };
527 const std::list<ihipStream_t*> &const_streams()
const {
return _streams; };
530 ihipStream_t::SeqNum_t incStreamId() {
return _stream_id++; };
540 uint32_t peerCnt()
const {
return _peerCnt; };
541 hsa_agent_t *peerAgents()
const {
return _peerAgents; };
546 std::list< ihipStream_t* > _streams;
547 ihipStream_t::SeqNum_t _stream_id;
551 std::list<ihipDevice_t*> _peers;
553 hsa_agent_t *_peerAgents;
555 void recomputePeerAgents();
575 void init(
unsigned device_index,
unsigned deviceCnt, hc::accelerator &acc,
unsigned flags);
581 void locked_waitAllStreams();
582 void locked_syncDefaultStream(
bool waitOnSelf);
587 unsigned _device_index;
590 hc::accelerator _acc;
591 hsa_agent_t _hsa_agent;
598 unsigned _compute_units;
603 unsigned _device_flags;
618 extern std::once_flag hip_initialized;
620 extern bool g_visible_device;
621 extern unsigned g_deviceCnt;
622 extern std::vector<int> g_hip_visible_devices;
623 extern hsa_agent_t g_cpu_agent ;
632 hc::completion_future ihipMemcpyKernel(hipStream_t, T*,
const T*,
size_t);
635 hc::completion_future ihipMemsetKernel(hipStream_t, T*, T,
size_t);
637 hipStream_t ihipSyncAndResolveStream(hipStream_t);
638 template <
typename T>
640 hc::completion_future
641 ihipMemsetKernel(hipStream_t stream, T * ptr, T val,
size_t sizeBytes)
643 int wg = std::min((
unsigned)8, stream->getDevice()->_compute_units);
644 const int threads_per_wg = 256;
646 int threads = wg * threads_per_wg;
647 if (threads > sizeBytes) {
648 threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
652 hc::extent<1> ext(threads);
653 auto ext_tile = ext.tile(threads_per_wg);
655 hc::completion_future cf =
656 hc::parallel_for_each(
659 [=] (hc::tiled_index<1> idx)
662 int offset = amp_get_global_id(0);
664 int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
666 for (
int i=offset; i<sizeBytes; i+=stride) {
674 template <
typename T>
675 hc::completion_future
676 ihipMemcpyKernel(hipStream_t stream, T * c,
const T * a,
size_t sizeBytes)
678 int wg = std::min((
unsigned)8, stream->getDevice()->_compute_units);
679 const int threads_per_wg = 256;
681 int threads = wg * threads_per_wg;
682 if (threads > sizeBytes) {
683 threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
687 hc::extent<1> ext(threads);
688 auto ext_tile = ext.tile(threads_per_wg);
690 hc::completion_future cf =
691 hc::parallel_for_each(
694 [=] (hc::tiled_index<1> idx)
697 int offset = amp_get_global_id(0);
699 int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
701 for (
int i=offset; i<sizeBytes; i+=stride) {
Definition: hip_hcc.h:571
Definition: hip_hcc.h:338
Definition: hip_hcc.h:277
hipError_t
Definition: hip_runtime_api.h:142
Definition: hip_runtime_api.h:47
Definition: hip_hcc.h:508
Definition: hip_hcc.h:264
Definition: hip_runtime_api.h:74
Definition: staging_buffer.h:40
Definition: hip_hcc.h:485
Definition: hip_hcc.h:217
Definition: hip_hcc.h:397
Definition: hip_hcc.h:350
Definition: hip_hcc.h:305