HIP: Heterogenous-computing Interface for Portability
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
hip_hcc.h
1 /*
2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
3 Permission is hereby granted, free of charge, to any person obtaining a copy
4 of this software and associated documentation files (the "Software"), to deal
5 in the Software without restriction, including without limitation the rights
6 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 copies of the Software, and to permit persons to whom the Software is
8 furnished to do so, subject to the following conditions:
9 The above copyright notice and this permission notice shall be included in
10 all copies or substantial portions of the Software.
11 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
12 IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13 FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
15 LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
17 THE SOFTWARE.
18 */
19 
20 #ifndef HIP_HCC_H
21 #define HIP_HCC_H
22 
23 #include <hc.hpp>
24 #include "hip/hcc_detail/hip_util.h"
25 #include "hip/hcc_detail/staging_buffer.h"
26 
27 #define HIP_HCC
28 
29 #if defined(__HCC__) && (__hcc_workweek__ < 1502)
30 #error("This version of HIP requires a newer version of HCC.");
31 #endif
32 
33 // #define USE_MEMCPYTOSYMBOL
34 //
35 //Use the new HCC accelerator_view::copy instead of am_copy
36 #define USE_AV_COPY 0
37 
38 // Compile peer-to-peer support.
39 // >= 2 : use HCC hc:accelerator::get_is_peer
40 // >= 3 : use hc::am_memtracker_update_peers(...)
41 #define USE_PEER_TO_PEER 0
42 
43 // Use new lock API in HCC:
44 #define USE_HCC_LOCK 0
45 
46 //#define INLINE static inline
47 
48 //---
49 // Environment variables:
50 
51 // Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release.
52 //static const int debug = 0;
53 extern const int release;
54 
55 extern int HIP_LAUNCH_BLOCKING;
56 
57 extern int HIP_PRINT_ENV;
58 extern int HIP_ATP_MARKER;
59 //extern int HIP_TRACE_API;
60 extern int HIP_ATP;
61 extern int HIP_DB;
62 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
63 extern int HIP_STAGING_BUFFERS; // TODO - remove, two buffers should be enough.
64 extern int HIP_PININPLACE;
65 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
66 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
67 
68 
69 //---
70 // Chicken bits for disabling functionality to work around potential issues:
71 extern int HIP_DISABLE_HW_KERNEL_DEP;
72 extern int HIP_DISABLE_HW_COPY_DEP;
73 
74 extern thread_local int tls_defaultDevice;
75 extern thread_local hipError_t tls_lastHipError;
76 class ihipStream_t;
77 class ihipDevice_t;
78 
79 
80 // Color defs for debug messages:
81 #define KNRM "\x1B[0m"
82 #define KRED "\x1B[31m"
83 #define KGRN "\x1B[32m"
84 #define KYEL "\x1B[33m"
85 #define KBLU "\x1B[34m"
86 #define KMAG "\x1B[35m"
87 #define KCYN "\x1B[36m"
88 #define KWHT "\x1B[37m"
89 
90 #define API_COLOR KGRN
91 
92 
93 #define HIP_HCC
94 
95 // If set, thread-safety is enforced on all stream functions.
96 // Stream functions will acquire a mutex before entering critical sections.
97 #define STREAM_THREAD_SAFE 1
98 
99 
100 #define DEVICE_THREAD_SAFE 1
101 
102 // If FORCE_COPY_DEP=1 , HIP runtime will add
103 // synchronization for copy commands in the same stream, regardless of command type.
104 // If FORCE_COPY_DEP=0 data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered.
105 // ROCR runtime implementation currently provides this guarantee when using SDMA queues but not
106 // when using shader queues.
107 // TODO - measure if this matters for performance, in particular for back-to-back small copies.
108 // If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands.
109 #define FORCE_SAMEDIR_COPY_DEP 1
110 
111 
112 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
113 // May be set to 0 to remove debug if checks - possible code size and performance difference?
114 #define COMPILE_HIP_DB 1
115 
116 
117 // Compile HIP tracing capability.
118 // 0x1 = print a string at function entry with arguments.
119 // 0x2 = prints a simple message with function name + return code when function exits.
120 // 0x3 = print both.
121 // Must be enabled at runtime with HIP_TRACE_API
122 #define COMPILE_HIP_TRACE_API 0x3
123 
124 
125 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
126 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
127 #ifndef COMPILE_HIP_ATP_MARKER
128 #define COMPILE_HIP_ATP_MARKER 0
129 #endif
130 
131 
132 // #include CPP files to produce one object file
133 #define ONE_OBJECT_FILE 0
134 
135 
136 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary.
137 // TODO - currently we print the trace message at the beginning. if we waited, we could also include return codes, and any values returned
138 // through ptr-to-args (ie the pointers allocated by hipMalloc).
139 #if COMPILE_HIP_ATP_MARKER
140 #include "AMDTActivityLogger.h"
141 #define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString)
142 #else
143 // Swallow scoped markers:
144 #define SCOPED_MARKER(markerName,group,userString)
145 #endif
146 
147 
148 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
149 #define API_TRACE(...)\
150 {\
151  if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\
152  std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\
153  if (COMPILE_HIP_DB && HIP_TRACE_API) {\
154  fprintf (stderr, API_COLOR "<<hip-api: %s\n" KNRM, s.c_str());\
155  }\
156  SCOPED_MARKER(s.c_str(), "HIP", NULL);\
157  }\
158 }
159 #else
160 // Swallow API_TRACE
161 #define API_TRACE(...)
162 #endif
163 
164 
165 
166 // This macro should be called at the beginning of every HIP API.
167 // It initialies the hip runtime (exactly once), and
168 // generate trace string that can be output to stderr or to ATP file.
169 #define HIP_INIT_API(...) \
170  std::call_once(hip_initialized, ihipInit);\
171  API_TRACE(__VA_ARGS__);
172 
173 #define ihipLogStatus(_hip_status) \
174  ({\
175  hipError_t _local_hip_status = _hip_status; /*local copy so _hip_status only evaluated once*/ \
176  tls_lastHipError = _local_hip_status;\
177  \
178  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API) {\
179  fprintf(stderr, " %ship-api: %-30s ret=%2d (%s)>>\n" KNRM, (_local_hip_status == 0) ? API_COLOR:KRED, __func__, _local_hip_status, ihipErrorString(_local_hip_status));\
180  }\
181  _local_hip_status;\
182  })
183 
184 
185 
186 
187 //---
188 //HIP_DB Debug flags:
189 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
190 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
191 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
192 #define DB_COPY1 3 /* 0x08 - trace memory copy commands. . */
193 #define DB_SIGNAL 4 /* 0x10 - trace signal pool commands */
194 #define DB_COPY2 5 /* 0x20 - trace memory copy commands. Detailed. */
195 // When adding a new debug flag, also add to the char name table below.
196 
197 static const char *dbName [] =
198 {
199  KNRM "hip-api", // not used,
200  KYEL "hip-sync",
201  KCYN "hip-mem",
202  KMAG "hip-copy1",
203  KRED "hip-signal",
204  KNRM "hip-copy2",
205 };
206 
207 #if COMPILE_HIP_DB
208 #define tprintf(trace_level, ...) {\
209  if (HIP_DB & (1<<(trace_level))) {\
210  fprintf (stderr, " %s:", dbName[trace_level]); \
211  fprintf (stderr, __VA_ARGS__);\
212  fprintf (stderr, "%s", KNRM); \
213  }\
214 }
215 #else
216 /* Compile to empty code */
217 #define tprintf(trace_level, ...)
218 #endif
219 
220 class ihipException : public std::exception
221 {
222 public:
223  ihipException(hipError_t e) : _code(e) {};
224 
225  hipError_t _code;
226 };
227 
228 
229 #ifdef __cplusplus
230 extern "C" {
231 #endif
232 
233 typedef class ihipStream_t* hipStream_t;
234 //typedef struct hipEvent_t {
235 // struct ihipEvent_t *_handle;
236 //} hipEvent_t;
237 
238 #ifdef __cplusplus
239 }
240 #endif
241 
242 const hipStream_t hipStreamNull = 0x0;
243 
244 
245 enum ihipCommand_t {
246  ihipCommandCopyH2H,
247  ihipCommandCopyH2D,
248  ihipCommandCopyD2H,
249  ihipCommandCopyD2D,
250  ihipCommandKernel,
251 };
252 
253 static const char* ihipCommandName[] = {
254  "CopyH2H", "CopyH2D", "CopyD2H", "CopyD2D", "Kernel"
255 };
256 
257 
258 
259 typedef uint64_t SIGSEQNUM;
260 
261 //---
262 // Small wrapper around signals.
263 // Designed to be used from stream.
264 // TODO-someday refactor this class so it can be stored in a vector<>
265 // we already store the index here so we can use for garbage collection.
266 struct ihipSignal_t {
267  hsa_signal_t _hsa_signal; // hsa signal handle
268  int _index; // Index in pool, used for garbage collection.
269  SIGSEQNUM _sig_id; // unique sequentially increasing ID.
270 
271  ihipSignal_t();
272  ~ihipSignal_t();
273 
274  void release();
275 };
276 
277 
278 // Used to remove lock, for performance or stimulating bugs.
280 {
281  public:
282  void lock() { }
283  bool try_lock() {return true; }
284  void unlock() { }
285 };
286 
287 
288 #if STREAM_THREAD_SAFE
289 typedef std::mutex StreamMutex;
290 #else
291 #warning "Stream thread-safe disabled"
292 typedef FakeMutex StreamMutex;
293 #endif
294 
295 #if DEVICE_THREAD_SAFE
296 typedef std::mutex DeviceMutex;
297 #else
298 typedef FakeMutex DeviceMutex;
299 #warning "Device thread-safe disabled"
300 #endif
301 
302 //
303 //---
304 // Protects access to the member _data with a lock acquired on contruction/destruction.
305 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
306 template<typename T>
308 {
309 public:
310  LockedAccessor(T &criticalData, bool autoUnlock=true) :
311  _criticalData(&criticalData),
312  _autoUnlock(autoUnlock)
313 
314  {
315  _criticalData->_mutex.lock();
316  };
317 
318  ~LockedAccessor()
319  {
320  if (_autoUnlock) {
321  _criticalData->_mutex.unlock();
322  }
323  }
324 
325  void unlock()
326  {
327  _criticalData->_mutex.unlock();
328  }
329 
330  // Syntactic sugar so -> can be used to get the underlying type.
331  T *operator->() { return _criticalData; };
332 
333 private:
334  T *_criticalData;
335  bool _autoUnlock;
336 };
337 
338 
339 template <typename MUTEX_TYPE>
340 struct LockedBase {
341 
342  // Experts-only interface for explicit locking.
343  // Most uses should use the lock-accessor.
344  void lock() { _mutex.lock(); }
345  void unlock() { _mutex.unlock(); }
346 
347  MUTEX_TYPE _mutex;
348 };
349 
350 
351 template <typename MUTEX_TYPE>
352 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
353 {
354 public:
356  _last_command_type(ihipCommandCopyH2H),
357  _last_copy_signal(NULL),
358  _signalCursor(0),
359  _oldest_live_sig_id(1),
360  _stream_sig_id(0)
361  {
362  _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1);
363  };
364 
366  _signalPool.clear();
367  }
368 
370 
371 
372 public:
373  // Critical Data:
374  ihipCommand_t _last_command_type; // type of the last command
375 
376  // signal of last copy command sent to the stream.
377  // May be NULL, indicating the previous command has completley finished and future commands don't need to create a dependency.
378  // Copy can be either H2D or D2H.
379  ihipSignal_t *_last_copy_signal;
380 
381  hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU.
382 
383  // Signal pool:
384  int _signalCursor;
385  SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated.
386  std::deque<ihipSignal_t> _signalPool; // Pool of signals for use by this stream.
387 
388 
389  SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id.
390 };
391 
392 
395 
396 
397 
398 // Internal stream structure.
400 public:
401 typedef uint64_t SeqNum_t ;
402 
403  ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags);
404  ~ihipStream_t();
405 
406  // kind is hipMemcpyKind
407  void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind);
408  void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind);
409 
410  void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
411 
412  //---
413  // Thread-safe accessors - these acquire / release mutex:
414  bool lockopen_preKernelCommand();
415  void lockclose_postKernelCommand(hc::completion_future &kernel_future);
416 
417  int preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
418 
419  void locked_reclaimSignals(SIGSEQNUM sigNum);
420  void locked_wait(bool assertQueueEmpty=false);
421  SIGSEQNUM locked_lastCopySeqId() {LockedAccessor_StreamCrit_t crit(_criticalData); return lastCopySeqId(crit); };
422 
423  // Use this if we already have the stream critical data mutex:
424  void wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
425 
426 
427 
428  // Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function.
429  SIGSEQNUM lastCopySeqId (LockedAccessor_StreamCrit_t &crit) { return crit->_last_copy_signal ? crit->_last_copy_signal->_sig_id : 0; };
430  ihipSignal_t * allocSignal (LockedAccessor_StreamCrit_t &crit);
431 
432 
433  //-- Non-racy accessors:
434  // These functions access fields set at initialization time and are non-racy (so do not acquire mutex)
435  ihipDevice_t * getDevice() const;
436 
437 
438 public:
439  //---
440  //Public member vars - these are set at initialization and never change:
441  SeqNum_t _id; // monotonic sequence ID
442  hc::accelerator_view _av;
443  unsigned _flags;
444 
445 private: // Critical Data. THis MUST be accessed through LockedAccessor_StreamCrit_t
446  ihipStreamCritical_t _criticalData;
447 
448 private:
449  void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal);
450  void waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal);
451 
452  // The unsigned return is hipMemcpyKind
453  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
454  void setCopyAgents(unsigned kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent);
455 
456  unsigned _device_index; // index into the g_device array
457 
458  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
459 };
460 
461 
462 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s)
463 {
464  os << "stream#";
465  os << s._device_index;
466  os << '.';
467  os << s._id;
468  return os;
469 }
470 
471 
472 //----
473 // Internal event structure:
474 enum hipEventStatus_t {
475  hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
476  hipEventStatusCreated = 1,
477  hipEventStatusRecording = 2, // event has been enqueued to record something.
478  hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid.
479 } ;
480 
481 
482 // internal hip event structure.
483 struct ihipEvent_t {
484  hipEventStatus_t _state;
485 
486  hipStream_t _stream; // Stream where the event is recorded, or NULL if all streams.
487  unsigned _flags;
488 
489  hc::completion_future _marker;
490  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
491 
492  SIGSEQNUM _copy_seq_id;
493 } ;
494 
495 
496 
497 
498 
499 //---
500 // Data that must be protected with thread-safe access
501 // All members are private - this class must be accessed through friend LockedAccessor which
502 // will lock the mutex on construction and unlock on destruction.
503 //
504 // MUTEX_TYPE is template argument so can easily convert to FakeMutex for performance or stress testing.
505 template <class MUTEX_TYPE>
507 {
508 public:
509  ihipDeviceCriticalBase_t() : _stream_id(0), _peerAgents(nullptr) {};
510 
511  void init(unsigned deviceCnt) {
512  assert(_peerAgents == nullptr);
513  _peerAgents = new hsa_agent_t[deviceCnt];
514  };
515 
517  if (_peerAgents != nullptr) {
518  delete _peerAgents;
519  _peerAgents = nullptr;
520  }
521  }
522  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
523 
524  std::list<ihipStream_t*> &streams() { return _streams; };
525  const std::list<ihipStream_t*> &const_streams() const { return _streams; };
526 
527  // "Allocate" a stream ID:
528  ihipStream_t::SeqNum_t incStreamId() { return _stream_id++; };
529 
530  bool addPeer(ihipDevice_t *peer);
531  bool removePeer(ihipDevice_t *peer);
532  void resetPeers(ihipDevice_t *thisDevice);
533 
534 
535  void addStream(ihipStream_t *stream);
536 
537  uint32_t peerCnt() const { return _peerCnt; };
538  hsa_agent_t *peerAgents() const { return _peerAgents; };
539 
540 
541 private:
542  std::list<ihipStream_t*> _streams; // streams associated with this device.
543  ihipStream_t::SeqNum_t _stream_id;
544 
545  // These reflect the currently Enabled set of peers for this GPU:
546  std::list<ihipDevice_t*> _peers; // list of enabled peer devices.
547  uint32_t _peerCnt; // number of enabled peers
548  hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.)
549 private:
550  void recomputePeerAgents();
551 };
552 
553 // Note Mutex selected based on DeviceMutex
555 
556 // This type is used by functions that need access to the critical device structures.
558 
559 
560 
561 //-------------------------------------------------------------------------------------------------
562 // Functions which read or write the critical data are named locked_.
563 // ihipDevice_t does not use recursive locks so the ihip implementation must avoid calling a locked_ function from within a locked_ function.
564 // External functions which call several locked_ functions will acquire and release the lock for each function. if this occurs in
565 // performance-sensitive code we may want to refactor by adding non-locked functions and creating a new locked_ member function to call them all.
567 {
568 public: // Functions:
569  ihipDevice_t() {}; // note: calls constructor for _criticalData
570  void init(unsigned device_index, unsigned deviceCnt, hc::accelerator &acc, unsigned flags);
571  ~ihipDevice_t();
572 
573  void locked_addStream(ihipStream_t *s);
574  void locked_removeStream(ihipStream_t *s);
575  void locked_reset();
576  void locked_waitAllStreams();
577  void locked_syncDefaultStream(bool waitOnSelf);
578 
579  ihipDeviceCritical_t &criticalData() { return _criticalData; }; // TODO, move private. Fix P2P.
580 
581 public: // Data, set at initialization:
582  unsigned _device_index; // index into g_devices.
583 
584  hipDeviceProp_t _props; // saved device properties.
585  hc::accelerator _acc;
586  hsa_agent_t _hsa_agent; // hsa agent handle
587 
588  // The NULL stream is used if no other stream is specified.
589  // NULL has special synchronization properties with other streams.
590  ihipStream_t *_default_stream;
591 
592 
593  unsigned _compute_units;
594 
595  StagingBuffer *_staging_buffer[2]; // one buffer for each direction.
596 
597 
598  unsigned _device_flags;
599 
600 private:
601  hipError_t getProperties(hipDeviceProp_t* prop);
602 
603 private: // Critical data, protected with locked access:
604  // Members of _protected data MUST be accessed through the LockedAccessor.
605  // Search for LockedAccessor<ihipDeviceCritical_t> for examples; do not access _criticalData directly.
606  ihipDeviceCritical_t _criticalData;
607 
608 };
609 
610 
611 
612 // Global variable definition:
613 extern std::once_flag hip_initialized;
614 extern ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system.
615 extern bool g_visible_device; // Set the flag when HIP_VISIBLE_DEVICES is set
616 extern unsigned g_deviceCnt;
617 extern std::vector<int> g_hip_visible_devices; /* vector of integers that contains the visible device IDs */
618 extern hsa_agent_t g_cpu_agent ; // the CPU agent.
619 //=================================================================================================
620 void ihipInit();
621 const char *ihipErrorString(hipError_t);
622 ihipDevice_t *ihipGetTlsDefaultDevice();
623 ihipDevice_t *ihipGetDevice(int);
624 void ihipSetTs(hipEvent_t e);
625 
626 template<typename T>
627 hc::completion_future ihipMemcpyKernel(hipStream_t, T*, const T*, size_t);
628 
629 template<typename T>
630 hc::completion_future ihipMemsetKernel(hipStream_t, T*, T, size_t);
631 
632 hipStream_t ihipSyncAndResolveStream(hipStream_t);
633 template <typename T>
634 
635 hc::completion_future
636 ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
637 {
638  int wg = std::min((unsigned)8, stream->getDevice()->_compute_units);
639  const int threads_per_wg = 256;
640 
641  int threads = wg * threads_per_wg;
642  if (threads > sizeBytes) {
643  threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
644  }
645 
646 
647  hc::extent<1> ext(threads);
648  auto ext_tile = ext.tile(threads_per_wg);
649 
650  hc::completion_future cf =
651  hc::parallel_for_each(
652  stream->_av,
653  ext_tile,
654  [=] (hc::tiled_index<1> idx)
655  __attribute__((hc))
656  {
657  int offset = amp_get_global_id(0);
658  // TODO-HCC - change to hc_get_local_size()
659  int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
660 
661  for (int i=offset; i<sizeBytes; i+=stride) {
662  ptr[i] = val;
663  }
664  });
665 
666  return cf;
667 }
668 
669 template <typename T>
670 hc::completion_future
671 ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes)
672 {
673  int wg = std::min((unsigned)8, stream->getDevice()->_compute_units);
674  const int threads_per_wg = 256;
675 
676  int threads = wg * threads_per_wg;
677  if (threads > sizeBytes) {
678  threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
679  }
680 
681 
682  hc::extent<1> ext(threads);
683  auto ext_tile = ext.tile(threads_per_wg);
684 
685  hc::completion_future cf =
686  hc::parallel_for_each(
687  stream->_av,
688  ext_tile,
689  [=] (hc::tiled_index<1> idx)
690  __attribute__((hc))
691  {
692  int offset = amp_get_global_id(0);
693  // TODO-HCC - change to hc_get_local_size()
694  int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
695 
696  for (int i=offset; i<sizeBytes; i+=stride) {
697  c[i] = a[i];
698  }
699  });
700 
701  return cf;
702 }
703 
704 #endif
Definition: hip_hcc.h:566
Definition: hip_hcc.h:340
Definition: hip_hcc.h:279
hipError_t
Definition: hip_runtime_api.h:142
Definition: hip_runtime_api.h:47
Definition: hip_hcc.h:506
Definition: hip_hcc.h:266
Definition: hip_runtime_api.h:74
Definition: staging_buffer.h:40
Definition: hip_hcc.h:483
Definition: hip_hcc.h:220
Definition: hip_hcc.h:399
Definition: hip_hcc.h:352
Definition: hip_hcc.h:307