HIP: Heterogenous-computing Interface for Portability
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
hip_hcc.h
1 /*
2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
3 Permission is hereby granted, free of charge, to any person obtaining a copy
4 of this software and associated documentation files (the "Software"), to deal
5 in the Software without restriction, including without limitation the rights
6 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 copies of the Software, and to permit persons to whom the Software is
8 furnished to do so, subject to the following conditions:
9 The above copyright notice and this permission notice shall be included in
10 all copies or substantial portions of the Software.
11 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
12 IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13 FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
15 LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
17 THE SOFTWARE.
18 */
19 
20 #ifndef HIP_HCC_H
21 #define HIP_HCC_H
22 
23 #include <hc.hpp>
24 #include "hip/hcc_detail/hip_util.h"
25 #include "hip/hcc_detail/staging_buffer.h"
26 
27 
28 #if defined(__HCC__) && (__hcc_workweek__ < 16155)
29 #error("This version of HIP requires a newer version of HCC.");
30 #endif
31 
32 // #define USE_MEMCPYTOSYMBOL
33 //
34 //Use the new HCC accelerator_view::copy instead of am_copy
35 #define USE_AV_COPY 0
36 
37 // Compile peer-to-peer support.
38 // >= 2 : use HCC hc:accelerator::get_is_peer
39 // >= 3 : use hc::am_memtracker_update_peers(...)
40 #define USE_PEER_TO_PEER 2
41 
42 
43 // Use new am_memory_host_lock APIs:
44 #define USE_HCC_LOCK_API 1
45 
46 
47 //---
48 // Environment variables:
49 
50 // Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release.
51 //static const int debug = 0;
52 extern const int release;
53 
54 extern int HIP_LAUNCH_BLOCKING;
55 
56 extern int HIP_PRINT_ENV;
57 extern int HIP_ATP_MARKER;
58 //extern int HIP_TRACE_API;
59 extern int HIP_ATP;
60 extern int HIP_DB;
61 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
62 extern int HIP_STAGING_BUFFERS; // TODO - remove, two buffers should be enough.
63 extern int HIP_PININPLACE;
64 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
65 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
66 
67 
68 //---
69 // Chicken bits for disabling functionality to work around potential issues:
70 extern int HIP_DISABLE_HW_KERNEL_DEP;
71 extern int HIP_DISABLE_HW_COPY_DEP;
72 
73 extern thread_local int tls_defaultDevice;
74 extern thread_local hipError_t tls_lastHipError;
75 class ihipStream_t;
76 class ihipDevice_t;
77 
78 
79 // Color defs for debug messages:
80 #define KNRM "\x1B[0m"
81 #define KRED "\x1B[31m"
82 #define KGRN "\x1B[32m"
83 #define KYEL "\x1B[33m"
84 #define KBLU "\x1B[34m"
85 #define KMAG "\x1B[35m"
86 #define KCYN "\x1B[36m"
87 #define KWHT "\x1B[37m"
88 
89 #define API_COLOR KGRN
90 
91 
92 // If set, thread-safety is enforced on all stream functions.
93 // Stream functions will acquire a mutex before entering critical sections.
94 #define STREAM_THREAD_SAFE 1
95 
96 
97 #define DEVICE_THREAD_SAFE 1
98 
99 // If FORCE_COPY_DEP=1 , HIP runtime will add
100 // synchronization for copy commands in the same stream, regardless of command type.
101 // If FORCE_COPY_DEP=0 data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered.
102 // ROCR runtime implementation currently provides this guarantee when using SDMA queues but not
103 // when using shader queues.
104 // TODO - measure if this matters for performance, in particular for back-to-back small copies.
105 // If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands.
106 #define FORCE_SAMEDIR_COPY_DEP 1
107 
108 
109 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
110 // May be set to 0 to remove debug if checks - possible code size and performance difference?
111 #define COMPILE_HIP_DB 1
112 
113 
114 // Compile HIP tracing capability.
115 // 0x1 = print a string at function entry with arguments.
116 // 0x2 = prints a simple message with function name + return code when function exits.
117 // 0x3 = print both.
118 // Must be enabled at runtime with HIP_TRACE_API
119 #define COMPILE_HIP_TRACE_API 0x3
120 
121 
122 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
123 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
124 #ifndef COMPILE_HIP_ATP_MARKER
125 #define COMPILE_HIP_ATP_MARKER 0
126 #endif
127 
128 
129 // #include CPP files to produce one object file
130 #define ONE_OBJECT_FILE 0
131 
132 
133 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary.
134 // TODO - currently we print the trace message at the beginning. if we waited, we could also include return codes, and any values returned
135 // through ptr-to-args (ie the pointers allocated by hipMalloc).
136 #if COMPILE_HIP_ATP_MARKER
137 #include "AMDTActivityLogger.h"
138 #define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString)
139 #else
140 // Swallow scoped markers:
141 #define SCOPED_MARKER(markerName,group,userString)
142 #endif
143 
144 
145 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
146 #define API_TRACE(...)\
147 {\
148  if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\
149  std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\
150  if (COMPILE_HIP_DB && HIP_TRACE_API) {\
151  fprintf (stderr, API_COLOR "<<hip-api: %s\n" KNRM, s.c_str());\
152  }\
153  SCOPED_MARKER(s.c_str(), "HIP", NULL);\
154  }\
155 }
156 #else
157 // Swallow API_TRACE
158 #define API_TRACE(...)
159 #endif
160 
161 
162 
163 // This macro should be called at the beginning of every HIP API.
164 // It initialies the hip runtime (exactly once), and
165 // generate trace string that can be output to stderr or to ATP file.
166 #define HIP_INIT_API(...) \
167  std::call_once(hip_initialized, ihipInit);\
168  API_TRACE(__VA_ARGS__);
169 
170 #define ihipLogStatus(_hip_status) \
171  ({\
172  hipError_t _local_hip_status = _hip_status; /*local copy so _hip_status only evaluated once*/ \
173  tls_lastHipError = _local_hip_status;\
174  \
175  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API) {\
176  fprintf(stderr, " %ship-api: %-30s ret=%2d (%s)>>\n" KNRM, (_local_hip_status == 0) ? API_COLOR:KRED, __func__, _local_hip_status, ihipErrorString(_local_hip_status));\
177  }\
178  _local_hip_status;\
179  })
180 
181 
182 
183 
184 //---
185 //HIP_DB Debug flags:
186 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
187 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
188 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
189 #define DB_COPY1 3 /* 0x08 - trace memory copy commands. . */
190 #define DB_SIGNAL 4 /* 0x10 - trace signal pool commands */
191 #define DB_COPY2 5 /* 0x20 - trace memory copy commands. Detailed. */
192 // When adding a new debug flag, also add to the char name table below.
193 
194 static const char *dbName [] =
195 {
196  KNRM "hip-api", // not used,
197  KYEL "hip-sync",
198  KCYN "hip-mem",
199  KMAG "hip-copy1",
200  KRED "hip-signal",
201  KNRM "hip-copy2",
202 };
203 
204 #if COMPILE_HIP_DB
205 #define tprintf(trace_level, ...) {\
206  if (HIP_DB & (1<<(trace_level))) {\
207  fprintf (stderr, " %s:", dbName[trace_level]); \
208  fprintf (stderr, __VA_ARGS__);\
209  fprintf (stderr, "%s", KNRM); \
210  }\
211 }
212 #else
213 /* Compile to empty code */
214 #define tprintf(trace_level, ...)
215 #endif
216 
217 class ihipException : public std::exception
218 {
219 public:
220  ihipException(hipError_t e) : _code(e) {};
221 
222  hipError_t _code;
223 };
224 
225 
226 #ifdef __cplusplus
227 extern "C" {
228 #endif
229 
230 typedef class ihipStream_t* hipStream_t;
231 //typedef struct hipEvent_t {
232 // struct ihipEvent_t *_handle;
233 //} hipEvent_t;
234 
235 #ifdef __cplusplus
236 }
237 #endif
238 
239 const hipStream_t hipStreamNull = 0x0;
240 
241 
242 enum ihipCommand_t {
243  ihipCommandCopyH2H,
244  ihipCommandCopyH2D,
245  ihipCommandCopyD2H,
246  ihipCommandCopyD2D,
247  ihipCommandCopyP2P,
248  ihipCommandKernel,
249 };
250 
251 static const char* ihipCommandName[] = {
252  "CopyH2H", "CopyH2D", "CopyD2H", "CopyD2D", "CopyP2P", "Kernel"
253 };
254 
255 
256 
257 typedef uint64_t SIGSEQNUM;
258 
259 //---
260 // Small wrapper around signals.
261 // Designed to be used from stream.
262 // TODO-someday refactor this class so it can be stored in a vector<>
263 // we already store the index here so we can use for garbage collection.
264 struct ihipSignal_t {
265  hsa_signal_t _hsa_signal; // hsa signal handle
266  int _index; // Index in pool, used for garbage collection.
267  SIGSEQNUM _sig_id; // unique sequentially increasing ID.
268 
269  ihipSignal_t();
270  ~ihipSignal_t();
271 
272  void release();
273 };
274 
275 
276 // Used to remove lock, for performance or stimulating bugs.
278 {
279  public:
280  void lock() { }
281  bool try_lock() {return true; }
282  void unlock() { }
283 };
284 
285 
286 #if STREAM_THREAD_SAFE
287 typedef std::mutex StreamMutex;
288 #else
289 #warning "Stream thread-safe disabled"
290 typedef FakeMutex StreamMutex;
291 #endif
292 
293 #if DEVICE_THREAD_SAFE
294 typedef std::mutex DeviceMutex;
295 #else
296 typedef FakeMutex DeviceMutex;
297 #warning "Device thread-safe disabled"
298 #endif
299 
300 //
301 //---
302 // Protects access to the member _data with a lock acquired on contruction/destruction.
303 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
304 template<typename T>
306 {
307 public:
308  LockedAccessor(T &criticalData, bool autoUnlock=true) :
309  _criticalData(&criticalData),
310  _autoUnlock(autoUnlock)
311 
312  {
313  _criticalData->_mutex.lock();
314  };
315 
316  ~LockedAccessor()
317  {
318  if (_autoUnlock) {
319  _criticalData->_mutex.unlock();
320  }
321  }
322 
323  void unlock()
324  {
325  _criticalData->_mutex.unlock();
326  }
327 
328  // Syntactic sugar so -> can be used to get the underlying type.
329  T *operator->() { return _criticalData; };
330 
331 private:
332  T *_criticalData;
333  bool _autoUnlock;
334 };
335 
336 
337 template <typename MUTEX_TYPE>
338 struct LockedBase {
339 
340  // Experts-only interface for explicit locking.
341  // Most uses should use the lock-accessor.
342  void lock() { _mutex.lock(); }
343  void unlock() { _mutex.unlock(); }
344 
345  MUTEX_TYPE _mutex;
346 };
347 
348 
349 template <typename MUTEX_TYPE>
350 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
351 {
352 public:
354  _last_command_type(ihipCommandCopyH2H),
355  _last_copy_signal(NULL),
356  _signalCursor(0),
357  _oldest_live_sig_id(1),
358  _stream_sig_id(0)
359  {
360  _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1);
361  };
362 
364  _signalPool.clear();
365  }
366 
368 
369 
370 public:
371  // Critical Data:
372  ihipCommand_t _last_command_type; // type of the last command
373 
374  // signal of last copy command sent to the stream.
375  // May be NULL, indicating the previous command has completley finished and future commands don't need to create a dependency.
376  // Copy can be either H2D or D2H.
377  ihipSignal_t *_last_copy_signal;
378 
379  hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU.
380 
381  // Signal pool:
382  int _signalCursor;
383  SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated.
384  std::deque<ihipSignal_t> _signalPool; // Pool of signals for use by this stream.
385 
386 
387  SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id.
388 };
389 
390 
393 
394 
395 
396 // Internal stream structure.
398 public:
399 typedef uint64_t SeqNum_t ;
400 
401  ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags);
402  ~ihipStream_t();
403 
404  // kind is hipMemcpyKind
405  void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind);
406  void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind);
407 
408  void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
409 
410  //---
411  // Thread-safe accessors - these acquire / release mutex:
412  bool lockopen_preKernelCommand();
413  void lockclose_postKernelCommand(hc::completion_future &kernel_future);
414 
415  int preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
416 
417  void locked_reclaimSignals(SIGSEQNUM sigNum);
418  void locked_wait(bool assertQueueEmpty=false);
419  SIGSEQNUM locked_lastCopySeqId() {LockedAccessor_StreamCrit_t crit(_criticalData); return lastCopySeqId(crit); };
420 
421  // Use this if we already have the stream critical data mutex:
422  void wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
423 
424 
425 
426  // Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function.
427  SIGSEQNUM lastCopySeqId (LockedAccessor_StreamCrit_t &crit) { return crit->_last_copy_signal ? crit->_last_copy_signal->_sig_id : 0; };
428  ihipSignal_t * allocSignal (LockedAccessor_StreamCrit_t &crit);
429 
430 
431  //-- Non-racy accessors:
432  // These functions access fields set at initialization time and are non-racy (so do not acquire mutex)
433  ihipDevice_t * getDevice() const;
434 
435 
436 public:
437  //---
438  //Public member vars - these are set at initialization and never change:
439  SeqNum_t _id; // monotonic sequence ID
440  hc::accelerator_view _av;
441  unsigned _flags;
442 
443 private:
444  // Critical Data. THis MUST be accessed through LockedAccessor_StreamCrit_t
445  ihipStreamCritical_t _criticalData;
446 
447  // Array of dependency completion_future.
448  std::vector<hc::completion_future> _depFutures;
449 
450 private:
451  void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal);
452  void waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal);
453 
454  // The unsigned return is hipMemcpyKind
455  unsigned resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem);
456  void setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent);
457 
458  unsigned _device_index; // index into the g_device array
459 
460  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
461 };
462 
463 
464 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s)
465 {
466  os << "stream#";
467  os << s._device_index;
468  os << '.';
469  os << s._id;
470  return os;
471 }
472 
473 
474 //----
475 // Internal event structure:
476 enum hipEventStatus_t {
477  hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
478  hipEventStatusCreated = 1,
479  hipEventStatusRecording = 2, // event has been enqueued to record something.
480  hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid.
481 } ;
482 
483 
484 // internal hip event structure.
485 struct ihipEvent_t {
486  hipEventStatus_t _state;
487 
488  hipStream_t _stream; // Stream where the event is recorded, or NULL if all streams.
489  unsigned _flags;
490 
491  hc::completion_future _marker;
492  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
493 
494  SIGSEQNUM _copy_seq_id;
495 } ;
496 
497 
498 
499 
500 
501 //---
502 // Data that must be protected with thread-safe access
503 // All members are private - this class must be accessed through friend LockedAccessor which
504 // will lock the mutex on construction and unlock on destruction.
505 //
506 // MUTEX_TYPE is template argument so can easily convert to FakeMutex for performance or stress testing.
507 template <class MUTEX_TYPE>
509 {
510 public:
511  ihipDeviceCriticalBase_t() : _stream_id(0), _peerAgents(nullptr) {};
512 
513  void init(unsigned deviceCnt) {
514  assert(_peerAgents == nullptr);
515  _peerAgents = new hsa_agent_t[deviceCnt];
516  };
517 
519  if (_peerAgents != nullptr) {
520  delete _peerAgents;
521  _peerAgents = nullptr;
522  }
523  }
524  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
525 
526  std::list<ihipStream_t*> &streams() { return _streams; };
527  const std::list<ihipStream_t*> &const_streams() const { return _streams; };
528 
529  // "Allocate" a stream ID:
530  ihipStream_t::SeqNum_t incStreamId() { return _stream_id++; };
531 
532  bool isPeer(const ihipDevice_t *peer); // returns Trus if peer has access to memory physically located on this device.
533  bool addPeer(ihipDevice_t *peer);
534  bool removePeer(ihipDevice_t *peer);
535  void resetPeers(ihipDevice_t *thisDevice);
536 
537 
538  void addStream(ihipStream_t *stream);
539 
540  uint32_t peerCnt() const { return _peerCnt; };
541  hsa_agent_t *peerAgents() const { return _peerAgents; };
542 
543 
544 private:
545  //std::list< std::shared_ptr<ihipStream_t> > _streams; // streams associated with this device. TODO - convert to shared_ptr.
546  std::list< ihipStream_t* > _streams; // streams associated with this device.
547  ihipStream_t::SeqNum_t _stream_id;
548 
549  // These reflect the currently Enabled set of peers for this GPU:
550  // Enabled peers have permissions to access the memory physically allocated on this device.
551  std::list<ihipDevice_t*> _peers; // list of enabled peer devices.
552  uint32_t _peerCnt; // number of enabled peers
553  hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.)
554 private:
555  void recomputePeerAgents();
556 };
557 
558 // Note Mutex selected based on DeviceMutex
560 
561 // This type is used by functions that need access to the critical device structures.
563 
564 
565 
566 //-------------------------------------------------------------------------------------------------
567 // Functions which read or write the critical data are named locked_.
568 // ihipDevice_t does not use recursive locks so the ihip implementation must avoid calling a locked_ function from within a locked_ function.
569 // External functions which call several locked_ functions will acquire and release the lock for each function. if this occurs in
570 // performance-sensitive code we may want to refactor by adding non-locked functions and creating a new locked_ member function to call them all.
572 {
573 public: // Functions:
574  ihipDevice_t() {}; // note: calls constructor for _criticalData
575  void init(unsigned device_index, unsigned deviceCnt, hc::accelerator &acc, unsigned flags);
576  ~ihipDevice_t();
577 
578  void locked_addStream(ihipStream_t *s);
579  void locked_removeStream(ihipStream_t *s);
580  void locked_reset();
581  void locked_waitAllStreams();
582  void locked_syncDefaultStream(bool waitOnSelf);
583 
584  ihipDeviceCritical_t &criticalData() { return _criticalData; }; // TODO, move private. Fix P2P.
585 
586 public: // Data, set at initialization:
587  unsigned _device_index; // index into g_devices.
588 
589  hipDeviceProp_t _props; // saved device properties.
590  hc::accelerator _acc;
591  hsa_agent_t _hsa_agent; // hsa agent handle
592 
593  // The NULL stream is used if no other stream is specified.
594  // NULL has special synchronization properties with other streams.
595  ihipStream_t *_default_stream;
596 
597 
598  unsigned _compute_units;
599 
600  StagingBuffer *_staging_buffer[2]; // one buffer for each direction.
601 
602 
603  unsigned _device_flags;
604 
605 private:
606  hipError_t getProperties(hipDeviceProp_t* prop);
607 
608 private: // Critical data, protected with locked access:
609  // Members of _protected data MUST be accessed through the LockedAccessor.
610  // Search for LockedAccessor<ihipDeviceCritical_t> for examples; do not access _criticalData directly.
611  ihipDeviceCritical_t _criticalData;
612 
613 };
614 
615 
616 
617 // Global variable definition:
618 extern std::once_flag hip_initialized;
619 extern ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system.
620 extern bool g_visible_device; // Set the flag when HIP_VISIBLE_DEVICES is set
621 extern unsigned g_deviceCnt;
622 extern std::vector<int> g_hip_visible_devices; /* vector of integers that contains the visible device IDs */
623 extern hsa_agent_t g_cpu_agent ; // the CPU agent.
624 //=================================================================================================
625 void ihipInit();
626 const char *ihipErrorString(hipError_t);
627 ihipDevice_t *ihipGetTlsDefaultDevice();
628 ihipDevice_t *ihipGetDevice(int);
629 void ihipSetTs(hipEvent_t e);
630 
631 template<typename T>
632 hc::completion_future ihipMemcpyKernel(hipStream_t, T*, const T*, size_t);
633 
634 template<typename T>
635 hc::completion_future ihipMemsetKernel(hipStream_t, T*, T, size_t);
636 
637 hipStream_t ihipSyncAndResolveStream(hipStream_t);
638 template <typename T>
639 
640 hc::completion_future
641 ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
642 {
643  int wg = std::min((unsigned)8, stream->getDevice()->_compute_units);
644  const int threads_per_wg = 256;
645 
646  int threads = wg * threads_per_wg;
647  if (threads > sizeBytes) {
648  threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
649  }
650 
651 
652  hc::extent<1> ext(threads);
653  auto ext_tile = ext.tile(threads_per_wg);
654 
655  hc::completion_future cf =
656  hc::parallel_for_each(
657  stream->_av,
658  ext_tile,
659  [=] (hc::tiled_index<1> idx)
660  __attribute__((hc))
661  {
662  int offset = amp_get_global_id(0);
663  // TODO-HCC - change to hc_get_local_size()
664  int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
665 
666  for (int i=offset; i<sizeBytes; i+=stride) {
667  ptr[i] = val;
668  }
669  });
670 
671  return cf;
672 }
673 
674 template <typename T>
675 hc::completion_future
676 ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes)
677 {
678  int wg = std::min((unsigned)8, stream->getDevice()->_compute_units);
679  const int threads_per_wg = 256;
680 
681  int threads = wg * threads_per_wg;
682  if (threads > sizeBytes) {
683  threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
684  }
685 
686 
687  hc::extent<1> ext(threads);
688  auto ext_tile = ext.tile(threads_per_wg);
689 
690  hc::completion_future cf =
691  hc::parallel_for_each(
692  stream->_av,
693  ext_tile,
694  [=] (hc::tiled_index<1> idx)
695  __attribute__((hc))
696  {
697  int offset = amp_get_global_id(0);
698  // TODO-HCC - change to hc_get_local_size()
699  int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
700 
701  for (int i=offset; i<sizeBytes; i+=stride) {
702  c[i] = a[i];
703  }
704  });
705 
706  return cf;
707 }
708 
709 #endif
Definition: hip_hcc.h:571
Definition: hip_hcc.h:338
Definition: hip_hcc.h:277
hipError_t
Definition: hip_runtime_api.h:142
Definition: hip_runtime_api.h:47
Definition: hip_hcc.h:508
Definition: hip_hcc.h:264
Definition: hip_runtime_api.h:74
Definition: staging_buffer.h:40
Definition: hip_hcc.h:485
Definition: hip_hcc.h:217
Definition: hip_hcc.h:397
Definition: hip_hcc.h:350
Definition: hip_hcc.h:305