HIP: Heterogenous-computing Interface for Portability
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
hip_runtime.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
27 #pragma once
28 
29 //---
30 // Top part of file can be compiled with any compiler
31 
32 
33 #include <cstring>
34 #include <cmath>
35 #include <string.h>
36 #include <stddef.h>
37 
38 
39 #define CUDA_SUCCESS hipSuccess
40 
41 #include <hip_runtime_api.h>
42 
43 //---
44 // Remainder of this file only compiles with HCC
45 #ifdef __HCC__
46 #include <hc.hpp>
47 #include <grid_launch.h>
48 
49 //TODO-HCC-GL - change this to typedef.
50 //typedef grid_launch_parm hipLaunchParm ;
51 #define hipLaunchParm grid_launch_parm
52 
53 #include <hcc_detail/hip_texture.h>
54 #include <hcc_detail/host_defines.h>
55 
56 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
57 #if defined (__KALMAR_ACCELERATOR__) && not defined (__HCC_ACCELERATOR__)
58 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
59 #endif
60 
61 // Feature tests:
62 #if defined(__HCC_ACCELERATOR__) and (__HCC_ACCELERATOR__ != 0)
63 // Device compile and not host compile:
64 
65 
66 #define __HIP_DEVICE_COMPILE__ 1
67 
68 //TODO-HCC enable __HIP_ARCH_HAS_ATOMICS__ when HCC supports these.
69  // 32-bit Atomics:
70 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
71 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
72 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
73 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
74 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
75 
76 // 64-bit Atomics:
77 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
78 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
79 
80 // Doubles
81 #define __HIP_ARCH_HAS_DOUBLES__ (1)
82 
83 //warp cross-lane operations:
84 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
85 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
86 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
87 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
88 
89 //sync
90 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
91 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
92 
93 // misc
94 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
95 #define __HIP_ARCH_HAS_3DGRID__ (1)
96 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
97 
98 #else
99 // Host compile and not device compile:
100 #define __HIP_DEVICE_COMPILE__ 0
101 
102 #endif
103 
104 
105 
106 
107 
108 //TODO-HCC this is currently ignored by HCC target of HIP
109 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
110 
111 // Detect if we are compiling C++ mode or C mode
112 #if defined(__cplusplus)
113 #define __HCC_CPP__
114 #elif defined(__STDC_VERSION__)
115 #define __HCC_C__
116 #endif
117 
118 #define clock_t long long int
119 __device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); };
120 __device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); };
121 
122 //atomicAdd()
123 __device__ inline int atomicAdd(int* address, int val)
124 {
125  return hc::atomic_fetch_add(address,val);
126 }
127 __device__ inline unsigned int atomicAdd(unsigned int* address,
128  unsigned int val)
129 {
130  return hc::atomic_fetch_add(address,val);
131 }
132 __device__ inline unsigned long long int atomicAdd(unsigned long long int* address,
133  unsigned long long int val)
134 {
135  return (long long int)hc::atomic_fetch_add((uint64_t*)address,(uint64_t)val);
136 }
137 __device__ inline float atomicAdd(float* address, float val)
138 {
139  return hc::atomic_fetch_add(address,val);
140 }
141 
142 //atomicSub()
143 __device__ inline int atomicSub(int* address, int val)
144 {
145  return hc::atomic_fetch_sub(address,val);
146 }
147 __device__ inline unsigned int atomicSub(unsigned int* address,
148  unsigned int val)
149 {
150  return hc::atomic_fetch_sub(address,val);
151 }
152 
153 //atomicExch()
154 __device__ inline int atomicExch(int* address, int val)
155 {
156  return hc::atomic_exchange(address,val);
157 }
158 __device__ inline unsigned int atomicExch(unsigned int* address,
159  unsigned int val)
160 {
161  return hc::atomic_exchange(address,val);
162 }
163 __device__ inline unsigned long long int atomicExch(unsigned long long int* address,
164  unsigned long long int val)
165 {
166  return (long long int)hc::atomic_exchange((uint64_t*)address,(uint64_t)val);
167 }
168 __device__ inline float atomicExch(float* address, float val)
169 {
170  return hc::atomic_exchange(address,val);
171 }
172 
173 //atomicMin()
174 __device__ inline int atomicMin(int* address, int val)
175 {
176  return hc::atomic_fetch_min(address,val);
177 }
178 __device__ inline unsigned int atomicMin(unsigned int* address,
179  unsigned int val)
180 {
181  return hc::atomic_fetch_min(address,val);
182 }
183 __device__ inline unsigned long long int atomicMin(unsigned long long int* address,
184  unsigned long long int val)
185 {
186  return (long long int)hc::atomic_fetch_min((uint64_t*)address,(uint64_t)val);
187 }
188 
189 //atomicMax()
190 __device__ inline int atomicMax(int* address, int val)
191 {
192  return hc::atomic_fetch_max(address,val);
193 }
194 __device__ inline unsigned int atomicMax(unsigned int* address,
195  unsigned int val)
196 {
197  return hc::atomic_fetch_max(address,val);
198 }
199 __device__ inline unsigned long long int atomicMax(unsigned long long int* address,
200  unsigned long long int val)
201 {
202  return (long long int)hc::atomic_fetch_max((uint64_t*)address,(uint64_t)val);
203 }
204 
205 //atomicInc()
206 __device__ inline unsigned int atomicInc(unsigned int* address)
207 {
208  return hc::atomic_fetch_inc(address);
209 }
210 
211 //atomicDec()
212 __device__ inline unsigned int atomicDec(unsigned int* address)
213 {
214  return hc::atomic_fetch_dec(address);
215 }
216 
217 //atomicCAS()
218 __device__ inline int atomicCAS(int* address, int compare, int val)
219 {
220  hc::atomic_compare_exchange(address,&compare,val);
221  return *address;
222 }
223 __device__ inline unsigned int atomicCAS(unsigned int* address,
224  unsigned int compare,
225  unsigned int val)
226 {
227  hc::atomic_compare_exchange(address,&compare,val);
228  return *address;
229 }
230 __device__ inline unsigned long long int atomicCAS(unsigned long long int* address,
231  unsigned long long int compare,
232  unsigned long long int val)
233 {
234  hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
235  return *address;
236 }
237 
238 //atomicAnd()
239 __device__ inline int atomicAnd(int* address, int val)
240 {
241  return hc::atomic_fetch_and(address,val);
242 }
243 __device__ inline unsigned int atomicAnd(unsigned int* address,
244  unsigned int val)
245 {
246  return hc::atomic_fetch_and(address,val);
247 }
248 __device__ inline unsigned long long int atomicAnd(unsigned long long int* address,
249  unsigned long long int val)
250 {
251  return (long long int)hc::atomic_fetch_and((uint64_t*)address,(uint64_t)val);
252 }
253 
254 //atomicOr()
255 __device__ inline int atomicOr(int* address, int val)
256 {
257  return hc::atomic_fetch_or(address,val);
258 }
259 __device__ inline unsigned int atomicOr(unsigned int* address,
260  unsigned int val)
261 {
262  return hc::atomic_fetch_or(address,val);
263 }
264 __device__ inline unsigned long long int atomicOr(unsigned long long int* address,
265  unsigned long long int val)
266 {
267  return (long long int)hc::atomic_fetch_or((uint64_t*)address,(uint64_t)val);
268 }
269 
270 //atomicXor()
271 __device__ inline int atomicXor(int* address, int val)
272 {
273  return hc::atomic_fetch_xor(address,val);
274 }
275 __device__ inline unsigned int atomicXor(unsigned int* address,
276  unsigned int val)
277 {
278  return hc::atomic_fetch_xor(address,val);
279 }
280 __device__ inline unsigned long long int atomicXor(unsigned long long int* address,
281  unsigned long long int val)
282 {
283  return (long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val);
284 }
285 
286 #ifdef __HCC__
287 #include <hc.hpp>
288 // integer intrinsic function __poc __clz __ffs __brev
289 __device__ inline unsigned int __popc( unsigned int input)
290 {
291  return hc::__popcount_u32_b32( input);
292 }
293 
294 __device__ inline unsigned int __popcll( unsigned long long int input)
295 {
296  return hc::__popcount_u32_b64(input);
297 }
298 
299 __device__ inline unsigned int __clz(unsigned int input)
300 {
301  return hc::__firstbit_u32_u32( input);
302 }
303 
304 __device__ inline unsigned int __clzll(unsigned long long int input)
305 {
306  return hc::__firstbit_u32_u64( input);
307 }
308 
309 __device__ inline unsigned int __clz(int input)
310 {
311  return hc::__firstbit_u32_s32( input);
312 }
313 
314 __device__ inline unsigned int __clzll(long long int input)
315 {
316  return hc::__firstbit_u32_s64( input);
317 }
318 
319 __device__ inline unsigned int __ffs(unsigned int input)
320 {
321  return hc::__lastbit_u32_u32( input)+1;
322 }
323 
324 __device__ inline unsigned int __ffsll(unsigned long long int input)
325 {
326  return hc::__lastbit_u32_u64( input)+1;
327 }
328 
329 __device__ inline unsigned int __brev( unsigned int input)
330 {
331  return hc::__bitrev_b32( input);
332 }
333 
334 __device__ inline unsigned long long int __brevll( unsigned long long int input)
335 {
336  return hc::__bitrev_b64( input);
337 }
338 
339 // warp vote function __all __any __ballot
340 
341 __device__ inline int __all( int input)
342 {
343  return hc::__all( input);
344 }
345 
346 
347 __device__ inline int __any( int input)
348 {
349  return hc::__any( input);
350 }
351 
352 
353 __device__ inline unsigned long long int __ballot( int input)
354 {
355  return hc::__ballot( input);
356 }
357 
358 #endif
359 
360 
361 
362 #ifdef __HCC_ACCELERATOR__
363 #include <hc_math.hpp>
364 // TODO: Choose whether default is precise math or fast math based on compilation flag.
365 using namespace hc::precise_math;
366 
367 //TODO: Undo this once min/max functions are supported by hc
368 inline int min(int arg1, int arg2) __attribute((hc,cpu)) { \
369  return (int)(hc::precise_math::fmin((float)arg1, (float)arg2));}
370 inline int max(int arg1, int arg2) __attribute((hc,cpu)) { \
371  return (int)(hc::precise_math::fmax((float)arg1, (float)arg2));}
372 
373 
374 //TODO - add a couple fast math operations here, the set here will grow :
375 __device__ inline float __log2f(float x) {return hc::fast_math::log2(x); };
376 __device__ inline float __powf(float base, float exponent) {return hc::fast_math::powf(base, exponent); };
377 
378 #endif
379 
380 
381 
385 #define hipThreadIdx_x (amp_get_local_id(2))
386 #define hipThreadIdx_y (amp_get_local_id(1))
387 #define hipThreadIdx_z (amp_get_local_id(0))
388 
389 #define hipBlockIdx_x (hc_get_group_id(2))
390 #define hipBlockIdx_y (hc_get_group_id(1))
391 #define hipBlockIdx_z (hc_get_group_id(0))
392 
393 #define hipBlockDim_x (amp_get_local_size(2))
394 #define hipBlockDim_y (amp_get_local_size(1))
395 #define hipBlockDim_z (amp_get_local_size(0))
396 
397 #define hipGridDim_x (hc_get_num_groups(2))
398 #define hipGridDim_y (hc_get_num_groups(1))
399 #define hipGridDim_z (hc_get_num_groups(0))
400 
401 
402 extern int warpSize ;
403 
404 
405 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
406 
407 
408 #if 0
409 #define KALMAR_PFE_BEGIN() \
410  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
411  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
412  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
413  \
414  hc::completion_future cf = hc::parallel_for_each (\
415  *lp.av,\
416  __hipExtTile,\
417  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]]
418 
419 
420 
421 #define KALMAR_PFE_END \
422  ); \
423  if (HIP_LAUNCH_BLOCKING) {\
424  if (HIP_TRACE_API) {\
425  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
426  }\
427  cf.wait(); \
428  if (HIP_TRACE_API) {\
429  fprintf(stderr, "hiptrace1: ...completed.\n");\
430  }\
431  }
432 #endif
433 
434 
435 
436 #define HIP_KERNEL_NAME(...) __VA_ARGS__
437 
438 
439 #ifdef __HCC_CPP__
440 hc::accelerator_view *ihipLaunchKernel(hipStream_t stream);
441 
442 #if not defined(DISABLE_GRID_LAUNCH)
443 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
444 do {\
445  grid_launch_parm lp;\
446  lp.gridDim.x = _numBlocks3D.x; \
447  lp.gridDim.y = _numBlocks3D.y; \
448  lp.gridDim.z = _numBlocks3D.z; \
449  lp.groupDim.x = _blockDim3D.x; \
450  lp.groupDim.y = _blockDim3D.y; \
451  lp.groupDim.z = _blockDim3D.z; \
452  lp.groupMemBytes = _groupMemBytes;\
453  hc::completion_future cf;\
454  lp.cf = &cf; \
455  lp.av = (ihipLaunchKernel(_stream)); \
456  if (HIP_TRACE_API) {\
457  fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
458  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
459  }\
460  _kernelName (lp, __VA_ARGS__);\
461 } while(0)
462 
463 #else
464 #warning(DISABLE_GRID_LAUNCH set)
465 
466 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
467 do {\
468  grid_launch_parm lp;\
469  lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x;/*Convert from #blocks to #threads*/ \
470  lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y;/*Convert from #blocks to #threads*/ \
471  lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z;/*Convert from #blocks to #threads*/ \
472  lp.groupDim.x = _blockDim3D.x; \
473  lp.groupDim.y = _blockDim3D.y; \
474  lp.groupDim.z = _blockDim3D.z; \
475  lp.groupMemBytes = _groupMemBytes;\
476  hc::completion_future cf;\
477  lp.cf = &cf; \
478  lp.av = (ihipLaunchKernel(_stream)); \
479  if (HIP_TRACE_API) {\
480  fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
481  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
482  }\
483  _kernelName (lp, __VA_ARGS__);\
484 } while(0)
485 /*end hipLaunchKernel */
486 #endif
487 
488 #elif defined (__HCC_C__)
489 
490 //TODO - develop C interface.
491 
492 #endif
493 
494 
495 #if not defined(DISABLE_GRID_LAUNCH)
496 // TODO -In GL these are no-ops and can be removed:
497 // Keep them around for a little while as a fallback.
498 #define KERNELBEGIN
499 #define KERNELEND
500 
501 #else
502 
503 // TODO-GL:
504 // These wrap the kernel in a PFE loop with macros.
505 // Not required with GL but exist here as a fallback.
506 #define KERNELBEGIN \
507  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
508  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
509  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
510  \
511  hc::completion_future cf = \
512  hc::parallel_for_each (\
513  *lp.av,\
514  __hipExtTile,\
515  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]] \
516  {
517 
518 
519 #define KERNELEND \
520  }); \
521  if (HIP_LAUNCH_BLOCKING) {\
522  if (HIP_TRACE_API) {\
523  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
524  }\
525  cf.wait(); \
526  if (HIP_TRACE_API) {\
527  fprintf(stderr, "hiptrace1: ...completed.\n");\
528  }\
529  }
530 
531 #endif /*DISABLE_GRID_LAUNCH*/
532 
533 
534 #endif // __HCC__
535 
536 
541 extern int HIP_PRINT_ENV ;
542 extern int HIP_TRACE_API;
543 extern int HIP_LAUNCH_BLOCKING ;
544 
550 // End doxygen API:
int HIP_TRACE_API
Trace HIP APIs.
Definition: hip_hcc.cpp:57
Definition: hip_hcc.cpp:82
int HIP_PRINT_ENV
Print all HIP-related environment variables.
Definition: hip_hcc.cpp:56
int HIP_LAUNCH_BLOCKING
Make all HIP APIs host-synchronous.
Definition: hip_hcc.cpp:58