HIP: Heterogenous-computing Interface for Portability
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
hip_runtime.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
27 #pragma once
28 
29 //---
30 // Top part of file can be compiled with any compiler
31 
32 
33 #include <cstring>
34 #include <cmath>
35 #include <string.h>
36 #include <stddef.h>
37 
38 
39 #define CUDA_SUCCESS hipSuccess
40 
41 #include <hip_runtime_api.h>
42 
43 //---
44 // Remainder of this file only compiles with HCC
45 #ifdef __HCC__
46 #include <hc.hpp>
47 #include <grid_launch.h>
48 
49 //TODO-HCC-GL - change this to typedef.
50 //typedef grid_launch_parm hipLaunchParm ;
51 #define hipLaunchParm grid_launch_parm
52 
53 #include <hcc_detail/hip_texture.h>
55 
56 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
57 #if defined (__KALMAR_ACCELERATOR__) && not defined (__HCC_ACCELERATOR__)
58 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
59 #endif
60 
61 // Feature tests:
62 #if defined(__HCC_ACCELERATOR__) and (__HCC_ACCELERATOR__ != 0)
63 // Device compile and not host compile:
64 
65 //TODO-HCC enable __HIP_ARCH_HAS_ATOMICS__ when HCC supports these.
66  // 32-bit Atomics:
67 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
68 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
69 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
70 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
71 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
72 
73 // 64-bit Atomics:
74 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
75 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
76 
77 // Doubles
78 #define __HIP_ARCH_HAS_DOUBLES__ (1)
79 
80 //warp cross-lane operations:
81 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
82 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
83 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
84 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
85 
86 //sync
87 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
88 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
89 
90 // misc
91 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
92 #define __HIP_ARCH_HAS_3DGRID__ (1)
93 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
94 
95 #endif
96 
97 
98 
99 
100 
101 //TODO-HCC this is currently ignored by HCC target of HIP
102 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
103 
104 // Detect if we are compiling C++ mode or C mode
105 #if defined(__cplusplus)
106 #define __HCC_CPP__
107 #elif defined(__STDC_VERSION__)
108 #define __HCC_C__
109 #endif
110 
111 
112 // TODO - hipify-clang - change to use the function call.
113 //#define warpSize hc::__wavesize()
114 const int warpSize = 64;
115 
116 
117 #define clock_t long long int
118 __device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); };
119 __device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); };
120 
121 //atomicAdd()
122 __device__ inline int atomicAdd(int* address, int val)
123 {
124  return hc::atomic_fetch_add(address,val);
125 }
126 __device__ inline unsigned int atomicAdd(unsigned int* address,
127  unsigned int val)
128 {
129  return hc::atomic_fetch_add(address,val);
130 }
131 __device__ inline unsigned long long int atomicAdd(unsigned long long int* address,
132  unsigned long long int val)
133 {
134  return (long long int)hc::atomic_fetch_add((uint64_t*)address,(uint64_t)val);
135 }
136 __device__ inline float atomicAdd(float* address, float val)
137 {
138  return hc::atomic_fetch_add(address,val);
139 }
140 
141 //atomicSub()
142 __device__ inline int atomicSub(int* address, int val)
143 {
144  return hc::atomic_fetch_sub(address,val);
145 }
146 __device__ inline unsigned int atomicSub(unsigned int* address,
147  unsigned int val)
148 {
149  return hc::atomic_fetch_sub(address,val);
150 }
151 
152 //atomicExch()
153 __device__ inline int atomicExch(int* address, int val)
154 {
155  return hc::atomic_exchange(address,val);
156 }
157 __device__ inline unsigned int atomicExch(unsigned int* address,
158  unsigned int val)
159 {
160  return hc::atomic_exchange(address,val);
161 }
162 __device__ inline unsigned long long int atomicExch(unsigned long long int* address,
163  unsigned long long int val)
164 {
165  return (long long int)hc::atomic_exchange((uint64_t*)address,(uint64_t)val);
166 }
167 __device__ inline float atomicExch(float* address, float val)
168 {
169  return hc::atomic_exchange(address,val);
170 }
171 
172 //atomicMin()
173 __device__ inline int atomicMin(int* address, int val)
174 {
175  return hc::atomic_fetch_min(address,val);
176 }
177 __device__ inline unsigned int atomicMin(unsigned int* address,
178  unsigned int val)
179 {
180  return hc::atomic_fetch_min(address,val);
181 }
182 __device__ inline unsigned long long int atomicMin(unsigned long long int* address,
183  unsigned long long int val)
184 {
185  return (long long int)hc::atomic_fetch_min((uint64_t*)address,(uint64_t)val);
186 }
187 
188 //atomicMax()
189 __device__ inline int atomicMax(int* address, int val)
190 {
191  return hc::atomic_fetch_max(address,val);
192 }
193 __device__ inline unsigned int atomicMax(unsigned int* address,
194  unsigned int val)
195 {
196  return hc::atomic_fetch_max(address,val);
197 }
198 __device__ inline unsigned long long int atomicMax(unsigned long long int* address,
199  unsigned long long int val)
200 {
201  return (long long int)hc::atomic_fetch_max((uint64_t*)address,(uint64_t)val);
202 }
203 
204 //atomicCAS()
205 __device__ inline int atomicCAS(int* address, int compare, int val)
206 {
207  hc::atomic_compare_exchange(address,&compare,val);
208  return *address;
209 }
210 __device__ inline unsigned int atomicCAS(unsigned int* address,
211  unsigned int compare,
212  unsigned int val)
213 {
214  hc::atomic_compare_exchange(address,&compare,val);
215  return *address;
216 }
217 __device__ inline unsigned long long int atomicCAS(unsigned long long int* address,
218  unsigned long long int compare,
219  unsigned long long int val)
220 {
221  hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
222  return *address;
223 }
224 
225 //atomicAnd()
226 __device__ inline int atomicAnd(int* address, int val)
227 {
228  return hc::atomic_fetch_and(address,val);
229 }
230 __device__ inline unsigned int atomicAnd(unsigned int* address,
231  unsigned int val)
232 {
233  return hc::atomic_fetch_and(address,val);
234 }
235 __device__ inline unsigned long long int atomicAnd(unsigned long long int* address,
236  unsigned long long int val)
237 {
238  return (long long int)hc::atomic_fetch_and((uint64_t*)address,(uint64_t)val);
239 }
240 
241 //atomicOr()
242 __device__ inline int atomicOr(int* address, int val)
243 {
244  return hc::atomic_fetch_or(address,val);
245 }
246 __device__ inline unsigned int atomicOr(unsigned int* address,
247  unsigned int val)
248 {
249  return hc::atomic_fetch_or(address,val);
250 }
251 __device__ inline unsigned long long int atomicOr(unsigned long long int* address,
252  unsigned long long int val)
253 {
254  return (long long int)hc::atomic_fetch_or((uint64_t*)address,(uint64_t)val);
255 }
256 
257 //atomicXor()
258 __device__ inline int atomicXor(int* address, int val)
259 {
260  return hc::atomic_fetch_xor(address,val);
261 }
262 __device__ inline unsigned int atomicXor(unsigned int* address,
263  unsigned int val)
264 {
265  return hc::atomic_fetch_xor(address,val);
266 }
267 __device__ inline unsigned long long int atomicXor(unsigned long long int* address,
268  unsigned long long int val)
269 {
270  return (long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val);
271 }
272 
273 #include <hc.hpp>
274 // integer intrinsic function __poc __clz __ffs __brev
275 __device__ inline unsigned int __popc( unsigned int input)
276 {
277  return hc::__popcount_u32_b32( input);
278 }
279 
280 __device__ inline unsigned int __popcll( unsigned long long int input)
281 {
282  return hc::__popcount_u32_b64(input);
283 }
284 
285 __device__ inline unsigned int __clz(unsigned int input)
286 {
287  return hc::__firstbit_u32_u32( input);
288 }
289 
290 __device__ inline unsigned int __clzll(unsigned long long int input)
291 {
292  return hc::__firstbit_u32_u64( input);
293 }
294 
295 __device__ inline unsigned int __clz(int input)
296 {
297  return hc::__firstbit_u32_s32( input);
298 }
299 
300 __device__ inline unsigned int __clzll(long long int input)
301 {
302  return hc::__firstbit_u32_s64( input);
303 }
304 
305 __device__ inline unsigned int __ffs(unsigned int input)
306 {
307  return hc::__lastbit_u32_u32( input)+1;
308 }
309 
310 __device__ inline unsigned int __ffsll(unsigned long long int input)
311 {
312  return hc::__lastbit_u32_u64( input)+1;
313 }
314 
315 __device__ inline unsigned int __ffs(int input)
316 {
317  return hc::__lastbit_u32_s32( input)+1;
318 }
319 
320 __device__ inline unsigned int __ffsll(long long int input)
321 {
322  return hc::__lastbit_u32_s64( input)+1;
323 }
324 
325 __device__ inline unsigned int __brev( unsigned int input)
326 {
327  return hc::__bitrev_b32( input);
328 }
329 
330 __device__ inline unsigned long long int __brevll( unsigned long long int input)
331 {
332  return hc::__bitrev_b64( input);
333 }
334 
335 // warp vote function __all __any __ballot
336 __device__ inline int __all( int input)
337 {
338  return hc::__all( input);
339 }
340 
341 
342 __device__ inline int __any( int input)
343 {
344  if( hc::__any( input)!=0) return 1;
345  else return 0;
346 }
347 
348 __device__ inline unsigned long long int __ballot( int input)
349 {
350  return hc::__ballot( input);
351 }
352 
353 // warp shuffle functions
354 __device__ inline int __shfl(int input, int lane, int width=warpSize)
355 {
356  return hc::__shfl(input,lane,width);
357 }
358 
359 __device__ inline int __shfl_up(int input, unsigned int lane_delta, int width=warpSize)
360 {
361  return hc::__shfl_up(input,lane_delta,width);
362 }
363 
364 __device__ inline int __shfl_down(int input, unsigned int lane_delta, int width=warpSize)
365 {
366  return hc::__shfl_down(input,lane_delta,width);
367 }
368 
369 __device__ inline int __shfl_xor(int input, int lane_mask, int width=warpSize)
370 {
371  return hc::__shfl_xor(input,lane_mask,width);
372 }
373 
374 __device__ inline float __shfl(float input, int lane, int width=warpSize)
375 {
376  return hc::__shfl(input,lane,width);
377 }
378 
379 __device__ inline float __shfl_up(float input, unsigned int lane_delta, int width=warpSize)
380 {
381  return hc::__shfl_up(input,lane_delta,width);
382 }
383 
384 __device__ inline float __shfl_down(float input, unsigned int lane_delta, int width=warpSize)
385 {
386  return hc::__shfl_down(input,lane_delta,width);
387 }
388 
389 __device__ inline float __shfl_xor(float input, int lane_mask, int width=warpSize)
390 {
391  return hc::__shfl_xor(input,lane_mask,width);
392 }
393 
394 
395 #include <hc_math.hpp>
396 // TODO: Choose whether default is precise math or fast math based on compilation flag.
397 #ifdef __HCC_ACCELERATOR__
398 using namespace hc::precise_math;
399 #endif
400 
401 //TODO: Undo this once min/max functions are supported by hc
402 inline int min(int arg1, int arg2) __attribute((hc,cpu)) { \
403  return (int)(hc::precise_math::fmin((float)arg1, (float)arg2));}
404 inline int max(int arg1, int arg2) __attribute((hc,cpu)) { \
405  return (int)(hc::precise_math::fmax((float)arg1, (float)arg2));}
406 
407 
408 //TODO - add a couple fast math operations here, the set here will grow :
409 __device__ inline float __cosf(float x) {return hc::fast_math::cosf(x); };
410 __device__ inline float __expf(float x) {return hc::fast_math::expf(x); };
411 __device__ inline float __frsqrt_rn(float x) {return hc::fast_math::rsqrt(x); };
412 __device__ inline float __fsqrt_rd(float x) {return hc::fast_math::sqrt(x); };
413 __device__ inline float __fsqrt_rn(float x) {return hc::fast_math::sqrt(x); };
414 __device__ inline float __fsqrt_ru(float x) {return hc::fast_math::sqrt(x); };
415 __device__ inline float __fsqrt_rz(float x) {return hc::fast_math::sqrt(x); };
416 __device__ inline float __log10f(float x) {return hc::fast_math::log10f(x); };
417 __device__ inline float __log2f(float x) {return hc::fast_math::log2f(x); };
418 __device__ inline float __logf(float x) {return hc::fast_math::logf(x); };
419 __device__ inline float __powf(float base, float exponent) {return hc::fast_math::powf(base, exponent); };
420 __device__ inline void __sincosf(float x, float *s, float *c) {return hc::fast_math::sincosf(x, s, c); };
421 __device__ inline float __sinf(float x) {return hc::fast_math::sinf(x); };
422 __device__ inline float __tanf(float x) {return hc::fast_math::tanf(x); };
423 __device__ inline float __dsqrt_rd(double x) {return hc::fast_math::sqrt(x); };
424 __device__ inline float __dsqrt_rn(double x) {return hc::fast_math::sqrt(x); };
425 __device__ inline float __dsqrt_ru(double x) {return hc::fast_math::sqrt(x); };
426 __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); };
427 
431 #define hipThreadIdx_x (amp_get_local_id(2))
432 #define hipThreadIdx_y (amp_get_local_id(1))
433 #define hipThreadIdx_z (amp_get_local_id(0))
434 
435 #define hipBlockIdx_x (hc_get_group_id(2))
436 #define hipBlockIdx_y (hc_get_group_id(1))
437 #define hipBlockIdx_z (hc_get_group_id(0))
438 
439 #define hipBlockDim_x (amp_get_local_size(2))
440 #define hipBlockDim_y (amp_get_local_size(1))
441 #define hipBlockDim_z (amp_get_local_size(0))
442 
443 #define hipGridDim_x (hc_get_num_groups(2))
444 #define hipGridDim_y (hc_get_num_groups(1))
445 #define hipGridDim_z (hc_get_num_groups(0))
446 
447 
448 
449 
450 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
451 
452 
453 #if 0
454 #define KALMAR_PFE_BEGIN() \
455  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
456  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
457  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
458  \
459  hc::completion_future cf = hc::parallel_for_each (\
460  *lp.av,\
461  __hipExtTile,\
462  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]]
463 
464 
465 
466 #define KALMAR_PFE_END \
467  ); \
468  if (HIP_LAUNCH_BLOCKING) {\
469  if (HIP_TRACE_API) {\
470  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
471  }\
472  cf.wait(); \
473  if (HIP_TRACE_API) {\
474  fprintf(stderr, "hiptrace1: ...completed.\n");\
475  }\
476  }
477 #endif
478 
479 
480 
481 #define HIP_KERNEL_NAME(...) __VA_ARGS__
482 
483 
484 #ifdef __HCC_CPP__
485 hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av);
486 void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &cf);
487 
488 // TODO - move to common header file.
489 #define KNRM "\x1B[0m"
490 #define KGRN "\x1B[32m"
491 
492 #if not defined(DISABLE_GRID_LAUNCH)
493 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
494 do {\
495  grid_launch_parm lp;\
496  lp.gridDim.x = _numBlocks3D.x; \
497  lp.gridDim.y = _numBlocks3D.y; \
498  lp.gridDim.z = _numBlocks3D.z; \
499  lp.groupDim.x = _blockDim3D.x; \
500  lp.groupDim.y = _blockDim3D.y; \
501  lp.groupDim.z = _blockDim3D.z; \
502  lp.groupMemBytes = _groupMemBytes;\
503  hc::completion_future cf;\
504  lp.cf = &cf; \
505  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
506  if (HIP_TRACE_API) {\
507  fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n" KNRM, \
508  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
509  }\
510  _kernelName (lp, __VA_ARGS__);\
511  ihipPostLaunchKernel(trueStream, cf);\
512 } while(0)
513 
514 #else
515 #warning(DISABLE_GRID_LAUNCH set)
516 
517 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
518 do {\
519  grid_launch_parm lp;\
520  lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x;/*Convert from #blocks to #threads*/ \
521  lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y;/*Convert from #blocks to #threads*/ \
522  lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z;/*Convert from #blocks to #threads*/ \
523  lp.groupDim.x = _blockDim3D.x; \
524  lp.groupDim.y = _blockDim3D.y; \
525  lp.groupDim.z = _blockDim3D.z; \
526  lp.groupMemBytes = _groupMemBytes;\
527  hc::completion_future cf;\
528  lp.cf = &cf; \
529  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
530  if (HIP_TRACE_API) {\
531  fprintf(stderr, "==hip-api: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
532  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
533  }\
534  _kernelName (lp, __VA_ARGS__);\
535  ihipPostLaunchKernel(trueStream, cf);\
536 } while(0)
537 /*end hipLaunchKernel */
538 #endif
539 
540 #elif defined (__HCC_C__)
541 
542 //TODO - develop C interface.
543 
544 #endif
545 
546 
547 #if not defined(DISABLE_GRID_LAUNCH)
548 // TODO -In GL these are no-ops and can be removed:
549 // Keep them around for a little while as a fallback.
550 #define KERNELBEGIN
551 #define KERNELEND
552 
553 #else
554 
555 // TODO-GL:
556 // These wrap the kernel in a PFE loop with macros.
557 // Not required with GL but exist here as a fallback.
558 #define KERNELBEGIN \
559  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
560  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
561  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
562  \
563  hc::completion_future cf = \
564  hc::parallel_for_each (\
565  *lp.av,\
566  __hipExtTile,\
567  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]] \
568  {
569 
570 
571 #define KERNELEND \
572  }); \
573  if (HIP_LAUNCH_BLOCKING) {\
574  if (HIP_TRACE_API) {\
575  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
576  }\
577  cf.wait(); \
578  if (HIP_TRACE_API) {\
579  fprintf(stderr, "hiptrace1: ...completed.\n");\
580  }\
581  }
582 
583 #endif /*DISABLE_GRID_LAUNCH*/
584 
585 
586 #endif // __HCC__
587 
588 
593 extern int HIP_PRINT_ENV ;
594 extern int HIP_TRACE_API;
595 extern int HIP_LAUNCH_BLOCKING ;
596 
602 // End doxygen API:
int HIP_TRACE_API
Trace HIP APIs.
Definition: hip_hcc.cpp:73
TODO-doc.
HIP C++ Texture API for hcc compiler.
int HIP_PRINT_ENV
Print all HIP-related environment variables.
Definition: hip_hcc.cpp:72
Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language ...
int HIP_LAUNCH_BLOCKING
Make all HIP APIs host-synchronous.
Definition: hip_hcc.cpp:70