HIP: Heterogenous-computing Interface for Portability
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
hip_runtime.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
27 #pragma once
28 
29 //---
30 // Top part of file can be compiled with any compiler
31 
32 
33 #include <cstring>
34 #include <cmath>
35 #include <string.h>
36 #include <stddef.h>
37 
38 
39 #define CUDA_SUCCESS hipSuccess
40 
41 #include <hip_runtime_api.h>
42 
43 //---
44 // Remainder of this file only compiles with HCC
45 #ifdef __HCC__
46 #include <hc.hpp>
47 #include <grid_launch.h>
48 
49 //TODO-HCC-GL - change this to typedef.
50 //typedef grid_launch_parm hipLaunchParm ;
51 #define hipLaunchParm grid_launch_parm
52 
53 #include <hcc_detail/hip_texture.h>
55 
56 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
57 #if defined (__KALMAR_ACCELERATOR__) && not defined (__HCC_ACCELERATOR__)
58 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
59 #endif
60 
61 // Feature tests:
62 #if defined(__HCC_ACCELERATOR__) and (__HCC_ACCELERATOR__ != 0)
63 // Device compile and not host compile:
64 
65 //TODO-HCC enable __HIP_ARCH_HAS_ATOMICS__ when HCC supports these.
66  // 32-bit Atomics:
67 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
68 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
69 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
70 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
71 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
72 
73 // 64-bit Atomics:
74 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
75 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
76 
77 // Doubles
78 #define __HIP_ARCH_HAS_DOUBLES__ (1)
79 
80 //warp cross-lane operations:
81 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
82 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
83 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
84 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
85 
86 //sync
87 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
88 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
89 
90 // misc
91 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
92 #define __HIP_ARCH_HAS_3DGRID__ (1)
93 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
94 
95 #endif
96 
97 
98 
99 
100 
101 //TODO-HCC this is currently ignored by HCC target of HIP
102 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
103 
104 // Detect if we are compiling C++ mode or C mode
105 #if defined(__cplusplus)
106 #define __HCC_CPP__
107 #elif defined(__STDC_VERSION__)
108 #define __HCC_C__
109 #endif
110 
111 #define clock_t long long int
112 __device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); };
113 __device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); };
114 
115 //atomicAdd()
116 __device__ inline int atomicAdd(int* address, int val)
117 {
118  return hc::atomic_fetch_add(address,val);
119 }
120 __device__ inline unsigned int atomicAdd(unsigned int* address,
121  unsigned int val)
122 {
123  return hc::atomic_fetch_add(address,val);
124 }
125 __device__ inline unsigned long long int atomicAdd(unsigned long long int* address,
126  unsigned long long int val)
127 {
128  return (long long int)hc::atomic_fetch_add((uint64_t*)address,(uint64_t)val);
129 }
130 __device__ inline float atomicAdd(float* address, float val)
131 {
132  return hc::atomic_fetch_add(address,val);
133 }
134 
135 //atomicSub()
136 __device__ inline int atomicSub(int* address, int val)
137 {
138  return hc::atomic_fetch_sub(address,val);
139 }
140 __device__ inline unsigned int atomicSub(unsigned int* address,
141  unsigned int val)
142 {
143  return hc::atomic_fetch_sub(address,val);
144 }
145 
146 //atomicExch()
147 __device__ inline int atomicExch(int* address, int val)
148 {
149  return hc::atomic_exchange(address,val);
150 }
151 __device__ inline unsigned int atomicExch(unsigned int* address,
152  unsigned int val)
153 {
154  return hc::atomic_exchange(address,val);
155 }
156 __device__ inline unsigned long long int atomicExch(unsigned long long int* address,
157  unsigned long long int val)
158 {
159  return (long long int)hc::atomic_exchange((uint64_t*)address,(uint64_t)val);
160 }
161 __device__ inline float atomicExch(float* address, float val)
162 {
163  return hc::atomic_exchange(address,val);
164 }
165 
166 //atomicMin()
167 __device__ inline int atomicMin(int* address, int val)
168 {
169  return hc::atomic_fetch_min(address,val);
170 }
171 __device__ inline unsigned int atomicMin(unsigned int* address,
172  unsigned int val)
173 {
174  return hc::atomic_fetch_min(address,val);
175 }
176 __device__ inline unsigned long long int atomicMin(unsigned long long int* address,
177  unsigned long long int val)
178 {
179  return (long long int)hc::atomic_fetch_min((uint64_t*)address,(uint64_t)val);
180 }
181 
182 //atomicMax()
183 __device__ inline int atomicMax(int* address, int val)
184 {
185  return hc::atomic_fetch_max(address,val);
186 }
187 __device__ inline unsigned int atomicMax(unsigned int* address,
188  unsigned int val)
189 {
190  return hc::atomic_fetch_max(address,val);
191 }
192 __device__ inline unsigned long long int atomicMax(unsigned long long int* address,
193  unsigned long long int val)
194 {
195  return (long long int)hc::atomic_fetch_max((uint64_t*)address,(uint64_t)val);
196 }
197 
198 //atomicCAS()
199 __device__ inline int atomicCAS(int* address, int compare, int val)
200 {
201  hc::atomic_compare_exchange(address,&compare,val);
202  return *address;
203 }
204 __device__ inline unsigned int atomicCAS(unsigned int* address,
205  unsigned int compare,
206  unsigned int val)
207 {
208  hc::atomic_compare_exchange(address,&compare,val);
209  return *address;
210 }
211 __device__ inline unsigned long long int atomicCAS(unsigned long long int* address,
212  unsigned long long int compare,
213  unsigned long long int val)
214 {
215  hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
216  return *address;
217 }
218 
219 //atomicAnd()
220 __device__ inline int atomicAnd(int* address, int val)
221 {
222  return hc::atomic_fetch_and(address,val);
223 }
224 __device__ inline unsigned int atomicAnd(unsigned int* address,
225  unsigned int val)
226 {
227  return hc::atomic_fetch_and(address,val);
228 }
229 __device__ inline unsigned long long int atomicAnd(unsigned long long int* address,
230  unsigned long long int val)
231 {
232  return (long long int)hc::atomic_fetch_and((uint64_t*)address,(uint64_t)val);
233 }
234 
235 //atomicOr()
236 __device__ inline int atomicOr(int* address, int val)
237 {
238  return hc::atomic_fetch_or(address,val);
239 }
240 __device__ inline unsigned int atomicOr(unsigned int* address,
241  unsigned int val)
242 {
243  return hc::atomic_fetch_or(address,val);
244 }
245 __device__ inline unsigned long long int atomicOr(unsigned long long int* address,
246  unsigned long long int val)
247 {
248  return (long long int)hc::atomic_fetch_or((uint64_t*)address,(uint64_t)val);
249 }
250 
251 //atomicXor()
252 __device__ inline int atomicXor(int* address, int val)
253 {
254  return hc::atomic_fetch_xor(address,val);
255 }
256 __device__ inline unsigned int atomicXor(unsigned int* address,
257  unsigned int val)
258 {
259  return hc::atomic_fetch_xor(address,val);
260 }
261 __device__ inline unsigned long long int atomicXor(unsigned long long int* address,
262  unsigned long long int val)
263 {
264  return (long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val);
265 }
266 
267 #include <hc.hpp>
268 // integer intrinsic function __poc __clz __ffs __brev
269 __device__ inline unsigned int __popc( unsigned int input)
270 {
271  return hc::__popcount_u32_b32( input);
272 }
273 
274 __device__ inline unsigned int __popcll( unsigned long long int input)
275 {
276  return hc::__popcount_u32_b64(input);
277 }
278 
279 __device__ inline unsigned int __clz(unsigned int input)
280 {
281  return hc::__firstbit_u32_u32( input);
282 }
283 
284 __device__ inline unsigned int __clzll(unsigned long long int input)
285 {
286  return hc::__firstbit_u32_u64( input);
287 }
288 
289 __device__ inline unsigned int __clz(int input)
290 {
291  return hc::__firstbit_u32_s32( input);
292 }
293 
294 __device__ inline unsigned int __clzll(long long int input)
295 {
296  return hc::__firstbit_u32_s64( input);
297 }
298 
299 __device__ inline unsigned int __ffs(unsigned int input)
300 {
301  return hc::__lastbit_u32_u32( input)+1;
302 }
303 
304 __device__ inline unsigned int __ffsll(unsigned long long int input)
305 {
306  return hc::__lastbit_u32_u64( input)+1;
307 }
308 
309 __device__ inline unsigned int __ffs(int input)
310 {
311  return hc::__lastbit_u32_s32( input)+1;
312 }
313 
314 __device__ inline unsigned int __ffsll(long long int input)
315 {
316  return hc::__lastbit_u32_s64( input)+1;
317 }
318 
319 __device__ inline unsigned int __brev( unsigned int input)
320 {
321  return hc::__bitrev_b32( input);
322 }
323 
324 __device__ inline unsigned long long int __brevll( unsigned long long int input)
325 {
326  return hc::__bitrev_b64( input);
327 }
328 
329 // warp vote function __all __any __ballot
330 __device__ inline int __all( int input)
331 {
332  return hc::__all( input);
333 }
334 
335 __device__ inline int __any( int input)
336 {
337  if( hc::__any( input)!=0) return 1;
338  else return 0;
339 }
340 
341 __device__ inline unsigned long long int __ballot( int input)
342 {
343  return hc::__ballot( input);
344 }
345 
346 // warp shuffle functions
347 __device__ inline int __shfl(int input, int lane, int width)
348 {
349  return hc::__shfl(input,lane,width);
350 }
351 
352 __device__ inline int __shfl_up(int input, unsigned int lane_delta, int width)
353 {
354  return hc::__shfl_up(input,lane_delta,width);
355 }
356 
357 __device__ inline int __shfl_down(int input, unsigned int lane_delta, int width)
358 {
359  return hc::__shfl_down(input,lane_delta,width);
360 }
361 
362 __device__ inline int __shfl_xor(int input, int lane_mask, int width)
363 {
364  return hc::__shfl_xor(input,lane_mask,width);
365 }
366 
367 __device__ inline float __shfl(float input, int lane, int width)
368 {
369  return hc::__shfl(input,lane,width);
370 }
371 
372 __device__ inline float __shfl_up(float input, unsigned int lane_delta, int width)
373 {
374  return hc::__shfl_up(input,lane_delta,width);
375 }
376 
377 __device__ inline float __shfl_down(float input, unsigned int lane_delta, int width)
378 {
379  return hc::__shfl_down(input,lane_delta,width);
380 }
381 
382 __device__ inline float __shfl_xor(float input, int lane_mask, int width)
383 {
384  return hc::__shfl_xor(input,lane_mask,width);
385 }
386 
387 
388 #include <hc_math.hpp>
389 // TODO: Choose whether default is precise math or fast math based on compilation flag.
390 #ifdef __HCC_ACCELERATOR__
391 using namespace hc::precise_math;
392 #endif
393 
394 //TODO: Undo this once min/max functions are supported by hc
395 inline int min(int arg1, int arg2) __attribute((hc,cpu)) { \
396  return (int)(hc::precise_math::fmin((float)arg1, (float)arg2));}
397 inline int max(int arg1, int arg2) __attribute((hc,cpu)) { \
398  return (int)(hc::precise_math::fmax((float)arg1, (float)arg2));}
399 
400 
401 //TODO - add a couple fast math operations here, the set here will grow :
402 __device__ inline float __cosf(float x) {return hc::fast_math::cosf(x); };
403 __device__ inline float __expf(float x) {return hc::fast_math::expf(x); };
404 __device__ inline float __frsqrt_rn(float x) {return hc::fast_math::rsqrt(x); };
405 __device__ inline float __fsqrt_rd(float x) {return hc::fast_math::sqrt(x); };
406 __device__ inline float __fsqrt_rn(float x) {return hc::fast_math::sqrt(x); };
407 __device__ inline float __fsqrt_ru(float x) {return hc::fast_math::sqrt(x); };
408 __device__ inline float __fsqrt_rz(float x) {return hc::fast_math::sqrt(x); };
409 __device__ inline float __log10f(float x) {return hc::fast_math::log10f(x); };
410 __device__ inline float __log2f(float x) {return hc::fast_math::log2f(x); };
411 __device__ inline float __logf(float x) {return hc::fast_math::logf(x); };
412 __device__ inline float __powf(float base, float exponent) {return hc::fast_math::powf(base, exponent); };
413 __device__ inline void __sincosf(float x, float *s, float *c) {return hc::fast_math::sincosf(x, s, c); };
414 __device__ inline float __sinf(float x) {return hc::fast_math::sinf(x); };
415 __device__ inline float __tanf(float x) {return hc::fast_math::tanf(x); };
416 __device__ inline float __dsqrt_rd(double x) {return hc::fast_math::sqrt(x); };
417 __device__ inline float __dsqrt_rn(double x) {return hc::fast_math::sqrt(x); };
418 __device__ inline float __dsqrt_ru(double x) {return hc::fast_math::sqrt(x); };
419 __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); };
420 
424 #define hipThreadIdx_x (amp_get_local_id(2))
425 #define hipThreadIdx_y (amp_get_local_id(1))
426 #define hipThreadIdx_z (amp_get_local_id(0))
427 
428 #define hipBlockIdx_x (hc_get_group_id(2))
429 #define hipBlockIdx_y (hc_get_group_id(1))
430 #define hipBlockIdx_z (hc_get_group_id(0))
431 
432 #define hipBlockDim_x (amp_get_local_size(2))
433 #define hipBlockDim_y (amp_get_local_size(1))
434 #define hipBlockDim_z (amp_get_local_size(0))
435 
436 #define hipGridDim_x (hc_get_num_groups(2))
437 #define hipGridDim_y (hc_get_num_groups(1))
438 #define hipGridDim_z (hc_get_num_groups(0))
439 
440 
441 extern int warpSize ;
442 
443 
444 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
445 
446 
447 #if 0
448 #define KALMAR_PFE_BEGIN() \
449  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
450  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
451  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
452  \
453  hc::completion_future cf = hc::parallel_for_each (\
454  *lp.av,\
455  __hipExtTile,\
456  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]]
457 
458 
459 
460 #define KALMAR_PFE_END \
461  ); \
462  if (HIP_LAUNCH_BLOCKING) {\
463  if (HIP_TRACE_API) {\
464  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
465  }\
466  cf.wait(); \
467  if (HIP_TRACE_API) {\
468  fprintf(stderr, "hiptrace1: ...completed.\n");\
469  }\
470  }
471 #endif
472 
473 
474 
475 #define HIP_KERNEL_NAME(...) __VA_ARGS__
476 
477 
478 #ifdef __HCC_CPP__
479 hc::accelerator_view *ihipLaunchKernel(hipStream_t stream);
480 
481 #if not defined(DISABLE_GRID_LAUNCH)
482 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
483 do {\
484  grid_launch_parm lp;\
485  lp.gridDim.x = _numBlocks3D.x; \
486  lp.gridDim.y = _numBlocks3D.y; \
487  lp.gridDim.z = _numBlocks3D.z; \
488  lp.groupDim.x = _blockDim3D.x; \
489  lp.groupDim.y = _blockDim3D.y; \
490  lp.groupDim.z = _blockDim3D.z; \
491  lp.groupMemBytes = _groupMemBytes;\
492  hc::completion_future cf;\
493  lp.cf = &cf; \
494  lp.av = (ihipLaunchKernel(_stream)); \
495  if (HIP_TRACE_API) {\
496  fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
497  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
498  }\
499  _kernelName (lp, __VA_ARGS__);\
500 } while(0)
501 
502 #else
503 #warning(DISABLE_GRID_LAUNCH set)
504 
505 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
506 do {\
507  grid_launch_parm lp;\
508  lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x;/*Convert from #blocks to #threads*/ \
509  lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y;/*Convert from #blocks to #threads*/ \
510  lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z;/*Convert from #blocks to #threads*/ \
511  lp.groupDim.x = _blockDim3D.x; \
512  lp.groupDim.y = _blockDim3D.y; \
513  lp.groupDim.z = _blockDim3D.z; \
514  lp.groupMemBytes = _groupMemBytes;\
515  hc::completion_future cf;\
516  lp.cf = &cf; \
517  lp.av = (ihipLaunchKernel(_stream)); \
518  if (HIP_TRACE_API) {\
519  fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
520  #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
521  }\
522  _kernelName (lp, __VA_ARGS__);\
523 } while(0)
524 /*end hipLaunchKernel */
525 #endif
526 
527 #elif defined (__HCC_C__)
528 
529 //TODO - develop C interface.
530 
531 #endif
532 
533 
534 #if not defined(DISABLE_GRID_LAUNCH)
535 // TODO -In GL these are no-ops and can be removed:
536 // Keep them around for a little while as a fallback.
537 #define KERNELBEGIN
538 #define KERNELEND
539 
540 #else
541 
542 // TODO-GL:
543 // These wrap the kernel in a PFE loop with macros.
544 // Not required with GL but exist here as a fallback.
545 #define KERNELBEGIN \
546  hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
547  auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
548  __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
549  \
550  hc::completion_future cf = \
551  hc::parallel_for_each (\
552  *lp.av,\
553  __hipExtTile,\
554  [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]] \
555  {
556 
557 
558 #define KERNELEND \
559  }); \
560  if (HIP_LAUNCH_BLOCKING) {\
561  if (HIP_TRACE_API) {\
562  fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
563  }\
564  cf.wait(); \
565  if (HIP_TRACE_API) {\
566  fprintf(stderr, "hiptrace1: ...completed.\n");\
567  }\
568  }
569 
570 #endif /*DISABLE_GRID_LAUNCH*/
571 
572 
573 #endif // __HCC__
574 
575 
580 extern int HIP_PRINT_ENV ;
581 extern int HIP_TRACE_API;
582 extern int HIP_LAUNCH_BLOCKING ;
583 
589 // End doxygen API:
int HIP_TRACE_API
Trace HIP APIs.
Definition: hip_hcc.cpp:57
TODO-doc.
Definition: hip_hcc.cpp:82
HIP C++ Texture API for hcc compiler.
int HIP_PRINT_ENV
Print all HIP-related environment variables.
Definition: hip_hcc.cpp:56
Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language ...
int HIP_LAUNCH_BLOCKING
Make all HIP APIs host-synchronous.
Definition: hip_hcc.cpp:58