HIP: Heterogeneous-computing Interface for Portability
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
hip_runtime.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
28 //#pragma once
29 #ifndef HIP_RUNTIME_H
30 #define HIP_RUNTIME_H
31 
32 //---
33 // Top part of file can be compiled with any compiler
34 
35 
36 //#include <cstring>
37 #if __cplusplus
38 #include <cmath>
39 #else
40 #include <math.h>
41 #include <string.h>
42 #include <stddef.h>
43 #endif
44 // Define NVCC_COMPAT for CUDA compatibility
45 #define NVCC_COMPAT
46 #define CUDA_SUCCESS hipSuccess
47 
48 #include <hip/hip_runtime_api.h>
49 //#include "hip/hcc_detail/hip_hcc.h"
50 //---
51 // Remainder of this file only compiles with HCC
52 #ifdef __HCC__
53 #include <grid_launch.h>
54 
55 #if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20)
56 // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
57 #else
58 #error (HCC must support GRID_LAUNCH_20)
59 #endif
60 
61 extern int HIP_TRACE_API;
62 
63 //TODO-HCC-GL - change this to typedef.
64 //typedef grid_launch_parm hipLaunchParm ;
65 #define hipLaunchParm grid_launch_parm
66 #ifdef __cplusplus
67 //#include <hip/hcc_detail/hip_texture.h>
68 #include <hip/hcc_detail/hip_ldg.h>
69 #endif
71 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
72 #if defined (__KALMAR_ACCELERATOR__) && !defined (__HCC_ACCELERATOR__)
73 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
74 #endif
75 
76 // Feature tests:
77 #if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
78 // Device compile and not host compile:
79 
80 //TODO-HCC enable __HIP_ARCH_HAS_ATOMICS__ when HCC supports these.
81  // 32-bit Atomics:
82 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
83 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
84 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
85 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
86 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
87 
88 // 64-bit Atomics:
89 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
90 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
91 
92 // Doubles
93 #define __HIP_ARCH_HAS_DOUBLES__ (1)
94 
95 //warp cross-lane operations:
96 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
97 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
98 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
99 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
100 
101 //sync
102 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
103 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
104 
105 // misc
106 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
107 #define __HIP_ARCH_HAS_3DGRID__ (1)
108 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
109 
110 #endif /* Device feature flags */
111 
112 
113 //TODO-HCC this is currently ignored by HCC target of HIP
114 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
115 
116 // Detect if we are compiling C++ mode or C mode
117 #if defined(__cplusplus)
118 #define __HCC_CPP__
119 #elif defined(__STDC_VERSION__)
120 #define __HCC_C__
121 #endif
122 
123 __device__ float acosf(float x);
124 __device__ float acoshf(float x);
125 __device__ float asinf(float x);
126 __device__ float asinhf(float x);
127 __device__ float atan2f(float y, float x);
128 __device__ float atanf(float x);
129 __device__ float atanhf(float x);
130 __device__ float cbrtf(float x);
131 __device__ float ceilf(float x);
132 __device__ float copysignf(float x, float y);
133 __device__ float coshf(float x);
134 __device__ float cyl_bessel_i0f(float x);
135 __device__ float cyl_bessel_i1f(float x);
136 __device__ float erfcf(float x);
137 __device__ float erfcinvf(float y);
138 __host__ float erfcinvf(float y);
139 __device__ float erfcxf(float x);
140 __host__ float erfcxf(float x);
141 __device__ float erff(float x);
142 __device__ float erfinvf(float y);
143 __host__ float erfinvf(float y);
144 __device__ float exp2f(float x);
145 __device__ float expm1f(float x);
146 __device__ float fabsf(float x);
147 __device__ float fdimf(float x, float y);
148 __device__ __host__ float fdividef(float x, float y);
149 __device__ float floorf(float x);
150 __device__ float fmaf(float x, float y, float z);
151 __device__ float fmaxf(float x, float y);
152 __device__ float fminf(float x, float y);
153 __device__ float fmodf(float x, float y);
// NOTE(review): frexpf is declared here with a second parameter `float y`,
// whereas the double-precision frexp declared later in this header takes
// `int *nptr`. Confirm which signature the device definition actually uses.
154 __device__ float frexpf(float x, float y);
155 __device__ float hypotf(float x, float y);
// NOTE(review): ilogbf is declared here as returning float; the C <math.h>
// ilogbf returns int. Confirm against the device implementation.
156 __device__ float ilogbf(float x);
157 __host__ __device__ unsigned isfinite(float a);
158 __device__ unsigned isinf(float a);
159 __device__ unsigned isnan(float a);
160 __device__ float j0f(float x);
161 __device__ float j1f(float x);
162 __device__ float jnf(int n, float x);
163 __device__ float ldexpf(float x, int exp);
164 __device__ float lgammaf(float x);
165 __device__ long long int llrintf(float x);
166 __device__ long long int llroundf(float x);
167 __device__ float log1pf(float x);
168 __device__ float logbf(float x);
169 __device__ long int lrintf(float x);
170 __device__ long int lroundf(float x);
171 __device__ float modff(float x, float *iptr);
172 __device__ float nanf(const char* tagp);
173 __device__ float nearbyintf(float x);
174 __device__ float nextafterf(float x, float y);
175 __device__ float norm3df(float a, float b, float c);
176 __host__ float norm3df(float a, float b, float c);
177 __device__ float norm4df(float a, float b, float c, float d);
178 __host__ float norm4df(float a, float b, float c, float d);
179 __device__ float normcdff(float y);
180 __host__ float normcdff(float y);
181 __device__ float normcdfinvf(float y);
182 __host__ float normcdfinvf(float y);
183 __device__ float normf(int dim, const float *a);
184 __device__ float rcbrtf(float x);
185 __host__ float rcbrtf(float x);
186 __device__ float remainderf(float x, float y);
187 __device__ float remquof(float x, float y, int *quo);
188 __device__ float rhypotf(float x, float y);
189 __host__ float rhypotf(float x, float y);
190 __device__ float rintf(float x);
191 __device__ float rnorm3df(float a, float b, float c);
192 __host__ float rnorm3df(float a, float b, float c);
193 __device__ float rnorm4df(float a, float b, float c, float d);
194 __host__ float rnorm4df(float a, float b, float c, float d);
195 __device__ float rnormf(int dim, const float* a);
196 __host__ float rnormf(int dim, const float* a);
197 __device__ float roundf(float x);
198 __device__ float rsqrtf(float x);
199 __device__ float scalblnf(float x, long int n);
200 __device__ float scalbnf(float x, int n);
201 __host__ __device__ unsigned signbit(float a);
202 __device__ void sincospif(float x, float *sptr, float *cptr);
203 __host__ void sincospif(float x, float *sptr, float *cptr);
204 __device__ float sinhf(float x);
205 __device__ float sinpif(float x);
206 __device__ float sqrtf(float x);
207 __device__ float tanhf(float x);
208 __device__ float tgammaf(float x);
209 __device__ float truncf(float x);
210 __device__ float y0f(float x);
211 __device__ float y1f(float x);
212 __device__ float ynf(int n, float x);
213 
214 __host__ __device__ float cospif(float x);
215 __host__ __device__ float sinpif(float x);
216 __device__ float sqrtf(float x);
217 __host__ __device__ float rsqrtf(float x);
218 
219 __device__ double acos(double x);
220 __device__ double acosh(double x);
221 __device__ double asin(double x);
222 __device__ double asinh(double x);
223 __device__ double atan(double x);
224 __device__ double atan2(double y, double x);
225 __device__ double atanh(double x);
226 __device__ double cbrt(double x);
227 __device__ double ceil(double x);
228 __device__ double copysign(double x, double y);
229 __device__ double cos(double x);
230 __device__ double cosh(double x);
231 __host__ __device__ double cospi(double x);
232 __device__ double cyl_bessel_i0(double x);
233 __device__ double cyl_bessel_i1(double x);
234 __device__ double erf(double x);
235 __device__ double erfc(double x);
236 __device__ double erfcinv(double y);
237 __device__ double erfcx(double x);
238 __device__ double erfinv(double x);
239 __device__ double exp(double x);
240 __device__ double exp10(double x);
241 __device__ double exp2(double x);
242 __device__ double expm1(double x);
243 __device__ double fabs(double x);
244 __device__ double fdim(double x, double y);
245 __device__ double fdivide(double x, double y);
246 __device__ double floor(double x);
247 __device__ double fma(double x, double y, double z);
248 __device__ double fmax(double x, double y);
249 __device__ double fmin(double x, double y);
250 __device__ double fmod(double x, double y);
251 __device__ double frexp(double x, int *nptr);
252 __device__ double hypot(double x, double y);
253 __device__ double ilogb(double x);
254 __host__ __device__ unsigned isfinite(double x);
255 __device__ unsigned isinf(double x);
256 __device__ unsigned isnan(double x);
257 __device__ double j0(double x);
258 __device__ double j1(double x);
259 __device__ double jn(int n, double x);
260 __device__ double ldexp(double x, int exp);
261 __device__ double lgamma(double x);
262 __device__ long long llrint(double x);
263 __device__ long long llround(double x);
264 __device__ double log(double x);
265 __device__ double log10(double x);
266 __device__ double log1p(double x);
267 __device__ double log2(double x);
268 __device__ double logb(double x);
269 __device__ long int lrint(double x);
270 __device__ long int lround(double x);
271 __device__ double modf(double x, double *iptr);
272 __device__ double nan(const char* tagp);
273 __device__ double nearbyint(double x);
274 __device__ double nextafter(double x, double y);
275 __device__ double norm(int dim, const double* t);
276 __device__ double norm3d(double a, double b, double c);
277 __host__ double norm3d(double a, double b, double c);
278 __device__ double norm4d(double a, double b, double c, double d);
279 __host__ double norm4d(double a, double b, double c, double d);
280 __device__ double normcdf(double y);
281 __host__ double normcdf(double y);
282 __device__ double normcdfinv(double y);
283 __host__ double normcdfinv(double y);
284 __device__ double pow(double x, double y);
285 __device__ double rcbrt(double x);
286 __host__ double rcbrt(double x);
287 __device__ double remainder(double x, double y);
288 __device__ double remquo(double x, double y, int *quo);
289 __device__ double rhypot(double x, double y);
290 __host__ double rhypot(double x, double y);
291 __device__ double rint(double x);
292 __device__ double rnorm(int dim, const double* t);
293 __host__ double rnorm(int dim, const double* t);
294 __device__ double rnorm3d(double a, double b, double c);
295 __host__ double rnorm3d(double a, double b, double c);
296 __device__ double rnorm4d(double a, double b, double c, double d);
297 __host__ double rnorm4d(double a, double b, double c, double d);
298 __device__ double round(double x);
299 __host__ __device__ double rsqrt(double x);
300 __device__ double scalbln(double x, long int n);
301 __device__ double scalbn(double x, int n);
302 __host__ __device__ unsigned signbit(double a);
303 __device__ double sin(double a);
304 __device__ void sincos(double x, double *sptr, double *cptr);
305 __device__ void sincospi(double x, double *sptr, double *cptr);
306 __host__ void sincospi(double x, double *sptr, double *cptr);
307 __device__ double sinh(double x);
308 __host__ __device__ double sinpi(double x);
309 __device__ double sqrt(double x);
310 __device__ double tan(double x);
311 __device__ double tanh(double x);
312 __device__ double tgamma(double x);
313 __device__ double trunc(double x);
314 __device__ double y0(double x);
315 __device__ double y1(double y);
316 __device__ double yn(int n, double x);
317 
318 __host__ double erfcinv(double y);
319 __host__ double erfcx(double x);
320 __host__ double erfinv(double y);
321 __host__ double fdivide(double x, double y);
322 
323 // TODO - hipify-clang - change to use the function call.
324 //#define warpSize hc::__wavesize()
325 extern const int warpSize;
326 
327 
// clock_t is introduced as a preprocessor macro expanding to `long long int`,
// so clock() below is declared as returning `long long int`, the same type
// clock64() returns.
// NOTE(review): because this is a macro rather than a typedef, it also rewrites
// every later use of the identifier clock_t in a translation unit that includes
// this header (e.g. the one from <time.h>) - confirm that is intended.
328 #define clock_t long long int
329 __device__ long long int clock64();
330 __device__ clock_t clock();
331 
332 //atomicAdd()
333 __device__ int atomicAdd(int* address, int val);
334 __device__ unsigned int atomicAdd(unsigned int* address,
335  unsigned int val);
336 
337 __device__ unsigned long long int atomicAdd(unsigned long long int* address,
338  unsigned long long int val);
339 
340 __device__ float atomicAdd(float* address, float val);
341 
342 
343 //atomicSub()
344 __device__ int atomicSub(int* address, int val);
345 
346 __device__ unsigned int atomicSub(unsigned int* address,
347  unsigned int val);
348 
349 
350 //atomicExch()
351 __device__ int atomicExch(int* address, int val);
352 
353 __device__ unsigned int atomicExch(unsigned int* address,
354  unsigned int val);
355 
356 __device__ unsigned long long int atomicExch(unsigned long long int* address,
357  unsigned long long int val);
358 
359 __device__ float atomicExch(float* address, float val);
360 
361 
362 //atomicMin()
363 __device__ int atomicMin(int* address, int val);
364 __device__ unsigned int atomicMin(unsigned int* address,
365  unsigned int val);
366 __device__ unsigned long long int atomicMin(unsigned long long int* address,
367  unsigned long long int val);
368 
369 
370 //atomicMax()
371 __device__ int atomicMax(int* address, int val);
372 __device__ unsigned int atomicMax(unsigned int* address,
373  unsigned int val);
374 __device__ unsigned long long int atomicMax(unsigned long long int* address,
375  unsigned long long int val);
376 
377 
378 //atomicCAS()
379 __device__ int atomicCAS(int* address, int compare, int val);
380 __device__ unsigned int atomicCAS(unsigned int* address,
381  unsigned int compare,
382  unsigned int val);
383 __device__ unsigned long long int atomicCAS(unsigned long long int* address,
384  unsigned long long int compare,
385  unsigned long long int val);
386 
387 
388 //atomicAnd()
389 __device__ int atomicAnd(int* address, int val);
390 __device__ unsigned int atomicAnd(unsigned int* address,
391  unsigned int val);
392 __device__ unsigned long long int atomicAnd(unsigned long long int* address,
393  unsigned long long int val);
394 
395 
396 //atomicOr()
397 __device__ int atomicOr(int* address, int val);
398 __device__ unsigned int atomicOr(unsigned int* address,
399  unsigned int val);
400 __device__ unsigned long long int atomicOr(unsigned long long int* address,
401  unsigned long long int val);
402 
403 
404 //atomicXor()
405 __device__ int atomicXor(int* address, int val);
406 __device__ unsigned int atomicXor(unsigned int* address,
407  unsigned int val);
408 __device__ unsigned long long int atomicXor(unsigned long long int* address,
409  unsigned long long int val);
410 
411 //atomicInc()
412 __device__ unsigned int atomicInc(unsigned int* address,
413  unsigned int val);
414 
415 
416 //atomicDec()
417 __device__ unsigned int atomicDec(unsigned int* address,
418  unsigned int val);
419 
420 //__mul24 __umul24
421 __device__ int __mul24(int arg1, int arg2);
422 __device__ unsigned int __umul24(unsigned int arg1, unsigned int arg2);
423 
424 // integer intrinsic functions: __popc __clz __ffs __brev
425 __device__ unsigned int __popc( unsigned int input);
426 __device__ unsigned int __popcll( unsigned long long int input);
427 __device__ unsigned int __clz(unsigned int input);
428 __device__ unsigned int __clzll(unsigned long long int input);
429 __device__ unsigned int __clz(int input);
430 __device__ unsigned int __clzll(long long int input);
431 __device__ unsigned int __ffs(unsigned int input);
432 __device__ unsigned int __ffsll(unsigned long long int input);
433 __device__ unsigned int __ffs(int input);
434 __device__ unsigned int __ffsll(long long int input);
435 __device__ unsigned int __brev( unsigned int input);
436 __device__ unsigned long long int __brevll( unsigned long long int input);
437 
438 
439 // warp vote function __all __any __ballot
440 __device__ int __all( int input);
441 __device__ int __any( int input);
442 __device__ unsigned long long int __ballot( int input);
443 
444 // warp shuffle functions
445 #ifdef __cplusplus
446 __device__ int __shfl(int input, int lane, int width=warpSize);
447 __device__ int __shfl_up(int input, unsigned int lane_delta, int width=warpSize);
448 __device__ int __shfl_down(int input, unsigned int lane_delta, int width=warpSize);
449 __device__ int __shfl_xor(int input, int lane_mask, int width=warpSize);
450 __device__ float __shfl(float input, int lane, int width=warpSize);
451 __device__ float __shfl_up(float input, unsigned int lane_delta, int width=warpSize);
452 __device__ float __shfl_down(float input, unsigned int lane_delta, int width=warpSize);
453 __device__ float __shfl_xor(float input, int lane_mask, int width=warpSize);
454 #else
455 __device__ int __shfl(int input, int lane, int width);
456 __device__ int __shfl_up(int input, unsigned int lane_delta, int width);
457 __device__ int __shfl_down(int input, unsigned int lane_delta, int width);
458 __device__ int __shfl_xor(int input, int lane_mask, int width);
459 __device__ float __shfl(float input, int lane, int width);
460 __device__ float __shfl_up(float input, unsigned int lane_delta, int width);
461 __device__ float __shfl_down(float input, unsigned int lane_delta, int width);
462 __device__ float __shfl_xor(float input, int lane_mask, int width);
463 #endif
464 
465 __host__ __device__ int min(int arg1, int arg2);
466 __host__ __device__ int max(int arg1, int arg2);
467 
468 __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr();
469 
470 //TODO - add a couple fast math operations here, the set here will grow :
471 
472 // Single Precision Precise Math
473 __device__ float __hip_precise_cosf(float);
474 __device__ float __hip_precise_exp10f(float);
475 __device__ float __hip_precise_expf(float);
476 __device__ float __hip_precise_frsqrt_rn(float);
477 __device__ float __hip_precise_fsqrt_rd(float);
478 __device__ float __hip_precise_fsqrt_rn(float);
479 __device__ float __hip_precise_fsqrt_ru(float);
480 __device__ float __hip_precise_fsqrt_rz(float);
481 __device__ float __hip_precise_log10f(float);
482 __device__ float __hip_precise_log2f(float);
483 __device__ float __hip_precise_logf(float);
484 __device__ float __hip_precise_powf(float, float);
485 __device__ void __hip_precise_sincosf(float,float*,float*);
486 __device__ float __hip_precise_sinf(float);
487 __device__ float __hip_precise_tanf(float);
488 
489 // Double Precision Precise Math
490 __device__ double __hip_precise_dsqrt_rd(double);
491 __device__ double __hip_precise_dsqrt_rn(double);
492 __device__ double __hip_precise_dsqrt_ru(double);
493 __device__ double __hip_precise_dsqrt_rz(double);
494 
495 // Single Precision Fast Math
496 extern __attribute__((const)) float __hip_fast_cosf(float) __asm("llvm.cos.f32");
497 extern __attribute__((const)) float __hip_fast_exp2f(float) __asm("llvm.exp2.f32");
498 __device__ float __hip_fast_exp10f(float);
499 __device__ float __hip_fast_expf(float);
500 __device__ float __hip_fast_frsqrt_rn(float);
501 extern __attribute__((const)) float __hip_fast_fsqrt_rd(float) __asm("llvm.sqrt.f32");
502 __device__ float __hip_fast_fsqrt_rn(float);
503 __device__ float __hip_fast_fsqrt_ru(float);
504 __device__ float __hip_fast_fsqrt_rz(float);
505 __device__ float __hip_fast_log10f(float);
506 extern __attribute__((const)) float __hip_fast_log2f(float) __asm("llvm.log2.f32");
507 __device__ float __hip_fast_logf(float);
508 __device__ float __hip_fast_powf(float, float);
509 __device__ void __hip_fast_sincosf(float,float*,float*);
510 extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
511 __device__ float __hip_fast_tanf(float);
512 extern __attribute__((const)) float __hip_fast_fmaf(float,float,float) __asm("llvm.fma.f32");
513 extern __attribute__((const)) float __hip_fast_frcp(float) __asm("llvm.amdgcn.rcp.f32");
514 
515 extern __attribute__((const)) double __hip_fast_dsqrt(double) __asm("llvm.sqrt.f64");
516 extern __attribute__((const)) double __hip_fast_fma(double,double,double) __asm("llvm.fma.f64");
517 extern __attribute__((const)) double __hip_fast_drcp(double) __asm("llvm.amdgcn.rcp.f64");
518 
519 #ifdef HIP_FAST_MATH
520 // With HIP_FAST_MATH defined, the standard single-precision math names map to the fast-math implementations
521 
522 __device__ inline float cosf(float x) {
523  return __hip_fast_cosf(x);
524 }
525 
526 __device__ inline float exp10f(float x) {
527  return __hip_fast_exp10f(x);
528 }
529 
530 __device__ inline float expf(float x) {
531  return __hip_fast_expf(x);
532 }
533 
534 __device__ inline float log10f(float x) {
535  return __hip_fast_log10f(x);
536 }
537 
538 __device__ inline float log2f(float x) {
539  return __hip_fast_log2f(x);
540 }
541 
542 __device__ inline float logf(float x) {
543  return __hip_fast_logf(x);
544 }
545 
546 __device__ inline float powf(float base, float exponent) {
547  return __hip_fast_powf(base, exponent);
548 }
549 
550 __device__ inline void sincosf(float x, float *s, float *c) {
551  return __hip_fast_sincosf(x, s, c);
552 }
553 
554 __device__ inline float sinf(float x) {
555  return __hip_fast_sinf(x);
556 }
557 
558 __device__ inline float tanf(float x) {
559  return __hip_fast_tanf(x);
560 }
561 
562 #else
563 
564 __device__ float sinf(float);
565 __device__ float cosf(float);
566 __device__ float tanf(float);
567 __device__ void sincosf(float, float*, float*);
568 __device__ float logf(float);
569 __device__ float log2f(float);
570 __device__ float log10f(float);
571 __device__ float expf(float);
572 __device__ float exp10f(float);
573 __device__ float powf(float, float);
574 
575 #endif
576 // Single Precision Fast Math
577 __device__ inline float __cosf(float x) {
578  return __hip_fast_cosf(x);
579 }
580 
581 __device__ inline float __exp10f(float x) {
582  return __hip_fast_exp10f(x);
583 }
584 
585 __device__ inline float __expf(float x) {
586  return __hip_fast_expf(x);
587 }
588 
589 __device__ inline float __frsqrt_rn(float x) {
590  return __hip_fast_frsqrt_rn(x);
591 }
592 
593 __device__ inline float __fsqrt_rd(float x) {
594  return __hip_fast_fsqrt_rd(x);
595 }
596 
597 __device__ inline float __fsqrt_rn(float x) {
598  return __hip_fast_fsqrt_rn(x);
599 }
600 
601 __device__ inline float __fsqrt_ru(float x) {
602  return __hip_fast_fsqrt_ru(x);
603 }
604 
605 __device__ inline float __fsqrt_rz(float x) {
606  return __hip_fast_fsqrt_rz(x);
607 }
608 
609 __device__ inline float __log10f(float x) {
610  return __hip_fast_log10f(x);
611 }
612 
613 __device__ inline float __log2f(float x) {
614  return __hip_fast_log2f(x);
615 }
616 
617 __device__ inline float __logf(float x) {
618  return __hip_fast_logf(x);
619 }
620 
621 __device__ inline float __powf(float base, float exponent) {
622  return __hip_fast_powf(base, exponent);
623 }
624 
625 __device__ inline void __sincosf(float x, float *s, float *c) {
626  return __hip_fast_sincosf(x, s, c);
627 }
628 
629 __device__ inline float __sinf(float x) {
630  return __hip_fast_sinf(x);
631 }
632 
633 __device__ inline float __tanf(float x) {
634  return __hip_fast_tanf(x);
635 }
636 
// Single-precision fused multiply-add, one entry point per rounding-mode
// suffix (_rd, _rn, _ru, _rz). All four forward to the same __hip_fast_fmaf,
// which is declared above with the asm label "llvm.fma.f32"; no
// rounding-mode-specific handling is visible in this header.
637 __device__ inline float __fmaf_rd(float x, float y, float z) {
638  return __hip_fast_fmaf(x, y, z);
639 }
640 
641 __device__ inline float __fmaf_rn(float x, float y, float z) {
642  return __hip_fast_fmaf(x, y, z);
643 }
644 
645 __device__ inline float __fmaf_ru(float x, float y, float z) {
646  return __hip_fast_fmaf(x, y, z);
647 }
648 
649 __device__ inline float __fmaf_rz(float x, float y, float z) {
650  return __hip_fast_fmaf(x, y, z);
651 }
652 
// Single-precision reciprocal, one entry point per rounding-mode suffix
// (_rd, _rn, _ru, _rz). All four forward to the same __hip_fast_frcp, which is
// declared above with the asm label "llvm.amdgcn.rcp.f32"; the suffix does not
// select a different implementation in this header.
653 __device__ inline float __frcp_rd(float x) {
654  return __hip_fast_frcp(x);
655 }
656 
657 __device__ inline float __frcp_rn(float x) {
658  return __hip_fast_frcp(x);
659 }
660 
661 __device__ inline float __frcp_ru(float x) {
662  return __hip_fast_frcp(x);
663 }
664 
665 __device__ inline float __frcp_rz(float x) {
666  return __hip_fast_frcp(x);
667 }
668 
// Double-precision square root, one entry point per rounding-mode suffix
// (_rd, _rn, _ru, _rz). All four forward to the same __hip_fast_dsqrt, which is
// declared above with the asm label "llvm.sqrt.f64"; the suffix does not select
// a different implementation in this header.
669 __device__ inline double __dsqrt_rd(double x) {
670  return __hip_fast_dsqrt(x);
671 }
672 
673 __device__ inline double __dsqrt_rn(double x) {
674  return __hip_fast_dsqrt(x);
675 }
676 
677 __device__ inline double __dsqrt_ru(double x) {
678  return __hip_fast_dsqrt(x);
679 }
680 
681 __device__ inline double __dsqrt_rz(double x) {
682  return __hip_fast_dsqrt(x);
683 }
684 
// Double-precision fused multiply-add, one entry point per rounding-mode
// suffix (_rd, _rn, _ru, _rz). All four forward to the same __hip_fast_fma,
// which is declared above with the asm label "llvm.fma.f64"; no
// rounding-mode-specific handling is visible in this header.
685 __device__ inline double __fma_rd(double x, double y, double z) {
686  return __hip_fast_fma(x, y, z);
687 }
688 
689 __device__ inline double __fma_rn(double x, double y, double z) {
690  return __hip_fast_fma(x, y, z);
691 }
692 
693 __device__ inline double __fma_ru(double x, double y, double z) {
694  return __hip_fast_fma(x, y, z);
695 }
696 
697 __device__ inline double __fma_rz(double x, double y, double z) {
698  return __hip_fast_fma(x, y, z);
699 }
700 
// Double-precision reciprocal, one entry point per rounding-mode suffix
// (_rd, _rn, _ru, _rz). All four forward to the same __hip_fast_drcp, which is
// declared above with the asm label "llvm.amdgcn.rcp.f64"; the suffix does not
// select a different implementation in this header.
701 __device__ inline double __drcp_rd(double x) {
702  return __hip_fast_drcp(x);
703 }
704 
705 __device__ inline double __drcp_rn(double x) {
706  return __hip_fast_drcp(x);
707 }
708 
709 __device__ inline double __drcp_ru(double x) {
710  return __hip_fast_drcp(x);
711 }
712 
713 __device__ inline double __drcp_rz(double x) {
714  return __hip_fast_drcp(x);
715 }
716 
750 // __device__ void __threadfence_block(void);
751 extern "C" __device__ void __threadfence_block(void);
752 
762 // __device__ void __threadfence(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional")));
763 extern "C" __device__ void __threadfence(void);
764 
774 //__device__ void __threadfence_system(void) __attribute__((deprecated("Provided with workaround configuration, see hip_kernel_language.md for details")));
775 __device__ void __threadfence_system(void) ;
776 
777 __device__ unsigned __hip_ds_bpermute(int index, unsigned src);
778 __device__ float __hip_ds_bpermutef(int index, float src);
779 __device__ unsigned __hip_ds_permute(int index, unsigned src);
780 __device__ float __hip_ds_permutef(int index, float src);
781 
782 __device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern);
783 __device__ float __hip_ds_swizzlef(float src, int pattern);
784 
785 __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);
786 
787 // doxygen end Fence Fence
793 #define hipThreadIdx_x (hc_get_workitem_id(0))
794 #define hipThreadIdx_y (hc_get_workitem_id(1))
795 #define hipThreadIdx_z (hc_get_workitem_id(2))
796 
797 #define hipBlockIdx_x (hc_get_group_id(0))
798 #define hipBlockIdx_y (hc_get_group_id(1))
799 #define hipBlockIdx_z (hc_get_group_id(2))
800 
801 #define hipBlockDim_x (hc_get_group_size(0))
802 #define hipBlockDim_y (hc_get_group_size(1))
803 #define hipBlockDim_z (hc_get_group_size(2))
804 
805 #define hipGridDim_x (hc_get_num_groups(0))
806 #define hipGridDim_y (hc_get_num_groups(1))
807 #define hipGridDim_z (hc_get_num_groups(2))
808 
809 //extern "C" __device__ void* memcpy(void* dst, void* src, size_t size);
810 //extern "C" __device__ void* memset(void* ptr, uint8_t val, size_t size);
811 
// Device-side allocation entry points with C linkage. The commented-out
// malloc/free declarations that follow suggest these are the backing
// routines for device malloc/free - TODO confirm against the implementation.
812 extern "C" __device__ void* __hip_hc_malloc(size_t);
// NOTE(review): __hip_hc_free is declared as returning void* rather than void;
// what the returned pointer represents is not visible from this header.
813 extern "C" __device__ void* __hip_hc_free(void *ptr);
814 
815 //extern "C" __device__ void* malloc(size_t size);
816 //extern "C" __device__ void* free(void *ptr);
817 
818 extern "C" __device__ char4 __hip_hc_add8pk(char4, char4);
819 extern "C" __device__ char4 __hip_hc_sub8pk(char4, char4);
820 extern "C" __device__ char4 __hip_hc_mul8pk(char4, char4);
821 
822 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
823 
824 #define HIP_KERNEL_NAME(...) __VA_ARGS__
825 #define HIP_SYMBOL(X) #X
826 
827 #ifdef __HCC_CPP__
828 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr);
829 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
830 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr);
831 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
832 extern void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launch_parm &lp);
833 
834 
// hipLaunchKernel(kernel, numBlocks, blockDim, groupMemBytes, stream, args...)
// expands to a single do { ... } while(0) statement that:
//   1. declares a local grid_launch_parm named `lp` and stores _groupMemBytes
//      in lp.dynamic_group_mem_bytes;
//   2. calls ihipPreLaunchKernel with the stream, grid and block arguments,
//      &lp and the stringized kernel name, keeping the returned stream in a
//      local named `trueStream`;
//   3. invokes the kernel with `lp` as its first argument, followed by the
//      variadic arguments;
//   4. calls ihipPostLaunchKernel with the stringized kernel name, trueStream
//      and lp.
// `lp` and `trueStream` live inside the macro's block, so a caller argument
// expression that mentions either name binds to the macro's locals.
835 // Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be either size_t or dim3 types
836 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
837 do {\
838  grid_launch_parm lp;\
839  lp.dynamic_group_mem_bytes = _groupMemBytes; \
840  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
841  _kernelName (lp, ##__VA_ARGS__);\
842  ihipPostLaunchKernel(#_kernelName, trueStream, lp);\
843 } while(0)
844 
845 
846 #elif defined (__HCC_C__)
847 
848 //TODO - develop C interface.
849 
850 #endif
851 
856 // Macro to replace extern __shared__ declarations
857 // to local variable definitions
// HIP_DYNAMIC_SHARED(type, var) expands to a definition of `var` as a
// `type*` in address_space(3), initialised from __get_dynamicgroupbaseptr()
// cast to that pointer type. The expansion already ends with ';', so no
// trailing semicolon is needed at the use site.
// The final body line deliberately carries no line-continuation backslash, so
// whatever follows the macro can never be spliced into its replacement text.
858 #define HIP_DYNAMIC_SHARED(type, var) \
859  __attribute__((address_space(3))) type* var = \
860  (__attribute__((address_space(3))) type*)__get_dynamicgroupbaseptr();
861 
862 #define HIP_DYNAMIC_SHARED_ATTRIBUTE __attribute__((address_space(3)))
863 
864 #endif // __HCC__
865 
866 
871 //extern int HIP_PRINT_ENV ; ///< Print all HIP-related environment variables.
872 //extern int HIP_TRACE_API; ///< Trace HIP APIs.
873 //extern int HIP_LAUNCH_BLOCKING ; ///< Make all HIP APIs host-synchronous
874 
880 // End doxygen API:
888 #endif
__device__ void __threadfence(void)
threadfence makes writes visible to other threads running on same GPU.
TODO-doc.
Definition: hip_runtime_api.h:151
#define __host__
Definition: host_defines.h:35
__device__ void __threadfence_system(void)
threadfence_system makes writes to pinned system memory visible on host CPU.
Definition: device_util.cpp:2600
Definition: hip_vector_types.h:140
__device__ void __threadfence_block(void)
threadfence_block makes writes visible to threads running in same block.
Definition: hip_hcc.h:463