46 #define CUDA_SUCCESS hipSuccess
48 #include <hip/hip_runtime_api.h>
53 #include <grid_launch.h>
// Version gate for the grid_launch interface; `and` is the C++ alternative
// spelling of &&.
// NOTE(review): as listed, the #error sits directly under the #if, so it would
// fire when GRID_LAUNCH_VERSION >= 20 is satisfied rather than when it is not.
// The embedded numbering jumps from 55 to 58, so an #else was presumably
// dropped from this listing -- confirm against the real header.
55 #if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20)
58 #error (HCC must support GRID_LAUNCH_20)
61 extern int HIP_TRACE_API;
65 #define hipLaunchParm grid_launch_parm
68 #include <hip/hcc_detail/hip_ldg.h>
// Fallback alias: when __KALMAR_ACCELERATOR__ is defined but
// __HCC_ACCELERATOR__ is not, define the latter to expand to the former so the
// checks that follow only need to test __HCC_ACCELERATOR__.
72 #if defined (__KALMAR_ACCELERATOR__) && !defined (__HCC_ACCELERATOR__)
73 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
77 #if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
// Per-feature capability macros. Each expands to a parenthesised 1 or 0;
// presumably 1 means "available on this target" -- the meaning of the values
// is not stated in the visible lines, so confirm before relying on it.

// 32-bit integer atomics and float exchange.
82 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
83 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
84 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
85 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
// NOTE(review): this flag is 0, yet a `float atomicAdd(float*, float)`
// prototype appears later in this listing -- confirm which is authoritative.
86 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
// 64-bit integer atomics.
89 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
90 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
// Double-precision support.
93 #define __HIP_ARCH_HAS_DOUBLES__ (1)
// Warp-level vote / ballot / shuffle / funnel-shift operations.
96 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
97 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
98 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
99 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
// Fence and extended-sync operations.
102 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
103 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
// Miscellaneous features.
106 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
107 #define __HIP_ARCH_HAS_3DGRID__ (1)
108 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
// Expands to nothing: both arguments are discarded, so annotating a kernel
// with __launch_bounds__(...) has no effect through this definition.
114 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
117 #if defined(__cplusplus)
119 #elif defined(__STDC_VERSION__)
123 __device__
float acosf(
float x);
124 __device__
float acoshf(
float x);
125 __device__
float asinf(
float x);
126 __device__
float asinhf(
float x);
127 __device__
float atan2f(
float y,
float x);
128 __device__
float atanf(
float x);
129 __device__
float atanhf(
float x);
130 __device__
float cbrtf(
float x);
131 __device__
float ceilf(
float x);
132 __device__
float copysignf(
float x,
float y);
133 __device__
float coshf(
float x);
134 __device__
float cyl_bessel_i0f(
float x);
135 __device__
float cyl_bessel_i1f(
float x);
136 __device__
float erfcf(
float x);
137 __device__
float erfcinvf(
float y);
139 __device__
float erfcxf(
float x);
141 __device__
float erff(
float x);
142 __device__
float erfinvf(
float y);
144 __device__
float exp2f(
float x);
145 __device__
float expm1f(
float x);
146 __device__
float fabsf(
float x);
147 __device__
float fdimf(
float x,
float y);
148 __device__
__host__ float fdividef(
float x,
float y);
149 __device__
float floorf(
float x);
150 __device__
float fmaf(
float x,
float y,
float z);
151 __device__
float fmaxf(
float x,
float y);
152 __device__
float fminf(
float x,
float y);
153 __device__
float fmodf(
float x,
float y);
// NOTE(review): the second parameter is declared as `float y`, whereas the
// double-precision frexp prototype in this listing takes `int *nptr` and
// ISO C frexpf takes an `int*` exponent out-parameter. This looks like a
// copy-paste slip from the neighbouring two-float prototypes. Confirm against
// the definition before changing it, since the prototype must match.
154 __device__
float frexpf(
float x,
float y);
155 __device__
float hypotf(
float x,
float y);
// NOTE(review): ISO C ilogbf returns int; this prototype returns float.
// Confirm whether that is intentional before relying on the return type.
156 __device__
float ilogbf(
float x);
157 __host__ __device__
unsigned isfinite(
float a);
158 __device__
unsigned isinf(
float a);
159 __device__
unsigned isnan(
float a);
160 __device__
float j0f(
float x);
161 __device__
float j1f(
float x);
162 __device__
float jnf(
int n,
float x);
163 __device__
float ldexpf(
float x,
int exp);
164 __device__
float lgammaf(
float x);
165 __device__
long long int llrintf(
float x);
166 __device__
long long int llroundf(
float x);
167 __device__
float log1pf(
float x);
168 __device__
float logbf(
float x);
169 __device__
long int lrintf(
float x);
170 __device__
long int lroundf(
float x);
171 __device__
float modff(
float x,
float *iptr);
172 __device__
float nanf(
const char* tagp);
173 __device__
float nearbyintf(
float x);
174 __device__
float nextafterf(
float x,
float y);
175 __device__
float norm3df(
float a,
float b,
float c);
176 __host__ float norm3df(
float a,
float b,
float c);
177 __device__
float norm4df(
float a,
float b,
float c,
float d);
178 __host__ float norm4df(
float a,
float b,
float c,
float d);
179 __device__
float normcdff(
float y);
181 __device__
float normcdfinvf(
float y);
182 __host__ float normcdfinvf(
float y);
183 __device__
float normf(
int dim,
const float *a);
184 __device__
float rcbrtf(
float x);
186 __device__
float remainderf(
float x,
float y);
187 __device__
float remquof(
float x,
float y,
int *quo);
188 __device__
float rhypotf(
float x,
float y);
189 __host__ float rhypotf(
float x,
float y);
190 __device__
float rintf(
float x);
191 __device__
float rnorm3df(
float a,
float b,
float c);
192 __host__ float rnorm3df(
float a,
float b,
float c);
193 __device__
float rnorm4df(
float a,
float b,
float c,
float d);
194 __host__ float rnorm4df(
float a,
float b,
float c,
float d);
195 __device__
float rnormf(
int dim,
const float* a);
196 __host__ float rnormf(
int dim,
const float* a);
197 __device__
float roundf(
float x);
198 __device__
float rsqrtf(
float x);
199 __device__
float scalblnf(
float x,
long int n);
200 __device__
float scalbnf(
float x,
int n);
201 __host__ __device__
unsigned signbit(
float a);
202 __device__
void sincospif(
float x,
float *sptr,
float *cptr);
203 __host__ void sincospif(
float x,
float *sptr,
float *cptr);
204 __device__
float sinhf(
float x);
205 __device__
float sinpif(
float x);
206 __device__
float sqrtf(
float x);
207 __device__
float tanhf(
float x);
208 __device__
float tgammaf(
float x);
209 __device__
float truncf(
float x);
210 __device__
float y0f(
float x);
211 __device__
float y1f(
float x);
212 __device__
float ynf(
int n,
float x);
// Second group of single-precision prototypes.
// NOTE(review): sinpif, sqrtf and rsqrtf are already declared earlier in this
// listing as __device__ only. Here sinpif and rsqrtf are redeclared as
// __host__ __device__, while sqrtf is repeated unchanged. Confirm whether the
// differing qualifiers are intentional or the earlier prototypes are stale.
214 __host__ __device__
float cospif(
float x);
215 __host__ __device__
float sinpif(
float x);
216 __device__
float sqrtf(
float x);
217 __host__ __device__
float rsqrtf(
float x);
219 __device__
double acos(
double x);
220 __device__
double acosh(
double x);
221 __device__
double asin(
double x);
222 __device__
double asinh(
double x);
223 __device__
double atan(
double x);
224 __device__
double atan2(
double y,
double x);
225 __device__
double atanh(
double x);
226 __device__
double cbrt(
double x);
227 __device__
double ceil(
double x);
228 __device__
double copysign(
double x,
double y);
229 __device__
double cos(
double x);
230 __device__
double cosh(
double x);
231 __host__ __device__
double cospi(
double x);
232 __device__
double cyl_bessel_i0(
double x);
233 __device__
double cyl_bessel_i1(
double x);
234 __device__
double erf(
double x);
235 __device__
double erfc(
double x);
236 __device__
double erfcinv(
double y);
237 __device__
double erfcx(
double x);
238 __device__
double erfinv(
double x);
239 __device__
double exp(
double x);
240 __device__
double exp10(
double x);
241 __device__
double exp2(
double x);
242 __device__
double expm1(
double x);
243 __device__
double fabs(
double x);
244 __device__
double fdim(
double x,
double y);
245 __device__
double fdivide(
double x,
double y);
246 __device__
double floor(
double x);
247 __device__
double fma(
double x,
double y,
double z);
248 __device__
double fmax(
double x,
double y);
249 __device__
double fmin(
double x,
double y);
250 __device__
double fmod(
double x,
double y);
251 __device__
double frexp(
double x,
int *nptr);
252 __device__
double hypot(
double x,
double y);
// NOTE(review): ISO C ilogb returns int; this prototype returns double.
// Confirm whether that is intentional before relying on the return type.
253 __device__
double ilogb(
double x);
254 __host__ __device__
unsigned isfinite(
double x);
255 __device__
unsigned isinf(
double x);
256 __device__
unsigned isnan(
double x);
257 __device__
double j0(
double x);
258 __device__
double j1(
double x);
259 __device__
double jn(
int n,
double x);
260 __device__
double ldexp(
double x,
int exp);
261 __device__
double lgamma(
double x);
262 __device__
long long llrint(
double x);
263 __device__
long long llround(
double x);
264 __device__
double log(
double x);
265 __device__
double log10(
double x);
266 __device__
double log1p(
double x);
267 __device__
double log2(
double x);
268 __device__
double logb(
double x);
269 __device__
long int lrint(
double x);
270 __device__
long int lround(
double x);
271 __device__
double modf(
double x,
double *iptr);
272 __device__
double nan(
const char* tagp);
273 __device__
double nearbyint(
double x);
274 __device__
double nextafter(
double x,
double y);
275 __device__
double norm(
int dim,
const double* t);
276 __device__
double norm3d(
double a,
double b,
double c);
277 __host__ double norm3d(
double a,
double b,
double c);
278 __device__
double norm4d(
double a,
double b,
double c,
double d);
279 __host__ double norm4d(
double a,
double b,
double c,
double d);
280 __device__
double normcdf(
double y);
282 __device__
double normcdfinv(
double y);
283 __host__ double normcdfinv(
double y);
284 __device__
double pow(
double x,
double y);
285 __device__
double rcbrt(
double x);
287 __device__
double remainder(
double x,
double y);
288 __device__
double remquo(
double x,
double y,
int *quo);
289 __device__
double rhypot(
double x,
double y);
290 __host__ double rhypot(
double x,
double y);
291 __device__
double rint(
double x);
292 __device__
double rnorm(
int dim,
const double* t);
293 __host__ double rnorm(
int dim,
const double* t);
294 __device__
double rnorm3d(
double a,
double b,
double c);
295 __host__ double rnorm3d(
double a,
double b,
double c);
296 __device__
double rnorm4d(
double a,
double b,
double c,
double d);
297 __host__ double rnorm4d(
double a,
double b,
double c,
double d);
298 __device__
double round(
double x);
299 __host__ __device__
double rsqrt(
double x);
300 __device__
double scalbln(
double x,
long int n);
301 __device__
double scalbn(
double x,
int n);
302 __host__ __device__
unsigned signbit(
double a);
303 __device__
double sin(
double a);
304 __device__
void sincos(
double x,
double *sptr,
double *cptr);
305 __device__
void sincospi(
double x,
double *sptr,
double *cptr);
306 __host__ void sincospi(
double x,
double *sptr,
double *cptr);
307 __device__
double sinh(
double x);
308 __host__ __device__
double sinpi(
double x);
309 __device__
double sqrt(
double x);
310 __device__
double tan(
double x);
311 __device__
double tanh(
double x);
312 __device__
double tgamma(
double x);
313 __device__
double trunc(
double x);
314 __device__
double y0(
double x);
315 __device__
double y1(
double y);
316 __device__
double yn(
int n,
double x);
321 __host__ double fdivide(
double x,
double y);
325 extern const int warpSize;
// clock_t is introduced as a preprocessor macro, not a typedef: every later
// `clock_t` token seen after this line is textually replaced by
// `long long int`. As a result clock() and clock64() below are declared with
// the same return type.
328 #define clock_t long long int
329 __device__
long long int clock64();
330 __device__ clock_t clock();
333 __device__
int atomicAdd(
int* address,
int val);
334 __device__
unsigned int atomicAdd(
unsigned int* address,
337 __device__
unsigned long long int atomicAdd(
unsigned long long int* address,
338 unsigned long long int val);
340 __device__
float atomicAdd(
float* address,
float val);
344 __device__
int atomicSub(
int* address,
int val);
346 __device__
unsigned int atomicSub(
unsigned int* address,
351 __device__
int atomicExch(
int* address,
int val);
353 __device__
unsigned int atomicExch(
unsigned int* address,
356 __device__
unsigned long long int atomicExch(
unsigned long long int* address,
357 unsigned long long int val);
359 __device__
float atomicExch(
float* address,
float val);
363 __device__
int atomicMin(
int* address,
int val);
364 __device__
unsigned int atomicMin(
unsigned int* address,
366 __device__
unsigned long long int atomicMin(
unsigned long long int* address,
367 unsigned long long int val);
371 __device__
int atomicMax(
int* address,
int val);
372 __device__
unsigned int atomicMax(
unsigned int* address,
374 __device__
unsigned long long int atomicMax(
unsigned long long int* address,
375 unsigned long long int val);
379 __device__
int atomicCAS(
int* address,
int compare,
int val);
380 __device__
unsigned int atomicCAS(
unsigned int* address,
381 unsigned int compare,
383 __device__
unsigned long long int atomicCAS(
unsigned long long int* address,
384 unsigned long long int compare,
385 unsigned long long int val);
389 __device__
int atomicAnd(
int* address,
int val);
390 __device__
unsigned int atomicAnd(
unsigned int* address,
392 __device__
unsigned long long int atomicAnd(
unsigned long long int* address,
393 unsigned long long int val);
397 __device__
int atomicOr(
int* address,
int val);
398 __device__
unsigned int atomicOr(
unsigned int* address,
400 __device__
unsigned long long int atomicOr(
unsigned long long int* address,
401 unsigned long long int val);
405 __device__
int atomicXor(
int* address,
int val);
406 __device__
unsigned int atomicXor(
unsigned int* address,
408 __device__
unsigned long long int atomicXor(
unsigned long long int* address,
409 unsigned long long int val);
412 __device__
unsigned int atomicInc(
unsigned int* address,
417 __device__
unsigned int atomicDec(
unsigned int* address,
421 __device__
int __mul24(
int arg1,
int arg2);
422 __device__
unsigned int __umul24(
unsigned int arg1,
unsigned int arg2);
425 __device__
unsigned int __popc(
unsigned int input);
426 __device__
unsigned int __popcll(
unsigned long long int input);
427 __device__
unsigned int __clz(
unsigned int input);
428 __device__
unsigned int __clzll(
unsigned long long int input);
429 __device__
unsigned int __clz(
int input);
430 __device__
unsigned int __clzll(
long long int input);
431 __device__
unsigned int __ffs(
unsigned int input);
432 __device__
unsigned int __ffsll(
unsigned long long int input);
433 __device__
unsigned int __ffs(
int input);
434 __device__
unsigned int __ffsll(
long long int input);
435 __device__
unsigned int __brev(
unsigned int input);
436 __device__
unsigned long long int __brevll(
unsigned long long int input);
440 __device__
int __all(
int input);
441 __device__
int __any(
int input);
442 __device__
unsigned long long int __ballot(
int input);
446 __device__
int __shfl(
int input,
int lane,
int width=warpSize);
447 __device__
int __shfl_up(
int input,
unsigned int lane_delta,
int width=warpSize);
448 __device__
int __shfl_down(
int input,
unsigned int lane_delta,
int width=warpSize);
449 __device__
int __shfl_xor(
int input,
int lane_mask,
int width=warpSize);
450 __device__
float __shfl(
float input,
int lane,
int width=warpSize);
451 __device__
float __shfl_up(
float input,
unsigned int lane_delta,
int width=warpSize);
452 __device__
float __shfl_down(
float input,
unsigned int lane_delta,
int width=warpSize);
453 __device__
float __shfl_xor(
float input,
int lane_mask,
int width=warpSize);
455 __device__
int __shfl(
int input,
int lane,
int width);
456 __device__
int __shfl_up(
int input,
unsigned int lane_delta,
int width);
457 __device__
int __shfl_down(
int input,
unsigned int lane_delta,
int width);
458 __device__
int __shfl_xor(
int input,
int lane_mask,
int width);
459 __device__
float __shfl(
float input,
int lane,
int width);
460 __device__
float __shfl_up(
float input,
unsigned int lane_delta,
int width);
461 __device__
float __shfl_down(
float input,
unsigned int lane_delta,
int width);
462 __device__
float __shfl_xor(
float input,
int lane_mask,
int width);
465 __host__ __device__
int min(
int arg1,
int arg2);
466 __host__ __device__
int max(
int arg1,
int arg2);
468 __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr();
473 __device__
float __hip_precise_cosf(
float);
474 __device__
float __hip_precise_exp10f(
float);
475 __device__
float __hip_precise_expf(
float);
476 __device__
float __hip_precise_frsqrt_rn(
float);
477 __device__
float __hip_precise_fsqrt_rd(
float);
478 __device__
float __hip_precise_fsqrt_rn(
float);
479 __device__
float __hip_precise_fsqrt_ru(
float);
480 __device__
float __hip_precise_fsqrt_rz(
float);
481 __device__
float __hip_precise_log10f(
float);
482 __device__
float __hip_precise_log2f(
float);
483 __device__
float __hip_precise_logf(
float);
484 __device__
float __hip_precise_powf(
float,
float);
485 __device__
void __hip_precise_sincosf(
float,
float*,
float*);
486 __device__
float __hip_precise_sinf(
float);
487 __device__
float __hip_precise_tanf(
float);
490 __device__
double __hip_precise_dsqrt_rd(
double);
491 __device__
double __hip_precise_dsqrt_rn(
double);
492 __device__
double __hip_precise_dsqrt_ru(
double);
493 __device__
double __hip_precise_dsqrt_rz(
double);
496 extern __attribute__((const))
float __hip_fast_cosf(
float) __asm("llvm.cos.f32");
497 extern __attribute__((const))
float __hip_fast_exp2f(
float) __asm("llvm.exp2.f32");
498 __device__
float __hip_fast_exp10f(
float);
499 __device__
float __hip_fast_expf(
float);
500 __device__
float __hip_fast_frsqrt_rn(
float);
501 extern __attribute__((const))
float __hip_fast_fsqrt_rd(
float) __asm("llvm.sqrt.f32");
502 __device__
float __hip_fast_fsqrt_rn(
float);
503 __device__
float __hip_fast_fsqrt_ru(
float);
504 __device__
float __hip_fast_fsqrt_rz(
float);
505 __device__
float __hip_fast_log10f(
float);
506 extern __attribute__((const))
float __hip_fast_log2f(
float) __asm("llvm.log2.f32");
507 __device__
float __hip_fast_logf(
float);
508 __device__
float __hip_fast_powf(
float,
float);
509 __device__
void __hip_fast_sincosf(
float,
float*,
float*);
510 extern __attribute__((const))
float __hip_fast_sinf(
float) __asm("llvm.sin.f32");
511 __device__
float __hip_fast_tanf(
float);
512 extern __attribute__((const))
float __hip_fast_fmaf(
float,
float,
float) __asm("llvm.fma.f32");
513 extern __attribute__((const))
float __hip_fast_frcp(
float) __asm("llvm.amdgcn.rcp.f32");
515 extern __attribute__((const))
double __hip_fast_dsqrt(
double) __asm("llvm.sqrt.f64");
516 extern __attribute__((const))
double __hip_fast_fma(
double,
double,
double) __asm("llvm.fma.f64");
517 extern __attribute__((const))
double __hip_fast_drcp(
double) __asm("llvm.amdgcn.rcp.f64");
522 __device__
inline float cosf(
float x) {
523 return __hip_fast_cosf(x);
526 __device__
inline float exp10f(
float x) {
527 return __hip_fast_exp10f(x);
530 __device__
inline float expf(
float x) {
531 return __hip_fast_expf(x);
534 __device__
inline float log10f(
float x) {
535 return __hip_fast_log10f(x);
538 __device__
inline float log2f(
float x) {
539 return __hip_fast_log2f(x);
542 __device__
inline float logf(
float x) {
543 return __hip_fast_logf(x);
546 __device__
inline float powf(
float base,
float exponent) {
547 return __hip_fast_powf(base, exponent);
550 __device__
inline void sincosf(
float x,
float *s,
float *c) {
551 return __hip_fast_sincosf(x, s, c);
554 __device__
inline float sinf(
float x) {
555 return __hip_fast_sinf(x);
558 __device__
inline float tanf(
float x) {
559 return __hip_fast_tanf(x);
564 __device__
float sinf(
float);
565 __device__
float cosf(
float);
566 __device__
float tanf(
float);
567 __device__
void sincosf(
float,
float*,
float*);
568 __device__
float logf(
float);
569 __device__
float log2f(
float);
570 __device__
float log10f(
float);
571 __device__
float expf(
float);
572 __device__
float exp10f(
float);
573 __device__
float powf(
float,
float);
577 __device__
inline float __cosf(
float x) {
578 return __hip_fast_cosf(x);
581 __device__
inline float __exp10f(
float x) {
582 return __hip_fast_exp10f(x);
585 __device__
inline float __expf(
float x) {
586 return __hip_fast_expf(x);
589 __device__
inline float __frsqrt_rn(
float x) {
590 return __hip_fast_frsqrt_rn(x);
593 __device__
inline float __fsqrt_rd(
float x) {
594 return __hip_fast_fsqrt_rd(x);
597 __device__
inline float __fsqrt_rn(
float x) {
598 return __hip_fast_fsqrt_rn(x);
601 __device__
inline float __fsqrt_ru(
float x) {
602 return __hip_fast_fsqrt_ru(x);
605 __device__
inline float __fsqrt_rz(
float x) {
606 return __hip_fast_fsqrt_rz(x);
609 __device__
inline float __log10f(
float x) {
610 return __hip_fast_log10f(x);
613 __device__
inline float __log2f(
float x) {
614 return __hip_fast_log2f(x);
617 __device__
inline float __logf(
float x) {
618 return __hip_fast_logf(x);
621 __device__
inline float __powf(
float base,
float exponent) {
622 return __hip_fast_powf(base, exponent);
625 __device__
inline void __sincosf(
float x,
float *s,
float *c) {
626 return __hip_fast_sincosf(x, s, c);
629 __device__
inline float __sinf(
float x) {
630 return __hip_fast_sinf(x);
633 __device__
inline float __tanf(
float x) {
634 return __hip_fast_tanf(x);
// Single-precision fused multiply-add intrinsics, one per rounding-mode suffix
// (_rd, _rn, _ru, _rz). All four bodies forward to the same __hip_fast_fmaf
// call; no rounding-mode-specific code is visible here, so the suffix does not
// select a different implementation in these lines.
// NOTE(review): the closing brace of each body is absent from this listing
// (the embedded numbering skips those lines).
637 __device__
inline float __fmaf_rd(
float x,
float y,
float z) {
638 return __hip_fast_fmaf(x, y, z);
641 __device__
inline float __fmaf_rn(
float x,
float y,
float z) {
642 return __hip_fast_fmaf(x, y, z);
645 __device__
inline float __fmaf_ru(
float x,
float y,
float z) {
646 return __hip_fast_fmaf(x, y, z);
649 __device__
inline float __fmaf_rz(
float x,
float y,
float z) {
650 return __hip_fast_fmaf(x, y, z);
// Single-precision reciprocal intrinsics, one per rounding-mode suffix.
// Every variant returns __hip_fast_frcp(x); the suffix does not change the
// callee in the lines shown.
653 __device__
inline float __frcp_rd(
float x) {
654 return __hip_fast_frcp(x);
657 __device__
inline float __frcp_rn(
float x) {
658 return __hip_fast_frcp(x);
661 __device__
inline float __frcp_ru(
float x) {
662 return __hip_fast_frcp(x);
665 __device__
inline float __frcp_rz(
float x) {
666 return __hip_fast_frcp(x);
// Double-precision square-root intrinsics, one per rounding-mode suffix.
// Every variant returns __hip_fast_dsqrt(x); the suffix does not change the
// callee in the lines shown.
669 __device__
inline double __dsqrt_rd(
double x) {
670 return __hip_fast_dsqrt(x);
673 __device__
inline double __dsqrt_rn(
double x) {
674 return __hip_fast_dsqrt(x);
677 __device__
inline double __dsqrt_ru(
double x) {
678 return __hip_fast_dsqrt(x);
681 __device__
inline double __dsqrt_rz(
double x) {
682 return __hip_fast_dsqrt(x);
// Double-precision fused multiply-add intrinsics, one per rounding-mode
// suffix. Every variant returns __hip_fast_fma(x, y, z); the suffix does not
// change the callee in the lines shown.
685 __device__
inline double __fma_rd(
double x,
double y,
double z) {
686 return __hip_fast_fma(x, y, z);
689 __device__
inline double __fma_rn(
double x,
double y,
double z) {
690 return __hip_fast_fma(x, y, z);
693 __device__
inline double __fma_ru(
double x,
double y,
double z) {
694 return __hip_fast_fma(x, y, z);
697 __device__
inline double __fma_rz(
double x,
double y,
double z) {
698 return __hip_fast_fma(x, y, z);
// Double-precision reciprocal intrinsics, one per rounding-mode suffix.
// Every variant returns __hip_fast_drcp(x); the suffix does not change the
// callee in the lines shown.
701 __device__
inline double __drcp_rd(
double x) {
702 return __hip_fast_drcp(x);
705 __device__
inline double __drcp_rn(
double x) {
706 return __hip_fast_drcp(x);
709 __device__
inline double __drcp_ru(
double x) {
710 return __hip_fast_drcp(x);
713 __device__
inline double __drcp_rz(
double x) {
714 return __hip_fast_drcp(x);
777 __device__
unsigned __hip_ds_bpermute(
int index,
unsigned src);
778 __device__
float __hip_ds_bpermutef(
int index,
float src);
779 __device__
unsigned __hip_ds_permute(
int index,
unsigned src);
780 __device__
float __hip_ds_permutef(
int index,
float src);
782 __device__
unsigned __hip_ds_swizzle(
unsigned int src,
int pattern);
783 __device__
float __hip_ds_swizzlef(
float src,
int pattern);
785 __device__
int __hip_move_dpp(
int src,
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl);
// Kernel coordinate macros. Each HIP name expands to a call of the matching
// hc_get_* function, with the argument 0, 1 or 2 selecting x, y or z.

// hipThreadIdx_* -> hc_get_workitem_id(dim)
793 #define hipThreadIdx_x (hc_get_workitem_id(0))
794 #define hipThreadIdx_y (hc_get_workitem_id(1))
795 #define hipThreadIdx_z (hc_get_workitem_id(2))
// hipBlockIdx_* -> hc_get_group_id(dim)
797 #define hipBlockIdx_x (hc_get_group_id(0))
798 #define hipBlockIdx_y (hc_get_group_id(1))
799 #define hipBlockIdx_z (hc_get_group_id(2))
// hipBlockDim_* -> hc_get_group_size(dim)
801 #define hipBlockDim_x (hc_get_group_size(0))
802 #define hipBlockDim_y (hc_get_group_size(1))
803 #define hipBlockDim_z (hc_get_group_size(2))
// hipGridDim_* -> hc_get_num_groups(dim)
805 #define hipGridDim_x (hc_get_num_groups(0))
806 #define hipGridDim_y (hc_get_num_groups(1))
807 #define hipGridDim_z (hc_get_num_groups(2))
// Device-side allocation entry points, declared with C linkage.
// NOTE(review): __hip_hc_free is declared as returning void*, unlike the
// standard C free(), which returns void. What the returned pointer means is
// not visible here -- confirm against the definition.
812 extern "C" __device__
void* __hip_hc_malloc(
size_t);
813 extern "C" __device__
void* __hip_hc_free(
void *ptr);
// __syncthreads() expands to hc_barrier(CLK_LOCAL_MEM_FENCE).
822 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
// HIP_KERNEL_NAME(...) expands to its arguments unchanged.
824 #define HIP_KERNEL_NAME(...) __VA_ARGS__
// HIP_SYMBOL(X) stringizes X, i.e. yields the token text as a string literal.
825 #define HIP_SYMBOL(X) #X
// Launch hooks used by the hipLaunchKernel macro.
//
// ihipPreLaunchKernel takes the requested stream, the grid and block extents,
// a pointer to the grid_launch_parm to use, and the kernel name as a string;
// it returns a hipStream_t. The overloads below differ only in whether `grid`
// and `block` are passed as dim3 or size_t.
// NOTE(review): a (dim3 grid, dim3 block) overload is not visible in this
// listing (the embedded numbering starts at 829) -- presumably it was on a
// dropped line; confirm against the real header.
829 extern hipStream_t ihipPreLaunchKernel(
hipStream_t stream,
dim3 grid,
size_t block, grid_launch_parm *lp,
const char *kernelNameStr);
830 extern hipStream_t ihipPreLaunchKernel(
hipStream_t stream,
size_t grid,
dim3 block, grid_launch_parm *lp,
const char *kernelNameStr);
831 extern hipStream_t ihipPreLaunchKernel(
hipStream_t stream,
size_t grid,
size_t block, grid_launch_parm *lp,
const char *kernelNameStr);
// ihipPostLaunchKernel receives the kernel name, a stream and the launch
// parameters by reference, and returns nothing.
832 extern void ihipPostLaunchKernel(
const char *kernelName,
hipStream_t stream, grid_launch_parm &lp);
// hipLaunchKernel(kernel, grid, block, groupMemBytes, stream, args...)
//
// Expansion, in order:
//   1. declares a grid_launch_parm named `lp` and stores _groupMemBytes in
//      lp.dynamic_group_mem_bytes;
//   2. calls ihipPreLaunchKernel(_stream, grid, block, &lp, "<kernel name>")
//      and keeps the returned stream in `trueStream`;
//   3. invokes the kernel as an ordinary call with `lp` as the first argument,
//      followed by the variadic arguments (`##__VA_ARGS__` drops the comma
//      when there are none);
//   4. calls ihipPostLaunchKernel("<kernel name>", trueStream, lp).
// `#_kernelName` stringizes the kernel token for the two hook calls.
//
// The expansion introduces locals named `lp` and `trueStream`, so kernel
// arguments that refer to caller variables with those names would bind to the
// macro's locals instead.
// NOTE(review): the embedded numbering skips 837 and 843-845, so any enclosing
// scope (e.g. do { ... } while (0)) is not visible in this listing.
836 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
838 grid_launch_parm lp;\
839 lp.dynamic_group_mem_bytes = _groupMemBytes; \
840 hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
841 _kernelName (lp, ##__VA_ARGS__);\
842 ihipPostLaunchKernel(#_kernelName, trueStream, lp);\
846 #elif defined (__HCC_C__)
// HIP_DYNAMIC_SHARED(type, var) declares `var` as a pointer to `type` in
// address space 3 and initialises it with the value returned by
// __get_dynamicgroupbaseptr(), cast to that pointer type. The expansion
// already ends in ';'.
// HIP_DYNAMIC_SHARED_ATTRIBUTE expands to the bare address_space(3) attribute.
// NOTE(review): the last line of HIP_DYNAMIC_SHARED ends with a
// line-continuation backslash, so whatever physical line follows is spliced
// into the macro. In this listing that is the next #define -- confirm that a
// blank line separates them in the real header.
858 #define HIP_DYNAMIC_SHARED(type, var) \
859 __attribute__((address_space(3))) type* var = \
860 (__attribute__((address_space(3))) type*)__get_dynamicgroupbaseptr(); \
862 #define HIP_DYNAMIC_SHARED_ATTRIBUTE __attribute__((address_space(3)))
__device__ void __threadfence(void)
threadfence makes writes visible to other threads running on the same GPU.
Definition: hip_runtime_api.h:151
#define __host__
Definition: host_defines.h:35
__device__ void __threadfence_system(void)
threadfence_system makes writes to pinned system memory visible on host CPU.
Definition: device_util.cpp:2600
Definition: hip_vector_types.h:140
__device__ void __threadfence_block(void)
threadfence_block makes writes visible to threads running in the same block.
Definition: hip_hcc.h:463