41 #define CUDA_SUCCESS hipSuccess
43 #include <hip/hip_runtime_api.h>
48 #include <grid_launch.h>
49 extern int HIP_TRACE_API;
53 #define hipLaunchParm grid_launch_parm
55 #include <hip/hcc_detail/hip_texture.h>
56 #include <hip/hcc_detail/hip_ldg.h>
58 #include <hip/hcc_detail/host_defines.h>
60 #if defined (__KALMAR_ACCELERATOR__) && !defined (__HCC_ACCELERATOR__)
61 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
65 #if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
// Architecture feature flags. Every macro below expands to the literal (1) or (0).
// NOTE(review): presumably (1) means "feature available on this target" and (0)
// means "not available" - confirm against the code that tests these macros.

// 32-bit integer atomics and float atomic exchange/add.
70 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
71 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
72 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
73 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
74 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
// 64-bit integer atomics.
77 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
78 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
// Double-precision support.
81 #define __HIP_ARCH_HAS_DOUBLES__ (1)
// Warp-level operations.
84 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
85 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
86 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
87 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
// Synchronisation-related features.
90 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
91 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
// Miscellaneous features.
94 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
95 #define __HIP_ARCH_HAS_3DGRID__ (1)
96 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
// __launch_bounds__ is defined with an empty replacement list, so any
// __launch_bounds__(...) annotation on a kernel is accepted and then discarded.
102 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
105 #if defined(__cplusplus)
107 #elif defined(__STDC_VERSION__)
111 __device__
float acosf(
float x);
112 __device__
float acoshf(
float x);
113 __device__
float asinf(
float x);
114 __device__
float asinhf(
float x);
115 __device__
float atan2f(
float y,
float x);
116 __device__
float atanf(
float x);
117 __device__
float atanhf(
float x);
118 __device__
float cbrtf(
float x);
119 __device__
float ceilf(
float x);
120 __device__
float copysignf(
float x,
float y);
121 __device__
float cosf(
float x);
122 __device__
float coshf(
float x);
123 __device__
float cyl_bessel_i0f(
float x);
124 __device__
float cyl_bessel_i1f(
float x);
125 __device__
float erfcf(
float x);
126 __device__
float erfcinvf(
float y);
127 __device__
float erfcxf(
float x);
128 __device__
float erff(
float x);
129 __device__
float erfinvf(
float y);
130 __device__
float exp10f(
float x);
131 __device__
float exp2f(
float x);
132 __device__
float expf(
float x);
133 __device__
float expm1f(
float x);
134 __device__
float fabsf(
float x);
135 __device__
float fdimf(
float x,
float y);
136 __device__
float fdividef(
float x,
float y);
137 __device__
float floorf(
float x);
138 __device__
float fmaf(
float x,
float y,
float z);
139 __device__
float fmaxf(
float x,
float y);
140 __device__
float fminf(
float x,
float y);
141 __device__
float fmodf(
float x,
float y);
142 __device__
float frexpf(
float x,
float y);
143 __device__
float hypotf(
float x,
float y);
144 __device__
float ilogbf(
float x);
145 __host__ __device__
unsigned isfinite(
float a);
146 __device__
unsigned isinf(
float a);
147 __device__
unsigned isnan(
float a);
148 __device__
float j0f(
float x);
149 __device__
float j1f(
float x);
150 __device__
float jnf(
int n,
float x);
151 __device__
float ldexpf(
float x,
int exp);
152 __device__
float lgammaf(
float x);
153 __device__
long long int llrintf(
float x);
154 __device__
long long int llroundf(
float x);
155 __device__
float log10f(
float x);
156 __device__
float log1pf(
float x);
157 __device__
float log2f(
float x);
158 __device__
float logbf(
float x);
159 __device__
float logf(
float x);
160 __device__
long int lrintf(
float x);
161 __device__
long int lroundf(
float x);
162 __device__
float modff(
float x,
float *iptr);
163 __device__
float nanf(
const char* tagp);
164 __device__
float nearbyintf(
float x);
165 __device__
float nextafterf(
float x,
float y);
166 __device__
float norm3df(
float a,
float b,
float c);
167 __device__
float norm4df(
float a,
float b,
float c,
float d);
168 __device__
float normcdff(
float y);
169 __device__
float normcdfinvf(
float y);
170 __device__
float normf(
int dim,
const float *a);
171 __device__
float powf(
float x,
float y);
172 __device__
float rcbtrf(
float x);
173 __device__
float remainderf(
float x,
float y);
174 __device__
float remquof(
float x,
float y,
int *quo);
175 __device__
float rhypotf(
float x,
float y);
176 __device__
float rintf(
float x);
177 __device__
float rnorm3df(
float a,
float b,
float c);
178 __device__
float rnorm4df(
float a,
float b,
float c,
float d);
179 __device__
float rnormf(
int dim,
const float* a);
180 __device__
float roundf(
float x);
181 __device__
float rsqrtf(
float x);
182 __device__
float scalblnf(
float x,
long int n);
183 __device__
float scalbnf(
float x,
int n);
184 __host__ __device__
unsigned signbit(
float a);
185 __device__
void sincosf(
float x,
float *sptr,
float *cptr);
186 __device__
void sincospif(
float x,
float *sptr,
float *cptr);
187 __device__
float sinf(
float x);
188 __device__
float sinhf(
float x);
189 __device__
float sinpif(
float x);
190 __device__
float sqrtf(
float x);
191 __device__
float tanf(
float x);
192 __device__
float tanhf(
float x);
193 __device__
float tgammaf(
float x);
194 __device__
float truncf(
float x);
195 __device__
float y0f(
float x);
196 __device__
float y1f(
float x);
197 __device__
float ynf(
int n,
float x);
199 __host__ __device__
float cospif(
float x);
200 __host__ __device__
float sinpif(
float x);
201 __device__
float sqrtf(
float x);
202 __host__ __device__
float rsqrtf(
float x);
204 __device__
double acos(
double x);
205 __device__
double acosh(
double x);
206 __device__
double asin(
double x);
207 __device__
double asinh(
double x);
208 __device__
double atan(
double x);
209 __device__
double atan2(
double y,
double x);
210 __device__
double atanh(
double x);
211 __device__
double cbrt(
double x);
212 __device__
double ceil(
double x);
213 __device__
double copysign(
double x,
double y);
214 __device__
double cos(
double x);
215 __device__
double cosh(
double x);
216 __host__ __device__
double cospi(
double x);
217 __device__
double cyl_bessel_i0(
double x);
218 __device__
double cyl_bessel_i1(
double x);
219 __device__
double erf(
double x);
220 __device__
double erfc(
double x);
221 __device__
double erfcinv(
double y);
222 __device__
double erfcx(
double x);
223 __device__
double exp(
double x);
224 __device__
double exp10(
double x);
225 __device__
double exp2(
double x);
226 __device__
double expm1(
double x);
227 __device__
double fabs(
double x);
228 __device__
double fdim(
double x,
double y);
229 __device__
double floor(
double x);
230 __device__
double fma(
double x,
double y,
double z);
231 __device__
double fmax(
double x,
double y);
232 __device__
double fmin(
double x,
double y);
233 __device__
double fmod(
double x,
double y);
234 __device__
double frexp(
double x,
int *nptr);
235 __device__
double hypot(
double x,
double y);
236 __device__
double ilogb(
double x);
237 __host__ __device__
unsigned isfinite(
double x);
238 __device__
unsigned isinf(
double x);
239 __device__
unsigned isnan(
double x);
240 __device__
double j0(
double x);
241 __device__
double j1(
double x);
242 __device__
double jn(
int n,
double x);
243 __device__
double ldexp(
double x,
int exp);
244 __device__
double lgamma(
double x);
245 __device__
long long llrint(
double x);
246 __device__
long llround(
double x);
247 __device__
double log(
double x);
248 __device__
double log10(
double x);
249 __device__
double log1p(
double x);
250 __device__
double log2(
double x);
251 __device__
double logb(
double x);
252 __device__
long int lrint(
double x);
253 __device__
long int lround(
double x);
254 __device__
double modf(
double x,
double *iptr);
255 __device__
double nan(
const char* tagp);
256 __device__
double nearbyint(
double x);
257 __device__
double nextafter(
double x,
double y);
258 __device__
double norm(
int dim,
const double* t);
259 __device__
double norm3d(
double a,
double b,
double c);
260 __device__
double norm4d(
double a,
double b,
double d);
261 __device__
double normcdf(
double y);
262 __device__
double normcdfinv(
double y);
263 __device__
double pow(
double x,
double y);
264 __device__
double rcbrt(
double x);
265 __device__
double remainder(
double x,
double y);
266 __device__
double remquo(
double x,
double y,
int *quo);
267 __device__
double rhypot(
double x,
double y);
268 __device__
double rint(
double x);
269 __device__
double rnorm(
int dim,
const double* t);
270 __device__
double rnorm3d(
double a,
double b,
double c);
271 __device__
double rnorm4d(
double a,
double b,
double c,
double d);
272 __device__
double round(
double x);
273 __host__ __device__
double rsqrt(
double x);
274 __device__
double scalbln(
double x,
long int n);
275 __device__
double scalbn(
double x,
int n);
276 __host__ __device__
unsigned signbit(
double a);
277 __device__
double sin(
double a);
278 __device__
double sincos(
double x,
double *sptr,
double *cptr);
279 __device__
double sincospi(
double x,
double *sptr,
double *cptr);
280 __device__
double sinh(
double x);
281 __host__ __device__
double sinpi(
double x);
282 __device__
double sqrt(
double x);
283 __device__
double tan(
double x);
284 __device__
double tanh(
double x);
285 __device__
double tgamma(
double x);
286 __device__
double trunc(
double x);
287 __device__
double y0(
double x);
288 __device__
double y1(
double y);
289 __device__
double yn(
int n,
double x);
293 extern const int warpSize;
296 #define clock_t long long int
297 __device__
long long int clock64();
298 __device__ clock_t clock();
301 __device__
int atomicAdd(
int* address,
int val);
302 __device__
unsigned int atomicAdd(
unsigned int* address,
305 __device__
unsigned long long int atomicAdd(
unsigned long long int* address,
306 unsigned long long int val);
308 __device__
float atomicAdd(
float* address,
float val);
312 __device__
int atomicSub(
int* address,
int val);
314 __device__
unsigned int atomicSub(
unsigned int* address,
319 __device__
int atomicExch(
int* address,
int val);
321 __device__
unsigned int atomicExch(
unsigned int* address,
324 __device__
unsigned long long int atomicExch(
unsigned long long int* address,
325 unsigned long long int val);
327 __device__
float atomicExch(
float* address,
float val);
331 __device__
int atomicMin(
int* address,
int val);
332 __device__
unsigned int atomicMin(
unsigned int* address,
334 __device__
unsigned long long int atomicMin(
unsigned long long int* address,
335 unsigned long long int val);
339 __device__
int atomicMax(
int* address,
int val);
340 __device__
unsigned int atomicMax(
unsigned int* address,
342 __device__
unsigned long long int atomicMax(
unsigned long long int* address,
343 unsigned long long int val);
347 __device__
int atomicCAS(
int* address,
int compare,
int val);
348 __device__
unsigned int atomicCAS(
unsigned int* address,
349 unsigned int compare,
351 __device__
unsigned long long int atomicCAS(
unsigned long long int* address,
352 unsigned long long int compare,
353 unsigned long long int val);
357 __device__
int atomicAnd(
int* address,
int val);
358 __device__
unsigned int atomicAnd(
unsigned int* address,
360 __device__
unsigned long long int atomicAnd(
unsigned long long int* address,
361 unsigned long long int val);
365 __device__
int atomicOr(
int* address,
int val);
366 __device__
unsigned int atomicOr(
unsigned int* address,
368 __device__
unsigned long long int atomicOr(
unsigned long long int* address,
369 unsigned long long int val);
373 __device__
int atomicXor(
int* address,
int val);
374 __device__
unsigned int atomicXor(
unsigned int* address,
376 __device__
unsigned long long int atomicXor(
unsigned long long int* address,
377 unsigned long long int val);
380 __device__
unsigned int atomicInc(
unsigned int* address,
385 __device__
unsigned int atomicDec(
unsigned int* address,
390 __device__
unsigned int __popc(
unsigned int input);
391 __device__
unsigned int __popcll(
unsigned long long int input);
392 __device__
unsigned int __clz(
unsigned int input);
393 __device__
unsigned int __clzll(
unsigned long long int input);
394 __device__
unsigned int __clz(
int input);
395 __device__
unsigned int __clzll(
long long int input);
396 __device__
unsigned int __ffs(
unsigned int input);
397 __device__
unsigned int __ffsll(
unsigned long long int input);
398 __device__
unsigned int __ffs(
int input);
399 __device__
unsigned int __ffsll(
long long int input);
400 __device__
unsigned int __brev(
unsigned int input);
401 __device__
unsigned long long int __brevll(
unsigned long long int input);
405 __device__
int __all(
int input);
406 __device__
int __any(
int input);
407 __device__
unsigned long long int __ballot(
int input);
411 __device__
int __shfl(
int input,
int lane,
int width=warpSize);
412 __device__
int __shfl_up(
int input,
unsigned int lane_delta,
int width=warpSize);
413 __device__
int __shfl_down(
int input,
unsigned int lane_delta,
int width=warpSize);
414 __device__
int __shfl_xor(
int input,
int lane_mask,
int width=warpSize);
415 __device__
float __shfl(
float input,
int lane,
int width=warpSize);
416 __device__
float __shfl_up(
float input,
unsigned int lane_delta,
int width=warpSize);
417 __device__
float __shfl_down(
float input,
unsigned int lane_delta,
int width=warpSize);
418 __device__
float __shfl_xor(
float input,
int lane_mask,
int width=warpSize);
420 __device__
int __shfl(
int input,
int lane,
int width);
421 __device__
int __shfl_up(
int input,
unsigned int lane_delta,
int width);
422 __device__
int __shfl_down(
int input,
unsigned int lane_delta,
int width);
423 __device__
int __shfl_xor(
int input,
int lane_mask,
int width);
424 __device__
float __shfl(
float input,
int lane,
int width);
425 __device__
float __shfl_up(
float input,
unsigned int lane_delta,
int width);
426 __device__
float __shfl_down(
float input,
unsigned int lane_delta,
int width);
427 __device__
float __shfl_xor(
float input,
int lane_mask,
int width);
430 __host__ __device__
int min(
int arg1,
int arg2);
431 __host__ __device__
int max(
int arg1,
int arg2);
434 __device__
float __cosf(
float x);
435 __device__
float __expf(
float x);
436 __device__
float __frsqrt_rn(
float x);
437 __device__
float __fsqrt_rd(
float x);
438 __device__
float __fsqrt_rn(
float x);
439 __device__
float __fsqrt_ru(
float x);
440 __device__
float __fsqrt_rz(
float x);
441 __device__
float __log10f(
float x);
442 __device__
float __log2f(
float x);
443 __device__
float __logf(
float x);
444 __device__
float __powf(
float base,
float exponent);
445 __device__
void __sincosf(
float x,
float *s,
float *c) ;
446 __device__
float __sinf(
float x);
447 __device__
float __tanf(
float x);
448 __device__
float __dsqrt_rd(
double x);
449 __device__
float __dsqrt_rn(
double x);
450 __device__
float __dsqrt_ru(
double x);
451 __device__
float __dsqrt_rz(
double x);
// Thread / block coordinate accessors.
//
// Each hip{ThreadIdx,BlockIdx,BlockDim,GridDim}_{x,y,z} macro forwards to the
// matching amp_get_local_id / hc_get_group_id / amp_get_local_size /
// hc_get_num_groups query. Which dimension index corresponds to "x" depends
// on the compiler build:
//   * __hcc_workweek__ >= 16123 : x -> 0, y -> 1, z -> 2
//   * otherwise                 : x -> 2, y -> 1, z -> 0 (reversed)
//
// The two mappings must sit in separate preprocessor branches. Without the
// #else every macro would be defined twice with conflicting bodies.
#if __hcc_workweek__ >= 16123

#define hipThreadIdx_x (amp_get_local_id(0))
#define hipThreadIdx_y (amp_get_local_id(1))
#define hipThreadIdx_z (amp_get_local_id(2))

#define hipBlockIdx_x  (hc_get_group_id(0))
#define hipBlockIdx_y  (hc_get_group_id(1))
#define hipBlockIdx_z  (hc_get_group_id(2))

#define hipBlockDim_x  (amp_get_local_size(0))
#define hipBlockDim_y  (amp_get_local_size(1))
#define hipBlockDim_z  (amp_get_local_size(2))

#define hipGridDim_x   (hc_get_num_groups(0))
#define hipGridDim_y   (hc_get_num_groups(1))
#define hipGridDim_z   (hc_get_num_groups(2))

#else

#define hipThreadIdx_x (amp_get_local_id(2))
#define hipThreadIdx_y (amp_get_local_id(1))
#define hipThreadIdx_z (amp_get_local_id(0))

#define hipBlockIdx_x  (hc_get_group_id(2))
#define hipBlockIdx_y  (hc_get_group_id(1))
#define hipBlockIdx_z  (hc_get_group_id(0))

#define hipBlockDim_x  (amp_get_local_size(2))
#define hipBlockDim_y  (amp_get_local_size(1))
#define hipBlockDim_z  (amp_get_local_size(0))

#define hipGridDim_x   (hc_get_num_groups(2))
#define hipGridDim_y   (hc_get_num_groups(1))
#define hipGridDim_z   (hc_get_num_groups(0))

#endif // __hcc_workweek__ check
// Maps the CUDA-style __syncthreads() spelling onto hc_barrier(CLK_LOCAL_MEM_FENCE).
495 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
// Expands to its arguments unchanged.
// NOTE(review): presumably exists so kernel names containing commas (template
// argument lists) can be passed as one macro argument - confirm.
497 #define HIP_KERNEL_NAME(...) __VA_ARGS__
// Hooks invoked by the hipLaunchKernel macro before and after the kernel call.
// The pre-launch hook takes the launch parameters by pointer and returns the
// stream that is then handed to the post-launch hook, which takes the launch
// parameters by reference.
500 hipStream_t ihipPreLaunchKernel(hipStream_t stream, grid_launch_parm *lp);
501 void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
// ANSI terminal escape sequences: KNRM resets attributes, KGRN selects green.
// hipLaunchKernel uses them to colour its trace output.
504 #define KNRM "\x1B[0m"
505 #define KGRN "\x1B[32m"
/*
 * hipLaunchKernel(kernel, numBlocks3D, blockDim3D, groupMemBytes, stream, args...)
 *
 * Statement-like macro that launches `kernel`:
 *   1. fills a local grid_launch_parm `lp` from the grid / block dimensions
 *      and the dynamic group-memory size,
 *   2. calls ihipPreLaunchKernel() to obtain the stream actually used,
 *   3. if HIP_TRACE_API is non-zero, prints one trace line to stderr,
 *   4. calls kernel(lp, args...),
 *   5. calls ihipPostLaunchKernel().
 *
 * The body is wrapped in do { ... } while(0) so the macro behaves as a single
 * statement (safe after an unbraced `if`) and `lp` / `trueStream` stay scoped
 * to the expansion. Macro arguments are parenthesised before member access.
 *
 * When DISABLE_GRID_LAUNCH is defined, the alternate definition is used: it
 * converts the grid size from blocks to threads (numBlocks * blockDim) and
 * emits a #warning at compile time.
 *
 * NOTE(review): in the DISABLE_GRID_LAUNCH variant the calls pass `&lp.av` and
 * `cf`, which do not match the ihipPreLaunchKernel / ihipPostLaunchKernel
 * prototypes declared just above this macro. A statement may also be missing
 * between the declaration of `cf` and the pre-launch call. Verify that branch
 * against the runtime sources before enabling it.
 *
 * Caveat: the expansion declares a local named `lp`; kernel arguments that
 * themselves refer to a caller variable called `lp` will be shadowed.
 */
#if !defined(DISABLE_GRID_LAUNCH)
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
do {\
  grid_launch_parm lp;\
  lp.gridDim.x = (_numBlocks3D).x; \
  lp.gridDim.y = (_numBlocks3D).y; \
  lp.gridDim.z = (_numBlocks3D).z; \
  lp.groupDim.x = (_blockDim3D).x; \
  lp.groupDim.y = (_blockDim3D).y; \
  lp.groupDim.z = (_blockDim3D).z; \
  lp.groupMemBytes = (_groupMemBytes); \
  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp)); \
  if (HIP_TRACE_API) {\
    fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
            #_kernelName, lp.gridDim.x, lp.gridDim.y, lp.gridDim.z, lp.groupDim.x, lp.groupDim.y, lp.groupDim.z, lp.groupMemBytes, (void*)(_stream));\
  }\
  _kernelName (lp, __VA_ARGS__);\
  ihipPostLaunchKernel(trueStream, lp);\
} while(0)

#else
#warning(DISABLE_GRID_LAUNCH set)

#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
do {\
  grid_launch_parm lp;\
  lp.gridDim.x = (_numBlocks3D).x * (_blockDim3D).x; \
  lp.gridDim.y = (_numBlocks3D).y * (_blockDim3D).y; \
  lp.gridDim.z = (_numBlocks3D).z * (_blockDim3D).z; \
  lp.groupDim.x = (_blockDim3D).x; \
  lp.groupDim.y = (_blockDim3D).y; \
  lp.groupDim.z = (_blockDim3D).z; \
  lp.groupMemBytes = (_groupMemBytes);\
  hc::completion_future cf;\
  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
  if (HIP_TRACE_API) {\
    fprintf(stderr, "==hip-api: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
            #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
  }\
  _kernelName (lp, __VA_ARGS__);\
  ihipPostLaunchKernel(trueStream, cf);\
} while(0)

#endif
553 #elif defined (__HCC_C__)
#define __host__
Definition: host_defines.h:35