41 #define CUDA_SUCCESS hipSuccess
43 #include <hip/hip_runtime_api.h>
51 #include <grid_launch.h>
52 extern int HIP_TRACE_API;
56 #define hipLaunchParm grid_launch_parm
58 #include <hip/hcc_detail/hip_texture.h>
60 #include <hip/hcc_detail/host_defines.h>
62 #if defined (__KALMAR_ACCELERATOR__) && !defined (__HCC_ACCELERATOR__)
63 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
67 #if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
72 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
73 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
74 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
75 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
76 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
79 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
80 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
83 #define __HIP_ARCH_HAS_DOUBLES__ (1)
86 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
87 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
88 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
89 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
92 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
93 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
96 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
97 #define __HIP_ARCH_HAS_3DGRID__ (1)
98 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
104 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
107 #if defined(__cplusplus)
109 #elif defined(__STDC_VERSION__)
114 __device__
float acosf(
float x);
115 __device__
float acoshf(
float x);
116 __device__
float asinf(
float x);
117 __device__
float asinhf(
float x);
118 __device__
float atan2f(
float y,
float x);
119 __device__
float atanf(
float x);
120 __device__
float atanhf(
float x);
121 __device__
float cbrtf(
float x);
122 __device__
float ceilf(
float x);
123 __device__
float copysignf(
float x,
float y);
124 __device__
float cosf(
float x);
125 __device__
float coshf(
float x);
126 __device__
float cyl_bessel_i0f(
float x);
127 __device__
float cyl_bessel_i1f(
float x);
128 __device__
float erfcf(
float x);
129 __device__
float erfcinvf(
float y);
130 __device__
float erfcxf(
float x);
131 __device__
float erff(
float x);
132 __device__
float erfinvf(
float y);
133 __device__
float exp10f(
float x);
134 __device__
float exp2f(
float x);
135 __device__
float expf(
float x);
136 __device__
float expm1f(
float x);
137 __device__
float fabsf(
float x);
138 __device__
float fdimf(
float x,
float y);
139 __device__
float fdividef(
float x,
float y);
140 __device__
float floorf(
float x);
141 __device__
float fmaf(
float x,
float y,
float z);
142 __device__
float fmaxf(
float x,
float y);
143 __device__
float fminf(
float x,
float y);
144 __device__
float fmodf(
float x,
float y);
145 __device__
float frexpf(
float x,
float y);
146 __device__
float hypotf(
float x,
float y);
147 __device__
float ilogbf(
float x);
148 __device__
unsigned isfinite(
float a);
149 __device__
unsigned isinf(
float a);
150 __device__
unsigned isnan(
float a);
151 __device__
float j0f(
float x);
152 __device__
float j1f(
float x);
153 __device__
float jnf(
int n,
float x);
154 __device__
float ldexpf(
float x,
int exp);
155 __device__
float lgammaf(
float x);
156 __device__
long long int llrintf(
float x);
157 __device__
long long int llroundf(
float x);
158 __device__
float log10f(
float x);
159 __device__
float log1pf(
float x);
160 __device__
float log2f(
float x);
161 __device__
float logbf(
float x);
162 __device__
float logf(
float x);
163 __device__
long int lrintf(
float x);
164 __device__
long int lroundf(
float x);
165 __device__
float modff(
float x,
float *iptr);
166 __device__
float nanf(
const char* tagp);
167 __device__
float nearbyintf(
float x);
168 __device__
float nextafterf(
float x,
float y);
169 __device__
float norm3df(
float a,
float b,
float c);
170 __device__
float norm4df(
float a,
float b,
float c,
float d);
171 __device__
float normcdff(
float y);
172 __device__
float normcdfinvf(
float y);
173 __device__
float normf(
int dim,
const float *a);
174 __device__
float powf(
float x,
float y);
175 __device__
float rcbtrf(
float x);
176 __device__
float remainderf(
float x,
float y);
177 __device__
float remquof(
float x,
float y,
int *quo);
178 __device__
float rhypotf(
float x,
float y);
179 __device__
float rintf(
float x);
180 __device__
float rnorm3df(
float a,
float b,
float c);
181 __device__
float rnorm4df(
float a,
float b,
float c,
float d);
182 __device__
float rnormf(
int dim,
const float* a);
183 __device__
float roundf(
float x);
184 __device__
float rsqrtf(
float x);
185 __device__
float scalblnf(
float x,
long int n);
186 __device__
float scalbnf(
float x,
int n);
187 __device__
unsigned signbit(
float a);
188 __device__
void sincosf(
float x,
float *sptr,
float *cptr);
189 __device__
void sincospif(
float x,
float *sptr,
float *cptr);
190 __device__
float sinf(
float x);
191 __device__
float sinhf(
float x);
192 __device__
float sinpif(
float x);
193 __device__
float sqrtf(
float x);
194 __device__
float tanf(
float x);
195 __device__
float tanhf(
float x);
196 __device__
float tgammaf(
float x);
197 __device__
float truncf(
float x);
198 __device__
float y0f(
float x);
199 __device__
float y1f(
float x);
200 __device__
float ynf(
int n,
float x);
202 __host__ __device__
float cospif(
float x);
203 __host__ __device__
float sinpif(
float x);
204 __device__
float sqrtf(
float x);
205 __host__ __device__
float rsqrtf(
float x);
207 __device__
double acos(
double x);
208 __device__
double acosh(
double x);
209 __device__
double asin(
double x);
210 __device__
double asinh(
double x);
211 __device__
double atan(
double x);
212 __device__
double atan2(
double y,
double x);
213 __device__
double atanh(
double x);
214 __device__
double cbrt(
double x);
215 __device__
double ceil(
double x);
216 __device__
double copysign(
double x,
double y);
217 __device__
double cos(
double x);
218 __device__
double cosh(
double x);
219 __host__ __device__
double cospi(
double x);
220 __device__
double cyl_bessel_i0(
double x);
221 __device__
double cyl_bessel_i1(
double x);
222 __device__
double erf(
double x);
223 __device__
double erfc(
double x);
224 __device__
double erfcinv(
double y);
225 __device__
double erfcx(
double x);
226 __device__
double exp(
double x);
227 __device__
double exp10(
double x);
228 __device__
double exp2(
double x);
229 __device__
double expm1(
double x);
230 __device__
double fabs(
double x);
231 __device__
double fdim(
double x,
double y);
232 __device__
double floor(
double x);
233 __device__
double fma(
double x,
double y,
double z);
234 __device__
double fmax(
double x,
double y);
235 __device__
double fmin(
double x,
double y);
236 __device__
double fmod(
double x,
double y);
237 __device__
double frexp(
double x,
int *nptr);
238 __device__
double hypot(
double x,
double y);
239 __device__
double ilogb(
double x);
240 __device__
unsigned isfinite(
double x);
241 __device__
unsigned isinf(
double x);
242 __device__
unsigned isnan(
double x);
243 __device__
double j0(
double x);
244 __device__
double j1(
double x);
245 __device__
double jn(
int n,
double x);
246 __device__
double ldexp(
double x,
int exp);
247 __device__
double lgamma(
double x);
248 __device__
long long llrint(
double x);
249 __device__
long llround(
double x);
250 __device__
double log(
double x);
251 __device__
double log10(
double x);
252 __device__
double log1p(
double x);
253 __device__
double log2(
double x);
254 __device__
double logb(
double x);
255 __device__
long int lrint(
double x);
256 __device__
long int lround(
double x);
257 __device__
double modf(
double x,
double *iptr);
258 __device__
double nan(
const char* tagp);
259 __device__
double nearbyint(
double x);
260 __device__
double nextafter(
double x,
double y);
261 __device__
double norm(
int dim,
const double* t);
262 __device__
double norm3d(
double a,
double b,
double c);
263 __device__
double norm4d(
double a,
double b,
double d);
264 __device__
double normcdf(
double y);
265 __device__
double normcdfinv(
double y);
266 __device__
double pow(
double x,
double y);
267 __device__
double rcbrt(
double x);
268 __device__
double remainder(
double x,
double y);
269 __device__
double remquo(
double x,
double y,
int *quo);
270 __device__
double rhypot(
double x,
double y);
271 __device__
double rint(
double x);
272 __device__
double rnorm(
int dim,
const double* t);
273 __device__
double rnorm3d(
double a,
double b,
double c);
274 __device__
double rnorm4d(
double a,
double b,
double c,
double d);
275 __device__
double round(
double x);
276 __host__ __device__
double rsqrt(
double x);
277 __device__
double scalbln(
double x,
long int n);
278 __device__
double scalbn(
double x,
int n);
279 __device__
unsigned signbit(
double a);
280 __device__
double sin(
double a);
281 __device__
double sincos(
double x,
double *sptr,
double *cptr);
282 __device__
double sincospi(
double x,
double *sptr,
double *cptr);
283 __device__
double sinh(
double x);
284 __host__ __device__
double sinpi(
double x);
285 __device__
double sqrt(
double x);
286 __device__
double tan(
double x);
287 __device__
double tanh(
double x);
288 __device__
double tgamma(
double x);
289 __device__
double trunc(
double x);
290 __device__
double y0(
double x);
291 __device__
double y1(
double y);
292 __device__
double yn(
int n,
double x);
297 extern const int warpSize;
300 #define clock_t long long int
301 __device__
long long int clock64();
302 __device__ clock_t clock();
305 __device__
int atomicAdd(
int* address,
int val);
306 __device__
unsigned int atomicAdd(
unsigned int* address,
309 __device__
unsigned long long int atomicAdd(
unsigned long long int* address,
310 unsigned long long int val);
312 __device__
float atomicAdd(
float* address,
float val);
316 __device__
int atomicSub(
int* address,
int val);
318 __device__
unsigned int atomicSub(
unsigned int* address,
323 __device__
int atomicExch(
int* address,
int val);
325 __device__
unsigned int atomicExch(
unsigned int* address,
328 __device__
unsigned long long int atomicExch(
unsigned long long int* address,
329 unsigned long long int val);
331 __device__
float atomicExch(
float* address,
float val);
335 __device__
int atomicMin(
int* address,
int val);
336 __device__
unsigned int atomicMin(
unsigned int* address,
338 __device__
unsigned long long int atomicMin(
unsigned long long int* address,
339 unsigned long long int val);
343 __device__
int atomicMax(
int* address,
int val);
344 __device__
unsigned int atomicMax(
unsigned int* address,
346 __device__
unsigned long long int atomicMax(
unsigned long long int* address,
347 unsigned long long int val);
351 __device__
int atomicCAS(
int* address,
int compare,
int val);
352 __device__
unsigned int atomicCAS(
unsigned int* address,
353 unsigned int compare,
355 __device__
unsigned long long int atomicCAS(
unsigned long long int* address,
356 unsigned long long int compare,
357 unsigned long long int val);
361 __device__
int atomicAnd(
int* address,
int val);
362 __device__
unsigned int atomicAnd(
unsigned int* address,
364 __device__
unsigned long long int atomicAnd(
unsigned long long int* address,
365 unsigned long long int val);
369 __device__
int atomicOr(
int* address,
int val);
370 __device__
unsigned int atomicOr(
unsigned int* address,
372 __device__
unsigned long long int atomicOr(
unsigned long long int* address,
373 unsigned long long int val);
377 __device__
int atomicXor(
int* address,
int val);
378 __device__
unsigned int atomicXor(
unsigned int* address,
380 __device__
unsigned long long int atomicXor(
unsigned long long int* address,
381 unsigned long long int val);
385 __device__
unsigned int __popc(
unsigned int input);
386 __device__
unsigned int __popcll(
unsigned long long int input);
387 __device__
unsigned int __clz(
unsigned int input);
388 __device__
unsigned int __clzll(
unsigned long long int input);
389 __device__
unsigned int __clz(
int input);
390 __device__
unsigned int __clzll(
long long int input);
391 __device__
unsigned int __ffs(
unsigned int input);
392 __device__
unsigned int __ffsll(
unsigned long long int input);
393 __device__
unsigned int __ffs(
int input);
394 __device__
unsigned int __ffsll(
long long int input);
395 __device__
unsigned int __brev(
unsigned int input);
396 __device__
unsigned long long int __brevll(
unsigned long long int input);
400 __device__
int __all(
int input);
401 __device__
int __any(
int input);
402 __device__
unsigned long long int __ballot(
int input);
407 __device__
int __shfl(
int input,
int lane,
int width=warpSize);
408 __device__
int __shfl_up(
int input,
unsigned int lane_delta,
int width=warpSize);
409 __device__
int __shfl_down(
int input,
unsigned int lane_delta,
int width=warpSize);
410 __device__
int __shfl_xor(
int input,
int lane_mask,
int width=warpSize);
411 __device__
float __shfl(
float input,
int lane,
int width=warpSize);
412 __device__
float __shfl_up(
float input,
unsigned int lane_delta,
int width=warpSize);
413 __device__
float __shfl_down(
float input,
unsigned int lane_delta,
int width=warpSize);
414 __device__
float __shfl_xor(
float input,
int lane_mask,
int width=warpSize);
416 __device__
int __shfl(
int input,
int lane,
int width);
417 __device__
int __shfl_up(
int input,
unsigned int lane_delta,
int width);
418 __device__
int __shfl_down(
int input,
unsigned int lane_delta,
int width);
419 __device__
int __shfl_xor(
int input,
int lane_mask,
int width);
420 __device__
float __shfl(
float input,
int lane,
int width);
421 __device__
float __shfl_up(
float input,
unsigned int lane_delta,
int width);
422 __device__
float __shfl_down(
float input,
unsigned int lane_delta,
int width);
423 __device__
float __shfl_xor(
float input,
int lane_mask,
int width);
426 __host__ __device__
int min(
int arg1,
int arg2);
427 __host__ __device__
int max(
int arg1,
int arg2);
430 __device__
float __cosf(
float x);
431 __device__
float __expf(
float x);
432 __device__
float __frsqrt_rn(
float x);
433 __device__
float __fsqrt_rd(
float x);
434 __device__
float __fsqrt_rn(
float x);
435 __device__
float __fsqrt_ru(
float x);
436 __device__
float __fsqrt_rz(
float x);
437 __device__
float __log10f(
float x);
438 __device__
float __log2f(
float x);
439 __device__
float __logf(
float x);
440 __device__
float __powf(
float base,
float exponent);
441 __device__
void __sincosf(
float x,
float *s,
float *c) ;
442 __device__
float __sinf(
float x);
443 __device__
float __tanf(
float x);
444 __device__
float __dsqrt_rd(
double x);
445 __device__
float __dsqrt_rn(
double x);
446 __device__
float __dsqrt_ru(
double x);
447 __device__
float __dsqrt_rz(
double x);
453 #if __hcc_workweek__ >= 16123
455 #define hipThreadIdx_x (amp_get_local_id(0))
456 #define hipThreadIdx_y (amp_get_local_id(1))
457 #define hipThreadIdx_z (amp_get_local_id(2))
459 #define hipBlockIdx_x (hc_get_group_id(0))
460 #define hipBlockIdx_y (hc_get_group_id(1))
461 #define hipBlockIdx_z (hc_get_group_id(2))
463 #define hipBlockDim_x (amp_get_local_size(0))
464 #define hipBlockDim_y (amp_get_local_size(1))
465 #define hipBlockDim_z (amp_get_local_size(2))
467 #define hipGridDim_x (hc_get_num_groups(0))
468 #define hipGridDim_y (hc_get_num_groups(1))
469 #define hipGridDim_z (hc_get_num_groups(2))
473 #define hipThreadIdx_x (amp_get_local_id(2))
474 #define hipThreadIdx_y (amp_get_local_id(1))
475 #define hipThreadIdx_z (amp_get_local_id(0))
477 #define hipBlockIdx_x (hc_get_group_id(2))
478 #define hipBlockIdx_y (hc_get_group_id(1))
479 #define hipBlockIdx_z (hc_get_group_id(0))
481 #define hipBlockDim_x (amp_get_local_size(2))
482 #define hipBlockDim_y (amp_get_local_size(1))
483 #define hipBlockDim_z (amp_get_local_size(0))
485 #define hipGridDim_x (hc_get_num_groups(2))
486 #define hipGridDim_y (hc_get_num_groups(1))
487 #define hipGridDim_z (hc_get_num_groups(0))
491 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
493 #define HIP_KERNEL_NAME(...) __VA_ARGS__
496 hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av);
497 void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &cf);
500 #define KNRM "\x1B[0m"
501 #define KGRN "\x1B[32m"
503 #if not defined(DISABLE_GRID_LAUNCH)
504 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
506 grid_launch_parm lp;\
507 lp.gridDim.x = _numBlocks3D.x; \
508 lp.gridDim.y = _numBlocks3D.y; \
509 lp.gridDim.z = _numBlocks3D.z; \
510 lp.groupDim.x = _blockDim3D.x; \
511 lp.groupDim.y = _blockDim3D.y; \
512 lp.groupDim.z = _blockDim3D.z; \
513 lp.groupMemBytes = _groupMemBytes;\
514 hc::completion_future cf;\
516 hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
517 if (HIP_TRACE_API) {\
518 fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n" KNRM, \
519 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
521 _kernelName (lp, __VA_ARGS__);\
522 ihipPostLaunchKernel(trueStream, cf);\
526 #warning(DISABLE_GRID_LAUNCH set)
528 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
530 grid_launch_parm lp;\
531 lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x; \
532 lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y; \
533 lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z; \
534 lp.groupDim.x = _blockDim3D.x; \
535 lp.groupDim.y = _blockDim3D.y; \
536 lp.groupDim.z = _blockDim3D.z; \
537 lp.groupMemBytes = _groupMemBytes;\
538 hc::completion_future cf;\
540 hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
541 if (HIP_TRACE_API) {\
542 fprintf(stderr, "==hip-api: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
543 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
545 _kernelName (lp, __VA_ARGS__);\
546 ihipPostLaunchKernel(trueStream, cf);\
551 #elif defined (__HCC_C__)
#define __host__
Definition: host_defines.h:35