39 #define CUDA_SUCCESS hipSuccess
47 #include <grid_launch.h>
51 #define hipLaunchParm grid_launch_parm
57 #if defined (__KALMAR_ACCELERATOR__) && not defined (__HCC_ACCELERATOR__)
58 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
62 #if defined(__HCC_ACCELERATOR__) and (__HCC_ACCELERATOR__ != 0)
67 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
68 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
69 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
70 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
71 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
74 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
75 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
78 #define __HIP_ARCH_HAS_DOUBLES__ (1)
81 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
82 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
83 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
84 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
87 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
88 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
91 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
92 #define __HIP_ARCH_HAS_3DGRID__ (1)
93 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
102 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
105 #if defined(__cplusplus)
107 #elif defined(__STDC_VERSION__)
114 const int warpSize = 64;
117 #define clock_t long long int
118 __device__
inline long long int clock64() {
return (
long long int)hc::__clock_u64(); };
119 __device__
inline clock_t clock() {
return (clock_t)hc::__clock_u64(); };
122 __device__
inline int atomicAdd(
int* address,
int val)
124 return hc::atomic_fetch_add(address,val);
126 __device__
inline unsigned int atomicAdd(
unsigned int* address,
129 return hc::atomic_fetch_add(address,val);
131 __device__
inline unsigned long long int atomicAdd(
unsigned long long int* address,
132 unsigned long long int val)
134 return (
long long int)hc::atomic_fetch_add((uint64_t*)address,(uint64_t)val);
136 __device__
inline float atomicAdd(
float* address,
float val)
138 return hc::atomic_fetch_add(address,val);
142 __device__
inline int atomicSub(
int* address,
int val)
144 return hc::atomic_fetch_sub(address,val);
146 __device__
inline unsigned int atomicSub(
unsigned int* address,
149 return hc::atomic_fetch_sub(address,val);
153 __device__
inline int atomicExch(
int* address,
int val)
155 return hc::atomic_exchange(address,val);
157 __device__
inline unsigned int atomicExch(
unsigned int* address,
160 return hc::atomic_exchange(address,val);
162 __device__
inline unsigned long long int atomicExch(
unsigned long long int* address,
163 unsigned long long int val)
165 return (
long long int)hc::atomic_exchange((uint64_t*)address,(uint64_t)val);
167 __device__
inline float atomicExch(
float* address,
float val)
169 return hc::atomic_exchange(address,val);
173 __device__
inline int atomicMin(
int* address,
int val)
175 return hc::atomic_fetch_min(address,val);
177 __device__
inline unsigned int atomicMin(
unsigned int* address,
180 return hc::atomic_fetch_min(address,val);
182 __device__
inline unsigned long long int atomicMin(
unsigned long long int* address,
183 unsigned long long int val)
185 return (
long long int)hc::atomic_fetch_min((uint64_t*)address,(uint64_t)val);
189 __device__
inline int atomicMax(
int* address,
int val)
191 return hc::atomic_fetch_max(address,val);
193 __device__
inline unsigned int atomicMax(
unsigned int* address,
196 return hc::atomic_fetch_max(address,val);
198 __device__
inline unsigned long long int atomicMax(
unsigned long long int* address,
199 unsigned long long int val)
201 return (
long long int)hc::atomic_fetch_max((uint64_t*)address,(uint64_t)val);
205 __device__
inline int atomicCAS(
int* address,
int compare,
int val)
207 hc::atomic_compare_exchange(address,&compare,val);
210 __device__
inline unsigned int atomicCAS(
unsigned int* address,
211 unsigned int compare,
214 hc::atomic_compare_exchange(address,&compare,val);
217 __device__
inline unsigned long long int atomicCAS(
unsigned long long int* address,
218 unsigned long long int compare,
219 unsigned long long int val)
221 hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
226 __device__
inline int atomicAnd(
int* address,
int val)
228 return hc::atomic_fetch_and(address,val);
230 __device__
inline unsigned int atomicAnd(
unsigned int* address,
233 return hc::atomic_fetch_and(address,val);
235 __device__
inline unsigned long long int atomicAnd(
unsigned long long int* address,
236 unsigned long long int val)
238 return (
long long int)hc::atomic_fetch_and((uint64_t*)address,(uint64_t)val);
242 __device__
inline int atomicOr(
int* address,
int val)
244 return hc::atomic_fetch_or(address,val);
246 __device__
inline unsigned int atomicOr(
unsigned int* address,
249 return hc::atomic_fetch_or(address,val);
251 __device__
inline unsigned long long int atomicOr(
unsigned long long int* address,
252 unsigned long long int val)
254 return (
long long int)hc::atomic_fetch_or((uint64_t*)address,(uint64_t)val);
258 __device__
inline int atomicXor(
int* address,
int val)
260 return hc::atomic_fetch_xor(address,val);
262 __device__
inline unsigned int atomicXor(
unsigned int* address,
265 return hc::atomic_fetch_xor(address,val);
267 __device__
inline unsigned long long int atomicXor(
unsigned long long int* address,
268 unsigned long long int val)
270 return (
long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val);
275 __device__
inline unsigned int __popc(
unsigned int input)
277 return hc::__popcount_u32_b32( input);
280 __device__
inline unsigned int __popcll(
unsigned long long int input)
282 return hc::__popcount_u32_b64(input);
285 __device__
inline unsigned int __clz(
unsigned int input)
287 return hc::__firstbit_u32_u32( input);
290 __device__
inline unsigned int __clzll(
unsigned long long int input)
292 return hc::__firstbit_u32_u64( input);
295 __device__
inline unsigned int __clz(
int input)
297 return hc::__firstbit_u32_s32( input);
300 __device__
inline unsigned int __clzll(
long long int input)
302 return hc::__firstbit_u32_s64( input);
305 __device__
inline unsigned int __ffs(
unsigned int input)
307 return hc::__lastbit_u32_u32( input)+1;
310 __device__
inline unsigned int __ffsll(
unsigned long long int input)
312 return hc::__lastbit_u32_u64( input)+1;
315 __device__
inline unsigned int __ffs(
int input)
317 return hc::__lastbit_u32_s32( input)+1;
320 __device__
inline unsigned int __ffsll(
long long int input)
322 return hc::__lastbit_u32_s64( input)+1;
325 __device__
inline unsigned int __brev(
unsigned int input)
327 return hc::__bitrev_b32( input);
330 __device__
inline unsigned long long int __brevll(
unsigned long long int input)
332 return hc::__bitrev_b64( input);
336 __device__
inline int __all(
int input)
338 return hc::__all( input);
342 __device__
inline int __any(
int input)
344 if( hc::__any( input)!=0)
return 1;
348 __device__
inline unsigned long long int __ballot(
int input)
350 return hc::__ballot( input);
354 __device__
inline int __shfl(
int input,
int lane,
int width=warpSize)
356 return hc::__shfl(input,lane,width);
359 __device__
inline int __shfl_up(
int input,
unsigned int lane_delta,
int width=warpSize)
361 return hc::__shfl_up(input,lane_delta,width);
364 __device__
inline int __shfl_down(
int input,
unsigned int lane_delta,
int width=warpSize)
366 return hc::__shfl_down(input,lane_delta,width);
369 __device__
inline int __shfl_xor(
int input,
int lane_mask,
int width=warpSize)
371 return hc::__shfl_xor(input,lane_mask,width);
374 __device__
inline float __shfl(
float input,
int lane,
int width=warpSize)
376 return hc::__shfl(input,lane,width);
379 __device__
inline float __shfl_up(
float input,
unsigned int lane_delta,
int width=warpSize)
381 return hc::__shfl_up(input,lane_delta,width);
384 __device__
inline float __shfl_down(
float input,
unsigned int lane_delta,
int width=warpSize)
386 return hc::__shfl_down(input,lane_delta,width);
389 __device__
inline float __shfl_xor(
float input,
int lane_mask,
int width=warpSize)
391 return hc::__shfl_xor(input,lane_mask,width);
395 #include <hc_math.hpp>
397 #ifdef __HCC_ACCELERATOR__
398 using namespace hc::precise_math;
402 inline int min(
int arg1,
int arg2) __attribute((hc,cpu)) { \
403 return (
int)(hc::precise_math::fmin((
float)arg1, (
float)arg2));}
404 inline int max(
int arg1,
int arg2) __attribute((hc,cpu)) { \
405 return (
int)(hc::precise_math::fmax((
float)arg1, (
float)arg2));}
409 __device__
inline float __cosf(
float x) {
return hc::fast_math::cosf(x); };
410 __device__
inline float __expf(
float x) {
return hc::fast_math::expf(x); };
411 __device__
inline float __frsqrt_rn(
float x) {
return hc::fast_math::rsqrt(x); };
412 __device__
inline float __fsqrt_rd(
float x) {
return hc::fast_math::sqrt(x); };
413 __device__
inline float __fsqrt_rn(
float x) {
return hc::fast_math::sqrt(x); };
414 __device__
inline float __fsqrt_ru(
float x) {
return hc::fast_math::sqrt(x); };
415 __device__
inline float __fsqrt_rz(
float x) {
return hc::fast_math::sqrt(x); };
416 __device__
inline float __log10f(
float x) {
return hc::fast_math::log10f(x); };
417 __device__
inline float __log2f(
float x) {
return hc::fast_math::log2f(x); };
418 __device__
inline float __logf(
float x) {
return hc::fast_math::logf(x); };
419 __device__
inline float __powf(
float base,
float exponent) {
return hc::fast_math::powf(base, exponent); };
420 __device__
inline void __sincosf(
float x,
float *s,
float *c) {
return hc::fast_math::sincosf(x, s, c); };
421 __device__
inline float __sinf(
float x) {
return hc::fast_math::sinf(x); };
422 __device__
inline float __tanf(
float x) {
return hc::fast_math::tanf(x); };
423 __device__
inline float __dsqrt_rd(
double x) {
return hc::fast_math::sqrt(x); };
424 __device__
inline float __dsqrt_rn(
double x) {
return hc::fast_math::sqrt(x); };
425 __device__
inline float __dsqrt_ru(
double x) {
return hc::fast_math::sqrt(x); };
426 __device__
inline float __dsqrt_rz(
double x) {
return hc::fast_math::sqrt(x); };
431 #define hipThreadIdx_x (amp_get_local_id(2))
432 #define hipThreadIdx_y (amp_get_local_id(1))
433 #define hipThreadIdx_z (amp_get_local_id(0))
435 #define hipBlockIdx_x (hc_get_group_id(2))
436 #define hipBlockIdx_y (hc_get_group_id(1))
437 #define hipBlockIdx_z (hc_get_group_id(0))
439 #define hipBlockDim_x (amp_get_local_size(2))
440 #define hipBlockDim_y (amp_get_local_size(1))
441 #define hipBlockDim_z (amp_get_local_size(0))
443 #define hipGridDim_x (hc_get_num_groups(2))
444 #define hipGridDim_y (hc_get_num_groups(1))
445 #define hipGridDim_z (hc_get_num_groups(0))
450 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
454 #define KALMAR_PFE_BEGIN() \
455 hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
456 auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
457 __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
459 hc::completion_future cf = hc::parallel_for_each (\
462 [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]]
466 #define KALMAR_PFE_END \
468 if (HIP_LAUNCH_BLOCKING) {\
469 if (HIP_TRACE_API) {\
470 fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
473 if (HIP_TRACE_API) {\
474 fprintf(stderr, "hiptrace1: ...completed.\n");\
481 #define HIP_KERNEL_NAME(...) __VA_ARGS__
485 hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av);
486 void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &cf);
489 #define KNRM "\x1B[0m"
490 #define KGRN "\x1B[32m"
492 #if not defined(DISABLE_GRID_LAUNCH)
493 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
495 grid_launch_parm lp;\
496 lp.gridDim.x = _numBlocks3D.x; \
497 lp.gridDim.y = _numBlocks3D.y; \
498 lp.gridDim.z = _numBlocks3D.z; \
499 lp.groupDim.x = _blockDim3D.x; \
500 lp.groupDim.y = _blockDim3D.y; \
501 lp.groupDim.z = _blockDim3D.z; \
502 lp.groupMemBytes = _groupMemBytes;\
503 hc::completion_future cf;\
505 hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
506 if (HIP_TRACE_API) {\
507 fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n" KNRM, \
508 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
510 _kernelName (lp, __VA_ARGS__);\
511 ihipPostLaunchKernel(trueStream, cf);\
515 #warning(DISABLE_GRID_LAUNCH set)
517 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
519 grid_launch_parm lp;\
520 lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x; \
521 lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y; \
522 lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z; \
523 lp.groupDim.x = _blockDim3D.x; \
524 lp.groupDim.y = _blockDim3D.y; \
525 lp.groupDim.z = _blockDim3D.z; \
526 lp.groupMemBytes = _groupMemBytes;\
527 hc::completion_future cf;\
529 hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
530 if (HIP_TRACE_API) {\
531 fprintf(stderr, "==hip-api: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
532 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
534 _kernelName (lp, __VA_ARGS__);\
535 ihipPostLaunchKernel(trueStream, cf);\
540 #elif defined (__HCC_C__)
547 #if not defined(DISABLE_GRID_LAUNCH)
558 #define KERNELBEGIN \
559 hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
560 auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
561 __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
563 hc::completion_future cf = \
564 hc::parallel_for_each (\
567 [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]] \
573 if (HIP_LAUNCH_BLOCKING) {\
574 if (HIP_TRACE_API) {\
575 fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
578 if (HIP_TRACE_API) {\
579 fprintf(stderr, "hiptrace1: ...completed.\n");\
int HIP_TRACE_API
Trace HIP APIs.
Definition: hip_hcc.cpp:73
HIP C++ Texture API for hcc compiler.
int HIP_PRINT_ENV
Print all HIP-related environment variables.
Definition: hip_hcc.cpp:72
Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language ...
int HIP_LAUNCH_BLOCKING
Make all HIP APIs host-synchronous.
Definition: hip_hcc.cpp:70