39 #define CUDA_SUCCESS hipSuccess
47 #include <grid_launch.h>
51 #define hipLaunchParm grid_launch_parm
57 #if defined (__KALMAR_ACCELERATOR__) && not defined (__HCC_ACCELERATOR__)
58 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
62 #if defined(__HCC_ACCELERATOR__) and (__HCC_ACCELERATOR__ != 0)
67 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
68 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
69 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
70 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
71 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
74 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
75 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
78 #define __HIP_ARCH_HAS_DOUBLES__ (1)
81 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
82 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
83 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
84 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
87 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
88 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
91 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
92 #define __HIP_ARCH_HAS_3DGRID__ (1)
93 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
102 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
105 #if defined(__cplusplus)
107 #elif defined(__STDC_VERSION__)
111 #define clock_t long long int
112 __device__
inline long long int clock64() {
return (
long long int)hc::__clock_u64(); };
113 __device__
inline clock_t clock() {
return (clock_t)hc::__clock_u64(); };
116 __device__
inline int atomicAdd(
int* address,
int val)
118 return hc::atomic_fetch_add(address,val);
120 __device__
inline unsigned int atomicAdd(
unsigned int* address,
123 return hc::atomic_fetch_add(address,val);
125 __device__
inline unsigned long long int atomicAdd(
unsigned long long int* address,
126 unsigned long long int val)
128 return (
long long int)hc::atomic_fetch_add((uint64_t*)address,(uint64_t)val);
130 __device__
inline float atomicAdd(
float* address,
float val)
132 return hc::atomic_fetch_add(address,val);
136 __device__
inline int atomicSub(
int* address,
int val)
138 return hc::atomic_fetch_sub(address,val);
140 __device__
inline unsigned int atomicSub(
unsigned int* address,
143 return hc::atomic_fetch_sub(address,val);
147 __device__
inline int atomicExch(
int* address,
int val)
149 return hc::atomic_exchange(address,val);
151 __device__
inline unsigned int atomicExch(
unsigned int* address,
154 return hc::atomic_exchange(address,val);
156 __device__
inline unsigned long long int atomicExch(
unsigned long long int* address,
157 unsigned long long int val)
159 return (
long long int)hc::atomic_exchange((uint64_t*)address,(uint64_t)val);
161 __device__
inline float atomicExch(
float* address,
float val)
163 return hc::atomic_exchange(address,val);
167 __device__
inline int atomicMin(
int* address,
int val)
169 return hc::atomic_fetch_min(address,val);
171 __device__
inline unsigned int atomicMin(
unsigned int* address,
174 return hc::atomic_fetch_min(address,val);
176 __device__
inline unsigned long long int atomicMin(
unsigned long long int* address,
177 unsigned long long int val)
179 return (
long long int)hc::atomic_fetch_min((uint64_t*)address,(uint64_t)val);
183 __device__
inline int atomicMax(
int* address,
int val)
185 return hc::atomic_fetch_max(address,val);
187 __device__
inline unsigned int atomicMax(
unsigned int* address,
190 return hc::atomic_fetch_max(address,val);
192 __device__
inline unsigned long long int atomicMax(
unsigned long long int* address,
193 unsigned long long int val)
195 return (
long long int)hc::atomic_fetch_max((uint64_t*)address,(uint64_t)val);
199 __device__
inline int atomicCAS(
int* address,
int compare,
int val)
201 hc::atomic_compare_exchange(address,&compare,val);
204 __device__
inline unsigned int atomicCAS(
unsigned int* address,
205 unsigned int compare,
208 hc::atomic_compare_exchange(address,&compare,val);
211 __device__
inline unsigned long long int atomicCAS(
unsigned long long int* address,
212 unsigned long long int compare,
213 unsigned long long int val)
215 hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
220 __device__
inline int atomicAnd(
int* address,
int val)
222 return hc::atomic_fetch_and(address,val);
224 __device__
inline unsigned int atomicAnd(
unsigned int* address,
227 return hc::atomic_fetch_and(address,val);
229 __device__
inline unsigned long long int atomicAnd(
unsigned long long int* address,
230 unsigned long long int val)
232 return (
long long int)hc::atomic_fetch_and((uint64_t*)address,(uint64_t)val);
236 __device__
inline int atomicOr(
int* address,
int val)
238 return hc::atomic_fetch_or(address,val);
240 __device__
inline unsigned int atomicOr(
unsigned int* address,
243 return hc::atomic_fetch_or(address,val);
245 __device__
inline unsigned long long int atomicOr(
unsigned long long int* address,
246 unsigned long long int val)
248 return (
long long int)hc::atomic_fetch_or((uint64_t*)address,(uint64_t)val);
252 __device__
inline int atomicXor(
int* address,
int val)
254 return hc::atomic_fetch_xor(address,val);
256 __device__
inline unsigned int atomicXor(
unsigned int* address,
259 return hc::atomic_fetch_xor(address,val);
261 __device__
inline unsigned long long int atomicXor(
unsigned long long int* address,
262 unsigned long long int val)
264 return (
long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val);
269 __device__
inline unsigned int __popc(
unsigned int input)
271 return hc::__popcount_u32_b32( input);
274 __device__
inline unsigned int __popcll(
unsigned long long int input)
276 return hc::__popcount_u32_b64(input);
279 __device__
inline unsigned int __clz(
unsigned int input)
281 return hc::__firstbit_u32_u32( input);
284 __device__
inline unsigned int __clzll(
unsigned long long int input)
286 return hc::__firstbit_u32_u64( input);
289 __device__
inline unsigned int __clz(
int input)
291 return hc::__firstbit_u32_s32( input);
294 __device__
inline unsigned int __clzll(
long long int input)
296 return hc::__firstbit_u32_s64( input);
299 __device__
inline unsigned int __ffs(
unsigned int input)
301 return hc::__lastbit_u32_u32( input)+1;
304 __device__
inline unsigned int __ffsll(
unsigned long long int input)
306 return hc::__lastbit_u32_u64( input)+1;
309 __device__
inline unsigned int __ffs(
int input)
311 return hc::__lastbit_u32_s32( input)+1;
314 __device__
inline unsigned int __ffsll(
long long int input)
316 return hc::__lastbit_u32_s64( input)+1;
319 __device__
inline unsigned int __brev(
unsigned int input)
321 return hc::__bitrev_b32( input);
324 __device__
inline unsigned long long int __brevll(
unsigned long long int input)
326 return hc::__bitrev_b64( input);
330 __device__
inline int __all(
int input)
332 return hc::__all( input);
335 __device__
inline int __any(
int input)
337 if( hc::__any( input)!=0)
return 1;
341 __device__
inline unsigned long long int __ballot(
int input)
343 return hc::__ballot( input);
347 __device__
inline int __shfl(
int input,
int lane,
int width)
349 return hc::__shfl(input,lane,width);
352 __device__
inline int __shfl_up(
int input,
unsigned int lane_delta,
int width)
354 return hc::__shfl_up(input,lane_delta,width);
357 __device__
inline int __shfl_down(
int input,
unsigned int lane_delta,
int width)
359 return hc::__shfl_down(input,lane_delta,width);
362 __device__
inline int __shfl_xor(
int input,
int lane_mask,
int width)
364 return hc::__shfl_xor(input,lane_mask,width);
367 __device__
inline float __shfl(
float input,
int lane,
int width)
369 return hc::__shfl(input,lane,width);
372 __device__
inline float __shfl_up(
float input,
unsigned int lane_delta,
int width)
374 return hc::__shfl_up(input,lane_delta,width);
377 __device__
inline float __shfl_down(
float input,
unsigned int lane_delta,
int width)
379 return hc::__shfl_down(input,lane_delta,width);
382 __device__
inline float __shfl_xor(
float input,
int lane_mask,
int width)
384 return hc::__shfl_xor(input,lane_mask,width);
388 #include <hc_math.hpp>
390 #ifdef __HCC_ACCELERATOR__
391 using namespace hc::precise_math;
395 inline int min(
int arg1,
int arg2) __attribute((hc,cpu)) { \
396 return (
int)(hc::precise_math::fmin((
float)arg1, (
float)arg2));}
397 inline int max(
int arg1,
int arg2) __attribute((hc,cpu)) { \
398 return (
int)(hc::precise_math::fmax((
float)arg1, (
float)arg2));}
402 __device__
inline float __cosf(
float x) {
return hc::fast_math::cosf(x); };
403 __device__
inline float __expf(
float x) {
return hc::fast_math::expf(x); };
404 __device__
inline float __frsqrt_rn(
float x) {
return hc::fast_math::rsqrt(x); };
405 __device__
inline float __fsqrt_rd(
float x) {
return hc::fast_math::sqrt(x); };
406 __device__
inline float __fsqrt_rn(
float x) {
return hc::fast_math::sqrt(x); };
407 __device__
inline float __fsqrt_ru(
float x) {
return hc::fast_math::sqrt(x); };
408 __device__
inline float __fsqrt_rz(
float x) {
return hc::fast_math::sqrt(x); };
409 __device__
inline float __log10f(
float x) {
return hc::fast_math::log10f(x); };
410 __device__
inline float __log2f(
float x) {
return hc::fast_math::log2f(x); };
411 __device__
inline float __logf(
float x) {
return hc::fast_math::logf(x); };
412 __device__
inline float __powf(
float base,
float exponent) {
return hc::fast_math::powf(base, exponent); };
413 __device__
inline void __sincosf(
float x,
float *s,
float *c) {
return hc::fast_math::sincosf(x, s, c); };
414 __device__
inline float __sinf(
float x) {
return hc::fast_math::sinf(x); };
415 __device__
inline float __tanf(
float x) {
return hc::fast_math::tanf(x); };
416 __device__
inline float __dsqrt_rd(
double x) {
return hc::fast_math::sqrt(x); };
417 __device__
inline float __dsqrt_rn(
double x) {
return hc::fast_math::sqrt(x); };
418 __device__
inline float __dsqrt_ru(
double x) {
return hc::fast_math::sqrt(x); };
419 __device__
inline float __dsqrt_rz(
double x) {
return hc::fast_math::sqrt(x); };
424 #define hipThreadIdx_x (amp_get_local_id(2))
425 #define hipThreadIdx_y (amp_get_local_id(1))
426 #define hipThreadIdx_z (amp_get_local_id(0))
428 #define hipBlockIdx_x (hc_get_group_id(2))
429 #define hipBlockIdx_y (hc_get_group_id(1))
430 #define hipBlockIdx_z (hc_get_group_id(0))
432 #define hipBlockDim_x (amp_get_local_size(2))
433 #define hipBlockDim_y (amp_get_local_size(1))
434 #define hipBlockDim_z (amp_get_local_size(0))
436 #define hipGridDim_x (hc_get_num_groups(2))
437 #define hipGridDim_y (hc_get_num_groups(1))
438 #define hipGridDim_z (hc_get_num_groups(0))
441 extern int warpSize ;
444 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
448 #define KALMAR_PFE_BEGIN() \
449 hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
450 auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
451 __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
453 hc::completion_future cf = hc::parallel_for_each (\
456 [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]]
460 #define KALMAR_PFE_END \
462 if (HIP_LAUNCH_BLOCKING) {\
463 if (HIP_TRACE_API) {\
464 fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
467 if (HIP_TRACE_API) {\
468 fprintf(stderr, "hiptrace1: ...completed.\n");\
475 #define HIP_KERNEL_NAME(...) __VA_ARGS__
479 hc::accelerator_view *ihipLaunchKernel(
hipStream_t stream);
481 #if not defined(DISABLE_GRID_LAUNCH)
482 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
484 grid_launch_parm lp;\
485 lp.gridDim.x = _numBlocks3D.x; \
486 lp.gridDim.y = _numBlocks3D.y; \
487 lp.gridDim.z = _numBlocks3D.z; \
488 lp.groupDim.x = _blockDim3D.x; \
489 lp.groupDim.y = _blockDim3D.y; \
490 lp.groupDim.z = _blockDim3D.z; \
491 lp.groupMemBytes = _groupMemBytes;\
492 hc::completion_future cf;\
494 lp.av = (ihipLaunchKernel(_stream)); \
495 if (HIP_TRACE_API) {\
496 fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
497 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
499 _kernelName (lp, __VA_ARGS__);\
503 #warning(DISABLE_GRID_LAUNCH set)
505 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
507 grid_launch_parm lp;\
508 lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x; \
509 lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y; \
510 lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z; \
511 lp.groupDim.x = _blockDim3D.x; \
512 lp.groupDim.y = _blockDim3D.y; \
513 lp.groupDim.z = _blockDim3D.z; \
514 lp.groupMemBytes = _groupMemBytes;\
515 hc::completion_future cf;\
517 lp.av = (ihipLaunchKernel(_stream)); \
518 if (HIP_TRACE_API) {\
519 fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
520 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
522 _kernelName (lp, __VA_ARGS__);\
527 #elif defined (__HCC_C__)
534 #if not defined(DISABLE_GRID_LAUNCH)
545 #define KERNELBEGIN \
546 hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
547 auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
548 __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
550 hc::completion_future cf = \
551 hc::parallel_for_each (\
554 [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]] \
560 if (HIP_LAUNCH_BLOCKING) {\
561 if (HIP_TRACE_API) {\
562 fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
565 if (HIP_TRACE_API) {\
566 fprintf(stderr, "hiptrace1: ...completed.\n");\
int HIP_TRACE_API
Trace HIP APIs.
Definition: hip_hcc.cpp:57
Definition: hip_hcc.cpp:82
HIP C++ Texture API for hcc compiler.
int HIP_PRINT_ENV
Print all HIP-related environment variables.
Definition: hip_hcc.cpp:56
Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language ...
int HIP_LAUNCH_BLOCKING
Make all HIP APIs host-synchronous.
Definition: hip_hcc.cpp:58