39 #define CUDA_SUCCESS hipSuccess
41 #include <hip_runtime_api.h>
47 #include <grid_launch.h>
51 #define hipLaunchParm grid_launch_parm
53 #include <hcc_detail/hip_texture.h>
54 #include <hcc_detail/host_defines.h>
57 #if defined (__KALMAR_ACCELERATOR__) && not defined (__HCC_ACCELERATOR__)
58 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
62 #if defined(__HCC_ACCELERATOR__) and (__HCC_ACCELERATOR__ != 0)
// Follows the check that __HCC_ACCELERATOR__ is defined and non-zero:
// mark this pass as a device compile.
66 #define __HIP_DEVICE_COMPILE__ 1
// Architecture feature flags. Each expands to (1) or (0); presumably (1)
// means the feature is usable in device code -- confirm against the HIP docs.
// 32-bit integer / float atomics:
70 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
71 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
72 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
73 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
74 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
// 64-bit integer atomics:
77 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
78 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
// Double-precision support:
81 #define __HIP_ARCH_HAS_DOUBLES__ (1)
// Warp-level cross-lane operations:
84 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
85 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
86 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
87 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
// Synchronization extensions:
90 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
91 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
// Miscellaneous capabilities:
94 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
95 #define __HIP_ARCH_HAS_3DGRID__ (1)
96 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
100 #define __HIP_DEVICE_COMPILE__ 0
109 #define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
112 #if defined(__cplusplus)
114 #elif defined(__STDC_VERSION__)
118 #define clock_t long long int
119 __device__
inline long long int clock64() {
return (
long long int)hc::__clock_u64(); };
120 __device__
inline clock_t clock() {
return (clock_t)hc::__clock_u64(); };
123 __device__
inline int atomicAdd(
int* address,
int val)
125 return hc::atomic_fetch_add(address,val);
127 __device__
inline unsigned int atomicAdd(
unsigned int* address,
130 return hc::atomic_fetch_add(address,val);
132 __device__
inline unsigned long long int atomicAdd(
unsigned long long int* address,
133 unsigned long long int val)
135 return (
long long int)hc::atomic_fetch_add((uint64_t*)address,(uint64_t)val);
137 __device__
inline float atomicAdd(
float* address,
float val)
139 return hc::atomic_fetch_add(address,val);
143 __device__
inline int atomicSub(
int* address,
int val)
145 return hc::atomic_fetch_sub(address,val);
147 __device__
inline unsigned int atomicSub(
unsigned int* address,
150 return hc::atomic_fetch_sub(address,val);
154 __device__
inline int atomicExch(
int* address,
int val)
156 return hc::atomic_exchange(address,val);
158 __device__
inline unsigned int atomicExch(
unsigned int* address,
161 return hc::atomic_exchange(address,val);
163 __device__
inline unsigned long long int atomicExch(
unsigned long long int* address,
164 unsigned long long int val)
166 return (
long long int)hc::atomic_exchange((uint64_t*)address,(uint64_t)val);
168 __device__
inline float atomicExch(
float* address,
float val)
170 return hc::atomic_exchange(address,val);
174 __device__
inline int atomicMin(
int* address,
int val)
176 return hc::atomic_fetch_min(address,val);
178 __device__
inline unsigned int atomicMin(
unsigned int* address,
181 return hc::atomic_fetch_min(address,val);
183 __device__
inline unsigned long long int atomicMin(
unsigned long long int* address,
184 unsigned long long int val)
186 return (
long long int)hc::atomic_fetch_min((uint64_t*)address,(uint64_t)val);
190 __device__
inline int atomicMax(
int* address,
int val)
192 return hc::atomic_fetch_max(address,val);
194 __device__
inline unsigned int atomicMax(
unsigned int* address,
197 return hc::atomic_fetch_max(address,val);
199 __device__
inline unsigned long long int atomicMax(
unsigned long long int* address,
200 unsigned long long int val)
202 return (
long long int)hc::atomic_fetch_max((uint64_t*)address,(uint64_t)val);
206 __device__
inline unsigned int atomicInc(
unsigned int* address)
208 return hc::atomic_fetch_inc(address);
212 __device__
inline unsigned int atomicDec(
unsigned int* address)
214 return hc::atomic_fetch_dec(address);
218 __device__
inline int atomicCAS(
int* address,
int compare,
int val)
220 hc::atomic_compare_exchange(address,&compare,val);
223 __device__
inline unsigned int atomicCAS(
unsigned int* address,
224 unsigned int compare,
227 hc::atomic_compare_exchange(address,&compare,val);
230 __device__
inline unsigned long long int atomicCAS(
unsigned long long int* address,
231 unsigned long long int compare,
232 unsigned long long int val)
234 hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
239 __device__
inline int atomicAnd(
int* address,
int val)
241 return hc::atomic_fetch_and(address,val);
243 __device__
inline unsigned int atomicAnd(
unsigned int* address,
246 return hc::atomic_fetch_and(address,val);
248 __device__
inline unsigned long long int atomicAnd(
unsigned long long int* address,
249 unsigned long long int val)
251 return (
long long int)hc::atomic_fetch_and((uint64_t*)address,(uint64_t)val);
255 __device__
inline int atomicOr(
int* address,
int val)
257 return hc::atomic_fetch_or(address,val);
259 __device__
inline unsigned int atomicOr(
unsigned int* address,
262 return hc::atomic_fetch_or(address,val);
264 __device__
inline unsigned long long int atomicOr(
unsigned long long int* address,
265 unsigned long long int val)
267 return (
long long int)hc::atomic_fetch_or((uint64_t*)address,(uint64_t)val);
271 __device__
inline int atomicXor(
int* address,
int val)
273 return hc::atomic_fetch_xor(address,val);
275 __device__
inline unsigned int atomicXor(
unsigned int* address,
278 return hc::atomic_fetch_xor(address,val);
280 __device__
inline unsigned long long int atomicXor(
unsigned long long int* address,
281 unsigned long long int val)
283 return (
long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val);
289 __device__
inline unsigned int __popc(
unsigned int input)
291 return hc::__popcount_u32_b32( input);
294 __device__
inline unsigned int __popcll(
unsigned long long int input)
296 return hc::__popcount_u32_b64(input);
299 __device__
inline unsigned int __clz(
unsigned int input)
301 return hc::__firstbit_u32_u32( input);
304 __device__
inline unsigned int __clzll(
unsigned long long int input)
306 return hc::__firstbit_u32_u64( input);
309 __device__
inline unsigned int __clz(
int input)
311 return hc::__firstbit_u32_s32( input);
314 __device__
inline unsigned int __clzll(
long long int input)
316 return hc::__firstbit_u32_s64( input);
319 __device__
inline unsigned int __ffs(
unsigned int input)
321 return hc::__lastbit_u32_u32( input)+1;
324 __device__
inline unsigned int __ffsll(
unsigned long long int input)
326 return hc::__lastbit_u32_u64( input)+1;
329 __device__
inline unsigned int __brev(
unsigned int input)
331 return hc::__bitrev_b32( input);
334 __device__
inline unsigned long long int __brevll(
unsigned long long int input)
336 return hc::__bitrev_b64( input);
341 __device__
inline int __all(
int input)
343 return hc::__all( input);
347 __device__
inline int __any(
int input)
349 return hc::__any( input);
353 __device__
inline unsigned long long int __ballot(
int input)
355 return hc::__ballot( input);
362 #ifdef __HCC_ACCELERATOR__
363 #include <hc_math.hpp>
365 using namespace hc::precise_math;
/**
 * Integer minimum, compiled for both the accelerator and the host
 * (hc,cpu attributes).
 *
 * The operands are compared directly as integers. Converting them to float
 * first rounds every magnitude above 2^24, which can yield a result equal to
 * neither argument, and casting the rounded float back can overflow int.
 * The attribute is placed ahead of the declaration, the same position the
 * __device__ qualifier occupies elsewhere in this header.
 *
 * @return the smaller of arg1 and arg2
 */
__attribute((hc,cpu)) inline int min(int arg1, int arg2)
{
    return (arg2 < arg1) ? arg2 : arg1;
}
/**
 * Integer maximum, compiled for both the accelerator and the host
 * (hc,cpu attributes).
 *
 * The operands are compared directly as integers. Converting them to float
 * first rounds every magnitude above 2^24, which can yield a result equal to
 * neither argument, and casting the rounded float back can overflow int
 * (e.g. for INT_MAX).
 * The attribute is placed ahead of the declaration, the same position the
 * __device__ qualifier occupies elsewhere in this header.
 *
 * @return the larger of arg1 and arg2
 */
__attribute((hc,cpu)) inline int max(int arg1, int arg2)
{
    return (arg1 < arg2) ? arg2 : arg1;
}
375 __device__
inline float __log2f(
float x) {
return hc::fast_math::log2(x); };
376 __device__
inline float __powf(
float base,
float exponent) {
return hc::fast_math::powf(base, exponent); };
// HIP coordinate built-ins expressed through the hc/amp query functions.
// In every group below HIP's x/y/z map to dimension arguments 2/1/0, i.e. the
// dimension order is reversed relative to the HIP names.
// Local (per-group) id of the calling work-item:
385 #define hipThreadIdx_x (amp_get_local_id(2))
386 #define hipThreadIdx_y (amp_get_local_id(1))
387 #define hipThreadIdx_z (amp_get_local_id(0))
// Id of the group (block) the work-item belongs to:
389 #define hipBlockIdx_x (hc_get_group_id(2))
390 #define hipBlockIdx_y (hc_get_group_id(1))
391 #define hipBlockIdx_z (hc_get_group_id(0))
// Local (per-group) size:
393 #define hipBlockDim_x (amp_get_local_size(2))
394 #define hipBlockDim_y (amp_get_local_size(1))
395 #define hipBlockDim_z (amp_get_local_size(0))
// Number of groups in the launch:
397 #define hipGridDim_x (hc_get_num_groups(2))
398 #define hipGridDim_y (hc_get_num_groups(1))
399 #define hipGridDim_z (hc_get_num_groups(0))
// Declared here; the definition lives in another translation unit.
402 extern int warpSize ;
// Barrier: expands to hc_barrier with the CLK_LOCAL_MEM_FENCE flag.
405 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
// KALMAR_PFE_BEGIN: opens a kernel launch from a grid_launch_parm named `lp`
// that must be in scope. It builds an hc::extent<3> from lp.gridDim, tiles it
// by lp.groupDim, sets the dynamic group segment size to lp.groupMemBytes and
// starts hc::parallel_for_each, storing the hc::completion_future in `cf`.
// The kernel body follows the macro as the lambda body.
// NOTE(review): the extent is built in x,y,z order while hipThreadIdx_x reads
// dimension 2 -- confirm the dimension ordering is intended.
// NOTE(review): some lines of this macro are not visible in this listing.
409 #define KALMAR_PFE_BEGIN() \
410 hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
411 auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
412 __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
414 hc::completion_future cf = hc::parallel_for_each (\
417 [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]]
// KALMAR_PFE_END: closes the launch opened by KALMAR_PFE_BEGIN. When
// HIP_LAUNCH_BLOCKING is set it emits the two trace messages below (if
// HIP_TRACE_API is set); the step between them is not visible in this listing
// (presumably a wait on `cf` -- confirm).
421 #define KALMAR_PFE_END \
423 if (HIP_LAUNCH_BLOCKING) {\
424 if (HIP_TRACE_API) {\
425 fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
428 if (HIP_TRACE_API) {\
429 fprintf(stderr, "hiptrace1: ...completed.\n");\
// Expands to its arguments unchanged; lets a kernel name that contains commas
// (e.g. a template instantiation) be passed as one macro argument.
436 #define HIP_KERNEL_NAME(...) __VA_ARGS__
// Launch helper declared here and defined elsewhere: takes the target stream
// and returns a pointer to an hc::accelerator_view, which hipLaunchKernel
// stores in lp.av.
440 hc::accelerator_view *ihipLaunchKernel(
hipStream_t stream);
442 #if not defined(DISABLE_GRID_LAUNCH)
// hipLaunchKernel (grid-launch enabled variant).
// Fills a local grid_launch_parm `lp`: gridDim is copied from _numBlocks3D,
// groupDim from _blockDim3D, groupMemBytes from _groupMemBytes, and lp.av is
// set from ihipLaunchKernel(_stream). When HIP_TRACE_API is set, the launch is
// logged to stderr with the dimensions printed in z.y.x order. Finally the
// kernel is invoked as _kernelName(lp, <remaining arguments>).
// NOTE(review): some lines of this macro are not visible in this listing.
443 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
445 grid_launch_parm lp;\
446 lp.gridDim.x = _numBlocks3D.x; \
447 lp.gridDim.y = _numBlocks3D.y; \
448 lp.gridDim.z = _numBlocks3D.z; \
449 lp.groupDim.x = _blockDim3D.x; \
450 lp.groupDim.y = _blockDim3D.y; \
451 lp.groupDim.z = _blockDim3D.z; \
452 lp.groupMemBytes = _groupMemBytes;\
453 hc::completion_future cf;\
455 lp.av = (ihipLaunchKernel(_stream)); \
456 if (HIP_TRACE_API) {\
457 fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
458 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
460 _kernelName (lp, __VA_ARGS__);\
464 #warning(DISABLE_GRID_LAUNCH set)
// hipLaunchKernel (variant used when DISABLE_GRID_LAUNCH is defined).
// Same steps as the grid-launch variant, except that lp.gridDim is set to
// _numBlocks3D * _blockDim3D per dimension (a work-item count rather than a
// block count). lp.groupDim, lp.groupMemBytes and lp.av are filled the same
// way, the optional trace prints dimensions in z.y.x order, and the kernel is
// invoked as _kernelName(lp, <remaining arguments>).
// NOTE(review): some lines of this macro are not visible in this listing.
466 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
468 grid_launch_parm lp;\
469 lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x; \
470 lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y; \
471 lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z; \
472 lp.groupDim.x = _blockDim3D.x; \
473 lp.groupDim.y = _blockDim3D.y; \
474 lp.groupDim.z = _blockDim3D.z; \
475 lp.groupMemBytes = _groupMemBytes;\
476 hc::completion_future cf;\
478 lp.av = (ihipLaunchKernel(_stream)); \
479 if (HIP_TRACE_API) {\
480 fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
481 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
483 _kernelName (lp, __VA_ARGS__);\
488 #elif defined (__HCC_C__)
495 #if not defined(DISABLE_GRID_LAUNCH)
// KERNELBEGIN: opens a kernel body from a grid_launch_parm named `lp` that
// must be in scope. It builds an hc::extent<3> from lp.gridDim, tiles it by
// lp.groupDim, sets the dynamic group segment size to lp.groupMemBytes and
// starts hc::parallel_for_each, storing the hc::completion_future in `cf`.
// The HIP_LAUNCH_BLOCKING / HIP_TRACE_API lines further down emit the two
// trace messages around a step that is not visible in this listing.
// NOTE(review): lines are missing from this listing between the lambda header
// and the HIP_LAUNCH_BLOCKING check, so the trailing part may belong to a
// separate closing macro -- confirm against the full header.
506 #define KERNELBEGIN \
507 hc::extent<3> ext(lp.gridDim.x, lp.gridDim.y, lp.gridDim.z);\
508 auto __hipExtTile = ext.tile(lp.groupDim.x, lp.groupDim.y, lp.groupDim.z);\
509 __hipExtTile.set_dynamic_group_segment_size(lp.groupMemBytes);\
511 hc::completion_future cf = \
512 hc::parallel_for_each (\
515 [=] (hc::tiled_index<3> __hipIdx) mutable [[hc]] \
521 if (HIP_LAUNCH_BLOCKING) {\
522 if (HIP_TRACE_API) {\
523 fprintf(stderr, "hiptrace1: HIP_LAUNCH_BLOCKING ...\n");\
526 if (HIP_TRACE_API) {\
527 fprintf(stderr, "hiptrace1: ...completed.\n");\
int HIP_TRACE_API
Trace HIP APIs.
Definition: hip_hcc.cpp:57
Definition: hip_hcc.cpp:82
int HIP_PRINT_ENV
Print all HIP-related environment variables.
Definition: hip_hcc.cpp:56
int HIP_LAUNCH_BLOCKING
Make all HIP APIs host-synchronous.
Definition: hip_hcc.cpp:58