diff --git a/projects/clr/.clang-format b/projects/clr/.clang-format index 5572a72cdd..1569aac12f 100644 --- a/projects/clr/.clang-format +++ b/projects/clr/.clang-format @@ -1,10 +1,10 @@ Language: Cpp BasedOnStyle: Google AlignEscapedNewlinesLeft: false -AlignOperands: false +AlignOperands: Align ColumnLimit: 100 -AlwaysBreakTemplateDeclarations: false +BreakTemplateDeclarations: No DerivePointerAlignment: false IndentFunctionDeclarationAfterType: false MaxEmptyLinesToKeep: 2 -SortIncludes: false +SortIncludes: Never diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h index 940ef6099f..e771c63924 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h @@ -1915,13 +1915,12 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16* address, static_assert(sizeof(unsigned short int) == sizeof(__hip_bfloat16_raw)); unsigned short int* address_as_short = reinterpret_cast(address); // Align to 4 bytes - unsigned int* aligned_addr = - __builtin_bit_cast(unsigned int*, - __builtin_bit_cast(unsigned long long int, address_as_short) & - (unsigned long long int)(~0x3)); + unsigned int* aligned_addr = __builtin_bit_cast( + unsigned int*, __builtin_bit_cast(unsigned long long int, address_as_short) & + (unsigned long long int)(~0x3)); bool is_lower = __builtin_bit_cast(unsigned long long int, aligned_addr) == - __builtin_bit_cast(unsigned long long int, address); + __builtin_bit_cast(unsigned long long int, address); __hip_bfloat162 fval; if (is_lower) diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index dac24a5d67..e0b40d64dd 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -375,8 +375,7 @@ class coalesced_group : public thread_group { friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size); friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred); - template - friend __CG_QUALIFIER__ coalesced_group + template friend __CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile& tgrp, bool pred); __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const { @@ -393,8 +392,8 @@ class coalesced_group : public thread_group { unsigned int masklength = min(static_cast(num_threads()) - base_offset, tile_size); lane_mask full_mask = (static_cast(warpSize) == 32) - ? static_cast((1u << 32) - 1) - : static_cast(-1ull); + ? static_cast((1u << 32) - 1) + : static_cast(-1ull); lane_mask member_mask = full_mask >> (warpSize - masklength); member_mask <<= (__lane_id() & ~(tile_size - 1)); @@ -485,9 +484,9 @@ class coalesced_group : public thread_group { srcRank = srcRank % static_cast(num_threads()); int lane = (num_threads() == warpSize) ? srcRank - : (static_cast(warpSize) == 64) - ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) - : __fns32(coalesced_info.member_mask, 0, (srcRank + 1)); + : (static_cast(warpSize) == 64) + ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) + : __fns32(coalesced_info.member_mask, 0, (srcRank + 1)); return __shfl(var, lane, warpSize); } @@ -835,8 +834,7 @@ template class thread_block_tile_base : public tile_base::numThreads; - template - friend __CG_QUALIFIER__ coalesced_group + template friend __CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile& tgrp, bool pred); #if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) @@ -910,10 +908,10 @@ template class parent_group_info { * \note This type is implemented on Linux, under development * on Microsoft Windows. */ -template -class thread_block_tile_type : public thread_block_tile_base, - public tiled_group, - public parent_group_info { +template class thread_block_tile_type + : public thread_block_tile_base, + public tiled_group, + public parent_group_info { _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize; typedef thread_block_tile_base tbtBase; @@ -931,9 +929,8 @@ class thread_block_tile_type : public thread_block_tile_base, }; // Partial template specialization -template -class thread_block_tile_type : public thread_block_tile_base, - public tiled_group { +template class thread_block_tile_type + : public thread_block_tile_base, public tiled_group { _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize; typedef thread_block_tile_base tbtBase; @@ -1013,11 +1010,10 @@ __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, namespace impl { template class thread_block_tile_internal; -template -class thread_block_tile_internal : public thread_block_tile_type { +template class thread_block_tile_internal + : public thread_block_tile_type { protected: - template - __CG_QUALIFIER__ thread_block_tile_internal( + template __CG_QUALIFIER__ thread_block_tile_internal( const thread_block_tile_internal& g) : thread_block_tile_type(g.meta_group_rank(), g.meta_group_size()) {} @@ -1034,8 +1030,8 @@ class thread_block_tile_internal : public thread_block_tile_type -class thread_block_tile : public impl::thread_block_tile_internal { +template class thread_block_tile + : public impl::thread_block_tile_internal { protected: __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g) : impl::thread_block_tile_internal(g) {} @@ -1171,8 +1167,8 @@ class thread_block_tile : public impl::thread_block_tile_internal -class thread_block_tile : public impl::thread_block_tile_internal { +template class thread_block_tile + : public impl::thread_block_tile_internal { template friend class thread_block_tile; protected: @@ -1187,8 +1183,8 @@ template class thread_block_tile; namespace impl { template struct tiled_partition_internal; -template -struct tiled_partition_internal : public thread_block_tile { +template struct tiled_partition_internal + : public thread_block_tile { __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g) : thread_block_tile(g) {} }; diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h index 5d3d65b96b..cdb75affba 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h @@ -82,8 +82,8 @@ namespace __hip_internal { template <> struct is_floating_point<_Float16> : __hip_internal::true_type {}; } // namespace __hip_internal -template -using Enable_if_t = typename __hip_internal::enable_if::type; +template using Enable_if_t = + typename __hip_internal::enable_if::type; // BEGIN STRUCT __HALF struct __half { @@ -649,7 +649,7 @@ inline __HOST_DEVICE__ bool __hgt(__half x, __half y) { } inline __HOST_DEVICE__ bool __hequ(__half x, __half y) { return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data) && - !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data); + !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data); } inline __HOST_DEVICE__ bool __hneu(__half x, __half y) { return !(static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data); @@ -693,7 +693,7 @@ inline __HOST_DEVICE__ __half2 __hgt2(__half2 x, __half2 y) { } inline __HOST_DEVICE__ __half2 __hequ2(__half2 x, __half2 y) { auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data) && - !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data); + !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data); return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hneu2(__half2 x, __half2 y) { @@ -911,13 +911,12 @@ inline __device__ __half unsafeAtomicAdd(__half* address, __half value) { static_assert(sizeof(unsigned short int) == sizeof(__half_raw)); unsigned short int* address_as_short = reinterpret_cast(address); // Align to 4 bytes - unsigned int* aligned_addr = - __builtin_bit_cast(unsigned int*, - __builtin_bit_cast(unsigned long long int, address_as_short) & - (unsigned long long int)(~0x3)); + unsigned int* aligned_addr = __builtin_bit_cast( + unsigned int*, __builtin_bit_cast(unsigned long long int, address_as_short) & + (unsigned long long int)(~0x3)); bool is_lower = __builtin_bit_cast(unsigned long long int, aligned_addr) == - __builtin_bit_cast(unsigned long long int, address); + __builtin_bit_cast(unsigned long long int, address); __half2 fval; if (is_lower) fval = __halves2half2(value, __float2half(0.0f)); diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp8.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp8.h index a2a968d435..d5796be1b4 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp8.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp8.h @@ -327,8 +327,8 @@ where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 this case, the fp16 mantissa should be shift left by 1 */ act_exponent = exponent - bias + 1; exponent_diff = f8_denormal_act_exponent - - act_exponent; // actual exponent is exponent-bias+1 as it is denormal - } else { // fp32/fp16 is normal with implicit 1 + act_exponent; // actual exponent is exponent-bias+1 as it is denormal + } else { // fp32/fp16 is normal with implicit 1 act_exponent = exponent - bias; if (act_exponent <= f8_denormal_act_exponent) { /* This is the case where fp32/fp16 is normal but it is in f8 denormal range. @@ -345,7 +345,7 @@ So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ } bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) == - (1ull << (mfmt - wm + exponent_diff - 1)); + (1ull << (mfmt - wm + exponent_diff - 1)); /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we shift right as shift right could rip off some residual part and make something not midpoint look like midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than midpoint, but @@ -400,9 +400,9 @@ after shift right by 4 bits, it would look like midpoint. // The conversion function is from rocblas // https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220 // This has been modified to handle double types as well -template -__FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x, int wm, int we, - bool clip = false) { +template __FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x, + int wm, int we, + bool clip = false) { #if defined(__clang__) and defined(__HIP__) constexpr bool is_half = __hip_internal::is_same::value; constexpr bool is_float = __hip_internal::is_same::value; @@ -576,14 +576,15 @@ static __device__ __hip_fp8_storage_t cast_to_f8_from_f32(float v, bool saturate if (stochastic_rounding) { ival = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3) - ? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0) - : __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos + ? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0) + : __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos val.i32val = ival; i8data = val.i8val[0]; // little endian } else { // RNE CVT - ival = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3) - ? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false) - : __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0 + ival = + (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3) + ? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false) + : __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0 val.i32val = ival; i8data = val.i8val[0]; } @@ -628,8 +629,8 @@ cast_to_f8x2_from_f32x2(float2 v, bool saturate, __hip_fp8_interpretation_t inte } f2val.i32val[0] = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3) - ? __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, 0, false) - : __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, 0, false); + ? __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, 0, false) + : __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, 0, false); return static_cast<__hip_fp8x2_storage_t>(f2val.i16val[0]); } @@ -643,8 +644,8 @@ static __device__ float cast_to_f32_from_f8(__hip_fp8_storage_t v, val.i8val[0] = v; float fval = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3) - ? __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0) - : __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0); + ? __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0) + : __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0); return fval; } @@ -657,8 +658,8 @@ static __device__ float2 cast_to_f32x2_from_f8x2(__hip_fp8x2_storage_t v, val.i16val[0] = v; auto f2 = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3) - ? __builtin_amdgcn_cvt_pk_f32_fp8(val.i32val, false) - : __builtin_amdgcn_cvt_pk_f32_bf8(val.i32val, false); + ? __builtin_amdgcn_cvt_pk_f32_fp8(val.i32val, false) + : __builtin_amdgcn_cvt_pk_f32_bf8(val.i32val, false); return float2{f2[0], f2[1]}; } #endif // HIP_FP8_CVT_FAST_PATH @@ -672,9 +673,9 @@ __FP8_HOST_DEVICE_STATIC__ bool hip_fp8_fnuz_is_nan(__hip_fp8_storage_t a) { __FP8_HOST_DEVICE_STATIC__ bool hip_fp8_ocp_is_nan(__hip_fp8_storage_t a, const __hip_fp8_interpretation_t type) { - return (type == __HIP_E4M3) ? ((a & 0x7f) == 0x7f) - : (type == __HIP_E5M2) ? ((a & 0x7f) > 0x7c) - : false; + return (type == __HIP_E4M3) ? ((a & 0x7f) == 0x7f) + : (type == __HIP_E5M2) ? ((a & 0x7f) > 0x7c) + : false; } __FP8_HOST_DEVICE_STATIC__ bool hip_fp8_ocp_is_inf(__hip_fp8_storage_t a, diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_fp.hpp b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_fp.hpp index 8a8d31b5bb..73aac457b0 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_fp.hpp +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_fp.hpp @@ -334,13 +334,13 @@ __OCP_FP_HOST_DEVICE_STATIC__ float __amd_cvt_fp8_to_float_scale( const __amd_scale_t scale) { #if HIP_ENABLE_GFX950_OCP_BUILTINS return interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_f32_fp8(val, __amd_scale_to_float(scale), 0) - : __builtin_amdgcn_cvt_scalef32_f32_bf8(val, __amd_scale_to_float(scale), 0); + ? __builtin_amdgcn_cvt_scalef32_f32_fp8(val, __amd_scale_to_float(scale), 0) + : __builtin_amdgcn_cvt_scalef32_f32_bf8(val, __amd_scale_to_float(scale), 0); #else using namespace fcbx; return interpret == __AMD_OCP_E4M3 - ? to_float(static_cast(val), scale) - : to_float(static_cast(val), scale); + ? to_float(static_cast(val), scale) + : to_float(static_cast(val), scale); #endif } @@ -378,8 +378,8 @@ __amd_cvt_float_to_fp8_sr_scale(const float val, const __amd_fp8_interpretation_ } u{0}; using namespace fcbx; u.ui32t = interpret == __AMD_OCP_E4M3 - ? from_float_sr(val, seed, scale) - : from_float_sr(val, seed, scale); + ? from_float_sr(val, seed, scale) + : from_float_sr(val, seed, scale); return u.fp8[0]; #endif } @@ -548,8 +548,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_floatx2_storage_t __amd_cvt_fp8x2_to_floatx2 const __amd_scale_t scale) { #if HIP_ENABLE_GFX950_OCP_BUILTINS return interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(val, __amd_scale_to_float(scale), false) - : __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(val, __amd_scale_to_float(scale), false); + ? __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(val, __amd_scale_to_float(scale), false) + : __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(val, __amd_scale_to_float(scale), false); #else using namespace fcbx; __amd_floatx2_storage_t ret; @@ -582,10 +582,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_floatx2_to_fp8x2_s __amd_fp8x2_storage_t fp8x2[2]; } u{0}; u.shortx2 = interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(u.shortx2, val[0], val[1], - __amd_scale_to_float(scale), false) - : __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(u.shortx2, val[0], val[1], - __amd_scale_to_float(scale), false); + ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(u.shortx2, val[0], val[1], + __amd_scale_to_float(scale), false) + : __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(u.shortx2, val[0], val[1], + __amd_scale_to_float(scale), false); return u.fp8x2[0]; #else using namespace fcbx; @@ -679,8 +679,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t __amd_cvt_fp8x2_to_fp16x2_s } u; u.fp8x2[0] = val; return interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(scale), false) - : __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(scale), false); + ? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(scale), false) + : __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(scale), false); #else using namespace fcbx; __amd_fp16x2_storage_t ret; @@ -787,8 +787,9 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_bf16x2_storage_t __amd_cvt_fp8x2_to_bf16x2_s } u; u.fp8x2[0] = in; return interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false) - : __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), false); + ? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false) + : __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), + false); #else using namespace fcbx; __amd_bf16x2_storage_t ret; @@ -891,8 +892,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x32_storage_t __amd_cvt_fp6x32_to_fp16x3 #if HIP_ENABLE_GFX950_OCP_BUILTINS // gfx950 expects scale to be in float return interpret == __AMD_OCP_E2M3 - ? __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, __amd_scale_to_float(scale)) - : __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, __amd_scale_to_float(scale)); + ? __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, __amd_scale_to_float(scale)) + : __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, __amd_scale_to_float(scale)); #else using namespace fcbx; if (interpret == __AMD_OCP_E2M3) { @@ -918,8 +919,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_bf16x32_storage_t __amd_cvt_fp6x32_to_bf16x3 const __amd_scale_t scale) { #if HIP_ENABLE_GFX950_OCP_BUILTINS return interpret == __AMD_OCP_E2M3 - ? __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, __amd_scale_to_float(scale)) - : __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, __amd_scale_to_float(scale)); + ? __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, __amd_scale_to_float(scale)) + : __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, __amd_scale_to_float(scale)); #else using namespace fcbx; if (interpret == __AMD_OCP_E2M3) { @@ -937,15 +938,15 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_floatx32_storage_t __amd_cvt_fp6x32_to_float const __amd_scale_t scale) { #if HIP_ENABLE_GFX950_OCP_BUILTINS return interpret == __AMD_OCP_E2M3 - ? __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(val, __amd_scale_to_float(scale)) - : __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(val, __amd_scale_to_float(scale)); + ? __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(val, __amd_scale_to_float(scale)) + : __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(val, __amd_scale_to_float(scale)); #else using namespace fcbx; return interpret == __AMD_OCP_E2M3 - ? fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float, Encoding::E2M3, - Encoding::IEEE754>(val, scale) - : fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float, Encoding::E3M2, - Encoding::IEEE754>(val, scale); + ? fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float, + Encoding::E2M3, Encoding::IEEE754>(val, scale) + : fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float, + Encoding::E3M2, Encoding::IEEE754>(val, scale); #endif } @@ -1200,9 +1201,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_fp16x2_to_fp8x2_sc __amd_shortx2_storage_t shortx2; __amd_fp8x2_storage_t fp8x2[2]; } u{0}; - u.shortx2 = interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(u.shortx2, in, __amd_scale_to_float(scale), false) - : __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(u.shortx2, in, __amd_scale_to_float(scale), false); + u.shortx2 = interpret == __AMD_OCP_E4M3 ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16( + u.shortx2, in, __amd_scale_to_float(scale), false) + : __builtin_amdgcn_cvt_scalef32_pk_bf8_f16( + u.shortx2, in, __amd_scale_to_float(scale), false); return u.fp8x2[0]; #else static_assert(sizeof(__amd_fp8x2_storage_t[2]) == sizeof(uint32_t)); @@ -1241,10 +1243,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_bf16x2_to_fp8x2_sc __amd_shortx2_storage_t shortx2; __amd_fp8x2_storage_t fp8x2[2]; } u{0}; - u.shortx2 = interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(u.shortx2, in, __amd_scale_to_float(scale), false) - : __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(u.shortx2, in, __amd_scale_to_float(scale), - false); + u.shortx2 = interpret == __AMD_OCP_E4M3 ? __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16( + u.shortx2, in, __amd_scale_to_float(scale), false) + : __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16( + u.shortx2, in, __amd_scale_to_float(scale), false); return u.fp8x2[0]; #else using namespace fcbx; @@ -1429,9 +1431,10 @@ __amd_cvt_fp8_to_fp16_scale(const __amd_fp8_storage_t val, const __amd_fp8_interpretation_t interpret, const __amd_scale_t scale) { #if HIP_ENABLE_GFX950_OCP_BUILTINS __amd_fp16x2_storage_t ret; - ret = interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_f16_fp8(ret, val, __amd_scale_to_float(scale), 0, false) - : __builtin_amdgcn_cvt_scalef32_f16_bf8(ret, val, __amd_scale_to_float(scale), 0, false); + ret = + interpret == __AMD_OCP_E4M3 + ? __builtin_amdgcn_cvt_scalef32_f16_fp8(ret, val, __amd_scale_to_float(scale), 0, false) + : __builtin_amdgcn_cvt_scalef32_f16_bf8(ret, val, __amd_scale_to_float(scale), 0, false); return ret[0]; #else using namespace fcbx; @@ -1463,9 +1466,10 @@ __amd_cvt_fp8_to_bf16_scale(const __amd_fp8_storage_t val, unsigned int ui32; } u{0}; u.fp8[0] = val; - auto ret = interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false) - : __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), false); + auto ret = + interpret == __AMD_OCP_E4M3 + ? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false) + : __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), false); return ret[0]; #else using namespace fcbx; @@ -1491,8 +1495,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx16_floatx16 const __amd_fp6_interpretation_t interpret, const __amd_scale_t scale) { #if HIP_ENABLE_GFX950_OCP_BUILTINS return interpret == __AMD_OCP_E2M3 - ? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale)) - : __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale)); + ? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale)) + : __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale)); #else __amd_floatx32_storage_t tmp; for (size_t i = 0; i < 16; i++) { @@ -1503,10 +1507,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx16_floatx16 } using namespace fcbx; return interpret == __AMD_OCP_E2M3 - ? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, - Encoding::IEEE754, Encoding::E2M3>(tmp, scale) - : fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, - Encoding::IEEE754, Encoding::E3M2>(tmp, scale); + ? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, + Encoding::IEEE754, Encoding::E2M3>(tmp, scale) + : fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, + Encoding::IEEE754, Encoding::E3M2>(tmp, scale); #endif } @@ -1529,15 +1533,15 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx32_to_fp6x3 in2 = {val[16], val[17], val[18], val[19], val[20], val[21], val[22], val[23], val[24], val[25], val[26], val[27], val[28], val[29], val[30], val[31]}; return interpret == __AMD_OCP_E2M3 - ? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale)) - : __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale)); + ? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale)) + : __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale)); #else using namespace fcbx; return interpret == __AMD_OCP_E2M3 - ? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, - Encoding::IEEE754, Encoding::E2M3>(val, scale) - : fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, - Encoding::IEEE754, Encoding::E3M2>(val, scale); + ? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, + Encoding::IEEE754, Encoding::E2M3>(val, scale) + : fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, + Encoding::IEEE754, Encoding::E3M2>(val, scale); #endif } @@ -1555,16 +1559,17 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx32_to_fp6x3 const unsigned int round, const __amd_scale_t scale) { #if __has_builtin(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32) and \ __has_builtin(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32) - return interpret == __AMD_OCP_E2M3 - ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(val, round, __amd_scale_to_float(scale)) - : __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(val, round, __amd_scale_to_float(scale)); + return interpret == __AMD_OCP_E2M3 ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32( + val, round, __amd_scale_to_float(scale)) + : __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32( + val, round, __amd_scale_to_float(scale)); #else using namespace fcbx; return interpret == __AMD_OCP_E2M3 - ? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, - Encoding::IEEE754, Encoding::E2M3, true>(val, scale, round) - : fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, - Encoding::IEEE754, Encoding::E3M2, true>(val, scale, round); + ? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, + Encoding::IEEE754, Encoding::E2M3, true>(val, scale, round) + : fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float, + Encoding::IEEE754, Encoding::E3M2, true>(val, scale, round); #endif } @@ -1638,16 +1643,17 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_fp16x32_to_fp6x32 const unsigned int round, const __amd_scale_t scale) { #if HIP_ENABLE_GFX950_OCP_BUILTINS return interpret == __AMD_OCP_E2M3 - ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(in, round, __amd_scale_to_float(scale)) - : __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(in, round, __amd_scale_to_float(scale)); + ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(in, round, __amd_scale_to_float(scale)) + : __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(in, round, + __amd_scale_to_float(scale)); #else return interpret == __AMD_OCP_E2M3 - ? fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t, - __amd_fp16_storage_t, fcbx::Encoding::E5M10, fcbx::Encoding::E2M3, - true>(in, scale, round) - : fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t, - __amd_fp16_storage_t, fcbx::Encoding::E5M10, fcbx::Encoding::E3M2, - true>(in, scale, round); + ? fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t, + __amd_fp16_storage_t, fcbx::Encoding::E5M10, + fcbx::Encoding::E2M3, true>(in, scale, round) + : fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t, + __amd_fp16_storage_t, fcbx::Encoding::E5M10, + fcbx::Encoding::E3M2, true>(in, scale, round); #endif } @@ -1655,17 +1661,18 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_bf16x32_to_fp6x32 const __amd_bf16x32_storage_t in, const __amd_fp6_interpretation_t interpret, const unsigned int round, const __amd_scale_t scale) { #if HIP_ENABLE_GFX950_OCP_BUILTINS - return interpret == __AMD_OCP_E2M3 - ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(in, round, __amd_scale_to_float(scale)) - : __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(in, round, __amd_scale_to_float(scale)); + return interpret == __AMD_OCP_E2M3 ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16( + in, round, __amd_scale_to_float(scale)) + : __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16( + in, round, __amd_scale_to_float(scale)); #else return interpret == __AMD_OCP_E2M3 - ? fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t, - __amd_bf16_storage_t, fcbx::Encoding::E8M7, fcbx::Encoding::E2M3, - true>(in, scale, round) - : fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t, - __amd_bf16_storage_t, fcbx::Encoding::E8M7, fcbx::Encoding::E3M2, - true>(in, scale, round); + ? fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t, + __amd_bf16_storage_t, fcbx::Encoding::E8M7, + fcbx::Encoding::E2M3, true>(in, scale, round) + : fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t, + __amd_bf16_storage_t, fcbx::Encoding::E8M7, + fcbx::Encoding::E3M2, true>(in, scale, round); #endif } @@ -2542,8 +2549,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t __amd_cvt_fp8x2_to_fp16x2( } u; u.fp8x2[0] = val; return interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(0), false) - : __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(0), false); + ? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(0), false) + : __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(0), false); #else using namespace fcbx; __amd_fp16x2_storage_t ret; @@ -2573,9 +2580,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_fp16x2_to_fp8x2( __amd_shortx2_storage_t shortx2; __amd_fp8x2_storage_t fp8x2[2]; } u{0}; - u.shortx2 = interpret == __AMD_OCP_E4M3 - ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(u.shortx2, val, __amd_scale_to_float(0), false) - : __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(u.shortx2, val, __amd_scale_to_float(0), false); + u.shortx2 = interpret == __AMD_OCP_E4M3 ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16( + u.shortx2, val, __amd_scale_to_float(0), false) + : __builtin_amdgcn_cvt_scalef32_pk_bf8_f16( + u.shortx2, val, __amd_scale_to_float(0), false); return u.fp8x2[0]; #else using namespace fcbx; @@ -2783,8 +2791,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8_storage_t __amd_cvt_fp16_to_fp8_sr( #else using namespace fcbx; return interpret == __AMD_OCP_E4M3 - ? from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val, sr, 0) - : from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val, sr, 0); + ? from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val, sr, 0) + : from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val, sr, 0); #endif } diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_fp_cxx.hpp b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_fp_cxx.hpp index a589b021eb..91b6695736 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_fp_cxx.hpp +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_fp_cxx.hpp @@ -719,8 +719,8 @@ struct __hipext_ocp_fp6x32_e2m3 { } #endif - __OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in, - const __amd_scale_t scale) + __OCP_FP_HOST_DEVICE__ + __hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in, const __amd_scale_t scale) #if HIP_ENABLE_GFX950_OCP_BUILTINS : __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(in, __amd_scale_to_float(scale))){} #else @@ -742,8 +742,8 @@ struct __hipext_ocp_fp6x32_e2m3 { } #endif - __OCP_FP_HOST_DEVICE__ - __hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in, const __amd_scale_t scale) + __OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in, + const __amd_scale_t scale) #if HIP_ENABLE_GFX950_OCP_BUILTINS : __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(in, __amd_scale_to_float(scale))){} #else @@ -832,8 +832,8 @@ struct __hipext_ocp_fp6x32_e3m2 { } #endif - __OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in, - const __amd_scale_t scale) + __OCP_FP_HOST_DEVICE__ + __hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in, const __amd_scale_t scale) #if HIP_ENABLE_GFX950_OCP_BUILTINS : __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(in, __amd_scale_to_float(scale))){} #else @@ -855,8 +855,8 @@ struct __hipext_ocp_fp6x32_e3m2 { } #endif - __OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in, - const __amd_scale_t scale) + __OCP_FP_HOST_DEVICE__ + __hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in, const __amd_scale_t scale) #if HIP_ENABLE_GFX950_OCP_BUILTINS : __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(in, __amd_scale_to_float(scale))){} #else diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_host.hpp b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_host.hpp index 9d68f6275b..f0ac1a4c9d 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_host.hpp +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_ocp_host.hpp @@ -793,11 +793,11 @@ __OCP_FP_HOST_DEVICE_STATIC__ OutType fp6_cvt_packedx32(InType in, int8_t scale uint32_t seed = 0) { // This is tightly coupled with the definitions of the amd_ocp_types constexpr bool in_float = std::is_same::value || - std::is_same::value || - std::is_same::value; + std::is_same::value || + std::is_same::value; constexpr bool out_float = std::is_same::value || - std::is_same::value || - std::is_same::value; + std::is_same::value || + std::is_same::value; using other_type = std::conditional::type; struct fp6x32_packed { diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h index 75a1f978c0..a9ad953123 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h @@ -314,9 +314,8 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) { * @return Original value contained in \p addr. */ __device__ inline float safeAtomicAdd(float* addr, float value) { -#if defined(__gfx908__) || \ - ((defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && \ - !__has_builtin(__hip_atomic_fetch_add)) +#if defined(__gfx908__) || ((defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && \ + !__has_builtin(__hip_atomic_fetch_add)) // On gfx908, we can generate unsafe FP32 atomic add that does not follow all // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead. // On gfx90a, gfx942 and gfx950 if we do not have the __hip_atomic_fetch_add builtin, we diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h index 2bd8f51629..7d0f327b8f 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h @@ -59,9 +59,9 @@ template struct HIP_vector_base; template struct HIP_vector_type; namespace hip_impl { -template -__attribute__((always_inline)) __HOST_DEVICE__ typename HIP_vector_base::Native_vec_* -get_native_pointer(HIP_vector_base& base_vec) { +template __attribute__((always_inline)) __HOST_DEVICE__ + typename HIP_vector_base::Native_vec_* + get_native_pointer(HIP_vector_base& base_vec) { static_assert(sizeof(base_vec) == sizeof(typename HIP_vector_base::Native_vec_)); static_assert(__hip_internal::alignment_of>::value == __hip_internal::alignment_of::Native_vec_>::value); @@ -78,9 +78,9 @@ get_native_pointer(const HIP_vector_base& base_vec) { }; } // Namespace hip_impl. -template -__attribute__((always_inline)) __HOST_DEVICE__ typename HIP_vector_base::Native_vec_& -get_native_vector(HIP_vector_base& base_vec) { +template __attribute__((always_inline)) __HOST_DEVICE__ + typename HIP_vector_base::Native_vec_& + get_native_vector(HIP_vector_base& base_vec) { return *hip_impl::get_native_pointer(base_vec); }; @@ -308,9 +308,8 @@ template struct HIP_vector_type : public HIP_vec __HOST_DEVICE__ HIP_vector_type() = default; - template ::value>::type* = - nullptr> + template ::value>::type* = nullptr> __HOST_DEVICE__ explicit constexpr HIP_vector_type(U x_) noexcept : HIP_vector_base{static_cast(x_)} {} template < // TODO: constrain based on type as well. @@ -368,9 +367,8 @@ template struct HIP_vector_type : public HIP_vec #endif return *this; } - template < - typename U, - typename __hip_internal::enable_if<__hip_internal::is_convertible{}>::type* = nullptr> + template {}>::type* = nullptr> __HOST_DEVICE__ HIP_vector_type& operator+=(U x) noexcept { return *this += make_vector_type(x); } @@ -383,9 +381,8 @@ template struct HIP_vector_type : public HIP_vec #endif return *this; } - template < - typename U, - typename __hip_internal::enable_if<__hip_internal::is_convertible{}>::type* = nullptr> + template {}>::type* = nullptr> __HOST_DEVICE__ HIP_vector_type& operator-=(U x) noexcept { return *this -= make_vector_type(x); } @@ -404,9 +401,8 @@ template struct HIP_vector_type : public HIP_vec return HIP_vector_type{x} *= y; } - template < - typename U, - typename __hip_internal::enable_if<__hip_internal::is_convertible{}>::type* = nullptr> + template {}>::type* = nullptr> __HOST_DEVICE__ HIP_vector_type& operator*=(U x) noexcept { return *this *= make_vector_type(x); } @@ -424,9 +420,8 @@ template struct HIP_vector_type : public HIP_vec #endif return *this; } - template < - typename U, - typename __hip_internal::enable_if<__hip_internal::is_convertible{}>::type* = nullptr> + template {}>::type* = nullptr> __HOST_DEVICE__ HIP_vector_type& operator/=(U x) noexcept { return *this /= make_vector_type(x); } @@ -576,8 +571,7 @@ __HOST_DEVICE__ inline constexpr HIP_vector_type operator/( return make_vector_type(x) /= y; } -template -__HOST_DEVICE__ inline +template __HOST_DEVICE__ inline #if __cplusplus >= 201402L && !defined(__HIPCC_RTC__) constexpr #endif diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_surface_functions.h b/projects/clr/hipamd/include/hip/amd_detail/amd_surface_functions.h index ad0fc9ce1c..ac17b1bf79 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_surface_functions.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_surface_functions.h @@ -109,9 +109,8 @@ static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format, * \param x [in] The coordinate where the value will be read out. * \param boundaryMode [in] The boundary mode is currently ignored. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x, int boundaryMode = hipBoundaryModeZero) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT; @@ -128,9 +127,8 @@ static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t su * \param surfObj [in] The surface descriptor. * \param x [in] The coordinate where the data will be written. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); @@ -147,9 +145,8 @@ static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t su * \param x [in] The x coordinate where the value will be read out. * \param y [in] The y coordinate where the value will be read out. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -168,9 +165,8 @@ static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t su * \param x [in] The x coordinate where the data will be written. * \param y [in] The y coordinate where the data will be written. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -190,9 +186,8 @@ static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t su * \param y [in] The y coordinate where the value will be read out. * \param z [in] The z coordinate where the value will be read out. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y, int z) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -212,9 +207,8 @@ static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t su * \param y [in] The y coordinate where the data will be written. * \param z [in] The z coordinate where the data will be written. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int z) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -233,9 +227,8 @@ static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t su * \param x [in] The coordinate where the value will be read out. * \param layer [in] The layer index where the value will be read out. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int layer) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -253,9 +246,8 @@ static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObje * \param x [in] The x coordinate where the data will be written. * \param layer [in] The layer index where the data will be written. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int layer) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -274,9 +266,8 @@ static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObje * \param y [in] The y coordinate where the value will be read out. * \param layer [in] The layer index where the value will be read out. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int layer) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -296,9 +287,8 @@ static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObje * \param y [in] The y coordinate where the data will be written. * \param layer [in] The layer index where the data will be written. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int layer) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -318,9 +308,8 @@ static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObje * \param y [in] The y coordinate where the value will be read out. * \param face [in] The face index where the value will be read out. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -340,9 +329,8 @@ static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject * \param y [in] The y coordinate where the data will be written. * \param face [in] The face index where the data will be written. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int face) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -363,9 +351,8 @@ static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject * \param face [in] The face index where the value will be read out. * \param layer [in] The layer index where the data will be written. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face, int layer) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT @@ -386,9 +373,8 @@ static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfac * \param face [in] The face index where the data will be written. * \param layer [in] The layer index where the data will be written. */ -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj, int x, int y, int face, int layer) { __HIP_SURFACE_OBJECT_PARAMETERS_INIT diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 3f79d0c7ed..aaad1827e2 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -443,7 +443,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf return backwardPermute(firstLane << 2, result); else { auto tmp = (static_cast(backwardPermute(firstLane << 2, result[1])) << 32) | - static_cast(backwardPermute(firstLane << 2, result[0])); + static_cast(backwardPermute(firstLane << 2, result[0])); return *reinterpret_cast(&tmp); } } diff --git a/projects/clr/hipamd/include/hip/amd_detail/functional_grid_launch.hpp b/projects/clr/hipamd/include/hip/amd_detail/functional_grid_launch.hpp index 65b85e0b8e..bd6e027e14 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/functional_grid_launch.hpp +++ b/projects/clr/hipamd/include/hip/amd_detail/functional_grid_launch.hpp @@ -130,12 +130,9 @@ inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSiz blockSizeLimit); } -template -inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, - T kernel, - size_t dynSharedMemPerBlk = 0, - int blockSizeLimit = 0, - unsigned int flags = 0) { +template inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags( + int* gridSize, int* blockSize, T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, + unsigned int flags = 0) { using namespace hip_impl; hip_impl::hip_init(); diff --git a/projects/clr/hipamd/include/hip/amd_detail/helpers.hpp b/projects/clr/hipamd/include/hip/amd_detail/helpers.hpp index 1011cc1c01..2a8af8e153 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/helpers.hpp +++ b/projects/clr/hipamd/include/hip/amd_detail/helpers.hpp @@ -51,11 +51,11 @@ namespace std { // TODO: these should be removed as soon as possible. #if (__cplusplus < 201406L) #if (__cplusplus < 201402L) template using enable_if_t = typename enable_if::type; -template -using conditional_t = typename conditional::type; +template using conditional_t = + typename conditional::type; template using decay_t = typename decay::type; -template -using result_of_t = typename result_of::type; +template using result_of_t = + typename result_of::type; template using remove_reference_t = typename remove_reference::type; #endif #endif @@ -67,8 +67,8 @@ template using void_t_ = void; #if HIP_HAS_INVOCABLE template struct is_callable_impl; -template -struct is_callable_impl : std::is_invocable {}; +template struct is_callable_impl + : std::is_invocable {}; #elif HIP_HAS_RESULT_OF_SFINAE template struct is_callable_impl : std::false_type {}; @@ -76,11 +76,10 @@ template struct is_callable_impl::type> > : std::true_type {}; #else -template -auto simple_invoke(T Base::* pmd, Derived&& ref) -> decltype(static_cast(ref).*pmd); +template auto simple_invoke(T Base::* pmd, Derived&& ref) + -> decltype(static_cast(ref).*pmd); -template -auto simple_invoke(PMD&& pmd, Pointer&& ptr) +template auto simple_invoke(PMD&& pmd, Pointer&& ptr) -> decltype((*static_cast(ptr)).*static_cast(pmd)); template @@ -100,8 +99,8 @@ template auto simple_invoke(T Base::* pmf, const std::reference_wrapper& ref, Args&&... args) -> decltype((ref.get().*pmf)(static_cast(args)...)); -template -auto simple_invoke(F&& f, Ts&&... xs) -> decltype(f(static_cast(xs)...)); +template auto simple_invoke(F&& f, Ts&&... xs) + -> decltype(f(static_cast(xs)...)); template struct is_callable_impl : std::false_type {}; diff --git a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h index 63227bb86b..eb695e3a70 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h +++ b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h @@ -56,19 +56,19 @@ using lane_mask = unsigned long long int; namespace cooperative_groups { /* Global scope */ -template -using is_power_of_2 = __hip_internal::integral_constant; +template using is_power_of_2 = + __hip_internal::integral_constant; -template -using is_valid_wavefront = __hip_internal::integral_constant; +template using is_valid_wavefront = + __hip_internal::integral_constant; -template -using is_valid_tile_size = __hip_internal::integral_constant< - bool, is_power_of_2::value && is_valid_wavefront::value>; +template using is_valid_tile_size = + __hip_internal::integral_constant::value && + is_valid_wavefront::value>; -template -using is_valid_type = __hip_internal::integral_constant< - bool, __hip_internal::is_integral::value || __hip_internal::is_floating_point::value>; +template using is_valid_type = + __hip_internal::integral_constant::value || + __hip_internal::is_floating_point::value>; namespace internal { diff --git a/projects/clr/hipamd/include/hip/amd_detail/hip_prof_str.h b/projects/clr/hipamd/include/hip/amd_detail/hip_prof_str.h index c4214e4503..d9d85e7bd4 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/hip_prof_str.h +++ b/projects/clr/hipamd/include/hip/amd_detail/hip_prof_str.h @@ -8101,9 +8101,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { break; // hipDeviceGetPCIBusId[('char*', 'pciBusId'), ('int', 'len'), ('int', 'device')] case HIP_API_ID_hipDeviceGetPCIBusId: - data->args.hipDeviceGetPCIBusId.pciBusId = (data->args.hipDeviceGetPCIBusId.pciBusId) - ? strdup(data->args.hipDeviceGetPCIBusId.pciBusId) - : NULL; + data->args.hipDeviceGetPCIBusId.pciBusId = + (data->args.hipDeviceGetPCIBusId.pciBusId) + ? strdup(data->args.hipDeviceGetPCIBusId.pciBusId) + : NULL; break; // hipDeviceGetSharedMemConfig[('hipSharedMemConfig*', 'pConfig')] case HIP_API_ID_hipDeviceGetSharedMemConfig: @@ -8991,9 +8992,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { if (data->args.hipGraphInstantiate.pErrorNode) data->args.hipGraphInstantiate.pErrorNode__val = *(data->args.hipGraphInstantiate.pErrorNode); - data->args.hipGraphInstantiate.pLogBuffer = (data->args.hipGraphInstantiate.pLogBuffer) - ? strdup(data->args.hipGraphInstantiate.pLogBuffer) - : NULL; + data->args.hipGraphInstantiate.pLogBuffer = + (data->args.hipGraphInstantiate.pLogBuffer) + ? strdup(data->args.hipGraphInstantiate.pLogBuffer) + : NULL; break; // hipGraphInstantiateWithFlags[('hipGraphExec_t*', 'pGraphExec'), ('hipGraph_t', 'graph'), // ('unsigned long long', 'flags')] @@ -15959,9 +15961,8 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize); oss << ", dynSharedMemPerBlk="; roctracer::hip_support::detail::operator<<( - oss, - data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags - .dynSharedMemPerBlk); + oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + .dynSharedMemPerBlk); oss << ", flags="; roctracer::hip_support::detail::operator<<( oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags); diff --git a/projects/clr/hipamd/include/hip/amd_detail/host_defines.h b/projects/clr/hipamd/include/hip/amd_detail/host_defines.h index c03907188c..8081966cf7 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/host_defines.h +++ b/projects/clr/hipamd/include/hip/amd_detail/host_defines.h @@ -114,11 +114,11 @@ template struct is_same : public false_type {}; template struct is_same<__T, __T> : public true_type {}; template ::value> struct is_signed : public false_type {}; -template -struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {}; +template struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> { +}; -template -auto test_returnable(int) -> decltype(void(static_cast(nullptr)), true_type{}); +template auto test_returnable(int) + -> decltype(void(static_cast(nullptr)), true_type{}); template auto test_returnable(...) -> false_type; template struct type_identity { @@ -139,8 +139,7 @@ template struct add_rvalue_reference : decltype(try_add_rvalue_referen template typename add_rvalue_reference::type declval() noexcept; -template -auto test_implicitly_convertible(int) +template auto test_implicitly_convertible(int) -> decltype(void(declval()(declval())), true_type{}); template auto test_implicitly_convertible(...) -> false_type; @@ -160,12 +159,10 @@ template struct remove_cv { template struct is_void : public is_same::type> {}; -template -struct is_convertible - : public integral_constant(0))::value && - decltype(test_implicitly_convertible(0))::value) || - (is_void::value && is_void::value)> {}; +template struct is_convertible + : public integral_constant(0))::value && + decltype(test_implicitly_convertible(0))::value) || + (is_void::value && is_void::value)> {}; template struct char_traits; template > class basic_istream; @@ -173,8 +170,8 @@ template > class basic_o typedef basic_istream istream; typedef basic_ostream ostream; -template -struct is_standard_layout : public integral_constant {}; +template struct is_standard_layout + : public integral_constant {}; template struct is_trivial : public integral_constant {}; @@ -195,15 +192,15 @@ template struct integer_sequence { template using index_sequence = integer_sequence; -template -struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {}; +template struct make_index_sequence_impl + : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {}; template struct make_index_sequence_impl<0, Ints...> { using type = index_sequence; }; -template -using make_index_sequence = typename make_index_sequence_impl<_hip_N>::type; +template using make_index_sequence = + typename make_index_sequence_impl<_hip_N>::type; template constexpr index_sequence make_index_sequence_value(index_sequence) { diff --git a/projects/clr/hipamd/include/hip/amd_detail/macro_based_grid_launch.hpp b/projects/clr/hipamd/include/hip/amd_detail/macro_based_grid_launch.hpp index 346f247378..f721f89553 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/macro_based_grid_launch.hpp +++ b/projects/clr/hipamd/include/hip/amd_detail/macro_based_grid_launch.hpp @@ -61,9 +61,9 @@ template RAII_guard make_RAII_guard(const C& ctor return RAII_guard{ctor, std::move(dtor)}; } -template -using is_new_grid_launch_t = typename std::conditional{}, New_grid_launch_tag, - Old_grid_launch_tag>::type; +template using is_new_grid_launch_t = + typename std::conditional{}, New_grid_launch_tag, + Old_grid_launch_tag>::type; } // namespace // TODO: - dispatch rank should be derived from the domain dimensions passed diff --git a/projects/clr/hipamd/include/hip/amd_detail/texture_fetch_functions.h b/projects/clr/hipamd/include/hip/amd_detail/texture_fetch_functions.h index ddbb76c22b..dd1580c1ed 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/texture_fetch_functions.h +++ b/projects/clr/hipamd/include/hip/amd_detail/texture_fetch_functions.h @@ -37,8 +37,8 @@ THE SOFTWARE. (void)s; template struct __hip_is_tex_surf_scalar_channel_type { - static constexpr bool value = __hip_internal::is_same::value || - __hip_internal::is_same::value || + static constexpr bool value = + __hip_internal::is_same::value || __hip_internal::is_same::value || __hip_internal::is_same::value || __hip_internal::is_same::value || __hip_internal::is_same::value || __hip_internal::is_same::value || __hip_internal::is_same::value; @@ -51,12 +51,12 @@ template struct __hip_is_tex_surf_channel_type { template struct __hip_is_tex_surf_channel_type> { static constexpr bool value = __hip_is_tex_surf_scalar_channel_type::value && - ((rank == 1) || (rank == 2) || (rank == 4)); + ((rank == 1) || (rank == 2) || (rank == 4)); }; template struct __hip_is_tex_normalized_channel_type { - static constexpr bool value = __hip_internal::is_same::value || - __hip_internal::is_same::value || + static constexpr bool value = + __hip_internal::is_same::value || __hip_internal::is_same::value || __hip_internal::is_same::value || __hip_internal::is_same::value; }; @@ -73,8 +73,7 @@ template struc /* * Map from device function return U to scalar texture type T */ -template -__forceinline__ __device__ +template __forceinline__ __device__ typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type::value, const T>::type __hipMapFrom(const U& u) { @@ -96,8 +95,7 @@ __forceinline__ __device__ /* * Map from device function return U to vector texture type T */ -template -__forceinline__ __device__ typename __hip_internal::enable_if< +template __forceinline__ __device__ typename __hip_internal::enable_if< __hip_is_tex_surf_scalar_channel_type::value, const T>::type __hipMapFrom(const U& u) { if constexpr (sizeof(typename T::value_type) < sizeof(float)) { @@ -118,8 +116,7 @@ __hipMapFrom(const U& u) { /* * Map from scalar texture type T to device function input U */ -template -__forceinline__ __device__ +template __forceinline__ __device__ typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type::value, const U>::type __hipMapTo(const T& t) { @@ -143,8 +140,7 @@ __forceinline__ __device__ /* * Map from vector texture type T to device function input U */ -template -__forceinline__ __device__ typename __hip_internal::enable_if< +template __forceinline__ __device__ typename __hip_internal::enable_if< __hip_is_tex_surf_scalar_channel_type::value, const U>::type __hipMapTo(const T& t) { if constexpr (sizeof(typename T::value_type) < sizeof(float)) { @@ -164,18 +160,16 @@ __hipMapTo(const T& t) { } } -template -using __hip_tex_ret_t = typename __hip_tex_ret::type; +template using __hip_tex_ret_t = + typename __hip_tex_ret::type; -template -struct __hip_tex_ret< +template struct __hip_tex_ret< T, hipReadModeElementType, typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value, bool>::type> { using type = T; }; -template -struct __hip_tex_ret< +template struct __hip_tex_ret< HIP_vector_type, hipReadModeElementType, typename __hip_internal::enable_if< __hip_is_tex_surf_channel_type>::value, bool>::type> { @@ -189,8 +183,7 @@ struct __hip_tex_ret -struct __hip_tex_ret< +template struct __hip_tex_ret< HIP_vector_type, hipReadModeNormalizedFloat, typename __hip_internal::enable_if< __hip_is_tex_normalized_channel_type>::value, bool>::type> { @@ -421,18 +414,16 @@ struct __hip_tex2dgather_ret { static_assert(__hip_internal::is_same::value, "Invalid channel type!"); }; -template -using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret::type; +template using __hip_tex2dgather_ret_t = + typename __hip_tex2dgather_ret::type; -template -struct __hip_tex2dgather_ret< +template struct __hip_tex2dgather_ret< T, hipReadModeElementType, typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value, bool>::type> { using type = HIP_vector_type; }; -template -struct __hip_tex2dgather_ret< +template struct __hip_tex2dgather_ret< HIP_vector_type, hipReadModeElementType, typename __hip_internal::enable_if< __hip_is_tex_surf_channel_type>::value, bool>::type> { diff --git a/projects/clr/hipamd/include/hip/amd_detail/texture_indirect_functions.h b/projects/clr/hipamd/include/hip/amd_detail/texture_indirect_functions.h index 76453a47cf..f48b3bcf8c 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/texture_indirect_functions.h +++ b/projects/clr/hipamd/include/hip/amd_detail/texture_indirect_functions.h @@ -37,41 +37,36 @@ THE SOFTWARE. unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; \ (void)s; -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x) { TEXTURE_OBJECT_PARAMETERS_INIT auto tmp = __ockl_image_load_1Db(i, x); return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex1Dfetch(T* ptr, hipTextureObject_t textureObject, int x) { *ptr = tex1Dfetch(textureObject, x); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x) { TEXTURE_OBJECT_PARAMETERS_INIT auto tmp = __ockl_image_sample_1D(i, s, x); return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex1D(T* ptr, hipTextureObject_t textureObject, float x) { *ptr = tex1D(textureObject, x); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y) { TEXTURE_OBJECT_PARAMETERS_INIT float2 coords{x, y}; @@ -79,17 +74,15 @@ static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, floa return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex2D(T* ptr, hipTextureObject_t textureObject, float x, float y) { *ptr = tex2D(textureObject, x, y); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -98,17 +91,15 @@ static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, floa return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex3D(T* ptr, hipTextureObject_t textureObject, float x, float y, float z) { *ptr = tex3D(textureObject, x, y, z); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -117,17 +108,15 @@ static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObjec return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex1DLayered(T* ptr, hipTextureObject_t textureObject, float x, int layer) { *ptr = tex1DLayered(textureObject, x, layer); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -136,17 +125,15 @@ static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObjec return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex2DLayered(T* ptr, hipTextureObject_t textureObject, float x, float y, int layer) { *ptr = tex1DLayered(textureObject, x, y, layer); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -155,17 +142,15 @@ static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void texCubemap(T* ptr, hipTextureObject_t textureObject, float x, float y, float z) { *ptr = texCubemap(textureObject, x, y, z); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -174,17 +159,15 @@ static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t texture return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void texCubemapLayered(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer) { *ptr = texCubemapLayered(textureObject, x, y, z, layer); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -214,17 +197,15 @@ static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject return {}; } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex2Dgather(T* ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0) { *ptr = texCubemapLayered(textureObject, x, y, comp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x, float level) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -232,17 +213,15 @@ static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, f return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex1DLod(T* ptr, hipTextureObject_t textureObject, float x, float level) { *ptr = tex1DLod(textureObject, x, level); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -251,17 +230,15 @@ static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, f return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex2DLod(T* ptr, hipTextureObject_t textureObject, float x, float y, float level) { *ptr = tex2DLod(textureObject, x, y, level); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -270,17 +247,15 @@ static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, f return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex3DLod(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, float level) { *ptr = tex3DLod(textureObject, x, y, z, level); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level) { TEXTURE_OBJECT_PARAMETERS_INIT; @@ -290,17 +265,15 @@ static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureOb return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex1DLayeredLod(T* ptr, hipTextureObject_t textureObject, float x, int layer, float level) { *ptr = tex1DLayeredLod(textureObject, x, layer, level); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level) { TEXTURE_OBJECT_PARAMETERS_INIT; @@ -310,17 +283,15 @@ static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureOb return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex2DLayeredLod(T* ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level) { *ptr = tex2DLayeredLod(textureObject, x, y, layer, level); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -329,17 +300,15 @@ static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObje return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void texCubemapLod(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, float level) { *ptr = texCubemapLod(textureObject, x, y, z, level); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy) { TEXTURE_OBJECT_PARAMETERS_INIT; @@ -355,18 +324,16 @@ static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObj return {}; } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void texCubemapGrad(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy) { *ptr = texCubemapGrad(textureObject, x, y, z, dPdx, dPdy); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -375,9 +342,8 @@ static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t text return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void texCubemapLayeredLod(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, @@ -385,9 +351,8 @@ static __device__ __hip_img_chk__ void texCubemapLayeredLod(T* ptr, *ptr = texCubemapLayeredLod(textureObject, x, y, z, layer, level); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -395,17 +360,15 @@ static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex1DGrad(T* ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy) { *ptr = tex1DGrad(textureObject, x, dPdx, dPdy); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -415,17 +378,15 @@ static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex2DGrad(T* ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy) { *ptr = tex2DGrad(textureObject, x, y, dPdx, dPdy); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy) { TEXTURE_OBJECT_PARAMETERS_INIT; @@ -438,17 +399,15 @@ static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex3DGrad(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy) { *ptr = tex3DGrad(textureObject, x, y, z, dPdx, dPdy); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -457,18 +416,16 @@ static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureO return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex1DLayeredGrad(T* ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy) { *ptr = tex1DLayeredGrad(textureObject, x, layer, dPdx, dPdy); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy) { TEXTURE_OBJECT_PARAMETERS_INIT @@ -478,18 +435,16 @@ static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureO return __hipMapFrom(tmp); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void tex2DLayeredGrad(T* ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy) { *ptr = tex2DLayeredGrad(textureObject, x, y, layer, dPdx, dPdy); } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) { @@ -507,9 +462,8 @@ static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t tex return {}; } -template < - typename T, - typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type::value>::type* = nullptr> +template ::value>::type* = nullptr> static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, diff --git a/projects/clr/hipamd/src/hip_comgr_helper.cpp b/projects/clr/hipamd/src/hip_comgr_helper.cpp index 8642e12598..8f700eb85c 100644 --- a/projects/clr/hipamd/src/hip_comgr_helper.cpp +++ b/projects/clr/hipamd/src/hip_comgr_helper.cpp @@ -156,8 +156,8 @@ bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id, static inline unsigned int getGenericVersion(const void* image) { const Elf64_Ehdr* ehdr = reinterpret_cast(image); return ehdr->e_ident[EI_ABIVERSION] == ELFABIVERSION_AMDGPU_HSA_V6 - ? ((ehdr->e_flags & EF_AMDGPU_GENERIC_VERSION) >> EF_AMDGPU_GENERIC_VERSION_OFFSET) - : 0; + ? ((ehdr->e_flags & EF_AMDGPU_GENERIC_VERSION) >> EF_AMDGPU_GENERIC_VERSION_OFFSET) + : 0; } static inline bool isGenericTarget(const void* image) { @@ -178,10 +178,9 @@ bool UnbundleBitCode(const std::vector& bundled_llvm_bitcode, const std::s const void* data = reinterpret_cast(bundled_llvm_bitcode_s.c_str()); const auto obheader = reinterpret_cast(data); const auto* desc = &obheader->desc[0]; - for (uint64_t idx = 0; idx < obheader->numOfCodeObjects; ++idx, - desc = reinterpret_cast( - reinterpret_cast(&desc->bundleEntryId[0]) + - desc->bundleEntryIdSize)) { + for (uint64_t idx = 0; idx < obheader->numOfCodeObjects; + ++idx, desc = reinterpret_cast( + reinterpret_cast(&desc->bundleEntryId[0]) + desc->bundleEntryIdSize)) { const void* image = reinterpret_cast(reinterpret_cast(obheader) + desc->offset); const size_t image_size = desc->size; @@ -736,9 +735,8 @@ bool demangleName(const std::string& mangledName, std::string& demangledName) { demangledName.resize(demangled_size); - if (AMD_COMGR_STATUS_SUCCESS != - amd::Comgr::get_data(demangled_data, &demangled_size, - const_cast(demangledName.data()))) { + if (AMD_COMGR_STATUS_SUCCESS != amd::Comgr::get_data(demangled_data, &demangled_size, + const_cast(demangledName.data()))) { amd::Comgr::release_data(mangled_data); amd::Comgr::release_data(demangled_data); return false; diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index 317df7e7a8..f4f8c10416 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -135,7 +135,7 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) { command->awaitCompletion(); ms = static_cast(static_cast(command->event().profilingInfo().end_) - time(false)) / - 1000000.f; + 1000000.f; command->release(); } else { // Note: with direct dispatch eStop.ready() relies on HW event, but CPU status can be delayed. @@ -210,7 +210,8 @@ hipError_t Event::streamWait(hip::Stream* stream, uint flags) { hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t ext_flags, bool batch_flush) { if (command == nullptr) { - int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) & + int32_t releaseFlags = + ((ext_flags == 0) ? flags_ : ext_flags) & (hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence); if (releaseFlags & hipEventDisableSystemFence) { releaseFlags = amd::Device::kCacheStateIgnore; @@ -269,8 +270,8 @@ bool isValid(hipEvent_t event) { // ================================================================================================ hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) { unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming | - hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventInterprocess | - hipEventDisableSystemFence; + hipEventReleaseToDevice | hipEventReleaseToSystem | + hipEventInterprocess | hipEventDisableSystemFence; const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence); @@ -284,7 +285,7 @@ hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) { } return bitcount; }(flags & releaseFlags) > 1) || - ((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming)); + ((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming)); if (!illegalFlags) { hip::Event* e = nullptr; if (flags & hipEventInterprocess) { diff --git a/projects/clr/hipamd/src/hip_fatbin.cpp b/projects/clr/hipamd/src/hip_fatbin.cpp index c8b86acfcf..f24cbb40e5 100644 --- a/projects/clr/hipamd/src/hip_fatbin.cpp +++ b/projects/clr/hipamd/src/hip_fatbin.cpp @@ -37,10 +37,9 @@ template class ComgrUniqueHandle { // constructor which takes ownership of a correctly initialzed handle ComgrUniqueHandle(comgr_T& handle) : comgr_obj_(handle) { handle = {0}; }; - template || - std::is_same_v, - bool> = true> + template || + std::is_same_v, + bool> = true> [[nodiscard]] amd_comgr_status_t Create() { if constexpr (std::is_same_v) { return amd::Comgr::create_data_set(&comgr_obj_); @@ -736,9 +735,9 @@ hipError_t FatBinaryInfo::BuildProgram(const int device_id) { // If Program was already built skip this step and return success if (dev_programs_[device_id]->IsProgramBuilt(*g_devices[device_id]->devices()[0]) == false) { - if (CL_SUCCESS != - dev_programs_[device_id]->build(g_devices[device_id]->devices(), nullptr, nullptr, nullptr, - kOptionChangeable, kNewDevProg)) { + if (CL_SUCCESS != dev_programs_[device_id]->build(g_devices[device_id]->devices(), nullptr, + nullptr, nullptr, kOptionChangeable, + kNewDevProg)) { return hipErrorNoBinaryForGpu; } if (!dev_programs_[device_id]->load()) { diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 23e6164b2c..4fbd7d32b7 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -581,8 +581,8 @@ bool Graph::RunOneNode(Node node, bool wait) { for (auto edge : node->GetEdges()) { // Don't wait in the nodes, executed on the same streams and if it has just one dependency bool wait = ((i < DEBUG_HIP_FORCE_GRAPH_QUEUES) || (edge->GetDependencies().size() > 1)) - ? true - : false; + ? true + : false; // Execute the edge node if (!RunOneNode(edge, wait)) { return false; diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index 5c6a2692c6..b66b95c9f9 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -366,9 +366,8 @@ class GraphNode : public hipGraphNodeDOTAttribute { virtual void EnqueueCommands(hip::Stream* stream) { // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker. // Node can be enabled/disabled only for kernel, memcpy and memset nodes. - if (!isEnabled_ && - (type_ == hipGraphNodeTypeKernel || type_ == hipGraphNodeTypeMemcpy || - type_ == hipGraphNodeTypeMemset)) { + if (!isEnabled_ && (type_ == hipGraphNodeTypeKernel || type_ == hipGraphNodeTypeMemcpy || + type_ == hipGraphNodeTypeMemset)) { amd::Command::EventWaitList waitList; if (!commands_.empty()) { waitList = commands_[0]->eventWaitList(); @@ -1677,7 +1676,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { label = buffer; } else { label = std::to_string(GetID()) + "\n" + label_ + "\n(" + memcpyDirection + "," + - std::to_string(count_) + ")"; + std::to_string(count_) + ")"; } return label; } @@ -1948,7 +1947,7 @@ class GraphMemsetNode : public GraphNode { sizeBytes = memsetParams_.width * memsetParams_.height * depth_ * memsetParams_.elementSize; } label = std::to_string(GetID()) + "\n" + label_ + "\n(" + - std::to_string(memsetParams_.value) + "," + std::to_string(sizeBytes) + ")"; + std::to_string(memsetParams_.value) + "," + std::to_string(sizeBytes) + ")"; } return label; } diff --git a/projects/clr/hipamd/src/hip_hmm.cpp b/projects/clr/hipamd/src/hip_hmm.cpp index b4e1170dc7..5801df1032 100755 --- a/projects/clr/hipamd/src/hip_hmm.cpp +++ b/projects/clr/hipamd/src/hip_hmm.cpp @@ -227,8 +227,8 @@ hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr, size_t len // This type of memory may only be specified if the device associated with the // stream reports a non-zero value for the device attribute hipDevAttrPageableMemoryAccess. hip::Stream* hip_stream = (stream == nullptr || stream == hipStreamLegacy) - ? hip::getCurrentDevice()->NullStream() - : hip::getStream(stream); + ? hip::getCurrentDevice()->NullStream() + : hip::getStream(stream); size_t offset = 0; amd::Memory* memObj = getMemoryObject(dev_ptr, offset); if (memObj == nullptr) { @@ -328,13 +328,13 @@ hipError_t ihipMemPrefetchAsync(const void* dev_ptr, size_t count, hipMemLocatio // Pick the specified stream or Null one from the provided target device if (cpuAccess == true) { hip_stream = (stream == nullptr || stream == hipStreamLegacy) - ? hip::getCurrentDevice()->NullStream() - : hip::getStream(stream); + ? hip::getCurrentDevice()->NullStream() + : hip::getStream(stream); } else { dev = g_devices[targetDevice]->devices()[0]; hip_stream = (stream == nullptr || stream == hipStreamLegacy) - ? g_devices[targetDevice]->NullStream() - : hip::getStream(stream); + ? g_devices[targetDevice]->NullStream() + : hip::getStream(stream); } if (hip_stream == nullptr) { diff --git a/projects/clr/hipamd/src/hip_internal.hpp b/projects/clr/hipamd/src/hip_internal.hpp index eb734e7080..c476d9a798 100644 --- a/projects/clr/hipamd/src/hip_internal.hpp +++ b/projects/clr/hipamd/src/hip_internal.hpp @@ -327,9 +327,9 @@ class Stream : public amd::HostQueue { unsigned long long captureID_; static inline CommandQueue::Priority convertToQueuePriority(Priority p) { - return p == Priority::High ? amd::CommandQueue::Priority::High - : p == Priority::Low ? amd::CommandQueue::Priority::Low - : amd::CommandQueue::Priority::Normal; + return p == Priority::High ? amd::CommandQueue::Priority::High + : p == Priority::Low ? amd::CommandQueue::Priority::Low + : amd::CommandQueue::Priority::Normal; } public: diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index e02b44fac7..772ee9934d 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -67,8 +67,8 @@ hipMemoryType getMemoryType(const amd::Memory* memory) { } return ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & memory->getMemFlags()) - ? hipMemoryTypeHost - : hipMemoryTypeDevice; + ? hipMemoryTypeHost + : hipMemoryTypeDevice; } // ================================================================================================ @@ -336,8 +336,8 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { hip::getCurrentDevice()->SetActiveStatus(); size_t max_device_size = IS_LINUX - ? dev_info.maxMemAllocSize_ - : (dev_info.maxMemAllocSize_ + dev_info.maxPhysicalMemAllocSize_); + ? dev_info.maxMemAllocSize_ + : (dev_info.maxMemAllocSize_ + dev_info.maxPhysicalMemAllocSize_); if ((useHostDevice && dev_info.maxPhysicalMemAllocSize_ < sizeBytes) || (!useHostDevice && max_device_size < sizeBytes)) { @@ -401,9 +401,8 @@ hipError_t ihipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { } if (flags == 0 || - flags & - (hipHostMallocCoherent | hipHostMallocMapped | hipHostMallocNumaUser | - hipHostMallocUncached) || + flags & (hipHostMallocCoherent | hipHostMallocMapped | hipHostMallocNumaUser | + hipHostMallocUncached) || (!(flags & hipHostMallocNonCoherent) && HIP_HOST_COHERENT)) { ihipFlags |= CL_MEM_SVM_ATOMICS; } @@ -1143,7 +1142,7 @@ hipError_t ihipArrayCreate(hipArray_t* array, const HIP_ARRAY3D_DESCRIPTOR* pAll return hipErrorInvalidValue; } unsigned int flags = hipArrayDefault | hipArrayLayered | hipArraySurfaceLoadStore | - hipArrayTextureGather; // hipArrayCubemap isn't supported + hipArrayTextureGather; // hipArrayCubemap isn't supported if (pAllocateArray->Flags & (~flags)) { return hipErrorInvalidValue; } @@ -1282,9 +1281,8 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) { hipError_t ihipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) { if (hostPtr == nullptr || sizeBytes == 0 || - flags & - ~(hipHostRegisterPortable | hipHostRegisterMapped | hipExtHostRegisterCoarseGrained | - hipExtHostRegisterUncached)) { + flags & ~(hipHostRegisterPortable | hipHostRegisterMapped | hipExtHostRegisterCoarseGrained | + hipExtHostRegisterUncached)) { return hipErrorInvalidValue; } else { unsigned int memFlags = CL_MEM_USE_HOST_PTR | CL_MEM_SVM_ATOMICS; @@ -1377,9 +1375,8 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) { if (ptr == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - if (flags & - ~(hipHostAllocPortable | hipHostAllocMapped | hipHostAllocWriteCombined | - hipHostAllocUncached)) { + if (flags & ~(hipHostAllocPortable | hipHostAllocMapped | hipHostAllocWriteCombined | + hipHostAllocUncached)) { HIP_RETURN(hipErrorInvalidValue); } @@ -1868,9 +1865,9 @@ hipError_t ihipMemcpyHtoH(void* dstHost, const void* srcHost, amd::Coord3D copyR for (size_t slice = 0; slice < copyRegion[2]; slice++) { for (size_t row = 0; row < copyRegion[1]; row++) { const void* srcRow = static_cast(srcHost) + srcRect.start_ + - row * srcRect.rowPitch_ + slice * srcRect.slicePitch_; + row * srcRect.rowPitch_ + slice * srcRect.slicePitch_; void* dstRow = static_cast(dstHost) + dstRect.start_ + row * dstRect.rowPitch_ + - slice * dstRect.slicePitch_; + slice * dstRect.slicePitch_; std::memcpy(dstRow, srcRow, copyRegion[0]); } } @@ -2331,9 +2328,8 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool // Transfers from device memory to pageable host memory and transfers from any // host memory to any host memory are synchronous with respect to the host. // Device to Device copies do not need to host side synchronization. - if (dstMemoryType == hipMemoryTypeHost || - ((pCopy->srcMemoryType == hipMemoryTypeHost) && - (pCopy->dstMemoryType == hipMemoryTypeHost))) { + if (dstMemoryType == hipMemoryTypeHost || ((pCopy->srcMemoryType == hipMemoryTypeHost) && + (pCopy->dstMemoryType == hipMemoryTypeHost))) { isAsync = false; } else if ((pCopy->srcMemoryType == hipMemoryTypeDevice) && (pCopy->dstMemoryType == hipMemoryTypeDevice)) { @@ -4111,7 +4107,7 @@ hipError_t ihipMipmapArrayCreate(hipMipmappedArray_t* mipmapped_array_pptr, return hipErrorInvalidValue; } unsigned int flags = hipArrayDefault | hipArrayLayered | hipArraySurfaceLoadStore | - hipArrayTextureGather; // hipArrayCubemap isn't supported + hipArrayTextureGather; // hipArrayCubemap isn't supported if (mipmapped_array_desc_ptr->Flags & (~flags)) { return hipErrorInvalidValue; } diff --git a/projects/clr/hipamd/src/hip_mempool.cpp b/projects/clr/hipamd/src/hip_mempool.cpp index 08ede5ed00..6fcb3c869c 100644 --- a/projects/clr/hipamd/src/hip_mempool.cpp +++ b/projects/clr/hipamd/src/hip_mempool.cpp @@ -380,8 +380,8 @@ hipError_t hipMallocFromPoolAsync(void** dev_ptr, size_t size, hipMemPool_t mem_ auto mpool = reinterpret_cast(mem_pool); auto hip_stream = (stream == nullptr || stream == hipStreamLegacy) - ? hip::getCurrentDevice()->NullStream() - : reinterpret_cast(stream); + ? hip::getCurrentDevice()->NullStream() + : reinterpret_cast(stream); *dev_ptr = mpool->AllocateMemory(size, hip_stream); if (*dev_ptr == nullptr) { HIP_RETURN(hipErrorOutOfMemory); diff --git a/projects/clr/hipamd/src/hip_mempool_impl.cpp b/projects/clr/hipamd/src/hip_mempool_impl.cpp index 8361ab6861..8508253f99 100644 --- a/projects/clr/hipamd/src/hip_mempool_impl.cpp +++ b/projects/clr/hipamd/src/hip_mempool_impl.cpp @@ -422,9 +422,9 @@ hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) { break; case hipMemPoolAttrReservedMemCurrent: // All allocated memory by the pool in OS - *reinterpret_cast(value) = (state_.use_vm_heap_) - ? MappedSize() - : (busy_heap_.GetTotalSize() + free_heap_.GetTotalSize()); + *reinterpret_cast(value) = + (state_.use_vm_heap_) ? MappedSize() + : (busy_heap_.GetTotalSize() + free_heap_.GetTotalSize()); break; case hipMemPoolAttrReservedMemHigh: // High watermark of all allocated memory in OS, since the last reset diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index 9b61b8d1e8..df51566447 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -165,7 +165,7 @@ hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunc case HIP_FUNC_ATTRIBUTE_PTX_VERSION: case HIP_FUNC_ATTRIBUTE_BINARY_VERSION: *value = hip::getCurrentDevice()->devices()[0]->isa().versionMajor() * 10 + - hip::getCurrentDevice()->devices()[0]->isa().versionMinor(); + hip::getCurrentDevice()->devices()[0]->isa().versionMinor(); break; case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA: *value = 0; @@ -224,9 +224,8 @@ hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int valu (device::Kernel*)(kernel->getDeviceKernel(*(hip::getCurrentDevice()->devices()[0]))); if (attr == hipFuncAttributeMaxDynamicSharedMemorySize) { - if ((value < 0) || - (value > (d_kernel->workGroupInfo()->availableLDSSize_ - - d_kernel->workGroupInfo()->localMemSize_))) { + if ((value < 0) || (value > (d_kernel->workGroupInfo()->availableLDSSize_ - + d_kernel->workGroupInfo()->localMemSize_))) { HIP_RETURN(hipErrorInvalidValue); } d_kernel->workGroupInfo()->maxDynamicSharedSizeBytes_ = value; diff --git a/projects/clr/hipamd/src/hip_texture.cpp b/projects/clr/hipamd/src/hip_texture.cpp index d594446646..ac6a4bbc94 100644 --- a/projects/clr/hipamd/src/hip_texture.cpp +++ b/projects/clr/hipamd/src/hip_texture.cpp @@ -79,9 +79,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso // pResViewDesc can only be specified if the type of resource is a HIP array or a HIP mipmapped // array. - if ((pResViewDesc != nullptr) && - ((pResDesc->resType != hipResourceTypeArray) && - (pResDesc->resType != hipResourceTypeMipmappedArray))) { + if ((pResViewDesc != nullptr) && ((pResDesc->resType != hipResourceTypeArray) && + (pResDesc->resType != hipResourceTypeMipmappedArray))) { return hipErrorUnknown; } @@ -176,9 +175,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso // hipAddressModeWrap and hipAddressModeMirror won't be supported // and will be switched to hipAddressModeClamp. for (int i = 0; i < 3; i++) { - if ((pTexDesc->normalizedCoords == 0) && - ((pTexDesc->addressMode[i] == hipAddressModeWrap) || - (pTexDesc->addressMode[i] == hipAddressModeMirror))) { + if ((pTexDesc->normalizedCoords == 0) && ((pTexDesc->addressMode[i] == hipAddressModeWrap) || + (pTexDesc->addressMode[i] == hipAddressModeMirror))) { addressMode[i] = hip::getCLAddressingMode(hipAddressModeClamp); } // hipTextureDesc::addressMode is ignored if hipResourceDesc::resType is hipResourceTypeLinear @@ -237,12 +235,14 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso if ((pResViewDesc != nullptr) || (readMode == hipReadModeNormalizedFloat) || (pTexDesc->sRGB == 1)) { // TODO ROCclr currently right now can only change the format of the image. - const cl_channel_order channelOrder = (pResViewDesc != nullptr) - ? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB) - : hip::getCLChannelOrder(pResDesc->res.array.array->NumChannels, pTexDesc->sRGB); - const cl_channel_type channelType = (pResViewDesc != nullptr) - ? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode) - : hip::getCLChannelType(pResDesc->res.array.array->Format, readMode); + const cl_channel_order channelOrder = + (pResViewDesc != nullptr) + ? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB) + : hip::getCLChannelOrder(pResDesc->res.array.array->NumChannels, pTexDesc->sRGB); + const cl_channel_type channelType = + (pResViewDesc != nullptr) + ? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode) + : hip::getCLChannelType(pResDesc->res.array.array->Format, readMode); const amd::Image::Format imageFormat(cl_image_format{channelOrder, channelType}); if (!imageFormat.isValid()) { return hipErrorInvalidValue; @@ -277,12 +277,14 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso if ((pResViewDesc != nullptr) || (readMode == hipReadModeNormalizedFloat) || (pTexDesc->sRGB == 1)) { // TODO ROCclr currently right now can only change the format of the image. - const cl_channel_order channelOrder = (pResViewDesc != nullptr) - ? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB) - : hip::getCLChannelOrder(pResDesc->res.mipmap.mipmap->num_channels, pTexDesc->sRGB); - const cl_channel_type channelType = (pResViewDesc != nullptr) - ? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode) - : hip::getCLChannelType(pResDesc->res.mipmap.mipmap->format, readMode); + const cl_channel_order channelOrder = + (pResViewDesc != nullptr) + ? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB) + : hip::getCLChannelOrder(pResDesc->res.mipmap.mipmap->num_channels, pTexDesc->sRGB); + const cl_channel_type channelType = + (pResViewDesc != nullptr) + ? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode) + : hip::getCLChannelType(pResDesc->res.mipmap.mipmap->format, readMode); const amd::Image::Format imageFormat(cl_image_format{channelOrder, channelType}); if (!imageFormat.isValid()) { return hipErrorInvalidValue; @@ -335,7 +337,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode); const amd::Image::Format imageFormat({channelOrder, channelType}); const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType); - const size_t imageSizeInBytes = pResDesc->res.pitch2D.width * imageFormat.getElementSize() + + const size_t imageSizeInBytes = + pResDesc->res.pitch2D.width * imageFormat.getElementSize() + pResDesc->res.pitch2D.pitchInBytes * (pResDesc->res.pitch2D.height - 1); amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes); diff --git a/projects/clr/opencl/amdocl/cl_common.hpp b/projects/clr/opencl/amdocl/cl_common.hpp index 43fbdba08e..6d817d7bed 100644 --- a/projects/clr/opencl/amdocl/cl_common.hpp +++ b/projects/clr/opencl/amdocl/cl_common.hpp @@ -36,9 +36,9 @@ int checkContextProperties(const cl_context_properties* properties, bool* offlin namespace amd { -template -static inline cl_int clGetInfo(T& field, size_t param_value_size, void* param_value, - size_t* param_value_size_ret) { +template static inline cl_int clGetInfo(T& field, size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) { const void* valuePtr; size_t valueSize; diff --git a/projects/clr/opencl/amdocl/cl_event.cpp b/projects/clr/opencl/amdocl/cl_event.cpp index 4dfc9c0b94..1dd1823d04 100644 --- a/projects/clr/opencl/amdocl/cl_event.cpp +++ b/projects/clr/opencl/amdocl/cl_event.cpp @@ -164,9 +164,10 @@ RUNTIME_ENTRY(cl_int, clGetEventInfo, } case CL_EVENT_COMMAND_QUEUE: { amd::Command& command = as_amd(event)->command(); - cl_command_queue queue = command.queue() == NULL - ? NULL - : const_cast(as_cl(command.queue()->asCommandQueue())); + cl_command_queue queue = + command.queue() == NULL + ? NULL + : const_cast(as_cl(command.queue()->asCommandQueue())); return amd::clGetInfo(queue, param_value_size, param_value, param_value_size_ret); } case CL_EVENT_COMMAND_TYPE: { diff --git a/projects/clr/opencl/amdocl/cl_gl.cpp b/projects/clr/opencl/amdocl/cl_gl.cpp index b6b6535267..e2830f32cf 100644 --- a/projects/clr/opencl/amdocl/cl_gl.cpp +++ b/projects/clr/opencl/amdocl/cl_gl.cpp @@ -885,9 +885,8 @@ RUNTIME_ENTRY(cl_int, clGetGLContextInfoKHR, for (cl_uint i = 0; i < num_gpu_devices; ++i) { cl_device_id device = gpu_devices[i]; - if (is_valid(device) && - as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_, - VALIDATE_ONLY)) { + if (is_valid(device) && as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, + info.hCtx_, VALIDATE_ONLY)) { return amd::clGetInfo(device, param_value_size, param_value, param_value_size_ret); } } @@ -912,9 +911,8 @@ RUNTIME_ENTRY(cl_int, clGetGLContextInfoKHR, for (cl_uint i = 0; i < total_devices; ++i) { cl_device_id device = devices[i]; - if (is_valid(device) && - as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_, - VALIDATE_ONLY)) { + if (is_valid(device) && as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, + info.hCtx_, VALIDATE_ONLY)) { compatible_devices.push_back(as_amd(device)); } } diff --git a/projects/clr/opencl/amdocl/cl_memobj.cpp b/projects/clr/opencl/amdocl/cl_memobj.cpp index e37f543e7a..13b42d4df3 100644 --- a/projects/clr/opencl/amdocl/cl_memobj.cpp +++ b/projects/clr/opencl/amdocl/cl_memobj.cpp @@ -70,12 +70,10 @@ static bool validateFlags(cl_mem_flags flags, bool chkReadWrite = false) { temp |= (flags & CL_MEM_KERNEL_READ_AND_WRITE); } - if (temp && - !(CL_MEM_READ_WRITE == temp || CL_MEM_WRITE_ONLY == temp || - (chkReadWrite && - (CL_MEM_KERNEL_READ_AND_WRITE == temp || - (CL_MEM_KERNEL_READ_AND_WRITE | CL_MEM_READ_WRITE) == temp)) || - CL_MEM_READ_ONLY == temp)) { + if (temp && !(CL_MEM_READ_WRITE == temp || CL_MEM_WRITE_ONLY == temp || + (chkReadWrite && (CL_MEM_KERNEL_READ_AND_WRITE == temp || + (CL_MEM_KERNEL_READ_AND_WRITE | CL_MEM_READ_WRITE) == temp)) || + CL_MEM_READ_ONLY == temp)) { return false; } @@ -89,9 +87,8 @@ static bool validateFlags(cl_mem_flags flags, bool chkReadWrite = false) { } if ((flags & CL_MEM_EXTERNAL_PHYSICAL_AMD) && - (flags & - (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE | - CL_MEM_READ_ONLY))) { + (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | + CL_MEM_READ_WRITE | CL_MEM_READ_ONLY))) { return false; } @@ -414,9 +411,8 @@ RUNTIME_ENTRY_RET(cl_mem, clCreateBuffer, // check extensions flag consistency if ((flags & CL_MEM_USE_PERSISTENT_MEM_AMD) && - (flags & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD | - CL_MEM_BUS_ADDRESSABLE_AMD))) { + (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD | + CL_MEM_BUS_ADDRESSABLE_AMD))) { *not_null(errcode_ret) = CL_INVALID_VALUE; LogWarning("conflicting flags CL_MEM_USE_PERSISTENT_MEM_AMD and host memory specific flags"); return (cl_mem)0; @@ -901,9 +897,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyBuffer, return CL_INVALID_VALUE; } - if (srcBuffer == dstBuffer && - ((src_offset <= dst_offset && dst_offset < src_offset + cb) || - (dst_offset <= src_offset && src_offset < dst_offset + cb))) { + if (srcBuffer == dstBuffer && ((src_offset <= dst_offset && dst_offset < src_offset + cb) || + (dst_offset <= src_offset && src_offset < dst_offset + cb))) { return CL_MEM_COPY_OVERLAP; } diff --git a/projects/clr/opencl/amdocl/cl_p2p_amd.cpp b/projects/clr/opencl/amdocl/cl_p2p_amd.cpp index b59e8a3c6f..c87b03e9c8 100644 --- a/projects/clr/opencl/amdocl/cl_p2p_amd.cpp +++ b/projects/clr/opencl/amdocl/cl_p2p_amd.cpp @@ -60,9 +60,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyBufferP2PAMD, return CL_INVALID_VALUE; } - if (srcBuffer == dstBuffer && - ((src_offset <= dst_offset && dst_offset < src_offset + cb) || - (dst_offset <= src_offset && src_offset < dst_offset + cb))) { + if (srcBuffer == dstBuffer && ((src_offset <= dst_offset && dst_offset < src_offset + cb) || + (dst_offset <= src_offset && src_offset < dst_offset + cb))) { return CL_MEM_COPY_OVERLAP; } diff --git a/projects/clr/opencl/amdocl/cl_program.cpp b/projects/clr/opencl/amdocl/cl_program.cpp index 31f1baaef9..e2a21dce44 100644 --- a/projects/clr/opencl/amdocl/cl_program.cpp +++ b/projects/clr/opencl/amdocl/cl_program.cpp @@ -1833,7 +1833,7 @@ RUNTIME_ENTRY(cl_int, clGetKernelWorkGroupInfo, // Return the amount of used local memory const size_t align = amdDevice.info().minDataTypeAlignSize_; cl_ulong memSize = as_amd(kernel)->parameters().localMemSize(align) + - amd::alignUp(devKernel->workGroupInfo()->localMemSize_, align); + amd::alignUp(devKernel->workGroupInfo()->localMemSize_, align); return amd::clGetInfo(memSize, param_value_size, param_value, param_value_size_ret); } case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { diff --git a/projects/clr/opencl/khronos/headers/opencl1.2/CL/cl.hpp b/projects/clr/opencl/khronos/headers/opencl1.2/CL/cl.hpp index 6ee05a0ac0..1514d4687d 100644 --- a/projects/clr/opencl/khronos/headers/opencl1.2/CL/cl.hpp +++ b/projects/clr/opencl/khronos/headers/opencl1.2/CL/cl.hpp @@ -1021,9 +1021,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, long) * does not work, because when using a derived type (e.g. Context) the generic * template will provide a better match. */ -template -inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int, - typename T::cl_type = 0) { +template inline cl_int getInfoHelper(Func f, cl_uint name, + VECTOR_CLASS* param, int, + typename T::cl_type = 0) { ::size_t required; cl_int err = f(name, 0, NULL, &required); if (err != CL_SUCCESS) { @@ -2743,12 +2743,10 @@ template cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer); template cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator); -template -cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer); -template -cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator, - IteratorType endIterator); +template cl_int copy(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer); +template cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, + IteratorType startIterator, IteratorType endIterator); /*! \brief Class interface for Buffer Memory Objects. @@ -2804,9 +2802,9 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly, - bool useHostPtr = false, cl_int* err = NULL) { + template Buffer(IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, + cl_int* err = NULL) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -2850,17 +2848,17 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); /*! * \brief Construct a Buffer from a host container via iterators using a specified queue. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); //! \brief Default constructor - initializes to NULL. Buffer() : Memory() {} @@ -5321,8 +5319,8 @@ class CommandQueue : public detail::Wrapper { const VECTOR_CLASS* mem_locs = NULL, const VECTOR_CLASS* events = NULL, Event* event = NULL) const { cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0) - ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem)) - : NULL; + ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem)) + : NULL; if (mems != NULL) { for (unsigned int i = 0; i < mem_objects->size(); i++) { @@ -5512,9 +5510,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_; __attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; #endif // !_WIN32 -template -Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr, cl_int* err) { +template Buffer::Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr, cl_int* err) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -5716,9 +5714,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato * Host to Device. * Uses specified queue. */ -template -inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer) { +template inline cl_int copy(const CommandQueue& queue, + IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer) { typedef typename std::iterator_traits::value_type DataType; cl_int error; diff --git a/projects/clr/opencl/khronos/headers/opencl1.2/CL/cl_platform.h b/projects/clr/opencl/khronos/headers/opencl1.2/CL/cl_platform.h index a827c0a500..a822c1074d 100644 --- a/projects/clr/opencl/khronos/headers/opencl1.2/CL/cl_platform.h +++ b/projects/clr/opencl/khronos/headers/opencl1.2/CL/cl_platform.h @@ -59,7 +59,7 @@ extern "C" { #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED \ CL_EXTENSION_WEAK_LINK \ - AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER @@ -68,7 +68,7 @@ extern "C" { #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED \ CL_EXTENSION_WEAK_LINK \ - AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 #else #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER diff --git a/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl.hpp b/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl.hpp index 6ee05a0ac0..1514d4687d 100644 --- a/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl.hpp +++ b/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl.hpp @@ -1021,9 +1021,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, long) * does not work, because when using a derived type (e.g. Context) the generic * template will provide a better match. */ -template -inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int, - typename T::cl_type = 0) { +template inline cl_int getInfoHelper(Func f, cl_uint name, + VECTOR_CLASS* param, int, + typename T::cl_type = 0) { ::size_t required; cl_int err = f(name, 0, NULL, &required); if (err != CL_SUCCESS) { @@ -2743,12 +2743,10 @@ template cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer); template cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator); -template -cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer); -template -cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator, - IteratorType endIterator); +template cl_int copy(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer); +template cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, + IteratorType startIterator, IteratorType endIterator); /*! \brief Class interface for Buffer Memory Objects. @@ -2804,9 +2802,9 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly, - bool useHostPtr = false, cl_int* err = NULL) { + template Buffer(IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, + cl_int* err = NULL) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -2850,17 +2848,17 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); /*! * \brief Construct a Buffer from a host container via iterators using a specified queue. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); //! \brief Default constructor - initializes to NULL. Buffer() : Memory() {} @@ -5321,8 +5319,8 @@ class CommandQueue : public detail::Wrapper { const VECTOR_CLASS* mem_locs = NULL, const VECTOR_CLASS* events = NULL, Event* event = NULL) const { cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0) - ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem)) - : NULL; + ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem)) + : NULL; if (mems != NULL) { for (unsigned int i = 0; i < mem_objects->size(); i++) { @@ -5512,9 +5510,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_; __attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; #endif // !_WIN32 -template -Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr, cl_int* err) { +template Buffer::Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr, cl_int* err) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -5716,9 +5714,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato * Host to Device. * Uses specified queue. */ -template -inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer) { +template inline cl_int copy(const CommandQueue& queue, + IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer) { typedef typename std::iterator_traits::value_type DataType; cl_int error; diff --git a/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl2.hpp b/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl2.hpp index 5c40a4bb19..bb1bb6ece4 100644 --- a/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl2.hpp +++ b/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl2.hpp @@ -1765,9 +1765,8 @@ template inline bool operator!=(const Wrapper& lhs, const Wrappe using BuildLogType = - vector::param_type>>; + vector::param_type>>; #if defined(CL_HPP_ENABLE_EXCEPTIONS) /** * Exception class for build errors to carry build info @@ -2961,12 +2960,10 @@ template cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer); template cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator); -template -cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer); -template -cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator, - IteratorType endIterator); +template cl_int copy(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer); +template cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, + IteratorType startIterator, IteratorType endIterator); #if CL_HPP_TARGET_OPENCL_VERSION >= 200 @@ -3053,8 +3050,8 @@ template class SVMAllocator { SVMAllocator(const SVMAllocator& other) : context_(other.context_) {} - template - SVMAllocator(const SVMAllocator& other) : context_(other.context_) {} + template SVMAllocator(const SVMAllocator& other) + : context_(other.context_) {} ~SVMAllocator() {} @@ -3272,9 +3269,9 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly, - bool useHostPtr = false, cl_int* err = NULL) { + template Buffer(IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, + cl_int* err = NULL) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -3318,17 +3315,17 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); /*! * \brief Construct a Buffer from a host container via iterators using a specified queue. * If useHostPtr is specified iterators must be random access. */ - template - Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); //! \brief Default constructor - initializes to NULL. Buffer() : Memory() {} @@ -4828,8 +4825,7 @@ template struct KernelArgumentHandler; // Enable for objects that are not subclasses of memory // Pointers, constants etc -template -struct KernelArgumentHandler< +template struct KernelArgumentHandler< T, typename std::enable_if::value>::type> { static size_type size(const T&) { return sizeof(T); } static const T* ptr(const T& value) { return &value; } @@ -4992,9 +4988,8 @@ class Kernel : public detail::Wrapper { __GET_KERNEL_ARG_INFO_ERR); } - template - size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range, - cl_int* err = NULL) const { + template size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range, + cl_int* err = NULL) const { size_type param; cl_int result = getSubGroupInfo(dev, name, range, ¶m); if (err != NULL) { @@ -5591,9 +5586,8 @@ inline Program linkProgram(vector inputPrograms, const char* options = #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 // Template specialization for CL_PROGRAM_BINARIES -template <> -inline cl_int cl::Program::getInfo(cl_program_info name, - vector>* param) const { +template <> inline cl_int cl::Program::getInfo(cl_program_info name, + vector>* param) const { if (name != CL_PROGRAM_BINARIES) { return CL_INVALID_VALUE; } @@ -6367,9 +6361,9 @@ class CommandQueue : public detail::Wrapper { * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. * This variant takes a raw SVM pointer. */ - template - cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events = NULL, Event* event = NULL) const { + template cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, + size_type size, const vector* events = NULL, + Event* event = NULL) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMMap( @@ -6468,9 +6462,9 @@ class CommandQueue : public detail::Wrapper { * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. * This variant takes a cl::pointer instance. */ - template - cl_int enqueueUnmapSVM(cl::pointer& ptr, const vector* events = NULL, - Event* event = NULL) const { + template cl_int enqueueUnmapSVM(cl::pointer& ptr, + const vector* events = NULL, + Event* event = NULL) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMUnmap( @@ -6488,9 +6482,9 @@ class CommandQueue : public detail::Wrapper { * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. * This variant takes a cl::vector instance. */ - template - cl_int enqueueUnmapSVM(cl::vector& container, const vector* events = NULL, - Event* event = NULL) const { + template cl_int enqueueUnmapSVM(cl::vector& container, + const vector* events = NULL, + Event* event = NULL) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMUnmap( @@ -6827,8 +6821,9 @@ class DeviceCommandQueue : public detail::Wrapper { cl::Context context = cl::Context::getDefault(); cl::Device device = cl::Device::getDefault(); - cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | + static_cast(properties); cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0}; object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error); @@ -6847,8 +6842,9 @@ class DeviceCommandQueue : public detail::Wrapper { cl_int* err = NULL) { cl_int error; - cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | + static_cast(properties); cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0}; object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error); @@ -6866,8 +6862,9 @@ class DeviceCommandQueue : public detail::Wrapper { cl_int* err = NULL) { cl_int error; - cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | + static_cast(properties); cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, CL_QUEUE_SIZE, queueSize, 0}; object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error); @@ -7021,9 +7018,9 @@ template <> struct KernelArgumentHandler { #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 -template -Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr, cl_int* err) { +template Buffer::Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr, cl_int* err) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -7163,9 +7160,9 @@ inline void* enqueueMapBuffer(const Buffer& buffer, cl_bool blocking, cl_map_fla * update a region of a coarse-grained SVM buffer. * This variant takes a raw SVM pointer. */ -template -inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events, Event* event) { +template inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, + size_type size, const vector* events, + Event* event) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7180,10 +7177,10 @@ inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_t * update a region of a coarse-grained SVM buffer. * This variant takes a cl::pointer instance. */ -template -inline cl_int enqueueMapSVM(cl::pointer ptr, cl_bool blocking, cl_map_flags flags, - size_type size, const vector* events = NULL, - Event* event = NULL) { +template inline cl_int enqueueMapSVM(cl::pointer ptr, cl_bool blocking, + cl_map_flags flags, size_type size, + const vector* events = NULL, + Event* event = NULL) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7257,9 +7254,9 @@ inline cl_int enqueueUnmapSVM(T* ptr, const vector* events = NULL, Event* * SVM buffer back to the OpenCL runtime. * This variant takes a cl::pointer instance. */ -template -inline cl_int enqueueUnmapSVM(cl::pointer& ptr, const vector* events = NULL, - Event* event = NULL) { +template inline cl_int enqueueUnmapSVM(cl::pointer& ptr, + const vector* events = NULL, + Event* event = NULL) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7275,9 +7272,9 @@ inline cl_int enqueueUnmapSVM(cl::pointer& ptr, const vector* event * SVM buffer back to the OpenCL runtime. * This variant takes a cl::vector instance. */ -template -inline cl_int enqueueUnmapSVM(cl::vector& container, const vector* events = NULL, - Event* event = NULL) { +template inline cl_int enqueueUnmapSVM(cl::vector& container, + const vector* events = NULL, + Event* event = NULL) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7336,9 +7333,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato * Host to Device. * Uses specified queue. */ -template -inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer) { +template inline cl_int copy(const CommandQueue& queue, + IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer) { typedef typename std::iterator_traits::value_type DataType; cl_int error; diff --git a/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl_platform.h b/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl_platform.h index 3851ac3168..e33af206fb 100644 --- a/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl_platform.h +++ b/projects/clr/opencl/khronos/headers/opencl2.0/CL/cl_platform.h @@ -67,7 +67,7 @@ extern "C" { #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED \ CL_EXTENSION_WEAK_LINK \ - AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER @@ -76,7 +76,7 @@ extern "C" { #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED \ CL_EXTENSION_WEAK_LINK \ - AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 #else #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER diff --git a/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl.hpp b/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl.hpp index 6ee05a0ac0..1514d4687d 100644 --- a/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl.hpp +++ b/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl.hpp @@ -1021,9 +1021,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, long) * does not work, because when using a derived type (e.g. Context) the generic * template will provide a better match. */ -template -inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int, - typename T::cl_type = 0) { +template inline cl_int getInfoHelper(Func f, cl_uint name, + VECTOR_CLASS* param, int, + typename T::cl_type = 0) { ::size_t required; cl_int err = f(name, 0, NULL, &required); if (err != CL_SUCCESS) { @@ -2743,12 +2743,10 @@ template cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer); template cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator); -template -cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer); -template -cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator, - IteratorType endIterator); +template cl_int copy(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer); +template cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, + IteratorType startIterator, IteratorType endIterator); /*! \brief Class interface for Buffer Memory Objects. @@ -2804,9 +2802,9 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly, - bool useHostPtr = false, cl_int* err = NULL) { + template Buffer(IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, + cl_int* err = NULL) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -2850,17 +2848,17 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); /*! * \brief Construct a Buffer from a host container via iterators using a specified queue. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); //! \brief Default constructor - initializes to NULL. Buffer() : Memory() {} @@ -5321,8 +5319,8 @@ class CommandQueue : public detail::Wrapper { const VECTOR_CLASS* mem_locs = NULL, const VECTOR_CLASS* events = NULL, Event* event = NULL) const { cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0) - ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem)) - : NULL; + ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem)) + : NULL; if (mems != NULL) { for (unsigned int i = 0; i < mem_objects->size(); i++) { @@ -5512,9 +5510,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_; __attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; #endif // !_WIN32 -template -Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr, cl_int* err) { +template Buffer::Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr, cl_int* err) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -5716,9 +5714,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato * Host to Device. * Uses specified queue. */ -template -inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer) { +template inline cl_int copy(const CommandQueue& queue, + IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer) { typedef typename std::iterator_traits::value_type DataType; cl_int error; diff --git a/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl2.hpp b/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl2.hpp index 5c40a4bb19..bb1bb6ece4 100644 --- a/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl2.hpp +++ b/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl2.hpp @@ -1765,9 +1765,8 @@ template inline bool operator!=(const Wrapper& lhs, const Wrappe using BuildLogType = - vector::param_type>>; + vector::param_type>>; #if defined(CL_HPP_ENABLE_EXCEPTIONS) /** * Exception class for build errors to carry build info @@ -2961,12 +2960,10 @@ template cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer); template cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator); -template -cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer); -template -cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator, - IteratorType endIterator); +template cl_int copy(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer); +template cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, + IteratorType startIterator, IteratorType endIterator); #if CL_HPP_TARGET_OPENCL_VERSION >= 200 @@ -3053,8 +3050,8 @@ template class SVMAllocator { SVMAllocator(const SVMAllocator& other) : context_(other.context_) {} - template - SVMAllocator(const SVMAllocator& other) : context_(other.context_) {} + template SVMAllocator(const SVMAllocator& other) + : context_(other.context_) {} ~SVMAllocator() {} @@ -3272,9 +3269,9 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly, - bool useHostPtr = false, cl_int* err = NULL) { + template Buffer(IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, + cl_int* err = NULL) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -3318,17 +3315,17 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); /*! * \brief Construct a Buffer from a host container via iterators using a specified queue. * If useHostPtr is specified iterators must be random access. */ - template - Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); //! \brief Default constructor - initializes to NULL. Buffer() : Memory() {} @@ -4828,8 +4825,7 @@ template struct KernelArgumentHandler; // Enable for objects that are not subclasses of memory // Pointers, constants etc -template -struct KernelArgumentHandler< +template struct KernelArgumentHandler< T, typename std::enable_if::value>::type> { static size_type size(const T&) { return sizeof(T); } static const T* ptr(const T& value) { return &value; } @@ -4992,9 +4988,8 @@ class Kernel : public detail::Wrapper { __GET_KERNEL_ARG_INFO_ERR); } - template - size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range, - cl_int* err = NULL) const { + template size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range, + cl_int* err = NULL) const { size_type param; cl_int result = getSubGroupInfo(dev, name, range, ¶m); if (err != NULL) { @@ -5591,9 +5586,8 @@ inline Program linkProgram(vector inputPrograms, const char* options = #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 // Template specialization for CL_PROGRAM_BINARIES -template <> -inline cl_int cl::Program::getInfo(cl_program_info name, - vector>* param) const { +template <> inline cl_int cl::Program::getInfo(cl_program_info name, + vector>* param) const { if (name != CL_PROGRAM_BINARIES) { return CL_INVALID_VALUE; } @@ -6367,9 +6361,9 @@ class CommandQueue : public detail::Wrapper { * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. * This variant takes a raw SVM pointer. */ - template - cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events = NULL, Event* event = NULL) const { + template cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, + size_type size, const vector* events = NULL, + Event* event = NULL) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMMap( @@ -6468,9 +6462,9 @@ class CommandQueue : public detail::Wrapper { * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. * This variant takes a cl::pointer instance. */ - template - cl_int enqueueUnmapSVM(cl::pointer& ptr, const vector* events = NULL, - Event* event = NULL) const { + template cl_int enqueueUnmapSVM(cl::pointer& ptr, + const vector* events = NULL, + Event* event = NULL) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMUnmap( @@ -6488,9 +6482,9 @@ class CommandQueue : public detail::Wrapper { * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. * This variant takes a cl::vector instance. */ - template - cl_int enqueueUnmapSVM(cl::vector& container, const vector* events = NULL, - Event* event = NULL) const { + template cl_int enqueueUnmapSVM(cl::vector& container, + const vector* events = NULL, + Event* event = NULL) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMUnmap( @@ -6827,8 +6821,9 @@ class DeviceCommandQueue : public detail::Wrapper { cl::Context context = cl::Context::getDefault(); cl::Device device = cl::Device::getDefault(); - cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | + static_cast(properties); cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0}; object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error); @@ -6847,8 +6842,9 @@ class DeviceCommandQueue : public detail::Wrapper { cl_int* err = NULL) { cl_int error; - cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | + static_cast(properties); cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0}; object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error); @@ -6866,8 +6862,9 @@ class DeviceCommandQueue : public detail::Wrapper { cl_int* err = NULL) { cl_int error; - cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | + static_cast(properties); cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, CL_QUEUE_SIZE, queueSize, 0}; object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error); @@ -7021,9 +7018,9 @@ template <> struct KernelArgumentHandler { #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 -template -Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr, cl_int* err) { +template Buffer::Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr, cl_int* err) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -7163,9 +7160,9 @@ inline void* enqueueMapBuffer(const Buffer& buffer, cl_bool blocking, cl_map_fla * update a region of a coarse-grained SVM buffer. * This variant takes a raw SVM pointer. */ -template -inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events, Event* event) { +template inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, + size_type size, const vector* events, + Event* event) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7180,10 +7177,10 @@ inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_t * update a region of a coarse-grained SVM buffer. * This variant takes a cl::pointer instance. */ -template -inline cl_int enqueueMapSVM(cl::pointer ptr, cl_bool blocking, cl_map_flags flags, - size_type size, const vector* events = NULL, - Event* event = NULL) { +template inline cl_int enqueueMapSVM(cl::pointer ptr, cl_bool blocking, + cl_map_flags flags, size_type size, + const vector* events = NULL, + Event* event = NULL) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7257,9 +7254,9 @@ inline cl_int enqueueUnmapSVM(T* ptr, const vector* events = NULL, Event* * SVM buffer back to the OpenCL runtime. * This variant takes a cl::pointer instance. */ -template -inline cl_int enqueueUnmapSVM(cl::pointer& ptr, const vector* events = NULL, - Event* event = NULL) { +template inline cl_int enqueueUnmapSVM(cl::pointer& ptr, + const vector* events = NULL, + Event* event = NULL) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7275,9 +7272,9 @@ inline cl_int enqueueUnmapSVM(cl::pointer& ptr, const vector* event * SVM buffer back to the OpenCL runtime. * This variant takes a cl::vector instance. */ -template -inline cl_int enqueueUnmapSVM(cl::vector& container, const vector* events = NULL, - Event* event = NULL) { +template inline cl_int enqueueUnmapSVM(cl::vector& container, + const vector* events = NULL, + Event* event = NULL) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7336,9 +7333,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato * Host to Device. * Uses specified queue. */ -template -inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer) { +template inline cl_int copy(const CommandQueue& queue, + IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer) { typedef typename std::iterator_traits::value_type DataType; cl_int error; diff --git a/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl_platform.h b/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl_platform.h index 4b422312ee..ffa55eab46 100644 --- a/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl_platform.h +++ b/projects/clr/opencl/khronos/headers/opencl2.1/CL/cl_platform.h @@ -67,7 +67,7 @@ extern "C" { #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED \ CL_EXTENSION_WEAK_LINK \ - AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER @@ -76,7 +76,7 @@ extern "C" { #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED \ CL_EXTENSION_WEAK_LINK \ - AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 #else #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER diff --git a/projects/clr/opencl/khronos/headers/opencl2.2/CL/cl.hpp b/projects/clr/opencl/khronos/headers/opencl2.2/CL/cl.hpp index fdaf6d62fa..f6f8d12ce0 100644 --- a/projects/clr/opencl/khronos/headers/opencl2.2/CL/cl.hpp +++ b/projects/clr/opencl/khronos/headers/opencl2.2/CL/cl.hpp @@ -1009,9 +1009,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, long) * does not work, because when using a derived type (e.g. Context) the generic * template will provide a better match. */ -template -inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int, - typename T::cl_type = 0) { +template inline cl_int getInfoHelper(Func f, cl_uint name, + VECTOR_CLASS* param, int, + typename T::cl_type = 0) { ::size_t required; cl_int err = f(name, 0, NULL, &required); if (err != CL_SUCCESS) { @@ -2736,12 +2736,10 @@ template cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer); template cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator); -template -cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer); -template -cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator, - IteratorType endIterator); +template cl_int copy(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer); +template cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, + IteratorType startIterator, IteratorType endIterator); /*! \brief Class interface for Buffer Memory Objects. @@ -2797,9 +2795,9 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly, - bool useHostPtr = false, cl_int* err = NULL) { + template Buffer(IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, + cl_int* err = NULL) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -2843,17 +2841,17 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); /*! * \brief Construct a Buffer from a host container via iterators using a specified queue. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); //! \brief Default constructor - initializes to NULL. Buffer() : Memory() {} @@ -5314,8 +5312,8 @@ class CommandQueue : public detail::Wrapper { const VECTOR_CLASS* mem_locs = NULL, const VECTOR_CLASS* events = NULL, Event* event = NULL) const { cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0) - ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem)) - : NULL; + ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem)) + : NULL; if (mems != NULL) { for (unsigned int i = 0; i < mem_objects->size(); i++) { @@ -5505,9 +5503,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_; __attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; #endif // !_WIN32 -template -Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr, cl_int* err) { +template Buffer::Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr, cl_int* err) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -5709,9 +5707,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato * Host to Device. * Uses specified queue. */ -template -inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer) { +template inline cl_int copy(const CommandQueue& queue, + IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer) { typedef typename std::iterator_traits::value_type DataType; cl_int error; diff --git a/projects/clr/opencl/khronos/headers/opencl2.2/CL/cl2.hpp b/projects/clr/opencl/khronos/headers/opencl2.2/CL/cl2.hpp index d1a64bac66..47b86da68c 100644 --- a/projects/clr/opencl/khronos/headers/opencl2.2/CL/cl2.hpp +++ b/projects/clr/opencl/khronos/headers/opencl2.2/CL/cl2.hpp @@ -1753,9 +1753,8 @@ template inline bool operator!=(const Wrapper& lhs, const Wrappe using BuildLogType = - vector::param_type>>; + vector::param_type>>; #if defined(CL_HPP_ENABLE_EXCEPTIONS) /** * Exception class for build errors to carry build info @@ -2951,12 +2950,10 @@ template cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer); template cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator); -template -cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer); -template -cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator, - IteratorType endIterator); +template cl_int copy(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer); +template cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, + IteratorType startIterator, IteratorType endIterator); #if CL_HPP_TARGET_OPENCL_VERSION >= 200 @@ -3043,8 +3040,8 @@ template class SVMAllocator { SVMAllocator(const SVMAllocator& other) : context_(other.context_) {} - template - SVMAllocator(const SVMAllocator& other) : context_(other.context_) {} + template SVMAllocator(const SVMAllocator& other) + : context_(other.context_) {} ~SVMAllocator() {} @@ -3262,9 +3259,9 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly, - bool useHostPtr = false, cl_int* err = NULL) { + template Buffer(IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, + cl_int* err = NULL) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -3308,17 +3305,17 @@ class Buffer : public Memory { * IteratorType must be random access. * If useHostPtr is specified iterators must represent contiguous data. */ - template - Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); /*! * \brief Construct a Buffer from a host container via iterators using a specified queue. * If useHostPtr is specified iterators must be random access. */ - template - Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + template Buffer(const CommandQueue& queue, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr = false, cl_int* err = NULL); //! \brief Default constructor - initializes to NULL. Buffer() : Memory() {} @@ -4818,8 +4815,7 @@ template struct KernelArgumentHandler; // Enable for objects that are not subclasses of memory // Pointers, constants etc -template -struct KernelArgumentHandler< +template struct KernelArgumentHandler< T, typename std::enable_if::value>::type> { static size_type size(const T&) { return sizeof(T); } static const T* ptr(const T& value) { return &value; } @@ -4982,9 +4978,8 @@ class Kernel : public detail::Wrapper { __GET_KERNEL_ARG_INFO_ERR); } - template - size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range, - cl_int* err = NULL) const { + template size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range, + cl_int* err = NULL) const { size_type param; cl_int result = getSubGroupInfo(dev, name, range, ¶m); if (err != NULL) { @@ -5581,9 +5576,8 @@ inline Program linkProgram(vector inputPrograms, const char* options = #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 // Template specialization for CL_PROGRAM_BINARIES -template <> -inline cl_int cl::Program::getInfo(cl_program_info name, - vector>* param) const { +template <> inline cl_int cl::Program::getInfo(cl_program_info name, + vector>* param) const { if (name != CL_PROGRAM_BINARIES) { return CL_INVALID_VALUE; } @@ -6357,9 +6351,9 @@ class CommandQueue : public detail::Wrapper { * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. * This variant takes a raw SVM pointer. */ - template - cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events = NULL, Event* event = NULL) const { + template cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, + size_type size, const vector* events = NULL, + Event* event = NULL) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMMap( @@ -6458,9 +6452,9 @@ class CommandQueue : public detail::Wrapper { * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. * This variant takes a cl::pointer instance. */ - template - cl_int enqueueUnmapSVM(cl::pointer& ptr, const vector* events = NULL, - Event* event = NULL) const { + template cl_int enqueueUnmapSVM(cl::pointer& ptr, + const vector* events = NULL, + Event* event = NULL) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMUnmap( @@ -6478,9 +6472,9 @@ class CommandQueue : public detail::Wrapper { * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. * This variant takes a cl::vector instance. */ - template - cl_int enqueueUnmapSVM(cl::vector& container, const vector* events = NULL, - Event* event = NULL) const { + template cl_int enqueueUnmapSVM(cl::vector& container, + const vector* events = NULL, + Event* event = NULL) const { cl_event tmp; cl_int err = detail::errHandler( ::clEnqueueSVMUnmap( @@ -6817,8 +6811,9 @@ class DeviceCommandQueue : public detail::Wrapper { cl::Context context = cl::Context::getDefault(); cl::Device device = cl::Device::getDefault(); - cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | + static_cast(properties); cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0}; object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error); @@ -6837,8 +6832,9 @@ class DeviceCommandQueue : public detail::Wrapper { cl_int* err = NULL) { cl_int error; - cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | + static_cast(properties); cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0}; object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error); @@ -6856,8 +6852,9 @@ class DeviceCommandQueue : public detail::Wrapper { cl_int* err = NULL) { cl_int error; - cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | + static_cast(properties); cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, CL_QUEUE_SIZE, queueSize, 0}; object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error); @@ -7011,9 +7008,9 @@ template <> struct KernelArgumentHandler { #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 -template -Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr, cl_int* err) { +template Buffer::Buffer(const Context& context, IteratorType startIterator, + IteratorType endIterator, bool readOnly, + bool useHostPtr, cl_int* err) { typedef typename std::iterator_traits::value_type DataType; cl_int error; @@ -7153,9 +7150,9 @@ inline void* enqueueMapBuffer(const Buffer& buffer, cl_bool blocking, cl_map_fla * update a region of a coarse-grained SVM buffer. * This variant takes a raw SVM pointer. */ -template -inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size, - const vector* events, Event* event) { +template inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, + size_type size, const vector* events, + Event* event) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7170,10 +7167,10 @@ inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_t * update a region of a coarse-grained SVM buffer. * This variant takes a cl::pointer instance. */ -template -inline cl_int enqueueMapSVM(cl::pointer ptr, cl_bool blocking, cl_map_flags flags, - size_type size, const vector* events = NULL, - Event* event = NULL) { +template inline cl_int enqueueMapSVM(cl::pointer ptr, cl_bool blocking, + cl_map_flags flags, size_type size, + const vector* events = NULL, + Event* event = NULL) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7247,9 +7244,9 @@ inline cl_int enqueueUnmapSVM(T* ptr, const vector* events = NULL, Event* * SVM buffer back to the OpenCL runtime. * This variant takes a cl::pointer instance. */ -template -inline cl_int enqueueUnmapSVM(cl::pointer& ptr, const vector* events = NULL, - Event* event = NULL) { +template inline cl_int enqueueUnmapSVM(cl::pointer& ptr, + const vector* events = NULL, + Event* event = NULL) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7265,9 +7262,9 @@ inline cl_int enqueueUnmapSVM(cl::pointer& ptr, const vector* event * SVM buffer back to the OpenCL runtime. * This variant takes a cl::vector instance. */ -template -inline cl_int enqueueUnmapSVM(cl::vector& container, const vector* events = NULL, - Event* event = NULL) { +template inline cl_int enqueueUnmapSVM(cl::vector& container, + const vector* events = NULL, + Event* event = NULL) { cl_int error; CommandQueue queue = CommandQueue::getDefault(&error); if (error != CL_SUCCESS) { @@ -7326,9 +7323,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato * Host to Device. * Uses specified queue. */ -template -inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator, - cl::Buffer& buffer) { +template inline cl_int copy(const CommandQueue& queue, + IteratorType startIterator, + IteratorType endIterator, cl::Buffer& buffer) { typedef typename std::iterator_traits::value_type DataType; cl_int error; diff --git a/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.cpp b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.cpp index 7fc639a172..3f741dfa61 100644 --- a/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.cpp +++ b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.cpp @@ -126,7 +126,7 @@ void OCLDX11YUY2::run(void) { BYTE* pLine = (BYTE*)LockedRectD11.pData + y * LockedRectD11.RowPitch; BYTE* pLineUV = (BYTE*)LockedRectD11.pData + y * LockedRectD11.RowPitch + - OCLDX11YUY2::HEIGHT * LockedRectD11.RowPitch; + OCLDX11YUY2::HEIGHT * LockedRectD11.RowPitch; for (int x = 0; x < OCLDX11YUY2::WIDTH; x++) { *pLine++ = 0x7F; // Y diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp index bf3b5951db..d418d1c312 100644 --- a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp @@ -265,7 +265,7 @@ void OCLPerfGenericBandwidth::run(void) { // We have one extra write per LDS location to initialize LDS double perf = ((double)global * (numReads_ * sizeof(cl_float) + dataSizeBytes_ / 64) * NUM_ITER * (double)(1e-09)) / - sec; + sec; _perfInfo = (float)perf; SNPRINTF(buf, sizeof(buf), " %6s %9s %8d threads, %3d reads (GB/s) ", buf2, buf3, global, diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp index fdbd38899e..1cede60aed 100644 --- a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp @@ -401,8 +401,8 @@ void OCLPerfKernelThroughput::open(unsigned int test, char* units, double& conve input2BufferSize_ = static_cast(matrixDim2_ * matrixDim1_ * sizeof(float)); output1BufferSize_ = static_cast(matrixDim1_ * matrixDim1_ * sizeof(float)); _reqDataSize = (1.0 * matrixDim1_ * matrixDim2_ * sizeof(float)) + - (1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) + - (1.0 * matrixDim1_ * matrixDim1_ * sizeof(float)); + (1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) + + (1.0 * matrixDim1_ * matrixDim1_ * sizeof(float)); break; case 1: // Flops/Byte flopsPerByte_ = (int)workSize[workSizeIdx_]; // for kernelType == 0 @@ -695,13 +695,13 @@ void OCLPerfKernelThroughput::run(void) { // printf("FlopCount = 2*%i*%i*%i=%f\n", // matrixDim1_,matrixDim1_,matrixDim2_,flopCount); bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f / - avgKernelTime_; // GB/s + avgKernelTime_; // GB/s gflops_ = (float)(1000000.f * flopCount / avgKernelTime_ / 1000000000.0); break; case 1: // Madds flopCount = _reqDataSize * flopsPerByte_; bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f / - avgKernelTime_; // GB/s + avgKernelTime_; // GB/s gflops_ = bandwidth_ * flopsPerByte_; break; } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp index 0c1e8e7b6b..77d0609214 100644 --- a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp @@ -341,7 +341,7 @@ void OCLPerfLDSReadSpeed::run(void) { // We have one extra write per LDS location to initialize LDS double perf = ((double)global * (numReads_ * sizeof(cl_float) + ldsSizeBytes_ / 64) * NUM_ITER * (double)(1e-09)) / - sec; + sec; _perfInfo = (float)perf; SNPRINTF(buf, sizeof(buf), " %s %8d threads, %3d reads (GB/s) ", buf2, global, numReads_); diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp index d2a4b5f21c..24254f7510 100644 --- a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp @@ -749,10 +749,9 @@ void OCLPerfMandelbrot::run(void) { // printf(" totalIter = %lld\n", totalIters); if (isAMD && (type_ == CL_DEVICE_TYPE_GPU)) { CHECK_RESULT((totalIters != expectedIters[_openTest]) && - (totalIters != - expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX - ? _openTest + FMA_EXPECTEDVALUES_INDEX - : _openTest)]), + (totalIters != expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX + ? _openTest + FMA_EXPECTEDVALUES_INDEX + : _openTest)]), "Incorrect iteration count detected!"); } else { CHECK_RESULT(totalIters != expectedItersNV[_openTest], "Incorrect iteration count detected!"); @@ -869,11 +868,9 @@ void OCLPerfAsyncMandelbrot::run(void) { // printf(" totalIter = %lld\n", totalIters); if (isAMD && (type_ == CL_DEVICE_TYPE_GPU)) { CHECK_RESULT((totalIters != 2 * expectedIters[_openTest]) && - (totalIters != - 2 * - expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX - ? _openTest + FMA_EXPECTEDVALUES_INDEX - : _openTest)]), + (totalIters != 2 * expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX + ? _openTest + FMA_EXPECTEDVALUES_INDEX + : _openTest)]), "Incorrect iteration count detected!"); } else { CHECK_RESULT(totalIters != 2 * expectedItersNV[_openTest], diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp index 9be17c6524..039d88d85f 100644 --- a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp @@ -40,7 +40,7 @@ const static char* strKernel = KERNEL_CODE( /* The purpose of this is to introduce an additional zero at stage - pass * bit*/ const uint leftID = (thread & (pairDistance - 1)) | - ((thread & ~(pairDistance - 1)) << 1); /* Is the same as below */ + ((thread & ~(pairDistance - 1)) << 1); /* Is the same as below */ const uint direction = ((thread >> stage) & 1) == 1 ? 0 : 1; diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp index 45083c0d40..bf74fcaf27 100644 --- a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp @@ -183,8 +183,8 @@ void OCLMultiQueue::open(unsigned int test, char* units, double& conversion, sizeof(maxComputeUnits), &maxComputeUnits, NULL); computePower *= 32 * maxComputeUnits; NumElements = (NumElements < static_cast(computePower)) - ? static_cast(computePower) - : NumElements; + ? static_cast(computePower) + : NumElements; program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, &error_); CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, NULL, NULL); diff --git a/projects/clr/opencl/tools/clinfo/clinfo.cpp b/projects/clr/opencl/tools/clinfo/clinfo.cpp index c6433ec010..b95d407dc7 100644 --- a/projects/clr/opencl/tools/clinfo/clinfo.cpp +++ b/projects/clr/opencl/tools/clinfo/clinfo.cpp @@ -140,8 +140,8 @@ int main(int argc, char** argv) { bool isAMDPlatform = (strcmp(platform.getInfo().c_str(), "AMD Accelerated Parallel Processing") == 0) - ? true - : false; + ? true + : false; if (isAMDPlatform) { std::string boardName; device.getInfo(CL_DEVICE_BOARD_NAME_AMD, &boardName); diff --git a/projects/clr/rocclr/compiler/lib/utils/options.cpp b/projects/clr/rocclr/compiler/lib/utils/options.cpp index 68c28a520a..adb8d486a6 100644 --- a/projects/clr/rocclr/compiler/lib/utils/options.cpp +++ b/projects/clr/rocclr/compiler/lib/utils/options.cpp @@ -188,7 +188,7 @@ bool setAliasOptionVariable(int OptDescTableIx, Options& Opts, int64_t IValue, c if (OptDescTableIx == OID_SaveTemps) { // Dump .cl, .i(.ii), .amdil, .isa, .s, dll, calimage flags = DUMP_CL | DUMP_I | DUMP_S | DUMP_O | DUMP_DLL | DUMP_CGIL | DUMP_DEBUGIL | DUMP_IL | - DUMP_ISA; + DUMP_ISA; } else if (OptDescTableIx == OID_SaveTempsAll) { flags = DUMP_ALL; } else { // OID_Output @@ -531,7 +531,8 @@ int getOptionDesc(std::string& options, size_t StartPos, bool IsShortForm, Optio } char next_c = options.at(pos); - bool optionalHasValue = (OPTION_value(od) == OVA_OPTIONAL) && + bool optionalHasValue = + (OPTION_value(od) == OVA_OPTIONAL) && (((OPTION_info(od) & OA_SEPARATOR_EQUAL) && (next_c == '=')) || ((OPTION_info(od) & OA_SEPARATOR_NONE) && !OPTION_valueSeparator(next_c))); bool hasValue = (OPTION_value(od) == OVA_REQUIRED) || optionalHasValue; diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp index f3730cdd04..bcd5c5d458 100644 --- a/projects/clr/rocclr/device/device.cpp +++ b/projects/clr/rocclr/device/device.cpp @@ -339,9 +339,9 @@ const Isa* Isa::findIsa(uint32_t versionMajor, uint32_t versionMinor, uint32_t v auto supportedIsas_ = supportedIsas(); auto isaIter = std::find_if(supportedIsas_.first, supportedIsas_.second, [&](const Isa& isa) { return versionMajor == isa.versionMajor_ && versionMinor == isa.versionMinor_ && - versionStepping == isa.versionStepping_ && - (isa.sramecc_ == amd::Isa::Feature::Unsupported || isa.sramecc_ == sramecc) && - (isa.xnack_ == amd::Isa::Feature::Unsupported || isa.xnack_ == xnack); + versionStepping == isa.versionStepping_ && + (isa.sramecc_ == amd::Isa::Feature::Unsupported || isa.sramecc_ == sramecc) && + (isa.xnack_ == amd::Isa::Feature::Unsupported || isa.xnack_ == xnack); }); return isaIter == supportedIsas_.second ? nullptr : isaIter; } @@ -1132,7 +1132,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, char* handle, size_t* me // Calculate the memory offset from the original base ptr *mem_offset = reinterpret_cast
(dev_ptr) - reinterpret_cast
(orig_dev_ptr) + - amd_mem_obj->getOffset(); + amd_mem_obj->getOffset(); *mem_size = amd_mem_obj->getSize(); diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index ab0aa5de03..e0337659b0 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1763,8 +1763,8 @@ class Device : public RuntimeObject { return (info().svmCapabilities_ & (CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) != 0 - ? true - : false; + ? true + : false; } //! check svm FGS support capability. diff --git a/projects/clr/rocclr/device/devkernel.cpp b/projects/clr/rocclr/device/devkernel.cpp index a45ed6805c..a1fc797e90 100644 --- a/projects/clr/rocclr/device/devkernel.cpp +++ b/projects/clr/rocclr/device/devkernel.cpp @@ -769,8 +769,8 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is return amd::KernelParameterDescriptor::QueueObject; case ARG_TYPE_VALUE: return (argInfo->arg.value.data == DATATYPE_struct) - ? amd::KernelParameterDescriptor::ReferenceObject - : amd::KernelParameterDescriptor::ValueObject; + ? amd::KernelParameterDescriptor::ReferenceObject + : amd::KernelParameterDescriptor::ValueObject; case ARG_TYPE_IMAGE: return amd::KernelParameterDescriptor::ImageObject; case ARG_TYPE_SAMPLER: diff --git a/projects/clr/rocclr/device/devprogram.cpp b/projects/clr/rocclr/device/devprogram.cpp index 9812006e09..a33b09b9c6 100644 --- a/projects/clr/rocclr/device/devprogram.cpp +++ b/projects/clr/rocclr/device/devprogram.cpp @@ -511,8 +511,8 @@ bool Program::compileAndLinkExecutable(const amd_comgr_data_set_t inputs, if (status == AMD_COMGR_STATUS_SUCCESS) { hasRelocatableData = true; amd_comgr_action_kind_t kind = (continueCompileFrom == FILE_TYPE_ASM_TEXT) - ? AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE - : AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE; + ? AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE + : AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE; status = amd::Comgr::do_action(kind, action, inputs, relocatableData); extractBuildLog(relocatableData); } @@ -1259,9 +1259,9 @@ bool Program::linkImplHSAIL(amd::option::Options* options) { bool finalize = true; internal_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; // If !binaryElf_ then program must have been created using clCreateProgramWithBinary - aclType continueCompileFrom = (!binaryElf_) - ? static_cast(getNextCompilationStageFromBinary(options)) - : ACL_TYPE_LLVMIR_BINARY; + aclType continueCompileFrom = + (!binaryElf_) ? static_cast(getNextCompilationStageFromBinary(options)) + : ACL_TYPE_LLVMIR_BINARY; switch (continueCompileFrom) { case ACL_TYPE_SPIRV_BINARY: @@ -2857,9 +2857,8 @@ bool Program::getDemangledName(const std::string& mangledName, std::string& dema demangledName.resize(demangled_size); - if (AMD_COMGR_STATUS_SUCCESS != - amd::Comgr::get_data(demangled_data, &demangled_size, - const_cast(demangledName.data()))) { + if (AMD_COMGR_STATUS_SUCCESS != amd::Comgr::get_data(demangled_data, &demangled_size, + const_cast(demangledName.data()))) { amd::Comgr::release_data(mangled_data); amd::Comgr::release_data(demangled_data); return false; diff --git a/projects/clr/rocclr/device/pal/palblit.cpp b/projects/clr/rocclr/device/pal/palblit.cpp index 0166d357af..5fdabd36ee 100644 --- a/projects/clr/rocclr/device/pal/palblit.cpp +++ b/projects/clr/rocclr/device/pal/palblit.cpp @@ -2166,18 +2166,18 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, for (auto& packed_obj : packed_vector) { constexpr uint32_t kFillType = FillBufferAligned; uint32_t kpattern_size = (packed_obj.pattern_expanded_) - ? HostBlitManager::FillBufferInfo::kExtendedSize - : patternSize; + ? HostBlitManager::FillBufferInfo::kExtendedSize + : patternSize; size_t kfill_size = packed_obj.fill_size_ / kpattern_size; uint64_t koffset = overall_offset; overall_offset += packed_obj.fill_size_; size_t globalWorkOffset[3] = {0, 0, 0}; - uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) - : (kpattern_size & 0x7) == 0 ? sizeof(uint64_t) - : (kpattern_size & 0x3) == 0 ? sizeof(uint32_t) - : (kpattern_size & 0x1) == 0 ? sizeof(uint16_t) - : sizeof(uint8_t); + uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) + : (kpattern_size & 0x7) == 0 ? sizeof(uint64_t) + : (kpattern_size & 0x3) == 0 ? sizeof(uint32_t) + : (kpattern_size & 0x1) == 0 ? sizeof(uint16_t) + : sizeof(uint8_t); // Program kernels arguments for the fill operation Memory* mem = &gpuMem(memory); @@ -2302,9 +2302,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, constexpr size_t kFillImageThreshold = 256 * 256; // Use host fill if memory has direct access and image is small - if (setup_.disableFillImage_ || - (gpuMem(memory).isHostMemDirectAccess() && - (size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) { + if (setup_.disableFillImage_ || (gpuMem(memory).isHostMemDirectAccess() && + (size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) { gpu().releaseGpuMemoryFence(); result = HostBlitManager::fillImage(memory, pattern, origin, size, entire); diff --git a/projects/clr/rocclr/device/pal/palcounters.cpp b/projects/clr/rocclr/device/pal/palcounters.cpp index 77c3791926..77473b1b6d 100644 --- a/projects/clr/rocclr/device/pal/palcounters.cpp +++ b/projects/clr/rocclr/device/pal/palcounters.cpp @@ -194,7 +194,7 @@ bool PalCounterReference::finalize() { assert(layout.sampleCount == numExpCounters_); size_t size = sizeof(Pal::GlobalCounterLayout) + - (sizeof(Pal::GlobalSampleLayout) * (layout.sampleCount - 1)); + (sizeof(Pal::GlobalSampleLayout) * (layout.sampleCount - 1)); layout_ = reinterpret_cast(new char[size]); if (layout_ != nullptr) { layout_->sampleCount = layout.sampleCount; @@ -728,7 +728,7 @@ bool PerfCounter::create() { } counter_start = info_.counterIndex_; counter_step = dev().properties().gfxipProperties.shaderCore.numShaderArrays * - dev().properties().gfxipProperties.shaderCore.numShaderEngines; + dev().properties().gfxipProperties.shaderCore.numShaderEngines; break; case PCIndexSelect::ComputeUnit: diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index 713ebc32bf..327b765a42 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -111,8 +111,8 @@ static std::tuple findIsa(uint32_t gfxipMajor, uin auto palDeviceIter = std::find_if(std::begin(supportedPalDevices), std::end(supportedPalDevices), [&](const PalDevice& palDevice) { return palDevice.gfxipMajor_ == gfxipMajor && - palDevice.gfxipMinor_ == gfxipMinor && - palDevice.gfxipStepping_ == (gfxipStepping & 0xF); + palDevice.gfxipMinor_ == gfxipMinor && + palDevice.gfxipStepping_ == (gfxipStepping & 0xF); }); if (palDeviceIter == std::end(supportedPalDevices)) { return std::make_tuple(nullptr, nullptr); @@ -131,8 +131,8 @@ static std::tuple findPal(uint3 auto palDeviceIter = std::find_if(std::begin(supportedPalDevices), std::end(supportedPalDevices), [&](const PalDevice& palDevice) { return palDevice.gfxipMajor_ == gfxipMajor && - palDevice.gfxipMinor_ == gfxipMinor && - palDevice.gfxipStepping_ == (gfxipStepping & 0xF); + palDevice.gfxipMinor_ == gfxipMinor && + palDevice.gfxipStepping_ == (gfxipStepping & 0xF); }); if (palDeviceIter == std::end(supportedPalDevices)) { return std::make_tuple(Pal::GfxIpLevel::None, Pal::AsicRevision::Unknown, nullptr); @@ -351,8 +351,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.maxWorkItemDimensions_ = 3; info_.maxComputeUnits_ = settings().enableWgpMode_ - ? palProp.gfxipProperties.shaderCore.numAvailableCus / 2 - : palProp.gfxipProperties.shaderCore.numAvailableCus; + ? palProp.gfxipProperties.shaderCore.numAvailableCus / 2 + : palProp.gfxipProperties.shaderCore.numAvailableCus; info_.maxPhysicalComputeUnits_ = info_.maxComputeUnits_; info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines; @@ -371,11 +371,11 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support info_.maxEngineClockFrequency_ = (palProp.gfxipProperties.performance.maxGpuClock != 0) - ? palProp.gfxipProperties.performance.maxGpuClock - : 555; + ? palProp.gfxipProperties.performance.maxGpuClock + : 555; info_.maxMemoryClockFrequency_ = (palProp.gpuMemoryProperties.performance.maxMemClock != 0) - ? palProp.gpuMemoryProperties.performance.maxMemClock - : 555; + ? palProp.gpuMemoryProperties.performance.maxMemClock + : 555; info_.wallClockFrequency_ = palProp.timestampFrequency / 1000; // in KHz info_.vramBusBitWidth_ = palProp.gpuMemoryProperties.performance.vramBusBitWidth; info_.l2CacheSize_ = palProp.gfxipProperties.shaderCore.tccSizeInBytes; @@ -417,8 +417,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, uint uswcPercentAvailable = ((static_cast(heaps[Pal::GpuHeapGartUswc].logicalSize) / Mi) > 1536 && IS_WINDOWS) - ? 75 - : 50; + ? 75 + : 50; if (settings().apuSystem_) { info_.globalMemSize_ += (static_cast(heaps[Pal::GpuHeapGartUswc].logicalSize) * uswcPercentAvailable) / @@ -622,8 +622,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.deviceTopology_.pcie.function = palProp.pciProperties.functionNumber; info_.simdPerCU_ = settings().enableWgpMode_ - ? (2 * palProp.gfxipProperties.shaderCore.numSimdsPerCu) - : palProp.gfxipProperties.shaderCore.numSimdsPerCu; + ? (2 * palProp.gfxipProperties.shaderCore.numSimdsPerCu) + : palProp.gfxipProperties.shaderCore.numSimdsPerCu; info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray; info_.simdWidth_ = isa().simdWidth(); info_.simdInstructionWidth_ = 1; @@ -656,7 +656,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.pcieDeviceId_ = palProp.deviceId; info_.pcieRevisionId_ = palProp.revisionId; info_.maxThreadsPerCU_ = info_.wavefrontWidth_ * info_.simdPerCU_ * - palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd; + palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd; info_.cooperativeGroups_ = settings().enableCoopGroups_; info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_; @@ -906,8 +906,8 @@ bool Device::create(Pal::IDevice* device) { // Save the IP level for the offline detection ipLevel_ = properties().gfxLevel; asicRevision_ = flagIsDefault(PAL_FORCE_ASIC_REVISION) - ? properties().revision - : static_cast(PAL_FORCE_ASIC_REVISION); + ? properties().revision + : static_cast(PAL_FORCE_ASIC_REVISION); // XNACK flag should be set for PageMigration or IOMMUv2 support. bool isXNACKEnabled = @@ -1284,10 +1284,9 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { if (queue != nullptr) { profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE); if (queue->asHostQueue() != nullptr) { - bool interopQueue = (0 != - (queue->context().info().flags_ & - (amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr | - amd::Context::D3D11DeviceKhr))); + bool interopQueue = (0 != (queue->context().info().flags_ & + (amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr | + amd::Context::D3D11DeviceKhr))); rtCUs = queue->rtCUs(); } else if (queue->asDeviceQueue() != nullptr) { deviceQueueSize = queue->asDeviceQueue()->size(); @@ -1439,9 +1438,9 @@ bool Device::init() { // Count up all the devices in the system. platform_->EnumerateDevices(&gNumDevices, &gDeviceList[0]); - const char* requestedDeviceList = amd::IS_HIP - ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES) - : GPU_DEVICE_ORDINAL; + const char* requestedDeviceList = + amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES) + : GPU_DEVICE_ORDINAL; if (requestedDeviceList[0] != '\0') { useDeviceList = true; @@ -1611,8 +1610,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const { Resource::MemoryType type = (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) - ? Resource::Remote - : Resource::Local; + ? Resource::Remote + : Resource::Local; // Check if runtime can force a tiny buffer into USWC memory if ((size <= (GPU_MAX_REMOTE_MEM_SIZE * Ki)) && (type == Resource::Local) && @@ -1633,8 +1632,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const { // Internal means VirtualDevice!=nullptr bool internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && (owner.getVirtualDevice() != nullptr)) - ? true - : false; + ? true + : false; // Create a memory object gpuMemory = new pal::Buffer(*this, owner, owner.getSize()); @@ -1918,9 +1917,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const { (memory->memoryType() != Resource::ExternalPhysical) && ((owner.getHostMem() != nullptr) || ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) { - bool ok = memory->pinSystemMemory( - owner.getHostMem(), - (owner.getHostMemRef()->size()) ? owner.getHostMemRef()->size() : owner.getSize()); + bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size()) + ? owner.getHostMemRef()->size() + : owner.getSize()); //! \note: Ignore the pinning result for now } @@ -2067,7 +2066,8 @@ bool Device::globalFreeMemory(size_t* freeMemory) const { // Allocated system memory without cached allocations. Cache size contains all allocations, so // don't count persistent and local Pal::gpusize system_memory = allocedMem[Pal::GpuHeapGartCacheable] + - allocedMem[Pal::GpuHeapGartUswc] + cache_group_local - resourceCache().cacheSize(); + allocedMem[Pal::GpuHeapGartUswc] + cache_group_local - + resourceCache().cacheSize(); #if IS_WINDOWS // Second, query OS for overall memory usage on the system @@ -2091,7 +2091,7 @@ bool Device::globalFreeMemory(size_t* freeMemory) const { if (mem_budget_info.usage[Pal::GpuHeapGroupNonLocal] > (resourceCache().cacheSize() - cache_group_local)) { system_total_alloced = mem_budget_info.usage[Pal::GpuHeapGroupNonLocal] + cache_group_local - - resourceCache().cacheSize(); + resourceCache().cacheSize(); } // System usage exceeds per process usage for system memory if (system_total_alloced > system_memory) { @@ -2102,9 +2102,10 @@ bool Device::globalFreeMemory(size_t* freeMemory) const { // Third, finalize reported free memory // Fill free memory info - freeMemory[TotalFreeMemory] = (total_alloced > info().globalMemSize_) - ? 0 - : static_cast((info().globalMemSize_ - total_alloced) / Ki); + freeMemory[TotalFreeMemory] = + (total_alloced > info().globalMemSize_) + ? 0 + : static_cast((info().globalMemSize_ - total_alloced) / Ki); freeMemory[TotalFreeMemory] -= (freeMemory[TotalFreeMemory] > HIP_HIDDEN_FREE_MEM * Ki) ? HIP_HIDDEN_FREE_MEM * Ki : 0; @@ -2842,8 +2843,8 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI (Pal::Result::Success == (iDev()->SetClockMode(setClockMode, reinterpret_cast(pSetClockModeOutput)))) - ? true - : false; + ? true + : false; return result; } diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp index 137e9c8e38..7cca51e244 100644 --- a/projects/clr/rocclr/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/device/pal/paldevice.hpp @@ -490,10 +490,10 @@ class Device : public NullDevice { //! Returns the number of available compute rings uint numExclusiveComputeEngines() const { return exclusiveComputeEnginesId_.size() + - ((exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1) == - exclusiveComputeEnginesId().end()) - ? 1 - : 0); + ((exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1) == + exclusiveComputeEnginesId().end()) + ? 1 + : 0); } //! Returns the map of available exclusive compute rings with the engine index diff --git a/projects/clr/rocclr/device/pal/paldeviced3d9.cpp b/projects/clr/rocclr/device/pal/paldeviced3d9.cpp index 265d8fa6c2..4f3279b16e 100644 --- a/projects/clr/rocclr/device/pal/paldeviced3d9.cpp +++ b/projects/clr/rocclr/device/pal/paldeviced3d9.cpp @@ -59,7 +59,7 @@ bool Device::associateD3D9Device(void* d3d9Device) { // match the adapter bool canInteroperate = (properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) && - (properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart); + (properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart); return canInteroperate; } diff --git a/projects/clr/rocclr/device/pal/paldevicegl.cpp b/projects/clr/rocclr/device/pal/paldevicegl.cpp index 4f4e7847a1..cc3177a2e6 100644 --- a/projects/clr/rocclr/device/pal/paldevicegl.cpp +++ b/projects/clr/rocclr/device/pal/paldevicegl.cpp @@ -782,8 +782,8 @@ bool Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) { // match the adapter canInteroperate = (properties().osProperties.luidHighPart == glAdapterLuid.HighPart) && - (properties().osProperties.luidLowPart == glAdapterLuid.LowPart) && - ((1 << properties().gpuIndex) == glChainBitMask); + (properties().osProperties.luidLowPart == glAdapterLuid.LowPart) && + ((1 << properties().gpuIndex) == glChainBitMask); } #else GLuint glDeviceId = 0; @@ -797,9 +797,9 @@ bool Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const if (pfnMesaGLInteropGLXQueryDeviceInfo(disp, ctx, &info) == 0) { // match the adapter canInteroperate = (properties().pciProperties.busNumber == info.pci_bus) && - (properties().pciProperties.deviceNumber == info.pci_device) && - (properties().pciProperties.functionNumber == info.pci_function) && - (static_cast(1 << properties().gpuIndex) == glChainMask); + (properties().pciProperties.deviceNumber == info.pci_device) && + (properties().pciProperties.functionNumber == info.pci_function) && + (static_cast(1 << properties().gpuIndex) == glChainMask); } } #endif diff --git a/projects/clr/rocclr/device/pal/palgpuopen.cpp b/projects/clr/rocclr/device/pal/palgpuopen.cpp index 040ff0d5df..de16c0ab0a 100644 --- a/projects/clr/rocclr/device/pal/palgpuopen.cpp +++ b/projects/clr/rocclr/device/pal/palgpuopen.cpp @@ -620,8 +620,8 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) { if (result == Pal::Result::Success) { GpuUtil::SampleTraceApiInfo sample_trace_api_info = {}; sample_trace_api_info.instructionTraceMode = (inst_tracing_enabled_) - ? GpuUtil::InstructionTraceMode::FullFrame - : GpuUtil::InstructionTraceMode::Disabled; + ? GpuUtil::InstructionTraceMode::FullFrame + : GpuUtil::InstructionTraceMode::Disabled; trace_.gpa_session_->SetSampleTraceApiInfo(sample_trace_api_info, trace_.gpa_sample_id_); } diff --git a/projects/clr/rocclr/device/pal/palkernel.cpp b/projects/clr/rocclr/device/pal/palkernel.cpp index 85e72f9540..ad351358cb 100644 --- a/projects/clr/rocclr/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/device/pal/palkernel.cpp @@ -167,7 +167,7 @@ bool HSAILKernel::init() { // Find total workgroup size if (workGroupInfo_.compileSize_[0] != 0) { workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * - workGroupInfo_.compileSize_[2]; + workGroupInfo_.compileSize_[2]; } else { workGroupInfo_.size_ = device().info().preferredWorkGroupSize_; } diff --git a/projects/clr/rocclr/device/pal/palmemory.cpp b/projects/clr/rocclr/device/pal/palmemory.cpp index 5a167dc826..aae169e70c 100644 --- a/projects/clr/rocclr/device/pal/palmemory.cpp +++ b/projects/clr/rocclr/device/pal/palmemory.cpp @@ -367,8 +367,8 @@ bool Memory::createInterop() { vkRes.nt_handle_ = ((ext_memory->Type() != amd::ExternalMemory::HandleType::OpaqueFd) && (ext_memory->Type() != amd::ExternalMemory::HandleType::OpaqueWin32Kmt) && (ext_memory->Type() != amd::ExternalMemory::HandleType::D3D11ResourceKmt)) - ? true - : false; + ? true + : false; } else if (glObject != nullptr) { diff --git a/projects/clr/rocclr/device/pal/palprintf.cpp b/projects/clr/rocclr/device/pal/palprintf.cpp index cd85e64d14..0e4ec849db 100644 --- a/projects/clr/rocclr/device/pal/palprintf.cpp +++ b/projects/clr/rocclr/device/pal/palprintf.cpp @@ -289,8 +289,8 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t case 4: if (printFloat) { const float fArg = size == 2 - ? amd::half2float(*(reinterpret_cast(argument))) - : *(reinterpret_cast(argument)); + ? amd::half2float(*(reinterpret_cast(argument))) + : *(reinterpret_cast(argument)); static const char* fSpecifiers = "eEfgGa"; std::string fmtF = fmt; size_t posS = fmtF.find_first_of("%"); @@ -327,13 +327,12 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t hhFmt.erase(hhFmt.find_first_of("h"), 2); amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); } else if (hlModifier) { - amd::Os::printf(hlFmt.data(), - size == 2 ? *(reinterpret_cast(argument)) - : *(reinterpret_cast(argument))); + amd::Os::printf(hlFmt.data(), size == 2 + ? *(reinterpret_cast(argument)) + : *(reinterpret_cast(argument))); } else { - amd::Os::printf(fmt.data(), - size == 2 ? *(reinterpret_cast(argument)) - : *(reinterpret_cast(argument))); + amd::Os::printf(fmt.data(), size == 2 ? *(reinterpret_cast(argument)) + : *(reinterpret_cast(argument))); } } break; diff --git a/projects/clr/rocclr/device/pal/palresource.cpp b/projects/clr/rocclr/device/pal/palresource.cpp index 74c7344249..f1a2bba7b7 100644 --- a/projects/clr/rocclr/device/pal/palresource.cpp +++ b/projects/clr/rocclr/device/pal/palresource.cpp @@ -305,7 +305,7 @@ Resource::Resource(const Device& gpuDev, size_t size) desc_.state_ = 0; desc_.type_ = Empty; desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) / - Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); + Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); desc_.height_ = 1; desc_.depth_ = 1; desc_.mipLevels_ = 1; @@ -859,9 +859,8 @@ bool Resource::CreateInterop(CreateParams* params) { size_t imageSize; size_t gpuMemSize; - if (Pal::Result::Success != - dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize, - &imgCreateInfo)) { + if (Pal::Result::Success != dev().iDev()->GetExternalSharedImageSizes( + imgOpenInfo, &imageSize, &gpuMemSize, &imgCreateInfo)) { return false; } @@ -1327,8 +1326,8 @@ bool Resource::create(MemoryType memType, CreateParams* params, bool forceLinear createInfo.size = desc().width_ * elementSize_; createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); createInfo.alignment = (params && params->alignment_ != 0) - ? params->alignment_ - : (desc().scratch_ ? 64 * Ki : MaxGpuAlignment); + ? params->alignment_ + : (desc().scratch_ ? 64 * Ki : MaxGpuAlignment); createInfo.vaRange = Pal::VaRange::Default; createInfo.priority = Pal::GpuMemPriority::Normal; @@ -1388,7 +1387,7 @@ void Resource::free() { } const bool wait = (memoryType() != ImageView) && (memoryType() != ImageBuffer) && - (memoryType() != ImageExternalBuffer) && (memoryType() != View); + (memoryType() != ImageExternalBuffer) && (memoryType() != View); // OCL has to wait, even if resource is placed in the cache, since reallocation can occur // and resource can be reused on another async queue without a wait on a busy operation @@ -1519,8 +1518,8 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, } bool cp_dma = dev().settings().disableSdma_ || - (!enableCopyRect && desc().buffer_ && dstResource.desc().buffer_ && - (size[0] < dev().settings().cpDmaCopySizeMax_)); + (!enableCopyRect && desc().buffer_ && dstResource.desc().buffer_ && + (size[0] < dev().settings().cpDmaCopySizeMax_)); if (cp_dma) { // Make sure compute is done before CP DMA start gpu.addBarrier(RgpSqqtBarrierReason::MemDependency, BarrierType::KernelToCopy); @@ -1563,9 +1562,9 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, } copyRegion.gpuMemoryOffset = gpuMemoryOffset; copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch; - copyRegion.gpuMemoryDepthPitch = (srcOrigin[2]) - ? srcOrigin[2] - : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; + copyRegion.gpuMemoryDepthPitch = + (srcOrigin[2]) ? srcOrigin[2] + : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; gpu.iCmd()->CmdCopyMemoryToImage(*iMem(), *dstResource.image_, imgLayout, 1, ©Region); } else if (!desc().buffer_ && dstResource.desc().buffer_) { Pal::MemoryImageCopyRegion copyRegion = {}; @@ -1588,9 +1587,9 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, } copyRegion.gpuMemoryOffset = gpuMemoryOffset; copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch; - copyRegion.gpuMemoryDepthPitch = (dstOrigin[2]) - ? dstOrigin[2] - : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; + copyRegion.gpuMemoryDepthPitch = + (dstOrigin[2]) ? dstOrigin[2] + : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, ©Region); } else { if (enableCopyRect) { diff --git a/projects/clr/rocclr/device/pal/palresource.hpp b/projects/clr/rocclr/device/pal/palresource.hpp index 202e6311e4..84099a77e3 100644 --- a/projects/clr/rocclr/device/pal/palresource.hpp +++ b/projects/clr/rocclr/device/pal/palresource.hpp @@ -424,7 +424,7 @@ class Resource : public amd::HeapObject { memRef_ = viewOwner_->memRef_; memRef_->retain(); desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) / - Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); + Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); setBusy(*memRef()->gpu_, GpuEvent::InvalidID); } } diff --git a/projects/clr/rocclr/device/pal/palsettings.cpp b/projects/clr/rocclr/device/pal/palsettings.cpp index fecbe704c7..6e8593d343 100644 --- a/projects/clr/rocclr/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/device/pal/palsettings.cpp @@ -341,9 +341,8 @@ bool Settings::create(const Pal::DeviceProperties& palProp, #endif } - if (apuSystem_ && - ((heaps[Pal::GpuHeapLocal].logicalSize + heaps[Pal::GpuHeapInvisible].logicalSize) < - (150 * Mi))) { + if (apuSystem_ && ((heaps[Pal::GpuHeapLocal].logicalSize + + heaps[Pal::GpuHeapInvisible].logicalSize) < (150 * Mi))) { remoteAlloc_ = true; } diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index c420b42819..610c926601 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -896,7 +896,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // \todo forces PAL to reuse CBs, but requires postamble createInfo.flags.autoMemoryReuse = false; createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc; - createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = VirtualGPU::Queue::MaxCommands * + createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = + VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0) + ((dev().captureMgr() != nullptr) ? 512 : 0)); createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = dev().settings().maxCmdBuffers_ * createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize; @@ -925,8 +926,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, uint idx = index() % dev().numComputeEngines(); uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs - ? 0 - : (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2); + ? 0 + : (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2); uint max_cmd_buffers = dev().settings().maxCmdBuffers_; if (dev().numComputeEngines()) { @@ -937,8 +938,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, } const auto& info = dev().QueuePool().find(queues_[MainEngine]->iQueue_); hwRing_ = (info != dev().QueuePool().end()) - ? info->second->index_ - : (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES; + ? info->second->index_ + : (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES; // Check if device has SDMA engines if (dev().numDMAEngines() != 0 && !dev().settings().disableSdma_) { @@ -2158,7 +2159,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) { amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(vcmd.dst()); assert(dstMemory && "No svm Buffer to fill with!"); size_t offset = reinterpret_cast(vcmd.dst()) - - reinterpret_cast(dstMemory->getSvmPtr()); + reinterpret_cast(dstMemory->getSvmPtr()); pal::Memory* memory = dev().getGpuMemory(dstMemory); @@ -2828,15 +2829,13 @@ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) { if (cmd.semaphoreCmd() == amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE) { flushDMA(MainEngine); - if (Pal::Result::Success != - queues_[MainEngine]->iQueue_->SignalQueueSemaphore(const_cast(sem), - cmd.fence())) { + if (Pal::Result::Success != queues_[MainEngine]->iQueue_->SignalQueueSemaphore( + const_cast(sem), cmd.fence())) { LogError("Failed to signal external semaphore"); } } else { - if (Pal::Result::Success != - queues_[MainEngine]->iQueue_->WaitQueueSemaphore(const_cast(sem), - cmd.fence())) { + if (Pal::Result::Success != queues_[MainEngine]->iQueue_->WaitQueueSemaphore( + const_cast(sem), cmd.fence())) { LogError("Failed to wait on external semaphore"); } } @@ -3657,9 +3656,8 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p //! Note: SVM with subbuffers has an issue with tracking. //! Conformance can send read only subbuffer, but update the region //! in the kernel. - if ((mem != nullptr) && - ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) || - ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) { + if ((mem != nullptr) && ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) || + ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) { mem->signalWrite(&dev()); } if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) { diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 70a79233c9..9005eede4b 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -1709,8 +1709,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, } else { size_t totalSize = size[0]; // Do a staging copy - bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ || - (totalSize <= dev().settings().sdmaCopyThreshold_) || + bool useShaderCopyPath = + setup_.disableHwlCopyBuffer_ || (totalSize <= dev().settings().sdmaCopyThreshold_) || (copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT); if (!useShaderCopyPath) { @@ -1843,8 +1843,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo } else { size_t totalSize = size[0]; // Do a staging copy - bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ || - (totalSize <= dev().settings().sdmaCopyThreshold_) || + bool useShaderCopyPath = + setup_.disableHwlCopyBuffer_ || (totalSize <= dev().settings().sdmaCopyThreshold_) || (copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT); if (!useShaderCopyPath) { @@ -2014,18 +2014,18 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern for (auto& packed_obj : packed_vector) { constexpr uint32_t kFillType = FillBufferAligned; uint32_t kpattern_size = (packed_obj.pattern_expanded_) - ? HostBlitManager::FillBufferInfo::kExtendedSize - : patternSize; + ? HostBlitManager::FillBufferInfo::kExtendedSize + : patternSize; size_t kfill_size = packed_obj.fill_size_ / kpattern_size; size_t koffset = overall_offset; overall_offset += packed_obj.fill_size_; size_t globalWorkOffset[3] = {0, 0, 0}; - uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) - : (kpattern_size & 0x7) == 0 ? sizeof(uint64_t) - : (kpattern_size & 0x3) == 0 ? sizeof(uint32_t) - : (kpattern_size & 0x1) == 0 ? sizeof(uint16_t) - : sizeof(uint8_t); + uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) + : (kpattern_size & 0x7) == 0 ? sizeof(uint64_t) + : (kpattern_size & 0x3) == 0 ? sizeof(uint32_t) + : (kpattern_size & 0x1) == 0 ? sizeof(uint16_t) + : sizeof(uint8_t); // Program kernels arguments for the fill operation cl_mem mem = as_cl(memory.owner()); setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem, koffset); @@ -2096,10 +2096,10 @@ bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern size_t globalWorkSize[3] = {amd::alignUp(fillSizeX, 16), amd::alignUp(fillSizeY, 16), 1}; size_t localWorkSize[3] = {16, 16, 1}; - uint32_t alignment = (patternSize & 0x7) == 0 ? sizeof(uint64_t) - : (patternSize & 0x3) == 0 ? sizeof(uint32_t) - : (patternSize & 0x1) == 0 ? sizeof(uint16_t) - : sizeof(uint8_t); + uint32_t alignment = (patternSize & 0x7) == 0 ? sizeof(uint64_t) + : (patternSize & 0x3) == 0 ? sizeof(uint32_t) + : (patternSize & 0x1) == 0 ? sizeof(uint16_t) + : sizeof(uint8_t); cl_mem mem = as_cl(memory.owner()); if (alignment == sizeof(uint64_t)) { @@ -2250,8 +2250,8 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds bool ipcShared = srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared(); - bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ || - (sizeIn[0] <= dev().settings().sdmaCopyThreshold_) || + bool useShaderCopyPath = + setup_.disableHwlCopyBuffer_ || (sizeIn[0] <= dev().settings().sdmaCopyThreshold_) || (!(p2p || ipcShared) && (!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess() && !(copyMetadata.copyEnginePreference_ == @@ -2307,9 +2307,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, constexpr size_t kFillImageThreshold = 256 * 256; // Use host fill if memory has direct access and image is small - if (setup_.disableFillImage_ || - (gpuMem(memory).isHostMemDirectAccess() && - (size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) { + if (setup_.disableFillImage_ || (gpuMem(memory).isHostMemDirectAccess() && + (size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); result = HostBlitManager::fillImage(memory, pattern, origin, size, entire); @@ -2691,8 +2690,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, hsa_queue_t* schedulerQueue, amd::NDRangeContainer ndrange(1, globalWorkOffset, globalWorkSize, localWorkSize); - device::Kernel* devKernel = const_cast( - kernels_[Scheduler]->getDeviceKernel(dev())); + device::Kernel* devKernel = + const_cast(kernels_[Scheduler]->getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 2d66455eb6..0c36b9b209 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -376,8 +376,8 @@ hsa_ven_amd_loader_1_00_pfn_t Device::amd_loader_ext_table = {nullptr}; hsa_status_t Device::loaderQueryHostAddress(const void* device, const void** host) { return amd_loader_ext_table.hsa_ven_amd_loader_query_host_address - ? amd_loader_ext_table.hsa_ven_amd_loader_query_host_address(device, host) - : HSA_STATUS_ERROR; + ? amd_loader_ext_table.hsa_ven_amd_loader_query_host_address(device, host) + : HSA_STATUS_ERROR; } // ================================================================================================ @@ -413,9 +413,9 @@ bool Device::init() { return false; } - std::string ordinals = amd::IS_HIP - ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES) - : GPU_DEVICE_ORDINAL; + std::string ordinals = + amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES) + : GPU_DEVICE_ORDINAL; if (ordinals[0] != '\0') { size_t pos = 0; std::vector valid_agents; @@ -573,9 +573,9 @@ bool Device::create() { return false; } - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID, - &pciDeviceId_)) { + if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_, + (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID, + &pciDeviceId_)) { LogPrintfError("Unable to get PCI ID of HSA device %s", agent_name); return false; } @@ -584,35 +584,34 @@ bool Device::create() { uint count; hsa_isa_t first_isa; } agent_isas = {0, {0}}; - if (HSA_STATUS_SUCCESS != - hsa_agent_iterate_isas( - bkendDevice_, - [](hsa_isa_t isa, void* data) { - agent_isas_t* agent_isas = static_cast(data); - if (agent_isas->count++ == 0) { - agent_isas->first_isa = isa; - } - return HSA_STATUS_SUCCESS; - }, - &agent_isas)) { + if (HSA_STATUS_SUCCESS != hsa_agent_iterate_isas( + bkendDevice_, + [](hsa_isa_t isa, void* data) { + agent_isas_t* agent_isas = static_cast(data); + if (agent_isas->count++ == 0) { + agent_isas->first_isa = isa; + } + return HSA_STATUS_SUCCESS; + }, + &agent_isas)) { LogPrintfError("Unable to iterate supported ISAs for HSA device %s (PCI ID %x)", agent_name, pciDeviceId_); return false; } uint32_t isa_name_length = 0; - if (HSA_STATUS_SUCCESS != - hsa_isa_get_info_alt(agent_isas.first_isa, (hsa_isa_info_t)HSA_ISA_INFO_NAME_LENGTH, - &isa_name_length)) { + if (HSA_STATUS_SUCCESS != hsa_isa_get_info_alt(agent_isas.first_isa, + (hsa_isa_info_t)HSA_ISA_INFO_NAME_LENGTH, + &isa_name_length)) { LogPrintfError("Unable to get ISA name length for HSA device %s (PCI ID %x)", agent_name, pciDeviceId_); return false; } std::vector isa_name(isa_name_length + 1, '\0'); - if (HSA_STATUS_SUCCESS != - hsa_isa_get_info_alt(agent_isas.first_isa, (hsa_isa_info_t)HSA_ISA_INFO_NAME, - isa_name.data())) { + if (HSA_STATUS_SUCCESS != hsa_isa_get_info_alt(agent_isas.first_isa, + (hsa_isa_info_t)HSA_ISA_INFO_NAME, + isa_name.data())) { LogPrintfError("Unable to get ISA name for HSA device %s (PCI ID %x)", agent_name, pciDeviceId_); return false; @@ -663,10 +662,9 @@ bool Device::create() { assert(!settings_); roc::Settings* hsaSettings = new roc::Settings(); settings_ = hsaSettings; - if (!hsaSettings || - !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa, - isa->xnack() == amd::Isa::Feature::Enabled, coop_groups, isXgmi_, - hasValidHDPFlush)) { + if (!hsaSettings || !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa, + isa->xnack() == amd::Isa::Feature::Enabled, coop_groups, + isXgmi_, hasValidHDPFlush)) { LogPrintfError("Unable to create settings for HSA device %s (PCI ID %x)", agent_name, pciDeviceId_); return false; @@ -969,11 +967,11 @@ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) void Sampler::fillSampleDescriptor(hsa_ext_sampler_descriptor_v2_t& samplerDescriptor, const amd::Sampler& sampler) const { samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST - ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST - : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR; + ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST + : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR; samplerDescriptor.coordinate_mode = sampler.normalizedCoords() - ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED - : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; + ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED + : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; for (int i = 0; i < 3; i++) { switch (sampler.addressingMode(i)) { case CL_ADDRESS_CLAMP_TO_EDGE: @@ -1036,9 +1034,9 @@ bool Device::populateOCLDeviceConstants() { ::strncpy(info_.name_, isa().targetId(), sizeof(info_.name_) - 1); char device_name[64] = {0}; - if (HSA_STATUS_SUCCESS == - hsa_agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, - device_name)) { + if (HSA_STATUS_SUCCESS == hsa_agent_get_info(bkendDevice_, + (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, + device_name)) { ::strncpy(info_.boardName_, device_name, sizeof(info_.boardName_) - 1); } @@ -1075,9 +1073,9 @@ bool Device::populateOCLDeviceConstants() { info_.maxPhysicalComputeUnits_ = settings().enableWgpMode_ ? info_.maxPhysicalComputeUnits_ / 2 : info_.maxPhysicalComputeUnits_; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, - &info_.globalMemCacheLineSize_)) { + if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_, + (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, + &info_.globalMemCacheLineSize_)) { return false; } info_.globalMemCacheLineSize_ = @@ -1152,9 +1150,8 @@ bool Device::populateOCLDeviceConstants() { checkAtomicSupport(); assert(cpu_agent_info_->fine_grain_pool.handle != 0); - if (HSA_STATUS_SUCCESS != - hsa_amd_agent_iterate_memory_pools(bkendDevice_, Device::iterateGpuMemoryPoolCallback, - this)) { + if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools( + bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) { return false; } @@ -1188,9 +1185,9 @@ bool Device::populateOCLDeviceConstants() { } size_t group_segment_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_amd_memory_pool_get_info(group_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, - &group_segment_size)) { + if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(group_segment_, + HSA_AMD_MEMORY_POOL_INFO_SIZE, + &group_segment_size)) { return false; } assert(group_segment_size > 0); @@ -1229,16 +1226,16 @@ bool Device::populateOCLDeviceConstants() { if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { size_t global_segment_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, - &global_segment_size)) { + if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_, + HSA_AMD_MEMORY_POOL_INFO_SIZE, + &global_segment_size)) { return false; } assert(global_segment_size > 0); info_.globalMemSize_ = (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * static_cast(global_segment_size)) / - 100u; + 100u; // For APU with vram size <= 512MiB, use a smaller single alloc percentage if (info_.globalMemSize_ <= 536870912) { @@ -1266,7 +1263,7 @@ bool Device::populateOCLDeviceConstants() { info_.globalMemSize_ = std::max(info_.globalMemSize_, uint64_t(1 * Gi)); info_.globalMemSize_ = (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * static_cast(info_.globalMemSize_)) / - 100u; + 100u; info_.maxMemAllocSize_ = uint64_t(info_.globalMemSize_ * std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); @@ -1325,8 +1322,8 @@ bool Device::populateOCLDeviceConstants() { info_.hostUnifiedMemory_ = 1; info_.iommuv2_ = true; } - info_.memBaseAddrAlign_ = 8 * - (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? sizeof(int64_t[16]) * 2 : MEMOBJ_BASE_ADDR_ALIGN); + info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? sizeof(int64_t[16]) * 2 + : MEMOBJ_BASE_ADDR_ALIGN); info_.minDataTypeAlignSize_ = sizeof(int64_t[16]); info_.maxConstantArgs_ = 8; @@ -1629,14 +1626,14 @@ bool Device::populateOCLDeviceConstants() { if (getIsaMeta(std::move(isa().isaName()), isaMeta)) { std::string addressableNumVGPRs, totalNumVGPRs, vGPRAllocGranule; info_.availableVGPRs_ = getValueFromIsaMeta(isaMeta, "AddressableNumVGPRs", addressableNumVGPRs) - ? atoi(addressableNumVGPRs.c_str()) - : 0; + ? atoi(addressableNumVGPRs.c_str()) + : 0; info_.vgprsPerSimd_ = getValueFromIsaMeta(isaMeta, "TotalNumVGPRs", totalNumVGPRs) - ? atoi(totalNumVGPRs.c_str()) - : 0; + ? atoi(totalNumVGPRs.c_str()) + : 0; info_.vgprAllocGranularity_ = getValueFromIsaMeta(isaMeta, "VGPRAllocGranule", vGPRAllocGranule) - ? atoi(vGPRAllocGranule.c_str()) - : 0; + ? atoi(vGPRAllocGranule.c_str()) + : 0; info_.availableRegistersPerCU_ = info_.vgprsPerSimd_ * info_.simdPerCU_ * info_.wavefrontWidth_; ClPrint(amd::LOG_INFO, amd::LOG_INIT, @@ -1647,8 +1644,8 @@ bool Device::populateOCLDeviceConstants() { std::string sgprValue; info_.availableSGPRs_ = (getValueFromIsaMeta(isaMeta, "AddressableNumSGPRs", sgprValue)) - ? (atoi(sgprValue.c_str())) - : 0; + ? (atoi(sgprValue.c_str())) + : 0; if (!releaseIsaMeta(isaMeta)) { LogInfo("Can not release the isa meta node"); } @@ -1663,9 +1660,8 @@ bool Device::populateOCLDeviceConstants() { } // This capability should be available with xnack enabled - if (HSA_STATUS_SUCCESS != - hsa_system_get_info(HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, - &info_.hmmCpuMemoryAccessible_)) { + if (HSA_STATUS_SUCCESS != hsa_system_get_info(HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, + &info_.hmmCpuMemoryAccessible_)) { LogError("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT query failed."); } @@ -1805,9 +1801,9 @@ bool Device::bindExternalDevice(uint flags, void* const gfxDevice[], void* gfxCo } return info_.deviceTopology_.pcie.bus == info.pci_bus && - info_.deviceTopology_.pcie.device == info.pci_device && - info_.deviceTopology_.pcie.function == info.pci_function && - info_.vendorId_ == info.vendor_id && pciDeviceId_ == info.device_id; + info_.deviceTopology_.pcie.device == info.pci_device && + info_.deviceTopology_.pcie.function == info.pci_function && + info_.vendorId_ == info.vendor_id && pciDeviceId_ == info.device_id; #endif } @@ -2224,10 +2220,10 @@ void Device::releaseMemory(void* ptr, size_t size) const { void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain, bool contiguous) const { - const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle) - ? gpu_ext_fine_grained_segment_ - : (atomics && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_ - : gpuvm_segment_; + const hsa_amd_memory_pool_t& pool = + (pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle) ? gpu_ext_fine_grained_segment_ + : (atomics && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_ + : gpuvm_segment_; if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) { DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", pool.handle, @@ -2474,9 +2470,8 @@ bool Device::SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryA amd::Memory* svm_mem = amd::MemObjMap::FindMemObj(dev_ptr); if ((nullptr == svm_mem) || ((svm_mem->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) == 0) || // Validate the range of provided memory - ((svm_mem->getSize() - - (reinterpret_cast(dev_ptr) - - reinterpret_cast
(svm_mem->getSvmPtr()))) < count)) { + ((svm_mem->getSize() - (reinterpret_cast(dev_ptr) - + reinterpret_cast
(svm_mem->getSvmPtr()))) < count)) { LogPrintfError("SetSvmAttributes received unknown memory for update: %p!", dev_ptr); return false; } @@ -2565,9 +2560,8 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes, amd::Memory* svm_mem = amd::MemObjMap::FindMemObj(dev_ptr); if ((nullptr == svm_mem) || ((svm_mem->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) == 0) || // Validate the range of provided memory - ((svm_mem->getSize() - - (reinterpret_cast(dev_ptr) - - reinterpret_cast
(svm_mem->getSvmPtr()))) < count)) { + ((svm_mem->getSize() - (reinterpret_cast(dev_ptr) - + reinterpret_cast
(svm_mem->getSvmPtr()))) < count)) { LogPrintfError("GetSvmAttributes received unknown memory %p for state!", dev_ptr); return false; } @@ -3493,9 +3487,8 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer } if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) { - if ((size != 0) && - ((reinterpret_cast(dev_ptr) - - reinterpret_cast(ptr_info->agentBaseAddress)) > size)) { + if ((size != 0) && ((reinterpret_cast(dev_ptr) - + reinterpret_cast(ptr_info->agentBaseAddress)) > size)) { return false; } return true; diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp index d66228b114..f85a1680d9 100644 --- a/projects/clr/rocclr/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp @@ -835,9 +835,8 @@ bool Buffer::create(bool alloc_local) { } else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) { // TODO: ROCr will introduce a new attribute enum that implies a non-blocking signal, // replace "HSA_AMD_SIGNAL_AMD_GPU_ONLY" with this new enum when it is ready. - if (HSA_STATUS_SUCCESS != - hsa_amd_signal_create(kInitSignalValueOne, 0, nullptr, HSA_AMD_SIGNAL_AMD_GPU_ONLY, - &signal_)) { + if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(kInitSignalValueOne, 0, nullptr, + HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal_)) { ClPrint(amd::LOG_ERROR, amd::LOG_MEM, "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed"); return false; @@ -1316,8 +1315,8 @@ bool Image::create(bool alloc_local) { // support alignment larger than HSA memory region allocation granularity. // In this case, the user manages the alignment. const size_t alloc_size = (deviceImageInfo_.alignment <= dev().alloc_granularity()) - ? deviceImageInfo_.size - : deviceImageInfo_.size + deviceImageInfo_.alignment; + ? deviceImageInfo_.size + : deviceImageInfo_.size + deviceImageInfo_.alignment; if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) { originalDeviceMemory_ = dev().deviceLocalAlloc(alloc_size); @@ -1357,8 +1356,8 @@ bool Image::createView(const Memory& parent) { deviceMemory_ = parent.getDeviceMemory(); originalDeviceMemory_ = (parent.owner()->asBuffer() != nullptr) - ? deviceMemory_ - : static_cast(parent).originalDeviceMemory_; + ? deviceMemory_ + : static_cast(parent).originalDeviceMemory_; // Detect image view from buffer to distinguish linear paths from tiled. amd::Memory* ancestor = parent.owner(); @@ -1411,10 +1410,10 @@ bool Image::createView(const Memory& parent) { break; } hsa_ext_image_t hsaImage; - if (HSA_STATUS_SUCCESS == - hsa_ext_image_create_with_layout( - dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, permission_, - HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, tryPitch, 0, &hsaImage)) { + if (HSA_STATUS_SUCCESS == hsa_ext_image_create_with_layout( + dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, + permission_, HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, tryPitch, 0, + &hsaImage)) { // The image pitch from app is not expectation of the GPU LogWarning("[OCL] will use copy image"); workaround = true; diff --git a/projects/clr/rocclr/device/rocm/rocmemory.hpp b/projects/clr/rocclr/device/rocm/rocmemory.hpp index 7e60b4217e..dfeb56de9e 100644 --- a/projects/clr/rocclr/device/rocm/rocmemory.hpp +++ b/projects/clr/rocclr/device/rocm/rocmemory.hpp @@ -153,10 +153,10 @@ class Memory : public device::Memory { // Get MemorySegment type in terms of host memory allocation flags Device::MemorySegment getHostMemorySegment(const unsigned int memFlags) { - return (memFlags & CL_MEM_SVM_ATOMICS) == 0 - ? Device::MemorySegment::kNoAtomics - : ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ? Device::MemorySegment::kUncachedAtomics - : Device::MemorySegment::kAtomics); + return (memFlags & CL_MEM_SVM_ATOMICS) == 0 ? Device::MemorySegment::kNoAtomics + : ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 + ? Device::MemorySegment::kUncachedAtomics + : Device::MemorySegment::kAtomics); } private: diff --git a/projects/clr/rocclr/device/rocm/rocprintf.cpp b/projects/clr/rocclr/device/rocm/rocprintf.cpp index d92521f6d7..4ed427f3a0 100644 --- a/projects/clr/rocclr/device/rocm/rocprintf.cpp +++ b/projects/clr/rocclr/device/rocm/rocprintf.cpp @@ -177,8 +177,8 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t case 4: if (printFloat) { const float fArg = size == 2 - ? amd::half2float(*(reinterpret_cast(argument))) - : *(reinterpret_cast(argument)); + ? amd::half2float(*(reinterpret_cast(argument))) + : *(reinterpret_cast(argument)); static const char* fSpecifiers = "eEfgGa"; std::string fmtF = fmt; size_t posS = fmtF.find_first_of("%"); @@ -216,13 +216,12 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t hhFmt.erase(hhFmt.find_first_of("h"), 2); amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); } else if (hlModifier) { - amd::Os::printf(hlFmt.data(), - size == 2 ? *(reinterpret_cast(argument)) - : *(reinterpret_cast(argument))); + amd::Os::printf(hlFmt.data(), size == 2 + ? *(reinterpret_cast(argument)) + : *(reinterpret_cast(argument))); } else { - amd::Os::printf(fmt.data(), - size == 2 ? *(reinterpret_cast(argument)) - : *(reinterpret_cast(argument))); + amd::Os::printf(fmt.data(), size == 2 ? *(reinterpret_cast(argument)) + : *(reinterpret_cast(argument))); } } break; diff --git a/projects/clr/rocclr/device/rocm/rocsched.hpp b/projects/clr/rocclr/device/rocm/rocsched.hpp index a4cc296e83..819a53a4d9 100644 --- a/projects/clr/rocclr/device/rocm/rocsched.hpp +++ b/projects/clr/rocclr/device/rocm/rocsched.hpp @@ -57,8 +57,8 @@ struct AmdAqlWrap { // ItÂ’s incremented on the // start and decremented on the finish. The parent kernel can be // considered as done when the value is 0 and the state is DONE - uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) - uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) + uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) + uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects uint32_t reserved[5]; //!< For the future usage diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index 52184864dc..db977a61ae 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -240,7 +240,7 @@ void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidH const uint32_t gfxStepping = isa.versionStepping(); const bool isGfx94x = gfxipMajor == 9 && gfxipMinor >= 4 && - (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2); + (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2); const bool isGfx90a = (gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10); const bool isPreGfx908 = (gfxipMajor < 9) || ((gfxipMajor == 9) && (gfxipMinor == 0) && (gfxStepping < 8)); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 9d1248c7b1..1266643e7f 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -879,11 +879,11 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para } else { ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i, desc.typeName_.c_str(), desc.name_.c_str(), - (desc.size_ == 1) ? *reinterpret_cast(srcArgPtr) - : (desc.size_ == 2) ? *reinterpret_cast(srcArgPtr) - : (desc.size_ == 4) ? *reinterpret_cast(srcArgPtr) - : (desc.size_ == 8) ? *reinterpret_cast(srcArgPtr) - : 0LL, + (desc.size_ == 1) ? *reinterpret_cast(srcArgPtr) + : (desc.size_ == 2) ? *reinterpret_cast(srcArgPtr) + : (desc.size_ == 4) ? *reinterpret_cast(srcArgPtr) + : (desc.size_ == 8) ? *reinterpret_cast(srcArgPtr) + : 0LL, desc.size_); } } @@ -1362,10 +1362,10 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), cache_state, barrier_value_packet_.signal, barrier_value_packet_.value, barrier_value_packet_.mask, - barrier_value_packet_.cond == 0 ? "EQ" - : barrier_value_packet_.cond == 1 ? "NE" - : barrier_value_packet_.cond == 2 ? "LT" - : "GTE", + barrier_value_packet_.cond == 0 ? "EQ" + : barrier_value_packet_.cond == 1 ? "NE" + : barrier_value_packet_.cond == 2 ? "LT" + : "GTE", barrier_value_packet_.completion_signal, read, index); // Clear dependent signals for the next packet barrier_value_packet_.signal = hsa_signal_t{}; @@ -1432,21 +1432,23 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, cooperative_ = cooperative; if (device.settings().fenceScopeAgent_) { - dispatchPacketHeaderNoSync_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + dispatchPacketHeaderNoSync_ = + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); dispatchPacketHeader_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); } else { - dispatchPacketHeaderNoSync_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + dispatchPacketHeaderNoSync_ = + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); dispatchPacketHeader_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); } aqlHeader_ = dispatchPacketHeader_; @@ -2091,8 +2093,8 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { // Find the requested agent for the transfer hsa_agent_t agent = (cmd.cpu_access() || (dev().settings().hmmFlags_ & Settings::Hmm::EnableSystemMemory)) - ? dev().getCpuAgent(cmd.numa_id()) - : (static_cast(cmd.device()))->getBackendDevice(); + ? dev().getCpuAgent(cmd.numa_id()) + : (static_cast(cmd.device()))->getBackendDevice(); // Initiate a prefetch command hsa_status_t status = @@ -3000,7 +3002,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) { size_t fillSize = patternSize * cmd.times(); size_t offset = reinterpret_cast(cmd.dst()) - - reinterpret_cast(dstMemory->getSvmPtr()); + reinterpret_cast(dstMemory->getSvmPtr()); Memory* memory = dev().getRocMemory(dstMemory); @@ -3567,9 +3569,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const if (aql_packet != nullptr) { *aql_packet = dispatchPacket; aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 00352abac6..3c0ddb2b6c 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -475,9 +475,9 @@ class VirtualGPU : public device::VirtualDevice { const uint8_t* aqlPacket = nullptr, bool attach_signal = false); bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true, bool attach_signal = false); - template - bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, - bool attach_signal = false); + template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, + uint16_t rest, bool blocking, + bool attach_signal = false); bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); diff --git a/projects/clr/rocclr/elf/elfio/elfio.hpp b/projects/clr/rocclr/elf/elfio/elfio.hpp index 0d97ef711f..746ae70535 100644 --- a/projects/clr/rocclr/elf/elfio/elfio.hpp +++ b/projects/clr/rocclr/elf/elfio/elfio.hpp @@ -384,10 +384,10 @@ class elfio { bool is_sect_in_seg(Elf64_Off sect_begin, Elf_Xword sect_size, Elf64_Off seg_begin, Elf64_Off seg_end) { return seg_begin <= sect_begin && sect_begin + sect_size <= seg_end && - sect_begin < - seg_end; // this is important criteria when sect_size == 0 - // Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11) - // sect_begin=12, sect_size=0 -> shall return false! + sect_begin < + seg_end; // this is important criteria when sect_size == 0 + // Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11) + // sect_begin=12, sect_size=0 -> shall return false! } //------------------------------------------------------------------------------ @@ -447,7 +447,7 @@ class elfio { section* sec = sections_.at(i); std::streampos headerPosition = (std::streamoff)header->get_sections_offset() + - header->get_section_entry_size() * sec->get_index(); + header->get_section_entry_size() * sec->get_index(); sec->save(stream, headerPosition, sec->get_offset()); } diff --git a/projects/clr/rocclr/elf/elfio/elfio_note.hpp b/projects/clr/rocclr/elf/elfio/elfio_note.hpp index f30723139d..7caacd6973 100644 --- a/projects/clr/rocclr/elf/elfio/elfio_note.hpp +++ b/projects/clr/rocclr/elf/elfio/elfio_note.hpp @@ -130,7 +130,7 @@ template class note_section_accessor_template { Elf_Word descsz = convertor(*(const Elf_Word*)(data + current + sizeof(namesz))); current += 3 * sizeof(Elf_Word) + ((namesz + align - 1) / align) * align + - ((descsz + align - 1) / align) * align; + ((descsz + align - 1) / align) * align; } } diff --git a/projects/clr/rocclr/elf/elfio/elfio_relocation.hpp b/projects/clr/rocclr/elf/elfio/elfio_relocation.hpp index 4def576840..e867930099 100644 --- a/projects/clr/rocclr/elf/elfio/elfio_relocation.hpp +++ b/projects/clr/rocclr/elf/elfio/elfio_relocation.hpp @@ -104,8 +104,8 @@ template class relocation_section_accessor_template { unsigned char other; symbol_section_accessor symbols(elf_file, elf_file.sections[get_symbol_table_index()]); - ret = ret && - symbols.get_symbol(symbol, symbolName, symbolValue, size, bind, symbolType, section, other); + ret = ret && symbols.get_symbol(symbol, symbolName, symbolValue, size, bind, symbolType, + section, other); if (ret) { // Was it successful? switch (type) { @@ -207,9 +207,9 @@ template class relocation_section_accessor_template { Elf_Half get_symbol_table_index() const { return (Elf_Half)relocation_section->get_link(); } //------------------------------------------------------------------------------ - template - void generic_get_entry_rel(Elf_Xword index, Elf64_Addr& offset, Elf_Word& symbol, Elf_Word& type, - Elf_Sxword& addend) const { + template void generic_get_entry_rel(Elf_Xword index, Elf64_Addr& offset, + Elf_Word& symbol, Elf_Word& type, + Elf_Sxword& addend) const { const endianess_convertor& convertor = elf_file.get_convertor(); const T* pEntry = reinterpret_cast(relocation_section->get_data() + @@ -222,9 +222,9 @@ template class relocation_section_accessor_template { } //------------------------------------------------------------------------------ - template - void generic_get_entry_rela(Elf_Xword index, Elf64_Addr& offset, Elf_Word& symbol, Elf_Word& type, - Elf_Sxword& addend) const { + template void generic_get_entry_rela(Elf_Xword index, Elf64_Addr& offset, + Elf_Word& symbol, Elf_Word& type, + Elf_Sxword& addend) const { const endianess_convertor& convertor = elf_file.get_convertor(); const T* pEntry = reinterpret_cast(relocation_section->get_data() + diff --git a/projects/clr/rocclr/elf/elfio/elfio_symbols.hpp b/projects/clr/rocclr/elf/elfio/elfio_symbols.hpp index ff5fa51ba9..a937bbd5b6 100644 --- a/projects/clr/rocclr/elf/elfio/elfio_symbols.hpp +++ b/projects/clr/rocclr/elf/elfio/elfio_symbols.hpp @@ -255,10 +255,10 @@ template class symbol_section_accessor_template { } //------------------------------------------------------------------------------ - template - bool generic_get_symbol(Elf_Xword index, std::string& name, Elf64_Addr& value, Elf_Xword& size, - unsigned char& bind, unsigned char& type, Elf_Half& section_index, - unsigned char& other) const { + template bool generic_get_symbol(Elf_Xword index, std::string& name, Elf64_Addr& value, + Elf_Xword& size, unsigned char& bind, + unsigned char& type, Elf_Half& section_index, + unsigned char& other) const { bool ret = false; if (0 != symbol_section->get_data() && index < get_symbols_num()) { @@ -287,9 +287,9 @@ template class symbol_section_accessor_template { } //------------------------------------------------------------------------------ - template - Elf_Word generic_add_symbol(Elf_Word name, Elf64_Addr value, Elf_Xword size, unsigned char info, - unsigned char other, Elf_Half shndx) { + template Elf_Word generic_add_symbol(Elf_Word name, Elf64_Addr value, Elf_Xword size, + unsigned char info, unsigned char other, + Elf_Half shndx) { const endianess_convertor& convertor = elf_file.get_convertor(); T entry; diff --git a/projects/clr/rocclr/elf/elfio/elfio_utils.hpp b/projects/clr/rocclr/elf/elfio/elfio_utils.hpp index de6d3f2637..438dc157f9 100644 --- a/projects/clr/rocclr/elf/elfio/elfio_utils.hpp +++ b/projects/clr/rocclr/elf/elfio/elfio_utils.hpp @@ -66,9 +66,9 @@ class endianess_convertor { return value; } value = ((value & 0x00000000000000FFull) << 56) | ((value & 0x000000000000FF00ull) << 40) | - ((value & 0x0000000000FF0000ull) << 24) | ((value & 0x00000000FF000000ull) << 8) | - ((value & 0x000000FF00000000ull) >> 8) | ((value & 0x0000FF0000000000ull) >> 24) | - ((value & 0x00FF000000000000ull) >> 40) | ((value & 0xFF00000000000000ull) >> 56); + ((value & 0x0000000000FF0000ull) << 24) | ((value & 0x00000000FF000000ull) << 8) | + ((value & 0x000000FF00000000ull) >> 8) | ((value & 0x0000FF0000000000ull) >> 24) | + ((value & 0x00FF000000000000ull) >> 40) | ((value & 0xFF00000000000000ull) >> 56); return value; } @@ -87,7 +87,7 @@ class endianess_convertor { return value; } value = ((value & 0x000000FF) << 24) | ((value & 0x0000FF00) << 8) | - ((value & 0x00FF0000) >> 8) | ((value & 0xFF000000) >> 24); + ((value & 0x00FF0000) >> 8) | ((value & 0xFF000000) >> 24); return value; } diff --git a/projects/clr/rocclr/os/os.hpp b/projects/clr/rocclr/os/os.hpp index 73dd6d3e4d..a65d454fdf 100644 --- a/projects/clr/rocclr/os/os.hpp +++ b/projects/clr/rocclr/os/os.hpp @@ -471,7 +471,7 @@ inline void Os::ThreadAffinityMask::clear(uint cpu) { inline bool Os::ThreadAffinityMask::isSet(uint cpu) const { return (KAFFINITY)0 != - (mask_[cpu / (8 * sizeof(KAFFINITY))] & ((KAFFINITY)1 << (cpu % (8 * sizeof(KAFFINITY))))); + (mask_[cpu / (8 * sizeof(KAFFINITY))] & ((KAFFINITY)1 << (cpu % (8 * sizeof(KAFFINITY))))); } inline bool Os::ThreadAffinityMask::isEmpty() const { diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index c0b4f69894..c52b3e1e19 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -301,10 +301,9 @@ const Event::EventWaitList Event::nullWaitList(0); // ================================================================================================ Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList, uint32_t commandWaitBits, const Event* waitingEvent) - : Event(queue, - amd::activity_prof::IsEnabled(amd::activity_prof::OperationId(type)) || - queue.properties().test(CL_QUEUE_PROFILING_ENABLE) || - Agent::shouldPostEventEvents()), + : Event(queue, amd::activity_prof::IsEnabled(amd::activity_prof::OperationId(type)) || + queue.properties().test(CL_QUEUE_PROFILING_ENABLE) || + Agent::shouldPostEventEvents()), queue_(&queue), next_(nullptr), type_(type), @@ -604,24 +603,24 @@ bool CopyMemoryCommand::isEntireMemory() const { Coord3D imageSize(size()[0] * size()[1] * size()[2] * source().asImage()->getImageFormat().getElementSize()); result = source().isEntirelyCovered(srcOrigin(), size()) && - destination().isEntirelyCovered(dstOrigin(), imageSize); + destination().isEntirelyCovered(dstOrigin(), imageSize); } break; case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { Coord3D imageSize(size()[0] * size()[1] * size()[2] * destination().asImage()->getImageFormat().getElementSize()); result = source().isEntirelyCovered(srcOrigin(), imageSize) && - destination().isEntirelyCovered(dstOrigin(), size()); + destination().isEntirelyCovered(dstOrigin(), size()); } break; case CL_COMMAND_COPY_BUFFER_RECT: { Coord3D rectSize(size()[0] * size()[1] * size()[2]); Coord3D srcOffs(srcRect().start_); Coord3D dstOffs(dstRect().start_); result = source().isEntirelyCovered(srcOffs, rectSize) && - destination().isEntirelyCovered(dstOffs, rectSize); + destination().isEntirelyCovered(dstOffs, rectSize); } break; default: result = source().isEntirelyCovered(srcOrigin(), size()) && - destination().isEntirelyCovered(dstOrigin(), size()); + destination().isEntirelyCovered(dstOrigin(), size()); break; } return result; diff --git a/projects/clr/rocclr/platform/context.cpp b/projects/clr/rocclr/platform/context.cpp index 6e34376a9f..943cc7bfc4 100644 --- a/projects/clr/rocclr/platform/context.cpp +++ b/projects/clr/rocclr/platform/context.cpp @@ -260,9 +260,8 @@ int Context::create(const intptr_t* properties) { } // Check if OCL context can be associated with any external device - if (info_.flags_ & - (D3D10DeviceKhr | D3D11DeviceKhr | GLDeviceKhr | D3D9DeviceKhr | D3D9DeviceEXKhr | - D3D9DeviceVAKhr)) { + if (info_.flags_ & (D3D10DeviceKhr | D3D11DeviceKhr | GLDeviceKhr | D3D9DeviceKhr | + D3D9DeviceEXKhr | D3D9DeviceVAKhr)) { // Loop through all devices for (const auto& it : devices_) { if (!it->bindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY)) { diff --git a/projects/clr/rocclr/platform/kernel.cpp b/projects/clr/rocclr/platform/kernel.cpp index 13b8d82d4c..fc2832dad8 100644 --- a/projects/clr/rocclr/platform/kernel.cpp +++ b/projects/clr/rocclr/platform/kernel.cpp @@ -75,10 +75,10 @@ size_t KernelParameters::localMemSize(size_t minDataTypeAlignment) const { if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { if (desc.size_ == 8) { memSize = alignUp(memSize, minDataTypeAlignment) + - *reinterpret_cast(values_ + desc.offset_); + *reinterpret_cast(values_ + desc.offset_); } else { memSize = alignUp(memSize, minDataTypeAlignment) + - *reinterpret_cast(values_ + desc.offset_); + *reinterpret_cast(values_ + desc.offset_); } } } @@ -300,10 +300,10 @@ address KernelParameters::capture(device::VirtualDevice& vDev, uint64_t lclMemSi } else if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { if (desc.size_ == 8) { lclMemSize = alignUp(lclMemSize, device.info().minDataTypeAlignSize_) + - *reinterpret_cast(values_ + desc.offset_); + *reinterpret_cast(values_ + desc.offset_); } else { lclMemSize = alignUp(lclMemSize, device.info().minDataTypeAlignSize_) + - *reinterpret_cast(values_ + desc.offset_); + *reinterpret_cast(values_ + desc.offset_); } } } diff --git a/projects/clr/rocclr/platform/kernel.hpp b/projects/clr/rocclr/platform/kernel.hpp index 6ea68f926f..bdfc6cc909 100644 --- a/projects/clr/rocclr/platform/kernel.hpp +++ b/projects/clr/rocclr/platform/kernel.hpp @@ -158,10 +158,11 @@ class KernelParameters : protected HeapObject { execNewVcop_(0), execPfpaVcop_(0), deviceKernelArgs_(false) { - totalSize_ = signature.paramsSize() + + totalSize_ = + signature.paramsSize() + (signature.numMemories() + signature.numSamplers() + signature.numQueues()) * sizeof(void*); values_ = reinterpret_cast
(this) + - alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT); + alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT); memoryObjOffset_ = signature_.paramsSize(); memoryObjects_ = reinterpret_cast(values_ + memoryObjOffset_); samplerObjOffset_ = memoryObjOffset_ + signature_.numMemories() * sizeof(amd::Memory*); @@ -186,7 +187,7 @@ class KernelParameters : protected HeapObject { execPfpaVcop_(rhs.execPfpaVcop_), deviceKernelArgs_(false) { values_ = reinterpret_cast
(this) + - alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT); + alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT); memoryObjOffset_ = signature_.paramsSize(); memoryObjects_ = reinterpret_cast(values_ + memoryObjOffset_); samplerObjOffset_ = memoryObjOffset_ + signature_.numMemories() * sizeof(amd::Memory*); @@ -223,7 +224,8 @@ class KernelParameters : protected HeapObject { //! Allocate memory for this instance as well as the required storage for // the values_, defined_, and rawPointer_ arrays. void* operator new(size_t size, const KernelSignature& signature) { - size_t requiredSize = alignUp(size, PARAMETERS_MIN_ALIGNMENT) + signature.paramsSize() + + size_t requiredSize = + alignUp(size, PARAMETERS_MIN_ALIGNMENT) + signature.paramsSize() + (signature.numMemories() + signature.numSamplers() + signature.numQueues()) * sizeof(void*); return AlignedMemory::allocate(requiredSize, PARAMETERS_MIN_ALIGNMENT); } diff --git a/projects/clr/rocclr/platform/memory.cpp b/projects/clr/rocclr/platform/memory.cpp index ff9f32252c..3dc21895b8 100644 --- a/projects/clr/rocclr/platform/memory.cpp +++ b/projects/clr/rocclr/platform/memory.cpp @@ -57,9 +57,9 @@ bool HostMemoryReference::allocateMemory(size_t size, const Context& context) { size_t memoryAlignment = (CPU_MEMORY_ALIGNMENT_SIZE <= 0) ? 256 : CPU_MEMORY_ALIGNMENT_SIZE; size_ = amd::alignUp(size, memoryAlignment); //! \note memory size must be aligned for CAL pinning - hostMem_ = CPU_MEMORY_GUARD_PAGES - ? GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN, CPU_MEMORY_GUARD_PAGE_SIZE * Ki) - : context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN); + hostMem_ = CPU_MEMORY_GUARD_PAGES ? GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN, + CPU_MEMORY_GUARD_PAGE_SIZE * Ki) + : context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN); alloced_ = (hostMem_ != NULL); return alloced_; } @@ -146,7 +146,7 @@ Memory::Memory(Memory& parent, Flags flags, size_t origin, size_t size, Type typ if ((flags_ & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) { flags_ |= parent_->getMemFlags() & - (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS); + (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS); } } @@ -590,8 +590,8 @@ bool Buffer::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) con bool Buffer::validateRegion(const Coord3D& origin, const Coord3D& region) const { return ((region[0] > 0) && (origin[0] < getSize()) && ((origin[0] + region[0]) <= getSize())) - ? true - : false; + ? true + : false; } void Pipe::initDeviceMemory() { @@ -614,7 +614,7 @@ Image::Image(const Format& format, Image& parent, uint baseMipLevel, cl_mem_flag baseMipLevel_(baseMipLevel) { if (baseMipLevel > 0) { impl_.region_.c[0] = GETMIPDIM(parent.getWidth(), baseMipLevel) * - parent.getImageFormat().getElementSize() / format.getElementSize(); + parent.getImageFormat().getElementSize() / format.getElementSize(); impl_.region_.c[1] = GETMIPDIM(parent.getHeight(), baseMipLevel); impl_.region_.c[2] = GETMIPDIM(parent.getDepth(), baseMipLevel); @@ -1030,9 +1030,9 @@ const cl_image_format Image::supportedFormats[] = { {CL_DEPTH, CL_FLOAT}, }; -const uint32_t NUM_CHANNEL_ORDER_OF_RGB = 1; // The number of channel orders of RGB at the end of - // the table supportedFormats above and before sRGB - // and depth. +const uint32_t NUM_CHANNEL_ORDER_OF_RGB = 1; // The number of channel orders of RGB at the end of + // the table supportedFormats above and before sRGB + // and depth. const uint32_t NUM_CHANNEL_ORDER_OF_sRGB = 1; // The number of channel orders of sRGB at the end of // the table supportedFormats above and before depth. const uint32_t NUM_CHANNEL_ORDER_OF_DEPTH = @@ -1246,8 +1246,8 @@ Image* Image::createView(const Context& context, const Format& format, device::V bool Image::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const { return (origin[0] == 0 && origin[1] == 0 && origin[2] == 0 && region[0] == getWidth() && region[1] == getHeight() && region[2] == getDepth()) - ? true - : false; + ? true + : false; } bool Image::validateRegion(const Coord3D& origin, const Coord3D& region) const { @@ -1255,15 +1255,15 @@ bool Image::validateRegion(const Coord3D& origin, const Coord3D& region) const { (region[0] != 0) && (origin[1] < getHeight()) && (region[1] != 0) && (origin[2] < getDepth()) && (region[2] != 0) && ((origin[0] + region[0]) <= getWidth()) && ((origin[1] + region[1]) <= getHeight()) && ((origin[2] + region[2]) <= getDepth())) - ? true - : false; + ? true + : false; } bool Image::isRowSliceValid(size_t rowPitch, size_t slice, size_t width, size_t height) const { size_t tmpHeight = (getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? 1 : height; bool valid = (rowPitch == 0) || - ((rowPitch != 0) && (rowPitch >= width * getImageFormat().getElementSize())); + ((rowPitch != 0) && (rowPitch >= width * getImageFormat().getElementSize())); return ((slice == 0) || ((slice != 0) && (slice >= rowPitch * tmpHeight))) ? valid : false; } diff --git a/projects/clr/rocclr/platform/memory.hpp b/projects/clr/rocclr/platform/memory.hpp index 1c1516036e..d77e9df51c 100644 --- a/projects/clr/rocclr/platform/memory.hpp +++ b/projects/clr/rocclr/platform/memory.hpp @@ -530,7 +530,7 @@ class Image : public Memory { //! Compare 2 image formats. bool operator==(const Format& rhs) const { return image_channel_order == rhs.image_channel_order && - image_channel_data_type == rhs.image_channel_data_type; + image_channel_data_type == rhs.image_channel_data_type; } bool operator!=(const Format& rhs) const { return !(*this == rhs); } diff --git a/projects/clr/rocclr/platform/program.cpp b/projects/clr/rocclr/platform/program.cpp index 2cf4e366f3..8f53f813ad 100644 --- a/projects/clr/rocclr/platform/program.cpp +++ b/projects/clr/rocclr/platform/program.cpp @@ -170,8 +170,8 @@ int32_t Program::addDeviceProgram(Device& device, const void* image, size_t leng } } options->oVariables->Legacy = !device.settings().useLightning_ - ? isAMDILTarget(*amd::aclutGetTargetInfo(binary)) - : isHSAILTarget(*amd::aclutGetTargetInfo(binary)); + ? isAMDILTarget(*amd::aclutGetTargetInfo(binary)) + : isHSAILTarget(*amd::aclutGetTargetInfo(binary)); amd::Hsail::BinaryFini(binary); } #endif // defined(WITH_COMPILER_LIB) @@ -522,9 +522,8 @@ int32_t Program::build(const std::vector& devices, const char* options, for (const auto& it : devices) { option::Options parsedOptions; constexpr bool LinkOptsOnly = false; - if ((language_ != HIP) && - !ParseAllOptions(cppstr, parsedOptions, optionChangable, LinkOptsOnly, - it->settings().useLightning_)) { + if ((language_ != HIP) && !ParseAllOptions(cppstr, parsedOptions, optionChangable, LinkOptsOnly, + it->settings().useLightning_)) { programLog_ = parsedOptions.optionsLog(); LogError("Parsing compile options failed."); return CL_INVALID_COMPILER_OPTIONS; diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 7148e56e73..45248fbbbb 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -21,537 +21,258 @@ #ifndef FLAGS_HPP_ #define FLAGS_HPP_ - -#define RUNTIME_FLAGS(debug, release, release_on_stg) \ - \ - release(int, AMD_LOG_LEVEL, 0, "The default log level") release( \ - uint, AMD_LOG_MASK, 0X7FFFFFFF, \ - "The mask to enable specific kinds of logs") release(cstring, AMD_LOG_LEVEL_FILE, "", \ - "Set output file for AMD_LOG_LEVEL, " \ - "Default is stderr") release(size_t, \ - AMD_LOG_LEVEL_SIZE, \ - 2048, \ - "The max " \ - "size of " \ - "AMD_LOG " \ - "generate" \ - "d in MB " \ - "if " \ - "printed " \ - "to a " \ - "file") \ - debug(uint, DEBUG_GPU_FLAGS, 0, "The debug options for GPU device") release( \ - size_t, CQ_THREAD_STACK_SIZE, 256 * Ki, /* @todo: that much! */ \ - "The default command queue thread stack size") release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ - "Maximum number of workitems in " \ - "a workgroup for GPU, 0 -use " \ - "default") \ - debug(bool, CPU_MEMORY_GUARD_PAGES, false, "Use guard pages for CPU memory") debug( \ - size_t, CPU_MEMORY_GUARD_PAGE_SIZE, \ - 64, "Size in KB of CPU memory guard page") debug(size_t, CPU_MEMORY_ALIGNMENT_SIZE, \ - 256, \ - "Size in bytes for the default " \ - "alignment for guarded memory on " \ - "CPU") debug(size_t, \ - PARAMETERS_MIN_ALIGNMENT, \ - NATIVE_ALIGNMENT_SIZE, \ - "Minimum alignment " \ - "required for the " \ - "abstract parameters " \ - "stack") debug(size_t, \ - MEMOBJ_BASE_ADDR_ALIGN, \ - 4 * Ki, \ - "Align" \ - "ment " \ - "of " \ - "the " \ - "base " \ - "addre" \ - "ss " \ - "of " \ - "any " \ - "alloc" \ - "ate " \ - "memor" \ - "y " \ - "objec" \ - "t") \ - release( \ - uint, ROC_HMM_FLAGS, \ - 0, "ROCm HMM configuration flags") release(cstring, GPU_DEVICE_ORDINAL, "", \ - "Select the device ordinal (comma " \ - "seperated list of available " \ - "devices)") release(bool, \ - REMOTE_ALLOC, \ - false, \ - "Use remote " \ - "memory for the " \ - "global heap " \ - "allocation") \ - release(uint, GPU_CP_DMA_COPY_SIZE, 1, \ - "Set maximum size of CP DMA copy in KiB") release(uint, \ - GPU_MAX_HEAP_SIZE, \ - 100, \ - "Set maximum size of " \ - "the GPU heap to % " \ - "of board memory") \ - release( \ - uint, GPU_STAGING_BUFFER_SIZE, 4, \ - "Size of the GPU staging buffer in MiB") release(bool, \ - GPU_DUMP_BLIT_KERNELS, \ - false, \ - "Dump the kernels for " \ - "blit manager") \ - release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \ - "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \ - release(bool, GPU_FLUSH_ON_EXECUTION, false, \ - "Submit commands to HW on every operation. 0 - Disable, 1 " \ - "- Enable") release(bool, CL_KHR_FP64, true, \ - "Enable/Disable support for double " \ - "precision") release(cstring, \ - AMD_OCL_BUILD_OPTIONS, \ - 0, \ - "Set " \ - "clBuildProgram() " \ - "and " \ - "clCompileProgram(" \ - ")'s options " \ - "(override)") \ - release(cstring, AMD_OCL_BUILD_OPTIONS_APPEND, 0, \ - "Append clBuildProgram() and clCompileProgram()'s " \ - "options") release(cstring, AMD_OCL_LINK_OPTIONS, 0, \ - "Set clLinkProgram()'s options " \ - "(override)") \ - release( \ - cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \ - "Append clLinkProgram()'s options") debug(cstring, \ - AMD_OCL_SUBST_OBJFILE, \ - 0, \ - "Specify " \ - "binary " \ - "substitution" \ - " config " \ - "file for " \ - "OpenCL") \ - release( \ - size_t, GPU_PINNED_XFER_SIZE, 32, \ - "The pinned buffer size for pinning in read/write " \ - "transfers in MiB") release(size_t, \ - GPU_PINNED_MIN_XFER_SIZE, \ - 128, \ - "The minimal buffer " \ - "size for pinned " \ - "read/write transfers " \ - "in MiB") release(size_t, \ - GPU_RESOURCE_CACHE_SIZE, \ - 64, \ - "The " \ - "reso" \ - "urce" \ - " cac" \ - "he " \ - "size" \ - " in " \ - "MB") \ - release( \ - size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \ - "The maximum size accepted for suballocations " \ - "in KB") release(size_t, GPU_NUM_MEM_DEPENDENCY, \ - 256, \ - "Number of memory objects for " \ - "dependency tracking") \ - release( \ - size_t, GPU_XFER_BUFFER_SIZE, 0, \ - "Transfer buffer size for image copy " \ - "optimization in KB") release(bool, \ - GPU_IMAGE_DMA, \ - true, \ - "Enable DRM " \ - "DMA for " \ - "image " \ - "transfers") \ - release( \ - uint, GPU_SINGLE_ALLOC_PERCENT, 100, \ - "Maximum size of a single allocation " \ - "as percentage of total") release(uint, \ - GPU_NUM_COMPUTE_RINGS, \ - 2, \ - "GPU " \ - "numb" \ - "er " \ - "of " \ - "comp" \ - "ute " \ - "ring" \ - "s. " \ - "0 - " \ - "disa" \ - "bled" \ - ", 1 " \ - ", " \ - "2,.." \ - " - " \ - "the " \ - "numb" \ - "er " \ - "of " \ - "comp" \ - "ute " \ - "ring" \ - "s") \ - release( \ - bool, AMD_OCL_WAIT_COMMAND, false, \ - "1 = Enable a wait for every " \ - "submitted command") release(uint, \ - GPU_PRINT_CHILD_KERNEL, \ - 0, \ - "Print" \ - "s " \ - "the " \ - "speci" \ - "fied " \ - "numbe" \ - "r of " \ - "the " \ - "child" \ - " kern" \ - "els") \ - release(bool, GPU_USE_DEVICE_QUEUE, \ - false, \ - "Use a dedicated device " \ - "queue for the actual " \ - "submissions") release(bool, \ - AMD_THREAD_TRACE_ENABLE, \ - true, \ - "Ena" \ - "ble" \ - " th" \ - "rea" \ - "d " \ - "tra" \ - "ce " \ - "ext" \ - "ens" \ - "io" \ - "n") \ - release( \ - uint, OPENCL_VERSION, 200, \ - "Force GPU opencl version") release(bool, \ - HSA_LOCAL_MEMORY_ENABLE, \ - true, \ - "Enable HSA device local memory usage") \ - release( \ - uint, \ - HSA_KERNARG_POOL_SIZE, \ - 1024 * 1024, \ - "Kernarg pool size") release(bool, \ - GPU_MIPMAP, \ - true, \ - "Enables GPU mipmap extension") \ - release( \ - uint, \ - GPU_ENABLE_PAL, \ - 2, \ - "Enables PAL " \ - "backend. 0 - ROC, " \ - "1 - PAL, 2 - ROC " \ - "or PAL") release(bool, DISABLE_DEFERRED_ALLOC, \ - false, \ - "Disables deferred memory allocation on device") \ - release( \ - int, \ - AMD_GPU_FORCE_SINGLE_FP_DENORM, \ - -1, \ - "Force denorm " \ - "for single " \ - "precision: -1 " \ - "- don't " \ - "force, 0 - " \ - "disable, 1 - " \ - "enable") \ - release( \ - uint, \ - OCL_SET_SVM_SIZE, \ - 4 * 16384, \ - "set SVM " \ - "space " \ - "size for " \ - "discrete " \ - "GPU") release(uint, \ - GPU_WAVES_PER_SIMD, \ - 0, \ - "Force the number of waves per SIMD (1-10)") \ - release( \ - bool, \ - OCL_STUB_PROGRAMS, \ - false, \ - "1 = " \ - "Enable" \ - "s OCL " \ - "progra" \ - "ms " \ - "stubin" \ - "g") \ - release( \ - bool, \ - GPU_ANALYZE_HANG, \ - false, \ - "1 " \ - "= " \ - "En" \ - "ab" \ - "le" \ - "s " \ - "GP" \ - "U " \ - "ha" \ - "ng" \ - " a" \ - "na" \ - "ly" \ - "si" \ - "s") \ - release( \ - uint, \ - GPU_MAX_REMOTE_MEM_SIZE, \ - 2, \ - "Maximum size (in Ki) that allows device memory substitution with system") \ - release(bool, \ - GPU_ADD_HBCC_SIZE, \ - false, \ - "Add HBCC size to the reported device memory") release(bool, \ - PAL_DISABLE_SDMA, \ - false, \ - "1 = Disable SDMA for PAL") release(uint, \ - PAL_RGP_DISP_COUNT, \ - 10000, \ - "The number of dispatches for RGP capture with SQTT") release(uint, \ - PAL_MALL_POLICY, \ - 0, \ - "Controls the behaviour of allocations with respect to the MALL" \ - "0 = MALL policy is decided by KMD" \ - "1 = Allocations are never put through the MALL" \ - "2 = Allocations will always be put through the MALL") release(bool, \ - GPU_ENABLE_WAVE32_MODE, \ - true, \ - "Enables Wave32 compilation in HW if available") release(bool, \ - GPU_ENABLE_LC, \ - true, \ - "Enables LC path") release(bool, GPU_ENABLE_HW_P2P, \ - false, \ - "Enables HW P2P path") release(bool, \ - GPU_ENABLE_COOP_GROUPS, \ - true, \ - "Enables cooperative group launch") release(uint, \ - GPU_MAX_COMMAND_BUFFERS, \ - 8, \ - "The maximum number of command buffers allocated per queue") release(uint, \ - GPU_MAX_HW_QUEUES, \ - 4, \ - "The maximum number of HW queues allocated per device") release(bool, GPU_IMAGE_BUFFER_WAR, true, \ - "Enables image buffer workaround") release(cstring, \ - HIP_VISIBLE_DEVICES, \ - "", \ - "Only devices whose index is present in the sequence are visible to HIP") release(cstring, \ - CUDA_VISIBLE_DEVICES, \ - "", \ - "Only devices whose index is present in the sequence are visible to CUDA") \ - release(bool, \ - GPU_ENABLE_WGP_MODE, \ - true, \ - "Enables WGP Mode in HW if available") \ - release( \ - bool, \ - GPU_DUMP_CODE_OBJECT, \ - false, \ - "Enable dump code object") release(uint, \ - GPU_MAX_USWC_ALLOC_SIZE, 2048, \ - "Set a limit in Mb on the maximum USWC allocation size" \ - "-1 = No limit") \ - release( \ - uint, \ - AMD_SERIALIZE_KERNEL, \ - 0, \ - "Serialize kernel enqueue, 0x1 = Wait for completion before enqueue" \ - "0x2 = Wait for completion after enqueue 0x3 = both") release(uint, \ - AMD_SERIALIZE_COPY, \ - 0, \ - "Serialize copies, 0x1 = Wait for completion before enqueue" \ - "0x2 = Wait for completion after enqueue 0x3 = both") release(uint, \ - HIP_LAUNCH_BLOCKING, \ - 0, \ - "Serialize kernel enqueue 0x1 = Wait for completion after enqueue," \ - "same as AMD_SERIALIZE_KERNEL=2") release(bool, \ - PAL_ALWAYS_RESIDENT, \ - false, \ - "Force memory resources to become resident at allocation time") release(uint, \ - HIP_HOST_COHERENT, \ - 0, \ - "Coherent memory in hipHostMalloc, 0x1 = memory is coherent with host" \ - "0x0 = memory is not coherent between host and GPU") release(uint, AMD_OPT_FLUSH, 1, \ - "Kernel flush option , 0x0 = Use system-scope fence operations." \ - "0x1 = Use device-scope fence operations when possible.") \ - release( \ - bool, \ - AMD_DIRECT_DISPATCH, \ - false, \ - "Enable direct kernel dispatch.") release(uint, \ - HIP_HIDDEN_FREE_MEM, \ - 0, \ - "Reserve free mem reporting in Mb" \ - "0 = Disable") release(size_t, \ - GPU_FORCE_BLIT_COPY_SIZE, \ - 16, \ - "Use Blit until this size(in KB) for copies") release(uint, \ - ROC_ACTIVE_WAIT_TIMEOUT, \ - 0, \ - "Forces active wait of GPU interrup for the timeout(us)") release(bool, \ - ROC_ENABLE_LARGE_BAR, \ - true, \ - "Enable Large Bar if supported by the device") release(bool, \ - ROC_CPU_WAIT_FOR_SIGNAL, \ - true, \ - "Enable CPU wait for dependent HSA signals.") release(bool, \ - ROC_SYSTEM_SCOPE_SIGNAL, \ - true, \ - "Enable system scope for signals (uses interrupts).") release(bool, \ - GPU_FORCE_QUEUE_PROFILING, \ - false, \ - "Force command queue profiling by default") \ - release( \ - bool, \ - HIP_MEM_POOL_SUPPORT, \ - true, \ - "Enables memory pool support in HIP") release(bool, \ - HIP_MEM_POOL_USE_VM, \ - true, \ - "Enables memory pool support in HIP") release(bool, \ - DEBUG_HIP_MEM_POOL_VMHEAP, \ - true, \ - "Enables virtual memory for memory pools") release(bool, \ - PAL_HIP_IPC_FLAG, true, \ - "Enable interprocess flag for device allocation in PAL HIP") \ - release( \ - uint, \ - PAL_FORCE_ASIC_REVISION, \ - 0, \ - "Force a specific asic revision for all devices") \ - release( \ - bool, \ - PAL_EMBED_KERNEL_MD, \ - false, \ - "Enables writing kernel metadata into command buffers.") release(cstring, \ - ROC_GLOBAL_CU_MASK, \ - "", \ - "Sets a global CU mask (entered as hex value) for all queues," \ - "Each active bit represents using one CU (e.g., 0xf enables only 4 CUs)") release(size_t, PAL_PREPINNED_MEMORY_SIZE, 64, \ - "Size in KBytes of prepinned memory") release(bool, \ - AMD_CPU_AFFINITY, \ - false, \ - "Reset CPU affinity of any runtime threads") release(bool, \ - ROC_USE_FGS_KERNARG, \ - true, \ - "Use fine grain kernel args segment for supported asics") release(uint, \ - ROC_P2P_SDMA_SIZE, \ - 1024, \ - "The minimum size in KB for P2P transfer with SDMA") release(uint, \ - ROC_AQL_QUEUE_SIZE, \ - 16384, \ - "AQL queue size in AQL packets") \ - release( \ - uint, \ - ROC_SIGNAL_POOL_SIZE, \ - 64, \ - "Initial size of HSA signal pool") \ - release(uint, \ - DEBUG_CLR_LIMIT_BLIT_WG, \ - 16, \ - "Limit the number of workgroups in blit operations") release(bool, \ - DEBUG_CLR_BLIT_KERNARG_OPT, \ - false, \ - "Enable blit kernel arguments optimization") release(bool, \ - ROC_SKIP_KERNEL_ARG_COPY, \ - false, \ - "If true, then runtime can skip kernel arg copy") release(bool, \ - GPU_STREAMOPS_CP_WAIT, \ - false, \ - "Force the stream wait memory operation to wait on CP.") release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, \ - false, \ - "Set this to true to force runtime unbundler in hiprtc.") release(size_t, \ - HIP_INITIAL_DM_SIZE, \ - 8 * Mi, \ - "Set initial heap size for device malloc.") \ - release( \ - bool, \ - HIP_FORCE_DEV_KERNARG, \ - true, \ - "Force device mem for kernel args.") release(bool, \ - DEBUG_CLR_GRAPH_PACKET_CAPTURE, \ - true, \ - "Enable/Disable graph packet capturing") release(bool, \ - GPU_DEBUG_ENABLE, false, \ - "Enables collection of extra info for debugger at some perf cost") \ - release( \ - cstring, \ - HIPRTC_COMPILE_OPTIONS_APPEND, \ - "", \ - "Set compile options needed for hiprtc compilation") \ - release( \ - cstring, \ - HIPRTC_LINK_OPTIONS_APPEND, \ - "", \ - "Set link options needed for hiprtc compilation") \ - release( \ - bool, \ - HIP_VMEM_MANAGE_SUPPORT, \ - true, \ - "Virtual Memory Management Support") \ - release( \ - bool, \ - DEBUG_HIP_GRAPH_DOT_PRINT, \ - false, \ - "Enable/Disable graph debug dot print dump") release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false, \ - "Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \ - release( \ - uint, \ - DEBUG_HIP_FORCE_GRAPH_QUEUES, \ - 4, \ - "Forces the number of streams for the graph parallel execution") \ - release( \ - uint, \ - DEBUG_HIP_BLOCK_SYNC, \ - 50, \ - "Blocks synchronization on CPU until the callback processing is done") \ - release(uint, \ - DEBUG_CLR_MAX_BATCH_SIZE, \ - 1000, \ - "Forces the callback to clean-up CPU submission queue") release(bool, DEBUG_CLR_SYSMEM_POOL, false, \ - "Use sysmem pool implementation in runtime for amd commands") \ - release(bool, \ - DEBUG_HIP_KERNARG_COPY_OPT, \ - true, \ - "Enable/Disable multiple kern arg copies") release(bool, \ - DEBUG_CLR_KERNARG_HDP_FLUSH_WA, \ - false, \ - "Toggle kernel arg copy workaround") release(bool, \ - DEBUG_HIP_DYNAMIC_QUEUES, \ - false, \ - "Forces dynamic queue management") \ - release( \ - uint, \ - HIP_SKIP_ABORT_ON_GPU_ERROR, \ - true, \ - "Set this to true, to avoid host side abort for GPU errors") \ - release( \ - bool, \ - HIP_FORCE_SPIRV_CODEOBJECT, \ - false, \ - "Force use of SPIRV instead of device specific code object.") \ - release( \ - uint, \ - DEBUG_CLR_BATCH_CPU_SYNC_SIZE, \ - 8, \ - "Forces the minimum batch size for CPU sync") +// clang-format off +#define RUNTIME_FLAGS(debug,release,release_on_stg) \ + \ +release(int, AMD_LOG_LEVEL, 0, \ + "The default log level") \ +release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \ + "The mask to enable specific kinds of logs") \ +release(cstring, AMD_LOG_LEVEL_FILE, "", \ + "Set output file for AMD_LOG_LEVEL, Default is stderr") \ +release(size_t, AMD_LOG_LEVEL_SIZE, 2048, \ + "The max size of AMD_LOG generated in MB if printed to a file") \ +debug(uint, DEBUG_GPU_FLAGS, 0, \ + "The debug options for GPU device") \ +release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */ \ + "The default command queue thread stack size") \ +release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ + "Maximum number of workitems in a workgroup for GPU, 0 -use default") \ +debug(bool, CPU_MEMORY_GUARD_PAGES, false, \ + "Use guard pages for CPU memory") \ +debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \ + "Size in KB of CPU memory guard page") \ +debug(size_t, CPU_MEMORY_ALIGNMENT_SIZE, 256, \ + "Size in bytes for the default alignment for guarded memory on CPU") \ +debug(size_t, PARAMETERS_MIN_ALIGNMENT, NATIVE_ALIGNMENT_SIZE, \ + "Minimum alignment required for the abstract parameters stack") \ +debug(size_t, MEMOBJ_BASE_ADDR_ALIGN, 4*Ki, \ + "Alignment of the base address of any allocate memory object") \ +release(uint, ROC_HMM_FLAGS, 0, \ + "ROCm HMM configuration flags") \ +release(cstring, GPU_DEVICE_ORDINAL, "", \ + "Select the device ordinal (comma seperated list of available devices)") \ +release(bool, REMOTE_ALLOC, false, \ + "Use remote memory for the global heap allocation") \ +release(uint, GPU_CP_DMA_COPY_SIZE, 1, \ + "Set maximum size of CP DMA copy in KiB") \ +release(uint, GPU_MAX_HEAP_SIZE, 100, \ + "Set maximum size of the GPU heap to % of board memory") \ +release(uint, GPU_STAGING_BUFFER_SIZE, 4, \ + "Size of the GPU staging buffer in MiB") \ +release(bool, GPU_DUMP_BLIT_KERNELS, false, \ + "Dump the kernels for blit manager") \ +release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \ + "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \ +release(bool, GPU_FLUSH_ON_EXECUTION, false, \ + "Submit commands to HW on every operation. 0 - Disable, 1 - Enable") \ +release(bool, CL_KHR_FP64, true, \ + "Enable/Disable support for double precision") \ +release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \ + "Set clBuildProgram() and clCompileProgram()'s options (override)") \ +release(cstring, AMD_OCL_BUILD_OPTIONS_APPEND, 0, \ + "Append clBuildProgram() and clCompileProgram()'s options") \ +release(cstring, AMD_OCL_LINK_OPTIONS, 0, \ + "Set clLinkProgram()'s options (override)") \ +release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \ + "Append clLinkProgram()'s options") \ +debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \ + "Specify binary substitution config file for OpenCL") \ +release(size_t, GPU_PINNED_XFER_SIZE, 32, \ + "The pinned buffer size for pinning in read/write transfers in MiB") \ +release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \ + "The minimal buffer size for pinned read/write transfers in MiB") \ +release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \ + "The resource cache size in MB") \ +release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \ + "The maximum size accepted for suballocations in KB") \ +release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \ + "Number of memory objects for dependency tracking") \ +release(size_t, GPU_XFER_BUFFER_SIZE, 0, \ + "Transfer buffer size for image copy optimization in KB") \ +release(bool, GPU_IMAGE_DMA, true, \ + "Enable DRM DMA for image transfers") \ +release(uint, GPU_SINGLE_ALLOC_PERCENT, 100, \ + "Maximum size of a single allocation as percentage of total") \ +release(uint, GPU_NUM_COMPUTE_RINGS, 2, \ + "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \ +release(bool, AMD_OCL_WAIT_COMMAND, false, \ + "1 = Enable a wait for every submitted command") \ +release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ + "Prints the specified number of the child kernels") \ +release(bool, GPU_USE_DEVICE_QUEUE, false, \ + "Use a dedicated device queue for the actual submissions") \ +release(bool, AMD_THREAD_TRACE_ENABLE, true, \ + "Enable thread trace extension") \ +release(uint, OPENCL_VERSION, 200, \ + "Force GPU opencl version") \ +release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ + "Enable HSA device local memory usage") \ +release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \ + "Kernarg pool size") \ +release(bool, GPU_MIPMAP, true, \ + "Enables GPU mipmap extension") \ +release(uint, GPU_ENABLE_PAL, 2, \ + "Enables PAL backend. 0 - ROC, 1 - PAL, 2 - ROC or PAL") \ +release(bool, DISABLE_DEFERRED_ALLOC, false, \ + "Disables deferred memory allocation on device") \ +release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \ + "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \ +release(uint, OCL_SET_SVM_SIZE, 4*16384, \ + "set SVM space size for discrete GPU") \ +release(uint, GPU_WAVES_PER_SIMD, 0, \ + "Force the number of waves per SIMD (1-10)") \ +release(bool, OCL_STUB_PROGRAMS, false, \ + "1 = Enables OCL programs stubing") \ +release(bool, GPU_ANALYZE_HANG, false, \ + "1 = Enables GPU hang analysis") \ +release(uint, GPU_MAX_REMOTE_MEM_SIZE, 2, \ + "Maximum size (in Ki) that allows device memory substitution with system") \ +release(bool, GPU_ADD_HBCC_SIZE, false, \ + "Add HBCC size to the reported device memory") \ +release(bool, PAL_DISABLE_SDMA, false, \ + "1 = Disable SDMA for PAL") \ +release(uint, PAL_RGP_DISP_COUNT, 10000, \ + "The number of dispatches for RGP capture with SQTT") \ +release(uint, PAL_MALL_POLICY, 0, \ + "Controls the behaviour of allocations with respect to the MALL" \ + "0 = MALL policy is decided by KMD" \ + "1 = Allocations are never put through the MALL" \ + "2 = Allocations will always be put through the MALL") \ +release(bool, GPU_ENABLE_WAVE32_MODE, true, \ + "Enables Wave32 compilation in HW if available") \ +release(bool, GPU_ENABLE_LC, true, \ + "Enables LC path") \ +release(bool, GPU_ENABLE_HW_P2P, false, \ + "Enables HW P2P path") \ +release(bool, GPU_ENABLE_COOP_GROUPS, true, \ + "Enables cooperative group launch") \ +release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \ + "The maximum number of command buffers allocated per queue") \ +release(uint, GPU_MAX_HW_QUEUES, 4, \ + "The maximum number of HW queues allocated per device") \ +release(bool, GPU_IMAGE_BUFFER_WAR, true, \ + "Enables image buffer workaround") \ +release(cstring, HIP_VISIBLE_DEVICES, "", \ + "Only devices whose index is present in the sequence are visible to HIP") \ +release(cstring, CUDA_VISIBLE_DEVICES, "", \ + "Only devices whose index is present in the sequence are visible to CUDA") \ +release(bool, GPU_ENABLE_WGP_MODE, true, \ + "Enables WGP Mode in HW if available") \ +release(bool, GPU_DUMP_CODE_OBJECT, false, \ + "Enable dump code object") \ +release(uint, GPU_MAX_USWC_ALLOC_SIZE, 2048, \ + "Set a limit in Mb on the maximum USWC allocation size" \ + "-1 = No limit") \ +release(uint, AMD_SERIALIZE_KERNEL, 0, \ + "Serialize kernel enqueue, 0x1 = Wait for completion before enqueue" \ + "0x2 = Wait for completion after enqueue 0x3 = both") \ +release(uint, AMD_SERIALIZE_COPY, 0, \ + "Serialize copies, 0x1 = Wait for completion before enqueue" \ + "0x2 = Wait for completion after enqueue 0x3 = both") \ +release(uint, HIP_LAUNCH_BLOCKING, 0, \ + "Serialize kernel enqueue 0x1 = Wait for completion after enqueue," \ + "same as AMD_SERIALIZE_KERNEL=2") \ +release(bool, PAL_ALWAYS_RESIDENT, false, \ + "Force memory resources to become resident at allocation time") \ +release(uint, HIP_HOST_COHERENT, 0, \ + "Coherent memory in hipHostMalloc, 0x1 = memory is coherent with host"\ + "0x0 = memory is not coherent between host and GPU") \ +release(uint, AMD_OPT_FLUSH, 1, \ + "Kernel flush option , 0x0 = Use system-scope fence operations." \ + "0x1 = Use device-scope fence operations when possible.") \ +release(bool, AMD_DIRECT_DISPATCH, false, \ + "Enable direct kernel dispatch.") \ +release(uint, HIP_HIDDEN_FREE_MEM, 0, \ + "Reserve free mem reporting in Mb" \ + "0 = Disable") \ +release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 16, \ + "Use Blit until this size(in KB) for copies") \ +release(uint, ROC_ACTIVE_WAIT_TIMEOUT, 0, \ + "Forces active wait of GPU interrup for the timeout(us)") \ +release(bool, ROC_ENABLE_LARGE_BAR, true, \ + "Enable Large Bar if supported by the device") \ +release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true, \ + "Enable CPU wait for dependent HSA signals.") \ +release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \ + "Enable system scope for signals (uses interrupts).") \ +release(bool, GPU_FORCE_QUEUE_PROFILING, false, \ + "Force command queue profiling by default") \ +release(bool, HIP_MEM_POOL_SUPPORT, true, \ + "Enables memory pool support in HIP") \ +release(bool, HIP_MEM_POOL_USE_VM, true, \ + "Enables memory pool support in HIP") \ +release(bool, DEBUG_HIP_MEM_POOL_VMHEAP, true, \ + "Enables virtual memory for memory pools") \ +release(bool, PAL_HIP_IPC_FLAG, true, \ + "Enable interprocess flag for device allocation in PAL HIP") \ +release(uint, PAL_FORCE_ASIC_REVISION, 0, \ + "Force a specific asic revision for all devices") \ +release(bool, PAL_EMBED_KERNEL_MD, false, \ + "Enables writing kernel metadata into command buffers.") \ +release(cstring, ROC_GLOBAL_CU_MASK, "", \ + "Sets a global CU mask (entered as hex value) for all queues," \ + "Each active bit represents using one CU (e.g., 0xf enables only 4 CUs)") \ +release(size_t, PAL_PREPINNED_MEMORY_SIZE, 64, \ + "Size in KBytes of prepinned memory") \ +release(bool, AMD_CPU_AFFINITY, false, \ + "Reset CPU affinity of any runtime threads") \ +release(bool, ROC_USE_FGS_KERNARG, true, \ + "Use fine grain kernel args segment for supported asics") \ +release(uint, ROC_P2P_SDMA_SIZE, 1024, \ + "The minimum size in KB for P2P transfer with SDMA") \ +release(uint, ROC_AQL_QUEUE_SIZE, 16384, \ + "AQL queue size in AQL packets") \ +release(uint, ROC_SIGNAL_POOL_SIZE, 64, \ + "Initial size of HSA signal pool") \ +release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \ + "Limit the number of workgroups in blit operations") \ +release(bool, DEBUG_CLR_BLIT_KERNARG_OPT, false, \ + "Enable blit kernel arguments optimization") \ +release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \ + "If true, then runtime can skip kernel arg copy") \ +release(bool, GPU_STREAMOPS_CP_WAIT, false, \ + "Force the stream wait memory operation to wait on CP.") \ +release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false, \ + "Set this to true to force runtime unbundler in hiprtc.") \ +release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \ + "Set initial heap size for device malloc.") \ +release(bool, HIP_FORCE_DEV_KERNARG, true, \ + "Force device mem for kernel args.") \ +release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true, \ + "Enable/Disable graph packet capturing") \ +release(bool, GPU_DEBUG_ENABLE, false, \ + "Enables collection of extra info for debugger at some perf cost") \ +release(cstring, HIPRTC_COMPILE_OPTIONS_APPEND, "", \ + "Set compile options needed for hiprtc compilation") \ +release(cstring, HIPRTC_LINK_OPTIONS_APPEND, "", \ + "Set link options needed for hiprtc compilation") \ +release(bool, HIP_VMEM_MANAGE_SUPPORT, true, \ + "Virtual Memory Management Support") \ +release(bool, DEBUG_HIP_GRAPH_DOT_PRINT, false, \ + "Enable/Disable graph debug dot print dump") \ +release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false, \ + "Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \ +release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \ + "Forces the number of streams for the graph parallel execution") \ +release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \ + "Blocks synchronization on CPU until the callback processing is done")\ +release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000, \ + "Forces the callback to clean-up CPU submission queue") \ +release(bool, DEBUG_CLR_SYSMEM_POOL, false, \ + "Use sysmem pool implementation in runtime for amd commands") \ +release(bool, DEBUG_HIP_KERNARG_COPY_OPT, true, \ + "Enable/Disable multiple kern arg copies") \ +release(bool, DEBUG_CLR_KERNARG_HDP_FLUSH_WA, false, \ + "Toggle kernel arg copy workaround") \ +release(bool, DEBUG_HIP_DYNAMIC_QUEUES, false, \ + "Forces dynamic queue management") \ +release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, true, \ + "Set this to true, to avoid host side abort for GPU errors") \ +release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false, \ + "Force use of SPIRV instead of device specific code object.") \ +release(uint, DEBUG_CLR_BATCH_CPU_SYNC_SIZE, 8, \ + "Forces the minimum batch size for CPU sync") // clang-format on namespace amd { diff --git a/projects/clr/rocclr/utils/util.hpp b/projects/clr/rocclr/utils/util.hpp index fd537d8998..b74b148a92 100644 --- a/projects/clr/rocclr/utils/util.hpp +++ b/projects/clr/rocclr/utils/util.hpp @@ -256,7 +256,7 @@ inline float half2float(const uint16_t Val) { uint32_t signBit = ((uint32_t)(Val & 0x8000)) << signBitShift; uint32_t exponent = (Val & halfExpoentMask) >> 10; uint32_t fraction = ((uint32_t)(Val & halfFractionMask)) - << 13; // Aligning half fraction to float + << 13; // Aligning half fraction to float union { uint32_t u32Arg; float fArg; @@ -283,7 +283,7 @@ inline float half2float(const uint16_t Val) { } } uint32_t floatExponent = ((exponent + floatExponentBias - halfExponentBias) & 0xff) - << floatExponentShift; + << floatExponentShift; u32Arg = signBit | floatExponent | fraction; return fArg; } diff --git a/projects/hip-tests/.clang-format b/projects/hip-tests/.clang-format index 5572a72cdd..1569aac12f 100644 --- a/projects/hip-tests/.clang-format +++ b/projects/hip-tests/.clang-format @@ -1,10 +1,10 @@ Language: Cpp BasedOnStyle: Google AlignEscapedNewlinesLeft: false -AlignOperands: false +AlignOperands: Align ColumnLimit: 100 -AlwaysBreakTemplateDeclarations: false +BreakTemplateDeclarations: No DerivePointerAlignment: false IndentFunctionDeclarationAfterType: false MaxEmptyLinesToKeep: 2 -SortIncludes: false +SortIncludes: Never diff --git a/projects/hip-tests/catch/external/Catch2/catch.hpp b/projects/hip-tests/catch/external/Catch2/catch.hpp index fdfb6613d5..ab39621b9a 100644 --- a/projects/hip-tests/catch/external/Catch2/catch.hpp +++ b/projects/hip-tests/catch/external/Catch2/catch.hpp @@ -734,13 +734,11 @@ constexpr auto operator"" _catch_sr(char const* rawChars, std::size_t size) noex f(x) CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST1))(f, peek, __VA_ARGS__) #define CATCH_REC_LIST0_UD(f, userdata, x, peek, ...) \ - , \ - f(userdata, x) \ - CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD))(f, userdata, peek, __VA_ARGS__) + , f(userdata, x) \ + CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD))(f, userdata, peek, __VA_ARGS__) #define CATCH_REC_LIST1_UD(f, userdata, x, peek, ...) \ - , \ - f(userdata, x) \ - CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST0_UD))(f, userdata, peek, __VA_ARGS__) + , f(userdata, x) \ + CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST0_UD))(f, userdata, peek, __VA_ARGS__) #define CATCH_REC_LIST2_UD(f, userdata, x, peek, ...) \ f(userdata, x) \ CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD))(f, userdata, peek, __VA_ARGS__) @@ -827,8 +825,8 @@ constexpr auto operator"" _catch_sr(char const* rawChars, std::size_t size) noex return {}; \ } \ template