Adjust clang format to the new versions, revert broken macro layout (#714)

このコミットが含まれているのは:
Danylo Lytovchenko
2025-08-22 17:23:22 +02:00
committed by GitHub
コミット 2ff2316227
189個のファイルの変更、1906行の追加、2418行の削除
+3 -3
ファイルの表示
@@ -1,10 +1,10 @@
Language: Cpp
BasedOnStyle: Google
AlignEscapedNewlinesLeft: false
AlignOperands: false
AlignOperands: Align
ColumnLimit: 100
AlwaysBreakTemplateDeclarations: false
BreakTemplateDeclarations: No
DerivePointerAlignment: false
IndentFunctionDeclarationAfterType: false
MaxEmptyLinesToKeep: 2
SortIncludes: false
SortIncludes: Never
+4 -5
ファイルの表示
@@ -1915,13 +1915,12 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16* address,
static_assert(sizeof(unsigned short int) == sizeof(__hip_bfloat16_raw));
unsigned short int* address_as_short = reinterpret_cast<unsigned short int*>(address);
// Align to 4 bytes
unsigned int* aligned_addr =
__builtin_bit_cast(unsigned int*,
__builtin_bit_cast(unsigned long long int, address_as_short) &
(unsigned long long int)(~0x3));
unsigned int* aligned_addr = __builtin_bit_cast(
unsigned int*, __builtin_bit_cast(unsigned long long int, address_as_short) &
(unsigned long long int)(~0x3));
bool is_lower = __builtin_bit_cast(unsigned long long int, aligned_addr) ==
__builtin_bit_cast(unsigned long long int, address);
__builtin_bit_cast(unsigned long long int, address);
__hip_bfloat162 fval;
if (is_lower)
+22 -26
ファイルの表示
@@ -375,8 +375,7 @@ class coalesced_group : public thread_group {
friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
unsigned int tile_size);
friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
template <unsigned int fsize, class fparent>
friend __CG_QUALIFIER__ coalesced_group
template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
__CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
@@ -393,8 +392,8 @@ class coalesced_group : public thread_group {
unsigned int masklength =
min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
lane_mask full_mask = (static_cast<int>(warpSize) == 32)
? static_cast<lane_mask>((1u << 32) - 1)
: static_cast<lane_mask>(-1ull);
? static_cast<lane_mask>((1u << 32) - 1)
: static_cast<lane_mask>(-1ull);
lane_mask member_mask = full_mask >> (warpSize - masklength);
member_mask <<= (__lane_id() & ~(tile_size - 1));
@@ -485,9 +484,9 @@ class coalesced_group : public thread_group {
srcRank = srcRank % static_cast<int>(num_threads());
int lane = (num_threads() == warpSize) ? srcRank
: (static_cast<int>(warpSize) == 64)
? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
: (static_cast<int>(warpSize) == 64)
? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
return __shfl(var, lane, warpSize);
}
@@ -835,8 +834,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
"Tile size is either not a power of 2 or greater than the wavefront size");
using tile_base<size>::numThreads;
template <unsigned int fsize, class fparent>
friend __CG_QUALIFIER__ coalesced_group
template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
@@ -910,10 +908,10 @@ template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
* \note This type is implemented on Linux, under development
* on Microsoft Windows.
*/
template <unsigned int tileSize, class ParentCGTy>
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
public tiled_group,
public parent_group_info<tileSize, ParentCGTy> {
template <unsigned int tileSize, class ParentCGTy> class thread_block_tile_type
: public thread_block_tile_base<tileSize>,
public tiled_group,
public parent_group_info<tileSize, ParentCGTy> {
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
typedef thread_block_tile_base<numThreads> tbtBase;
@@ -931,9 +929,8 @@ class thread_block_tile_type : public thread_block_tile_base<tileSize>,
};
// Partial template specialization
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
public tiled_group {
template <unsigned int tileSize> class thread_block_tile_type<tileSize, void>
: public thread_block_tile_base<tileSize>, public tiled_group {
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
typedef thread_block_tile_base<numThreads> tbtBase;
@@ -1013,11 +1010,10 @@ __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal
: public thread_block_tile_type<size, ParentCGTy> {
protected:
template <unsigned int tbtSize, class tbtParentT>
__CG_QUALIFIER__ thread_block_tile_internal(
template <unsigned int tbtSize, class tbtParentT> __CG_QUALIFIER__ thread_block_tile_internal(
const thread_block_tile_internal<tbtSize, tbtParentT>& g)
: thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}
@@ -1034,8 +1030,8 @@ class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGT
* \note This type is implemented on Linux, under development
* on Microsoft Windows.
*/
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
template <unsigned int size, class ParentCGTy> class thread_block_tile
: public impl::thread_block_tile_internal<size, ParentCGTy> {
protected:
__CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
: impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
@@ -1171,8 +1167,8 @@ class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCG
#endif
};
template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
template <unsigned int size> class thread_block_tile<size, void>
: public impl::thread_block_tile_internal<size, void> {
template <unsigned int, class ParentCGTy> friend class thread_block_tile;
protected:
@@ -1187,8 +1183,8 @@ template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
template <unsigned int size> struct tiled_partition_internal<size, thread_block>
: public thread_block_tile<size, thread_block> {
__CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
: thread_block_tile<size, thread_block>(g) {}
};
+8 -9
ファイルの表示
@@ -82,8 +82,8 @@ namespace __hip_internal {
template <> struct is_floating_point<_Float16> : __hip_internal::true_type {};
} // namespace __hip_internal
template <bool cond, typename T = void>
using Enable_if_t = typename __hip_internal::enable_if<cond, T>::type;
template <bool cond, typename T = void> using Enable_if_t =
typename __hip_internal::enable_if<cond, T>::type;
// BEGIN STRUCT __HALF
struct __half {
@@ -649,7 +649,7 @@ inline __HOST_DEVICE__ bool __hgt(__half x, __half y) {
}
inline __HOST_DEVICE__ bool __hequ(__half x, __half y) {
return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data) &&
!(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
!(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
}
inline __HOST_DEVICE__ bool __hneu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data);
@@ -693,7 +693,7 @@ inline __HOST_DEVICE__ __half2 __hgt2(__half2 x, __half2 y) {
}
inline __HOST_DEVICE__ __half2 __hequ2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data) &&
!(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
!(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline __HOST_DEVICE__ __half2 __hneu2(__half2 x, __half2 y) {
@@ -911,13 +911,12 @@ inline __device__ __half unsafeAtomicAdd(__half* address, __half value) {
static_assert(sizeof(unsigned short int) == sizeof(__half_raw));
unsigned short int* address_as_short = reinterpret_cast<unsigned short int*>(address);
// Align to 4 bytes
unsigned int* aligned_addr =
__builtin_bit_cast(unsigned int*,
__builtin_bit_cast(unsigned long long int, address_as_short) &
(unsigned long long int)(~0x3));
unsigned int* aligned_addr = __builtin_bit_cast(
unsigned int*, __builtin_bit_cast(unsigned long long int, address_as_short) &
(unsigned long long int)(~0x3));
bool is_lower = __builtin_bit_cast(unsigned long long int, aligned_addr) ==
__builtin_bit_cast(unsigned long long int, address);
__builtin_bit_cast(unsigned long long int, address);
__half2 fval;
if (is_lower)
fval = __halves2half2(value, __float2half(0.0f));
+21 -20
ファイルの表示
@@ -327,8 +327,8 @@ where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8
this case, the fp16 mantissa should be shift left by 1 */
act_exponent = exponent - bias + 1;
exponent_diff = f8_denormal_act_exponent -
act_exponent; // actual exponent is exponent-bias+1 as it is denormal
} else { // fp32/fp16 is normal with implicit 1
act_exponent; // actual exponent is exponent-bias+1 as it is denormal
} else { // fp32/fp16 is normal with implicit 1
act_exponent = exponent - bias;
if (act_exponent <= f8_denormal_act_exponent) {
/* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
@@ -345,7 +345,7 @@ So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
}
bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) ==
(1ull << (mfmt - wm + exponent_diff - 1));
(1ull << (mfmt - wm + exponent_diff - 1));
/* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we shift
right as shift right could rip off some residual part and make something not midpoint look like
midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than midpoint, but
@@ -400,9 +400,9 @@ after shift right by 4 bits, it would look like midpoint.
// The conversion function is from rocblas
// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220
// This has been modified to handle double types as well
template <typename T, bool is_fnuz>
__FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x, int wm, int we,
bool clip = false) {
template <typename T, bool is_fnuz> __FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x,
int wm, int we,
bool clip = false) {
#if defined(__clang__) and defined(__HIP__)
constexpr bool is_half = __hip_internal::is_same<T, _Float16>::value;
constexpr bool is_float = __hip_internal::is_same<T, float>::value;
@@ -576,14 +576,15 @@ static __device__ __hip_fp8_storage_t cast_to_f8_from_f32(float v, bool saturate
if (stochastic_rounding) {
ival = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0)
: __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0)
: __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
val.i32val = ival;
i8data = val.i8val[0]; // little endian
} else { // RNE CVT
ival = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false)
: __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0
ival =
(interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false)
: __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0
val.i32val = ival;
i8data = val.i8val[0];
}
@@ -628,8 +629,8 @@ cast_to_f8x2_from_f32x2(float2 v, bool saturate, __hip_fp8_interpretation_t inte
}
f2val.i32val[0] = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
? __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, 0, false)
: __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, 0, false);
? __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, 0, false)
: __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, 0, false);
return static_cast<__hip_fp8x2_storage_t>(f2val.i16val[0]);
}
@@ -643,8 +644,8 @@ static __device__ float cast_to_f32_from_f8(__hip_fp8_storage_t v,
val.i8val[0] = v;
float fval = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
? __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0)
: __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0);
? __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0)
: __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0);
return fval;
}
@@ -657,8 +658,8 @@ static __device__ float2 cast_to_f32x2_from_f8x2(__hip_fp8x2_storage_t v,
val.i16val[0] = v;
auto f2 = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
? __builtin_amdgcn_cvt_pk_f32_fp8(val.i32val, false)
: __builtin_amdgcn_cvt_pk_f32_bf8(val.i32val, false);
? __builtin_amdgcn_cvt_pk_f32_fp8(val.i32val, false)
: __builtin_amdgcn_cvt_pk_f32_bf8(val.i32val, false);
return float2{f2[0], f2[1]};
}
#endif // HIP_FP8_CVT_FAST_PATH
@@ -672,9 +673,9 @@ __FP8_HOST_DEVICE_STATIC__ bool hip_fp8_fnuz_is_nan(__hip_fp8_storage_t a) {
__FP8_HOST_DEVICE_STATIC__ bool hip_fp8_ocp_is_nan(__hip_fp8_storage_t a,
const __hip_fp8_interpretation_t type) {
return (type == __HIP_E4M3) ? ((a & 0x7f) == 0x7f)
: (type == __HIP_E5M2) ? ((a & 0x7f) > 0x7c)
: false;
return (type == __HIP_E4M3) ? ((a & 0x7f) == 0x7f)
: (type == __HIP_E5M2) ? ((a & 0x7f) > 0x7c)
: false;
}
__FP8_HOST_DEVICE_STATIC__ bool hip_fp8_ocp_is_inf(__hip_fp8_storage_t a,
+90 -82
ファイルの表示
@@ -334,13 +334,13 @@ __OCP_FP_HOST_DEVICE_STATIC__ float __amd_cvt_fp8_to_float_scale(
const __amd_scale_t scale) {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
return interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_f32_fp8(val, __amd_scale_to_float(scale), 0)
: __builtin_amdgcn_cvt_scalef32_f32_bf8(val, __amd_scale_to_float(scale), 0);
? __builtin_amdgcn_cvt_scalef32_f32_fp8(val, __amd_scale_to_float(scale), 0)
: __builtin_amdgcn_cvt_scalef32_f32_bf8(val, __amd_scale_to_float(scale), 0);
#else
using namespace fcbx;
return interpret == __AMD_OCP_E4M3
? to_float<float, Encoding::E4M3, true>(static_cast<uint32_t>(val), scale)
: to_float<float, Encoding::E5M2, true>(static_cast<uint32_t>(val), scale);
? to_float<float, Encoding::E4M3, true>(static_cast<uint32_t>(val), scale)
: to_float<float, Encoding::E5M2, true>(static_cast<uint32_t>(val), scale);
#endif
}
@@ -378,8 +378,8 @@ __amd_cvt_float_to_fp8_sr_scale(const float val, const __amd_fp8_interpretation_
} u{0};
using namespace fcbx;
u.ui32t = interpret == __AMD_OCP_E4M3
? from_float_sr<float, Encoding::E4M3, true>(val, seed, scale)
: from_float_sr<float, Encoding::E5M2, true>(val, seed, scale);
? from_float_sr<float, Encoding::E4M3, true>(val, seed, scale)
: from_float_sr<float, Encoding::E5M2, true>(val, seed, scale);
return u.fp8[0];
#endif
}
@@ -548,8 +548,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_floatx2_storage_t __amd_cvt_fp8x2_to_floatx2
const __amd_scale_t scale) {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
return interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(val, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(val, __amd_scale_to_float(scale), false);
? __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(val, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(val, __amd_scale_to_float(scale), false);
#else
using namespace fcbx;
__amd_floatx2_storage_t ret;
@@ -582,10 +582,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_floatx2_to_fp8x2_s
__amd_fp8x2_storage_t fp8x2[2];
} u{0};
u.shortx2 = interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(u.shortx2, val[0], val[1],
__amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(u.shortx2, val[0], val[1],
__amd_scale_to_float(scale), false);
? __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(u.shortx2, val[0], val[1],
__amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(u.shortx2, val[0], val[1],
__amd_scale_to_float(scale), false);
return u.fp8x2[0];
#else
using namespace fcbx;
@@ -679,8 +679,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t __amd_cvt_fp8x2_to_fp16x2_s
} u;
u.fp8x2[0] = val;
return interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(scale), false);
? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(scale), false);
#else
using namespace fcbx;
__amd_fp16x2_storage_t ret;
@@ -787,8 +787,9 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_bf16x2_storage_t __amd_cvt_fp8x2_to_bf16x2_s
} u;
u.fp8x2[0] = in;
return interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), false);
? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale),
false);
#else
using namespace fcbx;
__amd_bf16x2_storage_t ret;
@@ -891,8 +892,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x32_storage_t __amd_cvt_fp6x32_to_fp16x3
#if HIP_ENABLE_GFX950_OCP_BUILTINS
// gfx950 expects scale to be in float
return interpret == __AMD_OCP_E2M3
? __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, __amd_scale_to_float(scale));
? __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, __amd_scale_to_float(scale));
#else
using namespace fcbx;
if (interpret == __AMD_OCP_E2M3) {
@@ -918,8 +919,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_bf16x32_storage_t __amd_cvt_fp6x32_to_bf16x3
const __amd_scale_t scale) {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
return interpret == __AMD_OCP_E2M3
? __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, __amd_scale_to_float(scale));
? __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, __amd_scale_to_float(scale));
#else
using namespace fcbx;
if (interpret == __AMD_OCP_E2M3) {
@@ -937,15 +938,15 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_floatx32_storage_t __amd_cvt_fp6x32_to_float
const __amd_scale_t scale) {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
return interpret == __AMD_OCP_E2M3
? __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(val, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(val, __amd_scale_to_float(scale));
? __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(val, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(val, __amd_scale_to_float(scale));
#else
using namespace fcbx;
return interpret == __AMD_OCP_E2M3
? fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float, Encoding::E2M3,
Encoding::IEEE754>(val, scale)
: fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float, Encoding::E3M2,
Encoding::IEEE754>(val, scale);
? fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float,
Encoding::E2M3, Encoding::IEEE754>(val, scale)
: fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float,
Encoding::E3M2, Encoding::IEEE754>(val, scale);
#endif
}
@@ -1200,9 +1201,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_fp16x2_to_fp8x2_sc
__amd_shortx2_storage_t shortx2;
__amd_fp8x2_storage_t fp8x2[2];
} u{0};
u.shortx2 = interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(u.shortx2, in, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(u.shortx2, in, __amd_scale_to_float(scale), false);
u.shortx2 = interpret == __AMD_OCP_E4M3 ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(
u.shortx2, in, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(
u.shortx2, in, __amd_scale_to_float(scale), false);
return u.fp8x2[0];
#else
static_assert(sizeof(__amd_fp8x2_storage_t[2]) == sizeof(uint32_t));
@@ -1241,10 +1243,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_bf16x2_to_fp8x2_sc
__amd_shortx2_storage_t shortx2;
__amd_fp8x2_storage_t fp8x2[2];
} u{0};
u.shortx2 = interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(u.shortx2, in, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(u.shortx2, in, __amd_scale_to_float(scale),
false);
u.shortx2 = interpret == __AMD_OCP_E4M3 ? __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(
u.shortx2, in, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(
u.shortx2, in, __amd_scale_to_float(scale), false);
return u.fp8x2[0];
#else
using namespace fcbx;
@@ -1429,9 +1431,10 @@ __amd_cvt_fp8_to_fp16_scale(const __amd_fp8_storage_t val,
const __amd_fp8_interpretation_t interpret, const __amd_scale_t scale) {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp16x2_storage_t ret;
ret = interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_f16_fp8(ret, val, __amd_scale_to_float(scale), 0, false)
: __builtin_amdgcn_cvt_scalef32_f16_bf8(ret, val, __amd_scale_to_float(scale), 0, false);
ret =
interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_f16_fp8(ret, val, __amd_scale_to_float(scale), 0, false)
: __builtin_amdgcn_cvt_scalef32_f16_bf8(ret, val, __amd_scale_to_float(scale), 0, false);
return ret[0];
#else
using namespace fcbx;
@@ -1463,9 +1466,10 @@ __amd_cvt_fp8_to_bf16_scale(const __amd_fp8_storage_t val,
unsigned int ui32;
} u{0};
u.fp8[0] = val;
auto ret = interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), false);
auto ret =
interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), false);
return ret[0];
#else
using namespace fcbx;
@@ -1491,8 +1495,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx16_floatx16
const __amd_fp6_interpretation_t interpret, const __amd_scale_t scale) {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
return interpret == __AMD_OCP_E2M3
? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale));
? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale));
#else
__amd_floatx32_storage_t tmp;
for (size_t i = 0; i < 16; i++) {
@@ -1503,10 +1507,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx16_floatx16
}
using namespace fcbx;
return interpret == __AMD_OCP_E2M3
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E2M3>(tmp, scale)
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E3M2>(tmp, scale);
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E2M3>(tmp, scale)
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E3M2>(tmp, scale);
#endif
}
@@ -1529,15 +1533,15 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx32_to_fp6x3
in2 = {val[16], val[17], val[18], val[19], val[20], val[21], val[22], val[23],
val[24], val[25], val[26], val[27], val[28], val[29], val[30], val[31]};
return interpret == __AMD_OCP_E2M3
? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale));
? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale));
#else
using namespace fcbx;
return interpret == __AMD_OCP_E2M3
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E2M3>(val, scale)
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E3M2>(val, scale);
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E2M3>(val, scale)
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E3M2>(val, scale);
#endif
}
@@ -1555,16 +1559,17 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx32_to_fp6x3
const unsigned int round, const __amd_scale_t scale) {
#if __has_builtin(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32) and \
__has_builtin(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32)
return interpret == __AMD_OCP_E2M3
? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(val, round, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(val, round, __amd_scale_to_float(scale));
return interpret == __AMD_OCP_E2M3 ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(
val, round, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(
val, round, __amd_scale_to_float(scale));
#else
using namespace fcbx;
return interpret == __AMD_OCP_E2M3
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E2M3, true>(val, scale, round)
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E3M2, true>(val, scale, round);
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E2M3, true>(val, scale, round)
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
Encoding::IEEE754, Encoding::E3M2, true>(val, scale, round);
#endif
}
@@ -1638,16 +1643,17 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_fp16x32_to_fp6x32
const unsigned int round, const __amd_scale_t scale) {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
return interpret == __AMD_OCP_E2M3
? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(in, round, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(in, round, __amd_scale_to_float(scale));
? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(in, round, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(in, round,
__amd_scale_to_float(scale));
#else
return interpret == __AMD_OCP_E2M3
? fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t,
__amd_fp16_storage_t, fcbx::Encoding::E5M10, fcbx::Encoding::E2M3,
true>(in, scale, round)
: fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t,
__amd_fp16_storage_t, fcbx::Encoding::E5M10, fcbx::Encoding::E3M2,
true>(in, scale, round);
? fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t,
__amd_fp16_storage_t, fcbx::Encoding::E5M10,
fcbx::Encoding::E2M3, true>(in, scale, round)
: fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t,
__amd_fp16_storage_t, fcbx::Encoding::E5M10,
fcbx::Encoding::E3M2, true>(in, scale, round);
#endif
}
@@ -1655,17 +1661,18 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_bf16x32_to_fp6x32
const __amd_bf16x32_storage_t in, const __amd_fp6_interpretation_t interpret,
const unsigned int round, const __amd_scale_t scale) {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
return interpret == __AMD_OCP_E2M3
? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(in, round, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(in, round, __amd_scale_to_float(scale));
return interpret == __AMD_OCP_E2M3 ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(
in, round, __amd_scale_to_float(scale))
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(
in, round, __amd_scale_to_float(scale));
#else
return interpret == __AMD_OCP_E2M3
? fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t,
__amd_bf16_storage_t, fcbx::Encoding::E8M7, fcbx::Encoding::E2M3,
true>(in, scale, round)
: fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t,
__amd_bf16_storage_t, fcbx::Encoding::E8M7, fcbx::Encoding::E3M2,
true>(in, scale, round);
? fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t,
__amd_bf16_storage_t, fcbx::Encoding::E8M7,
fcbx::Encoding::E2M3, true>(in, scale, round)
: fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t,
__amd_bf16_storage_t, fcbx::Encoding::E8M7,
fcbx::Encoding::E3M2, true>(in, scale, round);
#endif
}
@@ -2542,8 +2549,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t __amd_cvt_fp8x2_to_fp16x2(
} u;
u.fp8x2[0] = val;
return interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(0), false)
: __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(0), false);
? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(0), false)
: __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(0), false);
#else
using namespace fcbx;
__amd_fp16x2_storage_t ret;
@@ -2573,9 +2580,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_fp16x2_to_fp8x2(
__amd_shortx2_storage_t shortx2;
__amd_fp8x2_storage_t fp8x2[2];
} u{0};
u.shortx2 = interpret == __AMD_OCP_E4M3
? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(u.shortx2, val, __amd_scale_to_float(0), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(u.shortx2, val, __amd_scale_to_float(0), false);
u.shortx2 = interpret == __AMD_OCP_E4M3 ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(
u.shortx2, val, __amd_scale_to_float(0), false)
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(
u.shortx2, val, __amd_scale_to_float(0), false);
return u.fp8x2[0];
#else
using namespace fcbx;
@@ -2783,8 +2791,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8_storage_t __amd_cvt_fp16_to_fp8_sr(
#else
using namespace fcbx;
return interpret == __AMD_OCP_E4M3
? from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val, sr, 0)
: from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val, sr, 0);
? from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val, sr, 0)
: from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val, sr, 0);
#endif
}
+8 -8
ファイルの表示
@@ -719,8 +719,8 @@ struct __hipext_ocp_fp6x32_e2m3 {
}
#endif
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in,
const __amd_scale_t scale)
__OCP_FP_HOST_DEVICE__
__hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in, const __amd_scale_t scale)
#if HIP_ENABLE_GFX950_OCP_BUILTINS
: __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(in, __amd_scale_to_float(scale))){}
#else
@@ -742,8 +742,8 @@ struct __hipext_ocp_fp6x32_e2m3 {
}
#endif
__OCP_FP_HOST_DEVICE__
__hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in, const __amd_scale_t scale)
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in,
const __amd_scale_t scale)
#if HIP_ENABLE_GFX950_OCP_BUILTINS
: __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(in, __amd_scale_to_float(scale))){}
#else
@@ -832,8 +832,8 @@ struct __hipext_ocp_fp6x32_e3m2 {
}
#endif
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in,
const __amd_scale_t scale)
__OCP_FP_HOST_DEVICE__
__hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in, const __amd_scale_t scale)
#if HIP_ENABLE_GFX950_OCP_BUILTINS
: __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(in, __amd_scale_to_float(scale))){}
#else
@@ -855,8 +855,8 @@ struct __hipext_ocp_fp6x32_e3m2 {
}
#endif
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in,
const __amd_scale_t scale)
__OCP_FP_HOST_DEVICE__
__hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in, const __amd_scale_t scale)
#if HIP_ENABLE_GFX950_OCP_BUILTINS
: __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(in, __amd_scale_to_float(scale))){}
#else
+4 -4
ファイルの表示
@@ -793,11 +793,11 @@ __OCP_FP_HOST_DEVICE_STATIC__ OutType fp6_cvt_packedx32(InType in, int8_t scale
uint32_t seed = 0) {
// This is tightly coupled with the definitions of the amd_ocp_types
constexpr bool in_float = std::is_same<InType, __amd_floatx32_storage_t>::value ||
std::is_same<InType, __amd_fp16x32_storage_t>::value ||
std::is_same<InType, __amd_bf16x32_storage_t>::value;
std::is_same<InType, __amd_fp16x32_storage_t>::value ||
std::is_same<InType, __amd_bf16x32_storage_t>::value;
constexpr bool out_float = std::is_same<OutType, __amd_floatx32_storage_t>::value ||
std::is_same<OutType, __amd_fp16x32_storage_t>::value ||
std::is_same<OutType, __amd_bf16x32_storage_t>::value;
std::is_same<OutType, __amd_fp16x32_storage_t>::value ||
std::is_same<OutType, __amd_bf16x32_storage_t>::value;
using other_type = std::conditional<in_float, OutType, InType>::type;
struct fp6x32_packed {
+2 -3
ファイルの表示
@@ -314,9 +314,8 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) {
* @return Original value contained in \p addr.
*/
__device__ inline float safeAtomicAdd(float* addr, float value) {
#if defined(__gfx908__) || \
((defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && \
!__has_builtin(__hip_atomic_fetch_add))
#if defined(__gfx908__) || ((defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && \
!__has_builtin(__hip_atomic_fetch_add))
// On gfx908, we can generate unsafe FP32 atomic add that does not follow all
// IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
// On gfx90a, gfx942 and gfx950 if we do not have the __hip_atomic_fetch_add builtin, we
+17 -23
ファイルの表示
@@ -59,9 +59,9 @@ template <typename T, unsigned int n> struct HIP_vector_base;
template <typename T, unsigned int rank> struct HIP_vector_type;
namespace hip_impl {
template <typename T, unsigned int n>
__attribute__((always_inline)) __HOST_DEVICE__ typename HIP_vector_base<T, n>::Native_vec_*
get_native_pointer(HIP_vector_base<T, n>& base_vec) {
template <typename T, unsigned int n> __attribute__((always_inline)) __HOST_DEVICE__
typename HIP_vector_base<T, n>::Native_vec_*
get_native_pointer(HIP_vector_base<T, n>& base_vec) {
static_assert(sizeof(base_vec) == sizeof(typename HIP_vector_base<T, n>::Native_vec_));
static_assert(__hip_internal::alignment_of<HIP_vector_base<T, n>>::value ==
__hip_internal::alignment_of<typename HIP_vector_base<T, n>::Native_vec_>::value);
@@ -78,9 +78,9 @@ get_native_pointer(const HIP_vector_base<T, n>& base_vec) {
};
} // Namespace hip_impl.
template <typename T, unsigned int n>
__attribute__((always_inline)) __HOST_DEVICE__ typename HIP_vector_base<T, n>::Native_vec_&
get_native_vector(HIP_vector_base<T, n>& base_vec) {
template <typename T, unsigned int n> __attribute__((always_inline)) __HOST_DEVICE__
typename HIP_vector_base<T, n>::Native_vec_&
get_native_vector(HIP_vector_base<T, n>& base_vec) {
return *hip_impl::get_native_pointer(base_vec);
};
@@ -308,9 +308,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
__HOST_DEVICE__
HIP_vector_type() = default;
template <typename U,
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>::value>::type* =
nullptr>
template <typename U, typename __hip_internal::enable_if<
__hip_internal::is_convertible<U, T>::value>::type* = nullptr>
__HOST_DEVICE__ explicit constexpr HIP_vector_type(U x_) noexcept
: HIP_vector_base<T, rank>{static_cast<T>(x_)} {}
template < // TODO: constrain based on type as well.
@@ -368,9 +367,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
#endif
return *this;
}
template <
typename U,
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
template <typename U, typename __hip_internal::enable_if<
__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
__HOST_DEVICE__ HIP_vector_type& operator+=(U x) noexcept {
return *this += make_vector_type<T, rank>(x);
}
@@ -383,9 +381,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
#endif
return *this;
}
template <
typename U,
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
template <typename U, typename __hip_internal::enable_if<
__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
__HOST_DEVICE__ HIP_vector_type& operator-=(U x) noexcept {
return *this -= make_vector_type<T, rank>(x);
}
@@ -404,9 +401,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
return HIP_vector_type{x} *= y;
}
template <
typename U,
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
template <typename U, typename __hip_internal::enable_if<
__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
__HOST_DEVICE__ HIP_vector_type& operator*=(U x) noexcept {
return *this *= make_vector_type<T, rank>(x);
}
@@ -424,9 +420,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
#endif
return *this;
}
template <
typename U,
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
template <typename U, typename __hip_internal::enable_if<
__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
__HOST_DEVICE__ HIP_vector_type& operator/=(U x) noexcept {
return *this /= make_vector_type<T, rank>(x);
}
@@ -576,8 +571,7 @@ __HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator/(
return make_vector_type<T, n>(x) /= y;
}
template <typename T, unsigned int n>
__HOST_DEVICE__ inline
template <typename T, unsigned int n> __HOST_DEVICE__ inline
#if __cplusplus >= 201402L && !defined(__HIPCC_RTC__)
constexpr
#endif
+28 -42
ファイルの表示
@@ -109,9 +109,8 @@ static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format,
* \param x [in] The coordinate where the value will be read out.
* \param boundaryMode [in] The boundary mode is currently ignored.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x,
int boundaryMode = hipBoundaryModeZero) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT;
@@ -128,9 +127,8 @@ static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t su
* \param surfObj [in] The surface descriptor.
* \param x [in] The coordinate where the data will be written.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
@@ -147,9 +145,8 @@ static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t su
* \param x [in] The x coordinate where the value will be read out.
* \param y [in] The y coordinate where the value will be read out.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x,
int y) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -168,9 +165,8 @@ static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t su
* \param x [in] The x coordinate where the data will be written.
* \param y [in] The y coordinate where the data will be written.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x,
int y) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -190,9 +186,8 @@ static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t su
* \param y [in] The y coordinate where the value will be read out.
* \param z [in] The z coordinate where the value will be read out.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y,
int z) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -212,9 +207,8 @@ static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t su
* \param y [in] The y coordinate where the data will be written.
* \param z [in] The z coordinate where the data will be written.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y,
int z) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -233,9 +227,8 @@ static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t su
* \param x [in] The coordinate where the value will be read out.
* \param layer [in] The layer index where the value will be read out.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -253,9 +246,8 @@ static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObje
* \param x [in] The x coordinate where the data will be written.
* \param layer [in] The layer index where the data will be written.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -274,9 +266,8 @@ static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObje
* \param y [in] The y coordinate where the value will be read out.
* \param layer [in] The layer index where the value will be read out.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
int y, int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -296,9 +287,8 @@ static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObje
* \param y [in] The y coordinate where the data will be written.
* \param layer [in] The layer index where the data will be written.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
int y, int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -318,9 +308,8 @@ static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObje
* \param y [in] The y coordinate where the value will be read out.
* \param face [in] The face index where the value will be read out.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x,
int y, int face) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -340,9 +329,8 @@ static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject
* \param y [in] The y coordinate where the data will be written.
* \param face [in] The face index where the data will be written.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x,
int y, int face) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -363,9 +351,8 @@ static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject
* \param face [in] The face index where the value will be read out.
* \param layer [in] The layer index where the value will be read out.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj,
int x, int y, int face, int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
@@ -386,9 +373,8 @@ static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfac
* \param face [in] The face index where the data will be written.
* \param layer [in] The layer index where the data will be written.
*/
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj,
int x, int y, int face, int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
+1 -1
ファイルの表示
@@ -443,7 +443,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
return backwardPermute(firstLane << 2, result);
else {
auto tmp = (static_cast<unsigned long long>(backwardPermute(firstLane << 2, result[1])) << 32) |
static_cast<unsigned int>(backwardPermute(firstLane << 2, result[0]));
static_cast<unsigned int>(backwardPermute(firstLane << 2, result[0]));
return *reinterpret_cast<T*>(&tmp);
}
}
+3 -6
ファイルの表示
@@ -130,12 +130,9 @@ inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSiz
blockSizeLimit);
}
template <class T>
inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
T kernel,
size_t dynSharedMemPerBlk = 0,
int blockSizeLimit = 0,
unsigned int flags = 0) {
template <class T> inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(
int* gridSize, int* blockSize, T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0,
unsigned int flags = 0) {
using namespace hip_impl;
hip_impl::hip_init();
+11 -12
ファイルの表示
@@ -51,11 +51,11 @@ namespace std { // TODO: these should be removed as soon as possible.
#if (__cplusplus < 201406L)
#if (__cplusplus < 201402L)
template <bool cond, typename T = void> using enable_if_t = typename enable_if<cond, T>::type;
template <bool cond, typename T, typename U>
using conditional_t = typename conditional<cond, T, U>::type;
template <bool cond, typename T, typename U> using conditional_t =
typename conditional<cond, T, U>::type;
template <typename T> using decay_t = typename decay<T>::type;
template <FunctionalProcedure F, typename... Ts>
using result_of_t = typename result_of<F(Ts...)>::type;
template <FunctionalProcedure F, typename... Ts> using result_of_t =
typename result_of<F(Ts...)>::type;
template <typename T> using remove_reference_t = typename remove_reference<T>::type;
#endif
#endif
@@ -67,8 +67,8 @@ template <typename...> using void_t_ = void;
#if HIP_HAS_INVOCABLE
template <typename, typename = void> struct is_callable_impl;
template <FunctionalProcedure F, typename... Ts>
struct is_callable_impl<F(Ts...)> : std::is_invocable<F, Ts...> {};
template <FunctionalProcedure F, typename... Ts> struct is_callable_impl<F(Ts...)>
: std::is_invocable<F, Ts...> {};
#elif HIP_HAS_RESULT_OF_SFINAE
template <typename, typename = void> struct is_callable_impl : std::false_type {};
@@ -76,11 +76,10 @@ template <FunctionalProcedure F, typename... Ts>
struct is_callable_impl<F(Ts...), void_t_<typename std::result_of<F(Ts...)>::type> >
: std::true_type {};
#else
template <class Base, class T, class Derived>
auto simple_invoke(T Base::* pmd, Derived&& ref) -> decltype(static_cast<Derived&&>(ref).*pmd);
template <class Base, class T, class Derived> auto simple_invoke(T Base::* pmd, Derived&& ref)
-> decltype(static_cast<Derived&&>(ref).*pmd);
template <class PMD, class Pointer>
auto simple_invoke(PMD&& pmd, Pointer&& ptr)
template <class PMD, class Pointer> auto simple_invoke(PMD&& pmd, Pointer&& ptr)
-> decltype((*static_cast<Pointer&&>(ptr)).*static_cast<PMD&&>(pmd));
template <class Base, class T, class Derived>
@@ -100,8 +99,8 @@ template <class Base, class T, class Derived, class... Args>
auto simple_invoke(T Base::* pmf, const std::reference_wrapper<Derived>& ref, Args&&... args)
-> decltype((ref.get().*pmf)(static_cast<Args&&>(args)...));
template <class F, class... Ts>
auto simple_invoke(F&& f, Ts&&... xs) -> decltype(f(static_cast<Ts&&>(xs)...));
template <class F, class... Ts> auto simple_invoke(F&& f, Ts&&... xs)
-> decltype(f(static_cast<Ts&&>(xs)...));
template <typename, typename = void> struct is_callable_impl : std::false_type {};
+10 -10
ファイルの表示
@@ -56,19 +56,19 @@ using lane_mask = unsigned long long int;
namespace cooperative_groups {
/* Global scope */
template <unsigned int size>
using is_power_of_2 = __hip_internal::integral_constant<bool, (size & (size - 1)) == 0>;
template <unsigned int size> using is_power_of_2 =
__hip_internal::integral_constant<bool, (size & (size - 1)) == 0>;
template <unsigned int size>
using is_valid_wavefront = __hip_internal::integral_constant<bool, size <= 64>;
template <unsigned int size> using is_valid_wavefront =
__hip_internal::integral_constant<bool, size <= 64>;
template <unsigned int size>
using is_valid_tile_size = __hip_internal::integral_constant<
bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
template <unsigned int size> using is_valid_tile_size =
__hip_internal::integral_constant<bool, is_power_of_2<size>::value &&
is_valid_wavefront<size>::value>;
template <typename T>
using is_valid_type = __hip_internal::integral_constant<
bool, __hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value>;
template <typename T> using is_valid_type =
__hip_internal::integral_constant<bool, __hip_internal::is_integral<T>::value ||
__hip_internal::is_floating_point<T>::value>;
namespace internal {
+10 -9
ファイルの表示
@@ -8101,9 +8101,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
break;
// hipDeviceGetPCIBusId[('char*', 'pciBusId'), ('int', 'len'), ('int', 'device')]
case HIP_API_ID_hipDeviceGetPCIBusId:
data->args.hipDeviceGetPCIBusId.pciBusId = (data->args.hipDeviceGetPCIBusId.pciBusId)
? strdup(data->args.hipDeviceGetPCIBusId.pciBusId)
: NULL;
data->args.hipDeviceGetPCIBusId.pciBusId =
(data->args.hipDeviceGetPCIBusId.pciBusId)
? strdup(data->args.hipDeviceGetPCIBusId.pciBusId)
: NULL;
break;
// hipDeviceGetSharedMemConfig[('hipSharedMemConfig*', 'pConfig')]
case HIP_API_ID_hipDeviceGetSharedMemConfig:
@@ -8991,9 +8992,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
if (data->args.hipGraphInstantiate.pErrorNode)
data->args.hipGraphInstantiate.pErrorNode__val =
*(data->args.hipGraphInstantiate.pErrorNode);
data->args.hipGraphInstantiate.pLogBuffer = (data->args.hipGraphInstantiate.pLogBuffer)
? strdup(data->args.hipGraphInstantiate.pLogBuffer)
: NULL;
data->args.hipGraphInstantiate.pLogBuffer =
(data->args.hipGraphInstantiate.pLogBuffer)
? strdup(data->args.hipGraphInstantiate.pLogBuffer)
: NULL;
break;
// hipGraphInstantiateWithFlags[('hipGraphExec_t*', 'pGraphExec'), ('hipGraph_t', 'graph'),
// ('unsigned long long', 'flags')]
@@ -15959,9 +15961,8 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize);
oss << ", dynSharedMemPerBlk=";
roctracer::hip_support::detail::operator<<(
oss,
data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
.dynSharedMemPerBlk);
oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
.dynSharedMemPerBlk);
oss << ", flags=";
roctracer::hip_support::detail::operator<<(
oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags);
+15 -18
ファイルの表示
@@ -114,11 +114,11 @@ template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
template <typename _Tp, bool = is_arithmetic<_Tp>::value> struct is_signed : public false_type {};
template <typename _Tp>
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};
template <typename _Tp> struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {
};
template <class T>
auto test_returnable(int) -> decltype(void(static_cast<T (*)()>(nullptr)), true_type{});
template <class T> auto test_returnable(int)
-> decltype(void(static_cast<T (*)()>(nullptr)), true_type{});
template <class> auto test_returnable(...) -> false_type;
template <class T> struct type_identity {
@@ -139,8 +139,7 @@ template <class T> struct add_rvalue_reference : decltype(try_add_rvalue_referen
template <typename T> typename add_rvalue_reference<T>::type declval() noexcept;
template <class From, class To>
auto test_implicitly_convertible(int)
template <class From, class To> auto test_implicitly_convertible(int)
-> decltype(void(declval<void (&)(To)>()(declval<From>())), true_type{});
template <class, class> auto test_implicitly_convertible(...) -> false_type;
@@ -160,12 +159,10 @@ template <class T> struct remove_cv<const volatile T> {
template <class T> struct is_void : public is_same<void, typename remove_cv<T>::type> {};
template <class From, class To>
struct is_convertible
: public integral_constant<bool,
(decltype(test_returnable<To>(0))::value &&
decltype(test_implicitly_convertible<From, To>(0))::value) ||
(is_void<From>::value && is_void<To>::value)> {};
template <class From, class To> struct is_convertible
: public integral_constant<bool, (decltype(test_returnable<To>(0))::value &&
decltype(test_implicitly_convertible<From, To>(0))::value) ||
(is_void<From>::value && is_void<To>::value)> {};
template <typename _CharT> struct char_traits;
template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
@@ -173,8 +170,8 @@ template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_o
typedef basic_istream<char> istream;
typedef basic_ostream<char> ostream;
template <typename _Tp>
struct is_standard_layout : public integral_constant<bool, __is_standard_layout(_Tp)> {};
template <typename _Tp> struct is_standard_layout
: public integral_constant<bool, __is_standard_layout(_Tp)> {};
template <typename _Tp> struct is_trivial : public integral_constant<bool, __is_trivial(_Tp)> {};
@@ -195,15 +192,15 @@ template <typename T, T... Ints> struct integer_sequence {
template <size_t... Ints> using index_sequence = integer_sequence<size_t, Ints...>;
template <size_t _hip_N, size_t... Ints>
struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};
template <size_t _hip_N, size_t... Ints> struct make_index_sequence_impl
: make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};
template <size_t... Ints> struct make_index_sequence_impl<0, Ints...> {
using type = index_sequence<Ints...>;
};
template <size_t _hip_N>
using make_index_sequence = typename make_index_sequence_impl<_hip_N>::type;
template <size_t _hip_N> using make_index_sequence =
typename make_index_sequence_impl<_hip_N>::type;
template <size_t... Ints>
constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {
+3 -3
ファイルの表示
@@ -61,9 +61,9 @@ template <typename C, typename D> RAII_guard<C, D> make_RAII_guard(const C& ctor
return RAII_guard<C, D>{ctor, std::move(dtor)};
}
template <FunctionalProcedure F, typename... Ts>
using is_new_grid_launch_t = typename std::conditional<is_callable<F(Ts...)>{}, New_grid_launch_tag,
Old_grid_launch_tag>::type;
template <FunctionalProcedure F, typename... Ts> using is_new_grid_launch_t =
typename std::conditional<is_callable<F(Ts...)>{}, New_grid_launch_tag,
Old_grid_launch_tag>::type;
} // namespace
// TODO: - dispatch rank should be derived from the domain dimensions passed
+18 -27
ファイルの表示
@@ -37,8 +37,8 @@ THE SOFTWARE.
(void)s;
template <typename T> struct __hip_is_tex_surf_scalar_channel_type {
static constexpr bool value = __hip_internal::is_same<T, char>::value ||
__hip_internal::is_same<T, unsigned char>::value ||
static constexpr bool value =
__hip_internal::is_same<T, char>::value || __hip_internal::is_same<T, unsigned char>::value ||
__hip_internal::is_same<T, short>::value ||
__hip_internal::is_same<T, unsigned short>::value || __hip_internal::is_same<T, int>::value ||
__hip_internal::is_same<T, unsigned int>::value || __hip_internal::is_same<T, float>::value;
@@ -51,12 +51,12 @@ template <typename T> struct __hip_is_tex_surf_channel_type {
template <typename T, unsigned int rank>
struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>> {
static constexpr bool value = __hip_is_tex_surf_scalar_channel_type<T>::value &&
((rank == 1) || (rank == 2) || (rank == 4));
((rank == 1) || (rank == 2) || (rank == 4));
};
template <typename T> struct __hip_is_tex_normalized_channel_type {
static constexpr bool value = __hip_internal::is_same<T, char>::value ||
__hip_internal::is_same<T, unsigned char>::value ||
static constexpr bool value =
__hip_internal::is_same<T, char>::value || __hip_internal::is_same<T, unsigned char>::value ||
__hip_internal::is_same<T, short>::value || __hip_internal::is_same<T, unsigned short>::value;
};
@@ -73,8 +73,7 @@ template <typename T, hipTextureReadMode readMode, typename Enable = void> struc
/*
* Map from device function return U to scalar texture type T
*/
template <typename T, typename U>
__forceinline__ __device__
template <typename T, typename U> __forceinline__ __device__
typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
const T>::type
__hipMapFrom(const U& u) {
@@ -96,8 +95,7 @@ __forceinline__ __device__
/*
* Map from device function return U to vector texture type T
*/
template <typename T, typename U>
__forceinline__ __device__ typename __hip_internal::enable_if<
template <typename T, typename U> __forceinline__ __device__ typename __hip_internal::enable_if<
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
__hipMapFrom(const U& u) {
if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
@@ -118,8 +116,7 @@ __hipMapFrom(const U& u) {
/*
* Map from scalar texture type T to device function input U
*/
template <typename U, typename T>
__forceinline__ __device__
template <typename U, typename T> __forceinline__ __device__
typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
const U>::type
__hipMapTo(const T& t) {
@@ -143,8 +140,7 @@ __forceinline__ __device__
/*
* Map from vector texture type T to device function input U
*/
template <typename U, typename T>
__forceinline__ __device__ typename __hip_internal::enable_if<
template <typename U, typename T> __forceinline__ __device__ typename __hip_internal::enable_if<
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
__hipMapTo(const T& t) {
if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
@@ -164,18 +160,16 @@ __hipMapTo(const T& t) {
}
}
template <typename T, hipTextureReadMode readMode>
using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;
template <typename T, hipTextureReadMode readMode> using __hip_tex_ret_t =
typename __hip_tex_ret<T, readMode, bool>::type;
template <typename T>
struct __hip_tex_ret<
template <typename T> struct __hip_tex_ret<
T, hipReadModeElementType,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
using type = T;
};
template <typename T, unsigned int rank>
struct __hip_tex_ret<
template <typename T, unsigned int rank> struct __hip_tex_ret<
HIP_vector_type<T, rank>, hipReadModeElementType,
typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
@@ -189,8 +183,7 @@ struct __hip_tex_ret<T, hipReadModeNormalizedFloat,
using type = float;
};
template <typename T, unsigned int rank>
struct __hip_tex_ret<
template <typename T, unsigned int rank> struct __hip_tex_ret<
HIP_vector_type<T, rank>, hipReadModeNormalizedFloat,
typename __hip_internal::enable_if<
__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
@@ -421,18 +414,16 @@ struct __hip_tex2dgather_ret {
static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
};
template <typename T, hipTextureReadMode readMode>
using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;
template <typename T, hipTextureReadMode readMode> using __hip_tex2dgather_ret_t =
typename __hip_tex2dgather_ret<T, readMode, bool>::type;
template <typename T>
struct __hip_tex2dgather_ret<
template <typename T> struct __hip_tex2dgather_ret<
T, hipReadModeElementType,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
using type = HIP_vector_type<T, 4>;
};
template <typename T, unsigned int rank>
struct __hip_tex2dgather_ret<
template <typename T, unsigned int rank> struct __hip_tex2dgather_ret<
HIP_vector_type<T, rank>, hipReadModeElementType,
typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
+92 -138
ファイルの表示
@@ -37,41 +37,36 @@ THE SOFTWARE.
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; \
(void)s;
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Loads one element from a 1D texture at integer coordinate `x` and converts it to T.
// TEXTURE_OBJECT_PARAMETERS_INIT puts `i` (and the sampler pointer `s`, unused here) in scope;
// `i` is presumably derived from `textureObject` -- the start of the macro is not visible here.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  auto texel = __ockl_image_load_1Db(i, x);
  return __hipMapFrom<T>(texel);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex1Dfetch: performs the fetch through the value-returning
// tex1Dfetch<T> overload and stores the result in *ptr. `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex1Dfetch(T* ptr, hipTextureObject_t textureObject,
                                                  int x) {
  T fetched = tex1Dfetch<T>(textureObject, x);
  *ptr = fetched;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Samples a 1D texture at coordinate `x` and converts the result to T.
// TEXTURE_OBJECT_PARAMETERS_INIT puts `i` and the sampler pointer `s` in scope; both are handed
// to __ockl_image_sample_1D. `i` is presumably derived from `textureObject` -- the start of the
// macro is not visible here.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  auto texel = __ockl_image_sample_1D(i, s, x);
  return __hipMapFrom<T>(texel);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex1D: samples through the value-returning tex1D<T> overload and
// stores the result in *ptr. `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex1D(T* ptr, hipTextureObject_t textureObject, float x) {
  T sampled = tex1D<T>(textureObject, x);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y) {
TEXTURE_OBJECT_PARAMETERS_INIT
float2 coords{x, y};
@@ -79,17 +74,15 @@ static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, floa
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex2D: samples at (x, y) through the value-returning tex2D<T>
// overload and stores the result in *ptr. `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex2D(T* ptr, hipTextureObject_t textureObject, float x,
                                             float y) {
  T sampled = tex2D<T>(textureObject, x, y);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y,
float z) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -98,17 +91,15 @@ static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, floa
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex3D: samples at (x, y, z) through the value-returning tex3D<T>
// overload and stores the result in *ptr. `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex3D(T* ptr, hipTextureObject_t textureObject, float x,
                                             float y, float z) {
  T sampled = tex3D<T>(textureObject, x, y, z);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x,
int layer) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -117,17 +108,15 @@ static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObjec
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex1DLayered: samples coordinate `x` of the given `layer` through the
// value-returning tex1DLayered<T> overload and stores the result in *ptr. `ptr` is dereferenced
// unconditionally. Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex1DLayered(T* ptr, hipTextureObject_t textureObject,
                                                    float x, int layer) {
  T sampled = tex1DLayered<T>(textureObject, x, layer);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y,
int layer) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -136,17 +125,15 @@ static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObjec
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex2DLayered: samples (x, y) of the given `layer` and stores the
// result in *ptr. `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T, typename __hip_internal::enable_if<
                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLayered(T* ptr, hipTextureObject_t textureObject,
                                                    float x, float y, int layer) {
  // Forward to the 2D layered value overload. tex1DLayered<T> takes (object, x, layer), so
  // passing (x, y, layer) to it cannot match once this template is instantiated.
  *ptr = tex2DLayered<T>(textureObject, x, y, layer);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y,
float z) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -155,17 +142,15 @@ static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject,
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of texCubemap: samples at (x, y, z) through the value-returning
// texCubemap<T> overload and stores the result in *ptr. `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void texCubemap(T* ptr, hipTextureObject_t textureObject,
                                                  float x, float y, float z) {
  T sampled = texCubemap<T>(textureObject, x, y, z);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x,
float y, float z, int layer) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -174,17 +159,15 @@ static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t texture
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of texCubemapLayered: samples (x, y, z) of the given `layer` through the
// value-returning texCubemapLayered<T> overload and stores the result in *ptr. `ptr` is
// dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void texCubemapLayered(T* ptr, hipTextureObject_t textureObject,
                                                         float x, float y, float z, int layer) {
  T sampled = texCubemapLayered<T>(textureObject, x, y, z, layer);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y,
int comp = 0) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -214,17 +197,15 @@ static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject
return {};
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex2Dgather: gathers at (x, y) for component `comp` (default 0) and
// stores the result in *ptr. `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T, typename __hip_internal::enable_if<
                          __hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2Dgather(T* ptr, hipTextureObject_t textureObject,
                                                   float x, float y, int comp = 0) {
  // Forward to the value-returning gather overload. texCubemapLayered<T> takes
  // (object, x, y, z, layer), so passing (x, y, comp) to it cannot match on instantiation.
  *ptr = tex2Dgather<T>(textureObject, x, y, comp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x,
float level) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -232,17 +213,15 @@ static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, f
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex1DLod: samples coordinate `x` at the given `level` through the
// value-returning tex1DLod<T> overload and stores the result in *ptr. `ptr` is dereferenced
// unconditionally. Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex1DLod(T* ptr, hipTextureObject_t textureObject, float x,
                                                float level) {
  T sampled = tex1DLod<T>(textureObject, x, level);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y,
float level) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -251,17 +230,15 @@ static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, f
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex2DLod: samples (x, y) at the given `level` through the
// value-returning tex2DLod<T> overload and stores the result in *ptr. `ptr` is dereferenced
// unconditionally. Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex2DLod(T* ptr, hipTextureObject_t textureObject, float x,
                                                float y, float level) {
  T sampled = tex2DLod<T>(textureObject, x, y, level);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y,
float z, float level) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -270,17 +247,15 @@ static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, f
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex3DLod: samples (x, y, z) at the given `level` through the
// value-returning tex3DLod<T> overload and stores the result in *ptr. `ptr` is dereferenced
// unconditionally. Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex3DLod(T* ptr, hipTextureObject_t textureObject, float x,
                                                float y, float z, float level) {
  T sampled = tex3DLod<T>(textureObject, x, y, z, level);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x,
int layer, float level) {
TEXTURE_OBJECT_PARAMETERS_INIT;
@@ -290,17 +265,15 @@ static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureOb
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex1DLayeredLod: samples coordinate `x` of `layer` at the given
// `level` through the value-returning tex1DLayeredLod<T> overload and stores the result in
// *ptr. `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex1DLayeredLod(T* ptr, hipTextureObject_t textureObject,
                                                       float x, int layer, float level) {
  T sampled = tex1DLayeredLod<T>(textureObject, x, layer, level);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x,
float y, int layer, float level) {
TEXTURE_OBJECT_PARAMETERS_INIT;
@@ -310,17 +283,15 @@ static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureOb
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex2DLayeredLod: samples (x, y) of `layer` at the given `level`
// through the value-returning tex2DLayeredLod<T> overload and stores the result in *ptr.
// `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex2DLayeredLod(T* ptr, hipTextureObject_t textureObject,
                                                       float x, float y, int layer,
                                                       float level) {
  T sampled = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x,
float y, float z, float level) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -329,17 +300,15 @@ static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObje
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of texCubemapLod: samples (x, y, z) at the given `level` through the
// value-returning texCubemapLod<T> overload and stores the result in *ptr. `ptr` is
// dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void texCubemapLod(T* ptr, hipTextureObject_t textureObject,
                                                     float x, float y, float z, float level) {
  T sampled = texCubemapLod<T>(textureObject, x, y, z, level);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x,
float y, float z, float4 dPdx, float4 dPdy) {
TEXTURE_OBJECT_PARAMETERS_INIT;
@@ -355,18 +324,16 @@ static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObj
return {};
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of texCubemapGrad: forwards (x, y, z) and the gradient arguments
// dPdx/dPdy to the value-returning texCubemapGrad<T> overload and stores the result in *ptr.
// `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void texCubemapGrad(T* ptr, hipTextureObject_t textureObject,
                                                      float x, float y, float z, float4 dPdx,
                                                      float4 dPdy) {
  T sampled = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x,
float y, float z, int layer, float level) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -375,9 +342,8 @@ static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t text
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayeredLod(T* ptr,
hipTextureObject_t textureObject,
float x, float y, float z, int layer,
@@ -385,9 +351,8 @@ static __device__ __hip_img_chk__ void texCubemapLayeredLod(T* ptr,
*ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx,
float dPdy) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -395,17 +360,15 @@ static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject,
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex1DGrad: forwards `x` and the gradient arguments dPdx/dPdy to the
// value-returning tex1DGrad<T> overload and stores the result in *ptr. `ptr` is dereferenced
// unconditionally. Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex1DGrad(T* ptr, hipTextureObject_t textureObject,
                                                 float x, float dPdx, float dPdy) {
  T sampled = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y,
float2 dPdx, float2 dPdy) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -415,17 +378,15 @@ static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject,
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex2DGrad: forwards (x, y) and the gradient arguments dPdx/dPdy to
// the value-returning tex2DGrad<T> overload and stores the result in *ptr. `ptr` is
// dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex2DGrad(T* ptr, hipTextureObject_t textureObject,
                                                 float x, float y, float2 dPdx, float2 dPdy) {
  T sampled = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y,
float z, float4 dPdx, float4 dPdy) {
TEXTURE_OBJECT_PARAMETERS_INIT;
@@ -438,17 +399,15 @@ static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject,
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex3DGrad: forwards (x, y, z) and the gradient arguments dPdx/dPdy to
// the value-returning tex3DGrad<T> overload and stores the result in *ptr. `ptr` is
// dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex3DGrad(T* ptr, hipTextureObject_t textureObject,
                                                 float x, float y, float z, float4 dPdx,
                                                 float4 dPdy) {
  T sampled = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x,
int layer, float dPdx, float dPdy) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -457,18 +416,16 @@ static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureO
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex1DLayeredGrad: forwards `x`, `layer` and the gradient arguments
// dPdx/dPdy to the value-returning tex1DLayeredGrad<T> overload and stores the result in *ptr.
// `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex1DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
                                                        float x, int layer, float dPdx,
                                                        float dPdy) {
  T sampled = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x,
float y, int layer, float2 dPdx, float2 dPdy) {
TEXTURE_OBJECT_PARAMETERS_INIT
@@ -478,18 +435,16 @@ static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureO
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
// Pointer-output form of tex2DLayeredGrad: forwards (x, y), `layer` and the gradient arguments
// dPdx/dPdy to the value-returning tex2DLayeredGrad<T> overload and stores the result in *ptr.
// `ptr` is dereferenced unconditionally.
// Only enabled when __hip_is_tex_surf_channel_type<T>::value is true.
template <typename T,
          typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* =
              nullptr>
static __device__ __hip_img_chk__ void tex2DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
                                                        float x, float y, int layer, float2 dPdx,
                                                        float2 dPdy) {
  T sampled = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
  *ptr = sampled;
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x,
float y, float z, int layer, float4 dPdx,
float4 dPdy) {
@@ -507,9 +462,8 @@ static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t tex
return {};
}
template <
typename T,
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
template <typename T, typename __hip_internal::enable_if<
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T* ptr,
hipTextureObject_t textureObject,
float x, float y, float z, int layer,
+7 -9
ファイルの表示
@@ -156,8 +156,8 @@ bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id,
static inline unsigned int getGenericVersion(const void* image) {
const Elf64_Ehdr* ehdr = reinterpret_cast<const Elf64_Ehdr*>(image);
return ehdr->e_ident[EI_ABIVERSION] == ELFABIVERSION_AMDGPU_HSA_V6
? ((ehdr->e_flags & EF_AMDGPU_GENERIC_VERSION) >> EF_AMDGPU_GENERIC_VERSION_OFFSET)
: 0;
? ((ehdr->e_flags & EF_AMDGPU_GENERIC_VERSION) >> EF_AMDGPU_GENERIC_VERSION_OFFSET)
: 0;
}
static inline bool isGenericTarget(const void* image) {
@@ -178,10 +178,9 @@ bool UnbundleBitCode(const std::vector<char>& bundled_llvm_bitcode, const std::s
const void* data = reinterpret_cast<const void*>(bundled_llvm_bitcode_s.c_str());
const auto obheader = reinterpret_cast<const __ClangOffloadBundleHeader*>(data);
const auto* desc = &obheader->desc[0];
for (uint64_t idx = 0; idx < obheader->numOfCodeObjects; ++idx,
desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
desc->bundleEntryIdSize)) {
for (uint64_t idx = 0; idx < obheader->numOfCodeObjects;
++idx, desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) + desc->bundleEntryIdSize)) {
const void* image =
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);
const size_t image_size = desc->size;
@@ -736,9 +735,8 @@ bool demangleName(const std::string& mangledName, std::string& demangledName) {
demangledName.resize(demangled_size);
if (AMD_COMGR_STATUS_SUCCESS !=
amd::Comgr::get_data(demangled_data, &demangled_size,
const_cast<char*>(demangledName.data()))) {
if (AMD_COMGR_STATUS_SUCCESS != amd::Comgr::get_data(demangled_data, &demangled_size,
const_cast<char*>(demangledName.data()))) {
amd::Comgr::release_data(mangled_data);
amd::Comgr::release_data(demangled_data);
return false;
+6 -5
ファイルの表示
@@ -135,7 +135,7 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) {
command->awaitCompletion();
ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_) -
time(false)) /
1000000.f;
1000000.f;
command->release();
} else {
// Note: with direct dispatch eStop.ready() relies on HW event, but CPU status can be delayed.
@@ -210,7 +210,8 @@ hipError_t Event::streamWait(hip::Stream* stream, uint flags) {
hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t ext_flags,
bool batch_flush) {
if (command == nullptr) {
int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) &
int32_t releaseFlags =
((ext_flags == 0) ? flags_ : ext_flags) &
(hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence);
if (releaseFlags & hipEventDisableSystemFence) {
releaseFlags = amd::Device::kCacheStateIgnore;
@@ -269,8 +270,8 @@ bool isValid(hipEvent_t event) {
// ================================================================================================
hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming |
hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventInterprocess |
hipEventDisableSystemFence;
hipEventReleaseToDevice | hipEventReleaseToSystem |
hipEventInterprocess | hipEventDisableSystemFence;
const unsigned releaseFlags =
(hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence);
@@ -284,7 +285,7 @@ hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
}
return bitcount;
}(flags & releaseFlags) > 1) ||
((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming));
((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming));
if (!illegalFlags) {
hip::Event* e = nullptr;
if (flags & hipEventInterprocess) {
+6 -7
ファイルの表示
@@ -37,10 +37,9 @@ template <typename comgr_T> class ComgrUniqueHandle {
// constructor which takes ownership of a correctly initialzed handle
ComgrUniqueHandle(comgr_T& handle) : comgr_obj_(handle) { handle = {0}; };
template <typename T = comgr_T,
std::enable_if_t<std::is_same_v<T, amd_comgr_data_set_t> ||
std::is_same_v<T, amd_comgr_action_info_t>,
bool> = true>
template <typename T = comgr_T, std::enable_if_t<std::is_same_v<T, amd_comgr_data_set_t> ||
std::is_same_v<T, amd_comgr_action_info_t>,
bool> = true>
[[nodiscard]] amd_comgr_status_t Create() {
if constexpr (std::is_same_v<T, amd_comgr_data_set_t>) {
return amd::Comgr::create_data_set(&comgr_obj_);
@@ -736,9 +735,9 @@ hipError_t FatBinaryInfo::BuildProgram(const int device_id) {
// If Program was already built skip this step and return success
if (dev_programs_[device_id]->IsProgramBuilt(*g_devices[device_id]->devices()[0]) == false) {
if (CL_SUCCESS !=
dev_programs_[device_id]->build(g_devices[device_id]->devices(), nullptr, nullptr, nullptr,
kOptionChangeable, kNewDevProg)) {
if (CL_SUCCESS != dev_programs_[device_id]->build(g_devices[device_id]->devices(), nullptr,
nullptr, nullptr, kOptionChangeable,
kNewDevProg)) {
return hipErrorNoBinaryForGpu;
}
if (!dev_programs_[device_id]->load()) {
+2 -2
ファイルの表示
@@ -581,8 +581,8 @@ bool Graph::RunOneNode(Node node, bool wait) {
for (auto edge : node->GetEdges()) {
// Don't wait in the nodes, executed on the same streams and if it has just one dependency
bool wait = ((i < DEBUG_HIP_FORCE_GRAPH_QUEUES) || (edge->GetDependencies().size() > 1))
? true
: false;
? true
: false;
// Execute the edge node
if (!RunOneNode(edge, wait)) {
return false;
+4 -5
ファイルの表示
@@ -366,9 +366,8 @@ class GraphNode : public hipGraphNodeDOTAttribute {
virtual void EnqueueCommands(hip::Stream* stream) {
// If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
// Node can be enabled/disabled only for kernel, memcpy and memset nodes.
if (!isEnabled_ &&
(type_ == hipGraphNodeTypeKernel || type_ == hipGraphNodeTypeMemcpy ||
type_ == hipGraphNodeTypeMemset)) {
if (!isEnabled_ && (type_ == hipGraphNodeTypeKernel || type_ == hipGraphNodeTypeMemcpy ||
type_ == hipGraphNodeTypeMemset)) {
amd::Command::EventWaitList waitList;
if (!commands_.empty()) {
waitList = commands_[0]->eventWaitList();
@@ -1677,7 +1676,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
label = buffer;
} else {
label = std::to_string(GetID()) + "\n" + label_ + "\n(" + memcpyDirection + "," +
std::to_string(count_) + ")";
std::to_string(count_) + ")";
}
return label;
}
@@ -1948,7 +1947,7 @@ class GraphMemsetNode : public GraphNode {
sizeBytes = memsetParams_.width * memsetParams_.height * depth_ * memsetParams_.elementSize;
}
label = std::to_string(GetID()) + "\n" + label_ + "\n(" +
std::to_string(memsetParams_.value) + "," + std::to_string(sizeBytes) + ")";
std::to_string(memsetParams_.value) + "," + std::to_string(sizeBytes) + ")";
}
return label;
}
+6 -6
ファイルの表示
@@ -227,8 +227,8 @@ hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr, size_t len
// This type of memory may only be specified if the device associated with the
// stream reports a non-zero value for the device attribute hipDevAttrPageableMemoryAccess.
hip::Stream* hip_stream = (stream == nullptr || stream == hipStreamLegacy)
? hip::getCurrentDevice()->NullStream()
: hip::getStream(stream);
? hip::getCurrentDevice()->NullStream()
: hip::getStream(stream);
size_t offset = 0;
amd::Memory* memObj = getMemoryObject(dev_ptr, offset);
if (memObj == nullptr) {
@@ -328,13 +328,13 @@ hipError_t ihipMemPrefetchAsync(const void* dev_ptr, size_t count, hipMemLocatio
// Pick the specified stream or Null one from the provided target device
if (cpuAccess == true) {
hip_stream = (stream == nullptr || stream == hipStreamLegacy)
? hip::getCurrentDevice()->NullStream()
: hip::getStream(stream);
? hip::getCurrentDevice()->NullStream()
: hip::getStream(stream);
} else {
dev = g_devices[targetDevice]->devices()[0];
hip_stream = (stream == nullptr || stream == hipStreamLegacy)
? g_devices[targetDevice]->NullStream()
: hip::getStream(stream);
? g_devices[targetDevice]->NullStream()
: hip::getStream(stream);
}
if (hip_stream == nullptr) {
+3 -3
ファイルの表示
@@ -327,9 +327,9 @@ class Stream : public amd::HostQueue {
unsigned long long captureID_;
// Translates a Priority value into the matching amd::CommandQueue::Priority:
// High -> High, Low -> Low, and every other value -> Normal.
static inline CommandQueue::Priority convertToQueuePriority(Priority p) {
  if (p == Priority::High) {
    return amd::CommandQueue::Priority::High;
  }
  if (p == Priority::Low) {
    return amd::CommandQueue::Priority::Low;
  }
  return amd::CommandQueue::Priority::Normal;
}
public:
+16 -20
ファイルの表示
@@ -67,8 +67,8 @@ hipMemoryType getMemoryType(const amd::Memory* memory) {
}
return ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & memory->getMemFlags())
? hipMemoryTypeHost
: hipMemoryTypeDevice;
? hipMemoryTypeHost
: hipMemoryTypeDevice;
}
// ================================================================================================
@@ -336,8 +336,8 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
hip::getCurrentDevice()->SetActiveStatus();
size_t max_device_size = IS_LINUX
? dev_info.maxMemAllocSize_
: (dev_info.maxMemAllocSize_ + dev_info.maxPhysicalMemAllocSize_);
? dev_info.maxMemAllocSize_
: (dev_info.maxMemAllocSize_ + dev_info.maxPhysicalMemAllocSize_);
if ((useHostDevice && dev_info.maxPhysicalMemAllocSize_ < sizeBytes) ||
(!useHostDevice && max_device_size < sizeBytes)) {
@@ -401,9 +401,8 @@ hipError_t ihipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
}
if (flags == 0 ||
flags &
(hipHostMallocCoherent | hipHostMallocMapped | hipHostMallocNumaUser |
hipHostMallocUncached) ||
flags & (hipHostMallocCoherent | hipHostMallocMapped | hipHostMallocNumaUser |
hipHostMallocUncached) ||
(!(flags & hipHostMallocNonCoherent) && HIP_HOST_COHERENT)) {
ihipFlags |= CL_MEM_SVM_ATOMICS;
}
@@ -1143,7 +1142,7 @@ hipError_t ihipArrayCreate(hipArray_t* array, const HIP_ARRAY3D_DESCRIPTOR* pAll
return hipErrorInvalidValue;
}
unsigned int flags = hipArrayDefault | hipArrayLayered | hipArraySurfaceLoadStore |
hipArrayTextureGather; // hipArrayCubemap isn't supported
hipArrayTextureGather; // hipArrayCubemap isn't supported
if (pAllocateArray->Flags & (~flags)) {
return hipErrorInvalidValue;
}
@@ -1282,9 +1281,8 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) {
hipError_t ihipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) {
if (hostPtr == nullptr || sizeBytes == 0 ||
flags &
~(hipHostRegisterPortable | hipHostRegisterMapped | hipExtHostRegisterCoarseGrained |
hipExtHostRegisterUncached)) {
flags & ~(hipHostRegisterPortable | hipHostRegisterMapped | hipExtHostRegisterCoarseGrained |
hipExtHostRegisterUncached)) {
return hipErrorInvalidValue;
} else {
unsigned int memFlags = CL_MEM_USE_HOST_PTR | CL_MEM_SVM_ATOMICS;
@@ -1377,9 +1375,8 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) {
if (ptr == nullptr) {
HIP_RETURN(hipErrorInvalidValue);
}
if (flags &
~(hipHostAllocPortable | hipHostAllocMapped | hipHostAllocWriteCombined |
hipHostAllocUncached)) {
if (flags & ~(hipHostAllocPortable | hipHostAllocMapped | hipHostAllocWriteCombined |
hipHostAllocUncached)) {
HIP_RETURN(hipErrorInvalidValue);
}
@@ -1868,9 +1865,9 @@ hipError_t ihipMemcpyHtoH(void* dstHost, const void* srcHost, amd::Coord3D copyR
for (size_t slice = 0; slice < copyRegion[2]; slice++) {
for (size_t row = 0; row < copyRegion[1]; row++) {
const void* srcRow = static_cast<const char*>(srcHost) + srcRect.start_ +
row * srcRect.rowPitch_ + slice * srcRect.slicePitch_;
row * srcRect.rowPitch_ + slice * srcRect.slicePitch_;
void* dstRow = static_cast<char*>(dstHost) + dstRect.start_ + row * dstRect.rowPitch_ +
slice * dstRect.slicePitch_;
slice * dstRect.slicePitch_;
std::memcpy(dstRow, srcRow, copyRegion[0]);
}
}
@@ -2331,9 +2328,8 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool
// Transfers from device memory to pageable host memory and transfers from any
// host memory to any host memory are synchronous with respect to the host.
// Device to Device copies do not need to host side synchronization.
if (dstMemoryType == hipMemoryTypeHost ||
((pCopy->srcMemoryType == hipMemoryTypeHost) &&
(pCopy->dstMemoryType == hipMemoryTypeHost))) {
if (dstMemoryType == hipMemoryTypeHost || ((pCopy->srcMemoryType == hipMemoryTypeHost) &&
(pCopy->dstMemoryType == hipMemoryTypeHost))) {
isAsync = false;
} else if ((pCopy->srcMemoryType == hipMemoryTypeDevice) &&
(pCopy->dstMemoryType == hipMemoryTypeDevice)) {
@@ -4111,7 +4107,7 @@ hipError_t ihipMipmapArrayCreate(hipMipmappedArray_t* mipmapped_array_pptr,
return hipErrorInvalidValue;
}
unsigned int flags = hipArrayDefault | hipArrayLayered | hipArraySurfaceLoadStore |
hipArrayTextureGather; // hipArrayCubemap isn't supported
hipArrayTextureGather; // hipArrayCubemap isn't supported
if (mipmapped_array_desc_ptr->Flags & (~flags)) {
return hipErrorInvalidValue;
}
+2 -2
ファイルの表示
@@ -380,8 +380,8 @@ hipError_t hipMallocFromPoolAsync(void** dev_ptr, size_t size, hipMemPool_t mem_
auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool);
auto hip_stream = (stream == nullptr || stream == hipStreamLegacy)
? hip::getCurrentDevice()->NullStream()
: reinterpret_cast<hip::Stream*>(stream);
? hip::getCurrentDevice()->NullStream()
: reinterpret_cast<hip::Stream*>(stream);
*dev_ptr = mpool->AllocateMemory(size, hip_stream);
if (*dev_ptr == nullptr) {
HIP_RETURN(hipErrorOutOfMemory);
+3 -3
ファイルの表示
@@ -422,9 +422,9 @@ hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) {
break;
case hipMemPoolAttrReservedMemCurrent:
// All allocated memory by the pool in OS
*reinterpret_cast<uint64_t*>(value) = (state_.use_vm_heap_)
? MappedSize()
: (busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
*reinterpret_cast<uint64_t*>(value) =
(state_.use_vm_heap_) ? MappedSize()
: (busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
break;
case hipMemPoolAttrReservedMemHigh:
// High watermark of all allocated memory in OS, since the last reset
+3 -4
ファイルの表示
@@ -165,7 +165,7 @@ hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunc
case HIP_FUNC_ATTRIBUTE_PTX_VERSION:
case HIP_FUNC_ATTRIBUTE_BINARY_VERSION:
*value = hip::getCurrentDevice()->devices()[0]->isa().versionMajor() * 10 +
hip::getCurrentDevice()->devices()[0]->isa().versionMinor();
hip::getCurrentDevice()->devices()[0]->isa().versionMinor();
break;
case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA:
*value = 0;
@@ -224,9 +224,8 @@ hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int valu
(device::Kernel*)(kernel->getDeviceKernel(*(hip::getCurrentDevice()->devices()[0])));
if (attr == hipFuncAttributeMaxDynamicSharedMemorySize) {
if ((value < 0) ||
(value > (d_kernel->workGroupInfo()->availableLDSSize_ -
d_kernel->workGroupInfo()->localMemSize_))) {
if ((value < 0) || (value > (d_kernel->workGroupInfo()->availableLDSSize_ -
d_kernel->workGroupInfo()->localMemSize_))) {
HIP_RETURN(hipErrorInvalidValue);
}
d_kernel->workGroupInfo()->maxDynamicSharedSizeBytes_ = value;
+22 -19
ファイルの表示
@@ -79,9 +79,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
// pResViewDesc can only be specified if the type of resource is a HIP array or a HIP mipmapped
// array.
if ((pResViewDesc != nullptr) &&
((pResDesc->resType != hipResourceTypeArray) &&
(pResDesc->resType != hipResourceTypeMipmappedArray))) {
if ((pResViewDesc != nullptr) && ((pResDesc->resType != hipResourceTypeArray) &&
(pResDesc->resType != hipResourceTypeMipmappedArray))) {
return hipErrorUnknown;
}
@@ -176,9 +175,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
// hipAddressModeWrap and hipAddressModeMirror won't be supported
// and will be switched to hipAddressModeClamp.
for (int i = 0; i < 3; i++) {
if ((pTexDesc->normalizedCoords == 0) &&
((pTexDesc->addressMode[i] == hipAddressModeWrap) ||
(pTexDesc->addressMode[i] == hipAddressModeMirror))) {
if ((pTexDesc->normalizedCoords == 0) && ((pTexDesc->addressMode[i] == hipAddressModeWrap) ||
(pTexDesc->addressMode[i] == hipAddressModeMirror))) {
addressMode[i] = hip::getCLAddressingMode(hipAddressModeClamp);
}
// hipTextureDesc::addressMode is ignored if hipResourceDesc::resType is hipResourceTypeLinear
@@ -237,12 +235,14 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
if ((pResViewDesc != nullptr) || (readMode == hipReadModeNormalizedFloat) ||
(pTexDesc->sRGB == 1)) {
// TODO ROCclr currently right now can only change the format of the image.
const cl_channel_order channelOrder = (pResViewDesc != nullptr)
? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB)
: hip::getCLChannelOrder(pResDesc->res.array.array->NumChannels, pTexDesc->sRGB);
const cl_channel_type channelType = (pResViewDesc != nullptr)
? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode)
: hip::getCLChannelType(pResDesc->res.array.array->Format, readMode);
const cl_channel_order channelOrder =
(pResViewDesc != nullptr)
? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB)
: hip::getCLChannelOrder(pResDesc->res.array.array->NumChannels, pTexDesc->sRGB);
const cl_channel_type channelType =
(pResViewDesc != nullptr)
? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode)
: hip::getCLChannelType(pResDesc->res.array.array->Format, readMode);
const amd::Image::Format imageFormat(cl_image_format{channelOrder, channelType});
if (!imageFormat.isValid()) {
return hipErrorInvalidValue;
@@ -277,12 +277,14 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
if ((pResViewDesc != nullptr) || (readMode == hipReadModeNormalizedFloat) ||
(pTexDesc->sRGB == 1)) {
// TODO ROCclr currently right now can only change the format of the image.
const cl_channel_order channelOrder = (pResViewDesc != nullptr)
? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB)
: hip::getCLChannelOrder(pResDesc->res.mipmap.mipmap->num_channels, pTexDesc->sRGB);
const cl_channel_type channelType = (pResViewDesc != nullptr)
? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode)
: hip::getCLChannelType(pResDesc->res.mipmap.mipmap->format, readMode);
const cl_channel_order channelOrder =
(pResViewDesc != nullptr)
? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB)
: hip::getCLChannelOrder(pResDesc->res.mipmap.mipmap->num_channels, pTexDesc->sRGB);
const cl_channel_type channelType =
(pResViewDesc != nullptr)
? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode)
: hip::getCLChannelType(pResDesc->res.mipmap.mipmap->format, readMode);
const amd::Image::Format imageFormat(cl_image_format{channelOrder, channelType});
if (!imageFormat.isValid()) {
return hipErrorInvalidValue;
@@ -335,7 +337,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode);
const amd::Image::Format imageFormat({channelOrder, channelType});
const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType);
const size_t imageSizeInBytes = pResDesc->res.pitch2D.width * imageFormat.getElementSize() +
const size_t imageSizeInBytes =
pResDesc->res.pitch2D.width * imageFormat.getElementSize() +
pResDesc->res.pitch2D.pitchInBytes * (pResDesc->res.pitch2D.height - 1);
amd::Memory* buffer =
getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes);
+3 -3
ファイルの表示
@@ -36,9 +36,9 @@ int checkContextProperties(const cl_context_properties* properties, bool* offlin
namespace amd {
template <typename T>
static inline cl_int clGetInfo(T& field, size_t param_value_size, void* param_value,
size_t* param_value_size_ret) {
template <typename T> static inline cl_int clGetInfo(T& field, size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) {
const void* valuePtr;
size_t valueSize;
+4 -3
ファイルの表示
@@ -164,9 +164,10 @@ RUNTIME_ENTRY(cl_int, clGetEventInfo,
}
case CL_EVENT_COMMAND_QUEUE: {
amd::Command& command = as_amd(event)->command();
cl_command_queue queue = command.queue() == NULL
? NULL
: const_cast<cl_command_queue>(as_cl(command.queue()->asCommandQueue()));
cl_command_queue queue =
command.queue() == NULL
? NULL
: const_cast<cl_command_queue>(as_cl(command.queue()->asCommandQueue()));
return amd::clGetInfo(queue, param_value_size, param_value, param_value_size_ret);
}
case CL_EVENT_COMMAND_TYPE: {
+4 -6
ファイルの表示
@@ -885,9 +885,8 @@ RUNTIME_ENTRY(cl_int, clGetGLContextInfoKHR,
for (cl_uint i = 0; i < num_gpu_devices; ++i) {
cl_device_id device = gpu_devices[i];
if (is_valid(device) &&
as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_,
VALIDATE_ONLY)) {
if (is_valid(device) && as_amd(device)->bindExternalDevice(info.flags_, info.hDev_,
info.hCtx_, VALIDATE_ONLY)) {
return amd::clGetInfo(device, param_value_size, param_value, param_value_size_ret);
}
}
@@ -912,9 +911,8 @@ RUNTIME_ENTRY(cl_int, clGetGLContextInfoKHR,
for (cl_uint i = 0; i < total_devices; ++i) {
cl_device_id device = devices[i];
if (is_valid(device) &&
as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_,
VALIDATE_ONLY)) {
if (is_valid(device) && as_amd(device)->bindExternalDevice(info.flags_, info.hDev_,
info.hCtx_, VALIDATE_ONLY)) {
compatible_devices.push_back(as_amd(device));
}
}
+10 -15
ファイルの表示
@@ -70,12 +70,10 @@ static bool validateFlags(cl_mem_flags flags, bool chkReadWrite = false) {
temp |= (flags & CL_MEM_KERNEL_READ_AND_WRITE);
}
if (temp &&
!(CL_MEM_READ_WRITE == temp || CL_MEM_WRITE_ONLY == temp ||
(chkReadWrite &&
(CL_MEM_KERNEL_READ_AND_WRITE == temp ||
(CL_MEM_KERNEL_READ_AND_WRITE | CL_MEM_READ_WRITE) == temp)) ||
CL_MEM_READ_ONLY == temp)) {
if (temp && !(CL_MEM_READ_WRITE == temp || CL_MEM_WRITE_ONLY == temp ||
(chkReadWrite && (CL_MEM_KERNEL_READ_AND_WRITE == temp ||
(CL_MEM_KERNEL_READ_AND_WRITE | CL_MEM_READ_WRITE) == temp)) ||
CL_MEM_READ_ONLY == temp)) {
return false;
}
@@ -89,9 +87,8 @@ static bool validateFlags(cl_mem_flags flags, bool chkReadWrite = false) {
}
if ((flags & CL_MEM_EXTERNAL_PHYSICAL_AMD) &&
(flags &
(CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE |
CL_MEM_READ_ONLY))) {
(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR |
CL_MEM_READ_WRITE | CL_MEM_READ_ONLY))) {
return false;
}
@@ -414,9 +411,8 @@ RUNTIME_ENTRY_RET(cl_mem, clCreateBuffer,
// check extensions flag consistency
if ((flags & CL_MEM_USE_PERSISTENT_MEM_AMD) &&
(flags &
(CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD |
CL_MEM_BUS_ADDRESSABLE_AMD))) {
(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD |
CL_MEM_BUS_ADDRESSABLE_AMD))) {
*not_null(errcode_ret) = CL_INVALID_VALUE;
LogWarning("conflicting flags CL_MEM_USE_PERSISTENT_MEM_AMD and host memory specific flags");
return (cl_mem)0;
@@ -901,9 +897,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyBuffer,
return CL_INVALID_VALUE;
}
if (srcBuffer == dstBuffer &&
((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
(dst_offset <= src_offset && src_offset < dst_offset + cb))) {
if (srcBuffer == dstBuffer && ((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
(dst_offset <= src_offset && src_offset < dst_offset + cb))) {
return CL_MEM_COPY_OVERLAP;
}
+2 -3
ファイルの表示
@@ -60,9 +60,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyBufferP2PAMD,
return CL_INVALID_VALUE;
}
if (srcBuffer == dstBuffer &&
((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
(dst_offset <= src_offset && src_offset < dst_offset + cb))) {
if (srcBuffer == dstBuffer && ((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
(dst_offset <= src_offset && src_offset < dst_offset + cb))) {
return CL_MEM_COPY_OVERLAP;
}
+1 -1
ファイルの表示
@@ -1833,7 +1833,7 @@ RUNTIME_ENTRY(cl_int, clGetKernelWorkGroupInfo,
// Return the amount of used local memory
const size_t align = amdDevice.info().minDataTypeAlignSize_;
cl_ulong memSize = as_amd(kernel)->parameters().localMemSize(align) +
amd::alignUp(devKernel->workGroupInfo()->localMemSize_, align);
amd::alignUp(devKernel->workGroupInfo()->localMemSize_, align);
return amd::clGetInfo(memSize, param_value_size, param_value, param_value_size_ret);
}
case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
+24 -26
ファイルの表示
@@ -1021,9 +1021,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
* does not work, because when using a derived type (e.g. Context) the generic
* template will provide a better match.
*/
template <typename Func, typename T>
inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int,
typename T::cl_type = 0) {
template <typename Func, typename T> inline cl_int getInfoHelper(Func f, cl_uint name,
VECTOR_CLASS<T>* param, int,
typename T::cl_type = 0) {
::size_t required;
cl_int err = f(name, 0, NULL, &required);
if (err != CL_SUCCESS) {
@@ -2743,12 +2743,10 @@ template <typename IteratorType>
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
IteratorType endIterator);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
IteratorType startIterator, IteratorType endIterator);
/*! \brief Class interface for Buffer Memory Objects.
@@ -2804,9 +2802,9 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL) {
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false,
cl_int* err = NULL) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -2850,17 +2848,17 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
/*!
* \brief Construct a Buffer from a host container via iterators using a specified queue.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
//! \brief Default constructor - initializes to NULL.
Buffer() : Memory() {}
@@ -5321,8 +5319,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
const VECTOR_CLASS<const void*>* mem_locs = NULL,
const VECTOR_CLASS<Event>* events = NULL, Event* event = NULL) const {
cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0)
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
: NULL;
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
: NULL;
if (mems != NULL) {
for (unsigned int i = 0; i < mem_objects->size(); i++) {
@@ -5512,9 +5510,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_;
__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
#endif // !_WIN32
template <typename IteratorType>
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr, cl_int* err) {
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr, cl_int* err) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -5716,9 +5714,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
* Host to Device.
* Uses specified queue.
*/
template <typename IteratorType>
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer) {
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
+2 -2
ファイルの表示
@@ -59,7 +59,7 @@ extern "C" {
#define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED \
CL_EXTENSION_WEAK_LINK \
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
#ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
@@ -68,7 +68,7 @@ extern "C" {
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED \
CL_EXTENSION_WEAK_LINK \
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
#else
#warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+24 -26
ファイルの表示
@@ -1021,9 +1021,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
* does not work, because when using a derived type (e.g. Context) the generic
* template will provide a better match.
*/
template <typename Func, typename T>
inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int,
typename T::cl_type = 0) {
template <typename Func, typename T> inline cl_int getInfoHelper(Func f, cl_uint name,
VECTOR_CLASS<T>* param, int,
typename T::cl_type = 0) {
::size_t required;
cl_int err = f(name, 0, NULL, &required);
if (err != CL_SUCCESS) {
@@ -2743,12 +2743,10 @@ template <typename IteratorType>
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
IteratorType endIterator);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
IteratorType startIterator, IteratorType endIterator);
/*! \brief Class interface for Buffer Memory Objects.
@@ -2804,9 +2802,9 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL) {
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false,
cl_int* err = NULL) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -2850,17 +2848,17 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
/*!
* \brief Construct a Buffer from a host container via iterators using a specified queue.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
//! \brief Default constructor - initializes to NULL.
Buffer() : Memory() {}
@@ -5321,8 +5319,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
const VECTOR_CLASS<const void*>* mem_locs = NULL,
const VECTOR_CLASS<Event>* events = NULL, Event* event = NULL) const {
cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0)
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
: NULL;
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
: NULL;
if (mems != NULL) {
for (unsigned int i = 0; i < mem_objects->size(); i++) {
@@ -5512,9 +5510,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_;
__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
#endif // !_WIN32
template <typename IteratorType>
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr, cl_int* err) {
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr, cl_int* err) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -5716,9 +5714,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
* Host to Device.
* Uses specified queue.
*/
template <typename IteratorType>
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer) {
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
+59 -62
ファイルの表示
@@ -1765,9 +1765,8 @@ template <typename T> inline bool operator!=(const Wrapper<T>& lhs, const Wrappe
using BuildLogType =
vector<std::pair<cl::Device,
typename detail::param_traits<detail::cl_program_build_info,
CL_PROGRAM_BUILD_LOG>::param_type>>;
vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info,
CL_PROGRAM_BUILD_LOG>::param_type>>;
#if defined(CL_HPP_ENABLE_EXCEPTIONS)
/**
* Exception class for build errors to carry build info
@@ -2961,12 +2960,10 @@ template <typename IteratorType>
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
IteratorType endIterator);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
IteratorType startIterator, IteratorType endIterator);
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
@@ -3053,8 +3050,8 @@ template <typename T, class SVMTrait> class SVMAllocator {
SVMAllocator(const SVMAllocator& other) : context_(other.context_) {}
template <typename U>
SVMAllocator(const SVMAllocator<U, SVMTrait>& other) : context_(other.context_) {}
template <typename U> SVMAllocator(const SVMAllocator<U, SVMTrait>& other)
: context_(other.context_) {}
~SVMAllocator() {}
@@ -3272,9 +3269,9 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL) {
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false,
cl_int* err = NULL) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -3318,17 +3315,17 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
/*!
* \brief Construct a Buffer from a host container via iterators using a specified queue.
* If useHostPtr is specified iterators must be random access.
*/
template <typename IteratorType>
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
//! \brief Default constructor - initializes to NULL.
Buffer() : Memory() {}
@@ -4828,8 +4825,7 @@ template <typename T, class Enable = void> struct KernelArgumentHandler;
// Enable for objects that are not subclasses of memory
// Pointers, constants etc
template <typename T>
struct KernelArgumentHandler<
template <typename T> struct KernelArgumentHandler<
T, typename std::enable_if<!std::is_base_of<cl::Memory, T>::value>::type> {
static size_type size(const T&) { return sizeof(T); }
static const T* ptr(const T& value) { return &value; }
@@ -4992,9 +4988,8 @@ class Kernel : public detail::Wrapper<cl_kernel> {
__GET_KERNEL_ARG_INFO_ERR);
}
template <cl_int name>
size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
cl_int* err = NULL) const {
template <cl_int name> size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
cl_int* err = NULL) const {
size_type param;
cl_int result = getSubGroupInfo(dev, name, range, &param);
if (err != NULL) {
@@ -5591,9 +5586,8 @@ inline Program linkProgram(vector<Program> inputPrograms, const char* options =
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
// Template specialization for CL_PROGRAM_BINARIES
template <>
inline cl_int cl::Program::getInfo(cl_program_info name,
vector<vector<unsigned char>>* param) const {
template <> inline cl_int cl::Program::getInfo(cl_program_info name,
vector<vector<unsigned char>>* param) const {
if (name != CL_PROGRAM_BINARIES) {
return CL_INVALID_VALUE;
}
@@ -6367,9 +6361,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
* Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
* This variant takes a raw SVM pointer.
*/
template <typename T>
cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
const vector<Event>* events = NULL, Event* event = NULL) const {
template <typename T> cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
size_type size, const vector<Event>* events = NULL,
Event* event = NULL) const {
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueSVMMap(
@@ -6468,9 +6462,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
* This variant takes a cl::pointer instance.
*/
template <typename T, class D>
cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
Event* event = NULL) const {
template <typename T, class D> cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
const vector<Event>* events = NULL,
Event* event = NULL) const {
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueSVMUnmap(
@@ -6488,9 +6482,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
* This variant takes a cl::vector instance.
*/
template <typename T, class Alloc>
cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
Event* event = NULL) const {
template <typename T, class Alloc> cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
const vector<Event>* events = NULL,
Event* event = NULL) const {
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueSVMUnmap(
@@ -6827,8 +6821,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
cl::Context context = cl::Context::getDefault();
cl::Device device = cl::Device::getDefault();
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
cl_command_queue_properties mergedProperties =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
static_cast<cl_command_queue_properties>(properties);
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
@@ -6847,8 +6842,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
cl_int* err = NULL) {
cl_int error;
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
cl_command_queue_properties mergedProperties =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
static_cast<cl_command_queue_properties>(properties);
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
@@ -6866,8 +6862,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
cl_int* err = NULL) {
cl_int error;
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
cl_command_queue_properties mergedProperties =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
static_cast<cl_command_queue_properties>(properties);
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, CL_QUEUE_SIZE,
queueSize, 0};
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
@@ -7021,9 +7018,9 @@ template <> struct KernelArgumentHandler<cl::DeviceCommandQueue, void> {
#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
template <typename IteratorType>
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr, cl_int* err) {
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr, cl_int* err) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -7163,9 +7160,9 @@ inline void* enqueueMapBuffer(const Buffer& buffer, cl_bool blocking, cl_map_fla
* update a region of a coarse-grained SVM buffer.
* This variant takes a raw SVM pointer.
*/
template <typename T>
inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
const vector<Event>* events, Event* event) {
template <typename T> inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
size_type size, const vector<Event>* events,
Event* event) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7180,10 +7177,10 @@ inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_t
* update a region of a coarse-grained SVM buffer.
* This variant takes a cl::pointer instance.
*/
template <typename T, class D>
inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking, cl_map_flags flags,
size_type size, const vector<Event>* events = NULL,
Event* event = NULL) {
template <typename T, class D> inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking,
cl_map_flags flags, size_type size,
const vector<Event>* events = NULL,
Event* event = NULL) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7257,9 +7254,9 @@ inline cl_int enqueueUnmapSVM(T* ptr, const vector<Event>* events = NULL, Event*
* SVM buffer back to the OpenCL runtime.
* This variant takes a cl::pointer instance.
*/
template <typename T, class D>
inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
Event* event = NULL) {
template <typename T, class D> inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
const vector<Event>* events = NULL,
Event* event = NULL) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7275,9 +7272,9 @@ inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* event
* SVM buffer back to the OpenCL runtime.
* This variant takes a cl::vector instance.
*/
template <typename T, class Alloc>
inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
Event* event = NULL) {
template <typename T, class Alloc> inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
const vector<Event>* events = NULL,
Event* event = NULL) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7336,9 +7333,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
* Host to Device.
* Uses specified queue.
*/
template <typename IteratorType>
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer) {
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
+2 -2
ファイルの表示
@@ -67,7 +67,7 @@ extern "C" {
#define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED \
CL_EXTENSION_WEAK_LINK \
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
#ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
@@ -76,7 +76,7 @@ extern "C" {
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED \
CL_EXTENSION_WEAK_LINK \
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
#else
#warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+24 -26
ファイルの表示
@@ -1021,9 +1021,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
* does not work, because when using a derived type (e.g. Context) the generic
* template will provide a better match.
*/
template <typename Func, typename T>
inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int,
typename T::cl_type = 0) {
template <typename Func, typename T> inline cl_int getInfoHelper(Func f, cl_uint name,
VECTOR_CLASS<T>* param, int,
typename T::cl_type = 0) {
::size_t required;
cl_int err = f(name, 0, NULL, &required);
if (err != CL_SUCCESS) {
@@ -2743,12 +2743,10 @@ template <typename IteratorType>
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
IteratorType endIterator);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
IteratorType startIterator, IteratorType endIterator);
/*! \brief Class interface for Buffer Memory Objects.
@@ -2804,9 +2802,9 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL) {
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false,
cl_int* err = NULL) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -2850,17 +2848,17 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
/*!
* \brief Construct a Buffer from a host container via iterators using a specified queue.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
//! \brief Default constructor - initializes to NULL.
Buffer() : Memory() {}
@@ -5321,8 +5319,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
const VECTOR_CLASS<const void*>* mem_locs = NULL,
const VECTOR_CLASS<Event>* events = NULL, Event* event = NULL) const {
cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0)
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
: NULL;
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
: NULL;
if (mems != NULL) {
for (unsigned int i = 0; i < mem_objects->size(); i++) {
@@ -5512,9 +5510,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_;
__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
#endif // !_WIN32
template <typename IteratorType>
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr, cl_int* err) {
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr, cl_int* err) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -5716,9 +5714,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
* Host to Device.
* Uses specified queue.
*/
template <typename IteratorType>
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer) {
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
+59 -62
ファイルの表示
@@ -1765,9 +1765,8 @@ template <typename T> inline bool operator!=(const Wrapper<T>& lhs, const Wrappe
using BuildLogType =
vector<std::pair<cl::Device,
typename detail::param_traits<detail::cl_program_build_info,
CL_PROGRAM_BUILD_LOG>::param_type>>;
vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info,
CL_PROGRAM_BUILD_LOG>::param_type>>;
#if defined(CL_HPP_ENABLE_EXCEPTIONS)
/**
* Exception class for build errors to carry build info
@@ -2961,12 +2960,10 @@ template <typename IteratorType>
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
IteratorType endIterator);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
IteratorType startIterator, IteratorType endIterator);
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
@@ -3053,8 +3050,8 @@ template <typename T, class SVMTrait> class SVMAllocator {
SVMAllocator(const SVMAllocator& other) : context_(other.context_) {}
template <typename U>
SVMAllocator(const SVMAllocator<U, SVMTrait>& other) : context_(other.context_) {}
template <typename U> SVMAllocator(const SVMAllocator<U, SVMTrait>& other)
: context_(other.context_) {}
~SVMAllocator() {}
@@ -3272,9 +3269,9 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL) {
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false,
cl_int* err = NULL) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -3318,17 +3315,17 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
/*!
* \brief Construct a Buffer from a host container via iterators using a specified queue.
* If useHostPtr is specified iterators must be random access.
*/
template <typename IteratorType>
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
//! \brief Default constructor - initializes to NULL.
Buffer() : Memory() {}
@@ -4828,8 +4825,7 @@ template <typename T, class Enable = void> struct KernelArgumentHandler;
// Enable for objects that are not subclasses of memory
// Pointers, constants etc
template <typename T>
struct KernelArgumentHandler<
template <typename T> struct KernelArgumentHandler<
T, typename std::enable_if<!std::is_base_of<cl::Memory, T>::value>::type> {
static size_type size(const T&) { return sizeof(T); }
static const T* ptr(const T& value) { return &value; }
@@ -4992,9 +4988,8 @@ class Kernel : public detail::Wrapper<cl_kernel> {
__GET_KERNEL_ARG_INFO_ERR);
}
template <cl_int name>
size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
cl_int* err = NULL) const {
template <cl_int name> size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
cl_int* err = NULL) const {
size_type param;
cl_int result = getSubGroupInfo(dev, name, range, &param);
if (err != NULL) {
@@ -5591,9 +5586,8 @@ inline Program linkProgram(vector<Program> inputPrograms, const char* options =
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
// Template specialization for CL_PROGRAM_BINARIES
template <>
inline cl_int cl::Program::getInfo(cl_program_info name,
vector<vector<unsigned char>>* param) const {
template <> inline cl_int cl::Program::getInfo(cl_program_info name,
vector<vector<unsigned char>>* param) const {
if (name != CL_PROGRAM_BINARIES) {
return CL_INVALID_VALUE;
}
@@ -6367,9 +6361,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
* Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
* This variant takes a raw SVM pointer.
*/
template <typename T>
cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
const vector<Event>* events = NULL, Event* event = NULL) const {
template <typename T> cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
size_type size, const vector<Event>* events = NULL,
Event* event = NULL) const {
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueSVMMap(
@@ -6468,9 +6462,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
* This variant takes a cl::pointer instance.
*/
template <typename T, class D>
cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
Event* event = NULL) const {
template <typename T, class D> cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
const vector<Event>* events = NULL,
Event* event = NULL) const {
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueSVMUnmap(
@@ -6488,9 +6482,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
* This variant takes a cl::vector instance.
*/
template <typename T, class Alloc>
cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
Event* event = NULL) const {
template <typename T, class Alloc> cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
const vector<Event>* events = NULL,
Event* event = NULL) const {
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueSVMUnmap(
@@ -6827,8 +6821,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
cl::Context context = cl::Context::getDefault();
cl::Device device = cl::Device::getDefault();
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
cl_command_queue_properties mergedProperties =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
static_cast<cl_command_queue_properties>(properties);
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
@@ -6847,8 +6842,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
cl_int* err = NULL) {
cl_int error;
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
cl_command_queue_properties mergedProperties =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
static_cast<cl_command_queue_properties>(properties);
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
@@ -6866,8 +6862,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
cl_int* err = NULL) {
cl_int error;
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
cl_command_queue_properties mergedProperties =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
static_cast<cl_command_queue_properties>(properties);
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, CL_QUEUE_SIZE,
queueSize, 0};
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
@@ -7021,9 +7018,9 @@ template <> struct KernelArgumentHandler<cl::DeviceCommandQueue, void> {
#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
template <typename IteratorType>
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr, cl_int* err) {
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr, cl_int* err) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -7163,9 +7160,9 @@ inline void* enqueueMapBuffer(const Buffer& buffer, cl_bool blocking, cl_map_fla
* update a region of a coarse-grained SVM buffer.
* This variant takes a raw SVM pointer.
*/
template <typename T>
inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
const vector<Event>* events, Event* event) {
template <typename T> inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
size_type size, const vector<Event>* events,
Event* event) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7180,10 +7177,10 @@ inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_t
* update a region of a coarse-grained SVM buffer.
* This variant takes a cl::pointer instance.
*/
template <typename T, class D>
inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking, cl_map_flags flags,
size_type size, const vector<Event>* events = NULL,
Event* event = NULL) {
template <typename T, class D> inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking,
cl_map_flags flags, size_type size,
const vector<Event>* events = NULL,
Event* event = NULL) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7257,9 +7254,9 @@ inline cl_int enqueueUnmapSVM(T* ptr, const vector<Event>* events = NULL, Event*
* SVM buffer back to the OpenCL runtime.
* This variant takes a cl::pointer instance.
*/
template <typename T, class D>
inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
Event* event = NULL) {
template <typename T, class D> inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
const vector<Event>* events = NULL,
Event* event = NULL) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7275,9 +7272,9 @@ inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* event
* SVM buffer back to the OpenCL runtime.
* This variant takes a cl::vector instance.
*/
template <typename T, class Alloc>
inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
Event* event = NULL) {
template <typename T, class Alloc> inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
const vector<Event>* events = NULL,
Event* event = NULL) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7336,9 +7333,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
* Host to Device.
* Uses specified queue.
*/
template <typename IteratorType>
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer) {
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
+2 -2
ファイルの表示
@@ -67,7 +67,7 @@ extern "C" {
#define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED \
CL_EXTENSION_WEAK_LINK \
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
#ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
@@ -76,7 +76,7 @@ extern "C" {
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED \
CL_EXTENSION_WEAK_LINK \
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
#else
#warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+24 -26
ファイルの表示
@@ -1009,9 +1009,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
* does not work, because when using a derived type (e.g. Context) the generic
* template will provide a better match.
*/
template <typename Func, typename T>
inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int,
typename T::cl_type = 0) {
template <typename Func, typename T> inline cl_int getInfoHelper(Func f, cl_uint name,
VECTOR_CLASS<T>* param, int,
typename T::cl_type = 0) {
::size_t required;
cl_int err = f(name, 0, NULL, &required);
if (err != CL_SUCCESS) {
@@ -2736,12 +2736,10 @@ template <typename IteratorType>
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
IteratorType endIterator);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
IteratorType startIterator, IteratorType endIterator);
/*! \brief Class interface for Buffer Memory Objects.
@@ -2797,9 +2795,9 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL) {
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false,
cl_int* err = NULL) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -2843,17 +2841,17 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
/*!
* \brief Construct a Buffer from a host container via iterators using a specified queue.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
//! \brief Default constructor - initializes to NULL.
Buffer() : Memory() {}
@@ -5314,8 +5312,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
const VECTOR_CLASS<const void*>* mem_locs = NULL,
const VECTOR_CLASS<Event>* events = NULL, Event* event = NULL) const {
cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0)
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
: NULL;
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
: NULL;
if (mems != NULL) {
for (unsigned int i = 0; i < mem_objects->size(); i++) {
@@ -5505,9 +5503,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_;
__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
#endif // !_WIN32
template <typename IteratorType>
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr, cl_int* err) {
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr, cl_int* err) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -5709,9 +5707,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
* Host to Device.
* Uses specified queue.
*/
template <typename IteratorType>
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer) {
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
+59 -62
ファイルの表示
@@ -1753,9 +1753,8 @@ template <typename T> inline bool operator!=(const Wrapper<T>& lhs, const Wrappe
using BuildLogType =
vector<std::pair<cl::Device,
typename detail::param_traits<detail::cl_program_build_info,
CL_PROGRAM_BUILD_LOG>::param_type>>;
vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info,
CL_PROGRAM_BUILD_LOG>::param_type>>;
#if defined(CL_HPP_ENABLE_EXCEPTIONS)
/**
* Exception class for build errors to carry build info
@@ -2951,12 +2950,10 @@ template <typename IteratorType>
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer);
template <typename IteratorType>
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
IteratorType endIterator);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer);
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
IteratorType startIterator, IteratorType endIterator);
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
@@ -3043,8 +3040,8 @@ template <typename T, class SVMTrait> class SVMAllocator {
SVMAllocator(const SVMAllocator& other) : context_(other.context_) {}
template <typename U>
SVMAllocator(const SVMAllocator<U, SVMTrait>& other) : context_(other.context_) {}
template <typename U> SVMAllocator(const SVMAllocator<U, SVMTrait>& other)
: context_(other.context_) {}
~SVMAllocator() {}
@@ -3262,9 +3259,9 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL) {
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false,
cl_int* err = NULL) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -3308,17 +3305,17 @@ class Buffer : public Memory {
* IteratorType must be random access.
* If useHostPtr is specified iterators must represent contiguous data.
*/
template <typename IteratorType>
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
/*!
* \brief Construct a Buffer from a host container via iterators using a specified queue.
* If useHostPtr is specified iterators must be random access.
*/
template <typename IteratorType>
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr = false, cl_int* err = NULL);
//! \brief Default constructor - initializes to NULL.
Buffer() : Memory() {}
@@ -4818,8 +4815,7 @@ template <typename T, class Enable = void> struct KernelArgumentHandler;
// Enable for objects that are not subclasses of memory
// Pointers, constants etc
template <typename T>
struct KernelArgumentHandler<
template <typename T> struct KernelArgumentHandler<
T, typename std::enable_if<!std::is_base_of<cl::Memory, T>::value>::type> {
static size_type size(const T&) { return sizeof(T); }
static const T* ptr(const T& value) { return &value; }
@@ -4982,9 +4978,8 @@ class Kernel : public detail::Wrapper<cl_kernel> {
__GET_KERNEL_ARG_INFO_ERR);
}
template <cl_int name>
size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
cl_int* err = NULL) const {
template <cl_int name> size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
cl_int* err = NULL) const {
size_type param;
cl_int result = getSubGroupInfo(dev, name, range, &param);
if (err != NULL) {
@@ -5581,9 +5576,8 @@ inline Program linkProgram(vector<Program> inputPrograms, const char* options =
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
// Template specialization for CL_PROGRAM_BINARIES
template <>
inline cl_int cl::Program::getInfo(cl_program_info name,
vector<vector<unsigned char>>* param) const {
template <> inline cl_int cl::Program::getInfo(cl_program_info name,
vector<vector<unsigned char>>* param) const {
if (name != CL_PROGRAM_BINARIES) {
return CL_INVALID_VALUE;
}
@@ -6357,9 +6351,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
* Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
* This variant takes a raw SVM pointer.
*/
template <typename T>
cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
const vector<Event>* events = NULL, Event* event = NULL) const {
template <typename T> cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
size_type size, const vector<Event>* events = NULL,
Event* event = NULL) const {
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueSVMMap(
@@ -6458,9 +6452,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
* This variant takes a cl::pointer instance.
*/
template <typename T, class D>
cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
Event* event = NULL) const {
template <typename T, class D> cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
const vector<Event>* events = NULL,
Event* event = NULL) const {
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueSVMUnmap(
@@ -6478,9 +6472,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
* This variant takes a cl::vector instance.
*/
template <typename T, class Alloc>
cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
Event* event = NULL) const {
template <typename T, class Alloc> cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
const vector<Event>* events = NULL,
Event* event = NULL) const {
cl_event tmp;
cl_int err = detail::errHandler(
::clEnqueueSVMUnmap(
@@ -6817,8 +6811,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
cl::Context context = cl::Context::getDefault();
cl::Device device = cl::Device::getDefault();
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
cl_command_queue_properties mergedProperties =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
static_cast<cl_command_queue_properties>(properties);
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
@@ -6837,8 +6832,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
cl_int* err = NULL) {
cl_int error;
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
cl_command_queue_properties mergedProperties =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
static_cast<cl_command_queue_properties>(properties);
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
@@ -6856,8 +6852,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
cl_int* err = NULL) {
cl_int error;
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
cl_command_queue_properties mergedProperties =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
static_cast<cl_command_queue_properties>(properties);
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, CL_QUEUE_SIZE,
queueSize, 0};
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
@@ -7011,9 +7008,9 @@ template <> struct KernelArgumentHandler<cl::DeviceCommandQueue, void> {
#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
template <typename IteratorType>
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
bool readOnly, bool useHostPtr, cl_int* err) {
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
IteratorType endIterator, bool readOnly,
bool useHostPtr, cl_int* err) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
@@ -7153,9 +7150,9 @@ inline void* enqueueMapBuffer(const Buffer& buffer, cl_bool blocking, cl_map_fla
* update a region of a coarse-grained SVM buffer.
* This variant takes a raw SVM pointer.
*/
template <typename T>
inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
const vector<Event>* events, Event* event) {
template <typename T> inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
size_type size, const vector<Event>* events,
Event* event) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7170,10 +7167,10 @@ inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_t
* update a region of a coarse-grained SVM buffer.
* This variant takes a cl::pointer instance.
*/
template <typename T, class D>
inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking, cl_map_flags flags,
size_type size, const vector<Event>* events = NULL,
Event* event = NULL) {
template <typename T, class D> inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking,
cl_map_flags flags, size_type size,
const vector<Event>* events = NULL,
Event* event = NULL) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7247,9 +7244,9 @@ inline cl_int enqueueUnmapSVM(T* ptr, const vector<Event>* events = NULL, Event*
* SVM buffer back to the OpenCL runtime.
* This variant takes a cl::pointer instance.
*/
template <typename T, class D>
inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
Event* event = NULL) {
template <typename T, class D> inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
const vector<Event>* events = NULL,
Event* event = NULL) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7265,9 +7262,9 @@ inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* event
* SVM buffer back to the OpenCL runtime.
* This variant takes a cl::vector instance.
*/
template <typename T, class Alloc>
inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
Event* event = NULL) {
template <typename T, class Alloc> inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
const vector<Event>* events = NULL,
Event* event = NULL) {
cl_int error;
CommandQueue queue = CommandQueue::getDefault(&error);
if (error != CL_SUCCESS) {
@@ -7326,9 +7323,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
* Host to Device.
* Uses specified queue.
*/
template <typename IteratorType>
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
cl::Buffer& buffer) {
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
IteratorType startIterator,
IteratorType endIterator, cl::Buffer& buffer) {
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
cl_int error;
+1 -1
ファイルの表示
@@ -126,7 +126,7 @@ void OCLDX11YUY2::run(void) {
BYTE* pLine = (BYTE*)LockedRectD11.pData + y * LockedRectD11.RowPitch;
BYTE* pLineUV = (BYTE*)LockedRectD11.pData + y * LockedRectD11.RowPitch +
OCLDX11YUY2::HEIGHT * LockedRectD11.RowPitch;
OCLDX11YUY2::HEIGHT * LockedRectD11.RowPitch;
for (int x = 0; x < OCLDX11YUY2::WIDTH; x++) {
*pLine++ = 0x7F; // Y
+1 -1
ファイルの表示
@@ -265,7 +265,7 @@ void OCLPerfGenericBandwidth::run(void) {
// We have one extra write per LDS location to initialize LDS
double perf = ((double)global * (numReads_ * sizeof(cl_float) + dataSizeBytes_ / 64) * NUM_ITER *
(double)(1e-09)) /
sec;
sec;
_perfInfo = (float)perf;
SNPRINTF(buf, sizeof(buf), " %6s %9s %8d threads, %3d reads (GB/s) ", buf2, buf3, global,
+4 -4
ファイルの表示
@@ -401,8 +401,8 @@ void OCLPerfKernelThroughput::open(unsigned int test, char* units, double& conve
input2BufferSize_ = static_cast<size_t>(matrixDim2_ * matrixDim1_ * sizeof(float));
output1BufferSize_ = static_cast<size_t>(matrixDim1_ * matrixDim1_ * sizeof(float));
_reqDataSize = (1.0 * matrixDim1_ * matrixDim2_ * sizeof(float)) +
(1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) +
(1.0 * matrixDim1_ * matrixDim1_ * sizeof(float));
(1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) +
(1.0 * matrixDim1_ * matrixDim1_ * sizeof(float));
break;
case 1: // Flops/Byte
flopsPerByte_ = (int)workSize[workSizeIdx_]; // for kernelType == 0
@@ -695,13 +695,13 @@ void OCLPerfKernelThroughput::run(void) {
// printf("FlopCount = 2*%i*%i*%i=%f\n",
// matrixDim1_,matrixDim1_,matrixDim2_,flopCount);
bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f /
avgKernelTime_; // GB/s
avgKernelTime_; // GB/s
gflops_ = (float)(1000000.f * flopCount / avgKernelTime_ / 1000000000.0);
break;
case 1: // Madds
flopCount = _reqDataSize * flopsPerByte_;
bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f /
avgKernelTime_; // GB/s
avgKernelTime_; // GB/s
gflops_ = bandwidth_ * flopsPerByte_;
break;
}
+1 -1
ファイルの表示
@@ -341,7 +341,7 @@ void OCLPerfLDSReadSpeed::run(void) {
// We have one extra write per LDS location to initialize LDS
double perf = ((double)global * (numReads_ * sizeof(cl_float) + ldsSizeBytes_ / 64) * NUM_ITER *
(double)(1e-09)) /
sec;
sec;
_perfInfo = (float)perf;
SNPRINTF(buf, sizeof(buf), " %s %8d threads, %3d reads (GB/s) ", buf2, global, numReads_);
+6 -9
ファイルの表示
@@ -749,10 +749,9 @@ void OCLPerfMandelbrot::run(void) {
// printf(" totalIter = %lld\n", totalIters);
if (isAMD && (type_ == CL_DEVICE_TYPE_GPU)) {
CHECK_RESULT((totalIters != expectedIters[_openTest]) &&
(totalIters !=
expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
? _openTest + FMA_EXPECTEDVALUES_INDEX
: _openTest)]),
(totalIters != expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
? _openTest + FMA_EXPECTEDVALUES_INDEX
: _openTest)]),
"Incorrect iteration count detected!");
} else {
CHECK_RESULT(totalIters != expectedItersNV[_openTest], "Incorrect iteration count detected!");
@@ -869,11 +868,9 @@ void OCLPerfAsyncMandelbrot::run(void) {
// printf(" totalIter = %lld\n", totalIters);
if (isAMD && (type_ == CL_DEVICE_TYPE_GPU)) {
CHECK_RESULT((totalIters != 2 * expectedIters[_openTest]) &&
(totalIters !=
2 *
expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
? _openTest + FMA_EXPECTEDVALUES_INDEX
: _openTest)]),
(totalIters != 2 * expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
? _openTest + FMA_EXPECTEDVALUES_INDEX
: _openTest)]),
"Incorrect iteration count detected!");
} else {
CHECK_RESULT(totalIters != 2 * expectedItersNV[_openTest],
+1 -1
ファイルの表示
@@ -40,7 +40,7 @@ const static char* strKernel = KERNEL_CODE(
/* The purpose of this is to introduce an additional zero at stage - pass
* bit*/
const uint leftID = (thread & (pairDistance - 1)) |
((thread & ~(pairDistance - 1)) << 1); /* Is the same as below */
((thread & ~(pairDistance - 1)) << 1); /* Is the same as below */
const uint direction = ((thread >> stage) & 1) == 1 ? 0 : 1;
+2 -2
ファイルの表示
@@ -183,8 +183,8 @@ void OCLMultiQueue::open(unsigned int test, char* units, double& conversion,
sizeof(maxComputeUnits), &maxComputeUnits, NULL);
computePower *= 32 * maxComputeUnits;
NumElements = (NumElements < static_cast<size_t>(computePower))
? static_cast<size_t>(computePower)
: NumElements;
? static_cast<size_t>(computePower)
: NumElements;
program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, NULL, NULL);
+2 -2
ファイルの表示
@@ -140,8 +140,8 @@ int main(int argc, char** argv) {
bool isAMDPlatform = (strcmp(platform.getInfo<CL_PLATFORM_NAME>().c_str(),
"AMD Accelerated Parallel Processing") == 0)
? true
: false;
? true
: false;
if (isAMDPlatform) {
std::string boardName;
device.getInfo(CL_DEVICE_BOARD_NAME_AMD, &boardName);
+3 -2
ファイルの表示
@@ -188,7 +188,7 @@ bool setAliasOptionVariable(int OptDescTableIx, Options& Opts, int64_t IValue, c
if (OptDescTableIx == OID_SaveTemps) {
// Dump .cl, .i(.ii), .amdil, .isa, .s, dll, calimage
flags = DUMP_CL | DUMP_I | DUMP_S | DUMP_O | DUMP_DLL | DUMP_CGIL | DUMP_DEBUGIL | DUMP_IL |
DUMP_ISA;
DUMP_ISA;
} else if (OptDescTableIx == OID_SaveTempsAll) {
flags = DUMP_ALL;
} else { // OID_Output
@@ -531,7 +531,8 @@ int getOptionDesc(std::string& options, size_t StartPos, bool IsShortForm, Optio
}
char next_c = options.at(pos);
bool optionalHasValue = (OPTION_value(od) == OVA_OPTIONAL) &&
bool optionalHasValue =
(OPTION_value(od) == OVA_OPTIONAL) &&
(((OPTION_info(od) & OA_SEPARATOR_EQUAL) && (next_c == '=')) ||
((OPTION_info(od) & OA_SEPARATOR_NONE) && !OPTION_valueSeparator(next_c)));
bool hasValue = (OPTION_value(od) == OVA_REQUIRED) || optionalHasValue;
+4 -4
ファイルの表示
@@ -339,9 +339,9 @@ const Isa* Isa::findIsa(uint32_t versionMajor, uint32_t versionMinor, uint32_t v
auto supportedIsas_ = supportedIsas();
auto isaIter = std::find_if(supportedIsas_.first, supportedIsas_.second, [&](const Isa& isa) {
return versionMajor == isa.versionMajor_ && versionMinor == isa.versionMinor_ &&
versionStepping == isa.versionStepping_ &&
(isa.sramecc_ == amd::Isa::Feature::Unsupported || isa.sramecc_ == sramecc) &&
(isa.xnack_ == amd::Isa::Feature::Unsupported || isa.xnack_ == xnack);
versionStepping == isa.versionStepping_ &&
(isa.sramecc_ == amd::Isa::Feature::Unsupported || isa.sramecc_ == sramecc) &&
(isa.xnack_ == amd::Isa::Feature::Unsupported || isa.xnack_ == xnack);
});
return isaIter == supportedIsas_.second ? nullptr : isaIter;
}
@@ -1132,7 +1132,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, char* handle, size_t* me
// Calculate the memory offset from the original base ptr
*mem_offset = reinterpret_cast<address>(dev_ptr) - reinterpret_cast<address>(orig_dev_ptr) +
amd_mem_obj->getOffset();
amd_mem_obj->getOffset();
*mem_size = amd_mem_obj->getSize();
+2 -2
ファイルの表示
@@ -1763,8 +1763,8 @@ class Device : public RuntimeObject {
return (info().svmCapabilities_ &
(CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER |
CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) != 0
? true
: false;
? true
: false;
}
//! check svm FGS support capability.
+2 -2
ファイルの表示
@@ -769,8 +769,8 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is
return amd::KernelParameterDescriptor::QueueObject;
case ARG_TYPE_VALUE:
return (argInfo->arg.value.data == DATATYPE_struct)
? amd::KernelParameterDescriptor::ReferenceObject
: amd::KernelParameterDescriptor::ValueObject;
? amd::KernelParameterDescriptor::ReferenceObject
: amd::KernelParameterDescriptor::ValueObject;
case ARG_TYPE_IMAGE:
return amd::KernelParameterDescriptor::ImageObject;
case ARG_TYPE_SAMPLER:
+7 -8
ファイルの表示
@@ -511,8 +511,8 @@ bool Program::compileAndLinkExecutable(const amd_comgr_data_set_t inputs,
if (status == AMD_COMGR_STATUS_SUCCESS) {
hasRelocatableData = true;
amd_comgr_action_kind_t kind = (continueCompileFrom == FILE_TYPE_ASM_TEXT)
? AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE
: AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE;
? AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE
: AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE;
status = amd::Comgr::do_action(kind, action, inputs, relocatableData);
extractBuildLog(relocatableData);
}
@@ -1259,9 +1259,9 @@ bool Program::linkImplHSAIL(amd::option::Options* options) {
bool finalize = true;
internal_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
aclType continueCompileFrom = (!binaryElf_)
? static_cast<aclType>(getNextCompilationStageFromBinary(options))
: ACL_TYPE_LLVMIR_BINARY;
aclType continueCompileFrom =
(!binaryElf_) ? static_cast<aclType>(getNextCompilationStageFromBinary(options))
: ACL_TYPE_LLVMIR_BINARY;
switch (continueCompileFrom) {
case ACL_TYPE_SPIRV_BINARY:
@@ -2857,9 +2857,8 @@ bool Program::getDemangledName(const std::string& mangledName, std::string& dema
demangledName.resize(demangled_size);
if (AMD_COMGR_STATUS_SUCCESS !=
amd::Comgr::get_data(demangled_data, &demangled_size,
const_cast<char*>(demangledName.data()))) {
if (AMD_COMGR_STATUS_SUCCESS != amd::Comgr::get_data(demangled_data, &demangled_size,
const_cast<char*>(demangledName.data()))) {
amd::Comgr::release_data(mangled_data);
amd::Comgr::release_data(demangled_data);
return false;
+9 -10
ファイルの表示
@@ -2166,18 +2166,18 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
for (auto& packed_obj : packed_vector) {
constexpr uint32_t kFillType = FillBufferAligned;
uint32_t kpattern_size = (packed_obj.pattern_expanded_)
? HostBlitManager::FillBufferInfo::kExtendedSize
: patternSize;
? HostBlitManager::FillBufferInfo::kExtendedSize
: patternSize;
size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
uint64_t koffset = overall_offset;
overall_offset += packed_obj.fill_size_;
size_t globalWorkOffset[3] = {0, 0, 0};
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t)
: (kpattern_size & 0x7) == 0 ? sizeof(uint64_t)
: (kpattern_size & 0x3) == 0 ? sizeof(uint32_t)
: (kpattern_size & 0x1) == 0 ? sizeof(uint16_t)
: sizeof(uint8_t);
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t)
: (kpattern_size & 0x7) == 0 ? sizeof(uint64_t)
: (kpattern_size & 0x3) == 0 ? sizeof(uint32_t)
: (kpattern_size & 0x1) == 0 ? sizeof(uint16_t)
: sizeof(uint8_t);
// Program kernels arguments for the fill operation
Memory* mem = &gpuMem(memory);
@@ -2302,9 +2302,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
constexpr size_t kFillImageThreshold = 256 * 256;
// Use host fill if memory has direct access and image is small
if (setup_.disableFillImage_ ||
(gpuMem(memory).isHostMemDirectAccess() &&
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
if (setup_.disableFillImage_ || (gpuMem(memory).isHostMemDirectAccess() &&
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
gpu().releaseGpuMemoryFence();
result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);
+2 -2
ファイルの表示
@@ -194,7 +194,7 @@ bool PalCounterReference::finalize() {
assert(layout.sampleCount == numExpCounters_);
size_t size = sizeof(Pal::GlobalCounterLayout) +
(sizeof(Pal::GlobalSampleLayout) * (layout.sampleCount - 1));
(sizeof(Pal::GlobalSampleLayout) * (layout.sampleCount - 1));
layout_ = reinterpret_cast<Pal::GlobalCounterLayout*>(new char[size]);
if (layout_ != nullptr) {
layout_->sampleCount = layout.sampleCount;
@@ -728,7 +728,7 @@ bool PerfCounter::create() {
}
counter_start = info_.counterIndex_;
counter_step = dev().properties().gfxipProperties.shaderCore.numShaderArrays *
dev().properties().gfxipProperties.shaderCore.numShaderEngines;
dev().properties().gfxipProperties.shaderCore.numShaderEngines;
break;
case PCIndexSelect::ComputeUnit:
+39 -38
ファイルの表示
@@ -111,8 +111,8 @@ static std::tuple<const amd::Isa*, const char*> findIsa(uint32_t gfxipMajor, uin
auto palDeviceIter = std::find_if(std::begin(supportedPalDevices), std::end(supportedPalDevices),
[&](const PalDevice& palDevice) {
return palDevice.gfxipMajor_ == gfxipMajor &&
palDevice.gfxipMinor_ == gfxipMinor &&
palDevice.gfxipStepping_ == (gfxipStepping & 0xF);
palDevice.gfxipMinor_ == gfxipMinor &&
palDevice.gfxipStepping_ == (gfxipStepping & 0xF);
});
if (palDeviceIter == std::end(supportedPalDevices)) {
return std::make_tuple(nullptr, nullptr);
@@ -131,8 +131,8 @@ static std::tuple<Pal::GfxIpLevel, Pal::AsicRevision, const char*> findPal(uint3
auto palDeviceIter = std::find_if(std::begin(supportedPalDevices), std::end(supportedPalDevices),
[&](const PalDevice& palDevice) {
return palDevice.gfxipMajor_ == gfxipMajor &&
palDevice.gfxipMinor_ == gfxipMinor &&
palDevice.gfxipStepping_ == (gfxipStepping & 0xF);
palDevice.gfxipMinor_ == gfxipMinor &&
palDevice.gfxipStepping_ == (gfxipStepping & 0xF);
});
if (palDeviceIter == std::end(supportedPalDevices)) {
return std::make_tuple(Pal::GfxIpLevel::None, Pal::AsicRevision::Unknown, nullptr);
@@ -351,8 +351,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.maxWorkItemDimensions_ = 3;
info_.maxComputeUnits_ = settings().enableWgpMode_
? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
: palProp.gfxipProperties.shaderCore.numAvailableCus;
? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
: palProp.gfxipProperties.shaderCore.numAvailableCus;
info_.maxPhysicalComputeUnits_ = info_.maxComputeUnits_;
info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines;
@@ -371,11 +371,11 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support
info_.maxEngineClockFrequency_ = (palProp.gfxipProperties.performance.maxGpuClock != 0)
? palProp.gfxipProperties.performance.maxGpuClock
: 555;
? palProp.gfxipProperties.performance.maxGpuClock
: 555;
info_.maxMemoryClockFrequency_ = (palProp.gpuMemoryProperties.performance.maxMemClock != 0)
? palProp.gpuMemoryProperties.performance.maxMemClock
: 555;
? palProp.gpuMemoryProperties.performance.maxMemClock
: 555;
info_.wallClockFrequency_ = palProp.timestampFrequency / 1000; // in KHz
info_.vramBusBitWidth_ = palProp.gpuMemoryProperties.performance.vramBusBitWidth;
info_.l2CacheSize_ = palProp.gfxipProperties.shaderCore.tccSizeInBytes;
@@ -417,8 +417,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
uint uswcPercentAvailable =
((static_cast<uint64_t>(heaps[Pal::GpuHeapGartUswc].logicalSize) / Mi) > 1536 && IS_WINDOWS)
? 75
: 50;
? 75
: 50;
if (settings().apuSystem_) {
info_.globalMemSize_ +=
(static_cast<uint64_t>(heaps[Pal::GpuHeapGartUswc].logicalSize) * uswcPercentAvailable) /
@@ -622,8 +622,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.deviceTopology_.pcie.function = palProp.pciProperties.functionNumber;
info_.simdPerCU_ = settings().enableWgpMode_
? (2 * palProp.gfxipProperties.shaderCore.numSimdsPerCu)
: palProp.gfxipProperties.shaderCore.numSimdsPerCu;
? (2 * palProp.gfxipProperties.shaderCore.numSimdsPerCu)
: palProp.gfxipProperties.shaderCore.numSimdsPerCu;
info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
info_.simdWidth_ = isa().simdWidth();
info_.simdInstructionWidth_ = 1;
@@ -656,7 +656,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.pcieDeviceId_ = palProp.deviceId;
info_.pcieRevisionId_ = palProp.revisionId;
info_.maxThreadsPerCU_ = info_.wavefrontWidth_ * info_.simdPerCU_ *
palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
info_.cooperativeGroups_ = settings().enableCoopGroups_;
info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_;
@@ -906,8 +906,8 @@ bool Device::create(Pal::IDevice* device) {
// Save the IP level for the offline detection
ipLevel_ = properties().gfxLevel;
asicRevision_ = flagIsDefault(PAL_FORCE_ASIC_REVISION)
? properties().revision
: static_cast<Pal::AsicRevision>(PAL_FORCE_ASIC_REVISION);
? properties().revision
: static_cast<Pal::AsicRevision>(PAL_FORCE_ASIC_REVISION);
// XNACK flag should be set for PageMigration or IOMMUv2 support.
bool isXNACKEnabled =
@@ -1284,10 +1284,9 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
if (queue != nullptr) {
profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
if (queue->asHostQueue() != nullptr) {
bool interopQueue = (0 !=
(queue->context().info().flags_ &
(amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr |
amd::Context::D3D11DeviceKhr)));
bool interopQueue = (0 != (queue->context().info().flags_ &
(amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr |
amd::Context::D3D11DeviceKhr)));
rtCUs = queue->rtCUs();
} else if (queue->asDeviceQueue() != nullptr) {
deviceQueueSize = queue->asDeviceQueue()->size();
@@ -1439,9 +1438,9 @@ bool Device::init() {
// Count up all the devices in the system.
platform_->EnumerateDevices(&gNumDevices, &gDeviceList[0]);
const char* requestedDeviceList = amd::IS_HIP
? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
: GPU_DEVICE_ORDINAL;
const char* requestedDeviceList =
amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
: GPU_DEVICE_ORDINAL;
if (requestedDeviceList[0] != '\0') {
useDeviceList = true;
@@ -1611,8 +1610,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
Resource::MemoryType type =
(owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER))
? Resource::Remote
: Resource::Local;
? Resource::Remote
: Resource::Local;
// Check if runtime can force a tiny buffer into USWC memory
if ((size <= (GPU_MAX_REMOTE_MEM_SIZE * Ki)) && (type == Resource::Local) &&
@@ -1633,8 +1632,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
// Internal means VirtualDevice!=nullptr
bool internalAlloc =
((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && (owner.getVirtualDevice() != nullptr))
? true
: false;
? true
: false;
// Create a memory object
gpuMemory = new pal::Buffer(*this, owner, owner.getSize());
@@ -1918,9 +1917,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
(memory->memoryType() != Resource::ExternalPhysical) &&
((owner.getHostMem() != nullptr) ||
((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) {
bool ok = memory->pinSystemMemory(
owner.getHostMem(),
(owner.getHostMemRef()->size()) ? owner.getHostMemRef()->size() : owner.getSize());
bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size())
? owner.getHostMemRef()->size()
: owner.getSize());
//! \note: Ignore the pinning result for now
}
@@ -2067,7 +2066,8 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
// Allocated system memory without cached allocations. Cache size contains all allocations, so
// don't count persistent and local
Pal::gpusize system_memory = allocedMem[Pal::GpuHeapGartCacheable] +
allocedMem[Pal::GpuHeapGartUswc] + cache_group_local - resourceCache().cacheSize();
allocedMem[Pal::GpuHeapGartUswc] + cache_group_local -
resourceCache().cacheSize();
#if IS_WINDOWS
// Second, query OS for overall memory usage on the system
@@ -2091,7 +2091,7 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
if (mem_budget_info.usage[Pal::GpuHeapGroupNonLocal] >
(resourceCache().cacheSize() - cache_group_local)) {
system_total_alloced = mem_budget_info.usage[Pal::GpuHeapGroupNonLocal] + cache_group_local -
resourceCache().cacheSize();
resourceCache().cacheSize();
}
// System usage exceeds per process usage for system memory
if (system_total_alloced > system_memory) {
@@ -2102,9 +2102,10 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
// Third, finalize reported free memory
// Fill free memory info
freeMemory[TotalFreeMemory] = (total_alloced > info().globalMemSize_)
? 0
: static_cast<size_t>((info().globalMemSize_ - total_alloced) / Ki);
freeMemory[TotalFreeMemory] =
(total_alloced > info().globalMemSize_)
? 0
: static_cast<size_t>((info().globalMemSize_ - total_alloced) / Ki);
freeMemory[TotalFreeMemory] -=
(freeMemory[TotalFreeMemory] > HIP_HIDDEN_FREE_MEM * Ki) ? HIP_HIDDEN_FREE_MEM * Ki : 0;
@@ -2842,8 +2843,8 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI
(Pal::Result::Success ==
(iDev()->SetClockMode(setClockMode,
reinterpret_cast<Pal::SetClockModeOutput*>(pSetClockModeOutput))))
? true
: false;
? true
: false;
return result;
}
+4 -4
ファイルの表示
@@ -490,10 +490,10 @@ class Device : public NullDevice {
//! Returns the number of available compute rings
uint numExclusiveComputeEngines() const {
return exclusiveComputeEnginesId_.size() +
((exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1) ==
exclusiveComputeEnginesId().end())
? 1
: 0);
((exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1) ==
exclusiveComputeEnginesId().end())
? 1
: 0);
}
//! Returns the map of available exclusive compute rings with the engine index
+1 -1
ファイルの表示
@@ -59,7 +59,7 @@ bool Device::associateD3D9Device(void* d3d9Device) {
// match the adapter
bool canInteroperate = (properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) &&
(properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart);
(properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart);
return canInteroperate;
}
+5 -5
ファイルの表示
@@ -782,8 +782,8 @@ bool Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const
if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) {
// match the adapter
canInteroperate = (properties().osProperties.luidHighPart == glAdapterLuid.HighPart) &&
(properties().osProperties.luidLowPart == glAdapterLuid.LowPart) &&
((1 << properties().gpuIndex) == glChainBitMask);
(properties().osProperties.luidLowPart == glAdapterLuid.LowPart) &&
((1 << properties().gpuIndex) == glChainBitMask);
}
#else
GLuint glDeviceId = 0;
@@ -797,9 +797,9 @@ bool Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const
if (pfnMesaGLInteropGLXQueryDeviceInfo(disp, ctx, &info) == 0) {
// match the adapter
canInteroperate = (properties().pciProperties.busNumber == info.pci_bus) &&
(properties().pciProperties.deviceNumber == info.pci_device) &&
(properties().pciProperties.functionNumber == info.pci_function) &&
(static_cast<GLuint>(1 << properties().gpuIndex) == glChainMask);
(properties().pciProperties.deviceNumber == info.pci_device) &&
(properties().pciProperties.functionNumber == info.pci_function) &&
(static_cast<GLuint>(1 << properties().gpuIndex) == glChainMask);
}
}
#endif
+2 -2
ファイルの表示
@@ -620,8 +620,8 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) {
if (result == Pal::Result::Success) {
GpuUtil::SampleTraceApiInfo sample_trace_api_info = {};
sample_trace_api_info.instructionTraceMode = (inst_tracing_enabled_)
? GpuUtil::InstructionTraceMode::FullFrame
: GpuUtil::InstructionTraceMode::Disabled;
? GpuUtil::InstructionTraceMode::FullFrame
: GpuUtil::InstructionTraceMode::Disabled;
trace_.gpa_session_->SetSampleTraceApiInfo(sample_trace_api_info, trace_.gpa_sample_id_);
}
+1 -1
ファイルの表示
@@ -167,7 +167,7 @@ bool HSAILKernel::init() {
// Find total workgroup size
if (workGroupInfo_.compileSize_[0] != 0) {
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
workGroupInfo_.compileSize_[2];
workGroupInfo_.compileSize_[2];
} else {
workGroupInfo_.size_ = device().info().preferredWorkGroupSize_;
}
+2 -2
ファイルの表示
@@ -367,8 +367,8 @@ bool Memory::createInterop() {
vkRes.nt_handle_ = ((ext_memory->Type() != amd::ExternalMemory::HandleType::OpaqueFd) &&
(ext_memory->Type() != amd::ExternalMemory::HandleType::OpaqueWin32Kmt) &&
(ext_memory->Type() != amd::ExternalMemory::HandleType::D3D11ResourceKmt))
? true
: false;
? true
: false;
}
else if (glObject != nullptr) {
+7 -8
ファイルの表示
@@ -289,8 +289,8 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t
case 4:
if (printFloat) {
const float fArg = size == 2
? amd::half2float(*(reinterpret_cast<const uint16_t*>(argument)))
: *(reinterpret_cast<const float*>(argument));
? amd::half2float(*(reinterpret_cast<const uint16_t*>(argument)))
: *(reinterpret_cast<const float*>(argument));
static const char* fSpecifiers = "eEfgGa";
std::string fmtF = fmt;
size_t posS = fmtF.find_first_of("%");
@@ -327,13 +327,12 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t
hhFmt.erase(hhFmt.find_first_of("h"), 2);
amd::Os::printf(hhFmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
} else if (hlModifier) {
amd::Os::printf(hlFmt.data(),
size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
: *(reinterpret_cast<const uint32_t*>(argument)));
amd::Os::printf(hlFmt.data(), size == 2
? *(reinterpret_cast<const uint16_t*>(argument))
: *(reinterpret_cast<const uint32_t*>(argument)));
} else {
amd::Os::printf(fmt.data(),
size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
: *(reinterpret_cast<const uint32_t*>(argument)));
amd::Os::printf(fmt.data(), size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
: *(reinterpret_cast<const uint32_t*>(argument)));
}
}
break;
+14 -15
ファイルの表示
@@ -305,7 +305,7 @@ Resource::Resource(const Device& gpuDev, size_t size)
desc_.state_ = 0;
desc_.type_ = Empty;
desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
desc_.height_ = 1;
desc_.depth_ = 1;
desc_.mipLevels_ = 1;
@@ -859,9 +859,8 @@ bool Resource::CreateInterop(CreateParams* params) {
size_t imageSize;
size_t gpuMemSize;
if (Pal::Result::Success !=
dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
&imgCreateInfo)) {
if (Pal::Result::Success != dev().iDev()->GetExternalSharedImageSizes(
imgOpenInfo, &imageSize, &gpuMemSize, &imgCreateInfo)) {
return false;
}
@@ -1327,8 +1326,8 @@ bool Resource::create(MemoryType memType, CreateParams* params, bool forceLinear
createInfo.size = desc().width_ * elementSize_;
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
createInfo.alignment = (params && params->alignment_ != 0)
? params->alignment_
: (desc().scratch_ ? 64 * Ki : MaxGpuAlignment);
? params->alignment_
: (desc().scratch_ ? 64 * Ki : MaxGpuAlignment);
createInfo.vaRange = Pal::VaRange::Default;
createInfo.priority = Pal::GpuMemPriority::Normal;
@@ -1388,7 +1387,7 @@ void Resource::free() {
}
const bool wait = (memoryType() != ImageView) && (memoryType() != ImageBuffer) &&
(memoryType() != ImageExternalBuffer) && (memoryType() != View);
(memoryType() != ImageExternalBuffer) && (memoryType() != View);
// OCL has to wait, even if resource is placed in the cache, since reallocation can occur
// and resource can be reused on another async queue without a wait on a busy operation
@@ -1519,8 +1518,8 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
}
bool cp_dma = dev().settings().disableSdma_ ||
(!enableCopyRect && desc().buffer_ && dstResource.desc().buffer_ &&
(size[0] < dev().settings().cpDmaCopySizeMax_));
(!enableCopyRect && desc().buffer_ && dstResource.desc().buffer_ &&
(size[0] < dev().settings().cpDmaCopySizeMax_));
if (cp_dma) {
// Make sure compute is done before CP DMA start
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency, BarrierType::KernelToCopy);
@@ -1563,9 +1562,9 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
}
copyRegion.gpuMemoryOffset = gpuMemoryOffset;
copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
copyRegion.gpuMemoryDepthPitch = (srcOrigin[2])
? srcOrigin[2]
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
copyRegion.gpuMemoryDepthPitch =
(srcOrigin[2]) ? srcOrigin[2]
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
gpu.iCmd()->CmdCopyMemoryToImage(*iMem(), *dstResource.image_, imgLayout, 1, &copyRegion);
} else if (!desc().buffer_ && dstResource.desc().buffer_) {
Pal::MemoryImageCopyRegion copyRegion = {};
@@ -1588,9 +1587,9 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
}
copyRegion.gpuMemoryOffset = gpuMemoryOffset;
copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
copyRegion.gpuMemoryDepthPitch = (dstOrigin[2])
? dstOrigin[2]
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
copyRegion.gpuMemoryDepthPitch =
(dstOrigin[2]) ? dstOrigin[2]
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, &copyRegion);
} else {
if (enableCopyRect) {
+1 -1
ファイルの表示
@@ -424,7 +424,7 @@ class Resource : public amd::HeapObject {
memRef_ = viewOwner_->memRef_;
memRef_->retain();
desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
setBusy(*memRef()->gpu_, GpuEvent::InvalidID);
}
}
+2 -3
ファイルの表示
@@ -341,9 +341,8 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
#endif
}
if (apuSystem_ &&
((heaps[Pal::GpuHeapLocal].logicalSize + heaps[Pal::GpuHeapInvisible].logicalSize) <
(150 * Mi))) {
if (apuSystem_ && ((heaps[Pal::GpuHeapLocal].logicalSize +
heaps[Pal::GpuHeapInvisible].logicalSize) < (150 * Mi))) {
remoteAlloc_ = true;
}
+13 -15
ファイルの表示
@@ -896,7 +896,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
// \todo forces PAL to reuse CBs, but requires postamble
createInfo.flags.autoMemoryReuse = false;
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = VirtualGPU::Queue::MaxCommands *
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
VirtualGPU::Queue::MaxCommands *
(320 + ((profiling) ? 96 : 0) + ((dev().captureMgr() != nullptr) ? 512 : 0));
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
dev().settings().maxCmdBuffers_ * createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize;
@@ -925,8 +926,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
uint idx = index() % dev().numComputeEngines();
uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
? 0
: (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
? 0
: (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
uint max_cmd_buffers = dev().settings().maxCmdBuffers_;
if (dev().numComputeEngines()) {
@@ -937,8 +938,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
}
const auto& info = dev().QueuePool().find(queues_[MainEngine]->iQueue_);
hwRing_ = (info != dev().QueuePool().end())
? info->second->index_
: (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES;
? info->second->index_
: (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES;
// Check if device has SDMA engines
if (dev().numDMAEngines() != 0 && !dev().settings().disableSdma_) {
@@ -2158,7 +2159,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) {
amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(vcmd.dst());
assert(dstMemory && "No svm Buffer to fill with!");
size_t offset = reinterpret_cast<uintptr_t>(vcmd.dst()) -
reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
pal::Memory* memory = dev().getGpuMemory(dstMemory);
@@ -2828,15 +2829,13 @@ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
if (cmd.semaphoreCmd() == amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE) {
flushDMA(MainEngine);
if (Pal::Result::Success !=
queues_[MainEngine]->iQueue_->SignalQueueSemaphore(const_cast<Pal::IQueueSemaphore*>(sem),
cmd.fence())) {
if (Pal::Result::Success != queues_[MainEngine]->iQueue_->SignalQueueSemaphore(
const_cast<Pal::IQueueSemaphore*>(sem), cmd.fence())) {
LogError("Failed to signal external semaphore");
}
} else {
if (Pal::Result::Success !=
queues_[MainEngine]->iQueue_->WaitQueueSemaphore(const_cast<Pal::IQueueSemaphore*>(sem),
cmd.fence())) {
if (Pal::Result::Success != queues_[MainEngine]->iQueue_->WaitQueueSemaphore(
const_cast<Pal::IQueueSemaphore*>(sem), cmd.fence())) {
LogError("Failed to wait on external semaphore");
}
}
@@ -3657,9 +3656,8 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
//! Note: SVM with subbuffers has an issue with tracking.
//! Conformance can send read only subbuffer, but update the region
//! in the kernel.
if ((mem != nullptr) &&
((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
if ((mem != nullptr) && ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
mem->signalWrite(&dev());
}
if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
+21 -22
ファイルの表示
@@ -1709,8 +1709,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
} else {
size_t totalSize = size[0];
// Do a staging copy
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
bool useShaderCopyPath =
setup_.disableHwlCopyBuffer_ || (totalSize <= dev().settings().sdmaCopyThreshold_) ||
(copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT);
if (!useShaderCopyPath) {
@@ -1843,8 +1843,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
} else {
size_t totalSize = size[0];
// Do a staging copy
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
bool useShaderCopyPath =
setup_.disableHwlCopyBuffer_ || (totalSize <= dev().settings().sdmaCopyThreshold_) ||
(copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT);
if (!useShaderCopyPath) {
@@ -2014,18 +2014,18 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
for (auto& packed_obj : packed_vector) {
constexpr uint32_t kFillType = FillBufferAligned;
uint32_t kpattern_size = (packed_obj.pattern_expanded_)
? HostBlitManager::FillBufferInfo::kExtendedSize
: patternSize;
? HostBlitManager::FillBufferInfo::kExtendedSize
: patternSize;
size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
size_t koffset = overall_offset;
overall_offset += packed_obj.fill_size_;
size_t globalWorkOffset[3] = {0, 0, 0};
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t)
: (kpattern_size & 0x7) == 0 ? sizeof(uint64_t)
: (kpattern_size & 0x3) == 0 ? sizeof(uint32_t)
: (kpattern_size & 0x1) == 0 ? sizeof(uint16_t)
: sizeof(uint8_t);
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t)
: (kpattern_size & 0x7) == 0 ? sizeof(uint64_t)
: (kpattern_size & 0x3) == 0 ? sizeof(uint32_t)
: (kpattern_size & 0x1) == 0 ? sizeof(uint16_t)
: sizeof(uint8_t);
// Program kernels arguments for the fill operation
cl_mem mem = as_cl<amd::Memory>(memory.owner());
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem, koffset);
@@ -2096,10 +2096,10 @@ bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern
size_t globalWorkSize[3] = {amd::alignUp(fillSizeX, 16), amd::alignUp(fillSizeY, 16), 1};
size_t localWorkSize[3] = {16, 16, 1};
uint32_t alignment = (patternSize & 0x7) == 0 ? sizeof(uint64_t)
: (patternSize & 0x3) == 0 ? sizeof(uint32_t)
: (patternSize & 0x1) == 0 ? sizeof(uint16_t)
: sizeof(uint8_t);
uint32_t alignment = (patternSize & 0x7) == 0 ? sizeof(uint64_t)
: (patternSize & 0x3) == 0 ? sizeof(uint32_t)
: (patternSize & 0x1) == 0 ? sizeof(uint16_t)
: sizeof(uint8_t);
cl_mem mem = as_cl<amd::Memory>(memory.owner());
if (alignment == sizeof(uint64_t)) {
@@ -2250,8 +2250,8 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
bool ipcShared = srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
bool useShaderCopyPath =
setup_.disableHwlCopyBuffer_ || (sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
(!(p2p || ipcShared) &&
(!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess() &&
!(copyMetadata.copyEnginePreference_ ==
@@ -2307,9 +2307,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
constexpr size_t kFillImageThreshold = 256 * 256;
// Use host fill if memory has direct access and image is small
if (setup_.disableFillImage_ ||
(gpuMem(memory).isHostMemDirectAccess() &&
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
if (setup_.disableFillImage_ || (gpuMem(memory).isHostMemDirectAccess() &&
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);
@@ -2691,8 +2690,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, hsa_queue_t* schedulerQueue,
amd::NDRangeContainer ndrange(1, globalWorkOffset, globalWorkSize, localWorkSize);
device::Kernel* devKernel = const_cast<device::Kernel*>(
kernels_[Scheduler]->getDeviceKernel(dev()));
device::Kernel* devKernel =
const_cast<device::Kernel*>(kernels_[Scheduler]->getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
+72 -79
ファイルの表示
@@ -376,8 +376,8 @@ hsa_ven_amd_loader_1_00_pfn_t Device::amd_loader_ext_table = {nullptr};
// Forwards a device-to-host address query to the AMD loader extension table.
//
// \param device  Address passed through unchanged as the first argument of the loader query.
// \param host    Output location passed through unchanged as the second argument.
// \return The status returned by hsa_ven_amd_loader_query_host_address, or HSA_STATUS_ERROR
//         when that entry of amd_loader_ext_table is null.
hsa_status_t Device::loaderQueryHostAddress(const void* device, const void** host) {
  const auto queryHostAddress = amd_loader_ext_table.hsa_ven_amd_loader_query_host_address;
  if (queryHostAddress == nullptr) {
    // The table entry was never populated, so there is nothing to forward to.
    return HSA_STATUS_ERROR;
  }
  return queryHostAddress(device, host);
}
// ================================================================================================
@@ -413,9 +413,9 @@ bool Device::init() {
return false;
}
std::string ordinals = amd::IS_HIP
? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
: GPU_DEVICE_ORDINAL;
std::string ordinals =
amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
: GPU_DEVICE_ORDINAL;
if (ordinals[0] != '\0') {
size_t pos = 0;
std::vector<hsa_agent_t> valid_agents;
@@ -573,9 +573,9 @@ bool Device::create() {
return false;
}
if (HSA_STATUS_SUCCESS !=
hsa_agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID,
&pciDeviceId_)) {
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID,
&pciDeviceId_)) {
LogPrintfError("Unable to get PCI ID of HSA device %s", agent_name);
return false;
}
@@ -584,35 +584,34 @@ bool Device::create() {
uint count;
hsa_isa_t first_isa;
} agent_isas = {0, {0}};
if (HSA_STATUS_SUCCESS !=
hsa_agent_iterate_isas(
bkendDevice_,
[](hsa_isa_t isa, void* data) {
agent_isas_t* agent_isas = static_cast<agent_isas_t*>(data);
if (agent_isas->count++ == 0) {
agent_isas->first_isa = isa;
}
return HSA_STATUS_SUCCESS;
},
&agent_isas)) {
if (HSA_STATUS_SUCCESS != hsa_agent_iterate_isas(
bkendDevice_,
[](hsa_isa_t isa, void* data) {
agent_isas_t* agent_isas = static_cast<agent_isas_t*>(data);
if (agent_isas->count++ == 0) {
agent_isas->first_isa = isa;
}
return HSA_STATUS_SUCCESS;
},
&agent_isas)) {
LogPrintfError("Unable to iterate supported ISAs for HSA device %s (PCI ID %x)", agent_name,
pciDeviceId_);
return false;
}
uint32_t isa_name_length = 0;
if (HSA_STATUS_SUCCESS !=
hsa_isa_get_info_alt(agent_isas.first_isa, (hsa_isa_info_t)HSA_ISA_INFO_NAME_LENGTH,
&isa_name_length)) {
if (HSA_STATUS_SUCCESS != hsa_isa_get_info_alt(agent_isas.first_isa,
(hsa_isa_info_t)HSA_ISA_INFO_NAME_LENGTH,
&isa_name_length)) {
LogPrintfError("Unable to get ISA name length for HSA device %s (PCI ID %x)", agent_name,
pciDeviceId_);
return false;
}
std::vector<char> isa_name(isa_name_length + 1, '\0');
if (HSA_STATUS_SUCCESS !=
hsa_isa_get_info_alt(agent_isas.first_isa, (hsa_isa_info_t)HSA_ISA_INFO_NAME,
isa_name.data())) {
if (HSA_STATUS_SUCCESS != hsa_isa_get_info_alt(agent_isas.first_isa,
(hsa_isa_info_t)HSA_ISA_INFO_NAME,
isa_name.data())) {
LogPrintfError("Unable to get ISA name for HSA device %s (PCI ID %x)", agent_name,
pciDeviceId_);
return false;
@@ -663,10 +662,9 @@ bool Device::create() {
assert(!settings_);
roc::Settings* hsaSettings = new roc::Settings();
settings_ = hsaSettings;
if (!hsaSettings ||
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa,
isa->xnack() == amd::Isa::Feature::Enabled, coop_groups, isXgmi_,
hasValidHDPFlush)) {
if (!hsaSettings || !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa,
isa->xnack() == amd::Isa::Feature::Enabled, coop_groups,
isXgmi_, hasValidHDPFlush)) {
LogPrintfError("Unable to create settings for HSA device %s (PCI ID %x)", agent_name,
pciDeviceId_);
return false;
@@ -969,11 +967,11 @@ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler)
void Sampler::fillSampleDescriptor(hsa_ext_sampler_descriptor_v2_t& samplerDescriptor,
const amd::Sampler& sampler) const {
samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
: HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
: HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
: HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
: HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
for (int i = 0; i < 3; i++) {
switch (sampler.addressingMode(i)) {
case CL_ADDRESS_CLAMP_TO_EDGE:
@@ -1036,9 +1034,9 @@ bool Device::populateOCLDeviceConstants() {
::strncpy(info_.name_, isa().targetId(), sizeof(info_.name_) - 1);
char device_name[64] = {0};
if (HSA_STATUS_SUCCESS ==
hsa_agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME,
device_name)) {
if (HSA_STATUS_SUCCESS == hsa_agent_get_info(bkendDevice_,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME,
device_name)) {
::strncpy(info_.boardName_, device_name, sizeof(info_.boardName_) - 1);
}
@@ -1075,9 +1073,9 @@ bool Device::populateOCLDeviceConstants() {
info_.maxPhysicalComputeUnits_ = settings().enableWgpMode_ ? info_.maxPhysicalComputeUnits_ / 2
: info_.maxPhysicalComputeUnits_;
if (HSA_STATUS_SUCCESS !=
hsa_agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
&info_.globalMemCacheLineSize_)) {
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
&info_.globalMemCacheLineSize_)) {
return false;
}
info_.globalMemCacheLineSize_ =
@@ -1152,9 +1150,8 @@ bool Device::populateOCLDeviceConstants() {
checkAtomicSupport();
assert(cpu_agent_info_->fine_grain_pool.handle != 0);
if (HSA_STATUS_SUCCESS !=
hsa_amd_agent_iterate_memory_pools(bkendDevice_, Device::iterateGpuMemoryPoolCallback,
this)) {
if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) {
return false;
}
@@ -1188,9 +1185,9 @@ bool Device::populateOCLDeviceConstants() {
}
size_t group_segment_size = 0;
if (HSA_STATUS_SUCCESS !=
hsa_amd_memory_pool_get_info(group_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE,
&group_segment_size)) {
if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(group_segment_,
HSA_AMD_MEMORY_POOL_INFO_SIZE,
&group_segment_size)) {
return false;
}
assert(group_segment_size > 0);
@@ -1229,16 +1226,16 @@ bool Device::populateOCLDeviceConstants() {
if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) {
size_t global_segment_size = 0;
if (HSA_STATUS_SUCCESS !=
hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE,
&global_segment_size)) {
if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_,
HSA_AMD_MEMORY_POOL_INFO_SIZE,
&global_segment_size)) {
return false;
}
assert(global_segment_size > 0);
info_.globalMemSize_ = (static_cast<uint64_t>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
static_cast<uint64_t>(global_segment_size)) /
100u;
100u;
// For APU with vram size <= 512MiB, use a smaller single alloc percentage
if (info_.globalMemSize_ <= 536870912) {
@@ -1266,7 +1263,7 @@ bool Device::populateOCLDeviceConstants() {
info_.globalMemSize_ = std::max(info_.globalMemSize_, uint64_t(1 * Gi));
info_.globalMemSize_ = (static_cast<uint64_t>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
static_cast<uint64_t>(info_.globalMemSize_)) /
100u;
100u;
info_.maxMemAllocSize_ =
uint64_t(info_.globalMemSize_ * std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
@@ -1325,8 +1322,8 @@ bool Device::populateOCLDeviceConstants() {
info_.hostUnifiedMemory_ = 1;
info_.iommuv2_ = true;
}
info_.memBaseAddrAlign_ = 8 *
(flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? sizeof(int64_t[16]) * 2 : MEMOBJ_BASE_ADDR_ALIGN);
info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? sizeof(int64_t[16]) * 2
: MEMOBJ_BASE_ADDR_ALIGN);
info_.minDataTypeAlignSize_ = sizeof(int64_t[16]);
info_.maxConstantArgs_ = 8;
@@ -1629,14 +1626,14 @@ bool Device::populateOCLDeviceConstants() {
if (getIsaMeta(std::move(isa().isaName()), isaMeta)) {
std::string addressableNumVGPRs, totalNumVGPRs, vGPRAllocGranule;
info_.availableVGPRs_ = getValueFromIsaMeta(isaMeta, "AddressableNumVGPRs", addressableNumVGPRs)
? atoi(addressableNumVGPRs.c_str())
: 0;
? atoi(addressableNumVGPRs.c_str())
: 0;
info_.vgprsPerSimd_ = getValueFromIsaMeta(isaMeta, "TotalNumVGPRs", totalNumVGPRs)
? atoi(totalNumVGPRs.c_str())
: 0;
? atoi(totalNumVGPRs.c_str())
: 0;
info_.vgprAllocGranularity_ = getValueFromIsaMeta(isaMeta, "VGPRAllocGranule", vGPRAllocGranule)
? atoi(vGPRAllocGranule.c_str())
: 0;
? atoi(vGPRAllocGranule.c_str())
: 0;
info_.availableRegistersPerCU_ = info_.vgprsPerSimd_ * info_.simdPerCU_ * info_.wavefrontWidth_;
ClPrint(amd::LOG_INFO, amd::LOG_INIT,
@@ -1647,8 +1644,8 @@ bool Device::populateOCLDeviceConstants() {
std::string sgprValue;
info_.availableSGPRs_ = (getValueFromIsaMeta(isaMeta, "AddressableNumSGPRs", sgprValue))
? (atoi(sgprValue.c_str()))
: 0;
? (atoi(sgprValue.c_str()))
: 0;
if (!releaseIsaMeta(isaMeta)) {
LogInfo("Can not release the isa meta node");
}
@@ -1663,9 +1660,8 @@ bool Device::populateOCLDeviceConstants() {
}
// This capability should be available with xnack enabled
if (HSA_STATUS_SUCCESS !=
hsa_system_get_info(HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT,
&info_.hmmCpuMemoryAccessible_)) {
if (HSA_STATUS_SUCCESS != hsa_system_get_info(HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT,
&info_.hmmCpuMemoryAccessible_)) {
LogError("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT query failed.");
}
@@ -1805,9 +1801,9 @@ bool Device::bindExternalDevice(uint flags, void* const gfxDevice[], void* gfxCo
}
return info_.deviceTopology_.pcie.bus == info.pci_bus &&
info_.deviceTopology_.pcie.device == info.pci_device &&
info_.deviceTopology_.pcie.function == info.pci_function &&
info_.vendorId_ == info.vendor_id && pciDeviceId_ == info.device_id;
info_.deviceTopology_.pcie.device == info.pci_device &&
info_.deviceTopology_.pcie.function == info.pci_function &&
info_.vendorId_ == info.vendor_id && pciDeviceId_ == info.device_id;
#endif
}
@@ -2224,10 +2220,10 @@ void Device::releaseMemory(void* ptr, size_t size) const {
void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain,
bool contiguous) const {
const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle)
? gpu_ext_fine_grained_segment_
: (atomics && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
: gpuvm_segment_;
const hsa_amd_memory_pool_t& pool =
(pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle) ? gpu_ext_fine_grained_segment_
: (atomics && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
: gpuvm_segment_;
if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", pool.handle,
@@ -2474,9 +2470,8 @@ bool Device::SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryA
amd::Memory* svm_mem = amd::MemObjMap::FindMemObj(dev_ptr);
if ((nullptr == svm_mem) || ((svm_mem->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) == 0) ||
// Validate the range of provided memory
((svm_mem->getSize() -
(reinterpret_cast<const_address>(dev_ptr) -
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
((svm_mem->getSize() - (reinterpret_cast<const_address>(dev_ptr) -
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
LogPrintfError("SetSvmAttributes received unknown memory for update: %p!", dev_ptr);
return false;
}
@@ -2565,9 +2560,8 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
amd::Memory* svm_mem = amd::MemObjMap::FindMemObj(dev_ptr);
if ((nullptr == svm_mem) || ((svm_mem->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) == 0) ||
// Validate the range of provided memory
((svm_mem->getSize() -
(reinterpret_cast<const_address>(dev_ptr) -
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
((svm_mem->getSize() - (reinterpret_cast<const_address>(dev_ptr) -
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
LogPrintfError("GetSvmAttributes received unknown memory %p for state!", dev_ptr);
return false;
}
@@ -3493,9 +3487,8 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer
}
if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) {
if ((size != 0) &&
((reinterpret_cast<const_address>(dev_ptr) -
reinterpret_cast<const_address>(ptr_info->agentBaseAddress)) > size)) {
if ((size != 0) && ((reinterpret_cast<const_address>(dev_ptr) -
reinterpret_cast<const_address>(ptr_info->agentBaseAddress)) > size)) {
return false;
}
return true;
+10 -11
ファイルの表示
@@ -835,9 +835,8 @@ bool Buffer::create(bool alloc_local) {
} else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) {
// TODO: ROCr will introduce a new attribute enum that implies a non-blocking signal,
// replace "HSA_AMD_SIGNAL_AMD_GPU_ONLY" with this new enum when it is ready.
if (HSA_STATUS_SUCCESS !=
hsa_amd_signal_create(kInitSignalValueOne, 0, nullptr, HSA_AMD_SIGNAL_AMD_GPU_ONLY,
&signal_)) {
if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(kInitSignalValueOne, 0, nullptr,
HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal_)) {
ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
"[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed");
return false;
@@ -1316,8 +1315,8 @@ bool Image::create(bool alloc_local) {
// support alignment larger than HSA memory region allocation granularity.
// In this case, the user manages the alignment.
const size_t alloc_size = (deviceImageInfo_.alignment <= dev().alloc_granularity())
? deviceImageInfo_.size
: deviceImageInfo_.size + deviceImageInfo_.alignment;
? deviceImageInfo_.size
: deviceImageInfo_.size + deviceImageInfo_.alignment;
if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) {
originalDeviceMemory_ = dev().deviceLocalAlloc(alloc_size);
@@ -1357,8 +1356,8 @@ bool Image::createView(const Memory& parent) {
deviceMemory_ = parent.getDeviceMemory();
originalDeviceMemory_ = (parent.owner()->asBuffer() != nullptr)
? deviceMemory_
: static_cast<const Image&>(parent).originalDeviceMemory_;
? deviceMemory_
: static_cast<const Image&>(parent).originalDeviceMemory_;
// Detect image view from buffer to distinguish linear paths from tiled.
amd::Memory* ancestor = parent.owner();
@@ -1411,10 +1410,10 @@ bool Image::createView(const Memory& parent) {
break;
}
hsa_ext_image_t hsaImage;
if (HSA_STATUS_SUCCESS ==
hsa_ext_image_create_with_layout(
dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, permission_,
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, tryPitch, 0, &hsaImage)) {
if (HSA_STATUS_SUCCESS == hsa_ext_image_create_with_layout(
dev().getBackendDevice(), &imageDescriptor_, deviceMemory_,
permission_, HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, tryPitch, 0,
&hsaImage)) {
// The image pitch from app is not expectation of the GPU
LogWarning("[OCL] will use copy image");
workaround = true;
+4 -4
ファイルの表示
@@ -153,10 +153,10 @@ class Memory : public device::Memory {
// Get MemorySegment type in terms of host memory allocation flags.
//
// Mapping implemented below:
//   CL_MEM_SVM_ATOMICS clear                        -> kNoAtomics
//   CL_MEM_SVM_ATOMICS set, ROCCLR_MEM_HSA_UNCACHED -> kUncachedAtomics
//   CL_MEM_SVM_ATOMICS set otherwise                -> kAtomics
Device::MemorySegment getHostMemorySegment(const unsigned int memFlags) {
  if ((memFlags & CL_MEM_SVM_ATOMICS) == 0) {
    return Device::MemorySegment::kNoAtomics;
  }
  if ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0) {
    return Device::MemorySegment::kUncachedAtomics;
  }
  return Device::MemorySegment::kAtomics;
}
private:
+7 -8
ファイルの表示
@@ -177,8 +177,8 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t
case 4:
if (printFloat) {
const float fArg = size == 2
? amd::half2float(*(reinterpret_cast<const uint16_t*>(argument)))
: *(reinterpret_cast<const float*>(argument));
? amd::half2float(*(reinterpret_cast<const uint16_t*>(argument)))
: *(reinterpret_cast<const float*>(argument));
static const char* fSpecifiers = "eEfgGa";
std::string fmtF = fmt;
size_t posS = fmtF.find_first_of("%");
@@ -216,13 +216,12 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t
hhFmt.erase(hhFmt.find_first_of("h"), 2);
amd::Os::printf(hhFmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
} else if (hlModifier) {
amd::Os::printf(hlFmt.data(),
size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
: *(reinterpret_cast<const uint32_t*>(argument)));
amd::Os::printf(hlFmt.data(), size == 2
? *(reinterpret_cast<const uint16_t*>(argument))
: *(reinterpret_cast<const uint32_t*>(argument)));
} else {
amd::Os::printf(fmt.data(),
size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
: *(reinterpret_cast<const uint32_t*>(argument)));
amd::Os::printf(fmt.data(), size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
: *(reinterpret_cast<const uint32_t*>(argument)));
}
}
break;
+2 -2
ファイルの表示
@@ -57,8 +57,8 @@ struct AmdAqlWrap {
// It’s incremented on the
// start and decremented on the finish. The parent kernel can be
// considered as done when the value is 0 and the state is DONE
uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t)
uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t)
uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
uint32_t reserved[5]; //!< For the future usage
+1 -1
ファイルの表示
@@ -240,7 +240,7 @@ void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidH
const uint32_t gfxStepping = isa.versionStepping();
const bool isGfx94x = gfxipMajor == 9 && gfxipMinor >= 4 &&
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2);
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2);
const bool isGfx90a = (gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10);
const bool isPreGfx908 =
(gfxipMajor < 9) || ((gfxipMajor == 9) && (gfxipMinor == 0) && (gfxStepping < 8));
+25 -23
ファイルの表示
@@ -879,11 +879,11 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
} else {
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i,
desc.typeName_.c_str(), desc.name_.c_str(),
(desc.size_ == 1) ? *reinterpret_cast<const uint8_t*>(srcArgPtr)
: (desc.size_ == 2) ? *reinterpret_cast<const uint16_t*>(srcArgPtr)
: (desc.size_ == 4) ? *reinterpret_cast<const uint32_t*>(srcArgPtr)
: (desc.size_ == 8) ? *reinterpret_cast<const uint64_t*>(srcArgPtr)
: 0LL,
(desc.size_ == 1) ? *reinterpret_cast<const uint8_t*>(srcArgPtr)
: (desc.size_ == 2) ? *reinterpret_cast<const uint16_t*>(srcArgPtr)
: (desc.size_ == 4) ? *reinterpret_cast<const uint32_t*>(srcArgPtr)
: (desc.size_ == 8) ? *reinterpret_cast<const uint64_t*>(srcArgPtr)
: 0LL,
desc.size_);
}
}
@@ -1362,10 +1362,10 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
cache_state, barrier_value_packet_.signal, barrier_value_packet_.value,
barrier_value_packet_.mask,
barrier_value_packet_.cond == 0 ? "EQ"
: barrier_value_packet_.cond == 1 ? "NE"
: barrier_value_packet_.cond == 2 ? "LT"
: "GTE",
barrier_value_packet_.cond == 0 ? "EQ"
: barrier_value_packet_.cond == 1 ? "NE"
: barrier_value_packet_.cond == 2 ? "LT"
: "GTE",
barrier_value_packet_.completion_signal, read, index);
// Clear dependent signals for the next packet
barrier_value_packet_.signal = hsa_signal_t{};
@@ -1432,21 +1432,23 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
cooperative_ = cooperative;
if (device.settings().fenceScopeAgent_) {
dispatchPacketHeaderNoSync_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
dispatchPacketHeaderNoSync_ =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
dispatchPacketHeader_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
} else {
dispatchPacketHeaderNoSync_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
dispatchPacketHeaderNoSync_ =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
dispatchPacketHeader_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
}
aqlHeader_ = dispatchPacketHeader_;
@@ -2091,8 +2093,8 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
// Find the requested agent for the transfer
hsa_agent_t agent =
(cmd.cpu_access() || (dev().settings().hmmFlags_ & Settings::Hmm::EnableSystemMemory))
? dev().getCpuAgent(cmd.numa_id())
: (static_cast<const roc::Device*>(cmd.device()))->getBackendDevice();
? dev().getCpuAgent(cmd.numa_id())
: (static_cast<const roc::Device*>(cmd.device()))->getBackendDevice();
// Initiate a prefetch command
hsa_status_t status =
@@ -3000,7 +3002,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
size_t fillSize = patternSize * cmd.times();
size_t offset = reinterpret_cast<uintptr_t>(cmd.dst()) -
reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
Memory* memory = dev().getRocMemory(dstMemory);
@@ -3567,9 +3569,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
if (aql_packet != nullptr) {
*aql_packet = dispatchPacket;
aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
}
+3 -3
ファイルの表示
@@ -475,9 +475,9 @@ class VirtualGPU : public device::VirtualDevice {
const uint8_t* aqlPacket = nullptr, bool attach_signal = false);
bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
bool blocking = true, bool attach_signal = false);
template <typename AqlPacket>
bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking,
bool attach_signal = false);
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
uint16_t rest, bool blocking,
bool attach_signal = false);
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
+5 -5
ファイルの表示
@@ -384,10 +384,10 @@ class elfio {
bool is_sect_in_seg(Elf64_Off sect_begin, Elf_Xword sect_size, Elf64_Off seg_begin,
Elf64_Off seg_end) {
return seg_begin <= sect_begin && sect_begin + sect_size <= seg_end &&
sect_begin <
seg_end; // this is important criteria when sect_size == 0
// Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11)
// sect_begin=12, sect_size=0 -> shall return false!
sect_begin <
seg_end; // this is important criteria when sect_size == 0
// Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11)
// sect_begin=12, sect_size=0 -> shall return false!
}
//------------------------------------------------------------------------------
@@ -447,7 +447,7 @@ class elfio {
section* sec = sections_.at(i);
std::streampos headerPosition = (std::streamoff)header->get_sections_offset() +
header->get_section_entry_size() * sec->get_index();
header->get_section_entry_size() * sec->get_index();
sec->save(stream, headerPosition, sec->get_offset());
}
+1 -1
ファイルの表示
@@ -130,7 +130,7 @@ template <class S> class note_section_accessor_template {
Elf_Word descsz = convertor(*(const Elf_Word*)(data + current + sizeof(namesz)));
current += 3 * sizeof(Elf_Word) + ((namesz + align - 1) / align) * align +
((descsz + align - 1) / align) * align;
((descsz + align - 1) / align) * align;
}
}
+8 -8
ファイルの表示
@@ -104,8 +104,8 @@ template <class S> class relocation_section_accessor_template {
unsigned char other;
symbol_section_accessor symbols(elf_file, elf_file.sections[get_symbol_table_index()]);
ret = ret &&
symbols.get_symbol(symbol, symbolName, symbolValue, size, bind, symbolType, section, other);
ret = ret && symbols.get_symbol(symbol, symbolName, symbolValue, size, bind, symbolType,
section, other);
if (ret) { // Was it successful?
switch (type) {
@@ -207,9 +207,9 @@ template <class S> class relocation_section_accessor_template {
Elf_Half get_symbol_table_index() const { return (Elf_Half)relocation_section->get_link(); }
//------------------------------------------------------------------------------
template <class T>
void generic_get_entry_rel(Elf_Xword index, Elf64_Addr& offset, Elf_Word& symbol, Elf_Word& type,
Elf_Sxword& addend) const {
template <class T> void generic_get_entry_rel(Elf_Xword index, Elf64_Addr& offset,
Elf_Word& symbol, Elf_Word& type,
Elf_Sxword& addend) const {
const endianess_convertor& convertor = elf_file.get_convertor();
const T* pEntry = reinterpret_cast<const T*>(relocation_section->get_data() +
@@ -222,9 +222,9 @@ template <class S> class relocation_section_accessor_template {
}
//------------------------------------------------------------------------------
template <class T>
void generic_get_entry_rela(Elf_Xword index, Elf64_Addr& offset, Elf_Word& symbol, Elf_Word& type,
Elf_Sxword& addend) const {
template <class T> void generic_get_entry_rela(Elf_Xword index, Elf64_Addr& offset,
Elf_Word& symbol, Elf_Word& type,
Elf_Sxword& addend) const {
const endianess_convertor& convertor = elf_file.get_convertor();
const T* pEntry = reinterpret_cast<const T*>(relocation_section->get_data() +
+7 -7
ファイルの表示
@@ -255,10 +255,10 @@ template <class S> class symbol_section_accessor_template {
}
//------------------------------------------------------------------------------
template <class T>
bool generic_get_symbol(Elf_Xword index, std::string& name, Elf64_Addr& value, Elf_Xword& size,
unsigned char& bind, unsigned char& type, Elf_Half& section_index,
unsigned char& other) const {
template <class T> bool generic_get_symbol(Elf_Xword index, std::string& name, Elf64_Addr& value,
Elf_Xword& size, unsigned char& bind,
unsigned char& type, Elf_Half& section_index,
unsigned char& other) const {
bool ret = false;
if (0 != symbol_section->get_data() && index < get_symbols_num()) {
@@ -287,9 +287,9 @@ template <class S> class symbol_section_accessor_template {
}
//------------------------------------------------------------------------------
template <class T>
Elf_Word generic_add_symbol(Elf_Word name, Elf64_Addr value, Elf_Xword size, unsigned char info,
unsigned char other, Elf_Half shndx) {
template <class T> Elf_Word generic_add_symbol(Elf_Word name, Elf64_Addr value, Elf_Xword size,
unsigned char info, unsigned char other,
Elf_Half shndx) {
const endianess_convertor& convertor = elf_file.get_convertor();
T entry;
+4 -4
ファイルの表示
@@ -66,9 +66,9 @@ class endianess_convertor {
return value;
}
value = ((value & 0x00000000000000FFull) << 56) | ((value & 0x000000000000FF00ull) << 40) |
((value & 0x0000000000FF0000ull) << 24) | ((value & 0x00000000FF000000ull) << 8) |
((value & 0x000000FF00000000ull) >> 8) | ((value & 0x0000FF0000000000ull) >> 24) |
((value & 0x00FF000000000000ull) >> 40) | ((value & 0xFF00000000000000ull) >> 56);
((value & 0x0000000000FF0000ull) << 24) | ((value & 0x00000000FF000000ull) << 8) |
((value & 0x000000FF00000000ull) >> 8) | ((value & 0x0000FF0000000000ull) >> 24) |
((value & 0x00FF000000000000ull) >> 40) | ((value & 0xFF00000000000000ull) >> 56);
return value;
}
@@ -87,7 +87,7 @@ class endianess_convertor {
return value;
}
value = ((value & 0x000000FF) << 24) | ((value & 0x0000FF00) << 8) |
((value & 0x00FF0000) >> 8) | ((value & 0xFF000000) >> 24);
((value & 0x00FF0000) >> 8) | ((value & 0xFF000000) >> 24);
return value;
}
+1 -1
ファイルの表示
@@ -471,7 +471,7 @@ inline void Os::ThreadAffinityMask::clear(uint cpu) {
inline bool Os::ThreadAffinityMask::isSet(uint cpu) const {
return (KAFFINITY)0 !=
(mask_[cpu / (8 * sizeof(KAFFINITY))] & ((KAFFINITY)1 << (cpu % (8 * sizeof(KAFFINITY)))));
(mask_[cpu / (8 * sizeof(KAFFINITY))] & ((KAFFINITY)1 << (cpu % (8 * sizeof(KAFFINITY)))));
}
inline bool Os::ThreadAffinityMask::isEmpty() const {
+7 -8
ファイルの表示
@@ -301,10 +301,9 @@ const Event::EventWaitList Event::nullWaitList(0);
// ================================================================================================
Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
uint32_t commandWaitBits, const Event* waitingEvent)
: Event(queue,
amd::activity_prof::IsEnabled(amd::activity_prof::OperationId(type)) ||
queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
Agent::shouldPostEventEvents()),
: Event(queue, amd::activity_prof::IsEnabled(amd::activity_prof::OperationId(type)) ||
queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
Agent::shouldPostEventEvents()),
queue_(&queue),
next_(nullptr),
type_(type),
@@ -604,24 +603,24 @@ bool CopyMemoryCommand::isEntireMemory() const {
Coord3D imageSize(size()[0] * size()[1] * size()[2] *
source().asImage()->getImageFormat().getElementSize());
result = source().isEntirelyCovered(srcOrigin(), size()) &&
destination().isEntirelyCovered(dstOrigin(), imageSize);
destination().isEntirelyCovered(dstOrigin(), imageSize);
} break;
case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
Coord3D imageSize(size()[0] * size()[1] * size()[2] *
destination().asImage()->getImageFormat().getElementSize());
result = source().isEntirelyCovered(srcOrigin(), imageSize) &&
destination().isEntirelyCovered(dstOrigin(), size());
destination().isEntirelyCovered(dstOrigin(), size());
} break;
case CL_COMMAND_COPY_BUFFER_RECT: {
Coord3D rectSize(size()[0] * size()[1] * size()[2]);
Coord3D srcOffs(srcRect().start_);
Coord3D dstOffs(dstRect().start_);
result = source().isEntirelyCovered(srcOffs, rectSize) &&
destination().isEntirelyCovered(dstOffs, rectSize);
destination().isEntirelyCovered(dstOffs, rectSize);
} break;
default:
result = source().isEntirelyCovered(srcOrigin(), size()) &&
destination().isEntirelyCovered(dstOrigin(), size());
destination().isEntirelyCovered(dstOrigin(), size());
break;
}
return result;
+2 -3
ファイルの表示
@@ -260,9 +260,8 @@ int Context::create(const intptr_t* properties) {
}
// Check if OCL context can be associated with any external device
if (info_.flags_ &
(D3D10DeviceKhr | D3D11DeviceKhr | GLDeviceKhr | D3D9DeviceKhr | D3D9DeviceEXKhr |
D3D9DeviceVAKhr)) {
if (info_.flags_ & (D3D10DeviceKhr | D3D11DeviceKhr | GLDeviceKhr | D3D9DeviceKhr |
D3D9DeviceEXKhr | D3D9DeviceVAKhr)) {
// Loop through all devices
for (const auto& it : devices_) {
if (!it->bindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY)) {
+4 -4
ファイルの表示
@@ -75,10 +75,10 @@ size_t KernelParameters::localMemSize(size_t minDataTypeAlignment) const {
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
if (desc.size_ == 8) {
memSize = alignUp(memSize, minDataTypeAlignment) +
*reinterpret_cast<const uint64_t*>(values_ + desc.offset_);
*reinterpret_cast<const uint64_t*>(values_ + desc.offset_);
} else {
memSize = alignUp(memSize, minDataTypeAlignment) +
*reinterpret_cast<const uint32_t*>(values_ + desc.offset_);
*reinterpret_cast<const uint32_t*>(values_ + desc.offset_);
}
}
}
@@ -300,10 +300,10 @@ address KernelParameters::capture(device::VirtualDevice& vDev, uint64_t lclMemSi
} else if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
if (desc.size_ == 8) {
lclMemSize = alignUp(lclMemSize, device.info().minDataTypeAlignSize_) +
*reinterpret_cast<const uint64_t*>(values_ + desc.offset_);
*reinterpret_cast<const uint64_t*>(values_ + desc.offset_);
} else {
lclMemSize = alignUp(lclMemSize, device.info().minDataTypeAlignSize_) +
*reinterpret_cast<const uint32_t*>(values_ + desc.offset_);
*reinterpret_cast<const uint32_t*>(values_ + desc.offset_);
}
}
}
+6 -4
ファイルの表示
@@ -158,10 +158,11 @@ class KernelParameters : protected HeapObject {
execNewVcop_(0),
execPfpaVcop_(0),
deviceKernelArgs_(false) {
totalSize_ = signature.paramsSize() +
totalSize_ =
signature.paramsSize() +
(signature.numMemories() + signature.numSamplers() + signature.numQueues()) * sizeof(void*);
values_ = reinterpret_cast<address>(this) +
alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT);
alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT);
memoryObjOffset_ = signature_.paramsSize();
memoryObjects_ = reinterpret_cast<amd::Memory**>(values_ + memoryObjOffset_);
samplerObjOffset_ = memoryObjOffset_ + signature_.numMemories() * sizeof(amd::Memory*);
@@ -186,7 +187,7 @@ class KernelParameters : protected HeapObject {
execPfpaVcop_(rhs.execPfpaVcop_),
deviceKernelArgs_(false) {
values_ = reinterpret_cast<address>(this) +
alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT);
alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT);
memoryObjOffset_ = signature_.paramsSize();
memoryObjects_ = reinterpret_cast<amd::Memory**>(values_ + memoryObjOffset_);
samplerObjOffset_ = memoryObjOffset_ + signature_.numMemories() * sizeof(amd::Memory*);
@@ -223,7 +224,8 @@ class KernelParameters : protected HeapObject {
//! Allocate memory for this instance as well as the required storage for
// the values_, defined_, and rawPointer_ arrays.
void* operator new(size_t size, const KernelSignature& signature) {
size_t requiredSize = alignUp(size, PARAMETERS_MIN_ALIGNMENT) + signature.paramsSize() +
size_t requiredSize =
alignUp(size, PARAMETERS_MIN_ALIGNMENT) + signature.paramsSize() +
(signature.numMemories() + signature.numSamplers() + signature.numQueues()) * sizeof(void*);
return AlignedMemory::allocate(requiredSize, PARAMETERS_MIN_ALIGNMENT);
}
+15 -15
ファイルの表示
@@ -57,9 +57,9 @@ bool HostMemoryReference::allocateMemory(size_t size, const Context& context) {
size_t memoryAlignment = (CPU_MEMORY_ALIGNMENT_SIZE <= 0) ? 256 : CPU_MEMORY_ALIGNMENT_SIZE;
size_ = amd::alignUp(size, memoryAlignment);
//! \note memory size must be aligned for CAL pinning
hostMem_ = CPU_MEMORY_GUARD_PAGES
? GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN, CPU_MEMORY_GUARD_PAGE_SIZE * Ki)
: context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN);
hostMem_ = CPU_MEMORY_GUARD_PAGES ? GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN,
CPU_MEMORY_GUARD_PAGE_SIZE * Ki)
: context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN);
alloced_ = (hostMem_ != NULL);
return alloced_;
}
@@ -146,7 +146,7 @@ Memory::Memory(Memory& parent, Flags flags, size_t origin, size_t size, Type typ
if ((flags_ & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) {
flags_ |= parent_->getMemFlags() &
(CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS);
(CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS);
}
}
@@ -590,8 +590,8 @@ bool Buffer::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) con
bool Buffer::validateRegion(const Coord3D& origin, const Coord3D& region) const {
return ((region[0] > 0) && (origin[0] < getSize()) && ((origin[0] + region[0]) <= getSize()))
? true
: false;
? true
: false;
}
void Pipe::initDeviceMemory() {
@@ -614,7 +614,7 @@ Image::Image(const Format& format, Image& parent, uint baseMipLevel, cl_mem_flag
baseMipLevel_(baseMipLevel) {
if (baseMipLevel > 0) {
impl_.region_.c[0] = GETMIPDIM(parent.getWidth(), baseMipLevel) *
parent.getImageFormat().getElementSize() / format.getElementSize();
parent.getImageFormat().getElementSize() / format.getElementSize();
impl_.region_.c[1] = GETMIPDIM(parent.getHeight(), baseMipLevel);
impl_.region_.c[2] = GETMIPDIM(parent.getDepth(), baseMipLevel);
@@ -1030,9 +1030,9 @@ const cl_image_format Image::supportedFormats[] = {
{CL_DEPTH, CL_FLOAT},
};
const uint32_t NUM_CHANNEL_ORDER_OF_RGB = 1; // The number of channel orders of RGB at the end of
// the table supportedFormats above and before sRGB
// and depth.
const uint32_t NUM_CHANNEL_ORDER_OF_RGB = 1; // The number of channel orders of RGB at the end of
// the table supportedFormats above and before sRGB
// and depth.
const uint32_t NUM_CHANNEL_ORDER_OF_sRGB = 1; // The number of channel orders of sRGB at the end of
// the table supportedFormats above and before depth.
const uint32_t NUM_CHANNEL_ORDER_OF_DEPTH =
@@ -1246,8 +1246,8 @@ Image* Image::createView(const Context& context, const Format& format, device::V
bool Image::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const {
return (origin[0] == 0 && origin[1] == 0 && origin[2] == 0 && region[0] == getWidth() &&
region[1] == getHeight() && region[2] == getDepth())
? true
: false;
? true
: false;
}
bool Image::validateRegion(const Coord3D& origin, const Coord3D& region) const {
@@ -1255,15 +1255,15 @@ bool Image::validateRegion(const Coord3D& origin, const Coord3D& region) const {
(region[0] != 0) && (origin[1] < getHeight()) && (region[1] != 0) &&
(origin[2] < getDepth()) && (region[2] != 0) && ((origin[0] + region[0]) <= getWidth()) &&
((origin[1] + region[1]) <= getHeight()) && ((origin[2] + region[2]) <= getDepth()))
? true
: false;
? true
: false;
}
bool Image::isRowSliceValid(size_t rowPitch, size_t slice, size_t width, size_t height) const {
size_t tmpHeight = (getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? 1 : height;
bool valid = (rowPitch == 0) ||
((rowPitch != 0) && (rowPitch >= width * getImageFormat().getElementSize()));
((rowPitch != 0) && (rowPitch >= width * getImageFormat().getElementSize()));
return ((slice == 0) || ((slice != 0) && (slice >= rowPitch * tmpHeight))) ? valid : false;
}
+1 -1
ファイルの表示
@@ -530,7 +530,7 @@ class Image : public Memory {
//! Compare 2 image formats.
bool operator==(const Format& rhs) const {
return image_channel_order == rhs.image_channel_order &&
image_channel_data_type == rhs.image_channel_data_type;
image_channel_data_type == rhs.image_channel_data_type;
}
bool operator!=(const Format& rhs) const { return !(*this == rhs); }
+4 -5
ファイルの表示
@@ -170,8 +170,8 @@ int32_t Program::addDeviceProgram(Device& device, const void* image, size_t leng
}
}
options->oVariables->Legacy = !device.settings().useLightning_
? isAMDILTarget(*amd::aclutGetTargetInfo(binary))
: isHSAILTarget(*amd::aclutGetTargetInfo(binary));
? isAMDILTarget(*amd::aclutGetTargetInfo(binary))
: isHSAILTarget(*amd::aclutGetTargetInfo(binary));
amd::Hsail::BinaryFini(binary);
}
#endif // defined(WITH_COMPILER_LIB)
@@ -522,9 +522,8 @@ int32_t Program::build(const std::vector<Device*>& devices, const char* options,
for (const auto& it : devices) {
option::Options parsedOptions;
constexpr bool LinkOptsOnly = false;
if ((language_ != HIP) &&
!ParseAllOptions(cppstr, parsedOptions, optionChangable, LinkOptsOnly,
it->settings().useLightning_)) {
if ((language_ != HIP) && !ParseAllOptions(cppstr, parsedOptions, optionChangable, LinkOptsOnly,
it->settings().useLightning_)) {
programLog_ = parsedOptions.optionsLog();
LogError("Parsing compile options failed.");
return CL_INVALID_COMPILER_OPTIONS;
+252 -531
ファイルの表示
@@ -21,537 +21,258 @@
#ifndef FLAGS_HPP_
#define FLAGS_HPP_
#define RUNTIME_FLAGS(debug, release, release_on_stg) \
\
release(int, AMD_LOG_LEVEL, 0, "The default log level") release( \
uint, AMD_LOG_MASK, 0X7FFFFFFF, \
"The mask to enable specific kinds of logs") release(cstring, AMD_LOG_LEVEL_FILE, "", \
"Set output file for AMD_LOG_LEVEL, " \
"Default is stderr") release(size_t, \
AMD_LOG_LEVEL_SIZE, \
2048, \
"The max " \
"size of " \
"AMD_LOG " \
"generate" \
"d in MB " \
"if " \
"printed " \
"to a " \
"file") \
debug(uint, DEBUG_GPU_FLAGS, 0, "The debug options for GPU device") release( \
size_t, CQ_THREAD_STACK_SIZE, 256 * Ki, /* @todo: that much! */ \
"The default command queue thread stack size") release(int, GPU_MAX_WORKGROUP_SIZE, 0, \
"Maximum number of workitems in " \
"a workgroup for GPU, 0 -use " \
"default") \
debug(bool, CPU_MEMORY_GUARD_PAGES, false, "Use guard pages for CPU memory") debug( \
size_t, CPU_MEMORY_GUARD_PAGE_SIZE, \
64, "Size in KB of CPU memory guard page") debug(size_t, CPU_MEMORY_ALIGNMENT_SIZE, \
256, \
"Size in bytes for the default " \
"alignment for guarded memory on " \
"CPU") debug(size_t, \
PARAMETERS_MIN_ALIGNMENT, \
NATIVE_ALIGNMENT_SIZE, \
"Minimum alignment " \
"required for the " \
"abstract parameters " \
"stack") debug(size_t, \
MEMOBJ_BASE_ADDR_ALIGN, \
4 * Ki, \
"Align" \
"ment " \
"of " \
"the " \
"base " \
"addre" \
"ss " \
"of " \
"any " \
"alloc" \
"ate " \
"memor" \
"y " \
"objec" \
"t") \
release( \
uint, ROC_HMM_FLAGS, \
0, "ROCm HMM configuration flags") release(cstring, GPU_DEVICE_ORDINAL, "", \
"Select the device ordinal (comma " \
"seperated list of available " \
"devices)") release(bool, \
REMOTE_ALLOC, \
false, \
"Use remote " \
"memory for the " \
"global heap " \
"allocation") \
release(uint, GPU_CP_DMA_COPY_SIZE, 1, \
"Set maximum size of CP DMA copy in KiB") release(uint, \
GPU_MAX_HEAP_SIZE, \
100, \
"Set maximum size of " \
"the GPU heap to % " \
"of board memory") \
release( \
uint, GPU_STAGING_BUFFER_SIZE, 4, \
"Size of the GPU staging buffer in MiB") release(bool, \
GPU_DUMP_BLIT_KERNELS, \
false, \
"Dump the kernels for " \
"blit manager") \
release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \
"Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \
release(bool, GPU_FLUSH_ON_EXECUTION, false, \
"Submit commands to HW on every operation. 0 - Disable, 1 " \
"- Enable") release(bool, CL_KHR_FP64, true, \
"Enable/Disable support for double " \
"precision") release(cstring, \
AMD_OCL_BUILD_OPTIONS, \
0, \
"Set " \
"clBuildProgram() " \
"and " \
"clCompileProgram(" \
")'s options " \
"(override)") \
release(cstring, AMD_OCL_BUILD_OPTIONS_APPEND, 0, \
"Append clBuildProgram() and clCompileProgram()'s " \
"options") release(cstring, AMD_OCL_LINK_OPTIONS, 0, \
"Set clLinkProgram()'s options " \
"(override)") \
release( \
cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \
"Append clLinkProgram()'s options") debug(cstring, \
AMD_OCL_SUBST_OBJFILE, \
0, \
"Specify " \
"binary " \
"substitution" \
" config " \
"file for " \
"OpenCL") \
release( \
size_t, GPU_PINNED_XFER_SIZE, 32, \
"The pinned buffer size for pinning in read/write " \
"transfers in MiB") release(size_t, \
GPU_PINNED_MIN_XFER_SIZE, \
128, \
"The minimal buffer " \
"size for pinned " \
"read/write transfers " \
"in MiB") release(size_t, \
GPU_RESOURCE_CACHE_SIZE, \
64, \
"The " \
"reso" \
"urce" \
" cac" \
"he " \
"size" \
" in " \
"MB") \
release( \
size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \
"The maximum size accepted for suballocations " \
"in KB") release(size_t, GPU_NUM_MEM_DEPENDENCY, \
256, \
"Number of memory objects for " \
"dependency tracking") \
release( \
size_t, GPU_XFER_BUFFER_SIZE, 0, \
"Transfer buffer size for image copy " \
"optimization in KB") release(bool, \
GPU_IMAGE_DMA, \
true, \
"Enable DRM " \
"DMA for " \
"image " \
"transfers") \
release( \
uint, GPU_SINGLE_ALLOC_PERCENT, 100, \
"Maximum size of a single allocation " \
"as percentage of total") release(uint, \
GPU_NUM_COMPUTE_RINGS, \
2, \
"GPU " \
"numb" \
"er " \
"of " \
"comp" \
"ute " \
"ring" \
"s. " \
"0 - " \
"disa" \
"bled" \
", 1 " \
", " \
"2,.." \
" - " \
"the " \
"numb" \
"er " \
"of " \
"comp" \
"ute " \
"ring" \
"s") \
release( \
bool, AMD_OCL_WAIT_COMMAND, false, \
"1 = Enable a wait for every " \
"submitted command") release(uint, \
GPU_PRINT_CHILD_KERNEL, \
0, \
"Print" \
"s " \
"the " \
"speci" \
"fied " \
"numbe" \
"r of " \
"the " \
"child" \
" kern" \
"els") \
release(bool, GPU_USE_DEVICE_QUEUE, \
false, \
"Use a dedicated device " \
"queue for the actual " \
"submissions") release(bool, \
AMD_THREAD_TRACE_ENABLE, \
true, \
"Ena" \
"ble" \
" th" \
"rea" \
"d " \
"tra" \
"ce " \
"ext" \
"ens" \
"io" \
"n") \
release( \
uint, OPENCL_VERSION, 200, \
"Force GPU opencl version") release(bool, \
HSA_LOCAL_MEMORY_ENABLE, \
true, \
"Enable HSA device local memory usage") \
release( \
uint, \
HSA_KERNARG_POOL_SIZE, \
1024 * 1024, \
"Kernarg pool size") release(bool, \
GPU_MIPMAP, \
true, \
"Enables GPU mipmap extension") \
release( \
uint, \
GPU_ENABLE_PAL, \
2, \
"Enables PAL " \
"backend. 0 - ROC, " \
"1 - PAL, 2 - ROC " \
"or PAL") release(bool, DISABLE_DEFERRED_ALLOC, \
false, \
"Disables deferred memory allocation on device") \
release( \
int, \
AMD_GPU_FORCE_SINGLE_FP_DENORM, \
-1, \
"Force denorm " \
"for single " \
"precision: -1 " \
"- don't " \
"force, 0 - " \
"disable, 1 - " \
"enable") \
release( \
uint, \
OCL_SET_SVM_SIZE, \
4 * 16384, \
"set SVM " \
"space " \
"size for " \
"discrete " \
"GPU") release(uint, \
GPU_WAVES_PER_SIMD, \
0, \
"Force the number of waves per SIMD (1-10)") \
release( \
bool, \
OCL_STUB_PROGRAMS, \
false, \
"1 = " \
"Enable" \
"s OCL " \
"progra" \
"ms " \
"stubin" \
"g") \
release( \
bool, \
GPU_ANALYZE_HANG, \
false, \
"1 " \
"= " \
"En" \
"ab" \
"le" \
"s " \
"GP" \
"U " \
"ha" \
"ng" \
" a" \
"na" \
"ly" \
"si" \
"s") \
release( \
uint, \
GPU_MAX_REMOTE_MEM_SIZE, \
2, \
"Maximum size (in Ki) that allows device memory substitution with system") \
release(bool, \
GPU_ADD_HBCC_SIZE, \
false, \
"Add HBCC size to the reported device memory") release(bool, \
PAL_DISABLE_SDMA, \
false, \
"1 = Disable SDMA for PAL") release(uint, \
PAL_RGP_DISP_COUNT, \
10000, \
"The number of dispatches for RGP capture with SQTT") release(uint, \
PAL_MALL_POLICY, \
0, \
"Controls the behaviour of allocations with respect to the MALL" \
"0 = MALL policy is decided by KMD" \
"1 = Allocations are never put through the MALL" \
"2 = Allocations will always be put through the MALL") release(bool, \
GPU_ENABLE_WAVE32_MODE, \
true, \
"Enables Wave32 compilation in HW if available") release(bool, \
GPU_ENABLE_LC, \
true, \
"Enables LC path") release(bool, GPU_ENABLE_HW_P2P, \
false, \
"Enables HW P2P path") release(bool, \
GPU_ENABLE_COOP_GROUPS, \
true, \
"Enables cooperative group launch") release(uint, \
GPU_MAX_COMMAND_BUFFERS, \
8, \
"The maximum number of command buffers allocated per queue") release(uint, \
GPU_MAX_HW_QUEUES, \
4, \
"The maximum number of HW queues allocated per device") release(bool, GPU_IMAGE_BUFFER_WAR, true, \
"Enables image buffer workaround") release(cstring, \
HIP_VISIBLE_DEVICES, \
"", \
"Only devices whose index is present in the sequence are visible to HIP") release(cstring, \
CUDA_VISIBLE_DEVICES, \
"", \
"Only devices whose index is present in the sequence are visible to CUDA") \
release(bool, \
GPU_ENABLE_WGP_MODE, \
true, \
"Enables WGP Mode in HW if available") \
release( \
bool, \
GPU_DUMP_CODE_OBJECT, \
false, \
"Enable dump code object") release(uint, \
GPU_MAX_USWC_ALLOC_SIZE, 2048, \
"Set a limit in Mb on the maximum USWC allocation size" \
"-1 = No limit") \
release( \
uint, \
AMD_SERIALIZE_KERNEL, \
0, \
"Serialize kernel enqueue, 0x1 = Wait for completion before enqueue" \
"0x2 = Wait for completion after enqueue 0x3 = both") release(uint, \
AMD_SERIALIZE_COPY, \
0, \
"Serialize copies, 0x1 = Wait for completion before enqueue" \
"0x2 = Wait for completion after enqueue 0x3 = both") release(uint, \
HIP_LAUNCH_BLOCKING, \
0, \
"Serialize kernel enqueue 0x1 = Wait for completion after enqueue," \
"same as AMD_SERIALIZE_KERNEL=2") release(bool, \
PAL_ALWAYS_RESIDENT, \
false, \
"Force memory resources to become resident at allocation time") release(uint, \
HIP_HOST_COHERENT, \
0, \
"Coherent memory in hipHostMalloc, 0x1 = memory is coherent with host" \
"0x0 = memory is not coherent between host and GPU") release(uint, AMD_OPT_FLUSH, 1, \
"Kernel flush option , 0x0 = Use system-scope fence operations." \
"0x1 = Use device-scope fence operations when possible.") \
release( \
bool, \
AMD_DIRECT_DISPATCH, \
false, \
"Enable direct kernel dispatch.") release(uint, \
HIP_HIDDEN_FREE_MEM, \
0, \
"Reserve free mem reporting in Mb" \
"0 = Disable") release(size_t, \
GPU_FORCE_BLIT_COPY_SIZE, \
16, \
"Use Blit until this size(in KB) for copies") release(uint, \
ROC_ACTIVE_WAIT_TIMEOUT, \
0, \
"Forces active wait of GPU interrup for the timeout(us)") release(bool, \
ROC_ENABLE_LARGE_BAR, \
true, \
"Enable Large Bar if supported by the device") release(bool, \
ROC_CPU_WAIT_FOR_SIGNAL, \
true, \
"Enable CPU wait for dependent HSA signals.") release(bool, \
ROC_SYSTEM_SCOPE_SIGNAL, \
true, \
"Enable system scope for signals (uses interrupts).") release(bool, \
GPU_FORCE_QUEUE_PROFILING, \
false, \
"Force command queue profiling by default") \
release( \
bool, \
HIP_MEM_POOL_SUPPORT, \
true, \
"Enables memory pool support in HIP") release(bool, \
HIP_MEM_POOL_USE_VM, \
true, \
"Enables memory pool support in HIP") release(bool, \
DEBUG_HIP_MEM_POOL_VMHEAP, \
true, \
"Enables virtual memory for memory pools") release(bool, \
PAL_HIP_IPC_FLAG, true, \
"Enable interprocess flag for device allocation in PAL HIP") \
release( \
uint, \
PAL_FORCE_ASIC_REVISION, \
0, \
"Force a specific asic revision for all devices") \
release( \
bool, \
PAL_EMBED_KERNEL_MD, \
false, \
"Enables writing kernel metadata into command buffers.") release(cstring, \
ROC_GLOBAL_CU_MASK, \
"", \
"Sets a global CU mask (entered as hex value) for all queues," \
"Each active bit represents using one CU (e.g., 0xf enables only 4 CUs)") release(size_t, PAL_PREPINNED_MEMORY_SIZE, 64, \
"Size in KBytes of prepinned memory") release(bool, \
AMD_CPU_AFFINITY, \
false, \
"Reset CPU affinity of any runtime threads") release(bool, \
ROC_USE_FGS_KERNARG, \
true, \
"Use fine grain kernel args segment for supported asics") release(uint, \
ROC_P2P_SDMA_SIZE, \
1024, \
"The minimum size in KB for P2P transfer with SDMA") release(uint, \
ROC_AQL_QUEUE_SIZE, \
16384, \
"AQL queue size in AQL packets") \
release( \
uint, \
ROC_SIGNAL_POOL_SIZE, \
64, \
"Initial size of HSA signal pool") \
release(uint, \
DEBUG_CLR_LIMIT_BLIT_WG, \
16, \
"Limit the number of workgroups in blit operations") release(bool, \
DEBUG_CLR_BLIT_KERNARG_OPT, \
false, \
"Enable blit kernel arguments optimization") release(bool, \
ROC_SKIP_KERNEL_ARG_COPY, \
false, \
"If true, then runtime can skip kernel arg copy") release(bool, \
GPU_STREAMOPS_CP_WAIT, \
false, \
"Force the stream wait memory operation to wait on CP.") release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, \
false, \
"Set this to true to force runtime unbundler in hiprtc.") release(size_t, \
HIP_INITIAL_DM_SIZE, \
8 * Mi, \
"Set initial heap size for device malloc.") \
release( \
bool, \
HIP_FORCE_DEV_KERNARG, \
true, \
"Force device mem for kernel args.") release(bool, \
DEBUG_CLR_GRAPH_PACKET_CAPTURE, \
true, \
"Enable/Disable graph packet capturing") release(bool, \
GPU_DEBUG_ENABLE, false, \
"Enables collection of extra info for debugger at some perf cost") \
release( \
cstring, \
HIPRTC_COMPILE_OPTIONS_APPEND, \
"", \
"Set compile options needed for hiprtc compilation") \
release( \
cstring, \
HIPRTC_LINK_OPTIONS_APPEND, \
"", \
"Set link options needed for hiprtc compilation") \
release( \
bool, \
HIP_VMEM_MANAGE_SUPPORT, \
true, \
"Virtual Memory Management Support") \
release( \
bool, \
DEBUG_HIP_GRAPH_DOT_PRINT, \
false, \
"Enable/Disable graph debug dot print dump") release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false, \
"Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \
release( \
uint, \
DEBUG_HIP_FORCE_GRAPH_QUEUES, \
4, \
"Forces the number of streams for the graph parallel execution") \
release( \
uint, \
DEBUG_HIP_BLOCK_SYNC, \
50, \
"Blocks synchronization on CPU until the callback processing is done") \
release(uint, \
DEBUG_CLR_MAX_BATCH_SIZE, \
1000, \
"Forces the callback to clean-up CPU submission queue") release(bool, DEBUG_CLR_SYSMEM_POOL, false, \
"Use sysmem pool implementation in runtime for amd commands") \
release(bool, \
DEBUG_HIP_KERNARG_COPY_OPT, \
true, \
"Enable/Disable multiple kern arg copies") release(bool, \
DEBUG_CLR_KERNARG_HDP_FLUSH_WA, \
false, \
"Toggle kernel arg copy workaround") release(bool, \
DEBUG_HIP_DYNAMIC_QUEUES, \
false, \
"Forces dynamic queue management") \
release( \
uint, \
HIP_SKIP_ABORT_ON_GPU_ERROR, \
true, \
"Set this to true, to avoid host side abort for GPU errors") \
release( \
bool, \
HIP_FORCE_SPIRV_CODEOBJECT, \
false, \
"Force use of SPIRV instead of device specific code object.") \
release( \
uint, \
DEBUG_CLR_BATCH_CPU_SYNC_SIZE, \
8, \
"Forces the minimum batch size for CPU sync")
// clang-format off
// X-macro table of the runtime flags.
//
// The caller supplies three function-like macros. The table expands to one
// invocation per flag, always of the shape
//     kind(type, NAME, value, "description")
// where `kind` is either `debug` or `release`. The `release_on_stg` parameter
// is accepted, but no entry of this table invokes it.
//
// NOTE(review): the third argument is presumably the flag's default value;
// confirm against the code that consumes this table.
//
// Descriptions that are split over several string literals are joined by the
// compiler with nothing in between, so every non-final piece must carry its
// own trailing separator (", " or " ").
#define RUNTIME_FLAGS(debug,release,release_on_stg) \
 \
release(int, AMD_LOG_LEVEL, 0, \
        "The default log level") \
release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \
        "The mask to enable specific kinds of logs") \
release(cstring, AMD_LOG_LEVEL_FILE, "", \
        "Set output file for AMD_LOG_LEVEL, Default is stderr") \
release(size_t, AMD_LOG_LEVEL_SIZE, 2048, \
        "The max size of AMD_LOG generated in MB if printed to a file") \
debug(uint, DEBUG_GPU_FLAGS, 0, \
        "The debug options for GPU device") \
release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */ \
        "The default command queue thread stack size") \
release(int, GPU_MAX_WORKGROUP_SIZE, 0, \
        "Maximum number of workitems in a workgroup for GPU, 0 -use default") \
debug(bool, CPU_MEMORY_GUARD_PAGES, false, \
        "Use guard pages for CPU memory") \
debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \
        "Size in KB of CPU memory guard page") \
debug(size_t, CPU_MEMORY_ALIGNMENT_SIZE, 256, \
        "Size in bytes for the default alignment for guarded memory on CPU") \
debug(size_t, PARAMETERS_MIN_ALIGNMENT, NATIVE_ALIGNMENT_SIZE, \
        "Minimum alignment required for the abstract parameters stack") \
debug(size_t, MEMOBJ_BASE_ADDR_ALIGN, 4*Ki, \
        "Alignment of the base address of any allocate memory object") \
release(uint, ROC_HMM_FLAGS, 0, \
        "ROCm HMM configuration flags") \
release(cstring, GPU_DEVICE_ORDINAL, "", \
        "Select the device ordinal (comma seperated list of available devices)") \
release(bool, REMOTE_ALLOC, false, \
        "Use remote memory for the global heap allocation") \
release(uint, GPU_CP_DMA_COPY_SIZE, 1, \
        "Set maximum size of CP DMA copy in KiB") \
release(uint, GPU_MAX_HEAP_SIZE, 100, \
        "Set maximum size of the GPU heap to % of board memory") \
release(uint, GPU_STAGING_BUFFER_SIZE, 4, \
        "Size of the GPU staging buffer in MiB") \
release(bool, GPU_DUMP_BLIT_KERNELS, false, \
        "Dump the kernels for blit manager") \
release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \
        "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \
release(bool, GPU_FLUSH_ON_EXECUTION, false, \
        "Submit commands to HW on every operation. 0 - Disable, 1 - Enable") \
release(bool, CL_KHR_FP64, true, \
        "Enable/Disable support for double precision") \
release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \
        "Set clBuildProgram() and clCompileProgram()'s options (override)") \
release(cstring, AMD_OCL_BUILD_OPTIONS_APPEND, 0, \
        "Append clBuildProgram() and clCompileProgram()'s options") \
release(cstring, AMD_OCL_LINK_OPTIONS, 0, \
        "Set clLinkProgram()'s options (override)") \
release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \
        "Append clLinkProgram()'s options") \
debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \
        "Specify binary substitution config file for OpenCL") \
release(size_t, GPU_PINNED_XFER_SIZE, 32, \
        "The pinned buffer size for pinning in read/write transfers in MiB") \
release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \
        "The minimal buffer size for pinned read/write transfers in MiB") \
release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \
        "The resource cache size in MB") \
release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \
        "The maximum size accepted for suballocations in KB") \
release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \
        "Number of memory objects for dependency tracking") \
release(size_t, GPU_XFER_BUFFER_SIZE, 0, \
        "Transfer buffer size for image copy optimization in KB") \
release(bool, GPU_IMAGE_DMA, true, \
        "Enable DRM DMA for image transfers") \
release(uint, GPU_SINGLE_ALLOC_PERCENT, 100, \
        "Maximum size of a single allocation as percentage of total") \
release(uint, GPU_NUM_COMPUTE_RINGS, 2, \
        "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \
release(bool, AMD_OCL_WAIT_COMMAND, false, \
        "1 = Enable a wait for every submitted command") \
release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
        "Prints the specified number of the child kernels") \
release(bool, GPU_USE_DEVICE_QUEUE, false, \
        "Use a dedicated device queue for the actual submissions") \
release(bool, AMD_THREAD_TRACE_ENABLE, true, \
        "Enable thread trace extension") \
release(uint, OPENCL_VERSION, 200, \
        "Force GPU opencl version") \
release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \
        "Enable HSA device local memory usage") \
release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \
        "Kernarg pool size") \
release(bool, GPU_MIPMAP, true, \
        "Enables GPU mipmap extension") \
release(uint, GPU_ENABLE_PAL, 2, \
        "Enables PAL backend. 0 - ROC, 1 - PAL, 2 - ROC or PAL") \
release(bool, DISABLE_DEFERRED_ALLOC, false, \
        "Disables deferred memory allocation on device") \
release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \
        "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \
release(uint, OCL_SET_SVM_SIZE, 4*16384, \
        "set SVM space size for discrete GPU") \
release(uint, GPU_WAVES_PER_SIMD, 0, \
        "Force the number of waves per SIMD (1-10)") \
release(bool, OCL_STUB_PROGRAMS, false, \
        "1 = Enables OCL programs stubing") \
release(bool, GPU_ANALYZE_HANG, false, \
        "1 = Enables GPU hang analysis") \
release(uint, GPU_MAX_REMOTE_MEM_SIZE, 2, \
        "Maximum size (in Ki) that allows device memory substitution with system") \
release(bool, GPU_ADD_HBCC_SIZE, false, \
        "Add HBCC size to the reported device memory") \
release(bool, PAL_DISABLE_SDMA, false, \
        "1 = Disable SDMA for PAL") \
release(uint, PAL_RGP_DISP_COUNT, 10000, \
        "The number of dispatches for RGP capture with SQTT") \
release(uint, PAL_MALL_POLICY, 0, \
        "Controls the behaviour of allocations with respect to the MALL, " \
        "0 = MALL policy is decided by KMD, " \
        "1 = Allocations are never put through the MALL, " \
        "2 = Allocations will always be put through the MALL") \
release(bool, GPU_ENABLE_WAVE32_MODE, true, \
        "Enables Wave32 compilation in HW if available") \
release(bool, GPU_ENABLE_LC, true, \
        "Enables LC path") \
release(bool, GPU_ENABLE_HW_P2P, false, \
        "Enables HW P2P path") \
release(bool, GPU_ENABLE_COOP_GROUPS, true, \
        "Enables cooperative group launch") \
release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \
        "The maximum number of command buffers allocated per queue") \
release(uint, GPU_MAX_HW_QUEUES, 4, \
        "The maximum number of HW queues allocated per device") \
release(bool, GPU_IMAGE_BUFFER_WAR, true, \
        "Enables image buffer workaround") \
release(cstring, HIP_VISIBLE_DEVICES, "", \
        "Only devices whose index is present in the sequence are visible to HIP") \
release(cstring, CUDA_VISIBLE_DEVICES, "", \
        "Only devices whose index is present in the sequence are visible to CUDA") \
release(bool, GPU_ENABLE_WGP_MODE, true, \
        "Enables WGP Mode in HW if available") \
release(bool, GPU_DUMP_CODE_OBJECT, false, \
        "Enable dump code object") \
release(uint, GPU_MAX_USWC_ALLOC_SIZE, 2048, \
        "Set a limit in Mb on the maximum USWC allocation size, " \
        "-1 = No limit") \
release(uint, AMD_SERIALIZE_KERNEL, 0, \
        "Serialize kernel enqueue, 0x1 = Wait for completion before enqueue, " \
        "0x2 = Wait for completion after enqueue 0x3 = both") \
release(uint, AMD_SERIALIZE_COPY, 0, \
        "Serialize copies, 0x1 = Wait for completion before enqueue, " \
        "0x2 = Wait for completion after enqueue 0x3 = both") \
release(uint, HIP_LAUNCH_BLOCKING, 0, \
        "Serialize kernel enqueue 0x1 = Wait for completion after enqueue, " \
        "same as AMD_SERIALIZE_KERNEL=2") \
release(bool, PAL_ALWAYS_RESIDENT, false, \
        "Force memory resources to become resident at allocation time") \
release(uint, HIP_HOST_COHERENT, 0, \
        "Coherent memory in hipHostMalloc, 0x1 = memory is coherent with host, " \
        "0x0 = memory is not coherent between host and GPU") \
release(uint, AMD_OPT_FLUSH, 1, \
        "Kernel flush option , 0x0 = Use system-scope fence operations. " \
        "0x1 = Use device-scope fence operations when possible.") \
release(bool, AMD_DIRECT_DISPATCH, false, \
        "Enable direct kernel dispatch.") \
release(uint, HIP_HIDDEN_FREE_MEM, 0, \
        "Reserve free mem reporting in Mb, " \
        "0 = Disable") \
release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 16, \
        "Use Blit until this size(in KB) for copies") \
release(uint, ROC_ACTIVE_WAIT_TIMEOUT, 0, \
        "Forces active wait of GPU interrup for the timeout(us)") \
release(bool, ROC_ENABLE_LARGE_BAR, true, \
        "Enable Large Bar if supported by the device") \
release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true, \
        "Enable CPU wait for dependent HSA signals.") \
release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \
        "Enable system scope for signals (uses interrupts).") \
release(bool, GPU_FORCE_QUEUE_PROFILING, false, \
        "Force command queue profiling by default") \
release(bool, HIP_MEM_POOL_SUPPORT, true, \
        "Enables memory pool support in HIP") \
release(bool, HIP_MEM_POOL_USE_VM, true, \
        "Enables memory pool support in HIP") \
release(bool, DEBUG_HIP_MEM_POOL_VMHEAP, true, \
        "Enables virtual memory for memory pools") \
release(bool, PAL_HIP_IPC_FLAG, true, \
        "Enable interprocess flag for device allocation in PAL HIP") \
release(uint, PAL_FORCE_ASIC_REVISION, 0, \
        "Force a specific asic revision for all devices") \
release(bool, PAL_EMBED_KERNEL_MD, false, \
        "Enables writing kernel metadata into command buffers.") \
release(cstring, ROC_GLOBAL_CU_MASK, "", \
        "Sets a global CU mask (entered as hex value) for all queues, " \
        "Each active bit represents using one CU (e.g., 0xf enables only 4 CUs)") \
release(size_t, PAL_PREPINNED_MEMORY_SIZE, 64, \
        "Size in KBytes of prepinned memory") \
release(bool, AMD_CPU_AFFINITY, false, \
        "Reset CPU affinity of any runtime threads") \
release(bool, ROC_USE_FGS_KERNARG, true, \
        "Use fine grain kernel args segment for supported asics") \
release(uint, ROC_P2P_SDMA_SIZE, 1024, \
        "The minimum size in KB for P2P transfer with SDMA") \
release(uint, ROC_AQL_QUEUE_SIZE, 16384, \
        "AQL queue size in AQL packets") \
release(uint, ROC_SIGNAL_POOL_SIZE, 64, \
        "Initial size of HSA signal pool") \
release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \
        "Limit the number of workgroups in blit operations") \
release(bool, DEBUG_CLR_BLIT_KERNARG_OPT, false, \
        "Enable blit kernel arguments optimization") \
release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \
        "If true, then runtime can skip kernel arg copy") \
release(bool, GPU_STREAMOPS_CP_WAIT, false, \
        "Force the stream wait memory operation to wait on CP.") \
release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false, \
        "Set this to true to force runtime unbundler in hiprtc.") \
release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \
        "Set initial heap size for device malloc.") \
release(bool, HIP_FORCE_DEV_KERNARG, true, \
        "Force device mem for kernel args.") \
release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true, \
        "Enable/Disable graph packet capturing") \
release(bool, GPU_DEBUG_ENABLE, false, \
        "Enables collection of extra info for debugger at some perf cost") \
release(cstring, HIPRTC_COMPILE_OPTIONS_APPEND, "", \
        "Set compile options needed for hiprtc compilation") \
release(cstring, HIPRTC_LINK_OPTIONS_APPEND, "", \
        "Set link options needed for hiprtc compilation") \
release(bool, HIP_VMEM_MANAGE_SUPPORT, true, \
        "Virtual Memory Management Support") \
release(bool, DEBUG_HIP_GRAPH_DOT_PRINT, false, \
        "Enable/Disable graph debug dot print dump") \
release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false, \
        "Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \
release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \
        "Forces the number of streams for the graph parallel execution") \
release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \
        "Blocks synchronization on CPU until the callback processing is done") \
release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000, \
        "Forces the callback to clean-up CPU submission queue") \
release(bool, DEBUG_CLR_SYSMEM_POOL, false, \
        "Use sysmem pool implementation in runtime for amd commands") \
release(bool, DEBUG_HIP_KERNARG_COPY_OPT, true, \
        "Enable/Disable multiple kern arg copies") \
release(bool, DEBUG_CLR_KERNARG_HDP_FLUSH_WA, false, \
        "Toggle kernel arg copy workaround") \
release(bool, DEBUG_HIP_DYNAMIC_QUEUES, false, \
        "Forces dynamic queue management") \
release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, true, \
        "Set this to true, to avoid host side abort for GPU errors") \
release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false, \
        "Force use of SPIRV instead of device specific code object.") \
release(uint, DEBUG_CLR_BATCH_CPU_SYNC_SIZE, 8, \
        "Forces the minimum batch size for CPU sync") // clang-format on
namespace amd {
+2 -2
ファイルの表示
@@ -256,7 +256,7 @@ inline float half2float(const uint16_t Val) {
uint32_t signBit = ((uint32_t)(Val & 0x8000)) << signBitShift;
uint32_t exponent = (Val & halfExpoentMask) >> 10;
uint32_t fraction = ((uint32_t)(Val & halfFractionMask))
<< 13; // Aligning half fraction to float
<< 13; // Aligning half fraction to float
union {
uint32_t u32Arg;
float fArg;
@@ -283,7 +283,7 @@ inline float half2float(const uint16_t Val) {
}
}
uint32_t floatExponent = ((exponent + floatExponentBias - halfExponentBias) & 0xff)
<< floatExponentShift;
<< floatExponentShift;
u32Arg = signBit | floatExponent | fraction;
return fArg;
}
+3 -3
ファイルの表示
@@ -1,10 +1,10 @@
Language: Cpp
BasedOnStyle: Google
AlignEscapedNewlinesLeft: false
AlignOperands: false
AlignOperands: Align
ColumnLimit: 100
AlwaysBreakTemplateDeclarations: false
BreakTemplateDeclarations: No
DerivePointerAlignment: false
IndentFunctionDeclarationAfterType: false
MaxEmptyLinesToKeep: 2
SortIncludes: false
SortIncludes: Never

変更されたファイルが多すぎるため、一部のファイルは表示されません さらに表示